[
  {
    "path": ".circleci/config.yml",
    "content": "version: 2.1\n\ncommon_step: &cmake_build_test\n  - run: |\n      cmake --build /tmp/build --target input\n\n      if [ -n \"$CIRCLE_PULL_REQUEST\" ]; then \\\n        subset=$(/bin/bash .circleci/longest_common_path.sh); \\\n        echo \"Changes of ${CIRCLE_SHA1} are all under $subset\"; \\\n      fi\n\n      cmake --build /tmp/build/${subset:-.} --parallel 2\n      # Run tests as non-root otherwise MPI will complain\n      (cd /tmp/build/${subset:-.} \\\n        && chown -R runner . \\\n        && su runner -c \"ctest --output-on-failure --label-regex quick --parallel 2\")\n\n# TODO: These builds are currently configured to\n# install the needed dependencies in each container\n# at the start of each build. The dependencies aren't huge,\n# but that is slower and does waste some bandwidth.\n# We should eventually roll the set up for each\n# container into a separate dockerfile and push custom\n# build images to dockerhub so that setting up packages\n# during the actual CI testing is no longer necessary.\n\njobs:\n  \"CheckFormat\":\n    docker:\n      - image: ubuntu:bionic\n    steps:\n      - checkout\n      - run: |\n          apt-get -q update -y\n          apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget\n          wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key 2>/dev/null | apt-key add -\n          apt-add-repository -y 'deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main'\n          apt-get -q update -y\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          apt-get -q install -y \\\n            clang-format-10\n\n          update-alternatives --install /usr/bin/clang-format clang-format /usr/bin/clang-format-10 50\n\n          if [ -n \"$CIRCLE_PULL_REQUEST\" ]; then \\\n            subset=$(/bin/bash .circleci/longest_common_path.sh); \\\n            echo \"Changes of ${CIRCLE_SHA1} are all under $subset\"; \\\n          fi\n  
        scripts/check_format.sh ${subset:-.}\n  \"Sanitize\":\n    docker:\n      - image: ubuntu:bionic\n    steps:\n      - checkout\n      - run: |\n          apt-get -q update -y\n          apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget\n          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -\n          wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key 2>/dev/null | apt-key add -\n          apt-add-repository -y 'ppa:ubuntu-toolchain-r/test'\n          apt-add-repository -y 'deb https://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main'\n          apt-add-repository -y 'deb https://apt.kitware.com/ubuntu/ bionic main'\n          apt-add-repository -y 'ppa:mhier/libboost-latest'\n          apt-get -q update -y\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          apt-get -q install -y \\\n            clang-10 \\\n            cmake \\\n            libboost1.70-dev \\\n            libeigen3-dev \\\n            openmpi-bin \\\n            libopenmpi-dev \\\n            llvm-7-dev \\\n            libz-dev \\\n            libfmt-dev\n\n          update-alternatives --install /usr/bin/clang clang /usr/bin/clang-10 50\n          update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-10 50\n\n          chmod 755 /root\n          useradd runner\n          mkdir -p /tmp/build\n\n          cmake -S . 
-B /tmp/build \\\n            -DCMAKE_C_COMPILER=clang \\\n            -DCMAKE_CXX_COMPILER=clang++ \\\n            -DGALOIS_USE_SANITIZER=\"Address;Undefined\"\n      - <<: *cmake_build_test\n  \"Debian\":\n    docker:\n      - image: debian:10\n    steps:\n      - checkout\n      - run: |\n          apt-get -q update -y\n          apt-get -q install -y git\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          apt-get -q install -y \\\n            cmake \\\n            g++ \\\n            gcc \\\n            libboost-iostreams-dev \\\n            libboost-serialization-dev \\\n            libeigen3-dev \\\n            libmpich-dev \\\n            llvm-7-dev \\\n            mpich \\\n            zlib1g-dev \\\n            libfmt-dev\n\n          chmod 755 /root\n          useradd runner\n          mkdir -p /tmp/build\n\n          cmake -S . -B /tmp/build \\\n            -DGALOIS_ENABLE_DIST=ON\n      - <<: *cmake_build_test\n  \"Ubuntu-18_04\":\n    docker:\n      - image: ubuntu:18.04\n    steps:\n      - checkout\n      - run: |\n          apt-get -q update -y\n          apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget\n          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -\n          apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'\n          apt-get -q update -y\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          apt-get -q install -y \\\n            cmake \\\n            g++ \\\n            gcc \\\n            libboost-all-dev \\\n            libeigen3-dev \\\n            libopenmpi-dev \\\n            llvm-7-dev \\\n            openmpi-bin \\\n            ssh \\\n            libfmt-dev\n\n          chmod 755 /root\n          useradd runner\n          mkdir -p /tmp/build\n\n          cmake -S . 
-B /tmp/build \\\n            -DGALOIS_ENABLE_DIST=ON\n      - <<: *cmake_build_test\n  \"Ubuntu-18_04-cuda11_0_3-build-only\":\n    docker:\n      - image: nvidia/cuda:11.0.3-devel-ubuntu18.04\n    steps:\n      - checkout\n      - run: |\n          apt-get -q update -y\n          apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget\n          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -\n          apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'\n          apt-get -q update -y\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          apt-get -q install -y \\\n            cmake \\\n            g++ \\\n            gcc \\\n            libboost-all-dev \\\n            libeigen3-dev \\\n            libopenmpi-dev \\\n            llvm-7-dev \\\n            openmpi-bin \\\n            ssh \\\n            libfmt-dev\n\n          cmake -S . 
-B /tmp/build \\\n            -DGALOIS_ENABLE_DIST=ON \\\n            -DGALOIS_ENABLE_GPU=ON\n          cmake --build /tmp/build --target input\n          cmake --build /tmp/build --parallel 2\n  \"Ubuntu-18_04-cuda11_1_1-build-only\":\n    docker:\n      - image: nvidia/cuda:11.1.1-devel-ubuntu18.04\n    steps:\n      - checkout\n      - run: |\n          apt-get -q update -y\n          apt-get -q install -y apt-transport-https ca-certificates git gnupg software-properties-common wget\n          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -\n          apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'\n          apt-get -q update -y\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          apt-get -q install -y \\\n            cmake \\\n            g++ \\\n            gcc \\\n            libboost-all-dev \\\n            libeigen3-dev \\\n            libopenmpi-dev \\\n            llvm-7-dev \\\n            openmpi-bin \\\n            ssh \\\n            libfmt-dev\n\n          cmake -S . 
-B /tmp/build \\\n            -DGALOIS_ENABLE_DIST=ON \\\n            -DGALOIS_ENABLE_GPU=ON\n          cmake --build /tmp/build --target input\n          cmake --build /tmp/build --parallel 2\n  \"CentOS-8-gcc\":\n    docker:\n      - image: centos:8\n    steps:\n      - checkout\n      - run: |\n          # CentOS Linux 8 has reached End Of Life (EOL) on December 31st, 2021\n          ls /etc/yum.repos.d/ > /dev/null 2>&1\n          sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-*\n          sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*\n      - run: |\n          # fmt-devel is in EPEL\n          yum -y -q install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm\n\n          # eigen3-devel needs PowerTools packages\n          yum -y -q install dnf-plugins-core\n          yum -y -q config-manager --set-enabled powertools\n\n          yum -y -q install git\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          yum -y -q install \\\n            boost-devel \\\n            cmake \\\n            eigen3-devel \\\n            gcc \\\n            gcc-c++ \\\n            llvm-devel \\\n            llvm-static \\\n            make \\\n            mpich-devel \\\n            ncurses-devel \\\n            wget \\\n            zlib-devel \\\n            fmt-devel\n\n          wget -O - https://github.com/Kitware/CMake/releases/download/v3.17.0/cmake-3.17.0-Linux-x86_64.tar.gz | tar -xz -f - -C /usr/local\n          ln -s /usr/local/cmake-3.17.0-Linux-x86_64/bin/cmake /usr/local/bin/cmake\n          ln -s /usr/local/cmake-3.17.0-Linux-x86_64/bin/ctest /usr/local/bin/ctest\n\n          # Make the \"module\" command work in the subsequent shell sessions.\n          cat /etc/profile.d/modules.sh >> $BASH_ENV\n          echo \"module load mpi\" >> $BASH_ENV\n      - run: |\n          chmod 755 /root\n          useradd runner\n          mkdir 
-p /tmp/build\n\n          cmake -S . -B /tmp/build \\\n            -DGALOIS_ENABLE_DIST=ON\n      - <<: *cmake_build_test\n  \"CentOS-8-clang\":\n    docker:\n      - image: centos:8\n    steps:\n      - checkout\n      - run: |\n          # CentOS Linux 8 has reached End Of Life (EOL) on December 31st, 2021\n          ls /etc/yum.repos.d/ > /dev/null 2>&1\n          sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-*\n          sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*\n      - run: |\n          # fmt-devel is in EPEL\n          yum -y -q install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm\n\n          # eigen3-devel needs PowerTools packages\n          yum -y -q install dnf-plugins-core\n          yum -y -q config-manager --set-enabled powertools\n\n          yum -y -q install git\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          yum -y -q install \\\n            boost-devel \\\n            eigen3-devel \\\n            llvm-devel \\\n            llvm-static \\\n            llvm-toolset \\\n            make \\\n            openmpi-devel \\\n            ncurses-devel \\\n            wget \\\n            zlib-devel \\\n            fmt-devel\n\n          wget -O - https://github.com/Kitware/CMake/releases/download/v3.17.0/cmake-3.17.0-Linux-x86_64.tar.gz | tar -xz -f - -C /usr/local\n          ln -s /usr/local/cmake-3.17.0-Linux-x86_64/bin/cmake /usr/local/bin/cmake\n          ln -s /usr/local/cmake-3.17.0-Linux-x86_64/bin/ctest /usr/local/bin/ctest\n\n          # Make the \"module\" command work in the subsequent shell sessions.\n          cat /etc/profile.d/modules.sh >> $BASH_ENV\n          echo \"module load mpi\" >> $BASH_ENV\n      - run: |\n          chmod 755 /root\n          useradd runner\n          mkdir -p /tmp/build\n\n          cmake -S . 
-B /tmp/build \\\n            -DCMAKE_C_COMPILER=clang \\\n            -DCMAKE_CXX_COMPILER=clang++ \\\n            # -DGALOIS_ENABLE_DIST=ON\n      - <<: *cmake_build_test\n  \"Arch\":\n    docker:\n      - image: archlinux:base\n    steps:\n      - checkout\n      - run: |\n          pacman -Syu --noconfirm\n          pacman -q -S --noconfirm git\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          # NB(ddn): make requires libffi but its package doesn't depend on it.\n          pacman -q -S --noconfirm \\\n            boost \\\n            cmake \\\n            eigen \\\n            gcc \\\n            libffi \\\n            llvm \\\n            make \\\n            openmpi \\\n            fmt\n\n          chmod 755 /root\n          useradd runner\n          mkdir -p /tmp/build\n\n          cmake -S . -B /tmp/build \\\n            -DGALOIS_ENABLE_DIST=ON\n      - <<: *cmake_build_test\n  \"Alpine\":\n    docker:\n      - image: alpine:latest\n    steps:\n      - checkout\n      - run: |\n          apk add --no-cache --no-progress git bash\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          apk add --no-cache --no-progress \\\n            boost-dev \\\n            cmake \\\n            eigen \\\n            g++ \\\n            gcc \\\n            llvm14-dev \\\n            llvm14-static \\\n            make \\\n            musl-dev \\\n            openssh-client \\\n            zlib-dev \\\n            fmt-dev\n\n          chmod 755 /root\n          adduser -D runner\n          mkdir -p /tmp/build\n\n          cmake -S . 
-B /tmp/build\n      - <<: *cmake_build_test\n  \"Fedora-gcc\":\n    docker:\n      - image: fedora:latest\n    steps:\n      - checkout\n      - run: |\n          yum -y -q install git\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          yum -y -q install \\\n            boost-devel \\\n            cmake \\\n            eigen3-devel \\\n            gcc-c++ \\\n            llvm-devel \\\n            llvm-static \\\n            make \\\n            mpich-devel \\\n            wget \\\n            zlib-devel \\\n            fmt-devel\n\n          chmod 755 /root\n          useradd runner\n          mkdir -p /tmp/build\n\n          # Get the \"module\" function set up before loading MPI.\n          cat /etc/profile.d/modules.sh >> $BASH_ENV\n          echo \"module load mpi\" >> $BASH_ENV\n      - run: |\n          cmake -S . -B /tmp/build \\\n            -DGALOIS_ENABLE_DIST=ON\n      - <<: *cmake_build_test\n  \"Fedora-clang\":\n    docker:\n      - image: fedora:latest\n    steps:\n      - checkout\n      - run: |\n          yum -y -q install git\n      - run: git submodule sync\n      - run: git submodule update --init\n      - run: |\n          yum -y -q install \\\n            boost-devel \\\n            clang \\\n            cmake \\\n            eigen3-devel \\\n            llvm-devel \\\n            llvm-static \\\n            make \\\n            openmpi-devel \\\n            wget \\\n            zlib-devel \\\n            fmt-devel\n\n          chmod 755 /root\n          useradd runner\n          mkdir -p /tmp/build\n\n          # Get the \"module\" function set up before loading MPI.\n          cat /etc/profile.d/modules.sh >> $BASH_ENV\n          echo \"module load mpi\" >> $BASH_ENV\n      - run: |\n          cmake -S . 
-B /tmp/build \\\n            -DCMAKE_C_COMPILER=clang \\\n            -DCMAKE_CXX_COMPILER=clang++ \\\n            # -DGALOIS_ENABLE_DIST=ON\n      - <<: *cmake_build_test\n\nworkflows:\n  build:\n    jobs:\n      - \"CheckFormat\"\n      - \"Sanitize\"\n      - \"Alpine\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n      - \"Arch\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n      - \"CentOS-8-clang\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n      - \"CentOS-8-gcc\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n      - \"Debian\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n      - \"Fedora-clang\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n      - \"Fedora-gcc\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n      - \"Ubuntu-18_04\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n      - \"Ubuntu-18_04-cuda11_1_1-build-only\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n      - \"Ubuntu-18_04-cuda11_0_3-build-only\":\n          requires:\n            - \"CheckFormat\"\n            # - \"Sanitize\"\n"
  },
  {
    "path": ".circleci/longest_common_path.sh",
    "content": "#!/bin/bash\n# For PR build only; find the longest common path prefix as the build and test subset\n\nlongest_common_prefix() {\n    declare -a possible_prefix\n    declare i=0\n\n    path=\"${1%/}\"\n    while [ \"$path\" != \".\" ]; do\n        if [[ -d $path && -f \"$path/CMakeLists.txt\" ]]; then\n            possible_prefix[$i]=\"$path\"\n        fi\n        i=$(($i + 1))\n        path=$(dirname \"$path\");\n    done\n\n    lcp=\".\"\n    for prefix in \"${possible_prefix[@]}\"; do\n        for path in $@; do\n            if [ \"${path#$prefix}\" = \"${path}\" ]; then\n                continue 2\n            fi\n        done\n        lcp=\"$prefix\"\n        break\n    done\n    echo $lcp\n}\nbase=$( \\\n    wget -q -O - \"https://api.github.com/repos/$(echo ${CIRCLE_PULL_REQUEST:19} | sed \"s/\\/pull\\//\\/pulls\\//\")\" \\\n    | sed -n -e \"s/^.*IntelligentSoftwareSystems://p\" \\\n    | sed -n -e \"s/\\\".*$//p\" \\\n)\nlongest_common_prefix $(git -c core.quotepath=false diff --name-only $base $CIRCLE_SHA1)"
  },
  {
    "path": ".clang-format",
    "content": "---\nLanguage:        Cpp\nBasedOnStyle:  LLVM\nAccessModifierOffset: -2\nAlignAfterOpenBracket: true\nAlignConsecutiveAssignments: true\nAlignEscapedNewlinesLeft: false\nAlignOperands:   true\nAlignTrailingComments: true\nAllowAllParametersOfDeclarationOnNextLine: true\nAllowShortBlocksOnASingleLine: false\nAllowShortCaseLabelsOnASingleLine: false\nAllowShortFunctionsOnASingleLine: All\nAllowShortIfStatementsOnASingleLine: false\nAllowShortLoopsOnASingleLine: false\nAlwaysBreakAfterDefinitionReturnType: None\nAlwaysBreakBeforeMultilineStrings: false\nAlwaysBreakTemplateDeclarations: true\nBinPackArguments: true\nBinPackParameters: true\nBreakBeforeBinaryOperators: None\nBreakBeforeBraces: Attach\nBreakBeforeTernaryOperators: true\nBreakConstructorInitializersBeforeComma: false\nColumnLimit:     80\nCommentPragmas:  '^ IWYU pragma:'\nConstructorInitializerAllOnOneLineOrOnePerLine: false\nConstructorInitializerIndentWidth: 4\nContinuationIndentWidth: 4\nCpp11BracedListStyle: true\nDeriveLineEnding: false\nDerivePointerAlignment: false\nDisableFormat:   false\nExperimentalAutoDetectBinPacking: false\nForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]\nIndentCaseLabels: false\nIndentWidth:     2\nIndentWrappedFunctionNames: false\nKeepEmptyLinesAtTheStartOfBlocks: true\nMacroBlockBegin: ''\nMacroBlockEnd:   ''\nMaxEmptyLinesToKeep: 1\nNamespaceIndentation: None\nObjCBlockIndentWidth: 2\nObjCSpaceAfterProperty: false\nObjCSpaceBeforeProtocolList: true\nPenaltyBreakBeforeFirstCallParameter: 19\nPenaltyBreakComment: 300\nPenaltyBreakFirstLessLess: 120\nPenaltyBreakString: 1000\nPenaltyExcessCharacter: 1000000\nPenaltyReturnTypeOnItsOwnLine: 60\nPointerAlignment: Left\nSortIncludes: false\nSpaceAfterCStyleCast: false\nSpaceBeforeAssignmentOperators: true\nSpaceBeforeParens: ControlStatements\nSpaceInEmptyParentheses: false\nSpacesBeforeTrailingComments: 1\nSpacesInAngles:  false\nSpacesInContainerLiterals: true\nSpacesInCStyleCastParentheses: 
false\nSpacesInParentheses: false\nSpacesInSquareBrackets: false\nStandard:        Cpp11\nTabWidth:        4\nUseTab:          Never\n...\n\n"
  },
  {
    "path": ".clang-tidy",
    "content": "---\n# Enable most checks then disable (-) problematics ones:\n#\n# Some checks are good in principle but cannot be applied automatically either\n# because they require taste or the autofix can generate wrong code:\n#\n# - cppcoreguidelines-pro-type-member-init: wrong code sometimes\n# - google-explicit-constructor: libllvm has implicit conversions\n# - modernize-use-no-discard\n# - modernize-use-transparent-functors\n# - modernize-use-using: autofix doesn't handle dependent type templates\n# - readability-static-accessed-through-instance: wrong code sometimes\n#\n# No consensus:\n#\n# - modernize-use-trailing-return-type: also huge code churn\n# - readability-convert-member-functions-to-static\n# - readability-implicit-bool-conversion\nChecks: |\n  abseil-*,\n  boost-*,\n  bugprone-*,\n  clang-analyzer-*,\n  clang-diagnostic-*,\n  cppcoreguidelines-*,\n  -cppcoreguidelines-pro-type-member-init,\n  google-*,\n  -google-explicit-constructor,\n  modernize-*,\n  -modernize-use-nodiscard,\n  -modernize-use-trailing-return-type,\n  -modernize-use-transparent-functors,\n  -modernize-use-using,\n  mpi-*,\n  openmp-*,\n  performance-*,\n  readability-*,\n  -readability-convert-member-functions-to-static,\n  -readability-static-accessed-through-instance,\n  -readability-implicit-bool-conversion,\nWarningsAsErrors: ''\nHeaderFilterRegex: ''\nAnalyzeTemporaryDtors: false\nFormatStyle:     file\nCheckOptions:\n  - key:             cert-dcl16-c.NewSuffixes\n    value:           'L;LL;LU;LLU'\n  - key:             cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField\n    value:           '0'\n  - key:             cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors\n    value:           '1'\n  - key:             cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic\n    value:           '1'\n  - key:             google-readability-braces-around-statements.ShortStatementLines\n    value:           '1'\n  - 
key:             google-readability-function-size.StatementThreshold\n    value:           '800'\n  - key:             google-readability-namespace-comments.ShortNamespaceLines\n    value:           '10'\n  - key:             google-readability-namespace-comments.SpacesBeforeComments\n    value:           '2'\n  - key:             modernize-loop-convert.MaxCopySize\n    value:           '16'\n  - key:             modernize-loop-convert.MinConfidence\n    value:           reasonable\n  - key:             modernize-loop-convert.NamingStyle\n    value:           CamelCase\n  - key:             modernize-pass-by-value.IncludeStyle\n    value:           llvm\n  - key:             modernize-replace-auto-ptr.IncludeStyle\n    value:           llvm\n  - key:             modernize-use-nullptr.NullMacros\n    value:           'NULL'\n...\n"
  },
  {
    "path": ".git-blame-ignore-revs",
    "content": "# Bulk-change revisions to ignore in git blame\n#\n# Requires git v2.23\n#\n# To use:\n#\n#   git blame --ignore-revs-file .git-blame-ignore-revs\n#\n# or more permanently:\n#\n#   git config blame.ignoreRevsFile .git-blame-ignore-revs\n\n# Run clang-format.\n02ecf4f4ea6ed8618a3826f98c3ea192ee38ca2d\n\n# Re-run clang-format.\n47ddbe14de2e61b87749cd20bd368f07ef3c322f\n\n# Reorganize the lonestar directories.\n6ade1c5ac3cf0c261aff7bee863e46b2c124d174\n\n# Run clang-format.\n517fca343c75f842096b661e3ff883bb93f5c09e\n\n# Another round of clang-format\n2264b05ece3f9ec2b9bf397594cc14ef99f498de\n\n# Fix endlines for barneshut app\n558ccb83ab2e388c1202396f42d0881912e6393d\n"
  },
  {
    "path": ".gitignore",
    "content": "# no editor files\n*~\n*.backup\n/.dir-locals.el\n*.orig\n*.patch\n/.project\n.settings\n.*.swo\n*.swp\n.vscode\n\n# no tool generated files\n.clang-complete\n.clangd\ncompile_commands.json\ncscope.out\n/GPATH\n/GRTAGS\n/GTAGS\n.tags*\ntags\n.ycm_extra_conf.py\n\n# no build files\n/build*\n\n# no python build artifacts\n*.pyc\n/python/galois.egg-info\n/python/galois/*.so\n/_skbuild\n"
  },
  {
    "path": ".gitmodules",
    "content": "[submodule \"moderngpu\"]\n\tpath = external/moderngpu\n\turl = https://github.com/moderngpu/moderngpu.git\n[submodule \"cub\"]\n\tpath = external/cub\n\turl = https://github.com/NVlabs/cub.git\n[submodule \"docs\"]\n\tpath = docs\n\turl = https://github.com/IntelligentSoftwareSystems/Galois-docs.git\n"
  },
  {
    "path": ".travis.yml",
    "content": "dist: bionic\n\nlanguage: c++\n\ngit:\n  submodules: true\n\nmatrix:\n  include:\n    - os: osx\n      osx_image: xcode11.3\n      before_script:\n        - export CC=clang\n        - export CXX=clang++\n        - brew install openmpi llvm fmt\n        - mkdir build\n        - export PATH=$PATH:/usr/local/opt/llvm/bin\n        - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGALOIS_ENABLE_DIST=ON || exit 1\n    - env:\n        - GCC_VER=7\n      addons:\n        apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n              key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n            - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - gcc-7\n            - g++-7\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n    - env:\n        - GCC_VER=8\n      addons:\n        apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n              key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n            - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - gcc-8\n            - g++-8\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n    - env:\n        - GCC_VER=9\n      addons:\n        apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n              key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n           
 - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - gcc-9\n            - g++-9\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n    - env:\n        - GCC_VER=10\n      addons:\n        apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n              key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n            - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - gcc-10\n            - g++-10\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n    - env:\n        - GCC_VER=10\n        - BUILD_TYPE=Debug\n      addons:\n        apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n              key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n            - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - gcc-10\n            - g++-10\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n    - env:\n        - CLANG_VER=7\n      addons:\n        apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-7 main'\n              key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n     
         key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n            - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - clang-7\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n    - env:\n        - CLANG_VER=8\n      addons:\n        apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main'\n              key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n              key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n            - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - clang-8\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n    - env:\n        - CLANG_VER=9\n      addons:\n        apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-9 main'\n              key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n              key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n            - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - clang-9\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n    - env:\n        - CLANG_VER=10\n      addons:\n        
apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main'\n              key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n              key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n            - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - clang-10\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n    - env:\n        - CLANG_VER=10\n        - BUILD_TYPE=Debug\n      addons:\n        apt:\n          sources:\n            - sourceline: 'ppa:ubuntu-toolchain-r/test'\n            - sourceline: 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main'\n              key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'\n            - sourceline: 'deb https://apt.kitware.com/ubuntu/ bionic main'\n              key_url: 'https://apt.kitware.com/keys/kitware-archive-latest.asc'\n            - sourceline: 'ppa:mhier/libboost-latest'\n          packages:\n            - clang-10\n            - cmake\n            - libboost1.70-dev\n            - libeigen3-dev\n            - openmpi-bin\n            - libopenmpi-dev\n            - llvm-7-dev\n            - libz-dev\n            - libfmt-dev\n\nbefore_script:\n  # Depending on whether GCC_VER or CLANG_VER is set and nonempty,\n  # set CC and CXX accordingly.\n  - |\n    if [ -n \"$GCC_VER\" ]; then\n      export CC=\"gcc-$GCC_VER\"\n      export CXX=\"g++-$GCC_VER\"\n    fi\n  - |\n    if [ -n \"$CLANG_VER\" ]; then\n      export CC=\"clang-$CLANG_VER\"\n      export CXX=\"clang++-$CLANG_VER\"\n    fi\n  - |\n    # Check if BUILD_TYPE is set at all, not just whether it is empty or unset.\n    # See 
https://stackoverflow.com/a/13864829/1935144.\n    if [ -z ${BUILD_TYPE+x} ]; then\n      export BUILD_TYPE=Release\n    fi\n  - mkdir build\n  # Use apt-installed llvm-7-dev rather than travis-provided one which is\n  # picked up through the local clang-7 install in /usr/local/clang-7.\n  - export CMAKE_PREFIX_PATH=/usr/lib/llvm-7\n  # Use apt-installed cmake rather than travis-provided one\n  # (/usr/local/cmake-3.12.4/bin/cmake).\n  - /usr/bin/cmake -S . -B build -DCMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DGALOIS_ENABLE_DIST=ON || exit 1\n\nscript:\n  - make -C build input\n  - cmake --build build --parallel 2 || exit 1\n  - (cd build && ctest --output-on-failure --parallel 2 --label-regex quick) || exit 1\n\nnotifications:\n  email: false\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "cmake_minimum_required(VERSION 3.13)\n\nproject(Galois)\n\nlist(APPEND CMAKE_MODULE_PATH \"${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules\")\n\ninclude(GNUInstallDirs)\n\nfile(STRINGS config/version.txt GALOIS_VERSION)\nstring(REGEX REPLACE \"[ \\t\\n]\" \"\" GALOIS_VERSION ${GALOIS_VERSION})\nstring(REGEX REPLACE \"([0-9]+)\\\\.([0-9]+)\\\\.([0-9]+)\" \"\\\\1\" GALOIS_VERSION_MAJOR ${GALOIS_VERSION})\nstring(REGEX REPLACE \"([0-9]+)\\\\.([0-9]+)\\\\.([0-9]+)\" \"\\\\2\" GALOIS_VERSION_MINOR ${GALOIS_VERSION})\nstring(REGEX REPLACE \"([0-9]+)\\\\.([0-9]+)\\\\.([0-9]+)\" \"\\\\3\" GALOIS_VERSION_PATCH ${GALOIS_VERSION})\nset(GALOIS_COPYRIGHT_YEAR \"2018\") # Also in COPYRIGHT\n\nif(NOT CMAKE_BUILD_TYPE)\n  message(STATUS \"No build type selected, default to Release\")\n  # cmake default flags with relwithdebinfo is -O2 -g\n  # cmake default flags with release is -O3 -DNDEBUG\n  set(CMAKE_BUILD_TYPE \"Release\")\nendif()\n\n###### Options (alternatively pass as options to cmake -DName=Value) ######\n###### Distributed-heterogeneous features ######\nset(GALOIS_ENABLE_DIST OFF CACHE BOOL \"Enable distributed features\")\nset(GALOIS_CUDA_CAPABILITY \"\" CACHE STRING \"Semi-colon list of CUDA compute capability version numbers to enable GPU features\") # e.g., \"3.7;6.1\"\nset(GALOIS_COMM_STATS OFF CACHE BOOL \"Report more detailed statistics of communication\")\n###### General features ######\nset(GALOIS_ENABLE_PAPI OFF CACHE BOOL \"Use PAPI counters for profiling\")\nset(GALOIS_ENABLE_VTUNE OFF CACHE BOOL \"Use VTune for profiling\")\nset(GALOIS_STRICT_CONFIG OFF CACHE BOOL \"Instead of falling back gracefully, fail\")\nset(GALOIS_GRAPH_LOCATION \"\" CACHE PATH \"Location of inputs for tests if downloaded/stored separately.\")\nset(CXX_CLANG_TIDY \"\" CACHE STRING \"Semi-colon list specifying clang-tidy command and arguments\")\nset(CMAKE_CXX_COMPILER_LAUNCHER \"\" CACHE STRING \"Semi-colon list specifying command to wrap compiler invocations (e.g., 
ccache)\")\nset(USE_ARCH native CACHE STRING \"Optimize for a specific processor architecture ('none' to disable)\")\nset(GALOIS_USE_SANITIZER \"\" CACHE STRING \"Semi-colon list of sanitizers to use (Memory, MemoryWithOrigins, Address, Undefined, Thread)\")\n# This option is automatically handled by CMake.\n# It makes add_library build a shared lib unless STATIC is explicitly specified.\n# Putting this here is mostly just a placeholder so people know it's an option.\n# Currently this is really only intended to change anything for the libgalois_shmem target.\nset(BUILD_SHARED_LIBS OFF CACHE BOOL \"Build shared libraries\")\nset(BUILD_DOCS \"\" CACHE STRING \"Build documentation with make doc. Supported values: <unset>, external, internal. external docs hide '*-draft*' and '*-internal* documentation pages and directories when building documentation\")\n###### Developer features ######\nset(GALOIS_PER_ROUND_STATS OFF CACHE BOOL \"Report statistics of each round of execution\")\nset(GALOIS_NUM_TEST_GPUS \"0\" CACHE STRING \"Number of test GPUs to use (on a single machine) for running the tests.\")\nset(GALOIS_USE_LCI OFF CACHE BOOL \"Use LCI network runtime instead of MPI\")\nset(GALOIS_USE_BARE_MPI OFF CACHE BOOL \"Use MPI directly (no dedicated network-runtime thread)\")\nset(GALOIS_NUM_TEST_THREADS \"\" CACHE STRING \"Maximum number of threads to use when running tests (default: number of physical cores)\")\n\nif(NOT GALOIS_NUM_TEST_THREADS)\n  cmake_host_system_information(RESULT GALOIS_NUM_TEST_THREADS QUERY NUMBER_OF_PHYSICAL_CORES)\nendif()\nif(GALOIS_NUM_TEST_THREADS LESS_EQUAL 0)\n  set(GALOIS_NUM_TEST_THREADS 1)\nendif()\n\n###### Configure (users don't need to go beyond here) ######\n\ninclude(CTest)\n\n###### Configure compiler ######\n\n# generate compile_commands.json\nset(CMAKE_EXPORT_COMPILE_COMMANDS ON)\n\nset(CMAKE_CXX_STANDARD 17)\nset(CMAKE_CXX_STANDARD_REQUIRED ON)\nset(CMAKE_CXX_EXTENSIONS OFF) #...without compiler extensions like 
gnu++11\nset(CMAKE_POSITION_INDEPENDENT_CODE ON)\n\n# Always include debug info\nadd_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:-g>\")\n\n# GCC\nif(CMAKE_CXX_COMPILER_ID STREQUAL \"GNU\")\n  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7)\n    message(FATAL_ERROR \"gcc must be version 7 or higher. Found ${CMAKE_CXX_COMPILER_VERSION}.\")\n  endif()\n\n  add_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:-Wall;-Wextra>\")\n\n  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11)\n    add_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:-Werror>\")\n  endif()\nendif()\n\nif(CMAKE_CXX_COMPILER_ID STREQUAL \"Clang\")\n  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7)\n    message(FATAL_ERROR \"clang must be version 7 or higher. Found ${CMAKE_CXX_COMPILER_VERSION}.\")\n  endif()\n\n  add_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:-Wall;-Wextra>\")\n\n  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11)\n    add_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:-Werror>\")\n  endif()\nendif()\n\nif(CMAKE_CXX_COMPILER_ID STREQUAL \"AppleClang\")\n  add_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:-Wall;-Wextra>\")\n\n  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12)\n    add_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:-Werror>\")\n  endif()\nendif()\n\nif(CMAKE_CXX_COMPILER_ID STREQUAL \"Intel\")\n  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.0.1)\n    message(FATAL_ERROR \"icpc must be 19.0.1 or higher. 
Found ${CMAKE_CXX_COMPILER_VERSION}.\")\n  endif()\n\n  # Avoid warnings when using noinline for methods defined inside class definition.\n  add_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:-wd2196>\")\nendif()\n\n# Enable architecture-specific optimizations\ninclude(CheckArchFlags)\nif(ARCH_FLAGS_FOUND)\n  add_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:${ARCH_CXX_FLAGS}>\")\n  add_compile_options(\"$<$<COMPILE_LANGUAGE:C>:${ARCH_C_FLAGS}>\")\n  add_link_options(${ARCH_LINK_FLAGS})\nendif()\n\nif(CXX_CLANG_TIDY)\n  set(CMAKE_CXX_CLANG_TIDY ${CXX_CLANG_TIDY} \"-header-filter=.*${PROJECT_SOURCE_DIR}.*\")\n  # Ignore warning flags intended for the CXX program. This only works because\n  # the two compilers we care about, clang and gcc, both understand\n  # -Wno-unknown-warning-option.\n  add_compile_options(\"$<$<COMPILE_LANGUAGE:CXX>:-Wno-unknown-warning-option>\")\nendif()\n\n###### Configure features ######\n\nif(GALOIS_ENABLE_VTUNE)\n  set(VTune_ROOT /opt/intel/vtune_amplifier)\n  find_package(VTune REQUIRED)\n  include_directories(${VTune_INCLUDE_DIRS})\n  add_definitions(-DGALOIS_ENABLE_VTUNE)\nendif()\n\nif(GALOIS_ENABLE_PAPI)\n  find_package(PAPI REQUIRED)\n  include_directories(${PAPI_INCLUDE_DIRS})\n  add_definitions(-DGALOIS_ENABLE_PAPI)\nendif()\n\nfind_package(Threads REQUIRED)\n\ninclude(CheckMmap)\n\ninclude(CheckHugePages)\nif(NOT HAVE_HUGEPAGES AND GALOIS_STRICT_CONFIG)\n  message(FATAL_ERROR \"Need huge pages\")\nendif()\n\nfind_package(Boost 1.58.0 REQUIRED COMPONENTS serialization iostreams)\n\nfind_package(LLVM REQUIRED CONFIG)\nif(\"${LLVM_PACKAGE_VERSION}\" VERSION_LESS \"7\")\n  message(FATAL_ERROR \"LLVM 7 or greater is required.\")\nendif()\nif(NOT DEFINED LLVM_ENABLE_RTTI)\n  message(FATAL_ERROR \"Could not determine if LLVM has RTTI enabled.\")\nendif()\nif(NOT ${LLVM_ENABLE_RTTI})\n  message(FATAL_ERROR \"Galois requires a build of LLVM that includes RTTI. 
Most package managers do this already, but if you built LLVM from source you need to configure it with `-DLLVM_ENABLE_RTTI=ON`\")\nendif()\ntarget_include_directories(LLVMSupport INTERFACE ${LLVM_INCLUDE_DIRS})\n\ninclude(HandleSanitizer)\n\ninclude(CheckEndian)\n\n###### Test Inputs ######\n\nif(GALOIS_GRAPH_LOCATION)\n  set(BASEINPUT \"${GALOIS_GRAPH_LOCATION}\")\n  set(BASEOUTPUT \"${GALOIS_GRAPH_LOCATION}\")\n  message(STATUS \"Using graph input and output location ${GALOIS_GRAPH_LOCATION}\")\nelseif(EXISTS /net/ohm/export/iss)\n  set(BASEINPUT /net/ohm/export/iss/inputs)\n  MESSAGE(STATUS \"Using graph input location /net/ohm/export/iss/inputs\")\n  set(BASEOUTPUT /net/ohm/export/iss/dist-outputs)\n  MESSAGE(STATUS \"Using graph output location /net/ohm/export/iss/dist-outputs\")\nelse()\n  set(BASEINPUT \"${PROJECT_BINARY_DIR}/inputs\")\n  set(BASEOUTPUT \"${PROJECT_BINARY_DIR}/inputs\")\n  message(STATUS \"Use 'make input' to download inputs and outputs in the build directory\")\nendif()\n\n###### Source finding ######\n\nadd_custom_target(lib)\nadd_custom_target(apps)\n\n# Core libraries (lib)\nadd_subdirectory(libsupport)\nadd_subdirectory(libgalois)\nadd_subdirectory(libpygalois)\nif (GALOIS_ENABLE_DIST)\n  find_package(MPI REQUIRED)\n  add_subdirectory(libdist)\n  add_subdirectory(libcusp)\n  add_subdirectory(libgluon)\nendif()\nstring(COMPARE NOTEQUAL \"${GALOIS_CUDA_CAPABILITY}\" \"\" GALOIS_ENABLE_GPU)\nif (GALOIS_ENABLE_GPU)\n  enable_language(CUDA)\n  foreach(GENCODE ${GALOIS_CUDA_CAPABILITY})\n    string(REPLACE \".\" \"\" GENCODE ${GENCODE})\n    add_compile_options(\"$<$<COMPILE_LANGUAGE:CUDA>:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>\")\n  endforeach()\n\n  # This is necessary to allow building for CUDA 11.x (where CUB is bundled) and earlier versions (where CUB is not included)\n  add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK)\n\n  add_subdirectory(libgpu)\nendif()\nadd_subdirectory(libpangolin)\n\n# Applications 
(apps)\nadd_subdirectory(lonestar)\n\nadd_subdirectory(scripts)\nadd_subdirectory(inputs)\nadd_subdirectory(tools)\n\nif(USE_EXP)\n  add_subdirectory(lonestar/experimental)\nendif(USE_EXP)\n\n###### Documentation ######\n\nif(BUILD_DOCS)\n  set(GALOIS_ROOT ${CMAKE_CURRENT_SOURCE_DIR})\n  add_subdirectory(docs)\nendif()\n\n###### Installation ######\n\ninclude(CMakePackageConfigHelpers)\nwrite_basic_package_version_file(\n  ${CMAKE_CURRENT_BINARY_DIR}/GaloisConfigVersion.cmake\n  VERSION ${GALOIS_VERSION}\n  COMPATIBILITY SameMajorVersion\n)\nconfigure_package_config_file(\n  cmake/GaloisConfig.cmake.in\n  ${CMAKE_CURRENT_BINARY_DIR}/GaloisConfig.cmake\n  INSTALL_DESTINATION \"${CMAKE_INSTALL_LIBDIR}/cmake/Galois\"\n  PATH_VARS CMAKE_INSTALL_INCLUDEDIR CMAKE_INSTALL_LIBDIR CMAKE_INSTALL_BINDIR\n)\ninstall(\n  FILES \"${CMAKE_CURRENT_BINARY_DIR}/GaloisConfigVersion.cmake\" \"${CMAKE_CURRENT_BINARY_DIR}/GaloisConfig.cmake\"\n  DESTINATION \"${CMAKE_INSTALL_LIBDIR}/cmake/Galois\"\n  COMPONENT dev\n)\ninstall(\n  EXPORT GaloisTargets\n  NAMESPACE Galois::\n  DESTINATION \"${CMAKE_INSTALL_LIBDIR}/cmake/Galois\"\n  COMPONENT dev\n)\n\n###### Distribution ######\n\nset(CPACK_GENERATOR \"TGZ\")\nset(CPACK_ARCHIVE_COMPONENT_INSTALL ON)\nset(CPACK_RESOURCE_FILE_LICENSE \"${CMAKE_CURRENT_SOURCE_DIR}/COPYRIGHT\")\nset(CPACK_RESOURCE_FILE_README \"${CMAKE_CURRENT_SOURCE_DIR}/README.md\")\nset(CPACK_PACKAGE_VERSION_MAJOR ${GALOIS_VERSION_MAJOR})\nset(CPACK_PACKAGE_VERSION_MINOR ${GALOIS_VERSION_MINOR})\nset(CPACK_PACKAGE_VERSION_PATCH ${GALOIS_VERSION_PATCH})\ninclude(CPack)\n"
  },
  {
    "path": "COPYRIGHT",
    "content": "Galois, a framework to exploit amorphous data-parallelism in irregular\nprograms.\n\nCopyright (C) 2018, The University of Texas at Austin. All rights reserved.\nUNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\nSOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\nPERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\nDEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\nRESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\nshall University be liable for incidental, special, indirect, direct or\nconsequential damages or loss of profits, interruption of business, or\nrelated expenses which may arise from use of Software or Documentation,\nincluding but not limited to those resulting from defects in Software and/or\nDocumentation, or loss or inaccuracy of data of any kind.\n\nThis software is released under the terms of the 3-Clause BSD License (a\ncopy is located in LICENSE.txt at the top-level directory).\n"
  },
  {
    "path": "LICENSE.txt",
    "content": "The 3-Clause BSD License\n\nCopyright 2018 The University of Texas at Austin\n\nRedistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n\n3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "README.md",
    "content": "Overview\n========\n\n[![CircleCI](https://circleci.com/gh/IntelligentSoftwareSystems/Galois.svg?style=svg)](https://circleci.com/gh/IntelligentSoftwareSystems/Galois)\n[![Build Status](https://travis-ci.org/IntelligentSoftwareSystems/Galois.svg?branch=master)](https://travis-ci.org/IntelligentSoftwareSystems/Galois)\n\nGalois is a C++ library designed to ease parallel programming, especially for\napplications with irregular parallelism (e.g., irregular amount of work in parallel\nsections, irregular memory accesses and branching patterns). It implements\nan implicitly parallel programming model, where the programmer replaces serial loop\nconstructs (e.g. for and while) and serial data structures in their algorithms with parallel loop\nconstructs and concurrent data structures provided by Galois to express their algorithms.\nGalois is designed so that the programmer does not have to deal with low-level parallel programming constructs such as\nthreads, locks, barriers, condition variables, etc. \n\nHighlights include:\n- Parallel *for_each* loop that handles dependencies between iterations, as well as\n  dynamic work creation, and a *do_all* loop for simple parallelism. Both provide load balancing and excellent\n  scalability on multi-socket systems\n- A concurrent graph library designed for graph analytics algorithms as well as\n  other domains such as irregular meshes. \n- Scalable concurrent containers such as bag, vector, list, etc. \n\nGalois is released under the BSD-3-Clause license. 
\n\n\nBuilding Galois\n===============\n\nYou can checkout the latest release by typing (in a terminal):\n\n```Shell\ngit clone -b release-5.0 https://github.com/IntelligentSoftwareSystems/Galois\n```\n\nThe master branch will be regularly updated, so you may try out the latest\ndevelopment code as well by checking out master branch:\n\n```Shell\ngit clone https://github.com/IntelligentSoftwareSystems/Galois\n```\n\nDependencies\n------------\n\nGalois builds, runs, and has been tested on GNU/Linux. Even though\nGalois may build on systems similar to Linux, we have not tested correctness or performance, so please\nbeware. \n\nAt the minimum, Galois depends on the following software:\n\n- A modern C++ compiler compliant with the C++-17 standard (gcc >= 7, Intel >= 19.0.1, clang >= 7.0)\n- CMake (>= 3.13)\n- Boost library (>= 1.58.0, we recommend building/installing the full library)\n- libllvm (>= 7.0 with RTTI support)\n- libfmt (>= 4.0)\n\nHere are the dependencies for the optional features: \n\n- Linux HUGE_PAGES support (please see [www.kernel.org/doc/Documentation/vm/hugetlbpage.txt](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt)). Performance will most likely degrade without HUGE_PAGES\n  enabled. Galois uses 2MB huge page size and relies on the kernel configuration to set aside a large amount of 2MB pages. For example, our performance testing machine (4x14 cores, 192GB RAM) is configured to support up to 65536 2MB pages:\n  ```Shell\n  cat /proc/meminfo | fgrep Huge\n  AnonHugePages:    104448 kB\n  HugePages_Total:   65536\n  HugePages_Free:    65536\n  HugePages_Rsvd:        0\n  HugePages_Surp:        0\n  Hugepagesize:       2048 kB\n  ```\n\n- libnuma support. Performance may degrade without it. Please install\n  libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. 
\n- Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files \n- PAPI (>= 5.2.0.0 ) for profiling sections of code\n- Vtune (>= 2017 ) for profiling sections of code\n- MPICH2 (>= 3.2) if you are interested in building and running distributed system\n  applications in Galois\n- CUDA (>= 8.0 and < 11.0) if you want to build GPU or distributed heterogeneous applications.\n  Note that versions >= 11.0 use an incompatible CUB module and will fail to execute.\n- Eigen (3.3.1 works for us) for some matrix-completion app variants\n\n\nCompiling and Testing Galois\n----------------------------\nWe use CMake to streamline building, testing and installing Galois. In the\nfollowing, we will highlight some common commands.\n\nLet's assume that `SRC_DIR` is the directory where the source code for Galois\nresides, and you wish to build Galois in some `BUILD_DIR`. Run the following\ncommands to set up a build directory:\n\n```Shell\nSRC_DIR=`pwd` # Or top-level Galois source dir\nBUILD_DIR=<path-to-your-build-dir>\n\nmkdir -p $BUILD_DIR\ncmake -S $SRC_DIR -B $BUILD_DIR -DCMAKE_BUILD_TYPE=Release\n```\n\nYou can also set up a `Debug` build by running the following instead of the last command above:\n\n```Shell\ncmake -S $SRC_DIR -B $BUILD_DIR -DCMAKE_BUILD_TYPE=Debug\n```\n\nGalois applications are in `lonestar` directory.  In order to build a particular application:\n\n```Shell\nmake -C $BUILD_DIR/lonestar/<app-dir-name> -j\n# or alternatively\nmake -C $BUILD_DIR <app-executable-name> -j\n# or\ncmake --build $BUILD_DIR --target <app-executable-name> --parallel\n```\n\nYou can also build everything by running `make -j` in the top-level of build directory, but that may\ntake a lot of time.\n\nSetting the `BUILD_SHARED_LIBS` to `ON` when calling CMake will make the core runtime library be built as a shared object instead of a static library.\n\nThe tests for the core runtime will be built by default when you run `make`\nwith no target specified. 
They can be also built explicitly with:\n\n```Shell\nmake -C $BUILD_DIR/test\n```\n\nWe provide a few sample inputs that can be downloaded by running:\n\n```Shell\nmake -C $BUILD_DIR input\n```\n\n`make input` will download a tarball of inputs and extract it to\n`$BUILD_DIR/inputs/small_inputs` directory. The tarball is downloaded to\n`$BUILD_DIR/inputs`\n\nMost of the Galois apps have corresponding tests.\nThese tests depend on downloading the reference inputs and building the corresponding apps and test binaries.\nOnce the reference inputs have been downloaded and everything has been built,\nthe tests for the core library and all the apps can be run by running:\n\n```Shell\nmake test\n# or alternatively\nctest\n```\n\nin the build directory.\n\n\nRunning Galois Applications\n===========================\n\nGraph Format\n------------\n\nMany Galois/Lonestar applications work with graphs. We store graphs in a binary format\ncalled *galois graph file* \n(`.gr` file extension). Other formats such as edge-list or Matrix-Market can be\nconverted to `.gr` format with `graph-convert` tool provided in galois. \nYou can build graph-convert as follows:\n\n```Shell\ncd $BUILD_DIR\nmake graph-convert\n./tools/graph-convert/graph-convert --help\n```\n\nOther applications, such as Delaunay Mesh Refinement may read special file formats\nor some may even generate random inputs on the fly. \n\nRunning\n-------\n\nAll Lonestar applications take a `-t` command-line option to specify the number of\nthreads to use. All applications run a basic sanity check (often insufficient for\ncorrectness) on the program output, which can be turned off with the `-noverify` option. You \ncan specify `-help` command-line option to print all available options. \n\nUpon successful completion, each application will produce some stats regarding running\ntime of various sections, parallel loop iterations and memory usage, etc. 
These\nstats are in CSV format and can be redirected to a file using `-statFile` option.\nPlease refer to the manual for details on stats. \n\nRunning LonestarGPU applications\n--------------------------\n\nPlease refer to `lonestar/analytics/gpu/README.md` and `lonestar/scientific/gpu/README.md` for more details on\ncompiling and running LonestarGPU applications.\n\nRunning Distributed Galois\n--------------------------\n\nPlease refer to `lonestar/analytics/distributed/README.md` for more details on\nrunning distributed benchmarks.\n\nDocumentation\n=============\n\nGalois documentation is produced using doxygen, included in this repository, which includes a tutorial, a user's\nmanual and API documentation for the Galois library. \n\nUsers can build doxygen documentation in the build directory using:\n\n```Shell\ncd $BUILD_DIR\nmake doc\nyour-fav-browser html/index.html &\n```\n\nSee online documentation at:\n [http://iss.ices.utexas.edu/?p=projects/galois](http://iss.ices.utexas.edu/?p=projects/galois)\n\nSource-Tree Organization\n========================\n\n- `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. \n- `lonestar` contains the Lonestar benchmark applications and tutorial examples for Galois\n- `libdist` contains the source code for the distributed-memory and heterogeneous Galois library\n- `lonestardist` contains the source code for the distributed-memory and heterogeneous\n  benchmark applications. Please refer to `lonestardist/README.md` for instructions on\n  building and running these apps. \n- `tools` contains various helper programs such as graph-converter to convert\n  between graph file formats and graph-stats to print graph properties\n\nUsing Galois as a library\n=========================\n\nThere are two common ways to use Galois as a library. One way is to copy this\nrepository into your own CMake project, typically using a git submodule. 
Then\nyou can put the following in your CMakeLists.txt:\n\n```CMake\nadd_subdirectory(galois EXCLUDE_FROM_ALL)\nadd_executable(app ...)\ntarget_link_libraries(app Galois::shmem)\n```\n\nThe other common method is to install Galois outside your project and import it\nas a package.\n\nIf you want to install Galois, assuming that you wish to install it under\n`INSTALL_DIR`:\n\n```Shell\ncmake -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR $SRC_DIR\nmake install\n```\n\nThen, you can put something like the following in CMakeLists.txt:\n\n```CMake\nlist(APPEND CMAKE_PREFIX_PATH ${INSTALL_DIR})\nfind_package(Galois REQUIRED)\nadd_executable(app ...)\ntarget_link_libraries(app Galois::shmem)\n```\n\nIf you are not using CMake, the corresponding basic commands (although the\nspecific commands vary by system) are:\n\n```Shell\nc++ -std=c++17 app.cpp -I$INSTALL_DIR/include -L$INSTALL_DIR/lib -lgalois_shmem\n```\n\nThird-Party Libraries and Licensing\n====================\n\nGalois includes some third party libraries that do not use the same license as\nGalois. This includes the bliss library (located in lonestar/include/Mining/bliss)\nand Modern GPU (located in libgpu/moderngpu). Please be aware of this when\nusing Galois.\n\nContact Us\n==========\nFor bugs, please raise an\n[issue](https://github.com/IntelligentSoftwareSystems/Galois/issues) on\nGitHub.\nQuestions and comments are also welcome at the Galois users mailing list:\n[galois-users@utlists.utexas.edu](mailto:galois-users@utlists.utexas.edu). You may\n[subscribe here](https://utlists.utexas.edu/sympa/subscribe/galois-users).\n\nIf you find a bug, it would help us if you sent (1) the command line and\nprogram inputs and outputs and (2) a core dump, preferably from an executable\nbuilt with the debug build.\n\nYou can enable core dumps by setting `ulimit -c unlimited` before running your\nprogram. 
The location where the core dumps will be stored can be determined with\n`cat /proc/sys/kernel/core_pattern`.\n\nTo create a debug build, assuming you will build Galois in `BUILD_DIR` and the\nsource is in `SRC_DIR`:\n\n```Shell\ncmake -S $SRC_DIR -B $BUILD_DIR -DCMAKE_BUILD_TYPE=Debug\nmake -C $BUILD_DIR\n```\n\nA simple way to capture relevant debugging details is to use the `script`\ncommand, which will record your terminal input and output. For example,\n\n```Shell\nscript debug-log.txt\nulimit -c unlimited\ncat /proc/sys/kernel/core_pattern\nmake -C $BUILD_DIR <my-app> VERBOSE=1\nmy-app with-failing-input\nexit\n```\n\nThis will generate a file `debug-log.txt`, which you can send to the mailing\nlist: [galois-users@utlists.utexas.edu](mailto:galois-users@utlists.utexas.edu) for\nfurther debugging or supply when opening a GitHub issue.\n"
  },
  {
    "path": "cmake/GaloisConfig.cmake.in",
    "content": "# Config file for the Galois package\n#\n# It exports the following targets:\n#   Galois::shmem\n#   Galois::dist\n#   ...\n#   (see GaloisTargets.cmake for all of them)\n#\n# It defines the following variables for legacy importing:\n#   Galois_INCLUDE_DIRS\n#   Galois_LIBRARIES\n#   Galois_LIBRARY_DIRS\n#   Galois_BIN_DIRS\ninclude(CMakeFindDependencyMacro)\n\n@PACKAGE_INIT@\n\nset_and_check(Galois_INCLUDE_DIRS \"@PACKAGE_CMAKE_INSTALL_INCLUDEDIR@\")\nset_and_check(Galois_LIBRARY_DIRS \"@PACKAGE_CMAKE_INSTALL_LIBDIR@\")\nset_and_check(Galois_BIN_DIRS \"@PACKAGE_CMAKE_INSTALL_BINDIR@\")\nset(Galois_LIBRARIES galois_shmem)\n\nfind_dependency(Threads REQUIRED)\nfind_dependency(Boost 1.58.0 REQUIRED COMPONENTS serialization iostreams)\nif (@GALOIS_ENABLE_DIST@)\n  find_dependency(MPI REQUIRED)\nendif()\n\nget_filename_component(GALOIS_CMAKE_DIR \"${CMAKE_CURRENT_LIST_FILE}\" PATH)\n\nif(NOT Galois::shmem)\n  include(\"${GALOIS_CMAKE_DIR}/GaloisTargets.cmake\")\nendif()\n"
  },
  {
    "path": "cmake/Modules/CheckArchFlags.cmake",
    "content": "# Find architecture-specific flags\n#\n# Once done this will define\n#  ARCH_FLAGS_FOUND\n#  ARCH_CXX_FLAGS - Compiler flags to enable architecture-specific optimizations\n#  ARCH_C_FLAGS - Compiler flags to enable architecture-specific optimizations\n#  ARCH_LINK_FLAGS - Compiler flags to enable architecture-specific optimizations\ninclude(CheckCXXCompilerFlag)\n\nif(NOT USE_ARCH OR USE_ARCH STREQUAL \"none\" OR ARCH_FLAGS_FOUND)\n  set(ARCH_CXX_FLAGS_CANDIDATES)\nelse()\n  set(ARCH_CXX_FLAGS_CANDIDATES \"-march=${USE_ARCH}\")\nendif()\n\nif(USE_ARCH STREQUAL \"mic\")\n  if(CMAKE_CXX_COMPILER_ID MATCHES \"Intel\")\n    list(APPEND ARCH_CXX_FLAGS_CANDIDATES -mmic)\n  endif()\n\n  if(CMAKE_COMPILER_IS_GNUCC)\n    list(APPEND ARCH_CXX_FLAGS_CANDIDATES -march=knc)\n  endif()\nendif()\n\nforeach(FLAG ${ARCH_CXX_FLAGS_CANDIDATES})\n  message(STATUS \"Try architecture flag = [${FLAG}]\")\n  unset(ARCH_CXX_FLAGS_DETECTED)\n  check_cxx_compiler_flag(\"${FLAG}\" ARCH_CXX_FLAGS_DETECTED)\n  if(ARCH_CXX_FLAGS_DETECTED)\n    set(ARCH_FLAGS_FOUND \"YES\")\n    set(ARCH_CXX_FLAGS \"${FLAG}\")\n    set(ARCH_C_FLAGS \"${FLAG}\")\n    set(ARCH_LINK_FLAGS \"${FLAG}\")\n  endif()\nendforeach()\n"
  },
  {
    "path": "cmake/Modules/CheckCilk.cmake",
    "content": "include(CheckCXXSourceCompiles)\nset(Cilk_CXX_TEST_SOURCE\n\"\n#include <cilk/cilk.h>\nint main(){ cilk_for(int i=0;i<1; ++i); }\n\")\nCHECK_CXX_SOURCE_COMPILES(\"${Cilk_CXX_TEST_SOURCE}\" HAVE_CILK)\nif(HAVE_CILK)\n  message(STATUS \"A compiler with CILK support found\")\nendif()\n"
  },
  {
    "path": "cmake/Modules/CheckEndian.cmake",
    "content": "include(TestBigEndian)\nTEST_BIG_ENDIAN(HAVE_BIG_ENDIAN)\ninclude(CheckIncludeFiles)\nCHECK_INCLUDE_FILES(endian.h HAVE_ENDIAN_H)\ninclude(CheckSymbolExists)\nCHECK_SYMBOL_EXISTS(le64toh \"endian.h\" HAVE_LE64TOH)\nCHECK_SYMBOL_EXISTS(le32toh \"endian.h\" HAVE_LE32TOH)\nCHECK_SYMBOL_EXISTS(htobe64 \"endian.h\" HAVE_HTOBE64)\nCHECK_SYMBOL_EXISTS(htobe32 \"endian.h\" HAVE_HTOBE32)\nCHECK_SYMBOL_EXISTS(htole64 \"endian.h\" HAVE_HTOLE64)\nCHECK_SYMBOL_EXISTS(htole32 \"endian.h\" HAVE_HTOLE32)\n"
  },
  {
    "path": "cmake/Modules/CheckHugePages.cmake",
    "content": "include(CheckCSourceRuns)\nset(HugePages_C_TEST_SOURCE\n\"\n#ifdef __linux__\n#include <linux/mman.h>\n#endif\n#include <sys/mman.h>\n\nint main(int c, char** argv) {\n  void *ptr = mmap(0, 2*1024*1024, PROT_READ|PROT_WRITE, MAP_HUGETLB, -1, 0);\n\n  return ptr != MAP_FAILED;\n}\n\")\nif(HAVE_HUGEPAGES)\n\nelse()\n  CHECK_C_SOURCE_RUNS(\"${HugePages_C_TEST_SOURCE}\" HAVE_HUGEPAGES_INTERNAL)\n  if(HAVE_HUGEPAGES_INTERNAL)\n    message(STATUS \"Huge pages found\")\n    set(HAVE_HUGEPAGES \"${HAVE_HUGEPAGES_INTERNAL}\" CACHE BOOL \"Have hugepages\")\n  endif()\nendif()\n"
  },
  {
    "path": "cmake/Modules/CheckMmap.cmake",
    "content": "include(CheckCSourceCompiles)\nset(Mmap64_C_TEST_SOURCE\n\"\n#ifdef __linux__\n#include <linux/mman.h>\n#endif\n#include <sys/mman.h>\n\nint main(int c, char** argv) {\n  void *ptr = mmap64(0, 2*1024*1024, PROT_READ|PROT_WRITE, MAP_PRIVATE, -1, 0);\n  return 0;\n}\n\")\n\nif(HAVE_MMAP64)\n\nelse()\n  CHECK_C_SOURCE_COMPILES(\"${Mmap64_C_TEST_SOURCE}\" HAVE_MMAP64_INTERNAL)\n  if(HAVE_MMAP64_INTERNAL)\n    message(STATUS \"mmap64 found\")\n    set(HAVE_MMAP64 \"${HAVE_MMAP64_INTERNAL}\" CACHE BOOL \"Have mmap64\")\n  endif()\nendif()\n"
  },
  {
    "path": "cmake/Modules/CheckSchedSetAffinity.cmake",
    "content": "include(CheckSymbolExists)\n\nif(SCHED_SETAFFINITY_FOUND)\n\nelse()\n  set(CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE)\n  CHECK_SYMBOL_EXISTS(sched_setaffinity sched.h HAVE_SCHED_SETAFFINITY_INTERNAL)\n  if(HAVE_SCHED_SETAFFINITY_INTERNAL)\n    message(STATUS \"sched_setaffinity found\")\n    set(SCHED_SETAFFINITY_FOUND \"${HAVE_SCHED_SETAFFINITY_INTERNAL}\")\n    set(SCHED_SETAFFINITY_LIBRARIES rt)\n  endif()\nendif()\n"
  },
  {
    "path": "cmake/Modules/FindCBLAS.cmake",
    "content": "# Copyright 2009-2011 The VOTCA Development Team (http://www.votca.org)\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n#! \\file\n#! \\ingroup FindPackage\n#! \\brief Find CBLAS\n#!\n#! Find the native CBLAS headers and libraries.\n#!\n#! - `CBLAS_LIBRARIES`    - List of libraries when using cblas.\n#! - `CBLAS_INCLUDE_DIRS` - List of include directories\n#! - `CBLAS_FOUND`        - True if cblas found.\n#!\n#! Cblas can be provided by libblas (Ubuntu), cblas or gslcblas, it will be searched for in\n#! 
this order.\n\ninclude(LibFindMacros)\n\nif (UNIX)\n  find_package(PkgConfig QUIET)\n  pkg_check_modules(CBLAS_PKGCONF QUIET cblas)\nendif()\n\nif (NOT CBLAS_FOUND)\n\nif(CBLAS_PKGCONF_FOUND)\n\nforeach(NEW_CBLAS_LIB ${CBLAS_PKGCONF_LIBRARIES})\n  find_library(LIB_${NEW_CBLAS_LIB} ${NEW_CBLAS_LIB} HINTS ${CBLAS_PKGCONF_LIBRARY_DIRS})\n  if(NOT LIB_${NEW_CBLAS_LIB})\n    message(FATAL_ERROR \"Could not find ${NEW_CBLAS_LIB} where pkgconfig said it is: ${CBLAS_PKGCONF_LIBRARY_DIRS}\")\n  else(NOT LIB_${NEW_CBLAS_LIB})\n    message(STATUS \"Found ${LIB_${NEW_CBLAS_LIB}}.\")\n  endif(NOT LIB_${NEW_CBLAS_LIB})\n  set(CBLAS_LIBRARY ${CBLAS_LIBRARY} ${LIB_${NEW_CBLAS_LIB}})\nendforeach(NEW_CBLAS_LIB)\n\nelse(CBLAS_PKGCONF_FOUND)\n\nset(CBLAS_HINT_PATH $ENV{CBLASDIR}/lib $ENV{CBLASDIR}/lib64 $ENV{UIBK_GSL_LIB})\n\n# Check if libblas provides cblas (Ubuntu)\nfind_library(BLAS_LIBRARY NAMES blas PATHS ${CBLAS_HINT_PATH})\nif(BLAS_LIBRARY)\n  include(CheckSymbolExists)\n  set(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARY})\n  check_symbol_exists(cblas_scopy \"cblas.h\" BLAS_HAS_CBLAS)\nendif(BLAS_LIBRARY)\n\nset(CBLAS_CANDIDATES cblas gslcblas)\nif(BLAS_HAS_CBLAS)\n  message(STATUS \"libblas provides cblas.\")\n  set(CBLAS_CANDIDATES blas ${CBLAS_CANDIDATES})\nendif(BLAS_HAS_CBLAS)\n\nfind_library(CBLAS_LIBRARY\n  NAMES ${CBLAS_CANDIDATES}\n  PATHS ${CBLAS_HINT_PATH}\n)\nendif(CBLAS_PKGCONF_FOUND)\n\nif(\"${CBLAS_LIBRARY}\" MATCHES gslcblas)\n  set(CBLAS_INCLUDE_CANDIDATE gsl/gsl_cblas.h)\nelse(\"${CBLAS_LIBRARY}\" MATCHES gslcblas)\n  set(CBLAS_INCLUDE_CANDIDATE cblas.h)\nendif(\"${CBLAS_LIBRARY}\" MATCHES gslcblas)\n\nfind_path(CBLAS_INCLUDE_DIR ${CBLAS_INCLUDE_CANDIDATE} HINTS ${CBLAS_PKGCONF_INCLUDE_DIRS} $ENV{CBLASDIR}/include $ENV{UIBK_GSL_INC})\n\n# Set the include dir variables and the libraries and let libfind_process do the rest.\n# NOTE: Singular variables for this library, plural for libraries this this lib depends on.\nset(CBLAS_PROCESS_INCLUDES 
CBLAS_INCLUDE_DIR)\nset(CBLAS_PROCESS_LIBS CBLAS_LIBRARY)\nlibfind_process(CBLAS)\nmessage(STATUS \"Using '${CBLAS_LIBRARIES}' for cblas.\")\n\nendif(NOT CBLAS_FOUND)\n"
  },
  {
    "path": "cmake/Modules/FindFortran.cmake",
    "content": "# Check if Fortran is possibly around before using enable_language because\n# enable_language(... OPTIONAL) does not fail gracefully if language is not\n# found:\n#  http://public.kitware.com/Bug/view.php?id=9220\nset(Fortran_EXECUTABLE)\nif(Fortran_EXECUTABLE)\n  set(Fortran_FIND_QUIETLY TRUE)\nendif()\nfind_program(Fortran_EXECUTABLE NAMES gfortran ifort g77 f77 g90 f90)\ninclude(FindPackageHandleStandardArgs)\nfind_package_handle_standard_args(Fortran DEFAULT_MSG Fortran_EXECUTABLE)\nif(FORTRAN_FOUND)\n  set(Fortran_FOUND TRUE)\nendif()\n"
  },
  {
    "path": "cmake/Modules/FindGASNET.cmake",
    "content": "# Find the GasNet library\n#  GASNET_FOUND - system has GasNet lib\n#  GASNET_INCLUDE_DIRS - the GasNet include directory\n#  GASNET_LIBRARIES - Libraries needed to use GasNet\n\nif(GASNET_INCLUDE_DIRS AND GASNET_LIBRARIES)\n  set(GASNET_FIND_QUIETLY TRUE)\nendif()\n\nfind_path(GASNET_INCLUDE_DIRS NAMES gasnet.h)\nfind_library(GASNET_LIBRARY_1 NAMES gasnet amudp HINTS ${GASNET_INCLUDE_DIRS}/../lib )\nfind_library(GASNET_LIBRARY_2 NAMES gasnet gasnet-udp-par HINTS ${GASNET_INCLUDE_DIRS}/../lib )\n\nset(GASNET_LIBRARIES ${GASNET_LIBRARY_2} ${GASNET_LIBRARY_1})\n\ninclude(FindPackageHandleStandardArgs)\nfind_package_handle_standard_args(GASNET DEFAULT_MSG GASNET_INCLUDE_DIRS GASNET_LIBRARIES)\n\nmark_as_advanced(GASNET_INCLUDE_DIRS GASNET_LIBRARIES)\n"
  },
  {
    "path": "cmake/Modules/FindGMP.cmake",
    "content": "# Find the GMP libraries\n#  GMP_FOUND - system has GMP lib\n#  GMP_INCLUDE_DIRS - the GMP include directory\n#  GMP_LIBRARIES - Libraries needed to use GMP\n\n# Copyright (c) 2006, Laurent Montel, <montel@kde.org>\n#\n# Redistribution and use is allowed according to the terms of the BSD license.\n# For details see the accompanying COPYING-CMAKE-SCRIPTS file.\n\nif(GMP_INCLUDE_DIRS AND GMP_LIBRARIES AND GMPXX_LIBRARIES)\n  set(GMP_FIND_QUIETLY TRUE)\nendif()\n\nfind_path(GMP_INCLUDE_DIRS NAMES gmp.h)\nfind_library(GMP_LIBRARIES NAMES gmp libgmp)\nfind_library(GMPXX_LIBRARIES NAMES gmpxx libgmpxx)\n\ninclude(FindPackageHandleStandardArgs)\nfind_package_handle_standard_args(GMP DEFAULT_MSG GMP_INCLUDE_DIRS GMP_LIBRARIES)\n\nmark_as_advanced(GMP_INCLUDE_DIRS GMP_LIBRARIES GMPXX_LIBRARIES)\n"
  },
  {
    "path": "cmake/Modules/FindGit.cmake",
    "content": "# The module defines the following variables:\n#   GIT_EXECUTABLE - path to git command line client\n#   GIT_FOUND - true if the command line client was found\n# Example usage:\n#   find_package(Git)\n#   if(GIT_FOUND)\n#     message(\"git found: ${GIT_EXECUTABLE}\")\n#   endif()\n\n#=============================================================================\n# Copyright 2010 Kitware, Inc.\n#\n# Distributed under the OSI-approved BSD License (the \"License\");\n# see accompanying file Copyright.txt for details.\n#\n# This software is distributed WITHOUT ANY WARRANTY; without even the\n# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n# See the License for more information.\n#=============================================================================\n# (To distributed this file outside of CMake, substitute the full\n#  License text for the above reference.)\n\n# Look for 'git' or 'eg' (easy git)\n#\nset(git_names git eg)\n\n# Prefer .cmd variants on Windows unless running in a Makefile\n# in the MSYS shell.\n#\nif(WIN32)\n  if(NOT CMAKE_GENERATOR MATCHES \"MSYS\")\n    set(git_names git.cmd git eg.cmd eg)\n  endif()\nendif()\n\nfind_program(GIT_EXECUTABLE\n  NAMES ${git_names}\n  DOC \"git command line client\"\n  )\nmark_as_advanced(GIT_EXECUTABLE)\n\n# Handle the QUIETLY and REQUIRED arguments and set GIT_FOUND to TRUE if\n# all listed variables are TRUE\n\ninclude(FindPackageHandleStandardArgs)\nfind_package_handle_standard_args(Git DEFAULT_MSG GIT_EXECUTABLE)\n"
  },
  {
    "path": "cmake/Modules/FindNUMA.cmake",
    "content": "# Find numa library\n# Once done this will define\n#  NUMA_FOUND - libnuma found\n#  NUMA_OLD - old libnuma API\nif(NOT NUMA_FOUND)\n  find_library(NUMA_LIBRARY NAMES numa PATH_SUFFIXES lib lib64)\n  if(NUMA_LIBRARY)\n    include(CheckLibraryExists)\n    check_library_exists(${NUMA_LIBRARY} numa_available \"\" NUMA_FOUND_INTERNAL)\n    if(NUMA_FOUND_INTERNAL)\n      check_library_exists(${NUMA_LIBRARY} numa_allocate_nodemask \"\" NUMA_NEW_INTERNAL)\n      if(NOT NUMA_NEW_INTERNAL)\n        set(NUMA_OLD \"yes\" CACHE)\n      endif()\n    endif()\n\n    include(FindPackageHandleStandardArgs)\n    find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARY)\n    mark_as_advanced(NUMA_FOUND)\n  endif()\nendif()\n\n"
  },
  {
    "path": "cmake/Modules/FindOpenCL.cmake",
    "content": "#\n#  This file taken from FindOpenCL project @ http://gitorious.com/findopencl\n#\n# - Try to find OpenCL\n# This module tries to find an OpenCL implementation on your system. It supports\n# AMD / ATI, Apple and NVIDIA implementations, but should work, too.\n#\n# Once done this will define\n#  OPENCL_FOUND        - system has OpenCL\n#  OPENCL_INCLUDE_DIRS  - the OpenCL include directory\n#  OPENCL_LIBRARIES    - link these to use OpenCL\n#\n# WIN32 should work, but is untested\n\nFIND_PACKAGE( PackageHandleStandardArgs )\n\nSET (OPENCL_VERSION_STRING \"0.1.0\")\nSET (OPENCL_VERSION_MAJOR 0)\nSET (OPENCL_VERSION_MINOR 1)\nSET (OPENCL_VERSION_PATCH 0)\n\nIF (APPLE)\n\n  FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC \"OpenCL lib for OSX\")\n  FIND_PATH(OPENCL_INCLUDE_DIRS opencl/cl.h DOC \"Include for OpenCL on OSX\")\n  FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS opencl/cl.hpp DOC \"Include for OpenCL CPP bindings on OSX\")\n\nELSE (APPLE)\n\n\tIF (WIN32)\n\t\n\t    FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h)\n\t    FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp)\n\t\n\t    # The AMD SDK currently installs both x86 and x86_64 libraries\n\t    # This is only a hack to find out architecture\n\t    IF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL \"AMD64\" )\n\t    \tSET(OPENCL_LIB_DIR \"$ENV{ATISTREAMSDKROOT}/lib/x86_64\")\n\t\t\tSET(OPENCL_LIB_DIR \"$ENV{ATIINTERNALSTREAMSDKROOT}/lib/x86_64\")\n\t    ELSE (${CMAKE_SYSTEM_PROCESSOR} STREQUAL \"AMD64\")\n\t    \tSET(OPENCL_LIB_DIR \"$ENV{ATISTREAMSDKROOT}/lib/x86\")\n\t   \t\tSET(OPENCL_LIB_DIR \"$ENV{ATIINTERNALSTREAMSDKROOT}/lib/x86\")\n\t    ENDIF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL \"AMD64\" )\n\n\t    # find out if the user asked for a 64-bit build, and use the corresponding \n\t    # 64 or 32 bit NVIDIA library paths to the search:\n\t    STRING(REGEX MATCH \"Win64\" ISWIN64 ${CMAKE_GENERATOR})\n\t    IF(\"${ISWIN64}\" STREQUAL \"Win64\") \n\t    \tFIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} 
$ENV{CUDA_PATH}/lib/x64)\n\t    ELSE(\"${ISWIN64}\" STREQUAL \"Win64\") \n\t    \tFIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/Win32)\n\t    ENDIF(\"${ISWIN64}\" STREQUAL \"Win64\") \n\n\t    GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)\n\t    \n\t    # On Win32 search relative to the library\n\t    FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS \"${_OPENCL_INC_CAND}\" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include)\n\t    FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS \"${_OPENCL_INC_CAND}\" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include)\n\t\n\tELSE (WIN32)\n\n            # Unix style platforms\n            FIND_LIBRARY(OPENCL_LIBRARIES OpenCL\n              ENV LD_LIBRARY_PATH\n            )\n\n            GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)\n            GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)\n\n            # The AMD SDK currently does not place its headers\n            # in /usr/include, therefore also search relative\n            # to the library\n            FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS ${_OPENCL_INC_CAND} $ENV{OPENCL_INCLUDE_DIRS} \"/usr/local/cuda/include\")\n            FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS ${_OPENCL_INC_CAND} $ENV{OPENCL_LIB_DIR} \"/usr/local/cuda/include\")\n\n\tENDIF (WIN32)\n\nENDIF (APPLE)\n\nFIND_PACKAGE_HANDLE_STANDARD_ARGS( OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )\n\nIF( _OPENCL_CPP_INCLUDE_DIRS )\n\tSET( OPENCL_HAS_CPP_BINDINGS TRUE )\n\tLIST( APPEND OPENCL_INCLUDE_DIRS ${_OPENCL_CPP_INCLUDE_DIRS} )\n\t# This is often the same, so clean up\n\tLIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS )\nENDIF( _OPENCL_CPP_INCLUDE_DIRS )\n\nMARK_AS_ADVANCED(\n  OPENCL_INCLUDE_DIRS\n)\n"
  },
  {
    "path": "cmake/Modules/FindPAPI.cmake",
    "content": "# Find PAPI libraries\n# Once done this will define\n#  PAPI_FOUND - System has PAPI\n#  PAPI_INCLUDE_DIRS - The PAPI include directories\n#  PAPI_LIBRARIES - The libraries needed to use PAPI\n\nif(PAPI_INCLUDE_DIRS AND PAPI_LIBRARIES)\n  set(PAPI_FIND_QUIETLY TRUE)\nendif()\n\n# XXX(ddn): our system papi is broken so ignore for now\n# find_path(PAPI_INCLUDE_DIRS papi.h HINTS ${PAPI_ROOT} PATH_SUFFIXES include NO_DEFAULT_PATH )\nfind_path(PAPI_INCLUDE_DIRS papi.h HINTS ${PAPI_ROOT} ENV TACC_PAPI_DIR PATH_SUFFIXES include)\nmessage(STATUS \"PAPI_INCLUDE_DIRS: ${PAPI_INCLUDE_DIRS}\")\nfind_library(PAPI_LIBRARY NAMES papi HINTS ${PAPI_ROOT} ENV TACC_PAPI_DIR PATH_SUFFIXES lib lib64)\nmessage(STATUS \"PAPI_LIBRARY: ${PAPI_LIBRARY}\")\nfind_library(PAPI_LIBRARIES NAMES rt PATH_SUFFIXES lib lib64)\n\ninclude(FindPackageHandleStandardArgs)\nfind_package_handle_standard_args(PAPI DEFAULT_MSG PAPI_LIBRARY PAPI_LIBRARIES PAPI_INCLUDE_DIRS)\nif(PAPI_FOUND)\n  set(PAPI_LIBRARIES ${PAPI_LIBRARY} ${PAPI_LIBRARIES})\nendif()\n\nmark_as_advanced(PAPI_INCLUDE_DIRS PAPI_LIBRARIES)\n"
  },
  {
    "path": "cmake/Modules/FindQGLViewer.cmake",
    "content": "# Find QGLViewer libraries\n# Once done this will define\n#  QGLViewer_FOUND - System has QGLViewer\n#  QGLViewer_INCLUDE_DIRS - The QGLViewer include directories\n#  QGLViewer_LIBRARIES - The libraries needed to use QGLViewer\n\nif(QGLViewer_INCLUDE_DIRS AND QGLVIEWER_LIBRARIES)\n  set(QGLViewer_FIND_QUIETLY TRUE)\nendif()\n\nfind_path(QGLViewer_INCLUDE_DIRS NAMES QGLViewer/qglviewer.h)\nfind_library(QGLViewer_LIBRARIES NAMES QGLViewer PATH_SUFFIXES lib lib64)\n\ninclude(FindPackageHandleStandardArgs)\nfind_package_handle_standard_args(QGLViewer DEFAULT_MSG QGLViewer_INCLUDE_DIRS QGLViewer_LIBRARIES)\nif(QGLVIEWER_FOUND)\n  set(QGLViewer_FOUND TRUE)\nendif()\n\nmark_as_advanced(QGLViewer_INCLUDE_DIRS QGLViewer_LIBRARIES)\n"
  },
  {
    "path": "cmake/Modules/FindTBB.cmake",
    "content": "# Locate Intel Threading Building Blocks include paths and libraries\r\n# FindTBB.cmake can be found at https://code.google.com/p/findtbb/\r\n# Written by Hannes Hofmann <hannes.hofmann _at_ informatik.uni-erlangen.de>\r\n# Improvements by Gino van den Bergen <gino _at_ dtecta.com>,\r\n#   Florian Uhlig <F.Uhlig _at_ gsi.de>,\r\n#   Jiri Marsik <jiri.marsik89 _at_ gmail.com>\r\n\r\n# The MIT License\r\n#\r\n# Copyright (c) 2011 Hannes Hofmann\r\n#\r\n# Permission is hereby granted, free of charge, to any person obtaining a copy\r\n# of this software and associated documentation files (the \"Software\"), to deal\r\n# in the Software without restriction, including without limitation the rights\r\n# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\r\n# copies of the Software, and to permit persons to whom the Software is\r\n# furnished to do so, subject to the following conditions:\r\n#\r\n# The above copyright notice and this permission notice shall be included in\r\n# all copies or substantial portions of the Software.\r\n#\r\n# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r\n# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r\n# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r\n# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r\n# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r\n# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\r\n# THE SOFTWARE.\r\n\r\n# GvdB: This module uses the environment variable TBB_ARCH_PLATFORM which defines architecture and compiler.\r\n#   e.g. 
\"ia32/vc8\" or \"em64t/cc4.1.0_libc2.4_kernel2.6.16.21\"\r\n#   TBB_ARCH_PLATFORM is set by the build script tbbvars[.bat|.sh|.csh], which can be found\r\n#   in the TBB installation directory (TBB_INSTALL_DIR).\r\n#\r\n# GvdB: Mac OS X distribution places libraries directly in lib directory.\r\n#\r\n# For backwards compatibility, you may explicitely set the CMake variables TBB_ARCHITECTURE and TBB_COMPILER.\r\n# TBB_ARCHITECTURE [ ia32 | em64t | itanium ]\r\n#   which architecture to use\r\n# TBB_COMPILER e.g. vc9 or cc3.2.3_libc2.3.2_kernel2.4.21 or cc4.0.1_os10.4.9\r\n#   which compiler to use (detected automatically on Windows)\r\n\r\n# This module respects\r\n# TBB_INSTALL_DIR or $ENV{TBB21_INSTALL_DIR} or $ENV{TBB_INSTALL_DIR}\r\n\r\n# This module defines\r\n# TBB_INCLUDE_DIRS, where to find task_scheduler_init.h, etc.\r\n# TBB_LIBRARY_DIRS, where to find libtbb, libtbbmalloc\r\n# TBB_DEBUG_LIBRARY_DIRS, where to find libtbb_debug, libtbbmalloc_debug\r\n# TBB_INSTALL_DIR, the base TBB install directory\r\n# TBB_LIBRARIES, the libraries to link against to use TBB.\r\n# TBB_DEBUG_LIBRARIES, the libraries to link against to use TBB with debug symbols.\r\n# TBB_FOUND, If false, don't try to use TBB.\r\n# TBB_INTERFACE_VERSION, as defined in tbb/tbb_stddef.h\r\n\r\n\r\nif (WIN32)\r\n    # has em64t/vc8 em64t/vc9\r\n    # has ia32/vc7.1 ia32/vc8 ia32/vc9\r\n    set(_TBB_DEFAULT_INSTALL_DIR \"C:/Program Files/Intel/TBB\" \"C:/Program Files (x86)/Intel/TBB\")\r\n    set(_TBB_LIB_NAME \"tbb\")\r\n    set(_TBB_LIB_MALLOC_NAME \"${_TBB_LIB_NAME}malloc\")\r\n    set(_TBB_LIB_DEBUG_NAME \"${_TBB_LIB_NAME}_debug\")\r\n    set(_TBB_LIB_MALLOC_DEBUG_NAME \"${_TBB_LIB_MALLOC_NAME}_debug\")\r\n    if (MSVC71)\r\n        set (_TBB_COMPILER \"vc7.1\")\r\n    endif(MSVC71)\r\n    if (MSVC80)\r\n        set(_TBB_COMPILER \"vc8\")\r\n    endif(MSVC80)\r\n    if (MSVC90)\r\n        set(_TBB_COMPILER \"vc9\")\r\n    endif(MSVC90)\r\n    if(MSVC10)\r\n        set(_TBB_COMPILER 
\"vc10\")\r\n    endif(MSVC10)\r\n    # Todo: add other Windows compilers such as ICL.\r\n    set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE})\r\nendif (WIN32)\r\n\r\nif (UNIX)\r\n    if (APPLE)\r\n        # MAC\r\n        set(_TBB_DEFAULT_INSTALL_DIR \"/Library/Frameworks/Intel_TBB.framework/Versions\")\r\n        # libs: libtbb.dylib, libtbbmalloc.dylib, *_debug\r\n        set(_TBB_LIB_NAME \"tbb\")\r\n        set(_TBB_LIB_MALLOC_NAME \"${_TBB_LIB_NAME}malloc\")\r\n        set(_TBB_LIB_DEBUG_NAME \"${_TBB_LIB_NAME}_debug\")\r\n        set(_TBB_LIB_MALLOC_DEBUG_NAME \"${_TBB_LIB_MALLOC_NAME}_debug\")\r\n        # default flavor on apple: ia32/cc4.0.1_os10.4.9\r\n        # Jiri: There is no reason to presume there is only one flavor and\r\n        #       that user's setting of variables should be ignored.\r\n        if(NOT TBB_COMPILER)\r\n            set(_TBB_COMPILER \"cc4.0.1_os10.4.9\")\r\n        elseif (NOT TBB_COMPILER)\r\n            set(_TBB_COMPILER ${TBB_COMPILER})\r\n        endif(NOT TBB_COMPILER)\r\n        if(NOT TBB_ARCHITECTURE)\r\n            set(_TBB_ARCHITECTURE \"ia32\")\r\n        elseif(NOT TBB_ARCHITECTURE)\r\n            set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE})\r\n        endif(NOT TBB_ARCHITECTURE)\r\n    else (APPLE)\r\n        # LINUX\r\n        set(_TBB_DEFAULT_INSTALL_DIR \"/opt/intel/tbb\" \"/usr/local/include\" \"/usr/include\")\r\n        set(_TBB_LIB_NAME \"tbb\")\r\n        set(_TBB_LIB_MALLOC_NAME \"${_TBB_LIB_NAME}malloc\")\r\n        set(_TBB_LIB_DEBUG_NAME \"${_TBB_LIB_NAME}_debug\")\r\n        set(_TBB_LIB_MALLOC_DEBUG_NAME \"${_TBB_LIB_MALLOC_NAME}_debug\")\r\n        # has em64t/cc3.2.3_libc2.3.2_kernel2.4.21 em64t/cc3.3.3_libc2.3.3_kernel2.6.5 em64t/cc3.4.3_libc2.3.4_kernel2.6.9 em64t/cc4.1.0_libc2.4_kernel2.6.16.21\r\n        # has ia32/*\r\n        # has itanium/*\r\n        set(_TBB_COMPILER ${TBB_COMPILER})\r\n        set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE})\r\n    endif (APPLE)\r\nendif (UNIX)\r\n\r\nif (CMAKE_SYSTEM 
MATCHES \"SunOS.*\")\r\n# SUN\r\n# not yet supported\r\n# has em64t/cc3.4.3_kernel5.10\r\n# has ia32/*\r\nendif (CMAKE_SYSTEM MATCHES \"SunOS.*\")\r\n\r\n\r\n#-- Clear the public variables\r\nset (TBB_FOUND \"NO\")\r\n\r\n\r\n#-- Find TBB install dir and set ${_TBB_INSTALL_DIR} and cached ${TBB_INSTALL_DIR}\r\n# first: use CMake variable TBB_INSTALL_DIR\r\nif (TBB_INSTALL_DIR)\r\n    set (_TBB_INSTALL_DIR ${TBB_INSTALL_DIR})\r\nendif (TBB_INSTALL_DIR)\r\n# second: use environment variable\r\nif (NOT _TBB_INSTALL_DIR)\r\n    if (NOT \"$ENV{TBBROOT}\" STREQUAL \"\")\r\n        set (_TBB_INSTALL_DIR $ENV{TBBROOT})\r\n    endif()\r\n    if (NOT \"$ENV{TBB_INSTALL_DIR}\" STREQUAL \"\")\r\n        set (_TBB_INSTALL_DIR $ENV{TBB_INSTALL_DIR})\r\n    endif (NOT \"$ENV{TBB_INSTALL_DIR}\" STREQUAL \"\")\r\n    # Intel recommends setting TBB21_INSTALL_DIR\r\n    if (NOT \"$ENV{TBB21_INSTALL_DIR}\" STREQUAL \"\")\r\n        set (_TBB_INSTALL_DIR $ENV{TBB21_INSTALL_DIR})\r\n    endif (NOT \"$ENV{TBB21_INSTALL_DIR}\" STREQUAL \"\")\r\n    if (NOT \"$ENV{TBB22_INSTALL_DIR}\" STREQUAL \"\")\r\n        set (_TBB_INSTALL_DIR $ENV{TBB22_INSTALL_DIR})\r\n    endif (NOT \"$ENV{TBB22_INSTALL_DIR}\" STREQUAL \"\")\r\n    if (NOT \"$ENV{TBB30_INSTALL_DIR}\" STREQUAL \"\")\r\n        set (_TBB_INSTALL_DIR $ENV{TBB30_INSTALL_DIR})\r\n    endif (NOT \"$ENV{TBB30_INSTALL_DIR}\" STREQUAL \"\")\r\nendif (NOT _TBB_INSTALL_DIR)\r\n# third: try to find path automatically\r\nif (NOT _TBB_INSTALL_DIR)\r\n    if (_TBB_DEFAULT_INSTALL_DIR)\r\n        set (_TBB_INSTALL_DIR ${_TBB_DEFAULT_INSTALL_DIR})\r\n    endif (_TBB_DEFAULT_INSTALL_DIR)\r\nendif (NOT _TBB_INSTALL_DIR)\r\n# sanity check\r\nif (NOT _TBB_INSTALL_DIR)\r\n    message (\"ERROR: Unable to find Intel TBB install directory. 
${_TBB_INSTALL_DIR}\")\r\nelse (NOT _TBB_INSTALL_DIR)\r\n# finally: set the cached CMake variable TBB_INSTALL_DIR\r\nif (NOT TBB_INSTALL_DIR)\r\n    set (TBB_INSTALL_DIR ${_TBB_INSTALL_DIR} CACHE PATH \"Intel TBB install directory\")\r\n    mark_as_advanced(TBB_INSTALL_DIR)\r\nendif (NOT TBB_INSTALL_DIR)\r\n\r\n\r\n#-- A macro to rewrite the paths of the library. This is necessary, because\r\n#   find_library() always found the em64t/vc9 version of the TBB libs\r\nmacro(TBB_CORRECT_LIB_DIR var_name)\r\n#    if (NOT \"${_TBB_ARCHITECTURE}\" STREQUAL \"em64t\")\r\n        string(REPLACE em64t \"${_TBB_ARCHITECTURE}\" ${var_name} ${${var_name}})\r\n#    endif (NOT \"${_TBB_ARCHITECTURE}\" STREQUAL \"em64t\")\r\n    string(REPLACE ia32 \"${_TBB_ARCHITECTURE}\" ${var_name} ${${var_name}})\r\n    string(REPLACE vc7.1 \"${_TBB_COMPILER}\" ${var_name} ${${var_name}})\r\n    string(REPLACE vc8 \"${_TBB_COMPILER}\" ${var_name} ${${var_name}})\r\n    string(REPLACE vc9 \"${_TBB_COMPILER}\" ${var_name} ${${var_name}})\r\n    string(REPLACE vc10 \"${_TBB_COMPILER}\" ${var_name} ${${var_name}})\r\nendmacro(TBB_CORRECT_LIB_DIR var_content)\r\n\r\n\r\n#-- Look for include directory and set ${TBB_INCLUDE_DIR}\r\nset (TBB_INC_SEARCH_DIR ${_TBB_INSTALL_DIR}/include)\r\n# Jiri: tbbvars now sets the CPATH environment variable to the directory\r\n#       containing the headers.\r\nfind_path(TBB_INCLUDE_DIR\r\n    tbb/task_scheduler_init.h\r\n    PATHS ${TBB_INC_SEARCH_DIR} ENV CPATH\r\n)\r\nmark_as_advanced(TBB_INCLUDE_DIR)\r\n\r\n\r\n#-- Look for libraries\r\n# GvdB: $ENV{TBB_ARCH_PLATFORM} is set by the build script tbbvars[.bat|.sh|.csh]\r\nif (NOT $ENV{TBB_ARCH_PLATFORM} STREQUAL \"\")\r\n    set (_TBB_LIBRARY_DIR \r\n         ${_TBB_INSTALL_DIR}/lib/$ENV{TBB_ARCH_PLATFORM}\r\n         ${_TBB_INSTALL_DIR}/$ENV{TBB_ARCH_PLATFORM}/lib\r\n        )\r\nendif (NOT $ENV{TBB_ARCH_PLATFORM} STREQUAL \"\")\r\n# Jiri: This block isn't mutually exclusive with the previous one\r\n#       (hence 
no else), instead I test if the user really specified\r\n#       the variables in question.\r\nif ((NOT ${TBB_ARCHITECTURE} STREQUAL \"\") AND (NOT ${TBB_COMPILER} STREQUAL \"\"))\r\n    # HH: deprecated\r\n    message(STATUS \"[Warning] FindTBB.cmake: The use of TBB_ARCHITECTURE and TBB_COMPILER is deprecated and may not be supported in future versions. Please set \\$ENV{TBB_ARCH_PLATFORM} (using tbbvars.[bat|csh|sh]).\")\r\n    # Jiri: It doesn't hurt to look in more places, so I store the hints from\r\n    #       ENV{TBB_ARCH_PLATFORM} and the TBB_ARCHITECTURE and TBB_COMPILER\r\n    #       variables and search them both.\r\n    set (_TBB_LIBRARY_DIR \"${_TBB_INSTALL_DIR}/${_TBB_ARCHITECTURE}/${_TBB_COMPILER}/lib\" ${_TBB_LIBRARY_DIR})\r\nendif ((NOT ${TBB_ARCHITECTURE} STREQUAL \"\") AND (NOT ${TBB_COMPILER} STREQUAL \"\"))\r\n\r\n# GvdB: Mac OS X distribution places libraries directly in lib directory.\r\nlist(APPEND _TBB_LIBRARY_DIR ${_TBB_INSTALL_DIR}/lib)\r\n\r\n# Jiri: No reason not to check the default paths. From recent versions,\r\n#       tbbvars has started exporting the LIBRARY_PATH and LD_LIBRARY_PATH\r\n#       variables, which now point to the directories of the lib files.\r\n#       It all makes more sense to use the ${_TBB_LIBRARY_DIR} as a HINTS\r\n#       argument instead of the implicit PATHS as it isn't hard-coded\r\n#       but computed by system introspection. 
Searching the LIBRARY_PATH\r\n#       and LD_LIBRARY_PATH environment variables is now even more important\r\n#       that tbbvars doesn't export TBB_ARCH_PLATFORM and it facilitates\r\n#       the use of TBB built from sources.\r\nfind_library(TBB_LIBRARY ${_TBB_LIB_NAME} HINTS ${_TBB_LIBRARY_DIR}\r\n        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)\r\nfind_library(TBB_MALLOC_LIBRARY ${_TBB_LIB_MALLOC_NAME} HINTS ${_TBB_LIBRARY_DIR}\r\n        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)\r\n\r\n#Extract path from TBB_LIBRARY name\r\nget_filename_component(TBB_LIBRARY_DIR ${TBB_LIBRARY} PATH)\r\n\r\n#TBB_CORRECT_LIB_DIR(TBB_LIBRARY)\r\n#TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY)\r\nmark_as_advanced(TBB_LIBRARY TBB_MALLOC_LIBRARY)\r\n\r\n#-- Look for debug libraries\r\n# Jiri: Changed the same way as for the release libraries.\r\nfind_library(TBB_LIBRARY_DEBUG ${_TBB_LIB_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR}\r\n        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)\r\nfind_library(TBB_MALLOC_LIBRARY_DEBUG ${_TBB_LIB_MALLOC_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR}\r\n        PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH)\r\n\r\n# Jiri: Self-built TBB stores the debug libraries in a separate directory.\r\n#       Extract path from TBB_LIBRARY_DEBUG name\r\nget_filename_component(TBB_LIBRARY_DEBUG_DIR ${TBB_LIBRARY_DEBUG} PATH)\r\n\r\n#TBB_CORRECT_LIB_DIR(TBB_LIBRARY_DEBUG)\r\n#TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY_DEBUG)\r\nmark_as_advanced(TBB_LIBRARY_DEBUG TBB_MALLOC_LIBRARY_DEBUG)\r\n\r\n\r\nif (TBB_INCLUDE_DIR)\r\n    if (TBB_LIBRARY)\r\n        set (TBB_FOUND \"YES\")\r\n        set (TBB_LIBRARIES ${TBB_LIBRARY} ${TBB_MALLOC_LIBRARY} ${TBB_LIBRARIES})\r\n        set (TBB_DEBUG_LIBRARIES ${TBB_LIBRARY_DEBUG} ${TBB_MALLOC_LIBRARY_DEBUG} ${TBB_DEBUG_LIBRARIES})\r\n        set (TBB_INCLUDE_DIRS ${TBB_INCLUDE_DIR} CACHE PATH \"TBB include directory\" FORCE)\r\n        set (TBB_LIBRARY_DIRS ${TBB_LIBRARY_DIR} CACHE PATH \"TBB library directory\" FORCE)\r\n        # Jiri: 
Self-built TBB stores the debug libraries in a separate directory.\r\n        set (TBB_DEBUG_LIBRARY_DIRS ${TBB_LIBRARY_DEBUG_DIR} CACHE PATH \"TBB debug library directory\" FORCE)\r\n        mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARY_DIRS TBB_DEBUG_LIBRARY_DIRS TBB_LIBRARIES TBB_DEBUG_LIBRARIES)\r\n        message(STATUS \"Found Intel TBB\")\r\n    endif (TBB_LIBRARY)\r\nendif (TBB_INCLUDE_DIR)\r\n\r\nif (NOT TBB_FOUND)\r\n    message(\"ERROR: Intel TBB NOT found!\")\r\n    message(STATUS \"Looked for Threading Building Blocks in ${_TBB_INSTALL_DIR}\")\r\n    # do only throw fatal, if this pkg is REQUIRED\r\n    if (TBB_FIND_REQUIRED)\r\n        message(FATAL_ERROR \"Could NOT find TBB library.\")\r\n    endif (TBB_FIND_REQUIRED)\r\nendif (NOT TBB_FOUND)\r\n\r\nendif (NOT _TBB_INSTALL_DIR)\r\n\r\nif (TBB_FOUND)\r\n\tset(TBB_INTERFACE_VERSION 0)\r\n#\tFILE(READ \"${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h\" _TBB_VERSION_CONTENTS)\r\n\tSTRING(REGEX REPLACE \".*#define TBB_INTERFACE_VERSION ([0-9]+).*\" \"\\\\1\" TBB_INTERFACE_VERSION \"${_TBB_VERSION_CONTENTS}\")\r\n\tset(TBB_INTERFACE_VERSION \"${TBB_INTERFACE_VERSION}\")\r\nendif (TBB_FOUND)\r\n"
  },
  {
    "path": "cmake/Modules/FindVTune.cmake",
    "content": "# Find VTune libraries\n# Once done this will define\n#  VTune_FOUND - System has VTune\n#  VTune_INCLUDE_DIRS - The VTune include directories\n#  VTune_LIBRARIES - The libraries needed to use VTune\n\nmessage(STATUS \"${VTune_INCLUDE_DIRS}\")\n\nif(VTune_INCLUDE_DIRS AND VTune_LIBRARIES)\n  set(VTune_FIND_QUIETLY TRUE)\nendif()\n\n\nset(VTune_LIBRARY_PATH_CANDIDATES lib lib64 lib32 bin64/k1om bin32/k1om)\nfind_path(VTune_INCLUDE_DIRS ittnotify.h PATHS ${VTune_ROOT} PATH_SUFFIXES include)\nfind_library(VTune_LIBRARY NAMES ittnotify PATHS ${VTune_ROOT} PATH_SUFFIXES ${VTune_LIBRARY_PATH_CANDIDATES})\nfind_library(VTune_LIBRARIES NAMES dl PATH_SUFFIXES lib lib64 lib32)\n\ninclude(FindPackageHandleStandardArgs)\nfind_package_handle_standard_args(VTune DEFAULT_MSG VTune_LIBRARY VTune_LIBRARIES VTune_INCLUDE_DIRS)\nif(VTUNE_FOUND)\n  set(VTune_FOUND on)\n  set(VTune_LIBRARIES ${VTune_LIBRARY} ${VTune_LIBRARIES})\nendif()\nmark_as_advanced(VTune_INCLUDE_DIRS VTune_LIBRARIES)\n"
  },
  {
    "path": "cmake/Modules/GetGitVersion-write.cmake",
    "content": "### Don't include directly, for use by GetGitVersion.cmake\nfind_package(Git)\n# Extract the short git revision into GIT_REVISION\nif(GIT_FOUND)\n  execute_process(COMMAND ${GIT_EXECUTABLE} rev-parse --verify --short HEAD\n    WORKING_DIRECTORY ${SOURCE_DIR}\n    OUTPUT_VARIABLE GIT_REVISION\n    OUTPUT_STRIP_TRAILING_WHITESPACE)\n  file(WRITE include/galois/revision.h.txt \"#define GALOIS_REVISION \\\"${GIT_REVISION}\\\"\\n\")\nelse()\n  file(WRITE include/galois/revision.h.txt \"#define GALOIS_REVISION \\\"0\\\"\\n\")\nendif()\n\nexecute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different include/galois/revision.h.txt include/galois/revision.h)\n"
  },
  {
    "path": "cmake/Modules/GetGitVersion.cmake",
    "content": "# DUMMY is a non-existent file to force regeneration of the revision header every build\nadd_custom_target(revision ALL DEPENDS DUMMY ${PROJECT_BINARY_DIR}/include/galois/revision.h)\n\nfind_file(_MODULE \"GetGitVersion-write.cmake\" PATHS ${CMAKE_MODULE_PATH})\n\nadd_custom_command(OUTPUT DUMMY ${PROJECT_BINARY_DIR}/include/galois/revision.h\n  COMMAND ${CMAKE_COMMAND} -DSOURCE_DIR=${CMAKE_SOURCE_DIR}\n  -DCMAKE_MODULE_PATH=\"${CMAKE_SOURCE_DIR}/cmake/Modules/\" -P ${_MODULE})\n\nset(_MODULE off)\n\nset_source_files_properties(${PROJECT_BINARY_DIR}/include/galois/revision.h\n  PROPERTIES GENERATED TRUE\n  HEADER_FILE_ONLY TRUE)\n"
  },
  {
    "path": "cmake/Modules/HandleSanitizer.cmake",
    "content": "# Galois: taken from:\n#   https://github.com/llvm/llvm-project/blob/master/llvm/cmake/modules/HandleLLVMOptions.cmake \n\ninclude(CheckCCompilerFlag)\ninclude(CheckCXXCompilerFlag)\n\nstring(TOUPPER \"${CMAKE_BUILD_TYPE}\" uppercase_CMAKE_BUILD_TYPE)\n\nif(NOT GALOIS_USE_SANITIZER)\n  return()\nendif()\n\nfunction(append value)\n  foreach(variable ${ARGN})\n    set(${variable} \"${${variable}} ${value}\" PARENT_SCOPE)\n  endforeach(variable)\nendfunction()\n\nfunction(append_if condition value)\n  if (${condition})\n    foreach(variable ${ARGN})\n      set(${variable} \"${${variable}} ${value}\" PARENT_SCOPE)\n    endforeach(variable)\n  endif()\nendfunction()\n\nmacro(add_flag_if_supported flag name)\n  check_c_compiler_flag(\"-Werror ${flag}\" \"C_SUPPORTS_${name}\")\n  append_if(\"C_SUPPORTS_${name}\" \"${flag}\" CMAKE_C_FLAGS)\n  check_cxx_compiler_flag(\"-Werror ${flag}\" \"CXX_SUPPORTS_${name}\")\n  append_if(\"CXX_SUPPORTS_${name}\" \"${flag}\" CMAKE_CXX_FLAGS)\nendmacro()\n\nmacro(append_common_sanitizer_flags)\n  # Append -fno-omit-frame-pointer and turn on debug info to get better\n  # stack traces.\n  add_flag_if_supported(\"-fno-omit-frame-pointer\" FNO_OMIT_FRAME_POINTER)\n  if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL \"DEBUG\" AND\n      NOT uppercase_CMAKE_BUILD_TYPE STREQUAL \"RELWITHDEBINFO\")\n    add_flag_if_supported(\"-gline-tables-only\" GLINE_TABLES_ONLY)\n  endif()\n  # Use -O1 even in debug mode, otherwise sanitizers slowdown is too large.\n  if (uppercase_CMAKE_BUILD_TYPE STREQUAL \"DEBUG\")\n    add_flag_if_supported(\"-O1\" O1)\n  endif()\nendmacro()\n\nif (GALOIS_USE_SANITIZER STREQUAL \"Address\")\n  append_common_sanitizer_flags()\n  append(\"-fsanitize=address\" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\nelseif (GALOIS_USE_SANITIZER STREQUAL \"HWAddress\")\n  append_common_sanitizer_flags()\n  append(\"-fsanitize=hwaddress\" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\nelseif (GALOIS_USE_SANITIZER MATCHES \"Memory(WithOrigins)?\")\n  
append_common_sanitizer_flags()\n  append(\"-fsanitize=memory\" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\n  if(GALOIS_USE_SANITIZER STREQUAL \"MemoryWithOrigins\")\n    append(\"-fsanitize-memory-track-origins\" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\n  endif()\nelseif (GALOIS_USE_SANITIZER STREQUAL \"Undefined\")\n  append_common_sanitizer_flags()\n  append(\"-fsanitize=undefined -fno-sanitize-recover=all\"\n          CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\nelseif (GALOIS_USE_SANITIZER STREQUAL \"Thread\")\n  append_common_sanitizer_flags()\n  append(\"-fsanitize=thread\" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\nelseif (GALOIS_USE_SANITIZER STREQUAL \"DataFlow\")\n  append(\"-fsanitize=dataflow\" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\nelseif (GALOIS_USE_SANITIZER STREQUAL \"Address;Undefined\" OR\n        GALOIS_USE_SANITIZER STREQUAL \"Undefined;Address\")\n  append_common_sanitizer_flags()\n  append(\"-fsanitize=address,undefined -fno-sanitize-recover=all\"\n          CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\nelseif (GALOIS_USE_SANITIZER STREQUAL \"Leaks\")\n  append_common_sanitizer_flags()\n  append(\"-fsanitize=leak\" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\nelse()\n  message(FATAL_ERROR \"Unsupported value of GALOIS_USE_SANITIZER: ${GALOIS_USE_SANITIZER}\")\nendif()\n\nif (GALOIS_USE_SANITIZER MATCHES \"(Undefined;)?Address(;Undefined)?\")\n  add_flag_if_supported(\"-fsanitize-address-use-after-scope\"\n                        FSANITIZE_USE_AFTER_SCOPE_FLAG)\nendif()\n\nif (GALOIS_USE_SANITIZE_COVERAGE)\n  append(\"-fsanitize=fuzzer-no-link\" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\nendif()\n\nif (GALOIS_USE_SANITIZER MATCHES \".*Undefined.*\")\n  set(BLACKLIST_CONFIGURE_FILE \"${PROJECT_SOURCE_DIR}/config/sanitizers/ubsan_blacklist.txt.in\")\n  if (EXISTS \"${BLACKLIST_CONFIGURE_FILE}\")\n    set(BLACKLIST_FILE \"${PROJECT_BINARY_DIR}/config/sanitizers/ubsan_blacklist.txt\")\n    configure_file(\"${BLACKLIST_CONFIGURE_FILE}\" \"${BLACKLIST_FILE}\")\n    append(\"-fsanitize-blacklist=${BLACKLIST_FILE}\"\n           
CMAKE_C_FLAGS CMAKE_CXX_FLAGS)\n  endif()\nendif()\n"
  },
  {
    "path": "cmake/Modules/LibFindMacros.cmake",
    "content": "# Copyright Raimar Sandner 2012–2014. Distributed under the Boost Software License, Version 1.0. (See accompanying file LICENSE.txt)\n\n#! \\file\n#! \\ingroup Helpers\n#! \\brief Improved versions of %CMake's `find_package`\n\n#! \\ingroup Helpers\n#! \\brief Works the same as `find_package`, but forwards the \"REQUIRED\" and \"QUIET\" arguments\n#!   used for the current package.\n#!\n#! For this to work, the first parameter must be the prefix of the current package, then the\n#! prefix of the new package etc, which are passed to `find_package`.\nmacro (libfind_package PREFIX)\n  set (LIBFIND_PACKAGE_ARGS ${ARGN})\n  if (${PREFIX}_FIND_QUIETLY)\n    set (LIBFIND_PACKAGE_ARGS ${LIBFIND_PACKAGE_ARGS} QUIET)\n  endif (${PREFIX}_FIND_QUIETLY)\n  if (${PREFIX}_FIND_REQUIRED)\n    set (LIBFIND_PACKAGE_ARGS ${LIBFIND_PACKAGE_ARGS} REQUIRED)\n  endif (${PREFIX}_FIND_REQUIRED)\n  find_package(${LIBFIND_PACKAGE_ARGS})\nendmacro (libfind_package)\n\n\n#! \\ingroup Helpers\n#! \\brief Do the final processing once the paths have been detected.\n#!\n#! If include dirs are needed, `${PREFIX}_PROCESS_INCLUDES` should be set to contain\n#! all the variables, each of which contain one include directory.\n#! Ditto for `${PREFIX}_PROCESS_LIBS` and library files.\n#! Will set `${PREFIX}_FOUND`, `${PREFIX}_INCLUDE_DIRS` and `${PREFIX}_LIBRARIES`.\n#! 
Also handles errors in case library detection was required, etc.\nmacro (libfind_process PREFIX)\n  # Skip processing if already processed during this run\n  if (NOT ${PREFIX}_FOUND)\n    # Start with the assumption that the library was found\n    set (${PREFIX}_FOUND TRUE)\n\n    # Process all includes and set _FOUND to false if any are missing\n    foreach (i ${${PREFIX}_PROCESS_INCLUDES})\n      if (${i})\n        set (${PREFIX}_INCLUDE_DIRS ${${PREFIX}_INCLUDE_DIRS} ${${i}})\n        mark_as_advanced(${i})\n      else (${i})\n        set (${PREFIX}_FOUND FALSE)\n      endif (${i})\n    endforeach (i)\n\n    # Process all libraries and set _FOUND to false if any are missing\n    foreach (i ${${PREFIX}_PROCESS_LIBS})\n      if (${i})\n        set (${PREFIX}_LIBRARIES ${${PREFIX}_LIBRARIES} ${${i}})\n        mark_as_advanced(${i})\n      else (${i})\n        set (${PREFIX}_FOUND FALSE)\n      endif (${i})\n    endforeach (i)\n\n    # Print message and/or exit on fatal error\n    if (${PREFIX}_FOUND)\n      if (NOT ${PREFIX}_FIND_QUIETLY)\n        message (STATUS \"Found ${PREFIX} ${${PREFIX}_VERSION}\")\n      endif (NOT ${PREFIX}_FIND_QUIETLY)\n    else (${PREFIX}_FOUND)\n      if (${PREFIX}_FIND_REQUIRED)\n        foreach (i ${${PREFIX}_PROCESS_INCLUDES} ${${PREFIX}_PROCESS_LIBS})\n          message(\"${i}=${${i}}\")\n        endforeach (i)\n        message (FATAL_ERROR \"Required library ${PREFIX} NOT FOUND.\\nInstall the library (dev version) and try again. If the library is already installed, use ccmake to set the missing variables manually.\")\n      endif (${PREFIX}_FIND_REQUIRED)\n    endif (${PREFIX}_FOUND)\n  endif (NOT ${PREFIX}_FOUND)\nendmacro (libfind_process)\n"
  },
  {
    "path": "cmake/Modules/UseStdMacro.cmake",
    "content": "add_definitions(-D__STDC_LIMIT_MACROS)\nadd_definitions(-D__STDC_CONSTANT_MACROS)\n"
  },
  {
    "path": "config/sanitizers/ubsan_blacklist.txt.in",
    "content": "[undefined]\nsrc:@PROJECT_SOURCE_DIR@/external/bliss/*\n"
  },
  {
    "path": "config/version.txt",
    "content": "6.0.0\n"
  },
  {
    "path": "external/bliss/bliss/COPYING",
    "content": "                    GNU GENERAL PUBLIC LICENSE\n                       Version 3, 29 June 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n                            Preamble\n\n  The GNU General Public License is a free, copyleft license for\nsoftware and other kinds of works.\n\n  The licenses for most software and other practical works are designed\nto take away your freedom to share and change the works.  By contrast,\nthe GNU General Public License is intended to guarantee your freedom to\nshare and change all versions of a program--to make sure it remains free\nsoftware for all its users.  We, the Free Software Foundation, use the\nGNU General Public License for most of our software; it applies also to\nany other work released this way by its authors.  You can apply it to\nyour programs, too.\n\n  When we speak of free software, we are referring to freedom, not\nprice.  Our General Public Licenses are designed to make sure that you\nhave the freedom to distribute copies of free software (and charge for\nthem if you wish), that you receive source code or can get it if you\nwant it, that you can change the software or use pieces of it in new\nfree programs, and that you know you can do these things.\n\n  To protect your rights, we need to prevent others from denying you\nthese rights or asking you to surrender the rights.  Therefore, you have\ncertain responsibilities if you distribute copies of the software, or if\nyou modify it: responsibilities to respect the freedom of others.\n\n  For example, if you distribute copies of such a program, whether\ngratis or for a fee, you must pass on to the recipients the same\nfreedoms that you received.  You must make sure that they, too, receive\nor can get the source code.  
And you must show them these terms so they\nknow their rights.\n\n  Developers that use the GNU GPL protect your rights with two steps:\n(1) assert copyright on the software, and (2) offer you this License\ngiving you legal permission to copy, distribute and/or modify it.\n\n  For the developers' and authors' protection, the GPL clearly explains\nthat there is no warranty for this free software.  For both users' and\nauthors' sake, the GPL requires that modified versions be marked as\nchanged, so that their problems will not be attributed erroneously to\nauthors of previous versions.\n\n  Some devices are designed to deny users access to install or run\nmodified versions of the software inside them, although the manufacturer\ncan do so.  This is fundamentally incompatible with the aim of\nprotecting users' freedom to change the software.  The systematic\npattern of such abuse occurs in the area of products for individuals to\nuse, which is precisely where it is most unacceptable.  Therefore, we\nhave designed this version of the GPL to prohibit the practice for those\nproducts.  If such problems arise substantially in other domains, we\nstand ready to extend this provision to those domains in future versions\nof the GPL, as needed to protect the freedom of users.\n\n  Finally, every program is threatened constantly by software patents.\nStates should not allow patents to restrict development and use of\nsoftware on general-purpose computers, but in those that do, we wish to\navoid the special danger that patents applied to a free program could\nmake it effectively proprietary.  To prevent this, the GPL assures that\npatents cannot be used to render the program non-free.\n\n  The precise terms and conditions for copying, distribution and\nmodification follow.\n\n                       TERMS AND CONDITIONS\n\n  0. 
Definitions.\n\n  \"This License\" refers to version 3 of the GNU General Public License.\n\n  \"Copyright\" also means copyright-like laws that apply to other kinds of\nworks, such as semiconductor masks.\n\n  \"The Program\" refers to any copyrightable work licensed under this\nLicense.  Each licensee is addressed as \"you\".  \"Licensees\" and\n\"recipients\" may be individuals or organizations.\n\n  To \"modify\" a work means to copy from or adapt all or part of the work\nin a fashion requiring copyright permission, other than the making of an\nexact copy.  The resulting work is called a \"modified version\" of the\nearlier work or a work \"based on\" the earlier work.\n\n  A \"covered work\" means either the unmodified Program or a work based\non the Program.\n\n  To \"propagate\" a work means to do anything with it that, without\npermission, would make you directly or secondarily liable for\ninfringement under applicable copyright law, except executing it on a\ncomputer or modifying a private copy.  Propagation includes copying,\ndistribution (with or without modification), making available to the\npublic, and in some countries other activities as well.\n\n  To \"convey\" a work means any kind of propagation that enables other\nparties to make or receive copies.  Mere interaction with a user through\na computer network, with no transfer of a copy, is not conveying.\n\n  An interactive user interface displays \"Appropriate Legal Notices\"\nto the extent that it includes a convenient and prominently visible\nfeature that (1) displays an appropriate copyright notice, and (2)\ntells the user that there is no warranty for the work (except to the\nextent that warranties are provided), that licensees may convey the\nwork under this License, and how to view a copy of this License.  If\nthe interface presents a list of user commands or options, such as a\nmenu, a prominent item in the list meets this criterion.\n\n  1. 
Source Code.\n\n  The \"source code\" for a work means the preferred form of the work\nfor making modifications to it.  \"Object code\" means any non-source\nform of a work.\n\n  A \"Standard Interface\" means an interface that either is an official\nstandard defined by a recognized standards body, or, in the case of\ninterfaces specified for a particular programming language, one that\nis widely used among developers working in that language.\n\n  The \"System Libraries\" of an executable work include anything, other\nthan the work as a whole, that (a) is included in the normal form of\npackaging a Major Component, but which is not part of that Major\nComponent, and (b) serves only to enable use of the work with that\nMajor Component, or to implement a Standard Interface for which an\nimplementation is available to the public in source code form.  A\n\"Major Component\", in this context, means a major essential component\n(kernel, window system, and so on) of the specific operating system\n(if any) on which the executable work runs, or a compiler used to\nproduce the work, or an object code interpreter used to run it.\n\n  The \"Corresponding Source\" for a work in object code form means all\nthe source code needed to generate, install, and (for an executable\nwork) run the object code and to modify the work, including scripts to\ncontrol those activities.  However, it does not include the work's\nSystem Libraries, or general-purpose tools or generally available free\nprograms which are used unmodified in performing those activities but\nwhich are not part of the work.  
For example, Corresponding Source\nincludes interface definition files associated with source files for\nthe work, and the source code for shared libraries and dynamically\nlinked subprograms that the work is specifically designed to require,\nsuch as by intimate data communication or control flow between those\nsubprograms and other parts of the work.\n\n  The Corresponding Source need not include anything that users\ncan regenerate automatically from other parts of the Corresponding\nSource.\n\n  The Corresponding Source for a work in source code form is that\nsame work.\n\n  2. Basic Permissions.\n\n  All rights granted under this License are granted for the term of\ncopyright on the Program, and are irrevocable provided the stated\nconditions are met.  This License explicitly affirms your unlimited\npermission to run the unmodified Program.  The output from running a\ncovered work is covered by this License only if the output, given its\ncontent, constitutes a covered work.  This License acknowledges your\nrights of fair use or other equivalent, as provided by copyright law.\n\n  You may make, run and propagate covered works that you do not\nconvey, without conditions so long as your license otherwise remains\nin force.  You may convey covered works to others for the sole purpose\nof having them make modifications exclusively for you, or provide you\nwith facilities for running those works, provided that you comply with\nthe terms of this License in conveying all material for which you do\nnot control copyright.  Those thus making or running the covered works\nfor you must do so exclusively on your behalf, under your direction\nand control, on terms that prohibit them from making any copies of\nyour copyrighted material outside their relationship with you.\n\n  Conveying under any other circumstances is permitted solely under\nthe conditions stated below.  Sublicensing is not allowed; section 10\nmakes it unnecessary.\n\n  3. 
Protecting Users' Legal Rights From Anti-Circumvention Law.\n\n  No covered work shall be deemed part of an effective technological\nmeasure under any applicable law fulfilling obligations under article\n11 of the WIPO copyright treaty adopted on 20 December 1996, or\nsimilar laws prohibiting or restricting circumvention of such\nmeasures.\n\n  When you convey a covered work, you waive any legal power to forbid\ncircumvention of technological measures to the extent such circumvention\nis effected by exercising rights under this License with respect to\nthe covered work, and you disclaim any intention to limit operation or\nmodification of the work as a means of enforcing, against the work's\nusers, your or third parties' legal rights to forbid circumvention of\ntechnological measures.\n\n  4. Conveying Verbatim Copies.\n\n  You may convey verbatim copies of the Program's source code as you\nreceive it, in any medium, provided that you conspicuously and\nappropriately publish on each copy an appropriate copyright notice;\nkeep intact all notices stating that this License and any\nnon-permissive terms added in accord with section 7 apply to the code;\nkeep intact all notices of the absence of any warranty; and give all\nrecipients a copy of this License along with the Program.\n\n  You may charge any price or no price for each copy that you convey,\nand you may offer support or warranty protection for a fee.\n\n  5. Conveying Modified Source Versions.\n\n  You may convey a work based on the Program, or the modifications to\nproduce it from the Program, in the form of source code under the\nterms of section 4, provided that you also meet all of these conditions:\n\n    a) The work must carry prominent notices stating that you modified\n    it, and giving a relevant date.\n\n    b) The work must carry prominent notices stating that it is\n    released under this License and any conditions added under section\n    7.  
This requirement modifies the requirement in section 4 to\n    \"keep intact all notices\".\n\n    c) You must license the entire work, as a whole, under this\n    License to anyone who comes into possession of a copy.  This\n    License will therefore apply, along with any applicable section 7\n    additional terms, to the whole of the work, and all its parts,\n    regardless of how they are packaged.  This License gives no\n    permission to license the work in any other way, but it does not\n    invalidate such permission if you have separately received it.\n\n    d) If the work has interactive user interfaces, each must display\n    Appropriate Legal Notices; however, if the Program has interactive\n    interfaces that do not display Appropriate Legal Notices, your\n    work need not make them do so.\n\n  A compilation of a covered work with other separate and independent\nworks, which are not by their nature extensions of the covered work,\nand which are not combined with it such as to form a larger program,\nin or on a volume of a storage or distribution medium, is called an\n\"aggregate\" if the compilation and its resulting copyright are not\nused to limit the access or legal rights of the compilation's users\nbeyond what the individual works permit.  Inclusion of a covered work\nin an aggregate does not cause this License to apply to the other\nparts of the aggregate.\n\n  6. 
Conveying Non-Source Forms.\n\n  You may convey a covered work in object code form under the terms\nof sections 4 and 5, provided that you also convey the\nmachine-readable Corresponding Source under the terms of this License,\nin one of these ways:\n\n    a) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by the\n    Corresponding Source fixed on a durable physical medium\n    customarily used for software interchange.\n\n    b) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by a\n    written offer, valid for at least three years and valid for as\n    long as you offer spare parts or customer support for that product\n    model, to give anyone who possesses the object code either (1) a\n    copy of the Corresponding Source for all the software in the\n    product that is covered by this License, on a durable physical\n    medium customarily used for software interchange, for a price no\n    more than your reasonable cost of physically performing this\n    conveying of source, or (2) access to copy the\n    Corresponding Source from a network server at no charge.\n\n    c) Convey individual copies of the object code with a copy of the\n    written offer to provide the Corresponding Source.  This\n    alternative is allowed only occasionally and noncommercially, and\n    only if you received the object code with such an offer, in accord\n    with subsection 6b.\n\n    d) Convey the object code by offering access from a designated\n    place (gratis or for a charge), and offer equivalent access to the\n    Corresponding Source in the same way through the same place at no\n    further charge.  You need not require recipients to copy the\n    Corresponding Source along with the object code.  
If the place to\n    copy the object code is a network server, the Corresponding Source\n    may be on a different server (operated by you or a third party)\n    that supports equivalent copying facilities, provided you maintain\n    clear directions next to the object code saying where to find the\n    Corresponding Source.  Regardless of what server hosts the\n    Corresponding Source, you remain obligated to ensure that it is\n    available for as long as needed to satisfy these requirements.\n\n    e) Convey the object code using peer-to-peer transmission, provided\n    you inform other peers where the object code and Corresponding\n    Source of the work are being offered to the general public at no\n    charge under subsection 6d.\n\n  A separable portion of the object code, whose source code is excluded\nfrom the Corresponding Source as a System Library, need not be\nincluded in conveying the object code work.\n\n  A \"User Product\" is either (1) a \"consumer product\", which means any\ntangible personal property which is normally used for personal, family,\nor household purposes, or (2) anything designed or sold for incorporation\ninto a dwelling.  In determining whether a product is a consumer product,\ndoubtful cases shall be resolved in favor of coverage.  For a particular\nproduct received by a particular user, \"normally used\" refers to a\ntypical or common use of that class of product, regardless of the status\nof the particular user or of the way in which the particular user\nactually uses, or expects or is expected to use, the product.  
A product\nis a consumer product regardless of whether the product has substantial\ncommercial, industrial or non-consumer uses, unless such uses represent\nthe only significant mode of use of the product.\n\n  \"Installation Information\" for a User Product means any methods,\nprocedures, authorization keys, or other information required to install\nand execute modified versions of a covered work in that User Product from\na modified version of its Corresponding Source.  The information must\nsuffice to ensure that the continued functioning of the modified object\ncode is in no case prevented or interfered with solely because\nmodification has been made.\n\n  If you convey an object code work under this section in, or with, or\nspecifically for use in, a User Product, and the conveying occurs as\npart of a transaction in which the right of possession and use of the\nUser Product is transferred to the recipient in perpetuity or for a\nfixed term (regardless of how the transaction is characterized), the\nCorresponding Source conveyed under this section must be accompanied\nby the Installation Information.  But this requirement does not apply\nif neither you nor any third party retains the ability to install\nmodified object code on the User Product (for example, the work has\nbeen installed in ROM).\n\n  The requirement to provide Installation Information does not include a\nrequirement to continue to provide support service, warranty, or updates\nfor a work that has been modified or installed by the recipient, or for\nthe User Product in which it has been modified or installed.  
Access to a\nnetwork may be denied when the modification itself materially and\nadversely affects the operation of the network or violates the rules and\nprotocols for communication across the network.\n\n  Corresponding Source conveyed, and Installation Information provided,\nin accord with this section must be in a format that is publicly\ndocumented (and with an implementation available to the public in\nsource code form), and must require no special password or key for\nunpacking, reading or copying.\n\n  7. Additional Terms.\n\n  \"Additional permissions\" are terms that supplement the terms of this\nLicense by making exceptions from one or more of its conditions.\nAdditional permissions that are applicable to the entire Program shall\nbe treated as though they were included in this License, to the extent\nthat they are valid under applicable law.  If additional permissions\napply only to part of the Program, that part may be used separately\nunder those permissions, but the entire Program remains governed by\nthis License without regard to the additional permissions.\n\n  When you convey a copy of a covered work, you may at your option\nremove any additional permissions from that copy, or from any part of\nit.  (Additional permissions may be written to require their own\nremoval in certain cases when you modify the work.)  
You may place\nadditional permissions on material, added by you to a covered work,\nfor which you have or can give appropriate copyright permission.\n\n  Notwithstanding any other provision of this License, for material you\nadd to a covered work, you may (if authorized by the copyright holders of\nthat material) supplement the terms of this License with terms:\n\n    a) Disclaiming warranty or limiting liability differently from the\n    terms of sections 15 and 16 of this License; or\n\n    b) Requiring preservation of specified reasonable legal notices or\n    author attributions in that material or in the Appropriate Legal\n    Notices displayed by works containing it; or\n\n    c) Prohibiting misrepresentation of the origin of that material, or\n    requiring that modified versions of such material be marked in\n    reasonable ways as different from the original version; or\n\n    d) Limiting the use for publicity purposes of names of licensors or\n    authors of the material; or\n\n    e) Declining to grant rights under trademark law for use of some\n    trade names, trademarks, or service marks; or\n\n    f) Requiring indemnification of licensors and authors of that\n    material by anyone who conveys the material (or modified versions of\n    it) with contractual assumptions of liability to the recipient, for\n    any liability that these contractual assumptions directly impose on\n    those licensors and authors.\n\n  All other non-permissive additional terms are considered \"further\nrestrictions\" within the meaning of section 10.  If the Program as you\nreceived it, or any part of it, contains a notice stating that it is\ngoverned by this License along with a term that is a further\nrestriction, you may remove that term.  
If a license document contains\na further restriction but permits relicensing or conveying under this\nLicense, you may add to a covered work material governed by the terms\nof that license document, provided that the further restriction does\nnot survive such relicensing or conveying.\n\n  If you add terms to a covered work in accord with this section, you\nmust place, in the relevant source files, a statement of the\nadditional terms that apply to those files, or a notice indicating\nwhere to find the applicable terms.\n\n  Additional terms, permissive or non-permissive, may be stated in the\nform of a separately written license, or stated as exceptions;\nthe above requirements apply either way.\n\n  8. Termination.\n\n  You may not propagate or modify a covered work except as expressly\nprovided under this License.  Any attempt otherwise to propagate or\nmodify it is void, and will automatically terminate your rights under\nthis License (including any patent licenses granted under the third\nparagraph of section 11).\n\n  However, if you cease all violation of this License, then your\nlicense from a particular copyright holder is reinstated (a)\nprovisionally, unless and until the copyright holder explicitly and\nfinally terminates your license, and (b) permanently, if the copyright\nholder fails to notify you of the violation by some reasonable means\nprior to 60 days after the cessation.\n\n  Moreover, your license from a particular copyright holder is\nreinstated permanently if the copyright holder notifies you of the\nviolation by some reasonable means, this is the first time you have\nreceived notice of violation of this License (for any work) from that\ncopyright holder, and you cure the violation prior to 30 days after\nyour receipt of the notice.\n\n  Termination of your rights under this section does not terminate the\nlicenses of parties who have received copies or rights from you under\nthis License.  
If your rights have been terminated and not permanently\nreinstated, you do not qualify to receive new licenses for the same\nmaterial under section 10.\n\n  9. Acceptance Not Required for Having Copies.\n\n  You are not required to accept this License in order to receive or\nrun a copy of the Program.  Ancillary propagation of a covered work\noccurring solely as a consequence of using peer-to-peer transmission\nto receive a copy likewise does not require acceptance.  However,\nnothing other than this License grants you permission to propagate or\nmodify any covered work.  These actions infringe copyright if you do\nnot accept this License.  Therefore, by modifying or propagating a\ncovered work, you indicate your acceptance of this License to do so.\n\n  10. Automatic Licensing of Downstream Recipients.\n\n  Each time you convey a covered work, the recipient automatically\nreceives a license from the original licensors, to run, modify and\npropagate that work, subject to this License.  You are not responsible\nfor enforcing compliance by third parties with this License.\n\n  An \"entity transaction\" is a transaction transferring control of an\norganization, or substantially all assets of one, or subdividing an\norganization, or merging organizations.  If propagation of a covered\nwork results from an entity transaction, each party to that\ntransaction who receives a copy of the work also receives whatever\nlicenses to the work the party's predecessor in interest had or could\ngive under the previous paragraph, plus a right to possession of the\nCorresponding Source of the work from the predecessor in interest, if\nthe predecessor has it or can get it with reasonable efforts.\n\n  You may not impose any further restrictions on the exercise of the\nrights granted or affirmed under this License.  
For example, you may\nnot impose a license fee, royalty, or other charge for exercise of\nrights granted under this License, and you may not initiate litigation\n(including a cross-claim or counterclaim in a lawsuit) alleging that\nany patent claim is infringed by making, using, selling, offering for\nsale, or importing the Program or any portion of it.\n\n  11. Patents.\n\n  A \"contributor\" is a copyright holder who authorizes use under this\nLicense of the Program or a work on which the Program is based.  The\nwork thus licensed is called the contributor's \"contributor version\".\n\n  A contributor's \"essential patent claims\" are all patent claims\nowned or controlled by the contributor, whether already acquired or\nhereafter acquired, that would be infringed by some manner, permitted\nby this License, of making, using, or selling its contributor version,\nbut do not include claims that would be infringed only as a\nconsequence of further modification of the contributor version.  For\npurposes of this definition, \"control\" includes the right to grant\npatent sublicenses in a manner consistent with the requirements of\nthis License.\n\n  Each contributor grants you a non-exclusive, worldwide, royalty-free\npatent license under the contributor's essential patent claims, to\nmake, use, sell, offer for sale, import and otherwise run, modify and\npropagate the contents of its contributor version.\n\n  In the following three paragraphs, a \"patent license\" is any express\nagreement or commitment, however denominated, not to enforce a patent\n(such as an express permission to practice a patent or covenant not to\nsue for patent infringement).  
To \"grant\" such a patent license to a\nparty means to make such an agreement or commitment not to enforce a\npatent against the party.\n\n  If you convey a covered work, knowingly relying on a patent license,\nand the Corresponding Source of the work is not available for anyone\nto copy, free of charge and under the terms of this License, through a\npublicly available network server or other readily accessible means,\nthen you must either (1) cause the Corresponding Source to be so\navailable, or (2) arrange to deprive yourself of the benefit of the\npatent license for this particular work, or (3) arrange, in a manner\nconsistent with the requirements of this License, to extend the patent\nlicense to downstream recipients.  \"Knowingly relying\" means you have\nactual knowledge that, but for the patent license, your conveying the\ncovered work in a country, or your recipient's use of the covered work\nin a country, would infringe one or more identifiable patents in that\ncountry that you have reason to believe are valid.\n\n  If, pursuant to or in connection with a single transaction or\narrangement, you convey, or propagate by procuring conveyance of, a\ncovered work, and grant a patent license to some of the parties\nreceiving the covered work authorizing them to use, propagate, modify\nor convey a specific copy of the covered work, then the patent license\nyou grant is automatically extended to all recipients of the covered\nwork and works based on it.\n\n  A patent license is \"discriminatory\" if it does not include within\nthe scope of its coverage, prohibits the exercise of, or is\nconditioned on the non-exercise of one or more of the rights that are\nspecifically granted under this License.  
You may not convey a covered\nwork if you are a party to an arrangement with a third party that is\nin the business of distributing software, under which you make payment\nto the third party based on the extent of your activity of conveying\nthe work, and under which the third party grants, to any of the\nparties who would receive the covered work from you, a discriminatory\npatent license (a) in connection with copies of the covered work\nconveyed by you (or copies made from those copies), or (b) primarily\nfor and in connection with specific products or compilations that\ncontain the covered work, unless you entered into that arrangement,\nor that patent license was granted, prior to 28 March 2007.\n\n  Nothing in this License shall be construed as excluding or limiting\nany implied license or other defenses to infringement that may\notherwise be available to you under applicable patent law.\n\n  12. No Surrender of Others' Freedom.\n\n  If conditions are imposed on you (whether by court order, agreement or\notherwise) that contradict the conditions of this License, they do not\nexcuse you from the conditions of this License.  If you cannot convey a\ncovered work so as to satisfy simultaneously your obligations under this\nLicense and any other pertinent obligations, then as a consequence you may\nnot convey it at all.  For example, if you agree to terms that obligate you\nto collect a royalty for further conveying from those to whom you convey\nthe Program, the only way you could satisfy both those terms and this\nLicense would be to refrain entirely from conveying the Program.\n\n  13. Use with the GNU Affero General Public License.\n\n  Notwithstanding any other provision of this License, you have\npermission to link or combine any covered work with a work licensed\nunder version 3 of the GNU Affero General Public License into a single\ncombined work, and to convey the resulting work.  
The terms of this\nLicense will continue to apply to the part which is the covered work,\nbut the special requirements of the GNU Affero General Public License,\nsection 13, concerning interaction through a network will apply to the\ncombination as such.\n\n  14. Revised Versions of this License.\n\n  The Free Software Foundation may publish revised and/or new versions of\nthe GNU General Public License from time to time.  Such new versions will\nbe similar in spirit to the present version, but may differ in detail to\naddress new problems or concerns.\n\n  Each version is given a distinguishing version number.  If the\nProgram specifies that a certain numbered version of the GNU General\nPublic License \"or any later version\" applies to it, you have the\noption of following the terms and conditions either of that numbered\nversion or of any later version published by the Free Software\nFoundation.  If the Program does not specify a version number of the\nGNU General Public License, you may choose any version ever published\nby the Free Software Foundation.\n\n  If the Program specifies that a proxy can decide which future\nversions of the GNU General Public License can be used, that proxy's\npublic statement of acceptance of a version permanently authorizes you\nto choose that version for the Program.\n\n  Later license versions may give you additional or different\npermissions.  However, no additional obligations are imposed on any\nauthor or copyright holder as a result of your choosing to follow a\nlater version.\n\n  15. Disclaimer of Warranty.\n\n  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY\nAPPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT\nHOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM \"AS IS\" WITHOUT WARRANTY\nOF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,\nTHE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\nPURPOSE.  
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM\nIS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF\nALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n\n  16. Limitation of Liability.\n\n  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\nWILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS\nTHE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY\nGENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE\nUSE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF\nDATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD\nPARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),\nEVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF\nSUCH DAMAGES.\n\n  17. Interpretation of Sections 15 and 16.\n\n  If the disclaimer of warranty and limitation of liability provided\nabove cannot be given local legal effect according to their terms,\nreviewing courts shall apply local law that most closely approximates\nan absolute waiver of all civil liability in connection with the\nProgram, unless a warranty or assumption of liability accompanies a\ncopy of the Program in return for a fee.\n\n                     END OF TERMS AND CONDITIONS\n\n            How to Apply These Terms to Your New Programs\n\n  If you develop a new program, and you want it to be of the greatest\npossible use to the public, the best way to achieve this is to make it\nfree software which everyone can redistribute and change under these terms.\n\n  To do so, attach the following notices to the program.  
It is safest\nto attach them to the start of each source file to most effectively\nstate the exclusion of warranty; and each file should have at least\nthe \"copyright\" line and a pointer to where the full notice is found.\n\n    <one line to give the program's name and a brief idea of what it does.>\n    Copyright (C) <year>  <name of author>\n\n    This program is free software: you can redistribute it and/or modify\n    it under the terms of the GNU General Public License as published by\n    the Free Software Foundation, either version 3 of the License, or\n    (at your option) any later version.\n\n    This program is distributed in the hope that it will be useful,\n    but WITHOUT ANY WARRANTY; without even the implied warranty of\n    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n    GNU General Public License for more details.\n\n    You should have received a copy of the GNU General Public License\n    along with this program.  If not, see <http://www.gnu.org/licenses/>.\n\nAlso add information on how to contact you by electronic and paper mail.\n\n  If the program does terminal interaction, make it output a short\nnotice like this when it starts in an interactive mode:\n\n    <program>  Copyright (C) <year>  <name of author>\n    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.\n    This is free software, and you are welcome to redistribute it\n    under certain conditions; type `show c' for details.\n\nThe hypothetical commands `show w' and `show c' should show the appropriate\nparts of the General Public License.  
Of course, your program's commands\nmight be different; for a GUI interface, you would use an \"about box\".\n\n  You should also get your employer (if you work as a programmer) or school,\nif any, to sign a \"copyright disclaimer\" for the program, if necessary.\nFor more information on this, and how to apply and follow the GNU GPL, see\n<http://www.gnu.org/licenses/>.\n\n  The GNU General Public License does not permit incorporating your program\ninto proprietary programs.  If your program is a subroutine library, you\nmay consider it more useful to permit linking proprietary applications with\nthe library.  If this is what you want to do, use the GNU Lesser General\nPublic License instead of this License.  But first, please read\n<http://www.gnu.org/philosophy/why-not-lgpl.html>.\n"
  },
  {
    "path": "external/bliss/bliss/COPYING.LESSER",
    "content": "                   GNU LESSER GENERAL PUBLIC LICENSE\n                       Version 3, 29 June 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n\n  This version of the GNU Lesser General Public License incorporates\nthe terms and conditions of version 3 of the GNU General Public\nLicense, supplemented by the additional permissions listed below.\n\n  0. Additional Definitions.\n\n  As used herein, \"this License\" refers to version 3 of the GNU Lesser\nGeneral Public License, and the \"GNU GPL\" refers to version 3 of the GNU\nGeneral Public License.\n\n  \"The Library\" refers to a covered work governed by this License,\nother than an Application or a Combined Work as defined below.\n\n  An \"Application\" is any work that makes use of an interface provided\nby the Library, but which is not otherwise based on the Library.\nDefining a subclass of a class defined by the Library is deemed a mode\nof using an interface provided by the Library.\n\n  A \"Combined Work\" is a work produced by combining or linking an\nApplication with the Library.  The particular version of the Library\nwith which the Combined Work was made is also called the \"Linked\nVersion\".\n\n  The \"Minimal Corresponding Source\" for a Combined Work means the\nCorresponding Source for the Combined Work, excluding any source code\nfor portions of the Combined Work that, considered in isolation, are\nbased on the Application, and not on the Linked Version.\n\n  The \"Corresponding Application Code\" for a Combined Work means the\nobject code and/or source code for the Application, including any data\nand utility programs needed for reproducing the Combined Work from the\nApplication, but excluding the System Libraries of the Combined Work.\n\n  1. 
Exception to Section 3 of the GNU GPL.\n\n  You may convey a covered work under sections 3 and 4 of this License\nwithout being bound by section 3 of the GNU GPL.\n\n  2. Conveying Modified Versions.\n\n  If you modify a copy of the Library, and, in your modifications, a\nfacility refers to a function or data to be supplied by an Application\nthat uses the facility (other than as an argument passed when the\nfacility is invoked), then you may convey a copy of the modified\nversion:\n\n   a) under this License, provided that you make a good faith effort to\n   ensure that, in the event an Application does not supply the\n   function or data, the facility still operates, and performs\n   whatever part of its purpose remains meaningful, or\n\n   b) under the GNU GPL, with none of the additional permissions of\n   this License applicable to that copy.\n\n  3. Object Code Incorporating Material from Library Header Files.\n\n  The object code form of an Application may incorporate material from\na header file that is part of the Library.  You may convey such object\ncode under terms of your choice, provided that, if the incorporated\nmaterial is not limited to numerical parameters, data structure\nlayouts and accessors, or small macros, inline functions and templates\n(ten or fewer lines in length), you do both of the following:\n\n   a) Give prominent notice with each copy of the object code that the\n   Library is used in it and that the Library and its use are\n   covered by this License.\n\n   b) Accompany the object code with a copy of the GNU GPL and this license\n   document.\n\n  4. 
Combined Works.\n\n  You may convey a Combined Work under terms of your choice that,\ntaken together, effectively do not restrict modification of the\nportions of the Library contained in the Combined Work and reverse\nengineering for debugging such modifications, if you also do each of\nthe following:\n\n   a) Give prominent notice with each copy of the Combined Work that\n   the Library is used in it and that the Library and its use are\n   covered by this License.\n\n   b) Accompany the Combined Work with a copy of the GNU GPL and this license\n   document.\n\n   c) For a Combined Work that displays copyright notices during\n   execution, include the copyright notice for the Library among\n   these notices, as well as a reference directing the user to the\n   copies of the GNU GPL and this license document.\n\n   d) Do one of the following:\n\n       0) Convey the Minimal Corresponding Source under the terms of this\n       License, and the Corresponding Application Code in a form\n       suitable for, and under terms that permit, the user to\n       recombine or relink the Application with a modified version of\n       the Linked Version to produce a modified Combined Work, in the\n       manner specified by section 6 of the GNU GPL for conveying\n       Corresponding Source.\n\n       1) Use a suitable shared library mechanism for linking with the\n       Library.  
A suitable mechanism is one that (a) uses at run time\n       a copy of the Library already present on the user's computer\n       system, and (b) will operate properly with a modified version\n       of the Library that is interface-compatible with the Linked\n       Version.\n\n   e) Provide Installation Information, but only if you would otherwise\n   be required to provide such information under section 6 of the\n   GNU GPL, and only to the extent that such information is\n   necessary to install and execute a modified version of the\n   Combined Work produced by recombining or relinking the\n   Application with a modified version of the Linked Version. (If\n   you use option 4d0, the Installation Information must accompany\n   the Minimal Corresponding Source and Corresponding Application\n   Code. If you use option 4d1, you must provide the Installation\n   Information in the manner specified by section 6 of the GNU GPL\n   for conveying Corresponding Source.)\n\n  5. Combined Libraries.\n\n  You may place library facilities that are a work based on the\nLibrary side by side in a single library together with other library\nfacilities that are not Applications and are not covered by this\nLicense, and convey such a combined library under terms of your\nchoice, if you do both of the following:\n\n   a) Accompany the combined library with a copy of the same work based\n   on the Library, uncombined with any other library facilities,\n   conveyed under the terms of this License.\n\n   b) Give prominent notice with the combined library that part of it\n   is a work based on the Library, and explaining where to find the\n   accompanying uncombined form of the same work.\n\n  6. Revised Versions of the GNU Lesser General Public License.\n\n  The Free Software Foundation may publish revised and/or new versions\nof the GNU Lesser General Public License from time to time. 
Such new\nversions will be similar in spirit to the present version, but may\ndiffer in detail to address new problems or concerns.\n\n  Each version is given a distinguishing version number. If the\nLibrary as you received it specifies that a certain numbered version\nof the GNU Lesser General Public License \"or any later version\"\napplies to it, you have the option of following the terms and\nconditions either of that published version or of any later version\npublished by the Free Software Foundation. If the Library as you\nreceived it does not specify a version number of the GNU Lesser\nGeneral Public License, you may choose any version of the GNU Lesser\nGeneral Public License ever published by the Free Software Foundation.\n\n  If the Library as you received it specifies that a proxy can decide\nwhether future versions of the GNU Lesser General Public License shall\napply, that proxy's public statement of acceptance of any version is\npermanent authorization for you to choose that version for the\nLibrary.\n"
  },
  {
    "path": "external/bliss/bliss/abgraph.hh",
    "content": "#ifndef BLISS_AB_GRAPH_HH\n#define BLISS_AB_GRAPH_HH\n#include <set>\n#include <list>\n#include <cstdio>\n#include <vector>\n#include <cassert>\n#include <climits>\n#include <algorithm>\n\nnamespace bliss {\n\tclass AbstractGraph;\n}\n\n#include \"kstack.hh\"\n#include \"kqueue.hh\"\n#include \"heap.hh\"\n#include \"orbit.hh\"\n#include \"partition.hh\"\n#include \"bignum.hh\"\n#include \"uintseqhash.hh\"\n\nnamespace bliss {\n\nvoid fatal_error(const char* fmt, ...) {\n\tva_list ap;\n\tva_start(ap, fmt);\n\tfprintf(stderr,\"Bliss fatal error: \");\n\tvfprintf(stderr, fmt, ap);\n\tfprintf(stderr, \"\\nAborting!\\n\");\n\tva_end(ap);\n\texit(1);\n}\n\n#define _INTERNAL_ERROR() fatal_error(\"%s:%d: internal error\",__FILE__,__LINE__)\n#define _OUT_OF_MEMORY() fatal_error(\"%s:%d: out of memory\",__FILE__,__LINE__)\n\ntypedef std::pair<unsigned,unsigned> Index;\n\nclass TreeNode {\n//friend class AbstractGraph;\npublic:\n\tunsigned int split_cell_first;\n\n\tint split_element;\n\tstatic const int SPLIT_START = -1;\n\tstatic const int SPLIT_END   = -2;\n\tPartition::BacktrackPoint partition_bt_point;\n\tunsigned int certificate_index;\n\tstatic const char NO = -1;\n\tstatic const char MAYBE = 0;\n\tstatic const char YES = 1;\n\t/* First path stuff */\n\tbool fp_on;\n\tbool fp_cert_equal;\n\tchar fp_extendable;\n\t/* Best path stuff */\n\tbool in_best_path;\n\tint cmp_to_best_path;\n\tunsigned int failure_recording_ival;\n\t/* Component recursion related data */\n\tunsigned int cr_cep_stack_size;\n\tunsigned int cr_cep_index;\n\tunsigned int cr_level;\n\tbool needs_long_prune;\n\tunsigned int long_prune_begin;\n\tstd::set<unsigned int, std::less<unsigned int> > long_prune_redundant;\n\tUintSeqHash eqref_hash;\n\tunsigned int subcertificate_length;\n};\n\ntypedef struct {\n\tunsigned int splitting_element;\n\tunsigned int certificate_index;\n\tunsigned int subcertificate_length;\n\tUintSeqHash eqref_hash;\n} PathInfo;\n\n// \\brief Statistics returned by 
the bliss search algorithm.\nclass Stats {\n\tfriend class AbstractGraph;\n\t/** \\internal The size of the automorphism group. */\n\tBigNum group_size;\n\t/** \\internal An approximation (due to possible overflows) of\n\t * the size of the automorphism group. */\n\tlong double group_size_approx;\n\t/** \\internal The number of nodes in the search tree. */\n\tlong unsigned int nof_nodes;\n\t/** \\internal The number of leaf nodes in the search tree. */\n\tlong unsigned int nof_leaf_nodes;\n\t/** \\internal The number of bad nodes in the search tree. */\n\tlong unsigned int nof_bad_nodes;\n\t/** \\internal The number of canonical representative updates. */\n\tlong unsigned int nof_canupdates;\n\t/** \\internal The number of generator permutations. */\n\tlong unsigned int nof_generators;\n\t/** \\internal The maximal depth of the search tree. */\n\tunsigned long int max_level;\n\t/** */\n\tvoid reset() {\n\t\tgroup_size.assign(1);\n\t\tgroup_size_approx = 1.0;\n\t\tnof_nodes = 0;\n\t\tnof_leaf_nodes = 0;\n\t\tnof_bad_nodes = 0;\n\t\tnof_canupdates = 0;\n\t\tnof_generators = 0;\n\t\tmax_level = 0;\n\t}\n\tpublic:\n\tStats() { reset(); }\n\t/** Print the statistics. */\n\tsize_t print(FILE* const fp) const {\n\t\tsize_t r = 0;\n\t\tr += fprintf(fp, \"Nodes:          %lu\\n\", nof_nodes);\n\t\tr += fprintf(fp, \"Leaf nodes:     %lu\\n\", nof_leaf_nodes);\n\t\tr += fprintf(fp, \"Bad nodes:      %lu\\n\", nof_bad_nodes);\n\t\tr += fprintf(fp, \"Canrep updates: %lu\\n\", nof_canupdates);\n\t\tr += fprintf(fp, \"Generators:     %lu\\n\", nof_generators);\n\t\tr += fprintf(fp, \"Max level:      %lu\\n\", max_level);\n\t\tr += fprintf(fp, \"|Aut|:          \")+group_size.print(fp)+fprintf(fp, \"\\n\");\n\t\tfflush(fp);\n\t\treturn r;\n\t}\n\t/** An approximation (due to possible overflows/rounding errors) of\n\t * the size of the automorphism group. */\n\tlong double get_group_size_approx() const {return group_size_approx;}\n\t/** The number of nodes in the search tree. 
*/\n\tlong unsigned int get_nof_nodes() const {return nof_nodes;}\n\t/** The number of leaf nodes in the search tree. */\n\tlong unsigned int get_nof_leaf_nodes() const {return nof_leaf_nodes;}\n\t/** The number of bad nodes in the search tree. */\n\tlong unsigned int get_nof_bad_nodes() const {return nof_bad_nodes;}\n\t/** The number of canonical representative updates. */\n\tlong unsigned int get_nof_canupdates() const {return nof_canupdates;}\n\t/** The number of generator permutations. */\n\tlong unsigned int get_nof_generators() const {return nof_generators;}\n\t/** The maximal depth of the search tree. */\n\tunsigned long int get_max_level() const {return max_level;}\n};\n\n// \\brief An abstract base class for different types of graphs.\nclass AbstractGraph {\n\tfriend class Partition;\npublic:\n\t//AbstractGraph();\n\t// Constructor and destructor routines for the abstract graph class\n\tAbstractGraph() {\n\t// Initialize stuff\n\tfirst_path_labeling = 0;\n\tfirst_path_labeling_inv = 0;\n\tbest_path_labeling = 0;\n\tbest_path_labeling_inv = 0;\n\tfirst_path_automorphism = 0;\n\tbest_path_automorphism = 0;\n\tin_search = false;\n\t// Default value for using \"long prune\"\n\topt_use_long_prune = true;\n\t// Default value for using failure recording\n\topt_use_failure_recording = true;\n\t// Default value for using component recursion\n\topt_use_comprec = true;\n\tverbose_level = 0;\n\tverbstr = stdout;\n\treport_hook = 0;\n\treport_user_param = 0;\n\t}\n\t//virtual ~AbstractGraph();\n\tvirtual ~AbstractGraph() {\n\tif(first_path_labeling) {\n\t\tfree(first_path_labeling); first_path_labeling = 0; }\n\tif(first_path_labeling_inv) {\n\t\tfree(first_path_labeling_inv); first_path_labeling_inv = 0; }\n\tif(best_path_labeling) {\n\t\tfree(best_path_labeling); best_path_labeling = 0; }\n\tif(best_path_labeling_inv) {\n\t\tfree(best_path_labeling_inv); best_path_labeling_inv = 0; }\n\tif(first_path_automorphism) {\n\t\tfree(first_path_automorphism); 
first_path_automorphism = 0; }\n\tif(best_path_automorphism) {\n\t\tfree(best_path_automorphism); best_path_automorphism = 0; }\n\treport_hook = 0;\n\treport_user_param = 0;\n\t}\n\n\t//Set the verbose output level for the algorithms.\n\t// \\param level  the level of verbose output, 0 means no verbose output\n\t//void set_verbose_level(const unsigned int level);\nvoid set_verbose_level(const unsigned int level) {\n\tverbose_level = level;\n}\n\n\t/**\n\t * Set the file stream for the verbose output.\n\t * \\param fp  the file stream; if null, no verbose output is written\n\t */\n\t//void set_verbose_file(FILE * const fp);\nvoid set_verbose_file(FILE* const fp) {\n\tverbstr = fp;\n}\n\t/**\n\t * Add a new vertex with color \\a color in the graph and return its index.\n\t */\n\tvirtual unsigned int add_vertex(const unsigned int color = 0) = 0;\n\n\t/**\n\t * Add an edge between vertices \\a source and \\a target.\n\t * Duplicate edges between vertices are ignored but try to avoid introducing\n\t * them in the first place as they are not ignored immediately but will\n\t * consume memory and computation resources for a while.\n\t */\n\tvirtual void add_edge(const unsigned int source, const unsigned int target, Index index) = 0;\n\n\t/**\n\t * Change the color of the vertex \\a vertex to \\a color.\n\t */\n\tvirtual void change_color(const unsigned int vertex, const unsigned int color) = 0;\n\n\t/**\n\t * Check whether \\a perm is an automorphism of this graph.\n\t * Unoptimized, mainly for debugging purposes.\n\t */\n\t//virtual bool is_automorphism(const std::vector<unsigned int>& perm) const;\n\nvirtual bool is_automorphism(const std::vector<unsigned int>& perm) const {\n\t_INTERNAL_ERROR();\n\treturn false;\n}\n\n\t/** Activate/deactivate failure recording.\n\t * May not be called during the search, i.e. 
from an automorphism reporting\n\t * hook function.\n\t * \\param active  if true, activate failure recording, deactivate otherwise\n\t */\n\tvoid set_failure_recording(const bool active) {assert(!in_search); opt_use_failure_recording = active;}\n\n\t/** Activate/deactivate component recursion.\n\t * The choice affects the computed canonical labelings;\n\t * therefore, if you want to compare whether two graphs are isomorphic by\n\t * computing and comparing (for equality) their canonical versions,\n\t * be sure to use the same choice for both graphs.\n\t * May not be called during the search, i.e. from an automorphism reporting\n\t * hook function.\n\t * \\param active  if true, activate component recursion, deactivate otherwise\n\t */\n\tvoid set_component_recursion(const bool active) {assert(!in_search); opt_use_comprec = active;}\n\n\t/**\n\t * Return the number of vertices in the graph.\n\t */\n\tvirtual unsigned int get_nof_vertices() const = 0;\n\n\t/**\n\t * Return a new graph that is the result of applying the permutation \\a perm\n\t * to this graph. 
This graph is not modified.\n\t * \\a perm must contain N=this.get_nof_vertices() elements and be a bijection\n\t * on {0,1,...,N-1}, otherwise the result is undefined or a segfault.\n\t */\n\tvirtual AbstractGraph* permute(const unsigned* const perm) const = 0;\n\tvirtual AbstractGraph* permute(const std::vector<unsigned int>& perm) const = 0;\n\n\t/**\n\t * Find a set of generators for the automorphism group of the graph.\n\t * The function \\a hook (if non-null) is called each time a new generator\n\t * for the automorphism group is found.\n\t * The first argument \\a user_param for the hook is the\n\t * \\a hook_user_param given below,\n\t * the second argument \\a n is the length of the automorphism (equal to\n\t * get_nof_vertices()) and\n\t * the third argument \\a aut is the automorphism\n\t * (a bijection on {0,...,get_nof_vertices()-1}).\n\t * The memory for the automorphism \\a aut will be invalidated immediately\n\t * after the return from the hook function;\n\t * if you want to use the automorphism later, you have to take a copy of it.\n\t * Do not call any member functions in the hook.\n\t * The search statistics are copied in \\a stats.\n\t */\n\t//void find_automorphisms(Stats& stats, void (*hook)(void* user_param, unsigned int n, const unsigned int* aut), void* hook_user_param);\nvoid find_automorphisms(Stats& stats, void (*hook)(void *user_param, unsigned int n, const unsigned int *aut), void *user_param) {\n\treport_hook = hook;\n\treport_user_param = user_param;\n\tsearch(false, stats);\n\tif(first_path_labeling) {\n\t\tfree(first_path_labeling);\n\t\tfirst_path_labeling = 0;\n\t}\n\tif(best_path_labeling) {\n\t\tfree(best_path_labeling);\n\t\tbest_path_labeling = 0;\n\t}\n}\n\t/**\n\t * Otherwise the same as find_automorphisms() except that\n\t * a canonical labeling of the graph (a bijection on\n\t * {0,...,get_nof_vertices()-1}) is returned.\n\t * The memory allocated for the returned canonical labeling will remain\n\t * valid only until the 
next call to a member function with the exception\n\t * that constant member functions (for example, bliss::Graph::permute()) can\n\t * be called without invalidating the labeling.\n\t * To compute the canonical version of an undirected graph, call this\n\t * function and then bliss::Graph::permute() with the returned canonical\n\t * labeling.\n\t * Note that the computed canonical version may depend on the applied version\n\t * of bliss as well as on some other options (for instance, the splitting\n\t * heuristic selected with bliss::Graph::set_splitting_heuristic()).\n\t */\n\t//const unsigned int* canonical_form(Stats& stats, void (*hook)(void* user_param, unsigned int n, const unsigned int* aut), void* hook_user_param);\nconst unsigned * canonical_form(Stats& stats, void (*hook)(void *user_param, unsigned int n, const unsigned int *aut), void *user_param) {\n\treport_hook = hook;\n\treport_user_param = user_param;\n\tsearch(true, stats);\n\treturn best_path_labeling;\n}\n\t/**\n\t * Write the graph to a file in a variant of the DIMACS format.\n\t * See the <A href=\"http://www.tcs.hut.fi/Software/bliss/\">bliss website</A>\n\t * for the definition of the file format.\n\t * Note that in the DIMACS file the vertices are numbered from 1 to N while\n\t * in this C++ API they are from 0 to N-1.\n\t * Thus the vertex n in the file corresponds to the vertex n-1 in the API.\n\t * \\param fp  the file stream where the graph is written\n\t */\n\tvirtual void write_dimacs(FILE * const fp) = 0;\n\n\t/**\n\t * Write the graph to a file in the graphviz dotty format.\n\t * \\param fp  the file stream where the graph is written\n\t */\n\tvirtual void write_dot(FILE * const fp) = 0;\n\n\t/**\n\t * Write the graph in a file in the graphviz dotty format.\n\t * Do nothing if the file cannot be written.\n\t * \\param file_name  the name of the file to which the graph is written\n\t */\n\tvirtual void write_dot(const char * const file_name) = 0;\n\n\t/**\n\t * Get a hash value for 
the graph.\n\t * \\return  the hash value\n\t */ \n\tvirtual unsigned int get_hash() = 0;\n\n\t/**\n\t * Disable/enable the \"long prune\" method.\n\t * The choice affects the computed canonical labelings;\n\t * therefore, if you want to compare whether two graphs are isomorphic by\n\t * computing and comparing (for equality) their canonical versions,\n\t * be sure to use the same choice for both graphs.\n\t * May not be called during the search, i.e. from an automorphism reporting\n\t * hook function.\n\t * \\param active  if true, activate \"long prune\", deactivate otherwise\n\t */\n\tvoid set_long_prune_activity(const bool active) {\n\t\tassert(!in_search);\n\t\topt_use_long_prune = active;\n\t}\n\nprotected:\n\t/** \\internal\n\t * How much verbose output is produced (0 means none) */\n\tunsigned int verbose_level;\n\t/** \\internal\n\t * The output stream for verbose output. */\n\tFILE *verbstr;\n\nprotected:\n\t/** \\internal\n\t * The ordered partition used in the search algorithm. 
*/\n\tPartition p;\n\n\t/** \\internal\n\t * Whether the search for automorphisms and a canonical labeling is\n\t * in progress.\n\t */\n\tbool in_search;\n\n\t/** \\internal\n\t * Is failure recording in use?\n\t */\n\tbool opt_use_failure_recording;\n\t/* The \"tree-specific\" invariant value for the point when current path\n\t * got different from the first path */\n\tunsigned int failure_recording_fp_deviation;\n\n\t/** \\internal\n\t * Is component recursion in use?\n\t */\n\tbool opt_use_comprec;\n\n\tunsigned int refine_current_path_certificate_index;\n\tbool refine_compare_certificate;\n\tbool refine_equal_to_first;\n\tunsigned int refine_first_path_subcertificate_end;\n\tint refine_cmp_to_best;\n\tunsigned int refine_best_path_subcertificate_end;\n\tstatic const unsigned int CERT_SPLIT = 0; //UINT_MAX;\n\tstatic const unsigned int CERT_EDGE  = 1; //UINT_MAX-1;\n\t/** \\internal\n\t * Add a triple (v1,v2,v3) in the certificate.\n\t * May modify refine_equal_to_first and refine_cmp_to_best.\n\t * May also update eqref_hash and failure_recording_fp_deviation. */\n\t//void cert_add(const unsigned int v1, const unsigned int v2, const unsigned int v3);\n// Certificate building\nvoid cert_add(const unsigned int v1, const unsigned int v2, const unsigned int v3) {\n\tif(refine_compare_certificate) {\n\t\tif(refine_equal_to_first) {\n\t\t\t/* So far equivalent to the first path... 
*/\n\t\t\tunsigned int index = certificate_current_path.size();\n\t\t\tif(index >= refine_first_path_subcertificate_end) {\n\t\t\t\trefine_equal_to_first = false;\n\t\t\t} else if(certificate_first_path[index] != v1) {\n\t\t\t\trefine_equal_to_first = false;\n\t\t\t} else if(certificate_first_path[++index] != v2) {\n\t\t\t\trefine_equal_to_first = false;\n\t\t\t} else if(certificate_first_path[++index] != v3) {\n\t\t\t\trefine_equal_to_first = false;\n\t\t\t} if(opt_use_failure_recording and !refine_equal_to_first) {\n\t\t\t\t/* We just became different from the first path,\n\t\t\t\t * remember the deviation point tree-specific invariant\n\t\t\t\t * for the use of failure recording */\n\t\t\t\tUintSeqHash h;\n\t\t\t\th.update(v1);\n\t\t\t\th.update(v2);\n\t\t\t\th.update(v3);\n\t\t\t\th.update(index);\n\t\t\t\th.update(eqref_hash.get_value());\n\t\t\t\tfailure_recording_fp_deviation = h.get_value();\n\t\t\t}\n\t\t}\n\t\tif(refine_cmp_to_best == 0) {\n\t\t\t/* So far equivalent to the current best path... 
*/\n\t\t\tunsigned int index = certificate_current_path.size();\n\t\t\tif(index >= refine_best_path_subcertificate_end) {\n\t\t\t\trefine_cmp_to_best = 1;\n\t\t\t} else if(v1 > certificate_best_path[index]) {\n\t\t\t\trefine_cmp_to_best = 1;\n\t\t\t} else if(v1 < certificate_best_path[index]) {\n\t\t\t\trefine_cmp_to_best = -1;\n\t\t\t} else if(v2 > certificate_best_path[++index]) {\n\t\t\t\trefine_cmp_to_best = 1;\n\t\t\t} else if(v2 < certificate_best_path[index]) {\n\t\t\t\trefine_cmp_to_best = -1;\n\t\t\t} else if(v3 > certificate_best_path[++index]) {\n\t\t\t\trefine_cmp_to_best = 1;\n\t\t\t} else if(v3 < certificate_best_path[index]) {\n\t\t\t\trefine_cmp_to_best = -1;\n\t\t\t}\n\t\t}\n\t\tif((refine_equal_to_first == false) and (refine_cmp_to_best < 0))\n\t\t\treturn;\n\t}\n\t/* Update the current path certificate */\n\tcertificate_current_path.push_back(v1);\n\tcertificate_current_path.push_back(v2);\n\tcertificate_current_path.push_back(v3);\n}\n\t/** \\internal\n\t * Add a redundant triple (v1,v2,v3) in the certificate.\n\t * Can also just discard the triple.\n\t * May modify refine_equal_to_first and refine_cmp_to_best.\n\t * May also update eqref_hash and failure_recording_fp_deviation. 
*/\n\t//void cert_add_redundant(const unsigned int x, const unsigned int y, const unsigned int z);\nvoid cert_add_redundant(const unsigned int v1, const unsigned int v2, const unsigned int v3) {\n\treturn cert_add(v1, v2, v3);\n}\n\t/**\\internal\n\t * Is the long prune method in use?\n\t */\n\tbool opt_use_long_prune;\n\t/**\\internal\n\t * Maximum amount of memory (in megabytes) available for\n\t * the long prune method\n\t */\n\tstatic const unsigned int long_prune_options_max_mem = 50;\n\t/**\\internal\n\t * Maximum amount of automorphisms stored for the long prune method;\n\t * less than this is stored if the memory limit above is reached first\n\t */\n\tstatic const unsigned int long_prune_options_max_stored_auts = 100;\n\n\tunsigned int long_prune_max_stored_autss;\n\tstd::vector<std::vector<bool> *> long_prune_fixed;\n\tstd::vector<std::vector<bool> *> long_prune_mcrs;\n\tstd::vector<bool> long_prune_temp;\n\tunsigned int long_prune_begin;\n\tunsigned int long_prune_end;\n\t/** \\internal\n\t * Initialize the \"long prune\" data structures.\n\t */\n\t//void long_prune_init();\n\t/** \\internal\n\t * Release the memory allocated for \"long prune\" data structures.\n\t */\n\t//void long_prune_deallocate();\n\t//void long_prune_add_automorphism(const unsigned int *aut);\n\t//std::vector<bool>& long_prune_get_fixed(const unsigned int index);\n\t//std::vector<bool>& long_prune_allocget_fixed(const unsigned int index);\n\t//std::vector<bool>& long_prune_get_mcrs(const unsigned int index);\n\t//std::vector<bool>& long_prune_allocget_mcrs(const unsigned int index);\n\t/** \\internal\n\t * Swap the i:th and j:th stored automorphism information;\n\t * i and j must be \"in window\", i.e. 
in [long_prune_begin,long_prune_end[\n\t */\n\t//void long_prune_swap(const unsigned int i, const unsigned int j);\n//Long prune code\nvoid long_prune_init() {\n\tconst unsigned int N = get_nof_vertices();\n\tlong_prune_temp.clear();\n\tlong_prune_temp.resize(N);\n\t/* Of how many automorphisms we can store information in\n\t   the predefined, fixed amount of memory? */\n\tconst unsigned int nof_fitting_in_max_mem =\n\t\t(long_prune_options_max_mem * 1024 * 1024) / (((N * 2) / 8)+1);\n\tlong_prune_max_stored_autss = long_prune_options_max_stored_auts;\n\t/* Had some problems with g++ in using (a<b)?a:b when constants involved,\n\t   so had to make this in a stupid way... */\n\tif(nof_fitting_in_max_mem < long_prune_options_max_stored_auts)\n\t\tlong_prune_max_stored_autss = nof_fitting_in_max_mem;\n\tlong_prune_deallocate();\n\tlong_prune_fixed.resize(N, 0);\n\tlong_prune_mcrs.resize(N, 0);\n\tlong_prune_begin = 0;\n\tlong_prune_end = 0;\n}\n\nvoid long_prune_deallocate() {\n\twhile(!long_prune_fixed.empty()) {\n\t\tdelete long_prune_fixed.back();\n\t\tlong_prune_fixed.pop_back();\n\t}\n\twhile(!long_prune_mcrs.empty()) {\n\t\tdelete long_prune_mcrs.back();\n\t\tlong_prune_mcrs.pop_back();\n\t}\n}\n\nvoid long_prune_swap(const unsigned int i, const unsigned int j) {\n\tconst unsigned int real_i = i % long_prune_max_stored_autss;\n\tconst unsigned int real_j = j % long_prune_max_stored_autss;\n\tstd::vector<bool>* tmp = long_prune_fixed[real_i];\n\tlong_prune_fixed[real_i] = long_prune_fixed[real_j];\n\tlong_prune_fixed[real_j] = tmp;\n\ttmp = long_prune_mcrs[real_i];\n\tlong_prune_mcrs[real_i] = long_prune_mcrs[real_j];\n\tlong_prune_mcrs[real_j] = tmp;\n}\n\nstd::vector<bool>& long_prune_allocget_fixed(const unsigned int index) {\n\tconst unsigned int i = index % long_prune_max_stored_autss;\n\tif(!long_prune_fixed[i])\n\t\tlong_prune_fixed[i] = new std::vector<bool>(get_nof_vertices());\n\treturn *long_prune_fixed[i];\n}\n\nstd::vector<bool>& 
long_prune_get_fixed(const unsigned int index) {\n\treturn *long_prune_fixed[index % long_prune_max_stored_autss];\n}\n\nstd::vector<bool>& long_prune_allocget_mcrs(const unsigned int index) {\n\tconst unsigned int i = index % long_prune_max_stored_autss;\n\tif(!long_prune_mcrs[i])\n\t\tlong_prune_mcrs[i] = new std::vector<bool>(get_nof_vertices());\n\treturn *long_prune_mcrs[i];\n}\n\nstd::vector<bool>& long_prune_get_mcrs(const unsigned int index) {\n\treturn *long_prune_mcrs[index % long_prune_max_stored_autss];\n}\n\nvoid long_prune_add_automorphism(const unsigned int* aut) {\n\tif(long_prune_max_stored_autss == 0) return;\n\tconst unsigned int N = get_nof_vertices();\n\t/* If the buffer of stored auts is full, remove the oldest aut */\n\tif(long_prune_end - long_prune_begin == long_prune_max_stored_autss) {\n\t\tlong_prune_begin++;\n\t}\n\tlong_prune_end++;\n\tstd::vector<bool>& fixed = long_prune_allocget_fixed(long_prune_end-1);\n\tstd::vector<bool>& mcrs = long_prune_allocget_mcrs(long_prune_end-1);\n\t/* Mark nodes that are (i) fixed or (ii) minimal orbit representatives\n\t * under the automorphism 'aut' */\n\tfor(unsigned int i = 0; i < N; i++) {\n\t\tfixed[i] = (aut[i] == i);\n\t\tif(long_prune_temp[i] == false) {\n\t\t\tmcrs[i] = true;\n\t\t\tunsigned int j = aut[i];\n\t\t\twhile(j != i) {\n\t\t\t\tlong_prune_temp[j] = true;\n\t\t\t\tj = aut[j];\n\t\t\t}\n\t\t} else {\n\t\t\tmcrs[i] = false;\n\t\t}\n\t\t/* Clear the temp array on-the-fly... 
*/\n\t\tlong_prune_temp[i] = false;\n\t}\n}\n\n\t/*\n\t * Data structures and routines for refining the partition p into an equitable one\n\t */\n\tHeap neighbour_heap;\n\tvirtual bool split_neighbourhood_of_unit_cell(Partition::Cell *) = 0;\n\tvirtual bool split_neighbourhood_of_cell(Partition::Cell * const) = 0;\n\t//void refine_to_equitable();\n\t//void refine_to_equitable(Partition::Cell * const unit_cell);\n\t//void refine_to_equitable(Partition::Cell * const unit_cell1, Partition::Cell * const unit_cell2);\nvoid refine_to_equitable() {\n\t/* Start refinement from all cells -> push 'em all in the splitting queue */\n\tfor(Partition::Cell* cell = p.first_cell; cell; cell = cell->next)\n\t\tp.splitting_queue_add(cell);\n\tdo_refine_to_equitable();\n}\n\nvoid refine_to_equitable(Partition::Cell* const unit_cell) {\n\tp.splitting_queue_add(unit_cell);\n\tdo_refine_to_equitable();\n}\n\nvoid refine_to_equitable(Partition::Cell* const unit_cell1, Partition::Cell* const unit_cell2) {\n\tp.splitting_queue_add(unit_cell1);\n\tp.splitting_queue_add(unit_cell2);\n\tdo_refine_to_equitable();\n}\n\t/** \\internal\n\t * \\return false if it was detected that the current certificate\n\t *         is different from the first and/or best (whether this is checked\n\t *         depends on in_search and refine_compare_certificate flags).\n\t */\n\t//bool do_refine_to_equitable();\nbool do_refine_to_equitable() {\n\teqref_hash.reset();\n\twhile(!p.splitting_queue_is_empty()) {\n\t\tPartition::Cell* const cell = p.splitting_queue_pop();\n\t\tif(cell->is_unit()) {\n\t\t\tif(in_search) {\n\t\t\t\tconst unsigned int index = cell->first;\n\t\t\t\tif(first_path_automorphism) {\n\t\t\t\t\t/* Build the (potential) automorphism on-the-fly */\n\t\t\t\t\tfirst_path_automorphism[first_path_labeling_inv[index]] =\n\t\t\t\t\t\tp.elements[index];\n\t\t\t\t}\n\t\t\t\tif(best_path_automorphism) {\n\t\t\t\t\t/* Build the (potential) automorphism on-the-fly 
*/\n\t\t\t\t\tbest_path_automorphism[best_path_labeling_inv[index]] =\n\t\t\t\t\t\tp.elements[index];\n\t\t\t\t}\n\t\t\t}\n\t\t\tconst bool worse = split_neighbourhood_of_unit_cell(cell);\n\t\t\tif(in_search and worse) goto worse_exit;\n\t\t}\n\t\telse {\n\t\t\tconst bool worse = split_neighbourhood_of_cell(cell);\n\t\t\tif(in_search and worse) goto worse_exit;\n\t\t}\n\t}\n\treturn true;\nworse_exit:\n\t/* Clear splitting_queue */\n\tp.splitting_queue_clear();\n\treturn false;\n}\n\tunsigned int eqref_max_certificate_index;\n\t/** \\internal\n\t * Whether eqref_hash is updated during equitable refinement process.\n\t */\n\tbool compute_eqref_hash;\n\tUintSeqHash eqref_hash;\n\t/** \\internal\n\t * Check whether the current partition p is equitable.\n\t * Performance: very slow, use only for debugging purposes.\n\t */\n\tvirtual bool is_equitable() const = 0;\n\n\tunsigned int *first_path_labeling;\n\tunsigned int *first_path_labeling_inv;\n\tOrbit         first_path_orbits;\n\tunsigned int *first_path_automorphism;\n\tunsigned int *best_path_labeling;\n\tunsigned int *best_path_labeling_inv;\n\tOrbit         best_path_orbits;\n\tunsigned int *best_path_automorphism;\n\n\t//void update_labeling(unsigned int * const lab);\n/** \\internal\n * Assign the labeling induced by the current partition 'this.p' to\n * \\a labeling.\n * That is, if the partition is [[2,0],[1]],\n * then \\a labeling will map 0 to 1, 1 to 2, and 2 to 0.\n */\nvoid update_labeling(unsigned int* const labeling) {\n\tconst unsigned int N = get_nof_vertices();\n\tunsigned int* ep = p.elements;\n\tfor(unsigned int i = 0; i < N; i++, ep++)\n\t\tlabeling[*ep] = i;\n}\n\t//void update_labeling_and_its_inverse(unsigned int * const lab, unsigned int * const lab_inv);\n/** \\internal\n * The same as update_labeling() except that the inverse of the labeling\n * is also produced and assigned to \\a labeling_inv.\n */\nvoid update_labeling_and_its_inverse(unsigned int* const labeling, unsigned int* const 
labeling_inv) {\n\tconst unsigned int N = get_nof_vertices();\n\tunsigned int* ep = p.elements;\n\tunsigned int* clip = labeling_inv;\n\tfor(unsigned int i = 0; i < N; ) {\n\t\tlabeling[*ep] = i;\n\t\ti++;\n\t\t*clip = *ep;\n\t\tep++;\n\t\tclip++;\n\t}\n}\n\tvoid update_orbit_information(Orbit &o, const unsigned int *perm) {\n\t\tconst unsigned int N = get_nof_vertices();\n\t\tfor(unsigned int i = 0; i < N; i++)\n\t\t\tif(perm[i] != i) o.merge_orbits(i, perm[i]);\n\t}\n\t//void reset_permutation(unsigned int *perm);\n\t/* Mainly for debugging purposes */\n\t//virtual bool is_automorphism(unsigned int* const perm);\n\n// \\internal\n// Reset the permutation \\a perm to the identity permutation.\nvoid reset_permutation(unsigned int* perm) {\n\tconst unsigned int N = get_nof_vertices();\n\tfor(unsigned int i = 0; i < N; i++, perm++)\n\t\t*perm = i;\n}\n\nvirtual bool is_automorphism(unsigned int* const perm) {\n\t_INTERNAL_ERROR();\n\treturn false;\n}\n\tstd::vector<unsigned int> certificate_current_path;\n\tstd::vector<unsigned int> certificate_first_path;\n\tstd::vector<unsigned int> certificate_best_path;\n\tunsigned int certificate_index;\n\tvirtual void initialize_certificate() = 0;\n\tvirtual void remove_duplicate_edges() = 0;\n\tvirtual void make_initial_equitable_partition() = 0;\n\tvirtual Partition::Cell* find_next_cell_to_be_splitted(Partition::Cell *cell) = 0;\n\t//void search(const bool canonical, Stats &stats);\n#include \"search.h\"\n\tvoid (*report_hook)(void *user_param, unsigned int n, const unsigned int *aut);\n\tvoid *report_user_param;\n\t/*\n\t *\n\t * Nonuniform component recursion (NUCR)\n\t *\n\t */\n\n\t/** The currently traversed component */\n\tunsigned int cr_level;\n\n\t/** \\internal\n\t * The \"Component End Point\" data structure\n\t */\n\tclass CR_CEP {\n\t\tpublic:\n\t\t\t/** At which level in the search was this CEP created */\n\t\t\tunsigned int creation_level;\n\t\t\t/** The current component has been fully traversed when the 
partition has\n\t\t\t * this many discrete cells left */\n\t\t\tunsigned int discrete_cell_limit;\n\t\t\t/** The component to be traversed after the current one */\n\t\t\tunsigned int next_cr_level;\n\t\t\t/** The next component end point */\n\t\t\tunsigned int next_cep_index;\n\t\t\tbool first_checked;\n\t\t\tbool best_checked;\n\t};\n\t/** \\internal\n\t * A stack for storing Component End Points\n\t */\n\tstd::vector<CR_CEP> cr_cep_stack;\n\n\t/** \\internal\n\t * Find the first non-uniformity component at the component recursion\n\t * level \\a level.\n\t * The component is stored in \\a cr_component.\n\t * If no component is found, \\a cr_component is empty.\n\t * Returns false if all the cells in the component recursion level \\a level\n\t * were discrete.\n\t * Modifies the max_ival and max_ival_count fields of Partition:Cell\n\t * (assumes that they are 0 when called and\n\t *  quarantees that they are 0 when returned).\n\t */\n\tvirtual bool nucr_find_first_component(const unsigned int level) = 0;\n\tvirtual bool nucr_find_first_component(const unsigned int level,\n\t\t\tstd::vector<unsigned int>& component,\n\t\t\tunsigned int& component_elements,\n\t\t\tPartition::Cell*& sh_return) = 0;\n\t/** \\internal\n\t * The non-uniformity component found by nucr_find_first_component()\n\t * is stored here.\n\t */\n\tstd::vector<unsigned int> cr_component;\n\t/** \\internal\n\t * The number of vertices in the component \\a cr_component\n\t */\n\tunsigned int cr_component_elements;\n};\n\n// Assumes that the elements in the cell are sorted according to their invariant values.\nPartition::Cell* Partition::split_cell(Partition::Cell* const original_cell) {\n  Partition::Cell* cell = original_cell;\n  const bool original_cell_was_in_splitting_queue =\n    original_cell->in_splitting_queue;\n  Partition::Cell* largest_new_cell = 0;\n\n  while(true) {\n      unsigned int* ep = elements + cell->first;\n      const unsigned int* const lp = ep + cell->length;\n      const 
unsigned int ival = invariant_values[*ep];\n      invariant_values[*ep] = 0;\n      element_to_cell_map[*ep] = cell;\n      in_pos[*ep] = ep;\n      ep++;\n      while(ep < lp) {\n\t  const unsigned int e = *ep;\n\t  if(invariant_values[e] != ival)\n\t    break;\n\t  invariant_values[e] = 0;\n\t  in_pos[e] = ep;\n\t  ep++;\n\t  element_to_cell_map[e] = cell;\n\t}\n      if(ep == lp) break;\n      Partition::Cell* const new_cell = aux_split_in_two(cell, (ep - elements) - cell->first);\n      if(graph and graph->compute_eqref_hash) {\n\t  graph->eqref_hash.update(new_cell->first);\n\t  graph->eqref_hash.update(new_cell->length);\n\t  graph->eqref_hash.update(ival);\n\t}\n      /* Add cells in splitting_queue */\n      assert(!new_cell->is_in_splitting_queue());\n      if(original_cell_was_in_splitting_queue)\n\t{\n\t  /* In this case, all new cells are inserted in splitting_queue */\n\t  assert(cell->is_in_splitting_queue());\n\t  splitting_queue_add(new_cell);\n\t}\n      else\n\t{\n\t  /* Otherwise, we can omit one new cell from splitting_queue */\n\t  assert(!cell->is_in_splitting_queue());\n\t  if(largest_new_cell == 0) {\n\t    largest_new_cell = cell;\n\t  } else {\n\t    assert(!largest_new_cell->is_in_splitting_queue());\n\t    if(cell->length > largest_new_cell->length) {\n\t      splitting_queue_add(largest_new_cell);\n\t      largest_new_cell = cell;\n\t    } else {\n\t      splitting_queue_add(cell);\n\t    }\n\t  }\n\t}\n      /* Process the rest of the cell */\n      cell = new_cell;\n    }\n\n  \n  if(original_cell == cell) {\n    /* All the elements in cell had the same invariant value */\n    return cell;\n  }\n\n  /* Add cells in splitting_queue */\n  if(!original_cell_was_in_splitting_queue) {\n      /* Also consider the last new cell */\n      assert(largest_new_cell);\n      if(cell->length > largest_new_cell->length) {\n\t  splitting_queue_add(largest_new_cell);\n\t  largest_new_cell = cell;\n\t} else {\n\t  splitting_queue_add(cell);\n\t}\n     
 if(largest_new_cell->is_unit()) {\n\t  /* Needed in certificate computation */\n\t  splitting_queue_add(largest_new_cell);\n\t}\n    }\n  return cell;\n}\n\n}\n\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/bignum.hh",
    "content": "#ifndef BLISS_BIGNUM_HH\n#define BLISS_BIGNUM_HH\n\n/*\n  Copyright (c) 2003-2015 Tommi Junttila\n  Released under the GNU Lesser General Public License version 3.\n  \n  This file is part of bliss.\n  \n  bliss is free software: you can redistribute it and/or modify\n  it under the terms of the GNU Lesser General Public License as published by\n  the Free Software Foundation, version 3 of the License.\n\n  bliss is distributed in the hope that it will be useful,\n  but WITHOUT ANY WARRANTY; without even the implied warranty of\n  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n  GNU Lesser General Public License for more details.\n\n  You should have received a copy of the GNU Lesser General Public License\n  along with bliss.  If not, see <http://www.gnu.org/licenses/>.\n*/\n\n#if defined(BLISS_USE_GMP)\n#include <gmp.h>\n#endif\n\n#include <cstdlib>\n#include <cstdio>\n#include \"defs.hh\"\n\nnamespace bliss {\n\n/**\n * \\brief A very simple class for big integers (or approximation of them).\n *\n * If the compile time flag BLISS_USE_GMP is set,\n * then the GNU Multiple Precision Arithmetic library (GMP) is used to\n * obtain arbitrary precision, otherwise \"long double\" is used to\n * approximate big integers.\n */\n\n#if defined(BLISS_USE_GMP)\n\nclass BigNum\n{\n  mpz_t v;\npublic:\n  /**\n   * Create a new big number and set it to zero.\n   */\n  BigNum() {mpz_init(v); }\n\n  /**\n   * Destroy the number.\n   */\n  ~BigNum() {mpz_clear(v); }\n\n  /**\n   * Set the number to \\a n.\n   */\n  void assign(const int n) {mpz_set_si(v, n); }\n\n  /**\n   * Multiply the number with \\a n.\n   */\n  void multiply(const int n) {mpz_mul_si(v, v, n); }\n\n  /**\n   * Print the number in the file stream \\a fp.\n   */\n  size_t print(FILE* const fp) const {return mpz_out_str(fp, 10, v); }\n};\n\n#else\n\nclass BigNum\n{\n  long double v;\npublic:\n  /**\n   * Create a new big number and set it to zero.\n   */\n  BigNum(): v(0.0) {}\n\n  
/**\n   * Set the number to \\a n.\n   */\n  void assign(const int n) {v = (long double)n; }\n\n  /**\n   * Multiply the number with \\a n.\n   */\n  void multiply(const int n) {v *= (long double)n; }\n\n  /**\n   * Print the number in the file stream \\a fp.\n   */\n  size_t print(FILE* const fp) const {return fprintf(fp, \"%Lg\", v); }\n};\n\n#endif\n\n} //namespace bliss\n\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/defs.hh",
    "content": "#ifndef BLISS_DEFS_HH\n#define BLISS_DEFS_HH\n\n#include <cassert>\n#include <cstdarg>\n\n/*\n  Copyright (c) 2003-2015 Tommi Junttila\n  Released under the GNU Lesser General Public License version 3.\n  \n  This file is part of bliss.\n  \n  bliss is free software: you can redistribute it and/or modify\n  it under the terms of the GNU Lesser General Public License as published by\n  the Free Software Foundation, version 3 of the License.\n\n  bliss is distributed in the hope that it will be useful,\n  but WITHOUT ANY WARRANTY; without even the implied warranty of\n  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n  GNU Lesser General Public License for more details.\n\n  You should have received a copy of the GNU Lesser General Public License\n  along with bliss.  If not, see <http://www.gnu.org/licenses/>.\n*/\n\nnamespace bliss {\n\n/**\n * The version number of bliss.\n */\nstatic const char * const version = \"0.73\";\n\n/*\n * If a fatal error (out of memory, internal error) is encountered,\n * this function is called.\n * There should not be a return from this function but exit or\n * a jump to code that deallocates the AbstractGraph instance that called this.\n */\nvoid fatal_error(const char* fmt, ...);\n\n\n#if defined(BLISS_DEBUG)\n#define BLISS_CONSISTENCY_CHECKS\n#define BLISS_EXPENSIVE_CONSISTENCY_CHECKS\n#endif\n\n\n#if defined(BLISS_CONSISTENCY_CHECKS)\n/* Force a check that the found automorphisms are valid */\n#define BLISS_VERIFY_AUTOMORPHISMS\n#endif\n\n\n#if defined(BLISS_CONSISTENCY_CHECKS)\n/* Force a check that the generated partitions are equitable */\n#define BLISS_VERIFY_EQUITABLEDNESS\n#endif\n\n} // namespace bliss\n\n\n\n/*! 
\\mainpage Bliss\n *\n * \\section intro_sec Introduction\n *\n * This is the source code documentation of bliss,\n * produced by running <A href=\"http://www.doxygen.org\">doxygen</A> in\n * the source directory.\n * The algorithms and data structures used in bliss are documented in\n * the papers found at the\n * <A href=\"http://www.tcs.hut.fi/Software/bliss\">bliss web site</A>.\n *\n *\n * \\section compile_sec Compiling\n *\n * Compiling bliss on Linux should be easy: just execute\n * \\code\n * make\n * \\endcode\n * in the bliss source directory.\n * This will produce the executable program \\c bliss as well as\n * the library file \\c libbliss.a that can be linked into other programs.\n * If you have the <A href=\"http://gmplib.org/\">GNU Multiple Precision\n * Arithmetic Library</A> (GMP) installed on your machine, you can also use\n * \\code\n * make gmp\n * \\endcode\n * to enable exact computation of automorphism group sizes.\n *\n * When linking the bliss library \\c libbliss.a into other programs,\n * remember to include the standard C++ library\n * (and the GMP library if you compiled bliss to include it).\n * For instance,\n * \\code gcc -o test test.c -lstdc++ -lgmp -lbliss\\endcode\n *\n * \\section cppapi_sec The C++ language API\n *\n * The C++ language API is the main API to bliss;\n * all other APIs are just more or less complete variants of it.\n * The C++ API consists basically of the public methods in\n * the classes bliss::AbstractGraph, bliss::Graph, and bliss::Digraph.\n * For an example of its use,\n * see the \\ref executable \"source of the bliss executable\".\n *\n *\n * \\section capi_sec The C language API\n *\n * The C language API is given in the file bliss_C.h.\n * It is currently more restricted than the C++ API so\n * consider using the C++ API whenever possible.\n */\n\n\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/graph.hh",
    "content": "#ifndef BLISS_GRAPH_HH\n#define BLISS_GRAPH_HH\n\n#include \"abgraph.hh\"\n\nnamespace bliss {\n\n#ifdef USE_DOMAIN\ntypedef std::pair<unsigned, Index> IndexEdge;\n#else\ntypedef unsigned IndexEdge;\n#endif\n\n#if defined(BLISS_CONSISTENCY_CHECKS)\nstatic bool is_permutation(const unsigned int N, const unsigned int* perm) {\n\tif(N == 0) return true;\n\tstd::vector<bool> m(N, false);\n\tfor(unsigned int i = 0; i < N; i++) {\n\t\tif(perm[i] >= N) return false;\n\t\tif(m[perm[i]]) return false;\n\t\tm[perm[i]] = true;\n\t}\n\treturn true;\n}\n#endif\nstatic bool is_permutation(const std::vector<unsigned int>& perm) {\n\tconst unsigned int N = perm.size();\n\tif(N == 0)\n\t\treturn true;\n\tstd::vector<bool> m(N, false);\n\tfor(unsigned int i = 0; i < N; i++) {\n\t\tif(perm[i] >= N) return false;\n\t\tif(m[perm[i]]) return false;\n\t\tm[perm[i]] = true;\n\t}\n\treturn true;\n}\n\n// \\brief The class for undirected, vertex colored graphs.\n// Multiple edges between vertices are not allowed (i.e., are ignored).\nclass Graph : public AbstractGraph {\npublic:\n\t/**\n\t * The possible splitting heuristics.\n\t * The selected splitting heuristics affects the computed canonical\n\t * labelings; therefore, if you want to compare whether two graphs\n\t * are isomorphic by computing and comparing (for equality) their\n\t * canonical versions, be sure to use the same splitting heuristics\n\t * for both graphs.\n\t */\n\ttypedef enum {\n\t\t/** First non-unit cell.\n\t\t * Very fast but may result in large search spaces on difficult graphs.\n\t\t * Use for large but easy graphs. */\n\t\tshs_f = 0,\n\t\t/** First smallest non-unit cell.\n\t\t * Fast, should usually produce smaller search spaces than shs_f. */\n\t\tshs_fs,\n\t\t/** First largest non-unit cell.\n\t\t * Fast, should usually produce smaller search spaces than shs_f. 
*/\n\t\tshs_fl,\n\t\t/** First maximally non-trivially connected non-unit cell.\n\t\t * Not so fast, should usually produce smaller search spaces than shs_f,\n\t\t * shs_fs, and shs_fl. */\n\t\tshs_fm,\n\t\t/** First smallest maximally non-trivially connected non-unit cell.\n\t\t * Not so fast, should usually produce smaller search spaces than shs_f,\n\t\t * shs_fs, and shs_fl. */\n\t\tshs_fsm,\n\t\t/** First largest maximally non-trivially connected non-unit cell.\n\t\t * Not so fast, should usually produce smaller search spaces than shs_f,\n\t\t * shs_fs, and shs_fl. */\n\t\tshs_flm\n\t} SplittingHeuristic;\n\n\t//moved from protected scope by Zhiqiang\n\tclass Vertex {\n\t\tpublic:\n\t\t\tVertex() { color = 0;}\n\t\t\t~Vertex(){ ; }\n#ifdef USE_DOMAIN\n\t\t\tvoid add_edge(const unsigned other_vertex, Index index) {\n\t\t\t\tedges.push_back(std::make_pair(other_vertex, index));\n#else\n\t\t\tvoid add_edge(const unsigned other_vertex) {\n\t\t\t\tedges.push_back(other_vertex);\n#endif\n\t\t\t}\n\t\t\tvoid remove_duplicate_edges(std::vector<bool>& tmp) {\n#if defined(BLISS_CONSISTENCY_CHECKS)\n\t\t\t\t/* Pre-conditions  */\n\t\t\t\tfor(unsigned int i = 0; i < tmp.size(); i++) assert(tmp[i] == false);\n#endif\n\t\t\t\tfor(std::vector<IndexEdge>::iterator iter = edges.begin(); iter != edges.end(); ) {\n#ifdef USE_DOMAIN\n\t\t\t\t\tconst unsigned int dest_vertex = iter->first; //cxh\n#else\n\t\t\t\t\tconst unsigned int dest_vertex = *iter;\n#endif\n\t\t\t\t\tif(tmp[dest_vertex] == true) {\n\t\t\t\t\t\t/* A duplicate edge found! 
*/\n\t\t\t\t\t\titer = edges.erase(iter);\n\t\t\t\t\t} else {\n\t\t\t\t\t\t/* Not seen earlier, mark as seen */\n\t\t\t\t\t\ttmp[dest_vertex] = true;\n\t\t\t\t\t\titer++;\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\t/* Clear tmp */\n\t\t\t\tfor(std::vector<IndexEdge>::iterator iter = edges.begin(); iter != edges.end(); iter++) {\n#ifdef USE_DOMAIN\n\t\t\t\t\ttmp[iter->first] = false;// cxh\n#else\n\t\t\t\t\ttmp[*iter] = false;\n#endif\n\t\t\t\t}\n#if defined(BLISS_CONSISTENCY_CHECKS)\n\t\t\t\t/* Post-conditions  */\n\t\t\t\tfor(unsigned int i = 0; i < tmp.size(); i++) assert(tmp[i] == false);\n#endif\n\t\t\t}\n\t\t\tvoid sort_edges() { std::sort(edges.begin(), edges.end()); }\n\t\t\tunsigned color;\n\t\t\t//std::vector<unsigned> edges;\n\t\t\tstd::vector<IndexEdge> edges; // cxh: add the edge ids from the embedding\n\t\t\tunsigned nof_edges() const {return edges.size(); }\n\t\t\t};\n\t\t\t//added by Zhiqiang\n\t\t\tstd::vector<Vertex> & get_vertices_rstream() { return vertices; }\n\t\t\tvoid sort_edges_rstream() { sort_edges(); }\n\nprotected:\n\tstd::vector<Vertex> vertices;\n\tvoid sort_edges() {\n\t\tfor(unsigned int i = 0; i < get_nof_vertices(); i++)\n\t\t\tvertices[i].sort_edges();\n\t}\n\tvoid remove_duplicate_edges() {\n\t\tstd::vector<bool> tmp(vertices.size(), false);\n\t\tfor(std::vector<Vertex>::iterator vi = vertices.begin();\n\t\t\t\tvi != vertices.end();\n\t\t\t\tvi++)\n\t\t{\n#if defined(BLISS_EXPENSIVE_CONSISTENCY_CHECKS)\n\t\t\tfor(unsigned int i = 0; i < tmp.size(); i++) assert(tmp[i] == false);\n#endif\n\t\t\t(*vi).remove_duplicate_edges(tmp);\n\t\t}\n\t}\n\t// \\internal Partition independent invariant.\n\t// Return the color of the vertex. 
Time complexity: O(1)\n\tstatic unsigned int vertex_color_invariant(const Graph* const g, const unsigned int v) { \n\t\treturn g->vertices[v].color;\n\t}\n\t/** \\internal\n\t * Partition independent invariant.\n\t * Returns the degree of the vertex.\n\t * DUPLICATE EDGES MUST HAVE BEEN REMOVED BEFORE.\n\t * Time complexity: O(1).\n\t */\n\t// Return the degree of the vertex. Time complexity: O(1)\n\tstatic unsigned int degree_invariant(const Graph* const g, const unsigned int v) {\n\t\treturn g->vertices[v].nof_edges();\n\t}\n\t/** \\internal\n\t * Partition independent invariant.\n\t * Returns 1 if there is an edge from the vertex to itself, 0 if not.\n\t * Time complexity: O(k), where k is the number of edges leaving the vertex.\n\t */\n\t// Return 1 if the vertex v has a self-loop, 0 otherwise\n\t// Time complexity: O(E_v), where E_v is the number of edges leaving v\n\tstatic unsigned selfloop_invariant(const Graph* const g, const unsigned int v) {\n\t\tconst Vertex& vertex = g->vertices[v];\n\t\tfor(std::vector<IndexEdge>::const_iterator ei = vertex.edges.begin(); ei != vertex.edges.end(); ei++) {\n#ifdef USE_DOMAIN\n\t\t\tif(ei->first == v) return 1; // cxh\n#else\n\t\t\tif(*ei == v) return 1;\n#endif\n\t\t}\n\t\treturn 0;\n\t}\n\n\t// Refine the partition p according to a partition independent invariant\n\tbool refine_according_to_invariant(unsigned int (*inv)(const Graph* const g, const unsigned int v)) {\n\t\tbool refined = false;\n\t\tfor(Partition::Cell* cell = p.first_nonsingleton_cell; cell; ) {\n\t\t\tPartition::Cell* const next_cell = cell->next_nonsingleton;\n\t\t\tconst unsigned int* ep = p.elements + cell->first;\n\t\t\tfor(unsigned int i = cell->length; i > 0; i--, ep++) {\n\t\t\t\tconst unsigned int ival = inv(this, *ep);\n\t\t\t\tp.invariant_values[*ep] = ival;\n\t\t\t\tif(ival > cell->max_ival) {\n\t\t\t\t\tcell->max_ival = ival;\n\t\t\t\t\tcell->max_ival_count = 1;\n\t\t\t\t}\n\t\t\t\telse if(ival == cell->max_ival) 
{\n\t\t\t\t\tcell->max_ival_count++;\n\t\t\t\t}\n\t\t\t}\n\t\t\tPartition::Cell* const last_new_cell = p.zplit_cell(cell, true);\n\t\t\trefined |= (last_new_cell != cell);\n\t\t\tcell = next_cell;\n\t\t}\n\t\treturn refined;\n\t}\n\t// Routines needed when refining the partition p into equitable\n\t// Split the neighbourhood of a cell according to the equitable invariant\n\tbool split_neighbourhood_of_cell(Partition::Cell* const cell) {\n\t\tconst bool was_equal_to_first = refine_equal_to_first;\n\t\tif(compute_eqref_hash) {\n\t\t\teqref_hash.update(cell->first);\n\t\t\teqref_hash.update(cell->length);\n\t\t}\n\t\tconst unsigned int* ep = p.elements + cell->first;\n\t\tfor(unsigned int i = cell->length; i > 0; i--) {\n\t\t\tconst Vertex& v = vertices[*ep++];   \n\t\t\tstd::vector<IndexEdge>::const_iterator ei = v.edges.begin();\n\t\t\tfor(unsigned int j = v.nof_edges(); j != 0; j--) {\n#ifdef USE_DOMAIN\n\t\t\t\tconst unsigned int dest_vertex = (ei++)->first; // cxh\n#else\n\t\t\t\tconst unsigned int dest_vertex = *ei++;\n#endif\n\t\t\t\tPartition::Cell * const neighbour_cell = p.get_cell(dest_vertex);\n\t\t\t\tif(neighbour_cell->is_unit())\n\t\t\t\t\tcontinue;\n\t\t\t\tconst unsigned int ival = ++p.invariant_values[dest_vertex];\n\t\t\t\tif(ival > neighbour_cell->max_ival) {\n\t\t\t\t\tneighbour_cell->max_ival = ival;\n\t\t\t\t\tneighbour_cell->max_ival_count = 1;\n\t\t\t\t\tif(ival == 1) {\n\t\t\t\t\t\tneighbour_heap.insert(neighbour_cell->first);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\telse if(ival == neighbour_cell->max_ival) {\n\t\t\t\t\tneighbour_cell->max_ival_count++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\twhile(!neighbour_heap.is_empty()) {\n\t\t\tconst unsigned int start = neighbour_heap.remove();\n\t\t\tPartition::Cell * const neighbour_cell = p.get_cell(p.elements[start]);\n\t\t\tif(compute_eqref_hash) 
{\n\t\t\t\teqref_hash.update(neighbour_cell->first);\n\t\t\t\teqref_hash.update(neighbour_cell->length);\n\t\t\t\teqref_hash.update(neighbour_cell->max_ival);\n\t\t\t\teqref_hash.update(neighbour_cell->max_ival_count);\n\t\t\t}\n\t\t\tPartition::Cell* const last_new_cell = p.zplit_cell(neighbour_cell, true);\n\t\t\t// Update certificate and hash if needed\n\t\t\tconst Partition::Cell* c = neighbour_cell;\n\t\t\twhile(1) {\n\t\t\t\tif(in_search) {\n\t\t\t\t\t// Build certificate\n\t\t\t\t\tcert_add_redundant(CERT_SPLIT, c->first, c->length);\n\t\t\t\t\t// No need to continue?\n\t\t\t\t\tif(refine_compare_certificate and\n\t\t\t\t\t\t\t(refine_equal_to_first == false) and\n\t\t\t\t\t\t\t(refine_cmp_to_best < 0))\n\t\t\t\t\t\tgoto worse_exit;\n\t\t\t\t}\n\t\t\t\tif(compute_eqref_hash) {\n\t\t\t\t\teqref_hash.update(c->first);\n\t\t\t\t\teqref_hash.update(c->length);\n\t\t\t\t}\n\t\t\t\tif(c == last_new_cell) break;\n\t\t\t\tc = c->next;\n\t\t\t}\n\t\t}\n\n\t\tif(refine_compare_certificate and (refine_equal_to_first == false) and (refine_cmp_to_best < 0))\n\t\t\treturn true;\n\t\treturn false;\nworse_exit:\n\t\t// Clear neighbour heap \n\t\tUintSeqHash rest;\n\t\twhile(!neighbour_heap.is_empty()) {\n\t\t\tconst unsigned int start = neighbour_heap.remove();\n\t\t\tPartition::Cell * const neighbour_cell = p.get_cell(p.elements[start]);\n\t\t\tif(opt_use_failure_recording and was_equal_to_first) {\n\t\t\t\trest.update(neighbour_cell->first);\n\t\t\t\trest.update(neighbour_cell->length);\n\t\t\t\trest.update(neighbour_cell->max_ival);\n\t\t\t\trest.update(neighbour_cell->max_ival_count);\n\t\t\t}\n\t\t\tneighbour_cell->max_ival = 0;\n\t\t\tneighbour_cell->max_ival_count = 0;\n\t\t\tp.clear_ivs(neighbour_cell);\n\t\t}\n\t\tif(opt_use_failure_recording and was_equal_to_first) {\n\t\t\tfor(unsigned int i = p.splitting_queue.size(); i > 0; i--) {\n\t\t\t\tPartition::Cell* const cell = 
p.splitting_queue.pop_front();\n\t\t\t\trest.update(cell->first);\n\t\t\t\trest.update(cell->length);\n\t\t\t\tp.splitting_queue.push_back(cell);\n\t\t\t}\n\t\t\trest.update(failure_recording_fp_deviation);\n\t\t\tfailure_recording_fp_deviation = rest.get_value();\n\t\t}\n\t\treturn true;\n\t}\n\n\tbool split_neighbourhood_of_unit_cell(Partition::Cell* const unit_cell) {\n\t\tconst bool was_equal_to_first = refine_equal_to_first;\n\t\tif(compute_eqref_hash) {\n\t\t\teqref_hash.update(0x87654321);\n\t\t\teqref_hash.update(unit_cell->first);\n\t\t\teqref_hash.update(1);\n\t\t}\n\t\tconst Vertex& v = vertices[p.elements[unit_cell->first]];\n\t\tstd::vector<IndexEdge>::const_iterator ei = v.edges.begin();\n\t\tfor(unsigned int j = v.nof_edges(); j > 0; j--) {\n#ifdef USE_DOMAIN\n\t\t\tconst unsigned int dest_vertex = (ei++)->first; // cxh\n#else\n\t\t\tconst unsigned int dest_vertex = *ei++;\n#endif\n\t\t\tPartition::Cell * const neighbour_cell = p.get_cell(dest_vertex);\n\n\t\t\tif(neighbour_cell->is_unit()) {\n\t\t\t\tif(in_search) {\n\t\t\t\t\t/* Remember neighbour in order to generate certificate */\n\t\t\t\t\tneighbour_heap.insert(neighbour_cell->first);\n\t\t\t\t}\n\t\t\t\tcontinue;\n\t\t\t}\n\t\t\tif(neighbour_cell->max_ival_count == 0) {\n\t\t\t\tneighbour_heap.insert(neighbour_cell->first);\n\t\t\t}\n\t\t\tneighbour_cell->max_ival_count++;\n\n\t\t\tunsigned int * const swap_position =\n\t\t\t\tp.elements + neighbour_cell->first + neighbour_cell->length -\n\t\t\t\tneighbour_cell->max_ival_count;\n\t\t\t*p.in_pos[dest_vertex] = *swap_position;\n\t\t\tp.in_pos[*swap_position] = p.in_pos[dest_vertex];\n\t\t\t*swap_position = dest_vertex;\n\t\t\tp.in_pos[dest_vertex] = swap_position;\n\t\t}\n\n\t\twhile(!neighbour_heap.is_empty()) {\n\t\t\tconst unsigned int start = neighbour_heap.remove();\n\t\t\tPartition::Cell* neighbour_cell =\tp.get_cell(p.elements[start]);\n#if defined(BLISS_CONSISTENCY_CHECKS)\n\t\t\tif(neighbour_cell->is_unit()) { } else { 
}\n#endif\n\t\t\tif(compute_eqref_hash) {\n\t\t\t\teqref_hash.update(neighbour_cell->first);\n\t\t\t\teqref_hash.update(neighbour_cell->length);\n\t\t\t\teqref_hash.update(neighbour_cell->max_ival_count);\n\t\t\t}\n\n\t\t\tif(neighbour_cell->length > 1 and neighbour_cell->max_ival_count != neighbour_cell->length) {\n\t\t\t\tPartition::Cell * const new_cell =\n\t\t\t\t\tp.aux_split_in_two(neighbour_cell, neighbour_cell->length - neighbour_cell->max_ival_count);\n\t\t\t\tunsigned int *ep = p.elements + new_cell->first;\n\t\t\t\tunsigned int * const lp = p.elements+new_cell->first+new_cell->length;\n\t\t\t\twhile(ep < lp) {\n\t\t\t\t\tp.element_to_cell_map[*ep] = new_cell;\n\t\t\t\t\tep++;\n\t\t\t\t}\n\t\t\t\tneighbour_cell->max_ival_count = 0;\n\n\n\t\t\t\tif(compute_eqref_hash) {\n\t\t\t\t\t/* Update hash */\n\t\t\t\t\teqref_hash.update(neighbour_cell->first);\n\t\t\t\t\teqref_hash.update(neighbour_cell->length);\n\t\t\t\t\teqref_hash.update(0);\n\t\t\t\t\teqref_hash.update(new_cell->first);\n\t\t\t\t\teqref_hash.update(new_cell->length);\n\t\t\t\t\teqref_hash.update(1);\n\t\t\t\t}\n\n\t\t\t\t/* Add cells in splitting_queue */\n\t\t\t\tif(neighbour_cell->is_in_splitting_queue()) {\n\t\t\t\t\t/* Both cells must be included in splitting_queue in order\n\t\t\t\t\t   to ensure refinement into equitable partition */\n\t\t\t\t\tp.splitting_queue_add(new_cell);\n\t\t\t\t} else {\n\t\t\t\t\tPartition::Cell *min_cell, *max_cell;\n\t\t\t\t\tif(neighbour_cell->length <= new_cell->length) {\n\t\t\t\t\t\tmin_cell = neighbour_cell;\n\t\t\t\t\t\tmax_cell = new_cell;\n\t\t\t\t\t} else {\n\t\t\t\t\t\tmin_cell = new_cell;\n\t\t\t\t\t\tmax_cell = neighbour_cell;\n\t\t\t\t\t}\n\t\t\t\t\t/* Put the smaller cell in splitting_queue */\n\t\t\t\t\tp.splitting_queue_add(min_cell);\n\t\t\t\t\tif(max_cell->is_unit()) {\n\t\t\t\t\t\t/* Put the \"larger\" cell also in splitting_queue */\n\t\t\t\t\t\tp.splitting_queue_add(max_cell);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\t/* Update pointer for 
certificate generation */\n\t\t\t\tneighbour_cell = new_cell;\n\t\t\t} else {\n\t\t\t\t/* neighbour_cell->length == 1 ||\n\t\t\t\t   neighbour_cell->max_ival_count == neighbour_cell->length */\n\t\t\t\tneighbour_cell->max_ival_count = 0;\n\t\t\t}\n\n\t\t\t/*\n\t\t\t * Build certificate if required\n\t\t\t */\n\t\t\tif(in_search) {\n\t\t\t\tfor(unsigned int i = neighbour_cell->first, j = neighbour_cell->length; j > 0; j--, i++) {\n\t\t\t\t\t/* Build certificate */\n\t\t\t\t\tcert_add(CERT_EDGE, unit_cell->first, i);\n\t\t\t\t\t/* No need to continue? */\n\t\t\t\t\tif(refine_compare_certificate and (refine_equal_to_first == false) and (refine_cmp_to_best < 0))\n\t\t\t\t\t\tgoto worse_exit;\n\t\t\t\t}\n\t\t\t} /* if(in_search) */\n\t\t} /* while(!neighbour_heap.is_empty()) */\n\n\t\tif(refine_compare_certificate and\n\t\t\t\t(refine_equal_to_first == false) and\n\t\t\t\t(refine_cmp_to_best < 0))\n\t\t\treturn true;\n\t\treturn false;\n\nworse_exit:\n\t\t/* Clear neighbour heap */\n\t\tUintSeqHash rest;\n\t\twhile(!neighbour_heap.is_empty()) {\n\t\t\tconst unsigned int start = neighbour_heap.remove();\n\t\t\tPartition::Cell * const neighbour_cell = p.get_cell(p.elements[start]);\n\t\t\tif(opt_use_failure_recording and was_equal_to_first) {\n\t\t\t\trest.update(neighbour_cell->first);\n\t\t\t\trest.update(neighbour_cell->length);\n\t\t\t\trest.update(neighbour_cell->max_ival_count);\n\t\t\t}\n\t\t\tneighbour_cell->max_ival_count = 0;\n\t\t}\n\t\tif(opt_use_failure_recording and was_equal_to_first) {\n\t\t\trest.update(failure_recording_fp_deviation);\n\t\t\tfailure_recording_fp_deviation = rest.get_value();\n\t\t}\n\t\treturn true;\n\t}\n\n\t//  Build the initial equitable partition\n\tvoid make_initial_equitable_partition() {\n\t\trefine_according_to_invariant(&vertex_color_invariant);\n\t\tp.splitting_queue_clear();\n\t\t//p.print_signature(stderr); fprintf(stderr, 
\"\\n\");\n\t\trefine_according_to_invariant(&selfloop_invariant);\n\t\tp.splitting_queue_clear();\n\t\t//p.print_signature(stderr); fprintf(stderr, \"\\n\");\n\t\trefine_according_to_invariant(&degree_invariant);\n\t\tp.splitting_queue_clear();\n\t\t//p.print_signature(stderr); fprintf(stderr, \"\\n\");\n\t\trefine_to_equitable();\n\t\t//p.print_signature(stderr); fprintf(stderr, \"\\n\");\n\t}\n\t// \\internal\n\t// \\copydoc AbstractGraph::is_equitable() const\n\t//Check whether the current partition p is equitable.\n\t//Performance: very slow, use only for debugging purposes.\n\tbool is_equitable() const {\n\t\tconst unsigned int N = get_nof_vertices();\n\t\tif(N == 0) return true;\n\t\tstd::vector<unsigned int> first_count = std::vector<unsigned int>(N, 0);\n\t\tstd::vector<unsigned int> other_count = std::vector<unsigned int>(N, 0);\n\t\tfor(Partition::Cell *cell = p.first_cell; cell; cell = cell->next) {\n\t\t\tif(cell->is_unit()) continue;\n\t\t\tunsigned int *ep = p.elements + cell->first;\n\t\t\tconst Vertex &first_vertex = vertices[*ep++];\n\t\t\t/* Count how many edges lead from the first vertex to\n\t\t\t * the neighbouring cells */\n\t\t\tfor(std::vector<IndexEdge>::const_iterator ei = first_vertex.edges.begin(); ei != first_vertex.edges.end(); ei++) {\n#ifdef USE_DOMAIN\n\t\t\t\tfirst_count[p.get_cell(ei->first)->first]++; // cxh\n#else\n\t\t\t\tfirst_count[p.get_cell(*ei)->first]++;\n#endif\n\t\t\t}\n\t\t\t/* Count and compare to the edges of the other vertices */\n\t\t\tfor(unsigned int i = cell->length; i > 1; i--) {\n\t\t\t\tconst Vertex &vertex = vertices[*ep++];\n\t\t\t\tfor(std::vector<IndexEdge>::const_iterator ei = vertex.edges.begin(); ei != vertex.edges.end(); ei++) {\n#ifdef USE_DOMAIN\n\t\t\t\t\tother_count[p.get_cell(ei->first)->first]++; // cxh\n#else\n\t\t\t\t\tother_count[p.get_cell(*ei)->first]++;\n#endif\n\t\t\t\t}\n\t\t\t\tfor(Partition::Cell *cell2 = p.first_cell; cell2; cell2 = cell2->next) 
{\n\t\t\t\t\tif(first_count[cell2->first] != other_count[cell2->first]) {\n\t\t\t\t\t\t/* Not equitable */\n\t\t\t\t\t\treturn false;\n\t\t\t\t\t}\n\t\t\t\t\tother_count[cell2->first] = 0;\n\t\t\t\t}\n\t\t\t}\n\t\t\t/* Reset first_count */\n\t\t\tfor(unsigned int i = 0; i < N; i++) first_count[i] = 0;\n\t\t}\n\t\treturn true;\n\t}\n\t/* Splitting heuristics, documented in more detail in graph.cc */\n\tSplittingHeuristic sh;\n\n\t// Find the next cell to be splitted\n\tPartition::Cell* find_next_cell_to_be_splitted(Partition::Cell* cell) {\n\t\tswitch(sh) {\n\t\t\tcase shs_f:   return sh_first();\n\t\t\tcase shs_fs:  return sh_first_smallest();\n\t\t\tcase shs_fl:  return sh_first_largest();\n\t\t\tcase shs_fm:  return sh_first_max_neighbours();\n\t\t\tcase shs_fsm: return sh_first_smallest_max_neighbours();\n\t\t\tcase shs_flm: return sh_first_largest_max_neighbours();\n\t\t\tdefault:      fatal_error(\"Internal error - unknown splitting heuristics\");\n\t\t\t\t\t\t  return 0;\n\t\t}\n\t}\n\t// \\internal\n\t// A splitting heuristic.\n\t// Returns the first nonsingleton cell in the current partition.\n\tPartition::Cell* sh_first() {\n\t\tPartition::Cell* best_cell = 0;\n\t\tfor(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) {\n\t\t\tif(opt_use_comprec and p.cr_get_level(cell->first) != cr_level)\n\t\t\t\tcontinue;\n\t\t\tbest_cell = cell;\n\t\t\tbreak;\n\t\t}\n\t\treturn best_cell;\n\t}\n\t// \\internal A splitting heuristic.\n\t// Returns the first smallest nonsingleton cell in the current partition.\n\tPartition::Cell* sh_first_smallest() {\n\t\tPartition::Cell* best_cell = 0;\n\t\tunsigned int best_size = UINT_MAX;\n\t\tfor(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) {\n\t\t\tif(opt_use_comprec and p.cr_get_level(cell->first) != cr_level) continue;\n\t\t\tif(cell->length < best_size) {\n\t\t\t\tbest_size = cell->length;\n\t\t\t\tbest_cell = cell;\n\t\t\t}\n\t\t}\n\t\treturn 
best_cell;\n\t}\n\t// \\internal A splitting heuristic.\n\t// Returns the first largest nonsingleton cell in the current partition.\n\tPartition::Cell* sh_first_largest() {\n\t\tPartition::Cell* best_cell = 0;\n\t\tunsigned int best_size = 0;\n\t\tfor(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) {\n\t\t\tif(opt_use_comprec and p.cr_get_level(cell->first) != cr_level)\n\t\t\t\tcontinue;\n\t\t\tif(cell->length > best_size) {\n\t\t\t\tbest_size = cell->length;\n\t\t\t\tbest_cell = cell;\n\t\t\t}\n\t\t}\n\t\treturn best_cell;\n\t}\n\t// \\internal\n\t// A splitting heuristic.\n\t// Returns the first nonsingleton cell with max number of neighbouring nonsingleton cells.\n\t// Assumes that the partition p is equitable.\n\t// Assumes that the max_ival fields of the cells are all 0.\n\tPartition::Cell* sh_first_max_neighbours() {\n\t\tPartition::Cell* best_cell = 0;\n\t\tint best_value = -1;\n\t\tKStack<Partition::Cell*> neighbour_cells_visited;\n\t\tneighbour_cells_visited.init(get_nof_vertices());\n\t\tfor(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) {\n\t\t\tif(opt_use_comprec and p.cr_get_level(cell->first) != cr_level)\n\t\t\t\tcontinue;\n\t\t\tconst Vertex& v = vertices[p.elements[cell->first]];\n\t\t\tstd::vector<IndexEdge>::const_iterator ei = v.edges.begin();\n\t\t\tfor(unsigned int j = v.nof_edges(); j > 0; j--) {\n#ifdef USE_DOMAIN\n\t\t\t\tPartition::Cell * const neighbour_cell = p.get_cell((ei++)->first); // cxh\n#else\n\t\t\t\tPartition::Cell * const neighbour_cell = p.get_cell(*ei++);\n#endif\n\t\t\t\tif(neighbour_cell->is_unit()) continue;\n\t\t\t\tneighbour_cell->max_ival++;\n\t\t\t\tif(neighbour_cell->max_ival == 1)\n\t\t\t\t\tneighbour_cells_visited.push(neighbour_cell);\n\t\t\t}\n\t\t\tint value = 0;\n\t\t\twhile(!neighbour_cells_visited.is_empty()) {\n\t\t\t\tPartition::Cell* const neighbour_cell = neighbour_cells_visited.pop();\n\t\t\t\tif(neighbour_cell->max_ival 
!= neighbour_cell->length)\n\t\t\t\t\tvalue++;\n\t\t\t\tneighbour_cell->max_ival = 0;\n\t\t\t}\n\t\t\tif(value > best_value) {\n\t\t\t\tbest_value = value;\n\t\t\t\tbest_cell = cell;\n\t\t\t}\n\t\t}\n\t\treturn best_cell;\n\t}\n\t// \\internal A splitting heuristic.\n\t// Returns the first smallest nonsingleton cell with max number of neighbouring nonsingleton cells.\n\t// Assumes that the partition p is equitable. Assumes that the max_ival fields of the cells are all 0.\n\tPartition::Cell* sh_first_smallest_max_neighbours() {\n\t\tPartition::Cell* best_cell = 0;\n\t\tint best_value = -1;\n\t\tunsigned int best_size = UINT_MAX;\n\t\tKStack<Partition::Cell*> neighbour_cells_visited;\n\t\tneighbour_cells_visited.init(get_nof_vertices());\n\t\tfor(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) {\n\t\t\tif(opt_use_comprec and p.cr_get_level(cell->first) != cr_level)\n\t\t\t\tcontinue;\n\t\t\tconst Vertex& v = vertices[p.elements[cell->first]];\n\t\t\tstd::vector<IndexEdge>::const_iterator ei = v.edges.begin();\n\t\t\tfor(unsigned int j = v.nof_edges(); j > 0; j--) {\n#ifdef USE_DOMAIN\n\t\t\t\tPartition::Cell* const neighbour_cell = p.get_cell((ei++)->first); // cxh\n#else\n\t\t\t\tPartition::Cell* const neighbour_cell = p.get_cell(*ei++);\n#endif\n\t\t\t\tif(neighbour_cell->is_unit()) continue;\n\t\t\t\tneighbour_cell->max_ival++;\n\t\t\t\tif(neighbour_cell->max_ival == 1)\n\t\t\t\t\tneighbour_cells_visited.push(neighbour_cell);\n\t\t\t}\n\t\t\tint value = 0;\n\t\t\twhile(!neighbour_cells_visited.is_empty()) {\n\t\t\t\tPartition::Cell* const neighbour_cell = neighbour_cells_visited.pop();\n\t\t\t\tif(neighbour_cell->max_ival != neighbour_cell->length)\n\t\t\t\t\tvalue++;\n\t\t\t\tneighbour_cell->max_ival = 0;\n\t\t\t}\n\t\t\tif((value > best_value) or (value == best_value and cell->length < best_size)) {\n\t\t\t\tbest_value = value;\n\t\t\t\tbest_size = cell->length;\n\t\t\t\tbest_cell = cell;\n\t\t\t}\n\t\t}\n\t\treturn 
best_cell;\n\t}\n\t// \\internal A splitting heuristic.\n\t// Returns the first largest nonsingleton cell with max number of neighbouring nonsingleton cells.\n\t// Assumes that the partition p is equitable. Assumes that the max_ival fields of the cells are all 0.\n\tPartition::Cell* sh_first_largest_max_neighbours() {\n\t\tPartition::Cell* best_cell = 0;\n\t\tint best_value = -1;\n\t\tunsigned int best_size = 0;\n\t\tKStack<Partition::Cell*> neighbour_cells_visited;\n\t\tneighbour_cells_visited.init(get_nof_vertices());\n\t\tfor(Partition::Cell* cell = p.first_nonsingleton_cell; cell; cell = cell->next_nonsingleton) {\n\t\t\tif(opt_use_comprec and p.cr_get_level(cell->first) != cr_level) continue;\n\t\t\tconst Vertex& v = vertices[p.elements[cell->first]];\n\t\t\tstd::vector<IndexEdge>::const_iterator ei = v.edges.begin();\n\t\t\tfor(unsigned int j = v.nof_edges(); j > 0; j--) {\n#ifdef USE_DOMAIN\n\t\t\t\tPartition::Cell* const neighbour_cell = p.get_cell((ei++)->first); // cxh\n#else\n\t\t\t\tPartition::Cell* const neighbour_cell = p.get_cell(*ei++);\n#endif\n\t\t\t\tif(neighbour_cell->is_unit()) continue;\n\t\t\t\tneighbour_cell->max_ival++;\n\t\t\t\tif(neighbour_cell->max_ival == 1)\n\t\t\t\t\tneighbour_cells_visited.push(neighbour_cell);\n\t\t\t}\n\t\t\tint value = 0;\n\t\t\twhile(!neighbour_cells_visited.is_empty()) {\n\t\t\t\tPartition::Cell* const neighbour_cell = neighbour_cells_visited.pop();\n\t\t\t\tif(neighbour_cell->max_ival != neighbour_cell->length) value++;\n\t\t\t\tneighbour_cell->max_ival = 0;\n\t\t\t}\n\t\t\tif((value > best_value) or (value == best_value and cell->length > best_size)) {\n\t\t\t\tbest_value = value;\n\t\t\t\tbest_size = cell->length;\n\t\t\t\tbest_cell = cell;\n\t\t\t}\n\t\t}\n\t\treturn best_cell;\n\t}\n\t//Initialize the certificate size and memory\n\tvoid initialize_certificate() {\n\t\tcertificate_index = 
0;\n\t\tcertificate_current_path.clear();\n\t\tcertificate_first_path.clear();\n\t\tcertificate_best_path.clear();\n\t}\n\tbool is_automorphism(unsigned* const perm) {\n\t\tstd::set<unsigned int, std::less<unsigned int> > edges1;\n\t\tstd::set<unsigned int, std::less<unsigned int> > edges2;\n\n#if defined(BLISS_CONSISTENCY_CHECKS)\n\t\tif(!is_permutation(get_nof_vertices(), perm))\n\t\t\t_INTERNAL_ERROR();\n#endif\n\n\t\tfor(unsigned int i = 0; i < get_nof_vertices(); i++) {\n\t\t\tVertex& v1 = vertices[i];\n\t\t\tedges1.clear();\n\t\t\tfor(std::vector<IndexEdge>::iterator ei = v1.edges.begin(); ei != v1.edges.end(); ei++)\n#ifdef USE_DOMAIN\n\t\t\t\tedges1.insert(perm[ei->first]); // cxh\n#else\n\t\t\tedges1.insert(perm[*ei]);\n#endif\n\t\t\tVertex& v2 = vertices[perm[i]];\n\t\t\tedges2.clear();\n\t\t\tfor(std::vector<IndexEdge>::iterator ei = v2.edges.begin(); ei != v2.edges.end(); ei++)\n#ifdef USE_DOMAIN\n\t\t\t\tedges2.insert(ei->first); // cxh\n#else\n\t\t\tedges2.insert(*ei);\n#endif\n\t\t\tif(!(edges1 == edges2)) return false;\n\t\t}\n\t\treturn true;\n\t}\n\n\tbool nucr_find_first_component(const unsigned level) {\n\t\tcr_component.clear();\n\t\tcr_component_elements = 0;\n\t\t/* Find first non-discrete cell in the component level */\n\t\tPartition::Cell* first_cell = p.first_nonsingleton_cell;\n\t\twhile(first_cell) {\n\t\t\tif(p.cr_get_level(first_cell->first) == level) break;\n\t\t\tfirst_cell = first_cell->next_nonsingleton;\n\t\t}\n\t\t/* The component is discrete, return false */\n\t\tif(!first_cell) return false;\n\t\tstd::vector<Partition::Cell*> component;\n\t\tfirst_cell->max_ival = 1;\n\t\tcomponent.push_back(first_cell);\n\t\tfor(unsigned int i = 0; i < component.size(); i++) {\n\t\t\tPartition::Cell* const cell = component[i];\n\t\t\tconst Vertex& v = vertices[p.elements[cell->first]];\n\t\t\tstd::vector<IndexEdge>::const_iterator ei = v.edges.begin();\n\t\t\tfor(unsigned int j = v.nof_edges(); j > 0; j--) {\n#ifdef USE_DOMAIN\n\t\t\t\tconst 
unsigned int neighbour = (ei++)->first; // cxh\n#else\n\t\t\t\tconst unsigned int neighbour = *ei++;\n#endif \n\t\t\t\tPartition::Cell* const neighbour_cell = p.get_cell(neighbour);\n\t\t\t\t/* Skip unit neighbours */\n\t\t\t\tif(neighbour_cell->is_unit()) continue;\n\t\t\t\t/* Already marked to be in the same component? */\n\t\t\t\tif(neighbour_cell->max_ival == 1) continue;\n\t\t\t\t/* Is the neighbour at the same component recursion level? */\n\t\t\t\tif(p.cr_get_level(neighbour_cell->first) != level) continue;\n\t\t\t\tif(neighbour_cell->max_ival_count == 0)\n\t\t\t\t\tneighbour_heap.insert(neighbour_cell->first);\n\t\t\t\tneighbour_cell->max_ival_count++;\n\t\t\t}\n\t\t\twhile(!neighbour_heap.is_empty()) {\n\t\t\t\tconst unsigned int start = neighbour_heap.remove();\n\t\t\t\tPartition::Cell* const neighbour_cell =\n\t\t\t\t\tp.get_cell(p.elements[start]);\n\t\t\t\t/* Skip saturated neighbour cells */\n\t\t\t\tif(neighbour_cell->max_ival_count == neighbour_cell->length) {\n\t\t\t\t\tneighbour_cell->max_ival_count = 0;\n\t\t\t\t\tcontinue;\n\t\t\t\t} \n\t\t\t\tneighbour_cell->max_ival_count = 0;\n\t\t\t\tneighbour_cell->max_ival = 1;\n\t\t\t\tcomponent.push_back(neighbour_cell);\n\t\t\t}\n\t\t}\n\t\tfor(unsigned int i = 0; i < component.size(); i++) {\n\t\t\tPartition::Cell* const cell = component[i];\n\t\t\tcell->max_ival = 0;\n\t\t\tcr_component.push_back(cell->first);\n\t\t\tcr_component_elements += cell->length;\n\t\t}\n\t\tif(verbstr and verbose_level > 2) {\n\t\t\tfprintf(verbstr, \"NU-component with %lu cells and %u vertices\\n\",\n\t\t\t\t\t(long unsigned)cr_component.size(), cr_component_elements);\n\t\t\tfflush(verbstr);\n\t\t}\n\t\treturn true;\n\t}\n\tbool nucr_find_first_component(const unsigned int level, std::vector<unsigned int>& component, unsigned int& component_elements, Partition::Cell*& sh_return) {\n\t\tcomponent.clear();\n\t\tcomponent_elements = 0;\n\t\tsh_return = 0;\n\t\tunsigned int sh_first  = 0;\n\t\tunsigned int sh_size   = 
0;\n\t\tunsigned int sh_nuconn = 0;\n\n\t\t/* Find first non-discrete cell in the component level */\n\t\tPartition::Cell* first_cell = p.first_nonsingleton_cell;\n\t\twhile(first_cell) {\n\t\t\tif(p.cr_get_level(first_cell->first) == level) break;\n\t\t\tfirst_cell = first_cell->next_nonsingleton;\n\t\t}\n\t\tif(!first_cell) {\n\t\t\t/* The component is discrete, return false */\n\t\t\treturn false;\n\t\t}\n\t\tstd::vector<Partition::Cell*> comp;\n\t\tKStack<Partition::Cell*> neighbours;\n\t\tneighbours.init(get_nof_vertices());\n\t\tfirst_cell->max_ival = 1;\n\t\tcomp.push_back(first_cell);\n\t\tfor(unsigned int i = 0; i < comp.size(); i++) {\n\t\t\tPartition::Cell* const cell = comp[i];\n\t\t\tconst Vertex& v = vertices[p.elements[cell->first]];\n\t\t\tstd::vector<IndexEdge>::const_iterator ei = v.edges.begin();\n\t\t\tfor(unsigned int j = v.nof_edges(); j > 0; j--) {\n#ifdef USE_DOMAIN\n\t\t\t\tconst unsigned int neighbour = (ei++)->first; // cxh\n#else\n\t\t\t\tconst unsigned int neighbour = *ei++;\n#endif\n\t\t\t\tPartition::Cell* const neighbour_cell = p.get_cell(neighbour);\n\t\t\t\t/* Skip unit neighbours */\n\t\t\t\tif(neighbour_cell->is_unit()) continue;\n\t\t\t\t/* Is the neighbour at the same component recursion level? 
*/\n\t\t\t\t//if(p.cr_get_level(neighbour_cell->first) != level)\n\t\t\t\t//  continue;\n\t\t\t\tif(neighbour_cell->max_ival_count == 0)\n\t\t\t\t\tneighbours.push(neighbour_cell);\n\t\t\t\tneighbour_cell->max_ival_count++;\n\t\t\t}\n\t\t\tunsigned int nuconn = 1;\n\t\t\twhile(!neighbours.is_empty()) {\n\t\t\t\tPartition::Cell* const neighbour_cell = neighbours.pop();\n\t\t\t\t//neighbours.pop_back();\n\t\t\t\t/* Skip saturated neighbour cells */\n\t\t\t\tif(neighbour_cell->max_ival_count == neighbour_cell->length) {\n\t\t\t\t\tneighbour_cell->max_ival_count = 0;\n\t\t\t\t\tcontinue;\n\t\t\t\t}\n\t\t\t\tnuconn++;\n\t\t\t\tneighbour_cell->max_ival_count = 0;\n\t\t\t\tif(neighbour_cell->max_ival == 0) {\n\t\t\t\t\tcomp.push_back(neighbour_cell);\n\t\t\t\t\tneighbour_cell->max_ival = 1;\n\t\t\t\t}\n\t\t\t}\n\t\t\tswitch(sh) {\n\t\t\t\tcase shs_f:\n\t\t\t\t\tif(sh_return == 0 or cell->first <= sh_first) {\n\t\t\t\t\t\tsh_return = cell;\n\t\t\t\t\t\tsh_first = cell->first;\n\t\t\t\t\t}\n\t\t\t\t\tbreak;\n\t\t\t\tcase shs_fs:\n\t\t\t\t\tif(sh_return == 0 or cell->length < sh_size or\n\t\t\t\t\t\t\t(cell->length == sh_size and cell->first <= sh_first)) {\n\t\t\t\t\t\tsh_return = cell;\n\t\t\t\t\t\tsh_first = cell->first;\n\t\t\t\t\t\tsh_size = cell->length;\n\t\t\t\t\t}\n\t\t\t\t\tbreak;\n\t\t\t\tcase shs_fl:\n\t\t\t\t\tif(sh_return == 0 or cell->length > sh_size or\n\t\t\t\t\t\t\t(cell->length == sh_size and cell->first <= sh_first)) {\n\t\t\t\t\t\tsh_return = cell;\n\t\t\t\t\t\tsh_first = cell->first;\n\t\t\t\t\t\tsh_size = cell->length;\n\t\t\t\t\t}\n\t\t\t\t\tbreak;\n\t\t\t\tcase shs_fm:\n\t\t\t\t\tif(sh_return == 0 or nuconn > sh_nuconn or\n\t\t\t\t\t\t\t(nuconn == sh_nuconn and cell->first <= sh_first)) {\n\t\t\t\t\t\tsh_return = cell;\n\t\t\t\t\t\tsh_first = cell->first;\n\t\t\t\t\t\tsh_nuconn = nuconn;\n\t\t\t\t\t}\n\t\t\t\t\tbreak;\n\t\t\t\tcase shs_fsm:\n\t\t\t\t\tif(sh_return == 0 or\n\t\t\t\t\t\t\tnuconn > sh_nuconn or\n\t\t\t\t\t\t\t(nuconn == sh_nuconn 
and\n\t\t\t\t\t\t\t (cell->length < sh_size or\n\t\t\t\t\t\t\t  (cell->length == sh_size and cell->first <= sh_first)))) {\n\t\t\t\t\t\tsh_return = cell;\n\t\t\t\t\t\tsh_first = cell->first;\n\t\t\t\t\t\tsh_size = cell->length;\n\t\t\t\t\t\tsh_nuconn = nuconn;\n\t\t\t\t\t}\n\t\t\t\t\tbreak;\n\t\t\t\tcase shs_flm:\n\t\t\t\t\tif(sh_return == 0 or\n\t\t\t\t\t\t\tnuconn > sh_nuconn or\n\t\t\t\t\t\t\t(nuconn == sh_nuconn and\n\t\t\t\t\t\t\t (cell->length > sh_size or\n\t\t\t\t\t\t\t  (cell->length == sh_size and cell->first <= sh_first)))) {\n\t\t\t\t\t\tsh_return = cell;\n\t\t\t\t\t\tsh_first = cell->first;\n\t\t\t\t\t\tsh_size = cell->length;\n\t\t\t\t\t\tsh_nuconn = nuconn;\n\t\t\t\t\t}\n\t\t\t\t\tbreak;\n\t\t\t\tdefault:\n\t\t\t\t\tfatal_error(\"Internal error - unknown splitting heuristics\");\n\t\t\t\t\treturn 0;\n\t\t\t}\n\t\t}\n\t\tassert(sh_return);\n\t\tfor(unsigned int i = 0; i < comp.size(); i++) {\n\t\t\tPartition::Cell* const cell = comp[i];\n\t\t\tcell->max_ival = 0;\n\t\t\tcomponent.push_back(cell->first);\n\t\t\tcomponent_elements += cell->length;\n\t\t}\n\t\tif(verbstr and verbose_level > 2) {\n\t\t\tfprintf(verbstr, \"NU-component with %lu cells and %u vertices\\n\",\n\t\t\t\t\t(long unsigned)component.size(), component_elements);\n\t\t\tfflush(verbstr);\n\t\t}\n\t\treturn true;\n\t}\n\npublic:\n\t// Create a new graph with \\a N vertices and no edges.\n\tGraph(const unsigned nof_vertices = 0) {\n\t\tvertices.resize(nof_vertices);\n\t\tsh = shs_flm;\n\t}\n\n\t/**\n\t * Destroy the graph.\n\t */\n\t~Graph() { ; }\n\n\t/**\n\t * Read the graph from the file \\a fp in a variant of the DIMACS format.\n\t * See the <A href=\"http://www.tcs.hut.fi/Software/bliss/\">bliss website</A>\n\t * for the definition of the file format.\n\t * Note that in the DIMACS file the vertices are numbered from 1 to N while\n\t * in this C++ API they are from 0 to N-1.\n\t * Thus the vertex n in the file corresponds to the vertex n-1 in the API.\n\t *\n\t * \\param fp      the 
file stream for the graph file\n\t * \\param errstr  if non-null, the possible error messages are printed\n\t *                in this file stream\n\t * \\return        a new Graph object or 0 if reading failed for some\n\t *                reason\n\t */\n\tstatic Graph* read_dimacs(FILE* const fp, FILE* const errstr = stderr) { return NULL; }\n\n\t/**\n\t * Write the graph to a file in a variant of the DIMACS format.\n\t * See the <A href=\"http://www.tcs.hut.fi/Software/bliss/\">bliss website</A>\n\t * for the definition of the file format.\n\t */\n\tvoid write_dimacs(FILE* const fp) {}\n\n\t// \\copydoc AbstractGraph::write_dot(FILE * const fp)\n\tvoid write_dot(FILE* const fp) {}\n\n\t// \\copydoc AbstractGraph::write_dot(const char * const file_name)\n\tvoid write_dot(const char* const file_name) {}\n\n\t// \\copydoc AbstractGraph::is_automorphism(const std::vector<unsigned int>& perm) const\n\tbool is_automorphism(const std::vector<unsigned>& perm) const {\n\t\tif(!(perm.size() == get_nof_vertices() and is_permutation(perm)))\n\t\t\treturn false;\n\t\tstd::set<unsigned, std::less<unsigned> > edges1;\n\t\tstd::set<unsigned, std::less<unsigned> > edges2;\n\t\tfor(unsigned i = 0; i < get_nof_vertices(); i++) {\n\t\t\tconst Vertex& v1 = vertices[i];\n\t\t\tedges1.clear();\n\t\t\tfor(std::vector<IndexEdge>::const_iterator ei = v1.edges.begin(); ei != v1.edges.end(); ei++)\n#ifdef USE_DOMAIN\n\t\t\t\tedges1.insert(perm[ei->first]); // cxh\n#else\n\t\t\tedges1.insert(perm[*ei]);\n#endif\n\t\t\tconst Vertex& v2 = vertices[perm[i]];\n\t\t\tedges2.clear();\n\t\t\tfor(std::vector<IndexEdge>::const_iterator ei = v2.edges.begin(); ei != v2.edges.end(); ei++)\n#ifdef USE_DOMAIN\n\t\t\t\tedges2.insert(ei->first); // cxh\n#else\n\t\t\tedges2.insert(*ei);\n#endif\n\t\t\tif(!(edges1 == edges2)) return false;\n\t\t}\n\t\treturn true;\n\t}\n\t// \\copydoc AbstractGraph::get_hash()\n\tvirtual unsigned get_hash() {\n\t\tremove_duplicate_edges();\n\t\tsort_edges();\n\t\tUintSeqHash 
h;\n\t\th.update(get_nof_vertices());\n\t\t/* Hash the color of each vertex */\n\t\tfor(unsigned int i = 0; i < get_nof_vertices(); i++) {\n\t\t\th.update(vertices[i].color);\n\t\t}\n\t\t/* Hash the edges */\n\t\tfor(unsigned int i = 0; i < get_nof_vertices(); i++) {\n\t\t\tVertex &v = vertices[i];\n\t\t\tfor(std::vector<IndexEdge>::const_iterator ei = v.edges.begin(); ei != v.edges.end(); ei++) {\n#ifdef USE_DOMAIN\n\t\t\t\tconst unsigned int dest_i = ei->first; // cxh\n#else\n\t\t\t\tconst unsigned int dest_i = *ei;\n#endif\n\t\t\t\tif(dest_i < i) continue;\n\t\t\t\th.update(i);\n\t\t\t\th.update(dest_i);\n\t\t\t}\n\t\t}\n\t\treturn h.get_value();\n\t}\n\t// Return the number of vertices in the graph.\n\tunsigned int get_nof_vertices() const {return vertices.size(); }\n\n\t// \\copydoc AbstractGraph::permute(const unsigned int* const perm) const\n\tGraph* permute(const unsigned* perm) const {\n#if defined(BLISS_CONSISTENCY_CHECKS)\n\t\tif(!is_permutation(get_nof_vertices(), perm))\n\t\t\t_INTERNAL_ERROR();\n#endif\n\t\tGraph* const g = new Graph(get_nof_vertices());\n\t\tfor(unsigned i = 0; i < get_nof_vertices(); i++) {\n\t\t\tconst Vertex& v = vertices[i];\n\t\t\tVertex& permuted_v = g->vertices[perm[i]];\n\t\t\tpermuted_v.color = v.color;\n\t\t\tfor(std::vector<IndexEdge>::const_iterator ei = v.edges.begin(); ei != v.edges.end(); ei++) {\n#ifdef USE_DOMAIN\n\t\t\t\tconst unsigned dest_v = ei->first; //cxh\n\t\t\t\tpermuted_v.add_edge(perm[dest_v], ei->second);\n#else\n\t\t\t\tconst unsigned dest_v = *ei;\n\t\t\t\tpermuted_v.add_edge(perm[dest_v]);\n#endif\n\t\t\t}\n\t\t\tpermuted_v.sort_edges();\n\t\t}\n\t\treturn g;\n\t}\n\tGraph* permute(const std::vector<unsigned>& perm) const {\n#if defined(BLISS_CONSISTENCY_CHECKS)\n#endif\n\t\tGraph* const g = new Graph(get_nof_vertices());\n\t\tfor(unsigned int i = 0; i < get_nof_vertices(); i++) {\n\t\t\tconst Vertex& v = vertices[i];\n\t\t\tVertex& permuted_v = g->vertices[perm[i]];\n\t\t\tpermuted_v.color = 
v.color;\n\t\t\tfor(std::vector<IndexEdge>::const_iterator ei = v.edges.begin(); ei != v.edges.end(); ei++) {\n#ifdef USE_DOMAIN\n\t\t\t\tconst unsigned dest_v = ei->first; // cxh\n\t\t\t\tpermuted_v.add_edge(perm[dest_v], ei->second);\n#else\n\t\t\t\tconst unsigned dest_v = *ei;\n\t\t\t\tpermuted_v.add_edge(perm[dest_v]);\n#endif\n\t\t\t}\n\t\t\tpermuted_v.sort_edges();\n\t\t}\n\t\treturn g;\n\t}\n\t// Add a new vertex with color \\a color in the graph and return its index.\n\tunsigned add_vertex(const unsigned color = 0) {\n\t\tconst unsigned int vertex_num = vertices.size();\n\t\tvertices.resize(vertex_num + 1);\n\t\tvertices.back().color = color;\n\t\treturn vertex_num;\n\t}\n\t/**\n\t * Add an edge between vertices \\a v1 and \\a v2.\n\t * Duplicate edges between vertices are ignored but try to avoid introducing\n\t * them in the first place as they are not ignored immediately but will\n\t * consume memory and computation resources for a while.\n\t */\n\tvoid add_edge(const unsigned vertex1, const unsigned vertex2, Index index) {\n\t\t//printf(\"Adding edge (%u -> %u)\\n\", vertex1, vertex2);\n#ifdef USE_DOMAIN\n\t\tvertices[vertex1].add_edge(vertex2, index);\n\t\tvertices[vertex2].add_edge(vertex1, std::make_pair(index.second, index.first));\n#else\n\t\tvertices[vertex1].add_edge(vertex2);\n\t\tvertices[vertex2].add_edge(vertex1);\n#endif\n\t}\n\t// Change the color of the vertex \\a vertex to \\a color.\n\tvoid change_color(const unsigned vertex, const unsigned color) {\n\t\tvertices[vertex].color = color;\n\t}\n\n\t/**\n\t * Compare this graph with the graph \\a other.\n\t * Returns 0 if the graphs are equal, and a negative (positive) integer\n\t * if this graph is \"smaller than\" (\"greater than\", resp.) 
\\a other.\n\t */\n\tint cmp(Graph& other) {\n\t\t/* Compare the numbers of vertices */\n\t\tif(get_nof_vertices() < other.get_nof_vertices())\n\t\t\treturn -1;\n\t\tif(get_nof_vertices() > other.get_nof_vertices())\n\t\t\treturn 1;\n\t\t/* Compare vertex colors */\n\t\tfor(unsigned i = 0; i < get_nof_vertices(); i++) {\n\t\t\tif(vertices[i].color < other.vertices[i].color)\n\t\t\t\treturn -1;\n\t\t\tif(vertices[i].color > other.vertices[i].color)\n\t\t\t\treturn 1;\n\t\t}\n\t\t/* Compare vertex degrees */\n\t\tremove_duplicate_edges();\n\t\tother.remove_duplicate_edges();\n\t\tfor(unsigned i = 0; i < get_nof_vertices(); i++) {\n\t\t\tif(vertices[i].nof_edges() < other.vertices[i].nof_edges())\n\t\t\t\treturn -1;\n\t\t\tif(vertices[i].nof_edges() > other.vertices[i].nof_edges())\n\t\t\t\treturn 1;\n\t\t}\n\t\t/* Compare edges */\n\t\tfor(unsigned i = 0; i < get_nof_vertices(); i++) {\n\t\t\tVertex &v1 = vertices[i];\n\t\t\tVertex &v2 = other.vertices[i];\n\t\t\tv1.sort_edges();\n\t\t\tv2.sort_edges();\n\t\t\tstd::vector<IndexEdge>::const_iterator ei1 = v1.edges.begin();\n\t\t\tstd::vector<IndexEdge>::const_iterator ei2 = v2.edges.begin();\n\t\t\twhile(ei1 != v1.edges.end()) {\n#ifdef USE_DOMAIN\n\t\t\t\tif(ei1->first < ei2->first) return -1; // cxh\n\t\t\t\tif(ei1->first > ei2->first) return 1; // cxh\n#else\n\t\t\t\tif(*ei1 < *ei2) return -1;\n\t\t\t\tif(*ei1 > *ei2) return 1;\n#endif\n\t\t\t\tei1++;\n\t\t\t\tei2++;\n\t\t\t}\n\t\t}\n\t\treturn 0;\n\t}\n\t/**\n\t * Set the splitting heuristic used by the automorphism and canonical\n\t * labeling algorithm.\n\t * The selected splitting heuristic affects the computed canonical\n\t * labelings; therefore, if you want to compare whether two graphs\n\t * are isomorphic by computing and comparing (for equality) their\n\t * canonical versions, be sure to use the same splitting heuristics\n\t * for both graphs.\n\t */\n\tvoid set_splitting_heuristic(const SplittingHeuristic shs) {sh = shs; }\n};\n\n}\n\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/heap.hh",
    "content": "#ifndef BLISS_HEAP_HH\n#define BLISS_HEAP_HH\n#include <stdlib.h>\n#include <stdio.h>\n#include <limits.h>\nnamespace bliss {\n/** \\internal\n * \\brief A capacity bounded heap data structure.\n */\nclass Heap {\n\tunsigned int N;\n\tunsigned int n;\n\tunsigned int *array;\n\t//void upheap(unsigned int k);\n\tvoid upheap(unsigned int index) {\n\t\tconst unsigned int v = array[index];\n\t\tarray[0] = 0;\n\t\twhile(array[index/2] > v) {\n\t\t\tarray[index] = array[index/2];\n\t\t\tindex = index/2;\n\t\t}\n\t\tarray[index] = v;\n\t}\n\t//void downheap(unsigned int k);\n\tvoid downheap(unsigned int index) {\n\t\tconst unsigned int v = array[index];\n\t\tconst unsigned int lim = n/2;\n\t\twhile(index <= lim) {\n\t\t\tunsigned int new_index = index + index;\n\t\t\tif((new_index < n) and (array[new_index] > array[new_index+1]))\n\t\t\t\tnew_index++;\n\t\t\tif(v <= array[new_index])\n\t\t\t\tbreak;\n\t\t\tarray[index] = array[new_index];\n\t\t\tindex = new_index;\n\t\t}\n\t\tarray[index] = v;\n\t}\n\npublic:\n\t/**\n\t * Create a new heap.\n\t * init() must be called after this.\n\t */\n\tHeap() {array = 0; n = 0; N = 0; }\n\t~Heap() {\n\t\tif(array) {\n\t\t\tfree(array);\n\t\t\tarray = 0;\n\t\t\tn = 0;\n\t\t\tN = 0;\n\t\t}\n\t}\n\t/**\n\t * Initialize the heap to have the capacity to hold \\e size elements.\n\t */\n\t//void init(const unsigned int size);\n\tvoid init(const unsigned int size) {\n\t\tif(size > N) {\n\t\t\tif(array) free(array);\n\t\t\tarray = (unsigned int*)malloc((size + 1) * sizeof(unsigned int));\n\t\t\tN = size;\n\t\t}\n\t}\n\t/**\n\t * Is the heap empty?\n\t * Time complexity is O(1).\n\t */\n\tbool is_empty() const { return (n==0); }\n\n\t/**\n\t * Remove all the elements in the heap.\n\t * Time complexity is O(1).\n\t */\n\tvoid clear() { n = 0; }\n\n\t/**\n\t * Insert the element \\a e in the heap.\n\t * Time complexity is O(log(N)), where N is the number of elements\n\t * currently in the heap.\n\t */\n\t//void insert(const 
unsigned int e);\n\tvoid insert(const unsigned int v) {\n\t\tarray[++n] = v;\n\t\tupheap(n);\n\t}\n\n\t/**\n\t * Remove and return the smallest element in the heap.\n\t * Time complexity is O(log(N)), where N is the number of elements\n\t * currently in the heap.\n\t */\n\t//unsigned int remove();\n\tunsigned int remove() {\n\t\tconst unsigned int v = array[1];\n\t\tarray[1] = array[n--];\n\t\tdownheap(1);\n\t\treturn v;\n\t}\n\n\t/**\n\t * Get the number of elements in the heap.\n\t */\n\tunsigned int size() const {return n; }\n\n};\n} // namespace bliss\n\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/kqueue.hh",
    "content": "#ifndef BLISS_KQUEUE_HH\n#define BLISS_KQUEUE_HH\n\n/*\n  Copyright (c) 2003-2015 Tommi Junttila\n  Released under the GNU Lesser General Public License version 3.\n  \n  This file is part of bliss.\n  \n  bliss is free software: you can redistribute it and/or modify\n  it under the terms of the GNU Lesser General Public License as published by\n  the Free Software Foundation, version 3 of the License.\n\n  bliss is distributed in the hope that it will be useful,\n  but WITHOUT ANY WARRANTY; without even the implied warranty of\n  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n  GNU Lesser General Public License for more details.\n\n  You should have received a copy of the GNU Lesser General Public License\n  along with bliss.  If not, see <http://www.gnu.org/licenses/>.\n*/\n\n#include \"defs.hh\"\n\nnamespace bliss {\n\n/** \\internal\n * \\brief A very simple implementation of queues with fixed capacity.\n */\n\ntemplate <class Type>\nclass KQueue\n{\npublic:\n  /**\n   * Create a new queue with capacity zero.\n   * The function init() should be called next.\n   */\n  KQueue();\n\n  ~KQueue();\n\n  /**\n   * Initialize the queue to have the capacity to hold at most \\a N elements.\n   */\n  void init(const unsigned int N);\n  \n  /** Is the queue empty? */\n  bool is_empty() const;\n\n  /** Return the number of elements in the queue. */\n  unsigned int size() const;\n\n  /** Remove all the elements in the queue. */\n  void clear();\n\n  /** Return (but don't remove) the first element in the queue. */\n  Type front() const;\n\n  /** Remove and return the first element of the queue. */\n  Type pop_front();\n\n  /** Push the element \\a e in the front of the queue. */\n  void push_front(Type e);\n\n  /** Remove and return the last element of the queue. */\n  Type pop_back();\n\n  /** Push the element \\a e in the back of the queue. 
*/\n  void push_back(Type e);\nprivate:\n  Type *entries, *end;\n  Type *head, *tail;\n};\n\ntemplate <class Type>\nKQueue<Type>::KQueue()\n{\n  entries = 0;\n  end = 0;\n  head = 0;\n  tail = 0;\n}\n\ntemplate <class Type>\nKQueue<Type>::~KQueue()\n{\n  if(entries)\n    free(entries);\n}\n\ntemplate <class Type>\nvoid KQueue<Type>::init(const unsigned int k)\n{\n  assert(k > 0);\n  if(entries)\n    free(entries);\n  entries = (Type*)malloc((k + 1) * sizeof(Type));\n  end = entries + k + 1;\n  head = entries;\n  tail = head;\n}\n\ntemplate <class Type>\nvoid KQueue<Type>::clear()\n{\n  head = entries;\n  tail = head;\n}\n\ntemplate <class Type>\nbool KQueue<Type>::is_empty() const\n{\n  return(head == tail);\n}\n\ntemplate <class Type>\nunsigned int KQueue<Type>::size() const\n{\n  if(tail >= head)\n    return(tail - head);\n  return((end - head) + (tail - entries));\n}\n\ntemplate <class Type>\nType KQueue<Type>::front() const\n{\n  return *head;\n}\n\ntemplate <class Type>\nType KQueue<Type>::pop_front()\n{\n  Type *old_head = head;\n  head++;\n  if(head == end)\n    head = entries;\n  return *old_head;\n}\n\ntemplate <class Type>\nvoid KQueue<Type>::push_front(Type e)\n{\n  if(head == entries)\n    head = end - 1;\n  else\n    head--;\n  *head = e;\n}\n\ntemplate <class Type>\nvoid KQueue<Type>::push_back(Type e)\n{\n  *tail = e;\n  tail++;\n  if(tail == end)\n    tail = entries;\n}\n\n} // namespace bliss\n\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/kstack.hh",
    "content": "#ifndef BLISS_KSTACK_H\n#define BLISS_KSTACK_H\n\n/*\n  Copyright (c) 2003-2015 Tommi Junttila\n  Released under the GNU Lesser General Public License version 3.\n  \n  This file is part of bliss.\n  \n  bliss is free software: you can redistribute it and/or modify\n  it under the terms of the GNU Lesser General Public License as published by\n  the Free Software Foundation, version 3 of the License.\n\n  bliss is distributed in the hope that it will be useful,\n  but WITHOUT ANY WARRANTY; without even the implied warranty of\n  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n  GNU Lesser General Public License for more details.\n\n  You should have received a copy of the GNU Lesser General Public License\n  along with bliss.  If not, see <http://www.gnu.org/licenses/>.\n*/\n\n#include <cstdlib>\n#include \"defs.hh\"\n\nnamespace bliss {\n\n/** \\internal\n * \\brief A very simple implementation of a stack with fixed capacity.\n */\ntemplate <class Type>\nclass KStack {\npublic:\n  /**\n   * Create a new stack with zero capacity.\n   * The function init() should be called next.\n   */\n  KStack();\n\n  /**\n   * Create a new stack with the capacity to hold at most \\a N elements.\n   */\n  KStack(int N);\n\n  ~KStack();\n\n  /**\n   * Initialize the stack to have the capacity to hold at most \\a N elements.\n   */\n  void init(int N);\n\n  /**\n   * Is the stack empty?\n   */\n  bool is_empty() const {return(cursor == entries); }\n\n  /**\n   * Return (but don't remove) the top element of the stack.\n   */\n  Type top() const {BLISS_ASSERT(cursor > entries); return *cursor; }\n\n  /**\n   * Pop (remove) the top element of the stack.\n   */\n  Type pop()\n  {\n    return *cursor--;\n  }\n\n  /**\n   * Push the element \\a e in the stack.\n   */\n  void push(Type e)\n  {\n    *(++cursor) = e;\n  }\n\n  /** Remove all the elements in the stack. 
*/\n  void clean() {cursor = entries; }\n\n  /**\n   * Get the number of elements in the stack.\n   */\n  unsigned int size() const {return(cursor - entries); }\n\n  /**\n   * Return the i:th element in the stack, where \\a i is in the range\n   * 0,...,this.size()-1; the 0:th element is the bottom element\n   * in the stack.\n   */\n  Type element_at(unsigned int i)\n  {\n    assert(i < size());\n    return entries[i+1];\n  }\n\n  /** Return the capacity (NOT the number of elements) of the stack. */\n  int capacity() {return kapacity; }\nprivate:\n  int kapacity;\n  Type *entries;\n  Type *cursor;\n};\n\ntemplate <class Type>\nKStack<Type>::KStack()\n{\n  kapacity = 0;\n  entries = 0;\n  cursor = 0;\n}\n\ntemplate <class Type>\nKStack<Type>::KStack(int k)\n{\n  assert(k > 0);\n  kapacity = k;\n  entries = (Type*)malloc((k+1) * sizeof(Type));\n  cursor = entries;\n}\n\ntemplate <class Type>\nvoid KStack<Type>::init(int k)\n{\n  assert(k > 0);\n  if(entries)\n    free(entries);\n  kapacity = k;\n  entries = (Type*)malloc((k+1) * sizeof(Type));\n  cursor = entries;\n}\n\ntemplate <class Type>\nKStack<Type>::~KStack()\n{\n  free(entries);\n}\n\n} // namespace bliss\n\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/orbit.hh",
    "content": "#ifndef BLISS_ORBIT_HH\n#define BLISS_ORBIT_HH\n\nnamespace bliss {\nclass Orbit {\n\tclass OrbitEntry {\n\t\tpublic:\n\t\t\tunsigned int element;\n\t\t\tOrbitEntry *next;\n\t\t\tunsigned int size;\n\t};\n\tOrbitEntry *orbits;\n\tOrbitEntry **in_orbit;\n\tunsigned int nof_elements;\n\tunsigned int _nof_orbits;\n\tvoid merge_orbits(OrbitEntry *orbit1, OrbitEntry *orbit2) {\n\t\tif(orbit1 != orbit2) {\n\t\t\t_nof_orbits--;\n\t\t\t// Only update the elements in the smaller orbit\n\t\t\tif(orbit1->size > orbit2->size) {\n\t\t\t\tOrbitEntry * const temp = orbit2;\n\t\t\t\torbit2 = orbit1;\n\t\t\t\torbit1 = temp;\n\t\t\t}\n\t\t\t// Link the elements of orbit1 to the almost beginning of orbit2\n\t\t\tOrbitEntry *e = orbit1;\n\t\t\twhile(e->next) {\n\t\t\t\tin_orbit[e->element] = orbit2;\n\t\t\t\te = e->next;\n\t\t\t}\n\t\t\tin_orbit[e->element] = orbit2;\n\t\t\te->next = orbit2->next;\n\t\t\torbit2->next = orbit1;\n\t\t\t// Keep the minimal orbit representative in the beginning\n\t\t\tif(orbit1->element < orbit2->element) {\n\t\t\t\tconst unsigned int temp = orbit1->element;\n\t\t\t\torbit1->element = orbit2->element;\n\t\t\t\torbit2->element = temp;\n\t\t\t}\n\t\t\torbit2->size += orbit1->size;\n\t\t}\n\t}\n\n\tpublic:\n\t// Create a new orbit information object.\n\t// The init() function must be called next to actually initialize the object.\n\tOrbit() {\n\t\torbits = 0;\n\t\tin_orbit = 0;\n\t\tnof_elements = 0;\n\t}\n\t~Orbit() {\n\t\tif(orbits) {\n\t\t\tfree(orbits);\n\t\t\torbits = 0;\n\t\t}\n\t\tif(in_orbit) {\n\t\t\tfree(in_orbit);\n\t\t\tin_orbit = 0;\n\t\t}\n\t\tnof_elements = 0;\n\t}\n\n\t// Initialize the orbit information to consider sets of \\a N elements.\n\t// It is required that \\a N > 0.\n\t// The orbit information is reset so that each element forms an orbit of its own.\n\t// Time complexity is O(N). 
\\sa reset()\n\tvoid init(const unsigned int n) {\n\t\tassert(n > 0);\n\t\tif(orbits) free(orbits);\n\t\torbits = (OrbitEntry*)malloc(n * sizeof(OrbitEntry));\n\t\tif(in_orbit) free(in_orbit);\n\t\tin_orbit = (OrbitEntry**)malloc(n * sizeof(OrbitEntry*));\n\t\tnof_elements = n;\n\t\treset();\n\t}\n\n\t// Reset the orbits so that each element forms an orbit of its own.\n\t// Time complexity is O(N).\n\tvoid reset() {\n\t\tassert(orbits);\n\t\tassert(in_orbit);\n\t\tfor(unsigned int i = 0; i < nof_elements; i++) {\n\t\t\torbits[i].element = i;\n\t\t\torbits[i].next = 0;\n\t\t\torbits[i].size = 1;\n\t\t\tin_orbit[i] = &orbits[i];\n\t\t}\n\t\t_nof_orbits = nof_elements;\n\t}\n\n\t// Merge the orbits of the elements \\a e1 and \\a e2.\n\t// Time complexity is O(k), where k is the number of elements in\n\t// the smaller of the merged orbits.\n\tvoid merge_orbits(unsigned int e1, unsigned int e2) {\n\t\tmerge_orbits(in_orbit[e1], in_orbit[e2]);\n\t}\n\n\t// Is the element \\a element the smallest element in its orbit?\n\t// Time complexity is O(1).\n\tbool is_minimal_representative(unsigned element) const {\n\t\treturn(get_minimal_representative(element) == element);\n\t}\n\t// Get the smallest element in the orbit of the element \\a element.\n\t// Time complexity is O(1).\n\tunsigned get_minimal_representative(unsigned element) const {\n\t\tOrbitEntry * const orbit = in_orbit[element];\n\t\treturn(orbit->element);\n\t}\n\t// Get the number of elements in the orbit of the element \\a element.\n\t// Time complexity is O(1).\n\n\tunsigned orbit_size(unsigned element) const {\n\t\treturn(in_orbit[element]->size);\n\t}\n\t// Get the number of orbits.\n\t// Time complexity is O(1).\n\tunsigned int nof_orbits() const {return _nof_orbits; }\n};\n\n} // namespace bliss\n\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/partition.hh",
    "content": "#ifndef BLISS_PARTITION_HH\n#define BLISS_PARTITION_HH\n\n/*\n  Copyright (c) 2003-2015 Tommi Junttila\n  Released under the GNU Lesser General Public License version 3.\n  \n  This file is part of bliss.\n  \n  bliss is free software: you can redistribute it and/or modify\n  it under the terms of the GNU Lesser General Public License as published by\n  the Free Software Foundation, version 3 of the License.\n\n  bliss is distributed in the hope that it will be useful,\n  but WITHOUT ANY WARRANTY; without even the implied warranty of\n  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n  GNU Lesser General Public License for more details.\n\n  You should have received a copy of the GNU Lesser General Public License\n  along with bliss.  If not, see <http://www.gnu.org/licenses/>.\n*/\n\nnamespace bliss {\n  class Partition;\n}\n\n#include <cstdio>\n#include <cstdlib>\n#include <climits>\n#include <assert.h>\n#include \"kstack.hh\"\n#include \"kqueue.hh\"\n#include \"heap.hh\"\n#include \"orbit.hh\"\n#include \"abgraph.hh\"\n#include \"graph.hh\"\n\nnamespace bliss {\n\n/** \\internal\n * \\brief A class for refinable, backtrackable ordered partitions.\n *\n * This is rather a data structure with some helper functions than\n * a proper self-contained class.\n * That is, for efficiency reasons the fields of this class are directly\n * manipulated from bliss::AbstractGraph and its subclasses.\n * Conversely, some methods of this class modify the fields of\n * bliss::AbstractGraph, too.\n */\nclass Partition\n{\npublic:\n  /**\n   * \\brief Data structure for holding information about a cell in a Partition.\n   */\n  class Cell\n  {\n    friend class Partition;\n  public:\n    unsigned int length;\n    // Index of the first element of the cell in the Partition::elements array\n    unsigned int first;\n    unsigned int max_ival;\n    unsigned int max_ival_count;\n  private:\n    bool in_splitting_queue;\n  public:\n    bool 
in_neighbour_heap;\n    /* Pointer to the next cell, null if this is the last one. */\n    Cell* next;\n    Cell* prev;\n    Cell* next_nonsingleton;\n    Cell* prev_nonsingleton;\n    unsigned int split_level;\n    /** Is this a unit cell? */\n    bool is_unit() const {return(length == 1); }\n    /** Is this cell in splitting queue? */\n    bool is_in_splitting_queue() const {return(in_splitting_queue); }\n  };\n\n\nprivate:\n\n  /** \\internal\n   * Data structure for remembering information about splits in order to\n   * perform efficient backtracking over the splits.\n   */\n  class RefInfo {\n  public:\n    unsigned int split_cell_first;\n    int prev_nonsingleton_first;\n    int next_nonsingleton_first;\n  };\n  /** \\internal\n   * A stack for remembering the splits, used for backtracking.\n   */\n  KStack<RefInfo> refinement_stack;\n\n  class BacktrackInfo {\n  public:\n    BacktrackInfo() : refinement_stack_size(0), cr_backtrack_point(0) {}\n    unsigned int refinement_stack_size;\n    unsigned int cr_backtrack_point;\n  };\n\n  /** \\internal\n   * The main stack for enabling backtracking.\n   */\n  std::vector<BacktrackInfo> bt_stack;\n\npublic:\n  AbstractGraph* graph;\n\n  /* Used during equitable partition refinement */\n  KQueue<Cell*> splitting_queue;\n  //void  splitting_queue_add(Cell* const cell);\n  Cell* splitting_queue_pop();\n  bool  splitting_queue_is_empty() const;\n  //void  splitting_queue_clear();\n\nvoid splitting_queue_add(Cell* const cell) {\n  static const unsigned int smallish_cell_threshold = 1;\n  cell->in_splitting_queue = true;\n  if(cell->length <= smallish_cell_threshold)\n    splitting_queue.push_front(cell);\n  else\n    splitting_queue.push_back(cell);    \n}\n\nvoid splitting_queue_clear() {\n  while(!splitting_queue_is_empty())\n    splitting_queue_pop();\n}\n\n  /** Type for backtracking points. 
*/\n  typedef unsigned int BacktrackPoint;\n\n  /**\n   * Get a new backtrack point for the current partition\n   */\n  //BacktrackPoint set_backtrack_point();\nBacktrackPoint set_backtrack_point() {\n  BacktrackInfo info;\n  info.refinement_stack_size = refinement_stack.size();\n  if(cr_enabled)\n    info.cr_backtrack_point = cr_get_backtrack_point();\n  BacktrackPoint p = bt_stack.size();\n  bt_stack.push_back(info);\n  return p;\n}\n\n  /**\n   * Backtrack to the point \\a p and remove it.\n   */\n  //void goto_backtrack_point(BacktrackPoint p);\nvoid goto_backtrack_point(BacktrackPoint p) {\n  BacktrackInfo info = bt_stack[p];\n  bt_stack.resize(p);\n  if(cr_enabled)\n    cr_goto_backtrack_point(info.cr_backtrack_point);\n  const unsigned int dest_refinement_stack_size = info.refinement_stack_size;\n  assert(refinement_stack.size() >= dest_refinement_stack_size);\n  while(refinement_stack.size() > dest_refinement_stack_size) {\n      RefInfo i = refinement_stack.pop();\n      const unsigned int first = i.split_cell_first;\n      Cell* cell = get_cell(elements[first]);\n      if(cell->first != first) {\n\t  assert(cell->first < first);\n\t  assert(cell->split_level <= dest_refinement_stack_size);\n\t  goto done;\n\t}\n      assert(cell->split_level > dest_refinement_stack_size);\n      while(cell->split_level > dest_refinement_stack_size) {\n\t  assert(cell->prev);\n\t  cell = cell->prev;\n\t}\n      while(cell->next and cell->next->split_level > dest_refinement_stack_size) {\n\t  /* Merge next cell */\n\t  Cell* const next_cell = cell->next;\n\t  if(cell->length == 1)\n\t    discrete_cell_count--;\n\t  if(next_cell->length == 1)\n\t    discrete_cell_count--;\n\t  /* Update element_to_cell_map values of elements added in cell */\n\t  unsigned int* ep = elements + next_cell->first;\n\t  unsigned int* const lp = ep + next_cell->length;\n\t  for( ; ep < lp; ep++)\n\t    element_to_cell_map[*ep] = cell;\n\t  /* Update cell parameters */\n\t  cell->length += 
next_cell->length;\n\t  if(next_cell->next)\n\t    next_cell->next->prev = cell;\n\t  cell->next = next_cell->next;\n\t  /* (Pseudo)free next_cell */\n\t  next_cell->first = 0;\n\t  next_cell->length = 0;\n\t  next_cell->prev = 0;\n\t  next_cell->next = free_cells;\n\t  free_cells = next_cell;\n\t}\n\n    done:\n      if(i.prev_nonsingleton_first >= 0) {\n\t  Cell* const prev_cell = get_cell(elements[i.prev_nonsingleton_first]);\n\t  cell->prev_nonsingleton = prev_cell;\n\t  prev_cell->next_nonsingleton = cell;\n\t} else {\n\t  //assert(cell->prev_nonsingleton == 0);\n\t  cell->prev_nonsingleton = 0;\n\t  first_nonsingleton_cell = cell;\n\t}\n\n      if(i.next_nonsingleton_first >= 0) {\n\t  Cell* const next_cell = get_cell(elements[i.next_nonsingleton_first]);\n\t  cell->next_nonsingleton = next_cell;\n\t  next_cell->prev_nonsingleton = cell;\n\t} else {\n\t  //assert(cell->next_nonsingleton == 0);\n\t  cell->next_nonsingleton = 0;\n\t}\n    }\n}\n  /**\n   * Split the non-unit Cell \\a cell = {\\a element,e1,e2,...,en} containing\n   * the element \\a element in two:\n   * \\a cell = {e1,...,en} and \\a newcell = {\\a element}.\n   * @param cell     a non-unit Cell\n   * @param element  an element in \\a cell\n   * @return         the new unit Cell \\a newcell\n   */\n  //Cell* individualize(Cell* const cell, const unsigned int element);\nCell* individualize(Cell * const cell, const unsigned int element) {\n  unsigned int * const pos = in_pos[element];\n  const unsigned int last = cell->first + cell->length - 1;\n  *pos = elements[last];\n  in_pos[*pos] = pos;\n  elements[last] = element;\n  in_pos[element] = elements + last;\n  Partition::Cell * const new_cell = aux_split_in_two(cell, cell->length-1);\n  element_to_cell_map[element] = new_cell;\n  return new_cell;\n}\n\n  //Cell* aux_split_in_two(Cell* const cell, const unsigned int first_half_size);\nCell* aux_split_in_two(Cell* const cell, const unsigned int first_half_size) {\n  RefInfo i;\n  /* 
(Pseudo)allocate new cell */\n  Cell * const new_cell = free_cells;\n  free_cells = new_cell->next;\n  /* Update new cell parameters */\n  new_cell->first = cell->first + first_half_size;\n  new_cell->length = cell->length - first_half_size;\n  new_cell->next = cell->next;\n  if(new_cell->next)\n    new_cell->next->prev = new_cell;\n  new_cell->prev = cell;\n  new_cell->split_level = refinement_stack.size()+1;\n  /* Update old, splitted cell parameters */\n  cell->length = first_half_size;\n  cell->next = new_cell;\n  /* CR */\n  if(cr_enabled)\n    cr_create_at_level_trailed(new_cell->first, cr_get_level(cell->first));\n\n  /* Add cell in refinement_stack for backtracking */\n  i.split_cell_first = new_cell->first;\n  if(cell->prev_nonsingleton)\n    i.prev_nonsingleton_first = cell->prev_nonsingleton->first;\n  else\n    i.prev_nonsingleton_first = -1;\n  if(cell->next_nonsingleton)\n    i.next_nonsingleton_first = cell->next_nonsingleton->first;\n  else\n    i.next_nonsingleton_first = -1;\n  refinement_stack.push(i);\n\n  /* Modify nonsingleton cell list */\n  if(new_cell->length > 1)\n    {\n      new_cell->prev_nonsingleton = cell;\n      new_cell->next_nonsingleton = cell->next_nonsingleton;\n      if(new_cell->next_nonsingleton)\n\tnew_cell->next_nonsingleton->prev_nonsingleton = new_cell;\n      cell->next_nonsingleton = new_cell;\n    }\n  else\n    {\n      new_cell->next_nonsingleton = 0;\n      new_cell->prev_nonsingleton = 0;\n      discrete_cell_count++;\n    }\n\n  if(cell->is_unit())\n    {\n      if(cell->prev_nonsingleton)\n\tcell->prev_nonsingleton->next_nonsingleton = cell->next_nonsingleton;\n      else\n\tfirst_nonsingleton_cell = cell->next_nonsingleton;\n      if(cell->next_nonsingleton)\n\tcell->next_nonsingleton->prev_nonsingleton = cell->prev_nonsingleton;\n      cell->next_nonsingleton = 0;\n      cell->prev_nonsingleton = 0;\n      discrete_cell_count++;\n    }\n\n  return new_cell;\n} \n\nprivate:\n  unsigned int N;\n  Cell* cells;\n  
Cell* free_cells;\n  unsigned int discrete_cell_count;\npublic:\n  Cell* first_cell;\n  Cell* first_nonsingleton_cell;\n  unsigned int *elements;\n  /* invariant_values[e] gives the invariant value of the element e */\n  unsigned int *invariant_values;\n  /* element_to_cell_map[e] gives the cell of the element e */\n  Cell **element_to_cell_map;\n  /** Get the cell of the element \\a e */\n  Cell* get_cell(const unsigned int e) const {\n    return element_to_cell_map[e];\n  }\n  /* in_pos[e] points to the elements array s.t. *in_pos[e] = e  */\n  unsigned int **in_pos;\n\n  //Partition();\n  //~Partition();\nPartition() {\n  N = 0;\n  elements = 0;\n  in_pos = 0;\n  invariant_values = 0;\n  cells = 0;\n  free_cells = 0;\n  element_to_cell_map = 0;\n  graph = 0;\n  discrete_cell_count = 0;\n  /* Initialize a distribution count sorting array. */\n  for(unsigned int i = 0; i < 256; i++)\n    dcs_count[i] = 0;\n\n  cr_enabled = false;\n  cr_cells = 0;\n  cr_levels = 0;\n}\n\n~Partition() {\n  if(elements)            {free(elements); elements = 0; }\n  if(cells)               {free(cells); cells = 0; }\n  if(element_to_cell_map) {free(element_to_cell_map); element_to_cell_map = 0; }\n  if(in_pos)              {free(in_pos); in_pos = 0; }\n  if(invariant_values)    {free(invariant_values); invariant_values = 0; }\n  N = 0;\n}\n\n  /**\n   * Initialize the partition to the unit partition (all elements in one cell)\n   * over the \\a N > 0 elements {0,...,\\a N-1}.\n   */\n  //void init(const unsigned int N);\nvoid init(const unsigned int M) {\n  assert(M > 0);\n  N = M;\n\n  if(elements)\n    free(elements);\n  elements = (unsigned int*)malloc(N * sizeof(unsigned int));\n  for(unsigned int i = 0; i < N; i++)\n    elements[i] = i;\n\n  if(in_pos)\n    free(in_pos);\n  in_pos = (unsigned int**)malloc(N * sizeof(unsigned int*));\n  for(unsigned int i = 0; i < N; i++)\n    in_pos[i] = elements + i;\n\n  if(invariant_values)\n    free(invariant_values);\n  invariant_values = 
(unsigned int*)malloc(N * sizeof(unsigned int));\n  for(unsigned int i = 0; i < N; i++)\n    invariant_values[i] = 0;\n\n  if(cells)\n    free(cells);\n  cells = (Cell*)malloc(N * sizeof(Cell));\n\n  cells[0].first = 0;\n  cells[0].length = N;\n  cells[0].max_ival = 0;\n  cells[0].max_ival_count = 0;\n  cells[0].in_splitting_queue = false;\n  cells[0].in_neighbour_heap = false;\n  cells[0].prev = 0;\n  cells[0].next = 0;\n  cells[0].next_nonsingleton = 0;\n  cells[0].prev_nonsingleton = 0;\n  cells[0].split_level = 0;\n  first_cell = &cells[0];\n  if(N == 1)\n    {\n      first_nonsingleton_cell = 0;\n      discrete_cell_count = 1;\n    }\n  else\n    {\n      first_nonsingleton_cell = &cells[0];\n      discrete_cell_count = 0;\n    }\n\n  for(unsigned int i = 1; i < N; i++)\n    {\n      cells[i].first = 0;\n      cells[i].length = 0;\n      cells[i].max_ival = 0;\n      cells[i].max_ival_count = 0;\n      cells[i].in_splitting_queue = false;\n      cells[i].in_neighbour_heap = false;\n      cells[i].prev = 0;\n      cells[i].next = (i < N-1)?&cells[i+1]:0;\n      cells[i].next_nonsingleton = 0;\n      cells[i].prev_nonsingleton = 0;\n    }\n  if(N > 1)\n    free_cells = &cells[1];\n  else\n    free_cells = 0;\n\n  if(element_to_cell_map)\n    free(element_to_cell_map);\n  element_to_cell_map = (Cell **)malloc(N * sizeof(Cell *));\n  for(unsigned int i = 0; i < N; i++)\n    element_to_cell_map[i] = first_cell;\n\n  splitting_queue.init(N);\n  refinement_stack.init(N);\n\n  /* Reset the main backtracking stack */\n  bt_stack.clear();\n}\n  /**\n   * Returns true iff the partition is discrete, meaning that all\n   * the elements are in their own cells.\n   */\n  bool is_discrete() const {return(free_cells == 0); }\n\n  unsigned int nof_discrete_cells() const {return(discrete_cell_count); }\n\n  /**\n   * Print the partition into the file stream \\a fp.\n   */\n  //size_t print(FILE* const fp, const bool add_newline = true) const;\nsize_t print(FILE* const fp, const 
bool add_newline = true) const {\n  size_t r = 0;\n  const char* cell_sep = \"\";\n  r += fprintf(fp, \"[\");\n  for(Cell* cell = first_cell; cell; cell = cell->next)\n    {\n      /* Print cell */\n      r += fprintf(fp, \"%s{\", cell_sep);\n      cell_sep = \",\";\n      const char* elem_sep = \"\";\n      for(unsigned int i = 0; i < cell->length; i++)\n\t{\n\t  r += fprintf(fp, \"%s%u\", elem_sep, elements[cell->first + i]);\n\t  elem_sep = \",\";\n\t}\n      r += fprintf(fp, \"}\");\n    }\n  r += fprintf(fp, \"]\");\n  if(add_newline) r += fprintf(fp, \"\\n\");\n  return r;\n}\n\n  /**\n   * Print the partition cell sizes into the file stream \\a fp.\n   */\n  //size_t print_signature(FILE* const fp, const bool add_newline = true) const;\nsize_t print_signature(FILE* const fp, const bool add_newline = true) const {\n  size_t r = 0;\n  const char* cell_sep = \"\";\n  r += fprintf(fp, \"[\");\n  for(Cell* cell = first_cell; cell; cell = cell->next)\n    {\n      if(cell->is_unit()) continue;\n      //fprintf(fp, \"%s%u\", cell_sep, cr_cells[cell->first].level);\n      r += fprintf(fp, \"%s%u\", cell_sep, cell->length);\n      cell_sep = \",\";\n    }\n  r += fprintf(fp, \"]\");\n  if(add_newline) r += fprintf(fp, \"\\n\");\n  return r;\n}\n\n  /*\n   * Splits the Cell \\a cell into [cell_1,...,cell_n]\n   * according to the invariant_values of the elements in \\a cell.\n   * After splitting, cell_1 == \\a cell.\n   * Returns the pointer to the Cell cell_n;\n   * cell_n != cell iff the Cell \\a cell was actually splitted.\n   * The flag \\a max_ival_info_ok indicates whether the max_ival and\n   * max_ival_count fields of the Cell \\a cell have consistent values\n   * when the method is called.\n   * Clears the invariant values of elements in the Cell \\a cell as well as\n   * the max_ival and max_ival_count fields of the Cell \\a cell.\n   */\n  //Cell *zplit_cell(Cell * const cell, const bool max_ival_info_ok);\nCell* zplit_cell(Cell* const cell, const bool 
max_ival_info_ok) {\n  Cell* last_new_cell = cell;\n  if(!max_ival_info_ok) {\n      /* Compute max_ival info */\n      assert(cell->max_ival == 0);\n      assert(cell->max_ival_count == 0);\n      unsigned int *ep = elements + cell->first;\n      for(unsigned int i = cell->length; i > 0; i--, ep++) {\n\t  const unsigned int ival = invariant_values[*ep];\n\t  if(ival > cell->max_ival) {\n\t      cell->max_ival = ival;\n\t      cell->max_ival_count = 1;\n\t    }\n\t  else if(ival == cell->max_ival) {\n\t      cell->max_ival_count++;\n\t    }\n\t}\n    }\n\n#ifdef BLISS_CONSISTENCY_CHECKS\n  /* Verify max_ival info */\n  {\n    unsigned int nof_zeros = 0;\n    unsigned int max_ival = 0;\n    unsigned int max_ival_count = 0;\n    unsigned int *ep = elements + cell->first;\n    for(unsigned int i = cell->length; i > 0; i--, ep++)\n      {\n\tconst unsigned int ival = invariant_values[*ep];\n\tif(ival == 0)\n\t  nof_zeros++;\n\tif(ival > max_ival)\n\t  {\n\t    max_ival = ival;\n\t    max_ival_count = 1;\n\t  }\n\telse if(ival == max_ival)\n\t  max_ival_count++;\n      }\n    assert(max_ival == cell->max_ival);\n    assert(max_ival_count == cell->max_ival_count);\n  }\n#endif\n\n  /* max_ival info has been computed */\n\n  if(cell->max_ival_count == cell->length)\n    {\n      /* All invariant values are the same, clear 'em */\n      if(cell->max_ival > 0)\n\tclear_ivs(cell);\n    }\n  else\n    {\n      /* All invariant values are not the same */\n      if(cell->max_ival == 1)\n\t{\n\t  /* Specialized splitting for cells with binary invariant values */\n\t  last_new_cell = sort_and_split_cell1(cell);\n\t}\n      else if(cell->max_ival < 256)\n\t{\n\t  /* Specialized splitting for cells with invariant values < 256 */\n\t  last_new_cell = sort_and_split_cell255(cell, cell->max_ival);\n\t}\n      else\n\t{\n\t  /* Generic sorting and splitting */\n\t  const bool sorted = shellsort_cell(cell);\n\t  if (!sorted) printf(\"error sorting\\n\");\n\t  assert(sorted);\n\t  
last_new_cell = split_cell(cell);\n\t}\n    }\n  cell->max_ival = 0;\n  cell->max_ival_count = 0;\n  return last_new_cell;\n}\n  /*\n   * Routines for component recursion\n   */\n  //void cr_init();\n  //void cr_free();\n  unsigned int cr_get_level(const unsigned int cell_index) const;\n  //unsigned int cr_split_level(const unsigned int level, const std::vector<unsigned int>& cells);\n/*\n *\n * Component recursion specific code\n *\n */\nvoid cr_init() {\n  assert(bt_stack.empty());\n  cr_enabled = true;\n  if(cr_cells) free(cr_cells);\n  cr_cells = (CRCell*)malloc(N * sizeof(CRCell));\n  if(!cr_cells) {assert(false && \"Mem out\"); }\n\n  if(cr_levels) free(cr_levels);\n  cr_levels = (CRCell**)malloc(N * sizeof(CRCell*));\n  if(!cr_levels) {assert(false && \"Mem out\"); }\n\n  for(unsigned int i = 0; i < N; i++) {\n    cr_levels[i] = 0;\n    cr_cells[i].level = UINT_MAX;\n    cr_cells[i].next = 0;\n    cr_cells[i].prev_next_ptr = 0;\n  }\n\n  for(const Cell *cell = first_cell; cell; cell = cell->next)\n    cr_create_at_level_trailed(cell->first, 0);\n\n  cr_max_level = 0;\n}\n\n\nvoid cr_free() {\n  if(cr_cells) {free(cr_cells); cr_cells = 0; }\n  if(cr_levels) {free(cr_levels); cr_levels = 0; }\n  cr_created_trail.clear();\n  cr_splitted_level_trail.clear();\n  cr_bt_info.clear();\n  cr_max_level = 0;\n  cr_enabled = false;\n}\n\nunsigned int cr_split_level(const unsigned int level, const std::vector<unsigned int>& splitted_cells) {\n  assert(cr_enabled);\n  assert(level <= cr_max_level);\n  cr_levels[++cr_max_level] = 0;\n  cr_splitted_level_trail.push_back(level);\n  for(unsigned int i = 0; i < splitted_cells.size(); i++) {\n      const unsigned int cell_index = splitted_cells[i];\n      assert(cell_index < N);\n      CRCell& cr_cell = cr_cells[cell_index];\n      assert(cr_cell.level == level);\n      cr_cell.detach();\n      cr_create_at_level(cell_index, cr_max_level);\n    }\n  return cr_max_level;\n}\n\n  /** Clear the invariant_values of the elements in 
the Cell \\a cell. */\n  //void clear_ivs(Cell* const cell);\nvoid clear_ivs(Cell* const cell) {\n  unsigned int* ep = elements + cell->first;\n  for(unsigned int i = cell->length; i > 0; i--, ep++)\n    invariant_values[*ep] = 0;\n}\n\nprivate:\n  /*\n   * Component recursion data structures\n   */\n\n  /* Is component recursion support in use? */\n  bool cr_enabled;\n\n  class CRCell {\n  public:\n    unsigned int level;\n    CRCell* next;\n    CRCell** prev_next_ptr;\n    void detach() {\n      if(next)\n\tnext->prev_next_ptr = prev_next_ptr;\n      *(prev_next_ptr) = next;\n      level = UINT_MAX;\n      next = 0;\n      prev_next_ptr = 0;\n    }\n  };\n  CRCell* cr_cells;\n  CRCell** cr_levels;\n  class CR_BTInfo {\n  public:\n    unsigned int created_trail_index;\n    unsigned int splitted_level_trail_index;\n  };\n  std::vector<unsigned int> cr_created_trail;\n  std::vector<unsigned int> cr_splitted_level_trail;\n  std::vector<CR_BTInfo> cr_bt_info;\n  unsigned int cr_max_level;\n  //void cr_create_at_level(const unsigned int cell_index, unsigned int level);\n  //void cr_create_at_level_trailed(const unsigned int cell_index, unsigned int level);\n  //unsigned int cr_get_backtrack_point();\n  //void cr_goto_backtrack_point(const unsigned int btpoint);\n\nvoid cr_create_at_level(const unsigned int cell_index, const unsigned int level) {\n  assert(cr_enabled);\n  assert(cell_index < N);\n  assert(level < N);\n  CRCell& cr_cell = cr_cells[cell_index];\n  assert(cr_cell.level == UINT_MAX);\n  assert(cr_cell.next == 0);\n  assert(cr_cell.prev_next_ptr == 0);\n  if(cr_levels[level])\n    cr_levels[level]->prev_next_ptr = &(cr_cell.next);\n  cr_cell.next = cr_levels[level];\n  cr_levels[level] = &cr_cell;\n  cr_cell.prev_next_ptr = &cr_levels[level];\n  cr_cell.level = level;\n}\n\n\nvoid cr_create_at_level_trailed(const unsigned int cell_index, const unsigned int level) {\n  assert(cr_enabled);\n  cr_create_at_level(cell_index, level);\n  
cr_created_trail.push_back(cell_index);\n}\n\nunsigned int cr_get_backtrack_point() {\n  assert(cr_enabled);\n  CR_BTInfo info;\n  info.created_trail_index = cr_created_trail.size();\n  info.splitted_level_trail_index = cr_splitted_level_trail.size();\n  cr_bt_info.push_back(info);\n  return cr_bt_info.size()-1;\n}\n\n\nvoid cr_goto_backtrack_point(const unsigned int btpoint) {\n  assert(cr_enabled);\n  assert(btpoint < cr_bt_info.size());\n  while(cr_created_trail.size() > cr_bt_info[btpoint].created_trail_index) {\n      const unsigned int cell_index = cr_created_trail.back();\n      cr_created_trail.pop_back();\n      CRCell& cr_cell = cr_cells[cell_index];\n      assert(cr_cell.level != UINT_MAX);\n      assert(cr_cell.prev_next_ptr);\n      cr_cell.detach();\n    }\n\n  while(cr_splitted_level_trail.size() >\n\tcr_bt_info[btpoint].splitted_level_trail_index)\n    {\n      const unsigned int dest_level = cr_splitted_level_trail.back();\n      cr_splitted_level_trail.pop_back();\n      assert(cr_max_level > 0);\n      assert(dest_level < cr_max_level);\n      while(cr_levels[cr_max_level]) {\n\tCRCell *cr_cell = cr_levels[cr_max_level];\n\tcr_cell->detach();\n\tcr_create_at_level(cr_cell - cr_cells, dest_level);\n      }\n      cr_max_level--;\n    }\n  cr_bt_info.resize(btpoint);\n}\n\n  // Auxiliary routines for sorting and splitting cells\n  //Cell* sort_and_split_cell1(Cell* cell);\n  //Cell* sort_and_split_cell255(Cell* const cell, const unsigned int max_ival);\n  //bool shellsort_cell(Cell* cell);\n\n// Assumes that the invariant values are NOT the same and that the cell contains more than one element\nCell* sort_and_split_cell1(Cell* const cell) {\n#if defined(BLISS_EXPENSIVE_CONSISTENCY_CHECKS)\n  assert(cell->length > 1);\n  assert(cell->first + cell->length <= N);\n  unsigned int nof_0_found = 0;\n  unsigned int nof_1_found = 0;\n  for(unsigned int i = cell->first; i < cell->first + cell->length; i++)\n    {\n      const unsigned int ival = 
invariant_values[elements[i]];\n      assert(ival == 0 or ival == 1);\n      if(ival == 0) nof_0_found++;\n      else nof_1_found++;\n    }\n  assert(nof_0_found > 0);\n  assert(nof_1_found > 0);\n  assert(nof_1_found == cell->max_ival_count);\n  assert(nof_0_found + nof_1_found == cell->length);\n  assert(cell->max_ival == 1);\n#endif\n\n\n  /* (Pseudo)allocate new cell */\n  Cell* const new_cell = free_cells;\n  free_cells = new_cell->next;\n\n#define NEW_SORT1\n#ifdef NEW_SORT1\n      unsigned int *ep0 = elements + cell->first;\n      unsigned int *ep1 = ep0 + cell->length - cell->max_ival_count;\n      if(cell->max_ival_count > cell->length / 2)\n\t{\n\t  /* There are more ones than zeros, only move zeros */\n\t  unsigned int * const end = ep0 + cell->length;\n\t  while(ep1 < end)\n\t    {\n\t      while(invariant_values[*ep1] == 0)\n\t\t{\n\t\t  const unsigned int tmp = *ep1;\n\t\t  *ep1 = *ep0;\n\t\t  *ep0 = tmp;\n\t\t  in_pos[tmp] = ep0;\n\t\t  in_pos[*ep1] = ep1;\n\t\t  ep0++;\n\t\t}\n\t      element_to_cell_map[*ep1] = new_cell;\n\t      invariant_values[*ep1] = 0;\n\t      ep1++;\n\t    }\n\t}\n      else\n\t{\n\t  /* There are more zeros than ones, only move ones */\n\t  unsigned int * const end = ep1;\n\t  while(ep0 < end)\n\t    {\n\t      while(invariant_values[*ep0] != 0)\n\t\t{\n\t\t  const unsigned int tmp = *ep0;\n\t\t  *ep0 = *ep1;\n\t\t  *ep1 = tmp;\n\t\t  in_pos[tmp] = ep1;\n\t\t  in_pos[*ep0] = ep0;\n\t\t  ep1++;\n\t\t}\n\t      ep0++;\n\t    }\n\t  ep1 = end;\n\t  while(ep1 < elements + cell->first + cell->length)\n\t    {\n\t      element_to_cell_map[*ep1] = new_cell;\n\t      invariant_values[*ep1] = 0;\n\t      ep1++;\n\t    }\n\t}\n  /* Update new cell parameters */\n  new_cell->first = cell->first + cell->length - cell->max_ival_count;\n  new_cell->length = cell->length - (new_cell->first - cell->first);\n  new_cell->next = cell->next;\n  if(new_cell->next)\n    new_cell->next->prev = new_cell;\n  new_cell->prev = cell;\n  
new_cell->split_level = refinement_stack.size()+1;\n  /* Update old, splitted cell parameters */\n  cell->length = new_cell->first - cell->first;\n  cell->next = new_cell;\n  /* CR */\n  if(cr_enabled)\n    cr_create_at_level_trailed(new_cell->first, cr_get_level(cell->first));\n\n#else\n  /* Sort vertices in the cell according to the invariant values */\n  unsigned int *ep0 = elements + cell->first;\n  unsigned int *ep1 = ep0 + cell->length;\n  while(ep1 > ep0)\n    {\n      const unsigned int element = *ep0;\n      const unsigned int ival = invariant_values[element];\n      invariant_values[element] = 0;\n      if(ival == 0)\n\t{\n\t  ep0++;\n\t}\n      else\n\t{\n\t  ep1--;\n\t  *ep0 = *ep1;\n\t  *ep1 = element;\n\t  element_to_cell_map[element] = new_cell;\n\t  in_pos[element] = ep1;\n\t  in_pos[*ep0] = ep0;\n\t}\n    }\n\n\n  /* Update new cell parameters */\n  new_cell->first = ep1 - elements;\n  new_cell->length = cell->length - (new_cell->first - cell->first);\n  new_cell->next = cell->next;\n  if(new_cell->next)\n    new_cell->next->prev = new_cell;\n  new_cell->prev = cell;\n  new_cell->split_level = cell->split_level;\n  /* Update old, splitted cell parameters */\n  cell->length = new_cell->first - cell->first;\n  cell->next = new_cell;\n  cell->split_level = refinement_stack.size()+1;\n  /* CR */\n  if(cr_enabled)\n    cr_create_at_level_trailed(new_cell->first, cr_get_level(cell->first));\n\n#endif /* ifdef NEW_SORT1*/\n\n  /* Add cell in refinement stack for backtracking */\n  {\n    RefInfo i;\n    i.split_cell_first = new_cell->first;\n    if(cell->prev_nonsingleton)\n      i.prev_nonsingleton_first = cell->prev_nonsingleton->first;\n    else\n      i.prev_nonsingleton_first = -1;\n    if(cell->next_nonsingleton)\n      i.next_nonsingleton_first = cell->next_nonsingleton->first;\n    else\n      i.next_nonsingleton_first = -1;\n    /* Modify nonsingleton cell list */\n    if(new_cell->length > 1)\n      {\n\tnew_cell->prev_nonsingleton = 
cell;\n\tnew_cell->next_nonsingleton = cell->next_nonsingleton;\n\tif(new_cell->next_nonsingleton)\n\t  new_cell->next_nonsingleton->prev_nonsingleton = new_cell;\n\tcell->next_nonsingleton = new_cell;\n      }\n    else\n      {\n\tnew_cell->next_nonsingleton = 0;\n\tnew_cell->prev_nonsingleton = 0;\n\tdiscrete_cell_count++;\n      }\n    if(cell->is_unit())\n      {\n\tif(cell->prev_nonsingleton)\n\t  cell->prev_nonsingleton->next_nonsingleton = cell->next_nonsingleton;\n\telse\n\t  first_nonsingleton_cell = cell->next_nonsingleton;\n\tif(cell->next_nonsingleton)\n\t  cell->next_nonsingleton->prev_nonsingleton = cell->prev_nonsingleton;\n\tcell->next_nonsingleton = 0;\n\tcell->prev_nonsingleton = 0;\n\tdiscrete_cell_count++;\n      }\n    refinement_stack.push(i);\n  }\n\n\n  /* Add cells in splitting queue */\n  if(cell->in_splitting_queue) {\n    /* Both cells must be included in splitting_queue in order to have\n       refinement to equitable partition */\n    splitting_queue_add(new_cell);\n  } else {\n    Cell *min_cell, *max_cell;\n    if(cell->length <= new_cell->length) {\n      min_cell = cell;\n      max_cell = new_cell;\n    } else {\n      min_cell = new_cell;\n      max_cell = cell;\n    }\n    /* Put the smaller cell in splitting_queue */\n    splitting_queue_add(min_cell);\n    if(max_cell->is_unit()) {\n      /* Put the \"larger\" cell also in splitting_queue */\n      splitting_queue_add(max_cell);\n    }\n  }\n\n\n  return new_cell;\n}\n\n// Sort the elements in a cell according to their invariant values.\n// The invariant values are not cleared.\n// Warning: the in_pos array is left in incorrect state.\nbool shellsort_cell(Cell* const cell) {\n  unsigned int h;\n  unsigned int* ep;\n  if(cell->is_unit())\n    return false;\n\n  /* Check whether all the elements have the same invariant value */\n  bool equal_invariant_values = true;\n  {\n    ep = elements + cell->first;\n    const unsigned int ival = invariant_values[*ep];\n    ep++;\n    
for(unsigned int i = cell->length - 1; i > 0; i--)\n      {\n\tif(invariant_values[*ep] != ival) {\n\t  equal_invariant_values = false;\n\t  break;\n\t}\n\tep++;\n      }\n  }\n  if(equal_invariant_values)\n    return false;\n\n  ep = elements + cell->first;\n\n  for(h = 1; h <= cell->length/9; h = 3*h + 1)\n    ;\n  for( ; h > 0; h = h/3) {\n    for(unsigned int i = h; i < cell->length; i++) {\n      const unsigned int element = ep[i];\n      const unsigned int ival = invariant_values[element];\n      unsigned int j = i;\n      while(j >= h and invariant_values[ep[j-h]] > ival) {\n        ep[j] = ep[j-h];\n        j -= h;\n      }\n      ep[j] = element;\n    }\n  }\n  return true;\n}\n\n// Distribution count sorting of cells with invariant values less than 256.\nCell* sort_and_split_cell255(Cell* const cell, const unsigned int max_ival) {\n  if(cell->is_unit()) {\n      /* Reset invariant value */\n      invariant_values[elements[cell->first]] = 0;\n      return cell;\n    }\n  \n#ifdef BLISS_CONSISTENCY_CHECKS\n  for(unsigned int i = 0; i < 256; i++)\n    assert(dcs_count[i] == 0);\n#endif\n\n  /*\n   * Compute the distribution of invariant values to the count array\n   */\n  {\n    const unsigned int *ep = elements + cell->first;\n    const unsigned int ival = invariant_values[*ep];\n    dcs_count[ival]++;\n    ep++;\n#if defined(BLISS_CONSISTENCY_CHECKS)\n    bool equal_invariant_values = true;\n#endif\n    for(unsigned int i = cell->length - 1; i != 0; i--)\n      {\n\tconst unsigned int ival2 = invariant_values[*ep];\n\tdcs_count[ival2]++;\n#if defined(BLISS_CONSISTENCY_CHECKS)\n\tif(ival2 != ival) {\n\t  equal_invariant_values = false;\n\t}\n#endif\n\tep++;\n      }\n#if defined(BLISS_CONSISTENCY_CHECKS)\n    assert(!equal_invariant_values);\n    if(equal_invariant_values) {\n      assert(dcs_count[ival] == cell->length);\n      dcs_count[ival] = 0;\n      clear_ivs(cell);\n      return cell;\n    }\n#endif\n  }\n\n  /* Build start array */\n  
dcs_cumulate_count(max_ival);\n\n\n  /* Do the sorting */\n  for(unsigned int i = 0; i <= max_ival; i++)\n    {\n      unsigned int *ep = elements + cell->first + dcs_start[i];\n      for(unsigned int j = dcs_count[i]; j > 0; j--)\n\t{\n\t  while(true)\n\t    {\n\t      const unsigned int element = *ep;\n\t      const unsigned int ival = invariant_values[element];\n\t      if(ival == i)\n\t\tbreak;\n\t      *ep = elements[cell->first + dcs_start[ival]];\n\t      elements[cell->first + dcs_start[ival]] = element;\n\t      dcs_start[ival]++;\n\t      dcs_count[ival]--;\n\t    }\n\t  ep++;\n\t}\n      dcs_count[i] = 0;\n    }\n\n#if defined(BLISS_CONSISTENCY_CHECKS)\n  for(unsigned int i = 0; i < 256; i++)\n    assert(dcs_count[i] == 0);\n#endif\n\n  /* split cell */\n  Cell* const new_cell = split_cell(cell);\n  return new_cell;\n}\n\n  Cell* split_cell(Cell* const cell);\n\n  /*\n   * Some auxiliary stuff needed for distribution count sorting.\n   * To make the code thread-safe (modulo the requirement that each graph is\n   * only accessed in one thread at a time), the arrays are owned by\n   * the partition instance, not statically defined.\n   */\n  unsigned int dcs_count[256];\n  unsigned int dcs_start[256];\n  //void dcs_cumulate_count(const unsigned int max);\n// An auxiliary function for distribution count sorting.\n// Build start array so that\n// dcs_start[0] = 0 and dcs_start[i+1] = dcs_start[i] + dcs_count[i].\nvoid dcs_cumulate_count(const unsigned int max)  {\n  unsigned int* count_p = dcs_count;\n  unsigned int* start_p = dcs_start;\n  unsigned int sum = 0;\n  for(unsigned int i = max+1; i > 0; i--) {\n      *start_p = sum;\n      start_p++;\n      sum += *count_p;\n      count_p++;\n    }\n}\n};\n\ninline Partition::Cell* Partition::splitting_queue_pop() {\n  Cell* const cell = splitting_queue.pop_front();\n  cell->in_splitting_queue = false;\n  return cell;\n}\n\ninline bool Partition::splitting_queue_is_empty() const {\n  return 
splitting_queue.is_empty();\n}\n\ninline unsigned int Partition::cr_get_level(const unsigned int cell_index) const {\n  return(cr_cells[cell_index].level);\n}\n\n} // namespace bliss\n\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/search.h",
    "content": "\nvoid search(const bool canonical, Stats& stats) {\n  const unsigned int N        = get_nof_vertices();\n  unsigned int all_same_level = UINT_MAX;\n  p.graph                     = this;\n  /*\n   * Must be done!\n   */\n  remove_duplicate_edges();\n  /*\n   * Reset search statistics\n   */\n  stats.reset();\n  stats.nof_nodes      = 1;\n  stats.nof_leaf_nodes = 1;\n\n  /* Free old first path data structures */\n  if (first_path_labeling) {\n    free(first_path_labeling);\n    first_path_labeling = 0;\n  }\n  if (first_path_labeling_inv) {\n    free(first_path_labeling_inv);\n    first_path_labeling_inv = 0;\n  }\n  if (first_path_automorphism) {\n    free(first_path_automorphism);\n    first_path_automorphism = 0;\n  }\n\n  /* Free old best path data structures */\n  if (best_path_labeling) {\n    free(best_path_labeling);\n    best_path_labeling = 0;\n  }\n  if (best_path_labeling_inv) {\n    free(best_path_labeling_inv);\n    best_path_labeling_inv = 0;\n  }\n  if (best_path_automorphism) {\n    free(best_path_automorphism);\n    best_path_automorphism = 0;\n  }\n\n  if (N == 0) {\n    /* Nothing to do, return... */\n    return;\n  }\n\n  /* Initialize the partition ... */\n  p.init(N);\n  /* ... and the component recursion data structures in the partition */\n  if (opt_use_comprec)\n    p.cr_init();\n\n  neighbour_heap.init(N);\n\n  in_search = false;\n  /* Do not compute certificate when building the initial partition */\n  refine_compare_certificate = false;\n  /* The 'eqref_hash' hash value is not computed when building\n   * the initial partition as it is not used for anything at the moment.\n   * This saves some cycles. 
*/\n  compute_eqref_hash = false;\n\n  // Timer timer1;\n\n  make_initial_equitable_partition();\n\n  if (verbstr and verbose_level >= 2) {\n    fprintf(verbstr, \"Initial partition computed in %.2f seconds\\n\", 0.0);\n    //  timer1.get_duration());\n    fflush(verbstr);\n  }\n\n  /*\n   * Allocate space for the \"first path\" and \"best path\" labelings\n   */\n  if (first_path_labeling)\n    free(first_path_labeling);\n  first_path_labeling = (unsigned int*)calloc(N, sizeof(unsigned int));\n  if (!first_path_labeling)\n    _OUT_OF_MEMORY();\n  if (best_path_labeling)\n    free(best_path_labeling);\n  best_path_labeling = (unsigned int*)calloc(N, sizeof(unsigned int));\n  if (!best_path_labeling)\n    _OUT_OF_MEMORY();\n\n  /*\n   * Is the initial partition discrete?\n   */\n  if (p.is_discrete()) {\n    /* Make the best path labeling i.e. the canonical labeling */\n    update_labeling(best_path_labeling);\n    /* Update statistics */\n    stats.nof_leaf_nodes = 1;\n    return;\n  }\n\n  /*\n   * Allocate the inverses of the \"first path\" and \"best path\" labelings\n   */\n  if (first_path_labeling_inv)\n    free(first_path_labeling_inv);\n  first_path_labeling_inv = (unsigned int*)calloc(N, sizeof(unsigned int));\n  if (!first_path_labeling_inv)\n    _OUT_OF_MEMORY();\n  if (best_path_labeling_inv)\n    free(best_path_labeling_inv);\n  best_path_labeling_inv = (unsigned int*)calloc(N, sizeof(unsigned int));\n  if (!best_path_labeling_inv)\n    _OUT_OF_MEMORY();\n\n  /*\n   * Allocate space for the automorphisms\n   */\n  if (first_path_automorphism)\n    free(first_path_automorphism);\n  first_path_automorphism = (unsigned int*)malloc(N * sizeof(unsigned int));\n  if (!first_path_automorphism)\n    _OUT_OF_MEMORY();\n  if (best_path_automorphism)\n    free(best_path_automorphism);\n  best_path_automorphism = (unsigned int*)malloc(N * sizeof(unsigned int));\n  if (!best_path_automorphism)\n    _OUT_OF_MEMORY();\n\n  /*\n   * Initialize orbit information so 
that all vertices are in their own orbits\n   */\n  first_path_orbits.init(N);\n  best_path_orbits.init(N);\n\n  /*\n   * Initialize certificate memory\n   */\n  initialize_certificate();\n\n  std::vector<TreeNode> search_stack;\n  std::vector<PathInfo> first_path_info;\n  std::vector<PathInfo> best_path_info;\n\n  search_stack.clear();\n\n  /* Initialize \"long prune\" data structures */\n  if (opt_use_long_prune)\n    long_prune_init();\n\n  /*\n   * Initialize failure recording data structures\n   */\n  typedef std::set<unsigned int, std::less<unsigned int>> FailureRecordingSet;\n  std::vector<FailureRecordingSet> failure_recording_hashes;\n\n  /*\n   * Initialize component recursion data structures\n   */\n  cr_cep_stack.clear();\n  unsigned int cr_cep_index = 0;\n  {\n    /* Inset a sentinel \"component end point\" */\n    CR_CEP sentinel;\n    sentinel.creation_level      = 0;\n    sentinel.discrete_cell_limit = get_nof_vertices();\n    sentinel.next_cr_level       = 0;\n    sentinel.next_cep_index      = 0;\n    sentinel.first_checked       = false;\n    sentinel.best_checked        = false;\n    cr_cep_index                 = 0;\n    cr_cep_stack.push_back(sentinel);\n  }\n  cr_level = 0;\n  if (opt_use_comprec and nucr_find_first_component(cr_level) == true and\n      p.nof_discrete_cells() + cr_component_elements <\n          cr_cep_stack[cr_cep_index].discrete_cell_limit) {\n    cr_level = p.cr_split_level(0, cr_component);\n    CR_CEP cep;\n    cep.creation_level      = 0;\n    cep.discrete_cell_limit = p.nof_discrete_cells() + cr_component_elements;\n    cep.next_cr_level       = 0;\n    cep.next_cep_index      = cr_cep_index;\n    cep.first_checked       = false;\n    cep.best_checked        = false;\n    cr_cep_index            = cr_cep_stack.size();\n    cr_cep_stack.push_back(cep);\n  }\n\n  /*\n   * Build the root node of the search tree\n   */\n  {\n    TreeNode root;\n    Partition::Cell* split_cell = 
find_next_cell_to_be_splitted(p.first_cell);\n    root.split_cell_first       = split_cell->first;\n    root.split_element          = TreeNode::SPLIT_START;\n    root.partition_bt_point     = p.set_backtrack_point();\n    root.certificate_index      = 0;\n    root.fp_on                  = true;\n    root.fp_cert_equal          = true;\n    root.fp_extendable          = TreeNode::MAYBE;\n    root.in_best_path           = false;\n    root.cmp_to_best_path       = 0;\n    root.long_prune_begin       = 0;\n\n    root.failure_recording_ival = 0;\n\n    /* Save component recursion info for backtracking */\n    root.cr_level          = cr_level;\n    root.cr_cep_stack_size = cr_cep_stack.size();\n    root.cr_cep_index      = cr_cep_index;\n    search_stack.push_back(root);\n  }\n\n  /*\n   * Set status and global flags for search related procedures\n   */\n  in_search = true;\n  /* Do not compare certificates during refinement until the first path has been\n   * traversed to the leaf */\n  refine_compare_certificate = false;\n\n  /*\n   * The actual backtracking search\n   */\n  while (!search_stack.empty()) {\n    TreeNode& current_node           = search_stack.back();\n    const unsigned int current_level = (unsigned int)search_stack.size() - 1;\n\n    if (opt_use_comprec) {\n      CR_CEP& cep = cr_cep_stack[current_node.cr_cep_index];\n      if (cep.first_checked == true and\n          current_node.fp_extendable == TreeNode::MAYBE and\n          !search_stack[cep.creation_level].fp_on) {\n        current_node.fp_extendable = TreeNode::NO;\n      }\n    }\n\n    if (current_node.fp_on) {\n      if (current_node.split_element == TreeNode::SPLIT_END) {\n        search_stack.pop_back();\n        continue;\n      }\n    } else {\n      if (current_node.fp_extendable == TreeNode::YES) {\n        search_stack.pop_back();\n        continue;\n      }\n      if (current_node.split_element == TreeNode::SPLIT_END) {\n        if (opt_use_failure_recording) {\n          TreeNode& 
parent_node = search_stack[current_level - 1];\n          if (parent_node.fp_on)\n            failure_recording_hashes[current_level - 1].insert(\n                current_node.failure_recording_ival);\n        }\n        search_stack.pop_back();\n        continue;\n      }\n      if (current_node.fp_extendable == TreeNode::NO and\n          (!canonical or current_node.cmp_to_best_path < 0)) {\n        if (opt_use_failure_recording) {\n          TreeNode& parent_node = search_stack[current_level - 1];\n          if (parent_node.fp_on)\n            failure_recording_hashes[current_level - 1].insert(\n                current_node.failure_recording_ival);\n        }\n        search_stack.pop_back();\n        continue;\n      }\n    }\n\n    /* Restore partition ... */\n    p.goto_backtrack_point(current_node.partition_bt_point);\n    /* ... and re-remember backtracking point */\n    current_node.partition_bt_point = p.set_backtrack_point();\n\n    /* Restore current path certificate */\n    certificate_index                     = current_node.certificate_index;\n    refine_current_path_certificate_index = current_node.certificate_index;\n    certificate_current_path.resize(certificate_index);\n\n    /* Fetch split cell information */\n    Partition::Cell* const cell =\n        p.get_cell(p.elements[current_node.split_cell_first]);\n\n    /* Restore component recursion information */\n    cr_level = current_node.cr_level;\n    cr_cep_stack.resize(current_node.cr_cep_stack_size);\n    cr_cep_index = current_node.cr_cep_index;\n\n    /*\n     * Update long prune redundancy sets\n     */\n    if (opt_use_long_prune and current_level >= 1 and !current_node.fp_on) {\n      unsigned int begin = (current_node.long_prune_begin > long_prune_begin)\n                               ? 
current_node.long_prune_begin\n                               : long_prune_begin;\n      for (unsigned int i = begin; i < long_prune_end; i++) {\n        const std::vector<bool>& fixed = long_prune_get_fixed(i);\n#if defined(BLISS_CONSISTENCY_CHECKS)\n        for (unsigned int l = 0; l < search_stack.size() - 2; l++)\n          assert(fixed[search_stack[l].split_element]);\n#endif\n        if (fixed[search_stack[search_stack.size() - 1 - 1].split_element] ==\n            false) {\n          long_prune_swap(begin, i);\n          begin++;\n          current_node.long_prune_begin = begin;\n          continue;\n        }\n      }\n\n      if (current_node.split_element == TreeNode::SPLIT_START) {\n        current_node.needs_long_prune = true;\n      } else if (current_node.needs_long_prune) {\n        current_node.needs_long_prune = false;\n        unsigned int begin = (current_node.long_prune_begin > long_prune_begin)\n                                 ? current_node.long_prune_begin\n                                 : long_prune_begin;\n        for (unsigned int i = begin; i < long_prune_end; i++) {\n          const std::vector<bool>& fixed = long_prune_get_fixed(i);\n#if defined(BLISS_CONSISTENCY_CHECKS)\n          for (unsigned int l = 0; l < search_stack.size() - 2; l++)\n            assert(fixed[search_stack[l].split_element]);\n#endif\n          assert(fixed[search_stack[current_level - 1].split_element] == true);\n          if (fixed[search_stack[current_level - 1].split_element] == false) {\n            long_prune_swap(begin, i);\n            begin++;\n            current_node.long_prune_begin = begin;\n            continue;\n          }\n          const std::vector<bool>& mcrs = long_prune_get_mcrs(i);\n          unsigned int* ep              = p.elements + cell->first;\n          for (unsigned int j = cell->length; j > 0; j--, ep++) {\n            if (mcrs[*ep] == false)\n              current_node.long_prune_redundant.insert(*ep);\n          }\n        }\n   
   }\n    }\n\n    /*\n     * Find the next smallest, non-isomorphic element in the cell and\n     * store it in current_node.split_element\n     */\n    {\n      unsigned int next_split_element = UINT_MAX;\n      // unsigned int* next_split_element_pos = 0;\n      unsigned int* ep = p.elements + cell->first;\n      if (current_node.fp_on) {\n        /* Find the next larger splitting element that is\n         * a minimal orbit representative w.r.t. first_path_orbits */\n        for (unsigned int i = cell->length; i > 0; i--, ep++) {\n          if ((int)(*ep) > current_node.split_element and\n              *ep < next_split_element and\n              first_path_orbits.is_minimal_representative(*ep)) {\n            next_split_element = *ep;\n            // next_split_element_pos = ep;\n          }\n        }\n      } else if (current_node.in_best_path) {\n        /* Find the next larger splitting element that is\n         * a minimal orbit representative w.r.t. best_path_orbits */\n        for (unsigned int i = cell->length; i > 0; i--, ep++) {\n          if ((int)(*ep) > current_node.split_element and\n              *ep < next_split_element and\n              best_path_orbits.is_minimal_representative(*ep) and\n              (!opt_use_long_prune or\n               current_node.long_prune_redundant.find(*ep) ==\n                   current_node.long_prune_redundant.end())) {\n            next_split_element = *ep;\n            // next_split_element_pos = ep;\n          }\n        }\n      } else {\n        /* Find the next larger splitting element */\n        for (unsigned int i = cell->length; i > 0; i--, ep++) {\n          if ((int)(*ep) > current_node.split_element and\n              *ep < next_split_element and\n              (!opt_use_long_prune or\n               current_node.long_prune_redundant.find(*ep) ==\n                   current_node.long_prune_redundant.end())) {\n            next_split_element = *ep;\n            // next_split_element_pos = ep;\n         
 }\n        }\n      }\n      if (next_split_element == UINT_MAX) {\n        /* No more (unexplored children) in the cell */\n        current_node.split_element = TreeNode::SPLIT_END;\n        if (current_node.fp_on) {\n          /* Update group size */\n          const unsigned int index = first_path_orbits.orbit_size(\n              first_path_info[search_stack.size() - 1].splitting_element);\n          stats.group_size.multiply(index);\n          stats.group_size_approx *= (long double)index;\n          /*\n           * Update all_same_level\n           */\n          if (index == cell->length and all_same_level == current_level + 1)\n            all_same_level = current_level;\n          if (verbstr and verbose_level >= 2) {\n            fprintf(verbstr,\n                    \"Level %u: orbits=%u, index=%u/%u, all_same_level=%u\\n\",\n                    current_level, first_path_orbits.nof_orbits(), index,\n                    cell->length, all_same_level);\n            fflush(verbstr);\n          }\n        }\n        continue;\n      }\n\n      /* Split on smallest */\n      current_node.split_element = next_split_element;\n    }\n\n    const unsigned int child_level = current_level + 1;\n    /* Update some statistics */\n    stats.nof_nodes++;\n    if (search_stack.size() > stats.max_level)\n      stats.max_level = search_stack.size();\n\n    /* Set flags and indices for the refiner certificate builder */\n    refine_equal_to_first = current_node.fp_cert_equal;\n    refine_cmp_to_best    = current_node.cmp_to_best_path;\n    if (!first_path_info.empty()) {\n      if (refine_equal_to_first)\n        refine_first_path_subcertificate_end =\n            first_path_info[search_stack.size() - 1].certificate_index +\n            first_path_info[search_stack.size() - 1].subcertificate_length;\n      if (canonical) {\n        if (refine_cmp_to_best == 0)\n          refine_best_path_subcertificate_end =\n              best_path_info[search_stack.size() - 
1].certificate_index +\n              best_path_info[search_stack.size() - 1].subcertificate_length;\n      } else\n        refine_cmp_to_best = -1;\n    }\n\n    const bool was_fp_cert_equal = current_node.fp_cert_equal;\n\n    /* Individualize, i.e. split the cell in two, the latter new cell\n     * will be a unit one containing info.split_element */\n    Partition::Cell* const new_cell =\n        p.individualize(cell, current_node.split_element);\n\n    /*\n     * Refine the new partition to equitable\n     */\n    if (cell->is_unit())\n      refine_to_equitable(cell, new_cell);\n    else\n      refine_to_equitable(new_cell);\n\n    /* Update statistics */\n    if (p.is_discrete())\n      stats.nof_leaf_nodes++;\n\n    if (!first_path_info.empty()) {\n      /* We are no longer on the first path */\n      const unsigned int subcertificate_length =\n          certificate_current_path.size() - certificate_index;\n      if (refine_equal_to_first) {\n        /* Was equal to the first path so far */\n        PathInfo& first_pinfo = first_path_info[current_level];\n        assert(first_pinfo.certificate_index == certificate_index);\n        if (subcertificate_length != first_pinfo.subcertificate_length) {\n          refine_equal_to_first = false;\n          if (opt_use_failure_recording)\n            failure_recording_fp_deviation = subcertificate_length;\n        } else if (first_pinfo.eqref_hash.cmp(eqref_hash) != 0) {\n          refine_equal_to_first = false;\n          if (opt_use_failure_recording)\n            failure_recording_fp_deviation = eqref_hash.get_value();\n        }\n      }\n      if (canonical and (refine_cmp_to_best == 0)) {\n        /* Was equal to the best path so far */\n        PathInfo& bestp_info = best_path_info[current_level];\n        assert(bestp_info.certificate_index == certificate_index);\n        if (subcertificate_length < bestp_info.subcertificate_length) {\n          refine_cmp_to_best = -1;\n        } else if (subcertificate_length 
> bestp_info.subcertificate_length) {\n          refine_cmp_to_best = 1;\n        } else if (bestp_info.eqref_hash.cmp(eqref_hash) > 0) {\n          refine_cmp_to_best = -1;\n        } else if (bestp_info.eqref_hash.cmp(eqref_hash) < 0) {\n          refine_cmp_to_best = 1;\n        }\n      }\n\n      if (opt_use_failure_recording and was_fp_cert_equal and\n          !refine_equal_to_first) {\n        UintSeqHash k;\n        k.update(failure_recording_fp_deviation);\n        k.update(eqref_hash.get_value());\n        failure_recording_fp_deviation = k.get_value();\n\n        if (current_node.fp_on)\n          failure_recording_hashes[current_level].insert(\n              failure_recording_fp_deviation);\n        else {\n          for (unsigned int i = current_level; i > 0; i--) {\n            if (search_stack[i].fp_on)\n              break;\n            const FailureRecordingSet& s = failure_recording_hashes[i];\n            if (i == current_level and\n                s.find(failure_recording_fp_deviation) != s.end())\n              break;\n            if (s.find(0) != s.end())\n              break;\n            search_stack[i].fp_extendable = TreeNode::NO;\n          }\n        }\n      }\n\n      /* Check if no longer equal to the first path and,\n       * if canonical labeling is desired, also worse than the\n       * current best path */\n      if (refine_equal_to_first == false and\n          (!canonical or (refine_cmp_to_best < 0))) {\n        /* Yes, backtrack */\n        stats.nof_bad_nodes++;\n        if (current_node.fp_cert_equal == true and\n            current_level + 1 > all_same_level) {\n          assert(all_same_level >= 1);\n          for (unsigned int i = all_same_level; i < search_stack.size(); i++) {\n            search_stack[i].fp_extendable = TreeNode::NO;\n          }\n        }\n\n        continue;\n      }\n    }\n\n#if defined(BLISS_VERIFY_EQUITABLEDNESS)\n    /* The new partition should be equitable */\n    if (!is_equitable())\n      
fatal_error(\"consistency check failed - partition after refinement is \"\n                  \"not equitable\");\n#endif\n\n    /*\n     * Next level search tree node info\n     */\n    TreeNode child_node;\n\n    /* No more in the first path */\n    child_node.fp_on = false;\n    /* No more in the best path */\n    child_node.in_best_path = false;\n\n    child_node.fp_cert_equal = refine_equal_to_first;\n    if (current_node.fp_extendable == TreeNode::NO or\n        (current_node.fp_extendable == TreeNode::MAYBE and\n         child_node.fp_cert_equal == false))\n      child_node.fp_extendable = TreeNode::NO;\n    else\n      child_node.fp_extendable = TreeNode::MAYBE;\n    child_node.cmp_to_best_path = refine_cmp_to_best;\n\n    child_node.failure_recording_ival = 0;\n    child_node.cr_cep_stack_size      = current_node.cr_cep_stack_size;\n    child_node.cr_cep_index           = current_node.cr_cep_index;\n    child_node.cr_level               = current_node.cr_level;\n\n    certificate_index = certificate_current_path.size();\n\n    current_node.eqref_hash = eqref_hash;\n    current_node.subcertificate_length =\n        certificate_index - current_node.certificate_index;\n\n    /*\n     * The first encountered leaf node at the end of the \"first path\"?\n     */\n    if (p.is_discrete() and first_path_info.empty()) {\n      // fprintf(stdout, \"Level %u: FIRST\\n\", child_level); fflush(stdout);\n      stats.nof_canupdates++;\n      /*\n       * Update labelings and their inverses\n       */\n      update_labeling_and_its_inverse(first_path_labeling,\n                                      first_path_labeling_inv);\n      update_labeling_and_its_inverse(best_path_labeling,\n                                      best_path_labeling_inv);\n      /*\n       * Reset automorphism array\n       */\n      reset_permutation(first_path_automorphism);\n      reset_permutation(best_path_automorphism);\n      /*\n       * Reset orbit information\n       */\n      
first_path_orbits.reset();\n      best_path_orbits.reset();\n      /*\n       * Reset group size\n       */\n      stats.group_size.assign(1);\n      stats.group_size_approx = 1.0;\n      /*\n       * Reset all_same_level\n       */\n      all_same_level = child_level;\n      /*\n       * Mark the current path to be the first and best one and save it\n       */\n      const unsigned int base_size = search_stack.size();\n      best_path_info.clear();\n      // fprintf(stdout, \" New base is: \");\n      for (unsigned int i = 0; i < base_size; i++) {\n        search_stack[i].fp_on            = true;\n        search_stack[i].fp_cert_equal    = true;\n        search_stack[i].fp_extendable    = TreeNode::YES;\n        search_stack[i].in_best_path     = true;\n        search_stack[i].cmp_to_best_path = 0;\n        PathInfo path_info;\n        path_info.splitting_element     = search_stack[i].split_element;\n        path_info.certificate_index     = search_stack[i].certificate_index;\n        path_info.eqref_hash            = search_stack[i].eqref_hash;\n        path_info.subcertificate_length = search_stack[i].subcertificate_length;\n        first_path_info.push_back(path_info);\n        best_path_info.push_back(path_info);\n        // fprintf(stdout, \"%u \", search_stack[i].split_element);\n      }\n      // fprintf(stdout, \"\\n\"); fflush(stdout);\n      /* Copy certificates */\n      certificate_first_path = certificate_current_path;\n      certificate_best_path  = certificate_current_path;\n\n      /* From now on, compare certificates when refining */\n      refine_compare_certificate = true;\n\n      if (opt_use_failure_recording)\n        failure_recording_hashes.resize(base_size);\n\n      /*\n         for(unsigned int j = 0; j < search_stack.size(); j++)\n         fprintf(stderr, \"%u \", search_stack[j].split_element);\n         fprintf(stderr, \"\\n\");\n         p.print(stderr); fprintf(stderr, \"\\n\");\n         */\n\n      /*\n       * Backtrack to the 
previous level\n       */\n      continue;\n    }\n\n    if (p.is_discrete() and child_node.fp_cert_equal) {\n      /*\n       * A leaf node that is equal to the first one.\n       * An automorphism found: aut[i] = elements[first_path_labeling[i]]\n       */\n      goto handle_first_path_automorphism;\n    }\n\n    if (!p.is_discrete()) {\n      Partition::Cell* next_split_cell = 0;\n      /*\n       * An internal, non-leaf node\n       */\n      if (opt_use_comprec) {\n        assert(p.nof_discrete_cells() <=\n               cr_cep_stack[cr_cep_index].discrete_cell_limit);\n        assert(cr_level == child_node.cr_level);\n\n        if (p.nof_discrete_cells() ==\n            cr_cep_stack[cr_cep_index].discrete_cell_limit) {\n          /* We have reached the end of a component */\n          assert(cr_cep_index != 0);\n          CR_CEP& cep = cr_cep_stack[cr_cep_index];\n\n          /* First, compare with respect to the first path */\n          if (first_path_info.empty() or child_node.fp_cert_equal) {\n            if (cep.first_checked == false) {\n              /* First time, go to the next component */\n              cep.first_checked = true;\n            } else {\n              assert(!first_path_info.empty());\n              assert(cep.creation_level < search_stack.size());\n              TreeNode& old_info = search_stack[cep.creation_level];\n              /* If the component was found when on the first path,\n               * handle the found automorphism as the other\n               * first path automorphisms */\n              if (old_info.fp_on)\n                goto handle_first_path_automorphism;\n            }\n          }\n\n          if (canonical and !first_path_info.empty() and\n              child_node.cmp_to_best_path >= 0) {\n            if (cep.best_checked == false) {\n              /* First time, go to the next component */\n              cep.best_checked = true;\n            } else {\n              assert(cep.creation_level < 
search_stack.size());\n              TreeNode& old_info = search_stack[cep.creation_level];\n              if (child_node.cmp_to_best_path == 0) {\n                /* If the component was found when on the best path,\n                 * handle the found automorphism as the other\n                 * best path automorphisms */\n                if (old_info.in_best_path)\n                  goto handle_best_path_automorphism;\n                /* Otherwise, we do not remember the automorhism as\n                 * we didn't memorize the path that was invariant\n                 * equal to the best one and passed through the\n                 * component.\n                 * Thus we can only backtrack to the previous level */\n                child_node.cmp_to_best_path = -1;\n                if (!child_node.fp_cert_equal) {\n                  continue;\n                }\n              } else {\n                assert(child_node.cmp_to_best_path > 0);\n                if (old_info.in_best_path) {\n                  stats.nof_canupdates++;\n                  /*\n                   * Update canonical labeling and its inverse\n                   */\n                  for (unsigned int i = 0; i < N; i++) {\n                    if (p.get_cell(p.elements[i])->is_unit()) {\n                      best_path_labeling[p.elements[i]] = i;\n                      best_path_labeling_inv[i]         = p.elements[i];\n                    }\n                  }\n                  // update_labeling_and_its_inverse(best_path_labeling,\n                  // best_path_labeling_inv);\n                  /* Reset best path automorphism */\n                  reset_permutation(best_path_automorphism);\n                  /* Reset best path orbit structure */\n                  best_path_orbits.reset();\n                  /* Mark to be the best one and save prefix */\n                  unsigned int postfix_start = cep.creation_level;\n                  assert(postfix_start < 
best_path_info.size());\n                  while (p.get_cell(\n                              best_path_info[postfix_start].splitting_element)\n                             ->is_unit()) {\n                    postfix_start++;\n                    assert(postfix_start < best_path_info.size());\n                  }\n                  unsigned int postfix_start_cert =\n                      best_path_info[postfix_start].certificate_index;\n                  std::vector<PathInfo> best_path_temp = best_path_info;\n                  best_path_info.clear();\n                  for (unsigned int i = 0; i < search_stack.size(); i++) {\n                    TreeNode& ss_info = search_stack[i];\n                    PathInfo bp_info;\n                    ss_info.cmp_to_best_path  = 0;\n                    ss_info.in_best_path      = true;\n                    bp_info.splitting_element = ss_info.split_element;\n                    bp_info.certificate_index = ss_info.certificate_index;\n                    bp_info.subcertificate_length =\n                        ss_info.subcertificate_length;\n                    bp_info.eqref_hash = ss_info.eqref_hash;\n                    best_path_info.push_back(bp_info);\n                  }\n                  /* Copy the postfix of the previous best path */\n                  for (unsigned int i = postfix_start;\n                       i < best_path_temp.size(); i++) {\n                    best_path_info.push_back(best_path_temp[i]);\n                    best_path_info[best_path_info.size() - 1]\n                        .certificate_index =\n                        best_path_info[best_path_info.size() - 2]\n                            .certificate_index +\n                        best_path_info[best_path_info.size() - 2]\n                            .subcertificate_length;\n                  }\n                  std::vector<unsigned int> certificate_best_path_old =\n                      certificate_best_path;\n                  
certificate_best_path = certificate_current_path;\n                  for (unsigned int i = postfix_start_cert;\n                       i < certificate_best_path_old.size(); i++)\n                    certificate_best_path.push_back(\n                        certificate_best_path_old[i]);\n                  assert(certificate_best_path.size() ==\n                         best_path_info.back().certificate_index +\n                             best_path_info.back().subcertificate_length);\n                  /* Backtrack to the previous level */\n                  continue;\n                }\n              }\n            }\n          }\n\n          /* No backtracking performed, go to next componenet */\n          cr_level     = cep.next_cr_level;\n          cr_cep_index = cep.next_cep_index;\n        }\n\n        /* Check if the current component has been split into\n         * new non-uniformity subcomponents */\n        // if(nucr_find_first_component(cr_level) == true and\n        // p.nof_discrete_cells() + cr_component_elements <\n        // cr_cep_stack[cr_cep_index].discrete_cell_limit)\n        if (nucr_find_first_component(cr_level, cr_component,\n                                      cr_component_elements,\n                                      next_split_cell) == true and\n            p.nof_discrete_cells() + cr_component_elements <\n                cr_cep_stack[cr_cep_index].discrete_cell_limit) {\n          const unsigned int next_cr_level =\n              p.cr_split_level(cr_level, cr_component);\n          CR_CEP cep;\n          cep.creation_level = search_stack.size();\n          cep.discrete_cell_limit =\n              p.nof_discrete_cells() + cr_component_elements;\n          cep.next_cr_level  = cr_level;\n          cep.next_cep_index = cr_cep_index;\n          cep.first_checked  = false;\n          cep.best_checked   = false;\n          cr_cep_index       = cr_cep_stack.size();\n          cr_cep_stack.push_back(cep);\n          cr_level = 
next_cr_level;\n        }\n      }\n\n      /*\n       * Build the next node info\n       */\n      /* Find the next cell to be splitted */\n      if (!next_split_cell)\n        next_split_cell = find_next_cell_to_be_splitted(\n            p.get_cell(p.elements[current_node.split_cell_first]));\n      // Partition::Cell * const next_split_cell =\n      // find_next_cell_to_be_splitted(p.get_cell(p.elements[current_node.split_cell_first]));\n      child_node.split_cell_first   = next_split_cell->first;\n      child_node.split_element      = TreeNode::SPLIT_START;\n      child_node.certificate_index  = certificate_index;\n      child_node.partition_bt_point = p.set_backtrack_point();\n      child_node.long_prune_redundant.clear();\n      child_node.long_prune_begin = current_node.long_prune_begin;\n\n      /* Save component recursion info for backtracking */\n      child_node.cr_level          = cr_level;\n      child_node.cr_cep_stack_size = cr_cep_stack.size();\n      child_node.cr_cep_index      = cr_cep_index;\n\n      search_stack.push_back(child_node);\n      continue;\n    }\n\n    /*\n     * A leaf node not in the first path or equivalent to the first path\n     */\n\n    if (child_node.cmp_to_best_path > 0) {\n      /*\n       * A new, better representative found\n       */\n      // fprintf(stdout, \"Level %u: NEW BEST\\n\", child_level); fflush(stdout);\n      stats.nof_canupdates++;\n      /*\n       * Update canonical labeling and its inverse\n       */\n      update_labeling_and_its_inverse(best_path_labeling,\n                                      best_path_labeling_inv);\n      /* Reset best path automorphism */\n      reset_permutation(best_path_automorphism);\n      /* Reset best path orbit structure */\n      best_path_orbits.reset();\n      /*\n       * Mark the current path to be the best one and save it\n       */\n      const unsigned int base_size = search_stack.size();\n      assert(current_level + 1 == base_size);\n      
best_path_info.clear();\n      for (unsigned int i = 0; i < base_size; i++) {\n        search_stack[i].cmp_to_best_path = 0;\n        search_stack[i].in_best_path     = true;\n        PathInfo path_info;\n        path_info.splitting_element     = search_stack[i].split_element;\n        path_info.certificate_index     = search_stack[i].certificate_index;\n        path_info.subcertificate_length = search_stack[i].subcertificate_length;\n        path_info.eqref_hash            = search_stack[i].eqref_hash;\n        best_path_info.push_back(path_info);\n      }\n      certificate_best_path = certificate_current_path;\n      /*\n       * Backtrack to the previous level\n       */\n      continue;\n    }\n\n  handle_best_path_automorphism:\n    /*\n     *\n     * Best path automorphism handling\n     *\n     */\n    {\n\n      /*\n       * Equal to the previous best path\n       */\n      if (p.is_discrete()) {\n#if defined(BLISS_CONSISTENCY_CHECKS)\n        /* Verify that the automorphism is correctly built */\n        for (unsigned int i = 0; i < N; i++)\n          assert(best_path_automorphism[i] ==\n                 p.elements[best_path_labeling[i]]);\n#endif\n      } else {\n        /* An automorphism that was found before the partition was discrete.\n         * Set the image of all elements in non-disrete cells accordingly */\n        for (Partition::Cell* c = p.first_nonsingleton_cell; c;\n             c                  = c->next_nonsingleton) {\n          for (unsigned int i = c->first; i < c->first + c->length; i++)\n            if (p.get_cell(p.elements[best_path_labeling[p.elements[i]]])\n                    ->is_unit())\n              best_path_automorphism\n                  [p.elements[best_path_labeling[p.elements[i]]]] =\n                      p.elements[i];\n            else\n              best_path_automorphism[p.elements[i]] = p.elements[i];\n        }\n      }\n\n#if defined(BLISS_VERIFY_AUTOMORPHISMS)\n      /* Verify that it really is an 
automorphism */\n      if (!is_automorphism(best_path_automorphism))\n        fatal_error(\"Best path automorhism validation check failed\");\n#endif\n\n      unsigned int gca_level_with_first = 0;\n      for (unsigned int i = search_stack.size(); i > 0; i--) {\n        if ((int)first_path_info[gca_level_with_first].splitting_element !=\n            search_stack[gca_level_with_first].split_element)\n          break;\n        gca_level_with_first++;\n      }\n\n      unsigned int gca_level_with_best = 0;\n      for (unsigned int i = search_stack.size(); i > 0; i--) {\n        if ((int)best_path_info[gca_level_with_best].splitting_element !=\n            search_stack[gca_level_with_best].split_element)\n          break;\n        gca_level_with_best++;\n      }\n\n      if (opt_use_long_prune) {\n        /* Record automorphism */\n        long_prune_add_automorphism(best_path_automorphism);\n      }\n\n      /*\n       * Update orbit information\n       */\n      update_orbit_information(best_path_orbits, best_path_automorphism);\n\n      /*\n       * Update orbit information\n       */\n      const unsigned int nof_old_orbits = first_path_orbits.nof_orbits();\n      update_orbit_information(first_path_orbits, best_path_automorphism);\n      if (nof_old_orbits != first_path_orbits.nof_orbits()) {\n        /* Some orbits were merged */\n        /* Report automorphism */\n        if (report_hook)\n          (*report_hook)(report_user_param, get_nof_vertices(),\n                         best_path_automorphism);\n        /* Update statistics */\n        stats.nof_generators++;\n      }\n\n      /*\n       * Compute backjumping level\n       */\n      unsigned int backjumping_level = current_level + 1 - 1;\n      if (!first_path_orbits.is_minimal_representative(\n              search_stack[gca_level_with_first].split_element)) {\n        backjumping_level = gca_level_with_first;\n      } else {\n        assert(!best_path_orbits.is_minimal_representative(\n            
search_stack[gca_level_with_best].split_element));\n        backjumping_level = gca_level_with_best;\n      }\n      /* Backtrack */\n      search_stack.resize(backjumping_level + 1);\n      continue;\n    }\n    _INTERNAL_ERROR();\n\n  handle_first_path_automorphism:\n    /*\n     *\n     * A first-path automorphism: aut[i] = elements[first_path_labeling[i]]\n     *\n     */\n\n    if (p.is_discrete()) {\n#if defined(BLISS_CONSISTENCY_CHECKS)\n      /* Verify that the complete automorphism is correctly built */\n      for (unsigned int i = 0; i < N; i++)\n        assert(first_path_automorphism[i] ==\n               p.elements[first_path_labeling[i]]);\n#endif\n    } else {\n      /* An automorphism that was found before the partition was discrete.\n       * Set the image of all elements in non-disrete cells accordingly */\n      for (Partition::Cell* c = p.first_nonsingleton_cell; c;\n           c                  = c->next_nonsingleton) {\n        for (unsigned int i = c->first; i < c->first + c->length; i++)\n          if (p.get_cell(p.elements[first_path_labeling[p.elements[i]]])\n                  ->is_unit())\n            first_path_automorphism\n                [p.elements[first_path_labeling[p.elements[i]]]] =\n                    p.elements[i];\n          else\n            first_path_automorphism[p.elements[i]] = p.elements[i];\n      }\n    }\n\n#if defined(BLISS_VERIFY_AUTOMORPHISMS)\n    /* Verify that it really is an automorphism */\n    if (!is_automorphism(first_path_automorphism))\n      fatal_error(\"First path automorphism validation check failed\");\n#endif\n\n    if (opt_use_long_prune) {\n      long_prune_add_automorphism(first_path_automorphism);\n    }\n\n    /*\n     * Update orbit information\n     */\n    update_orbit_information(first_path_orbits, first_path_automorphism);\n\n    /*\n     * Compute backjumping level\n     */\n    for (unsigned int i = 0; i < search_stack.size(); i++) {\n      TreeNode& n = search_stack[i];\n      if 
(n.fp_on) {\n        ;\n      } else {\n        n.fp_extendable = TreeNode::YES;\n      }\n    }\n\n    /* Report automorphism by calling the user defined hook function */\n    if (report_hook)\n      (*report_hook)(report_user_param, get_nof_vertices(),\n                     first_path_automorphism);\n\n    /* Update statistics */\n    stats.nof_generators++;\n    continue;\n\n  } /* while(!search_stack.empty()) */\n\n  /* Free \"long prune\" technique memory */\n  if (opt_use_long_prune)\n    long_prune_deallocate();\n\n  /* Release component recursion data in partition */\n  if (opt_use_comprec)\n    p.cr_free();\n}\n"
  },
  {
    "path": "external/bliss/bliss/uintseqhash.hh",
    "content": "#ifndef BLISS_UINTSEQHASH_HH\n#define BLISS_UINTSEQHASH_HH\n\n#include <cstdio>\nnamespace bliss {\nstatic unsigned int rtab[256] = {\n\t0xAEAA35B8, 0x65632E16, 0x155EDBA9, 0x01349B39,\n\t0x8EB8BD97, 0x8E4C5367, 0x8EA78B35, 0x2B1B4072,\n\t0xC1163893, 0x269A8642, 0xC79D7F6D, 0x6A32DEA0,\n\t0xD4D2DA56, 0xD96D4F47, 0x47B5F48A, 0x2587C6BF,\n\t0x642B71D8, 0x5DBBAF58, 0x5C178169, 0xA16D9279,\n\t0x75CDA063, 0x291BC48B, 0x01AC2F47, 0x5416DF7C,\n\t0x45307514, 0xB3E1317B, 0xE1C7A8DE, 0x3ACDAC96,\n\t0x11B96831, 0x32DE22DD, 0x6A1DA93B, 0x58B62381,\n\t0x283810E2, 0xBC30E6A6, 0x8EE51705, 0xB06E8DFB,\n\t0x729AB12A, 0xA9634922, 0x1A6E8525, 0x49DD4E19,\n\t0xE5DB3D44, 0x8C5B3A02, 0xEBDE2864, 0xA9146D9F,\n\t0x736D2CB4, 0xF5229F42, 0x712BA846, 0x20631593,\n\t0x89C02603, 0xD5A5BF6A, 0x823F4E18, 0x5BE5DEFF,\n\t0x1C4EBBFA, 0x5FAB8490, 0x6E559B0C, 0x1FE528D6,\n\t0xB3198066, 0x4A965EB5, 0xFE8BB3D5, 0x4D2F6234,\n\t0x5F125AA4, 0xBCC640FA, 0x4F8BC191, 0xA447E537,\n\t0xAC474D3C, 0x703BFA2C, 0x617DC0E7, 0xF26299D7,\n\t0xC90FD835, 0x33B71C7B, 0x6D83E138, 0xCBB1BB14,\n\t0x029CF5FF, 0x7CBD093D, 0x4C9825EF, 0x845C4D6D,\n\t0x124349A5, 0x53942D21, 0x800E60DA, 0x2BA6EB7F,\n\t0xCEBF30D3, 0xEB18D449, 0xE281F724, 0x58B1CB09,\n\t0xD469A13D, 0x9C7495C3, 0xE53A7810, 0xA866C08E,\n\t0x832A038B, 0xDDDCA484, 0xD5FE0DDE, 0x0756002B,\n\t0x2FF51342, 0x60FEC9C8, 0x061A53E3, 0x47B1884E,\n\t0xDC17E461, 0xA17A6A37, 0x3158E7E2, 0xA40D873B,\n\t0x45AE2140, 0xC8F36149, 0x63A4EE2D, 0xD7107447,\n\t0x6F90994F, 0x5006770F, 0xC1F3CA9A, 0x91B317B2,\n\t0xF61B4406, 0xA8C9EE8F, 0xC6939B75, 0xB28BBC3B,\n\t0x36BF4AEF, 0x3B12118D, 0x4D536ECF, 0x9CF4B46B,\n\t0xE8AB1E03, 0x8225A360, 0x7AE4A130, 0xC4EE8B50,\n\t0x50651797, 0x5BB4C59F, 0xD120EE47, 0x24F3A386,\n\t0xBE579B45, 0x3A378EFC, 0xC5AB007B, 0x3668942B,\n\t0x2DBDCC3A, 0x6F37F64C, 0xC24F862A, 0xB6F97FCF,\n\t0x9E4FA23D, 0x551AE769, 0x46A8A5A6, 0xDC1BCFDD,\n\t0x8F684CF9, 0x501D811B, 0x84279F80, 0x2614E0AC,\n\t0x86445276, 0xAEA0CE71, 0x0812250F, 
0xB586D18A,\n\t0xC68D721B, 0x44514E1D, 0x37CDB99A, 0x24731F89,\n\t0xFA72E589, 0x81E6EBA2, 0x15452965, 0x55523D9D,\n\t0x2DC47E14, 0x2E7FA107, 0xA7790F23, 0x40EBFDBB,\n\t0x77E7906B, 0x6C1DB960, 0x1A8B9898, 0x65FA0D90,\n\t0xED28B4D8, 0x34C3ED75, 0x768FD2EC, 0xFAB60BCB,\n\t0x962C75F4, 0x304F0498, 0x0A41A36B, 0xF7DE2A4A,\n\t0xF4770FE2, 0x73C93BBB, 0xD21C82C5, 0x6C387447,\n\t0x8CDB4CB9, 0x2CC243E8, 0x41859E3D, 0xB667B9CB,\n\t0x89681E8A, 0x61A0526C, 0x883EDDDC, 0x539DE9A4,\n\t0xC29E1DEC, 0x97C71EC5, 0x4A560A66, 0xBD7ECACF,\n\t0x576AE998, 0x31CE5616, 0x97172A6C, 0x83D047C4,\n\t0x274EA9A8, 0xEB31A9DA, 0x327209B5, 0x14D1F2CB,\n\t0x00FE1D96, 0x817DBE08, 0xD3E55AED, 0xF2D30AFC,\n\t0xFB072660, 0x866687D6, 0x92552EB9, 0xEA8219CD,\n\t0xF7927269, 0xF1948483, 0x694C1DF5, 0xB7D8B7BF,\n\t0xFFBC5D2F, 0x2E88B849, 0x883FD32B, 0xA0331192,\n\t0x8CB244DF, 0x41FAF895, 0x16902220, 0x97FB512A,\n\t0x2BEA3CC4, 0xAF9CAE61, 0x41ACD0D5, 0xFD2F28FF,\n\t0xE780ADFA, 0xB3A3A76E, 0x7112AD87, 0x7C3D6058,\n\t0x69E64FFF, 0xE5F8617C, 0x8580727C, 0x41F54F04,\n\t0xD72BE498, 0x653D1795, 0x1275A327, 0x14B499D4,\n\t0x4E34D553, 0x4687AA39, 0x68B64292, 0x5C18ABC3,\n\t0x41EABFCC, 0x92A85616, 0x82684CF8, 0x5B9F8A4E,\n\t0x35382FFE, 0xFB936318, 0x52C08E15, 0x80918B2E,\n\t0x199EDEE0, 0xA9470163, 0xEC44ACDD, 0x612D6735,\n\t0x8F88EA7D, 0x759F5EA4, 0xE5CC7240, 0x68CFEB8B,\n\t0x04725601, 0x0C22C23E, 0x5BC97174, 0x89965841,\n\t0x5D939479, 0x690F338A, 0x3C2D4380, 0xDAE97F2B\n};\n\n// A hash for sequences of unsigned ints.\nclass UintSeqHash {\nprotected:\n\tunsigned int h;\npublic:\n\tUintSeqHash() {h = 0; }\n\tUintSeqHash(const UintSeqHash &other) {h = other.h; }\n\tUintSeqHash& operator=(const UintSeqHash &other) {h = other.h; return *this; }\n\t/** Reset the hash value. */\n\tvoid reset() {h = 0; }\n\t/** Add the unsigned int \\a n to the sequence. 
*/\n\tvoid update(unsigned int i) {\n\t\ti++;\n\t\twhile(i > 0) {\n\t\t\th ^= rtab[i & 0xff];\n\t\t\tconst unsigned int b = (h & 0x80000000) >> 31;\n\t\t\ti = i >> 8;\n\t\t\th = (h << 1) | b;\n\t\t}\n\t}\n\t/** Get the hash value of the sequence seen so far. */\n\tunsigned int get_value() const {return h; }\n\t/** Compare the hash values of this and \\a other.\n\t * Return -1/0/1 if the value of this is smaller/equal/greater than\n\t * that of \\a other. */\n\tint cmp(const UintSeqHash &other) const {\n\t\treturn (h < other.h)?-1:((h == other.h)?0:1);\n\t}\n\t/** An abbreviation for cmp(other) < 0 */\n\tbool is_lt(const UintSeqHash &other) const {return(cmp(other) < 0); }\n\t/** An abbreviation for cmp(other) <= 0 */\n\tbool is_le(const UintSeqHash &other) const {return(cmp(other) <= 0); }\n\t/** An abbreviation for cmp(other) == 0 */\n\tbool is_equal(const UintSeqHash &other) const {return(cmp(other) == 0); }\n};\n} // namespace bliss\n#endif\n"
  },
  {
    "path": "external/bliss/bliss/utils.hh",
    "content": "#ifndef BLISS_UTILS_HH\n#define BLISS_UTILS_HH\n\n/*\n  Copyright (c) 2003-2015 Tommi Junttila\n  Released under the GNU Lesser General Public License version 3.\n  \n  This file is part of bliss.\n  \n  bliss is free software: you can redistribute it and/or modify\n  it under the terms of the GNU Lesser General Public License as published by\n  the Free Software Foundation, version 3 of the License.\n\n  bliss is distributed in the hope that it will be useful,\n  but WITHOUT ANY WARRANTY; without even the implied warranty of\n  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n  GNU Lesser General Public License for more details.\n\n  You should have received a copy of the GNU Lesser General Public License\n  along with bliss.  If not, see <http://www.gnu.org/licenses/>.\n*/\n\n/**\n * \\file\n * \\brief Some small utilities.\n *\n */\n\n#include <cstdio>\n\nnamespace bliss {\n\n/**\n * Print the permutation \\a perm of {0,...,N-1} in the cycle format\n * in the file stream \\a fp.\n * The amount \\a offset is added to each element before printing,\n * e.g. the permutation (2 4) is printed as (3 5) when \\a offset is 1.\n */\nvoid print_permutation(FILE* fp,\n\t\t       const unsigned int N,\n\t\t       const unsigned int* perm,\n\t\t       const unsigned int offset = 0);\n\n/**\n * Print the permutation \\a perm of {0,...,N-1} in the cycle format\n * in the file stream \\a fp.\n * The amount \\a offset is added to each element before printing,\n * e.g. 
the permutation (2 4) is printed as (3 5) when \\a offset is 1.\n */\nvoid print_permutation(FILE* fp,\n\t\t       const std::vector<unsigned int>& perm,\n\t\t       const unsigned int offset = 0);\n\n/**\n * Check whether \\a perm is a valid permutation on {0,...,N-1}.\n * Slow, mainly for debugging and validation purposes.\n */\nbool is_permutation(const unsigned int N, const unsigned int* perm);\n\n/**\n * Check whether \\a perm is a valid permutation on {0,...,N-1}.\n * Slow, mainly for debugging and validation purposes.\n */\nbool is_permutation(const std::vector<unsigned int>& perm);\n\n} // namespace bliss\n\n#endif\n"
  },
  {
    "path": "inputs/CMakeLists.txt",
    "content": "file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/download.cmake \"file(DOWNLOAD http://iss.oden.utexas.edu/projects/galois/downloads/small_inputs_for_lonestar_test.tar.gz ${CMAKE_CURRENT_BINARY_DIR}/lonestar-cpu-inputs.tar.gz SHOW_PROGRESS)\")\n\nadd_custom_command(\nOUTPUT lonestar-cpu-inputs.tar.gz\nCOMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/download.cmake\n)\n\nadd_custom_command(\nOUTPUT small_inputs\nCOMMAND ${CMAKE_COMMAND} -E tar xJf lonestar-cpu-inputs.tar.gz\nDEPENDS lonestar-cpu-inputs.tar.gz\nWORKING_DIRECTORY ${PROJECT_BINARY_DIR}/inputs\nCOMMENT \"Unpacking lonestar-cpu-inputs.tar.gz\"\nVERBATIM\n)\n\nadd_custom_target(input DEPENDS small_inputs)\n"
  },
  {
    "path": "inputs/cholesky/matrix1.txt",
    "content": "4\t0\t0\t2\t0\t1\n0\t7\t2\t0\t0\t3\n0\t2\t3\t0\t0\t0\n2\t0\t0\t6\t3\t0\n0\t0\t0\t3\t7\t3\n1\t3\t0\t0\t3\t11\n"
  },
  {
    "path": "inputs/cholesky/matrix1.txt.choleskyedges",
    "content": "0 0 2.000000\n0 3 1.000000\n0 5 0.500000\n1 1 2.645751\n1 2 0.755929\n1 5 1.133893\n2 2 1.558387\n2 5 -0.550019\n3 3 2.236068\n3 4 1.341641\n3 5 -0.223607\n4 4 2.280351\n4 5 1.447146\n5 5 2.649063\n"
  },
  {
    "path": "inputs/cholesky/matrix1.txt.dep",
    "content": "0 1 2 3 4 5\n"
  },
  {
    "path": "inputs/cholesky/matrix1.txt.filled",
    "content": "0 0 4.000000\n0 3 2.000000\n0 5 1.000000\n1 1 7.000000\n1 2 2.000000\n1 5 3.000000\n2 2 3.000000\n2 5 0.000000\n3 3 6.000000\n3 4 3.000000\n3 5 0.000000\n4 4 7.000000\n4 5 3.000000\n5 5 11.000000\n"
  },
  {
    "path": "inputs/cholesky/very-sparse.txt",
    "content": "576.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n0.0\t256.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t256.0\n0.0\t0.0\t1369.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n0.0\t0.0\t0.0\t144.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n0.0\t0.0\t0.0\t0.0\t144.0\t0.0\t0.0\t0.0\t0.0\t0.0\n0.0\t0.0\t0.0\t0.0\t0.0\t2304.0\t0.0\t1200.0\t0.0\t0.0\n0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t324.0\t0.0\t0.0\t0.0\n0.0\t0.0\t0.0\t0.0\t0.0\t1200.0\t0.0\t641.0\t0.0\t0.0\n0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t9.0\t0.0\n0.0\t256.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t1040.0\n"
  },
  {
    "path": "inputs/cholesky/very-sparse.txt.choleskyedges",
    "content": "0 0 24.000000\n1 1 16.000000\n1 9 16.000000\n2 2 37.000000\n3 3 12.000000\n4 4 12.000000\n5 5 48.000000\n5 7 25.000000\n6 6 18.000000\n7 7 4.000000\n8 8 3.000000\n9 9 28.000000\n"
  },
  {
    "path": "inputs/cholesky/very-sparse.txt.dep",
    "content": "0 1 2 3 4 5 6 7 8 9\n"
  },
  {
    "path": "inputs/cholesky/very-sparse.txt.filled",
    "content": "0 0 576.000000\n1 1 256.000000\n1 9 256.000000\n2 2 1369.000000\n3 3 144.000000\n4 4 144.000000\n5 5 2304.000000\n5 7 1200.000000\n6 6 324.000000\n7 7 641.000000\n8 8 9.000000\n9 9 1040.000000\n"
  },
  {
    "path": "libcusp/CMakeLists.txt",
    "content": "add_library(galois_cusp INTERFACE)\nadd_library(Galois::cusp ALIAS galois_cusp)\nset_target_properties(galois_cusp PROPERTIES EXPORT_NAME cusp)\nadd_dependencies(lib galois_cusp)\n\ntarget_include_directories(galois_cusp INTERFACE\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  $<INSTALL_INTERFACE:include>\n)\n\ntarget_link_libraries(galois_cusp INTERFACE galois_dist_async)\n\ninstall(\n  DIRECTORY include/\n  DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n  COMPONENT dev\n  FILES_MATCHING PATTERN \"*.h\"\n)\n\ninstall(TARGETS galois_cusp\n  EXPORT GaloisTargets\n  LIBRARY\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT shlib\n  ARCHIVE\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT lib\n  INCLUDES DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n)\n"
  },
  {
    "path": "libcusp/include/galois/graphs/BasePolicies.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file BasePolicies.h\n *\n * Header file that includes the base classes for defining CuSP partitioning\n * policies.\n */\n\n#ifndef _GALOIS_CUSP_PSCAFFOLD_H_\n#define _GALOIS_CUSP_PSCAFFOLD_H_\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Default fields and functions all CuSP partitioners use; this is a class to\n * inherit from.\n */\nclass PartitioningScaffold {\nprotected:\n  uint32_t _hostID;   //!< host ID of owner of this object\n  uint32_t _numHosts; //!< total number of hosts\n  uint64_t _numNodes; //!< number of nodes in graph\n  uint64_t _numEdges; //!< number of edges in graph\n  //! 
maps from host id to nodes that host as read from disk\n  std::vector<std::pair<uint64_t, uint64_t>> _gid2host;\n\npublic:\n  /**\n   * Constructor for Scaffold.\n   *\n   * @param hostID Host ID of caller\n   * @param numHosts Total num hosts in execution\n   * @param numNodes Total number of nodes in graph\n   * @param numEdges Total number of edges in graph\n   */\n  PartitioningScaffold(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n                       uint64_t numEdges)\n      : _hostID(hostID), _numHosts(numHosts), _numNodes(numNodes),\n        _numEdges(numEdges) {}\n\n  /**\n   * Save a provided map from host to nodes a host has read into this object\n   *\n   * @param gid2host Map of hosts to read nodes to save\n   */\n  void saveGIDToHost(std::vector<std::pair<uint64_t, uint64_t>>& gid2host) {\n    _gid2host = gid2host;\n  }\n};\n\n/**\n * Policies that use the read assignment of nodes as the masters. Does not\n * need to go through a master assignment phase, saving overhead.\n */\nclass ReadMasterAssignment : public PartitioningScaffold {\npublic:\n  /**\n   * Constructor simply calls parent constructor.\n   */\n  ReadMasterAssignment(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n                       uint64_t numEdges)\n      : PartitioningScaffold(hostID, numHosts, numNodes, numEdges) {}\n\n  /**\n   * Returns the host ID of the host that read a particular node and its edges\n   * from disk.\n   *\n   * @param gid GID of node to get master of\n   * @returns Host ID of host that read the node specified by the GID.\n   */\n  uint32_t retrieveMaster(uint32_t gid) const {\n    for (auto h = 0U; h < _numHosts; ++h) {\n      uint64_t start, end;\n      std::tie(start, end) = _gid2host[h];\n      if (gid >= start && gid < end) {\n        return h;\n      }\n    }\n    assert(false);\n    return _numHosts;\n  }\n\n  // below all unused if not assigning masters in default manner, but must be\n  // defined or compiler complains\n\n  /**\n  
 * Returns false as this partitioning policy doesn't have a master assignment\n   * phase.\n   */\n  bool masterAssignPhase() const { return false; }\n  /**\n   * Does nothing as this policy doesn't have a master assignment phase\n   */\n  void enterStage2() {}\n\n  /**\n   * Does nothing because this policy doesn't have a master assignment phase.\n   * (uses read assignment)\n   */\n  template <typename EdgeTy>\n  uint32_t getMaster(uint32_t, galois::graphs::BufferedGraph<EdgeTy>&,\n                     const std::vector<uint32_t>&,\n                     std::unordered_map<uint64_t, uint32_t>&,\n                     const std::vector<uint64_t>&,\n                     std::vector<galois::CopyableAtomic<uint64_t>>&,\n                     const std::vector<uint64_t>&,\n                     std::vector<galois::CopyableAtomic<uint64_t>>&) {\n    return 0;\n  }\n\n  /**\n   * No-op because no master assignment phase.\n   */\n  void saveGID2HostInfo(std::unordered_map<uint64_t, uint32_t>&,\n                        std::vector<uint32_t>&, uint64_t) {}\n  /**\n   * Technically doesn't nothing and should never be called because no master\n   * assignment phase.\n   */\n  bool addMasterMapping(uint32_t, uint32_t) { return false; }\n};\n\n/**\n * Policies that use a custom assignment of masters (from the user).\n * Needs to go through  a master assignment phase, which adds overhead\n * to partitioning, but may get better quality partitions.\n */\nclass CustomMasterAssignment : public PartitioningScaffold {\nprotected:\n  char _status; //!< Specifies what phase of master assignment partitioner is on\n  //! Metadata for determining where a node's master is\n  std::vector<uint32_t> _localNodeToMaster;\n  //! Map GID to its master\n  std::unordered_map<uint64_t, uint32_t> _gid2masters;\n  //! This host's node offset (each host reads a distinct contiguous portion\n  //! 
of graph\n  uint64_t _nodeOffset;\n\n  /**\n   * Return the reader of a particular node.\n   * @param gid GID of node to get reader of\n   * @return Host reader of node passed in as param\n   */\n  unsigned getHostReader(uint64_t gid) const {\n    for (auto i = 0U; i < _numHosts; ++i) {\n      uint64_t start, end;\n      std::tie(start, end) = _gid2host[i];\n      if (gid >= start && gid < end) {\n        return i;\n      }\n    }\n    return -1;\n  }\n\npublic:\n  //! Calls parent constructor to initialize common data\n  CustomMasterAssignment(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n                         uint64_t numEdges)\n      : PartitioningScaffold(hostID, numHosts, numNodes, numEdges), _status(0) {\n  }\n\n  /**\n   * Retrieves a saved master mapping: does not fail if a GID\n   * mapping is not found but instead returns -1 if in stage 1, else\n   * fails.\n   *\n   * @param gid GID to get master of\n   * @returns Master of specified GID, -1, unsigned, if not found\n   */\n  uint32_t retrieveMaster(uint32_t gid) const {\n    if (_status != 0) {\n      // use map if not a locally read node, else use vector\n      if (getHostReader(gid) != _hostID) {\n        auto gidMasterIter = _gid2masters.find(gid);\n        // found in map\n        if (gidMasterIter != _gid2masters.end()) {\n          uint32_t mappedMaster = gidMasterIter->second;\n          // galois::gDebug(\"[\", _hostID, \"] \", gid, \" found with master \",\n          //               mappedMaster, \"!\");\n          // make sure host is in bounds\n          assert(mappedMaster < _numHosts);\n          return mappedMaster;\n        } else {\n          // NOT FOUND (not necessarily a bad thing, and required for\n          // some cases)\n          galois::gDebug(\"[\", _hostID, \"] \", gid, \" not found!\");\n          if (_status == 2) {\n            // die if we expect all gids to be mapped already (stage 2)\n            GALOIS_DIE(\"should not fail to find a GID after stage 2 \"\n  
                     \"of master assignment phase\");\n          }\n          return (uint32_t)-1;\n        }\n      } else {\n        // determine offset\n        uint32_t offsetIntoMap = gid - _nodeOffset;\n        assert(offsetIntoMap != (uint32_t)-1);\n        assert(offsetIntoMap < _localNodeToMaster.size());\n        return _localNodeToMaster[offsetIntoMap];\n      }\n    } else {\n      // stage 0 = this function shouldn't be called\n      GALOIS_DIE(\"master setup incomplete\");\n      return (uint32_t)-1;\n    }\n  }\n\n  /**\n   * Given gid to master mapping info, save it into a local map.\n   *\n   * @param gid2offsets Map a GID to an offset into a vector containing master\n   * mapping information\n   * @param localNodeToMaster Vector that represents the master mapping of\n   * local nodes\n   * @param nodeOffset First GID of nodes read by this host\n   */\n  void saveGID2HostInfo(std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n                        std::vector<uint32_t>& localNodeToMaster,\n                        uint64_t nodeOffset) {\n#ifndef NDEBUG\n    size_t originalSize = _gid2masters.size();\n#endif\n\n    for (auto i = gid2offsets.begin(); i != gid2offsets.end(); i++) {\n      assert(i->second < localNodeToMaster.size());\n      galois::gDebug(\"Map \", i->first, \" to \", localNodeToMaster[i->second]);\n      _gid2masters[i->first] = localNodeToMaster[i->second];\n    }\n    assert(_gid2masters.size() == (originalSize + gid2offsets.size()));\n    // get memory back\n    gid2offsets.clear();\n\n    size_t myLocalNodes = _gid2host[_hostID].second - _gid2host[_hostID].first;\n    assert((myLocalNodes + _gid2masters.size() - originalSize) ==\n           localNodeToMaster.size());\n    // copy over to this structure\n    _localNodeToMaster = std::move(localNodeToMaster);\n    assert(myLocalNodes <= _localNodeToMaster.size());\n\n    // resize to fit only this host's read nodes\n    _localNodeToMaster.resize(myLocalNodes);\n    _nodeOffset 
= nodeOffset;\n\n    // stage 1 setup complete\n    _status = 1;\n  }\n\n  //! Returns true as policies that inherit from this should define master\n  //! assignment function\n  bool masterAssignPhase() const { return true; }\n  //! Shifts master assignment phase to stage 2.\n  void enterStage2() { _status = 2; }\n\n  /**\n   * CuSP's \"getMaster\" function.\n   * This function should be defined by user in child class to assign a node to\n   * a host.\n   *\n   * @todo Consolidate metadata into single struct to clean up function.\n   * @returns Host id in which to assing a node\n   */\n  template <typename EdgeTy>\n  uint32_t getMaster(uint32_t, galois::graphs::BufferedGraph<EdgeTy>&,\n                     const std::vector<uint32_t>&,\n                     std::unordered_map<uint64_t, uint32_t>&,\n                     const std::vector<uint64_t>&,\n                     std::vector<galois::CopyableAtomic<uint64_t>>&,\n                     const std::vector<uint64_t>&,\n                     std::vector<galois::CopyableAtomic<uint64_t>>&) {\n    return (uint32_t)-1;\n  }\n\n  /**\n   * Add a new master mapping to the local map: needs to be in stage 1\n   *\n   * @param gid GID to map; should not be a GID read by this host (won't\n   * cause problems, but would just be a waste of compute resouces)\n   * @param mappedMaster master to map a GID to\n   * @returns true if new mapping added; false if already existed in map\n   */\n  bool addMasterMapping(uint32_t gid, uint32_t mappedMaster) {\n    assert(mappedMaster < _numHosts);\n    if (_status <= 1) {\n      auto offsetIntoMapIter = _gid2masters.find(gid);\n      if (offsetIntoMapIter == _gid2masters.end()) {\n        // NOT FOUND\n        galois::gDebug(\"[\", _hostID, \"] \", gid, \" not found; mapping!\");\n        _gid2masters[gid] = mappedMaster;\n        return true;\n      } else {\n        // already mapped\n        galois::gDebug(\"[\", _hostID, \"] \", gid, \" already mapped with master \",\n                  
     offsetIntoMapIter->second, \"!\");\n        assert(offsetIntoMapIter->second == mappedMaster);\n        return false;\n      }\n    } else {\n      GALOIS_DIE(\"unexpected status in add master mapping: \", _status);\n      return false;\n    }\n  }\n};\n\n} // end namespace graphs\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libcusp/include/galois/graphs/CuSPPartitioner.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file CuSPPartitioner.h\n *\n * Contains the main CuSP partitioning function.\n */\n\n#ifndef _GALOIS_CUSP_\n#define _GALOIS_CUSP_\n\n#include \"galois/DistGalois.h\"\n#include \"galois/graphs/DistributedGraph.h\"\n#include \"galois/graphs/NewGeneric.h\"\n#include \"galois/graphs/GenericPartitioners.h\"\n\nnamespace galois {\n//! Enum for the input/output format of the partitioner.\nenum CUSP_GRAPH_TYPE {\n  CUSP_CSR, //!< Compressed sparse row graph format, i.e. outgoing edges\n  CUSP_CSC  //!< Compressed sparse column graph format, i.e. 
incoming edges\n};\n\ntemplate <typename NodeData, typename EdgeData>\nusing DistGraphPtr =\n    std::unique_ptr<galois::graphs::DistGraph<NodeData, EdgeData>>;\n\n/**\n * Main CuSP function: partitions a graph on disk, one partition per host.\n *\n * @param graphFile Graph file to read in the Galois binary CSR format\n * @param inputType Specifies which input format (CSR or CSC) should be given\n * to the partitioner\n * @param outputType Specifies the output format (CSR or CSC) that each\n * partition will be created in\n * @param symmetricGraph This should be \"true\" if the passed in graphFile\n * is a symmetric graph\n * @param transposeGraphFile Transpose graph of graphFile in Galois binary\n * CSC format (i.e. give it the transpose version of graphFile). Ignore\n * this argument if the graph is symmetric.\n * @param masterBlockFile\n * @param cuspAsync Toggles asynchronous master assignment phase during\n * partitioning\n * @param cuspStateRounds Toggles number of rounds used to synchronize\n * partitioning state during master assignment phase\n * @param readPolicy Determines how each host should divide the reading\n * load of the graph on disk\n * @param nodeWeight When using a read policy that involves nodes and edges,\n * this argument assigns a weight to give each node.\n * @param edgeWeight When using a read policy that involves nodes and edges,\n * this argument assigns a weight to give each edge.\n *\n * @tparam PartitionPolicy Partitioning policy object that specifies the\n * placement of nodes/edges during partitioning.\n * @tparam NodeData Data structure to be created for each node in the graph\n * @tparam EdgeData Type of data to be stored on each edge. Currently\n * only guarantee support for void or uint32_t; all other types may cause\n * undefined behavior.\n *\n * @returns A local partition of the passed in graph as a DistributedGraph\n *\n * @todo Look into making void node data work in LargeArray for D-Galois;\n * void specialization. 
For now, use char as default type\n */\ntemplate <typename PartitionPolicy, typename NodeData = char,\n          typename EdgeData = void>\nDistGraphPtr<NodeData, EdgeData>\ncuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType,\n                   CUSP_GRAPH_TYPE outputType, bool symmetricGraph = false,\n                   std::string transposeGraphFile = \"\",\n                   std::string masterBlockFile = \"\", bool cuspAsync = true,\n                   uint32_t cuspStateRounds = 100,\n                   galois::graphs::MASTERS_DISTRIBUTION readPolicy =\n                       galois::graphs::BALANCED_EDGES_OF_MASTERS,\n                   uint32_t nodeWeight = 0, uint32_t edgeWeight = 0) {\n  auto& net = galois::runtime::getSystemNetworkInterface();\n  using DistGraphConstructor =\n      galois::graphs::NewDistGraphGeneric<NodeData, EdgeData, PartitionPolicy>;\n\n  // TODO @todo bring back graph saving/reading functionality?\n\n  if (!symmetricGraph) {\n    // out edges or in edges\n    std::string inputToUse;\n    // depending on output type may need to transpose edges\n    bool useTranspose;\n\n    // see what input is specified\n    if (inputType == CUSP_CSR) {\n      inputToUse = graphFile;\n      if (outputType == CUSP_CSR) {\n        useTranspose = false;\n      } else if (outputType == CUSP_CSC) {\n        useTranspose = true;\n      } else {\n        GALOIS_DIE(\"CuSP output graph type is invalid\");\n      }\n    } else if (inputType == CUSP_CSC) {\n      inputToUse = transposeGraphFile;\n      if (outputType == CUSP_CSR) {\n        useTranspose = true;\n      } else if (outputType == CUSP_CSC) {\n        useTranspose = false;\n      } else {\n        GALOIS_DIE(\"CuSP output graph type is invalid\");\n      }\n    } else {\n      GALOIS_DIE(\"Invalid input graph type specified in CuSP partitioner\");\n    }\n\n    return std::make_unique<DistGraphConstructor>(\n        inputToUse, net.ID, net.Num, cuspAsync, cuspStateRounds, 
useTranspose,\n        readPolicy, nodeWeight, edgeWeight, masterBlockFile);\n  } else {\n    // symmetric graph path: assume the passed in graphFile is a symmetric\n    // graph; output is also symmetric\n    return std::make_unique<DistGraphConstructor>(\n        graphFile, net.ID, net.Num, cuspAsync, cuspStateRounds, false,\n        readPolicy, nodeWeight, edgeWeight, masterBlockFile);\n  }\n}\n} // end namespace galois\n#endif\n"
  },
  {
    "path": "libcusp/include/galois/graphs/DistributedGraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file DistributedGraph.h\n *\n * Contains the implementation for DistGraph. Command line argument definitions\n * are found in DistributedGraph.cpp.\n */\n\n#ifndef _GALOIS_DIST_HGRAPH_H_\n#define _GALOIS_DIST_HGRAPH_H_\n\n#include <unordered_map>\n#include <fstream>\n\n#include \"galois/graphs/LC_CSR_Graph.h\"\n#include \"galois/graphs/BufferedGraph.h\"\n#include \"galois/runtime/DistStats.h\"\n#include \"galois/graphs/OfflineGraph.h\"\n#include \"galois/DynamicBitset.h\"\n\n/*\n * Headers for boost serialization\n */\n\nnamespace galois {\nnamespace graphs {\n/**\n * Enums specifying how masters are to be distributed among hosts.\n */\nenum MASTERS_DISTRIBUTION {\n  //! balance nodes\n  BALANCED_MASTERS,\n  //! balance edges\n  BALANCED_EDGES_OF_MASTERS,\n  //! 
balance nodes and edges\n  BALANCED_MASTERS_AND_EDGES\n};\n\n/**\n * Base DistGraph class that all distributed graphs extend from.\n *\n * @tparam NodeTy type of node data for the graph\n * @tparam EdgeTy type of edge data for the graph\n */\ntemplate <typename NodeTy, typename EdgeTy>\nclass DistGraph {\nprivate:\n  //! Graph name used for printing things\n  constexpr static const char* const GRNAME = \"dGraph\";\n\n  using GraphTy = galois::graphs::LC_CSR_Graph<NodeTy, EdgeTy, true>;\n\n  // vector for determining range objects for master nodes + nodes\n  // with edges (which includes masters)\n  //! represents split of all nodes among threads to balance edges\n  std::vector<uint32_t> allNodesRanges;\n  //! represents split of master nodes among threads to balance edges\n  std::vector<uint32_t> masterRanges;\n  //! represents split of nodes with edges (includes masters) among threads to\n  //! balance edges\n  std::vector<uint32_t> withEdgeRanges;\n  //! represents split of all nodes among threads to balance in-edges\n  std::vector<uint32_t> allNodesRangesIn;\n  //! represents split of master nodes among threads to balance in-edges\n  std::vector<uint32_t> masterRangesIn;\n\n  using NodeRangeType =\n      galois::runtime::SpecificRange<boost::counting_iterator<size_t>>;\n\n  //! Vector of ranges that stores the 3 different range objects that a user is\n  //! able to access\n  std::vector<NodeRangeType> specificRanges;\n  //! Like specificRanges, but for in edges\n  std::vector<NodeRangeType> specificRangesIn;\n\nprotected:\n  //! The internal graph used by DistGraph to represent the graph\n  GraphTy graph;\n\n  //! 
Marks if the graph is transposed or not.\n  bool transposed;\n\n  // global graph variables\n  uint64_t numGlobalNodes; //!< Total nodes in the global unpartitioned graph.\n  uint64_t numGlobalEdges; //!< Total edges in the global unpartitioned graph.\n  uint32_t numNodes;       //!< Num nodes in this graph in total\n  uint64_t numEdges;       //!< Num edges in this graph in total\n\n  const unsigned id;       //!< ID of the machine.\n  const uint32_t numHosts; //!< Total number of machines\n\n  // local graph\n  // size() = Number of nodes created on this host (masters + mirrors)\n  uint32_t numOwned;    //!< Number of nodes owned (masters) by this host.\n                        //!< size() - numOwned = mirrors on this host\n  uint32_t beginMaster; //!< Local id of the beginning of master nodes.\n                        //!< beginMaster + numOwned = local id of the end of\n                        //!< master nodes\n  uint32_t numNodesWithEdges; //!< Number of nodes (masters + mirrors) that have\n                              //!< outgoing edges\n\n  //! Information that converts host to range of nodes that host reads\n  std::vector<std::pair<uint64_t, uint64_t>> gid2host;\n  //! Mirror nodes from different hosts. For reduce\n  std::vector<std::vector<size_t>> mirrorNodes;\n\n  //! GID = localToGlobalVector[LID]\n  std::vector<uint64_t> localToGlobalVector;\n  //! LID = globalToLocalMap[GID]\n  std::unordered_map<uint64_t, uint32_t> globalToLocalMap;\n\n  //! Increments evilPhase, a phase counter used by communication.\n  void inline increment_evilPhase() {\n    ++galois::runtime::evilPhase;\n    if (galois::runtime::evilPhase >=\n        static_cast<uint32_t>(\n            std::numeric_limits<int16_t>::max())) { // limit defined by MPI or\n                                                    // LCI\n      galois::runtime::evilPhase = 1;\n    }\n  }\n\n  //! 
Returns evilPhase + 1, handling loop around as necessary\n  unsigned inline evilPhasePlus1() {\n    unsigned result = galois::runtime::evilPhase + 1;\n\n    // limit defined by MPI or LCI\n    if (result >= uint32_t{std::numeric_limits<int16_t>::max()}) {\n      return 1;\n    }\n    return result;\n  }\n\n  //! used to sort edges in the sort edges function\n  template <typename GraphNode, typename ET>\n  struct IdLess {\n    bool\n    operator()(const galois::graphs::EdgeSortValue<GraphNode, ET>& e1,\n               const galois::graphs::EdgeSortValue<GraphNode, ET>& e2) const {\n      return e1.dst < e2.dst;\n    }\n  };\n\nprivate:\n  /**\n   * Given an OfflineGraph, compute the masters for each node by\n   * evenly (or unevenly as specified by scale factor)\n   * blocking the nodes off to assign to each host. Considers\n   * ONLY nodes and not edges.\n   *\n   * @param g The offline graph which has loaded the graph you want\n   * to get the masters for\n   * @param scalefactor A vector that specifies if a particular host\n   * should have more or less than other hosts\n   * @param DecomposeFactor Specifies how decomposed the blocking\n   * of nodes should be. 
For example, a factor of 2 will make 2 blocks\n   * out of 1 block had the decompose factor been set to 1.\n   */\n  void computeMastersBlockedNodes(galois::graphs::OfflineGraph& g,\n                                  const std::vector<unsigned>& scalefactor,\n                                  unsigned DecomposeFactor = 1) {\n    uint64_t numNodes_to_divide = g.size();\n    if (scalefactor.empty() || (numHosts * DecomposeFactor == 1)) {\n      for (unsigned i = 0; i < numHosts * DecomposeFactor; ++i)\n        gid2host.push_back(galois::block_range(uint64_t{0}, numNodes_to_divide,\n                                               i, numHosts * DecomposeFactor));\n      return;\n    }\n\n    // TODO: not compatible with DecomposeFactor.\n    assert(scalefactor.size() == numHosts);\n\n    unsigned numBlocks = 0;\n\n    for (unsigned i = 0; i < numHosts; ++i) {\n      numBlocks += scalefactor[i];\n    }\n\n    std::vector<std::pair<uint64_t, uint64_t>> blocks;\n    for (unsigned i = 0; i < numBlocks; ++i) {\n      blocks.push_back(\n          galois::block_range(uint64_t{0}, numNodes_to_divide, i, numBlocks));\n    }\n\n    std::vector<unsigned> prefixSums;\n    prefixSums.push_back(0);\n\n    for (unsigned i = 1; i < numHosts; ++i) {\n      prefixSums.push_back(prefixSums[i - 1] + scalefactor[i - 1]);\n    }\n\n    for (unsigned i = 0; i < numHosts; ++i) {\n      unsigned firstBlock = prefixSums[i];\n      unsigned lastBlock  = prefixSums[i] + scalefactor[i] - 1;\n      gid2host.push_back(\n          std::make_pair(blocks[firstBlock].first, blocks[lastBlock].second));\n    }\n  }\n\n  /**\n   * Given an OfflineGraph, compute the masters for each node by\n   * evenly (or unevenly as specified by scale factor)\n   * blocking the nodes off to assign to each host while taking\n   * into consideration the only edges of the node to get\n   * even blocks.\n   *\n   * @param g The offline graph which has loaded the graph you want\n   * to get the masters for\n   * @param 
scalefactor A vector that specifies if a particular host\n   * should have more or less than other hosts\n   * @param DecomposeFactor Specifies how decomposed the blocking\n   * of nodes should be. For example, a factor of 2 will make 2 blocks\n   * out of 1 block had the decompose factor been set to 1.\n   */\n  void computeMastersBalancedEdges(galois::graphs::OfflineGraph& g,\n                                   const std::vector<unsigned>& scalefactor,\n                                   uint32_t edgeWeight,\n                                   unsigned DecomposeFactor = 1) {\n    if (edgeWeight == 0) {\n      edgeWeight = 1;\n    }\n\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    gid2host.resize(numHosts * DecomposeFactor);\n    for (unsigned d = 0; d < DecomposeFactor; ++d) {\n      auto r = g.divideByNode(0, edgeWeight, (id + d * numHosts),\n                              numHosts * DecomposeFactor, scalefactor);\n      gid2host[id + d * numHosts].first  = *(r.first.first);\n      gid2host[id + d * numHosts].second = *(r.first.second);\n    }\n\n    for (unsigned h = 0; h < numHosts; ++h) {\n      if (h == id) {\n        continue;\n      }\n      galois::runtime::SendBuffer b;\n      for (unsigned d = 0; d < DecomposeFactor; ++d) {\n        galois::runtime::gSerialize(b, gid2host[id + d * numHosts]);\n      }\n      net.sendTagged(h, galois::runtime::evilPhase, b);\n    }\n    net.flush();\n    unsigned received = 1;\n    while (received < numHosts) {\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n      assert(p->first != id);\n      auto& b = p->second;\n      for (unsigned d = 0; d < DecomposeFactor; ++d) {\n        galois::runtime::gDeserialize(b, gid2host[p->first + d * numHosts]);\n      }\n      ++received;\n    }\n    increment_evilPhase();\n\n#ifndef NDEBUG\n    for (unsigned h = 0; h < numHosts; h++) 
{\n      if (h == 0) {\n        assert(gid2host[h].first == 0);\n      } else if (h == numHosts - 1) {\n        assert(gid2host[h].first == gid2host[h - 1].second);\n        assert(gid2host[h].second == g.size());\n      } else {\n        assert(gid2host[h].first == gid2host[h - 1].second);\n        assert(gid2host[h].second == gid2host[h + 1].first);\n      }\n    }\n#endif\n  }\n\n  /**\n   * Given an OfflineGraph, compute the masters for each node by\n   * evenly (or unevenly as specified by scale factor)\n   * blocking the nodes off to assign to each host while taking\n   * into consideration the edges of the node AND the node itself.\n   *\n   * @param g The offline graph which has loaded the graph you want\n   * to get the masters for\n   * @param scalefactor A vector that specifies if a particular host\n   * should have more or less than other hosts\n   * @param DecomposeFactor Specifies how decomposed the blocking\n   * of nodes should be. For example, a factor of 2 will make 2 blocks\n   * out of 1 block had the decompose factor been set to 1. 
Ignored\n   * in this function currently.\n   *\n   * @todo make this function work with decompose factor\n   */\n  void computeMastersBalancedNodesAndEdges(\n      galois::graphs::OfflineGraph& g, const std::vector<unsigned>& scalefactor,\n      uint32_t nodeWeight, uint32_t edgeWeight, unsigned) {\n    if (nodeWeight == 0) {\n      nodeWeight = g.sizeEdges() / g.size(); // average degree\n    }\n    if (edgeWeight == 0) {\n      edgeWeight = 1;\n    }\n\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    gid2host.resize(numHosts);\n    auto r = g.divideByNode(nodeWeight, edgeWeight, id, numHosts, scalefactor);\n    gid2host[id].first  = *r.first.first;\n    gid2host[id].second = *r.first.second;\n    for (unsigned h = 0; h < numHosts; ++h) {\n      if (h == id)\n        continue;\n      galois::runtime::SendBuffer b;\n      galois::runtime::gSerialize(b, gid2host[id]);\n      net.sendTagged(h, galois::runtime::evilPhase, b);\n    }\n    net.flush();\n    unsigned received = 1;\n    while (received < numHosts) {\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n      assert(p->first != id);\n      auto& b = p->second;\n      galois::runtime::gDeserialize(b, gid2host[p->first]);\n      ++received;\n    }\n    increment_evilPhase();\n  }\n\nprotected:\n  /**\n   * Wrapper call that will call into more specific compute masters\n   * functions that compute masters based on nodes, edges, or both.\n   *\n   * @param masters_distribution method of masters distribution to use\n   * @param g The offline graph which has loaded the graph you want\n   * to get the masters for\n   * @param scalefactor A vector that specifies if a particular host\n   * should have more or less than other hosts\n   * @param nodeWeight weight to give nodes when computing balance\n   * @param edgeWeight weight to give edges when computing balance\n   * 
@param DecomposeFactor Specifies how decomposed the blocking\n   * of nodes should be. For example, a factor of 2 will make 2 blocks\n   * out of 1 block had the decompose factor been set to 1.\n   */\n  uint64_t computeMasters(MASTERS_DISTRIBUTION masters_distribution,\n                          galois::graphs::OfflineGraph& g,\n                          const std::vector<unsigned>& scalefactor,\n                          uint32_t nodeWeight = 0, uint32_t edgeWeight = 0,\n                          unsigned DecomposeFactor = 1) {\n    galois::Timer timer;\n    timer.start();\n    g.reset_seek_counters();\n\n    uint64_t numNodes_to_divide = g.size();\n\n    // compute masters for all nodes\n    switch (masters_distribution) {\n    case BALANCED_MASTERS:\n      computeMastersBlockedNodes(g, scalefactor, DecomposeFactor);\n      break;\n    case BALANCED_MASTERS_AND_EDGES:\n      computeMastersBalancedNodesAndEdges(g, scalefactor, nodeWeight,\n                                          edgeWeight, DecomposeFactor);\n      break;\n    case BALANCED_EDGES_OF_MASTERS:\n    default:\n      computeMastersBalancedEdges(g, scalefactor, edgeWeight, DecomposeFactor);\n      break;\n    }\n\n    timer.stop();\n\n    galois::runtime::reportStatCond_Tmax<MORE_DIST_STATS>(\n        GRNAME, \"MasterDistTime\", timer.get());\n\n    galois::gPrint(\n        \"[\", id, \"] Master distribution time : \", timer.get_usec() / 1000000.0f,\n        \" seconds to read \", g.num_bytes_read(), \" bytes in \", g.num_seeks(),\n        \" seeks (\", g.num_bytes_read() / (float)timer.get_usec(), \" MBPS)\\n\");\n    return numNodes_to_divide;\n  }\n\n  //! reader assignment from a file\n  //! 
corresponds to master assignment if using an edge cut\n  void readersFromFile(galois::graphs::OfflineGraph& g, std::string filename) {\n    // read file lines\n    std::ifstream mappings(filename);\n    std::string curLine;\n\n    unsigned timesToRead = id + 1;\n\n    for (unsigned i = 0; i < timesToRead; i++) {\n      std::getline(mappings, curLine);\n    }\n\n    std::vector<char> modifyLine(curLine.begin(), curLine.end());\n    char* tokenizedString = modifyLine.data();\n    char* token;\n    token = strtok(tokenizedString, \" \");\n\n    // loop 6 more times\n    for (unsigned i = 0; i < 6; i++) {\n      token = strtok(NULL, \" \");\n    }\n    std::string left(token);\n\n    // 3 more times for right\n    for (unsigned i = 0; i < 3; i++) {\n      token = strtok(NULL, \" \");\n    }\n    std::string right(token);\n\n    gid2host.resize(numHosts);\n    gid2host[id].first  = std::stoul(left);\n    gid2host[id].second = std::stoul(right) + 1;\n    galois::gPrint(\"[\", id, \"] Left: \", gid2host[id].first,\n                   \", Right: \", gid2host[id].second, \"\\n\");\n\n    /////////////////////////\n    // send/recv from other hosts\n    /////////////////////////\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    for (unsigned h = 0; h < numHosts; ++h) {\n      if (h == id)\n        continue;\n      galois::runtime::SendBuffer b;\n      galois::runtime::gSerialize(b, gid2host[id]);\n      net.sendTagged(h, galois::runtime::evilPhase, b);\n    }\n    net.flush();\n    unsigned received = 1;\n    while (received < numHosts) {\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n      assert(p->first != id);\n      auto& b = p->second;\n      galois::runtime::gDeserialize(b, gid2host[p->first]);\n      ++received;\n    }\n    increment_evilPhase();\n\n    // sanity checking assignment\n    for (unsigned h = 0; h < 
numHosts; h++) {\n      if (h == 0) {\n        GALOIS_ASSERT(gid2host[h].first == 0);\n      } else if (h == numHosts - 1) {\n        GALOIS_ASSERT(gid2host[h].first == gid2host[h - 1].second,\n                      gid2host[h].first, \" \", gid2host[h - 1].second);\n        GALOIS_ASSERT(gid2host[h].second == g.size(), gid2host[h].second, \" \",\n                      g.size());\n      } else {\n        GALOIS_ASSERT(gid2host[h].first == gid2host[h - 1].second,\n                      gid2host[h].first, \" \", gid2host[h - 1].second);\n        GALOIS_ASSERT(gid2host[h].second == gid2host[h + 1].first,\n                      gid2host[h].second, \" \", gid2host[h + 1].first);\n      }\n    }\n  }\n\n  uint32_t G2L(uint64_t gid) const {\n    assert(isLocal(gid));\n    return globalToLocalMap.at(gid);\n  }\n\n  uint64_t L2G(uint32_t lid) const { return localToGlobalVector[lid]; }\n\npublic:\n  //! Type representing a node in this graph\n  using GraphNode = typename GraphTy::GraphNode;\n  //! Expose EdgeTy to other classes\n  using EdgeType = EdgeTy;\n  //! iterator type over nodes\n  using iterator = typename GraphTy::iterator;\n  //! constant iterator type over nodes\n  using const_iterator = typename GraphTy::const_iterator;\n  //! iterator type over edges\n  using edge_iterator = typename GraphTy::edge_iterator;\n\n  /**\n   * Constructor for DistGraph. 
Initializes metadata fields.\n   *\n   * @param host host number that this graph resides on\n   * @param numHosts total number of hosts in the currently executing program\n   */\n  DistGraph(unsigned host, unsigned numHosts)\n      : transposed(false), id(host), numHosts(numHosts) {\n    mirrorNodes.resize(numHosts);\n    numGlobalNodes = 0;\n    numGlobalEdges = 0;\n  }\n\n  /**\n   * Return a vector of pairs denoting mirror node ranges.\n   *\n   * Assumes all mirror nodes occur after the masters: this invariant should be\n   * held by CuSP.\n   */\n  std::vector<std::pair<uint32_t, uint32_t>> getMirrorRanges() const {\n    std::vector<std::pair<uint32_t, uint32_t>> mirrorRangesVector;\n    // order of nodes locally is masters, outgoing mirrors, incoming mirrors,\n    // so just get from numOwned to end\n    if (numOwned != numNodes) {\n      assert(numOwned < numNodes);\n      mirrorRangesVector.push_back(std::make_pair(numOwned, numNodes));\n    }\n    return mirrorRangesVector;\n  }\n\n  std::vector<std::vector<size_t>>& getMirrorNodes() { return mirrorNodes; }\n\nprivate:\n  virtual unsigned getHostIDImpl(uint64_t) const = 0;\n  virtual bool isOwnedImpl(uint64_t) const       = 0;\n  virtual bool isLocalImpl(uint64_t) const       = 0;\n  virtual bool isVertexCutImpl() const           = 0;\n  virtual std::pair<unsigned, unsigned> cartesianGridImpl() const {\n    return std::make_pair(0u, 0u);\n  }\n\npublic:\n  virtual ~DistGraph() {}\n  //! Determines which host has the master for a particular node\n  //! @returns Host id of node in question\n  inline unsigned getHostID(uint64_t gid) const { return getHostIDImpl(gid); }\n  //! Determine if a node has a master on this host.\n  //! @returns True if passed in global id has a master on this host\n  inline bool isOwned(uint64_t gid) const { return isOwnedImpl(gid); }\n  //! Determine if a node has a proxy on this host\n  //! 
@returns True if passed in global id has a proxy on this host\n  inline bool isLocal(uint64_t gid) const { return isLocalImpl(gid); }\n  /**\n   * Returns true if current partition is a vertex cut\n   * @returns true if partition being stored in this graph is a vertex cut\n   */\n  inline bool is_vertex_cut() const { return isVertexCutImpl(); }\n  /**\n   * Returns Cartesian split (if it exists, else returns pair of 0s\n   */\n  inline std::pair<unsigned, unsigned> cartesianGrid() const {\n    return cartesianGridImpl();\n  }\n\n  bool isTransposed() { return transposed; }\n\n  /**\n   * Converts a local node id into a global node id\n   *\n   * @param nodeID local node id\n   * @returns global node id corresponding to the local one\n   */\n  inline uint64_t getGID(const uint32_t nodeID) const { return L2G(nodeID); }\n\n  /**\n   * Converts a global node id into a local node id\n   *\n   * @param nodeID global node id\n   * @returns local node id corresponding to the global one\n   */\n  inline uint32_t getLID(const uint64_t nodeID) const { return G2L(nodeID); }\n\n  /**\n   * Get data of a node.\n   *\n   * @param N node to get the data of\n   * @param mflag access flag for node data\n   * @returns A node data object\n   */\n  inline typename GraphTy::node_data_reference\n  getData(GraphNode N,\n          galois::MethodFlag mflag = galois::MethodFlag::UNPROTECTED) {\n    auto& r = graph.getData(N, mflag);\n    return r;\n  }\n\n  /**\n   * Get the edge data for a particular edge in the graph.\n   *\n   * @param ni edge to get the data of\n   * @param mflag access flag for edge data\n   * @returns The edge data for the requested edge\n   */\n  inline typename GraphTy::edge_data_reference\n  getEdgeData(edge_iterator ni,\n              galois::MethodFlag mflag = galois::MethodFlag::UNPROTECTED) {\n    auto& r = graph.getEdgeData(ni, mflag);\n    return r;\n  }\n\n  /**\n   * Gets edge destination of edge ni.\n   *\n   * @param ni edge id to get destination of\n   * 
@returns Local ID of destination of edge ni\n   */\n  GraphNode getEdgeDst(edge_iterator ni) { return graph.getEdgeDst(ni); }\n\n  /**\n   * Gets the first edge of some node.\n   *\n   * @param N node to get the edge of\n   * @returns iterator to first edge of N\n   */\n  inline edge_iterator edge_begin(GraphNode N) {\n    return graph.edge_begin(N, galois::MethodFlag::UNPROTECTED);\n  }\n\n  /**\n   * Gets the end edge boundary of some node.\n   *\n   * @param N node to get the edge of\n   * @returns iterator to the end of the edges of node N, i.e. the first edge\n   * of the next node (or an \"end\" iterator if there is no next node)\n   */\n  inline edge_iterator edge_end(GraphNode N) {\n    return graph.edge_end(N, galois::MethodFlag::UNPROTECTED);\n  }\n\n  /**\n   * Returns an iterable object over the edges of a particular node in the\n   * graph.\n   *\n   * @param N node to get edges iterator over\n   */\n  inline galois::runtime::iterable<galois::NoDerefIterator<edge_iterator>>\n  edges(GraphNode N) {\n    return galois::graphs::internal::make_no_deref_range(edge_begin(N),\n                                                         edge_end(N));\n  }\n\n  /**\n   * Gets number of nodes on this (local) graph.\n   *\n   * @returns number of nodes present in this (local) graph\n   */\n  inline size_t size() const { return graph.size(); }\n\n  /**\n   * Gets number of edges on this (local) graph.\n   *\n   * @returns number of edges present in this (local) graph\n   */\n  inline size_t sizeEdges() const { return graph.sizeEdges(); }\n\n  /**\n   * Gets number of nodes on this (local) graph.\n   *\n   * @returns number of nodes present in this (local) graph\n   */\n  inline size_t numMasters() const { return numOwned; }\n\n  /**\n   * Gets number of nodes with edges (may include nodes without edges)\n   * on this (local) graph.\n   *\n   * @returns number of nodes with edges (may include nodes without edges\n   * as it measures a contiguous range)\n   */\n  
inline size_t getNumNodesWithEdges() const { return numNodesWithEdges; }\n\n  /**\n   * Gets number of nodes on the global unpartitioned graph.\n   *\n   * @returns number of nodes present in the global unpartitioned graph\n   */\n  inline size_t globalSize() const { return numGlobalNodes; }\n\n  /**\n   * Gets number of edges on the global unpartitioned graph.\n   *\n   * @returns number of edges present in the global unpartitioned graph\n   */\n  inline size_t globalSizeEdges() const { return numGlobalEdges; }\n\n  /**\n   * Returns a range object that encapsulates all nodes of the graph.\n   *\n   * @returns A range object that contains all the nodes in this graph\n   */\n  inline const NodeRangeType& allNodesRange() const {\n    assert(specificRanges.size() == 3);\n    return specificRanges[0];\n  }\n\n  /**\n   * Returns a range object that encapsulates only master nodes in this\n   * graph.\n   *\n   * @returns A range object that contains the master nodes in this graph\n   */\n  inline const NodeRangeType& masterNodesRange() const {\n    assert(specificRanges.size() == 3);\n    return specificRanges[1];\n  }\n\n  /**\n   * Returns a range object that encapsulates master nodes and nodes\n   * with edges in this graph.\n   *\n   * @returns A range object that contains the master nodes and the nodes\n   * with outgoing edges in this graph\n   */\n  inline const NodeRangeType& allNodesWithEdgesRange() const {\n    assert(specificRanges.size() == 3);\n    return specificRanges[2];\n  }\n\n  /**\n   * Returns a vector object that contains the global IDs (in order) of\n   * the master nodes in this graph.\n   *\n   * @returns A vector object that contains the global IDs (in order) of\n   * the master nodes in this graph\n   */\n  std::vector<uint64_t> getMasterGlobalIDs() {\n    std::vector<uint64_t> IDs;\n\n    IDs.reserve(numMasters());\n    for (auto node : masterNodesRange()) {\n      IDs.push_back(getGID(node));\n    }\n\n    return IDs;\n  }\n\nprotected:\n  
/**\n   * Uses a pre-computed prefix sum to determine division of nodes among\n   * threads.\n   *\n   * The call uses binary search to determine the ranges.\n   */\n  inline void determineThreadRanges() {\n    allNodesRanges = galois::graphs::determineUnitRangesFromPrefixSum(\n        galois::runtime::activeThreads, graph.getEdgePrefixSum());\n  }\n\n  /**\n   * Determines the thread ranges for master nodes only and saves them to\n   * the object.\n   *\n   * Only call after graph is constructed + only call once\n   */\n  inline void determineThreadRangesMaster() {\n    // make sure this hasn't been called before\n    assert(masterRanges.size() == 0);\n\n    // first check if we even need to do any work; if already calculated,\n    // use already calculated vector\n    if (beginMaster == 0 && (beginMaster + numOwned) == size()) {\n      masterRanges = allNodesRanges;\n    } else if (beginMaster == 0 &&\n               (beginMaster + numOwned) == numNodesWithEdges &&\n               withEdgeRanges.size() != 0) {\n      masterRanges = withEdgeRanges;\n    } else {\n      galois::gDebug(\"Manually det. 
master thread ranges\");\n      masterRanges = galois::graphs::determineUnitRangesFromGraph(\n          graph, galois::runtime::activeThreads, beginMaster,\n          beginMaster + numOwned, 0);\n    }\n  }\n\n  /**\n   * Determines the thread ranges for nodes with edges only and saves them to\n   * the object.\n   *\n   * Only call after graph is constructed + only call once\n   */\n  inline void determineThreadRangesWithEdges() {\n    // make sure not called before\n    assert(withEdgeRanges.size() == 0);\n\n    // first check if we even need to do any work; if already calculated,\n    // use already calculated vector\n    if (numNodesWithEdges == size()) {\n      withEdgeRanges = allNodesRanges;\n    } else if (beginMaster == 0 &&\n               (beginMaster + numOwned) == numNodesWithEdges &&\n               masterRanges.size() != 0) {\n      withEdgeRanges = masterRanges;\n    } else {\n      galois::gDebug(\"Manually det. with edges thread ranges\");\n      withEdgeRanges = galois::graphs::determineUnitRangesFromGraph(\n          graph, galois::runtime::activeThreads, 0, numNodesWithEdges, 0);\n    }\n  }\n\n  /**\n   * Initializes the 3 range objects that a user can access to iterate\n   * over the graph in different ways.\n   */\n  void initializeSpecificRanges() {\n    assert(specificRanges.size() == 0);\n\n    // TODO/FIXME assertion likely not safe if a host gets no nodes\n    // make sure the thread ranges have already been calculated\n    // for the 3 ranges\n    assert(allNodesRanges.size() != 0);\n    assert(masterRanges.size() != 0);\n    assert(withEdgeRanges.size() != 0);\n\n    // 0 is all nodes\n    specificRanges.push_back(galois::runtime::makeSpecificRange(\n        boost::counting_iterator<size_t>(0),\n        boost::counting_iterator<size_t>(size()), allNodesRanges.data()));\n\n    // 1 is master nodes\n    specificRanges.push_back(galois::runtime::makeSpecificRange(\n        boost::counting_iterator<size_t>(beginMaster),\n        
boost::counting_iterator<size_t>(beginMaster + numOwned),\n        masterRanges.data()));\n\n    // 2 is with edge nodes\n    specificRanges.push_back(galois::runtime::makeSpecificRange(\n        boost::counting_iterator<size_t>(0),\n        boost::counting_iterator<size_t>(numNodesWithEdges),\n        withEdgeRanges.data()));\n\n    assert(specificRanges.size() == 3);\n  }\n\n  /**\n   * Specific range editor: makes the range for edges equivalent to the range\n   * for masters.\n   */\n  void edgesEqualMasters() { specificRanges[2] = specificRanges[1]; }\n\npublic:\n  /**\n   * Write the local LC_CSR graph to the file on a disk.\n   *\n   * @todo revive this\n   */\n  void save_local_graph_to_file(std::string) { GALOIS_DIE(\"not implemented\"); }\n\n  /**\n   * Read the local LC_CSR graph from the file on a disk.\n   *\n   * @todo revive this\n   */\n  void read_local_graph_from_file(std::string) {\n    GALOIS_DIE(\"not implemented\");\n  }\n\n  /**\n   * Deallocates underlying LC CSR Graph\n   */\n  void deallocate() {\n    galois::gDebug(\"Deallocating CSR in DistGraph\");\n    graph.deallocate();\n  }\n\n  /**\n   * Sort the underlying LC_CSR_Graph by ID (destinations)\n   * It sorts edges of the nodes by destination.\n   */\n  void sortEdgesByDestination() {\n    using GN = typename GraphTy::GraphNode;\n    galois::do_all(\n        galois::iterate(graph),\n        [&](GN n) { graph.sortEdges(n, IdLess<GN, EdgeTy>()); },\n        galois::no_stats(), galois::loopname(\"CSREdgeSort\"), galois::steal());\n  }\n};\n\ntemplate <typename NodeTy, typename EdgeTy>\nconstexpr const char* const galois::graphs::DistGraph<NodeTy, EdgeTy>::GRNAME;\n} // end namespace graphs\n} // end namespace galois\n\n#endif //_GALOIS_DIST_HGRAPH_H\n"
  },
  {
    "path": "libcusp/include/galois/graphs/GenericPartitioners.h",
    "content": "#ifndef _GALOIS_DIST_GENERICPARTS_H\n#define _GALOIS_DIST_GENERICPARTS_H\n\n#include \"DistributedGraph.h\"\n#include \"BasePolicies.h\"\n#include <utility>\n#include <cmath>\n#include <limits>\n\nclass NoCommunication : public galois::graphs::ReadMasterAssignment {\npublic:\n  NoCommunication(uint32_t, uint32_t numHosts, uint64_t, uint64_t)\n      : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0) {}\n\n  uint32_t getEdgeOwner(uint32_t src, uint32_t, uint64_t) const {\n    return retrieveMaster(src);\n  }\n\n  bool noCommunication() { return true; }\n  bool isVertexCut() const { return false; }\n  void serializePartition(boost::archive::binary_oarchive&) {}\n  void deserializePartition(boost::archive::binary_iarchive&) {}\n  std::pair<unsigned, unsigned> cartesianGrid() {\n    return std::make_pair(0u, 0u);\n  }\n};\n\n/**\n */\nclass MiningPolicyNaive : public galois::graphs::ReadMasterAssignment {\npublic:\n  MiningPolicyNaive(uint32_t, uint32_t numHosts, uint64_t, uint64_t,\n                    std::vector<uint64_t>&)\n      : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0) {}\n\n  static bool needNodeDegrees() { return false; }\n\n  bool keepEdge(uint32_t src, uint32_t dst) const { return src < dst; }\n};\n\nclass MiningPolicyDegrees : public galois::graphs::ReadMasterAssignment {\n  std::vector<uint64_t>& ndegrees;\n\npublic:\n  MiningPolicyDegrees(uint32_t, uint32_t numHosts, uint64_t, uint64_t,\n                      std::vector<uint64_t>& _ndeg)\n      : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0),\n        ndegrees(_ndeg) {}\n\n  static bool needNodeDegrees() { return true; }\n\n  bool keepEdge(uint32_t src, uint32_t dst) const {\n    uint64_t sourceDegree = ndegrees[src];\n    uint64_t destDegree   = ndegrees[dst];\n    if ((destDegree > sourceDegree) ||\n        ((destDegree == sourceDegree) && (src < dst))) {\n      return true;\n    } else {\n      return false;\n    }\n  
}\n};\n\n////////////////////////////////////////////////////////////////////////////////\n\nclass GenericCVC : public galois::graphs::ReadMasterAssignment {\n  unsigned numRowHosts;\n  unsigned numColumnHosts;\n  unsigned _h_offset;\n\n  void factorizeHosts() {\n    numColumnHosts = sqrt(_numHosts);\n\n    while ((_numHosts % numColumnHosts) != 0)\n      numColumnHosts--;\n\n    numRowHosts = _numHosts / numColumnHosts;\n    assert(numRowHosts >= numColumnHosts);\n\n    // if (moreColumnHosts) {\n    //  std::swap(numRowHosts, numColumnHosts);\n    //}\n\n    if (_hostID == 0) {\n      galois::gPrint(\"Cartesian grid: \", numRowHosts, \" x \", numColumnHosts,\n                     \"\\n\");\n    }\n  }\n\n  //! Returns the grid row ID of this host\n  unsigned gridRowID() const { return (_hostID / numColumnHosts); }\n  //! Returns the grid row ID of the specified host\n  unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); }\n  //! Returns the grid column ID of this host\n  unsigned gridColumnID() const { return (_hostID % numColumnHosts); }\n  //! Returns the grid column ID of the specified host\n  unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); }\n\n  //! 
Find the column of a particular node\n  unsigned getColumnOfNode(uint64_t gid) const {\n    return gridColumnID(retrieveMaster(gid));\n  }\n\npublic:\n  GenericCVC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n             uint64_t numEdges)\n      : galois::graphs::ReadMasterAssignment(hostID, numHosts, numNodes,\n                                             numEdges) {\n    factorizeHosts();\n    _h_offset = gridRowID() * numColumnHosts;\n  }\n\n  uint32_t getEdgeOwner(uint32_t, uint32_t dst, uint64_t) const {\n    int i = getColumnOfNode(dst);\n    return _h_offset + i;\n  }\n\n  bool noCommunication() { return false; }\n  bool isVertexCut() const {\n    if ((numRowHosts == 1) || (numColumnHosts == 1))\n      return false;\n    return true;\n  }\n  void serializePartition(boost::archive::binary_oarchive& ar) {\n    ar << numRowHosts;\n    ar << numColumnHosts;\n  }\n  void deserializePartition(boost::archive::binary_iarchive& ar) {\n    ar >> numRowHosts;\n    ar >> numColumnHosts;\n  }\n\n  std::pair<unsigned, unsigned> cartesianGrid() {\n    return std::make_pair(numRowHosts, numColumnHosts);\n  }\n};\n\n////////////////////////////////////////////////////////////////////////////////\n\n// same as above, except columns are flipped (changes behavior of vertex cut\n// call as well)\nclass GenericCVCColumnFlip : public galois::graphs::ReadMasterAssignment {\n  unsigned numRowHosts;\n  unsigned numColumnHosts;\n  unsigned _h_offset;\n\n  void factorizeHosts() {\n    numColumnHosts = sqrt(_numHosts);\n\n    while ((_numHosts % numColumnHosts) != 0)\n      numColumnHosts--;\n\n    numRowHosts = _numHosts / numColumnHosts;\n    assert(numRowHosts >= numColumnHosts);\n\n    // column flip\n    std::swap(numRowHosts, numColumnHosts);\n\n    if (_hostID == 0) {\n      galois::gPrint(\"Cartesian grid: \", numRowHosts, \" x \", numColumnHosts,\n                     \"\\n\");\n    }\n  }\n\n  //! 
Returns the grid row ID of this host\n  unsigned gridRowID() const { return (_hostID / numColumnHosts); }\n  //! Returns the grid row ID of the specified host\n  unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); }\n  //! Returns the grid column ID of this host\n  unsigned gridColumnID() const { return (_hostID % numColumnHosts); }\n  //! Returns the grid column ID of the specified host\n  unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); }\n\n  //! Find the column of a particular node\n  unsigned getColumnOfNode(uint64_t gid) const {\n    return gridColumnID(retrieveMaster(gid));\n  }\n\npublic:\n  GenericCVCColumnFlip(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n                       uint64_t numEdges)\n      : galois::graphs::ReadMasterAssignment(hostID, numHosts, numNodes,\n                                             numEdges) {\n    factorizeHosts();\n    _h_offset = gridRowID() * numColumnHosts;\n  }\n\n  uint32_t getEdgeOwner(uint32_t, uint32_t dst, uint64_t) const {\n    int i = getColumnOfNode(dst);\n    return _h_offset + i;\n  }\n\n  bool noCommunication() { return false; }\n  bool isVertexCut() const {\n    if ((numRowHosts == 1) && (numColumnHosts == 1))\n      return false;\n    return true;\n  }\n\n  void serializePartition(boost::archive::binary_oarchive& ar) {\n    ar << numRowHosts;\n    ar << numColumnHosts;\n  }\n\n  void deserializePartition(boost::archive::binary_iarchive& ar) {\n    ar >> numRowHosts;\n    ar >> numColumnHosts;\n  }\n\n  std::pair<unsigned, unsigned> cartesianGrid() {\n    return std::make_pair(numRowHosts, numColumnHosts);\n  }\n};\n////////////////////////////////////////////////////////////////////////////////\nclass GenericHVC : public galois::graphs::ReadMasterAssignment {\n  uint32_t _vCutThreshold;\n\npublic:\n  GenericHVC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n             uint64_t numEdges)\n      : galois::graphs::ReadMasterAssignment(hostID, 
numHosts, numNodes,\n                                             numEdges) {\n    _vCutThreshold = 1000; // can be changed, but default seems to be 1000\n  }\n\n  uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t numEdges) const {\n    if (numEdges > _vCutThreshold) {\n      return retrieveMaster(dst);\n    } else {\n      return retrieveMaster(src);\n    }\n  }\n\n  bool noCommunication() { return false; }\n  // TODO I should be able to make this runtime detectable\n  bool isVertexCut() const { return true; }\n  void serializePartition(boost::archive::binary_oarchive&) {}\n  void deserializePartition(boost::archive::binary_iarchive&) {}\n  std::pair<unsigned, unsigned> cartesianGrid() {\n    return std::make_pair(0u, 0u);\n  }\n};\n\n////////////////////////////////////////////////////////////////////////////////\n\nclass GingerP : public galois::graphs::CustomMasterAssignment {\n  // used in hybrid cut\n  uint32_t _vCutThreshold;\n  // ginger scoring constants\n  double _gamma;\n  double _alpha;\n  // ginger node/edge ratio\n  double _neRatio;\n\n  /**\n   * Returns Ginger's composite balance parameter for a given host\n   */\n  double getCompositeBalanceParam(\n      unsigned host, const std::vector<uint64_t>& nodeLoads,\n      const std::vector<galois::CopyableAtomic<uint64_t>>& nodeAccum,\n      const std::vector<uint64_t>& edgeLoads,\n      const std::vector<galois::CopyableAtomic<uint64_t>>& edgeAccum) {\n    // get node/edge loads\n    uint64_t hostNodeLoad = nodeLoads[host] + nodeAccum[host].load();\n    uint64_t hostEdgeLoad = edgeLoads[host] + edgeAccum[host].load();\n\n    return (hostNodeLoad + (_neRatio * hostEdgeLoad)) / 2;\n  }\n\n  /**\n   * Use FENNEL balance equation to get a score value for partition\n   * scoring\n   */\n  double getFennelBalanceScore(double param) {\n    return _alpha * _gamma * pow(param, _gamma - 1);\n  }\n\npublic:\n  GingerP(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n          uint64_t numEdges)\n    
  : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes,\n                                               numEdges) {\n    _vCutThreshold = 1000;\n    _gamma         = 1.5;\n    _alpha   = numEdges * pow(numHosts, _gamma - 1.0) / pow(numNodes, _gamma);\n    _neRatio = (double)numNodes / (double)numEdges;\n  }\n\n  template <typename EdgeTy>\n  uint32_t getMaster(uint32_t src,\n                     galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                     const std::vector<uint32_t>& localNodeToMaster,\n                     std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n                     const std::vector<uint64_t>& nodeLoads,\n                     std::vector<galois::CopyableAtomic<uint64_t>>& nodeAccum,\n                     const std::vector<uint64_t>& edgeLoads,\n                     std::vector<galois::CopyableAtomic<uint64_t>>& edgeAccum) {\n    auto ii = bufGraph.edgeBegin(src);\n    auto ee = bufGraph.edgeEnd(src);\n    // number of edges\n    uint64_t ne = std::distance(ii, ee);\n\n    // high in-degree nodes masters stay the same\n    if (ne > _vCutThreshold) {\n      return _hostID;\n    } else {\n      // low in degree masters move based on augmented FENNEL scoring metric\n      // initialize array to hold scores\n      galois::PODResizeableArray<double> scores;\n      scores.resize(_numHosts);\n      for (unsigned i = 0; i < _numHosts; i++) {\n        scores[i] = 0.0;\n      }\n\n      for (; ii < ee; ++ii) {\n        uint64_t dst         = bufGraph.edgeDestination(*ii);\n        size_t offsetIntoMap = (unsigned)-1;\n\n        auto it = gid2offsets.find(dst);\n        if (it != gid2offsets.end()) {\n          offsetIntoMap = it->second;\n        } else {\n          // determine offset\n          offsetIntoMap = dst - bufGraph.getNodeOffset();\n        }\n\n        assert(offsetIntoMap != (unsigned)-1);\n        assert(offsetIntoMap < localNodeToMaster.size());\n\n        unsigned currentAssignment = 
localNodeToMaster[offsetIntoMap];\n\n        if (currentAssignment != (unsigned)-1) {\n          scores[currentAssignment] += 1.0;\n        } else {\n          galois::gDebug(\"[\", _hostID, \"] \", dst, \" unassigned\");\n        }\n      }\n\n      // subtraction of the composite balance term\n      for (unsigned i = 0; i < _numHosts; i++) {\n        scores[i] -= getFennelBalanceScore(getCompositeBalanceParam(\n            i, nodeLoads, nodeAccum, edgeLoads, edgeAccum));\n      }\n\n      unsigned bestHost = -1;\n      double bestScore  = std::numeric_limits<double>::lowest();\n      // find max score\n      for (unsigned i = 0; i < _numHosts; i++) {\n        if (scores[i] >= bestScore) {\n          // galois::gDebug(\"best score \", bestScore, \" beaten by \", scores[i]);\n          bestScore = scores[i];\n          bestHost  = i;\n        }\n      }\n\n      galois::gDebug(\"[\", _hostID, \"] \", src, \" assigned to \", bestHost,\n                     \" with num edge \", ne);\n\n      // update metadata; TODO make this a nicer interface\n      galois::atomicAdd(nodeAccum[bestHost], (uint64_t)1);\n      galois::atomicAdd(edgeAccum[bestHost], ne);\n\n      return bestHost;\n    }\n  }\n\n  uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t numEdges) const {\n    // if high indegree, then move to source (which is dst), else stay on\n    // dst (which is src)\n    // note \"dst\" here is actually the source on the actual graph\n    // since we're reading transpose\n    if (numEdges > _vCutThreshold) {\n      return retrieveMaster(dst);\n    } else {\n      return retrieveMaster(src);\n    }\n  }\n\n  bool noCommunication() { return false; }\n  // TODO I should be able to make this runtime detectable\n  bool isVertexCut() const { return true; }\n  void serializePartition(boost::archive::binary_oarchive&) {}\n  void deserializePartition(boost::archive::binary_iarchive&) {}\n  std::pair<unsigned, unsigned> cartesianGrid() {\n    return std::make_pair(0u, 
0u);\n  }\n};\n\nclass FennelP : public galois::graphs::CustomMasterAssignment {\n  // used in hybrid cut\n  uint32_t _vCutThreshold;\n  // ginger scoring constants\n  double _gamma;\n  double _alpha;\n  // ginger node/edge ratio\n  double _neRatio;\n\n  /**\n   * Returns Ginger's composite balance parameter for a given host\n   */\n  double getCompositeBalanceParam(\n      unsigned host, const std::vector<uint64_t>& nodeLoads,\n      const std::vector<galois::CopyableAtomic<uint64_t>>& nodeAccum,\n      const std::vector<uint64_t>& edgeLoads,\n      const std::vector<galois::CopyableAtomic<uint64_t>>& edgeAccum) {\n    // get node/edge loads\n    uint64_t hostNodeLoad = nodeLoads[host] + nodeAccum[host].load();\n    uint64_t hostEdgeLoad = edgeLoads[host] + edgeAccum[host].load();\n\n    return (hostNodeLoad + (_neRatio * hostEdgeLoad)) / 2;\n  }\n\n  /**\n   * Use FENNEL balance equation to get a score value for partition\n   * scoring\n   */\n  double getFennelBalanceScore(double param) {\n    return _alpha * _gamma * pow(param, _gamma - 1);\n  }\n\npublic:\n  FennelP(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n          uint64_t numEdges)\n      : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes,\n                                               numEdges) {\n    _vCutThreshold = 1000;\n    _gamma         = 1.5;\n    _alpha   = numEdges * pow(numHosts, _gamma - 1.0) / pow(numNodes, _gamma);\n    _neRatio = (double)numNodes / (double)numEdges;\n  }\n\n  template <typename EdgeTy>\n  uint32_t getMaster(uint32_t src,\n                     galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                     const std::vector<uint32_t>& localNodeToMaster,\n                     std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n                     const std::vector<uint64_t>& nodeLoads,\n                     std::vector<galois::CopyableAtomic<uint64_t>>& nodeAccum,\n                     const std::vector<uint64_t>& edgeLoads,\n         
            std::vector<galois::CopyableAtomic<uint64_t>>& edgeAccum) {\n    auto ii = bufGraph.edgeBegin(src);\n    auto ee = bufGraph.edgeEnd(src);\n    // number of edges\n    uint64_t ne = std::distance(ii, ee);\n\n    // high degree nodes masters stay the same\n    if (ne > _vCutThreshold) {\n      return _hostID;\n    } else {\n      // low degree masters move based on augmented FENNEL scoring metric\n      // initialize array to hold scores\n      galois::PODResizeableArray<double> scores;\n      scores.resize(_numHosts);\n      for (unsigned i = 0; i < _numHosts; i++) {\n        scores[i] = 0.0;\n      }\n\n      for (; ii < ee; ++ii) {\n        uint64_t dst         = bufGraph.edgeDestination(*ii);\n        size_t offsetIntoMap = (unsigned)-1;\n\n        auto it = gid2offsets.find(dst);\n        if (it != gid2offsets.end()) {\n          offsetIntoMap = it->second;\n        } else {\n          // determine offset\n          offsetIntoMap = dst - bufGraph.getNodeOffset();\n        }\n\n        assert(offsetIntoMap != (unsigned)-1);\n        assert(offsetIntoMap < localNodeToMaster.size());\n\n        unsigned currentAssignment = localNodeToMaster[offsetIntoMap];\n\n        if (currentAssignment != (unsigned)-1) {\n          scores[currentAssignment] += 1.0;\n        } else {\n          galois::gDebug(\"[\", _hostID, \"] \", dst, \" unassigned\");\n        }\n      }\n\n      // subtraction of the composite balance term\n      for (unsigned i = 0; i < _numHosts; i++) {\n        scores[i] -= getFennelBalanceScore(getCompositeBalanceParam(\n            i, nodeLoads, nodeAccum, edgeLoads, edgeAccum));\n      }\n\n      unsigned bestHost = -1;\n      double bestScore  = std::numeric_limits<double>::lowest();\n      // find max score\n      for (unsigned i = 0; i < _numHosts; i++) {\n        if (scores[i] >= bestScore) {\n          // galois::gDebug(\"best score \", bestScore, \" beaten by \", scores[i]);\n          bestScore = scores[i];\n          bestHost  = 
i;\n        }\n      }\n\n      galois::gDebug(\"[\", _hostID, \"] \", src, \" assigned to \", bestHost,\n                     \" with num edge \", ne);\n\n      // update metadata; TODO make this a nicer interface\n      galois::atomicAdd(nodeAccum[bestHost], (uint64_t)1);\n      galois::atomicAdd(edgeAccum[bestHost], ne);\n\n      return bestHost;\n    }\n  }\n\n  // Fennel is an edge cut: all edges on source\n  uint32_t getEdgeOwner(uint32_t src, uint32_t, uint64_t) const {\n    return retrieveMaster(src);\n  }\n\n  bool noCommunication() { return false; }\n  // TODO I should be able to make this runtime detectable\n  bool isVertexCut() const { return false; }\n  void serializePartition(boost::archive::binary_oarchive&) {}\n  void deserializePartition(boost::archive::binary_iarchive&) {}\n  std::pair<unsigned, unsigned> cartesianGrid() {\n    return std::make_pair(0u, 0u);\n  }\n};\n\nclass SugarP : public galois::graphs::CustomMasterAssignment {\n  // used in hybrid cut\n  uint32_t _vCutThreshold;\n  // ginger scoring constants\n  double _gamma;\n  double _alpha;\n  // ginger node/edge ratio\n  double _neRatio;\n\n  unsigned numRowHosts;\n  unsigned numColumnHosts;\n\n  void factorizeHosts() {\n    numColumnHosts = sqrt(_numHosts);\n\n    while ((_numHosts % numColumnHosts) != 0)\n      numColumnHosts--;\n\n    numRowHosts = _numHosts / numColumnHosts;\n    assert(numRowHosts >= numColumnHosts);\n\n    if (_hostID == 0) {\n      galois::gPrint(\"Cartesian grid: \", numRowHosts, \" x \", numColumnHosts,\n                     \"\\n\");\n    }\n  }\n\n  //! Returns the grid row ID of this host\n  unsigned gridRowID() const { return (_hostID / numColumnHosts); }\n  //! Returns the grid row ID of the specified host\n  unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); }\n  //! Returns the grid column ID of this host\n  unsigned gridColumnID() const { return (_hostID % numColumnHosts); }\n  //! 
Returns the grid column ID of the specified host\n  unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); }\n\n  //! Find the row of a particular node\n  unsigned getRowOfNode(uint64_t gid) const {\n    return gridRowID(retrieveMaster(gid));\n  }\n\n  //! Find the column of a particular node\n  unsigned getColumnOfNode(uint64_t gid) const {\n    return gridColumnID(retrieveMaster(gid));\n  }\n\n  /**\n   * Returns Ginger's composite balance parameter for a given host\n   */\n  double getCompositeBalanceParam(\n      unsigned host, const std::vector<uint64_t>& nodeLoads,\n      const std::vector<galois::CopyableAtomic<uint64_t>>& nodeAccum,\n      const std::vector<uint64_t>& edgeLoads,\n      const std::vector<galois::CopyableAtomic<uint64_t>>& edgeAccum) {\n    // get node/edge loads\n    uint64_t hostNodeLoad = nodeLoads[host] + nodeAccum[host].load();\n    uint64_t hostEdgeLoad = edgeLoads[host] + edgeAccum[host].load();\n\n    return (hostNodeLoad + (_neRatio * hostEdgeLoad)) / 2;\n  }\n\n  /**\n   * Use FENNEL balance equation to get a score value for partition\n   * scoring\n   */\n  double getFennelBalanceScore(double param) {\n    return _alpha * _gamma * pow(param, _gamma - 1);\n  }\n\npublic:\n  SugarP(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n         uint64_t numEdges)\n      : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes,\n                                               numEdges) {\n    _vCutThreshold = 1000;\n    _gamma         = 1.5;\n    _alpha   = numEdges * pow(numHosts, _gamma - 1.0) / pow(numNodes, _gamma);\n    _neRatio = (double)numNodes / (double)numEdges;\n    // CVC things\n    factorizeHosts();\n  }\n\n  template <typename EdgeTy>\n  uint32_t getMaster(uint32_t src,\n                     galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                     const std::vector<uint32_t>& localNodeToMaster,\n                     std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n         
            const std::vector<uint64_t>& nodeLoads,\n                     std::vector<galois::CopyableAtomic<uint64_t>>& nodeAccum,\n                     const std::vector<uint64_t>& edgeLoads,\n                     std::vector<galois::CopyableAtomic<uint64_t>>& edgeAccum) {\n    auto ii = bufGraph.edgeBegin(src);\n    auto ee = bufGraph.edgeEnd(src);\n    // number of edges\n    uint64_t ne = std::distance(ii, ee);\n\n    // high degree nodes masters stay the same\n    if (ne > _vCutThreshold) {\n      return _hostID;\n    } else {\n      // low degree masters move based on augmented FENNEL scoring metric\n      // initialize array to hold scores\n      galois::PODResizeableArray<double> scores;\n      scores.resize(_numHosts);\n      for (unsigned i = 0; i < _numHosts; i++) {\n        scores[i] = 0.0;\n      }\n\n      for (; ii < ee; ++ii) {\n        uint64_t dst         = bufGraph.edgeDestination(*ii);\n        size_t offsetIntoMap = (unsigned)-1;\n\n        auto it = gid2offsets.find(dst);\n        if (it != gid2offsets.end()) {\n          offsetIntoMap = it->second;\n        } else {\n          // determine offset\n          offsetIntoMap = dst - bufGraph.getNodeOffset();\n        }\n\n        assert(offsetIntoMap != (unsigned)-1);\n        assert(offsetIntoMap < localNodeToMaster.size());\n\n        unsigned currentAssignment = localNodeToMaster[offsetIntoMap];\n\n        if (currentAssignment != (unsigned)-1) {\n          scores[currentAssignment] += 1.0;\n        } else {\n          // galois::gDebug(\"[\", _hostID, \"] \", dst, \" unassigned\");\n        }\n      }\n\n      // subtraction of the composite balance term\n      for (unsigned i = 0; i < _numHosts; i++) {\n        scores[i] -= getFennelBalanceScore(getCompositeBalanceParam(\n            i, nodeLoads, nodeAccum, edgeLoads, edgeAccum));\n      }\n\n      unsigned bestHost = -1;\n      double bestScore  = std::numeric_limits<double>::lowest();\n      // find max score\n      for (unsigned i = 0; 
i < _numHosts; i++) {\n        if (scores[i] >= bestScore) {\n          // galois::gDebug(\"best score \", bestScore, \" beaten by \", scores[i]);\n          bestScore = scores[i];\n          bestHost  = i;\n        }\n      }\n\n      galois::gDebug(\"[\", _hostID, \"] \", src, \" assigned to \", bestHost,\n                     \" with num edge \", ne);\n\n      // update metadata; TODO make this a nicer interface\n      galois::atomicAdd(nodeAccum[bestHost], (uint64_t)1);\n      galois::atomicAdd(edgeAccum[bestHost], ne);\n\n      return bestHost;\n    }\n  }\n\n  /**\n   * return owner of edge using cartesian edge owner determination\n   */\n  uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t) const {\n    unsigned blockedRowOffset   = getRowOfNode(src) * numColumnHosts;\n    unsigned cyclicColumnOffset = getColumnOfNode(dst);\n    return blockedRowOffset + cyclicColumnOffset;\n  }\n\n  bool noCommunication() { return false; }\n  bool isVertexCut() const {\n    if ((numRowHosts == 1) || (numColumnHosts == 1))\n      return false;\n    return true;\n  }\n\n  void serializePartition(boost::archive::binary_oarchive& ar) {\n    ar << numRowHosts;\n    ar << numColumnHosts;\n  }\n\n  void deserializePartition(boost::archive::binary_iarchive& ar) {\n    ar >> numRowHosts;\n    ar >> numColumnHosts;\n  }\n\n  std::pair<unsigned, unsigned> cartesianGrid() {\n    return std::make_pair(numRowHosts, numColumnHosts);\n  }\n};\n\nclass SugarColumnFlipP : public galois::graphs::CustomMasterAssignment {\n  // used in hybrid cut\n  uint32_t _vCutThreshold;\n  // ginger scoring constants\n  double _gamma;\n  double _alpha;\n  // ginger node/edge ratio\n  double _neRatio;\n\n  unsigned numRowHosts;\n  unsigned numColumnHosts;\n\n  void factorizeHosts() {\n    numColumnHosts = sqrt(_numHosts);\n\n    while ((_numHosts % numColumnHosts) != 0)\n      numColumnHosts--;\n\n    numRowHosts = _numHosts / numColumnHosts;\n    assert(numRowHosts >= numColumnHosts);\n\n    // 
column flip\n    std::swap(numRowHosts, numColumnHosts);\n\n    if (_hostID == 0) {\n      galois::gPrint(\"Cartesian grid: \", numRowHosts, \" x \", numColumnHosts,\n                     \"\\n\");\n    }\n  }\n\n  //! Returns the grid row ID of this host\n  unsigned gridRowID() const { return (_hostID / numColumnHosts); }\n  //! Returns the grid row ID of the specified host\n  unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); }\n  //! Returns the grid column ID of this host\n  unsigned gridColumnID() const { return (_hostID % numColumnHosts); }\n  //! Returns the grid column ID of the specified host\n  unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); }\n\n  //! Find the row of a particular node\n  unsigned getRowOfNode(uint64_t gid) const {\n    return gridRowID(retrieveMaster(gid));\n  }\n\n  //! Find the column of a particular node\n  unsigned getColumnOfNode(uint64_t gid) const {\n    return gridColumnID(retrieveMaster(gid));\n  }\n\n  /**\n   * Returns Ginger's composite balance parameter for a given host\n   */\n  double getCompositeBalanceParam(\n      unsigned host, const std::vector<uint64_t>& nodeLoads,\n      const std::vector<galois::CopyableAtomic<uint64_t>>& nodeAccum,\n      const std::vector<uint64_t>& edgeLoads,\n      const std::vector<galois::CopyableAtomic<uint64_t>>& edgeAccum) {\n    // get node/edge loads\n    uint64_t hostNodeLoad = nodeLoads[host] + nodeAccum[host].load();\n    uint64_t hostEdgeLoad = edgeLoads[host] + edgeAccum[host].load();\n\n    return (hostNodeLoad + (_neRatio * hostEdgeLoad)) / 2;\n  }\n\n  /**\n   * Use FENNEL balance equation to get a score value for partition\n   * scoring\n   */\n  double getFennelBalanceScore(double param) {\n    return _alpha * _gamma * pow(param, _gamma - 1);\n  }\n\npublic:\n  SugarColumnFlipP(uint32_t hostID, uint32_t numHosts, uint64_t numNodes,\n                   uint64_t numEdges)\n      : galois::graphs::CustomMasterAssignment(hostID, numHosts, 
numNodes,\n                                               numEdges) {\n    _vCutThreshold = 1000;\n    _gamma         = 1.5;\n    _alpha   = numEdges * pow(numHosts, _gamma - 1.0) / pow(numNodes, _gamma);\n    _neRatio = (double)numNodes / (double)numEdges;\n    // CVC things\n    factorizeHosts();\n  }\n\n  template <typename EdgeTy>\n  uint32_t getMaster(uint32_t src,\n                     galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                     const std::vector<uint32_t>& localNodeToMaster,\n                     std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n                     const std::vector<uint64_t>& nodeLoads,\n                     std::vector<galois::CopyableAtomic<uint64_t>>& nodeAccum,\n                     const std::vector<uint64_t>& edgeLoads,\n                     std::vector<galois::CopyableAtomic<uint64_t>>& edgeAccum) {\n    auto ii = bufGraph.edgeBegin(src);\n    auto ee = bufGraph.edgeEnd(src);\n    // number of edges\n    uint64_t ne = std::distance(ii, ee);\n\n    // high degree nodes masters stay the same\n    if (ne > _vCutThreshold) {\n      return _hostID;\n    } else {\n      // low degree masters move based on augmented FENNEL scoring metric\n      // initialize array to hold scores\n      galois::PODResizeableArray<double> scores;\n      scores.resize(_numHosts);\n      for (unsigned i = 0; i < _numHosts; i++) {\n        scores[i] = 0.0;\n      }\n\n      for (; ii < ee; ++ii) {\n        uint64_t dst         = bufGraph.edgeDestination(*ii);\n        size_t offsetIntoMap = (unsigned)-1;\n\n        auto it = gid2offsets.find(dst);\n        if (it != gid2offsets.end()) {\n          offsetIntoMap = it->second;\n        } else {\n          // determine offset\n          offsetIntoMap = dst - bufGraph.getNodeOffset();\n        }\n\n        assert(offsetIntoMap != (unsigned)-1);\n        assert(offsetIntoMap < localNodeToMaster.size());\n\n        unsigned currentAssignment = localNodeToMaster[offsetIntoMap];\n\n       
 if (currentAssignment != (unsigned)-1) {\n          scores[currentAssignment] += 1.0;\n        } else {\n          galois::gDebug(\"[\", _hostID, \"] \", dst, \" unassigned\");\n        }\n      }\n\n      // subtraction of the composite balance term\n      for (unsigned i = 0; i < _numHosts; i++) {\n        scores[i] -= getFennelBalanceScore(getCompositeBalanceParam(\n            i, nodeLoads, nodeAccum, edgeLoads, edgeAccum));\n      }\n\n      unsigned bestHost = -1;\n      double bestScore  = std::numeric_limits<double>::lowest();\n      // find max score\n      for (unsigned i = 0; i < _numHosts; i++) {\n        if (scores[i] >= bestScore) {\n          // galois::gDebug(\"best score \", bestScore, \" beaten by \", scores[i]);\n          bestScore = scores[i];\n          bestHost  = i;\n        }\n      }\n\n      galois::gDebug(\"[\", _hostID, \"] \", src, \" assigned to \", bestHost,\n                     \" with num edge \", ne);\n\n      // update metadata; TODO make this a nicer interface\n      galois::atomicAdd(nodeAccum[bestHost], (uint64_t)1);\n      galois::atomicAdd(edgeAccum[bestHost], ne);\n\n      return bestHost;\n    }\n  }\n\n  /**\n   * return owner of edge using cartesian edge owner determination\n   */\n  uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t) const {\n    unsigned blockedRowOffset   = getRowOfNode(src) * numColumnHosts;\n    unsigned cyclicColumnOffset = getColumnOfNode(dst);\n    return blockedRowOffset + cyclicColumnOffset;\n  }\n\n  bool noCommunication() { return false; }\n  bool isVertexCut() const {\n    if ((numRowHosts == 1) && (numColumnHosts == 1))\n      return false;\n    return true;\n  }\n  void serializePartition(boost::archive::binary_oarchive& ar) {\n    ar << numRowHosts;\n    ar << numColumnHosts;\n  }\n  void deserializePartition(boost::archive::binary_iarchive& ar) {\n    ar >> numRowHosts;\n    ar >> numColumnHosts;\n  }\n\n  std::pair<unsigned, unsigned> cartesianGrid() {\n    return 
std::make_pair(numRowHosts, numColumnHosts);\n  }\n};\n\n#endif\n"
  },
  {
    "path": "libcusp/include/galois/graphs/MiningPartitioner.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file MiningPartitioner.h\n *\n * Graph mining partitioning that duplicates edges. Currently only supports an\n * outgoing edge cut.\n *\n * TODO lots of code duplication here with regular cusp partitioner; need to\n * merge\n */\n\n#ifndef _GALOIS_DIST_MINING_H\n#define _GALOIS_DIST_MINING_H\n\n#include \"galois/graphs/DistributedGraph.h\"\n#include \"galois/DReducible.h\"\n\nnamespace galois {\nnamespace graphs {\n/**\n * @tparam NodeTy type of node data for the graph\n * @tparam EdgeTy type of edge data for the graph\n *\n * @todo fully document and clean up code\n * @warning not meant for public use + not fully documented yet\n */\ntemplate <typename NodeTy, typename EdgeTy, typename Partitioner>\nclass MiningGraph : public DistGraph<NodeTy, EdgeTy> {\n  //! 
size used to buffer edge sends during partitioning\n  constexpr static unsigned edgePartitionSendBufSize = 8388608;\n  constexpr static const char* const GRNAME          = \"dGraph_Mining\";\n  std::unique_ptr<Partitioner> graphPartitioner;\n\n  uint32_t G2LEdgeCut(uint64_t gid, uint32_t globalOffset) const {\n    assert(base_DistGraph::isLocal(gid));\n    // optimized for edge cuts\n    if (gid >= globalOffset && gid < globalOffset + base_DistGraph::numOwned)\n      return gid - globalOffset;\n\n    return base_DistGraph::globalToLocalMap.at(gid);\n  }\n\n  /**\n   * Free memory of a vector by swapping an empty vector with it\n   */\n  template <typename V>\n  void freeVector(V& vectorToKill) {\n    V dummyVector;\n    vectorToKill.swap(dummyVector);\n  }\n\n  uint32_t nodesToReceive;\n\n  uint64_t myKeptEdges;\n  uint64_t myReadEdges;\n  uint64_t globalKeptEdges;\n  uint64_t totalEdgeProxies;\n\n  std::vector<std::vector<size_t>> mirrorEdges;\n  std::unordered_map<uint64_t, uint64_t> localEdgeGIDToLID;\n\n  std::vector<uint64_t> getNodeDegrees(const std::string filename,\n                                       uint32_t numNodes) {\n    std::vector<uint64_t> nodeDegrees;\n    nodeDegrees.resize(numNodes);\n\n    // read in prefix sum from GR on disk\n    std::ifstream graphFile(filename.c_str());\n    graphFile.seekg(sizeof(uint64_t) * 4);\n\n    uint64_t* outIndexBuffer = (uint64_t*)malloc(sizeof(uint64_t) * numNodes);\n    if (outIndexBuffer == nullptr) {\n      GALOIS_DIE(\"out of memory\");\n    }\n    uint64_t numBytesToLoad = numNodes * sizeof(uint64_t);\n    uint64_t bytesRead      = 0;\n\n    while (numBytesToLoad > 0) {\n      graphFile.read(((char*)outIndexBuffer) + bytesRead, numBytesToLoad);\n      size_t numRead = graphFile.gcount();\n      numBytesToLoad -= numRead;\n      bytesRead += numRead;\n    }\n    assert(numBytesToLoad == 0);\n\n    galois::do_all(\n        galois::iterate(0u, numNodes),\n        [&](unsigned n) {\n          if (n != 0) {\n  
          nodeDegrees[n] = outIndexBuffer[n] - outIndexBuffer[n - 1];\n          } else {\n            nodeDegrees[n] = outIndexBuffer[0];\n          }\n          // galois::gDebug(n, \" degree \", nodeDegrees[n]);\n        },\n        galois::loopname(\"GetNodeDegrees\"), galois::no_stats());\n    free(outIndexBuffer);\n\n#ifndef NDEBUG\n    if (base_DistGraph::id == 0) {\n      galois::gDebug(\"Sanity checking node degrees\");\n    }\n\n    galois::GAccumulator<uint64_t> edgeCount;\n    galois::do_all(\n        galois::iterate(0u, numNodes),\n        [&](unsigned n) { edgeCount += nodeDegrees[n]; },\n        galois::loopname(\"SanityCheckDegrees\"), galois::no_stats());\n    GALOIS_ASSERT(edgeCount.reduce() == base_DistGraph::numGlobalEdges);\n#endif\n\n    return nodeDegrees;\n  }\n\n  virtual unsigned getHostIDImpl(uint64_t gid) const {\n    assert(gid < base_DistGraph::numGlobalNodes);\n    return graphPartitioner->retrieveMaster(gid);\n  }\n\n  virtual bool isOwnedImpl(uint64_t gid) const {\n    assert(gid < base_DistGraph::numGlobalNodes);\n    return (graphPartitioner->retrieveMaster(gid) == base_DistGraph::id);\n  }\n\n  virtual bool isLocalImpl(uint64_t gid) const {\n    assert(gid < base_DistGraph::numGlobalNodes);\n    return (base_DistGraph::globalToLocalMap.find(gid) !=\n            base_DistGraph::globalToLocalMap.end());\n  }\n\n  virtual bool isVertexCutImpl() const { return false; }\n\npublic:\n  //! typedef for base DistGraph class\n  using base_DistGraph = DistGraph<NodeTy, EdgeTy>;\n\n  /**\n   * Returns edges owned by this graph (i.e. 
read).\n   */\n  uint64_t numOwnedEdges() const { return myKeptEdges; }\n\n  /**\n   * Returns # edges kept in all graphs.\n   */\n  uint64_t globalEdges() const { return globalKeptEdges; }\n\n  std::vector<std::vector<size_t>>& getMirrorEdges() { return mirrorEdges; }\n\n  /**\n   * Return the reader of a particular node.\n   * @param gid GID of node to get reader of\n   * @return Host reader of node passed in as param\n   */\n  unsigned getHostReader(uint64_t gid) const {\n    for (auto i = 0U; i < base_DistGraph::numHosts; ++i) {\n      uint64_t start, end;\n      std::tie(start, end) = base_DistGraph::gid2host[i];\n      if (gid >= start && gid < end) {\n        return i;\n      }\n    }\n    return -1;\n  }\n\n  /**\n   * Constructor\n   */\n  MiningGraph(\n      const std::string& filename, unsigned host, unsigned _numHosts,\n      bool setupGluon = true, bool doSort = false,\n      galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS,\n      uint32_t nodeWeight = 0, uint32_t edgeWeight = 0)\n      : base_DistGraph(host, _numHosts) {\n    galois::runtime::reportParam(GRNAME, \"MiningGraph\", \"0\");\n    galois::CondStatTimer<MORE_DIST_STATS> Tgraph_construct(\n        \"GraphPartitioningTime\", GRNAME);\n    Tgraph_construct.start();\n\n    ////////////////////////////////////////////////////////////////////////////\n\n    galois::graphs::OfflineGraph g(filename);\n    base_DistGraph::numGlobalNodes = g.size();\n    base_DistGraph::numGlobalEdges = g.sizeEdges();\n    std::vector<unsigned> dummy;\n\n    // not actually getting masters, but getting assigned readers for nodes\n    base_DistGraph::computeMasters(md, g, dummy, nodeWeight, edgeWeight);\n\n    std::vector<uint64_t> ndegrees;\n\n    if (Partitioner::needNodeDegrees()) {\n      if (base_DistGraph::id == 0) {\n        galois::gInfo(\"Calculating node degrees for partitioner\");\n      }\n\n      galois::runtime::reportParam(GRNAME, \"UsingDegreeOrdering\", \"1\");\n      ndegrees = 
getNodeDegrees(filename, base_DistGraph::numGlobalNodes);\n    }\n\n    graphPartitioner = std::make_unique<Partitioner>(\n        host, _numHosts, base_DistGraph::numGlobalNodes,\n        base_DistGraph::numGlobalEdges, ndegrees);\n    graphPartitioner->saveGIDToHost(base_DistGraph::gid2host);\n\n    ////////////////////////////////////////////////////////////////////////////\n\n    uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first;\n    typename galois::graphs::OfflineGraph::edge_iterator edgeBegin =\n        g.edge_begin(nodeBegin);\n    uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second;\n    typename galois::graphs::OfflineGraph::edge_iterator edgeEnd =\n        g.edge_begin(nodeEnd);\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Starting graph reading.\\n\");\n    // never read edge data from disk\n    galois::graphs::BufferedGraph<void> bufGraph;\n    bufGraph.resetReadCounters();\n    galois::StatTimer graphReadTimer(\"GraphReading\", GRNAME);\n    graphReadTimer.start();\n    bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin,\n                              *edgeEnd, base_DistGraph::numGlobalNodes,\n                              base_DistGraph::numGlobalEdges);\n    graphReadTimer.stop();\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Reading graph complete.\\n\");\n\n    ////////////////////////////////////////////////////////////////////////////\n\n    galois::StatTimer inspectionTimer(\"EdgeInspection\", GRNAME);\n    inspectionTimer.start();\n    bufGraph.resetReadCounters();\n    galois::gstl::Vector<uint64_t> prefixSumOfEdges;\n    base_DistGraph::numOwned = nodeEnd - nodeBegin;\n    prefixSumOfEdges.resize(base_DistGraph::numOwned);\n\n    // initial pass; set up lid-gid mappings, determine which proxies exist on\n    // this host; prefix sum of edges can be set up up to the last master\n    // node\n    galois::DynamicBitSet presentProxies =\n        edgeInspectionRound1(bufGraph, 
prefixSumOfEdges);\n    // set my read nodes on present proxies\n    // TODO parallel?\n    for (uint64_t i = nodeBegin; i < nodeEnd; i++) {\n      presentProxies.set(i);\n    }\n\n    // vector to store bitsets received from other hosts\n    std::vector<galois::DynamicBitSet> proxiesOnOtherHosts;\n    proxiesOnOtherHosts.resize(_numHosts);\n\n    // send off mirror proxies that exist on this host to other hosts\n    communicateProxyInfo(presentProxies, proxiesOnOtherHosts);\n\n    // signifies how many outgoing edges a particular host should expect from\n    // this host\n    std::vector<std::vector<uint64_t>> numOutgoingEdges;\n    numOutgoingEdges.resize(base_DistGraph::numHosts);\n    // edge inspection phase 2: determine how many edges to send to each host\n    // don't actually send yet\n    edgeInspectionRound2(bufGraph, numOutgoingEdges, proxiesOnOtherHosts);\n\n    // prefix sum finalization\n    finalizePrefixSum(numOutgoingEdges, prefixSumOfEdges);\n\n    // doubly make sure the data is cleared\n    freeVector(numOutgoingEdges); // should no longer use this variable\n    inspectionTimer.stop();\n\n    ////////////////////////////////////////////////////////////////////////////\n\n    galois::StatTimer allocationTimer(\"GraphAllocation\", GRNAME);\n    allocationTimer.start();\n\n    // Graph construction related calls\n    base_DistGraph::beginMaster = 0;\n    // Allocate and construct the graph\n    base_DistGraph::graph.allocateFrom(base_DistGraph::numNodes,\n                                       base_DistGraph::numEdges);\n    base_DistGraph::graph.constructNodes();\n\n    // edge end fixing\n    auto& base_graph = base_DistGraph::graph;\n    galois::do_all(\n        galois::iterate((uint32_t)0, base_DistGraph::numNodes),\n        [&](uint64_t n) { base_graph.fixEndEdge(n, prefixSumOfEdges[n]); },\n#if MORE_DIST_STATS\n        galois::loopname(\"FixEndEdgeLoop\"),\n#endif\n        galois::no_stats());\n    // get memory from prefix sum back\n    
prefixSumOfEdges.clear();\n    freeVector(prefixSumOfEdges); // should no longer use this variable\n\n    allocationTimer.stop();\n\n    ////////////////////////////////////////////////////////////////////////////\n\n    if (setupGluon) {\n      galois::CondStatTimer<MORE_DIST_STATS> TfillMirrors(\"FillMirrors\",\n                                                          GRNAME);\n\n      TfillMirrors.start();\n      fillMirrors();\n      TfillMirrors.stop();\n    }\n\n    ////////////////////////////////////////////////////////////////////////////\n\n    loadEdges(base_DistGraph::graph, bufGraph, proxiesOnOtherHosts);\n    // TODO this might be useful to keep around\n    proxiesOnOtherHosts.clear();\n    ndegrees.clear();\n\n    // SORT EDGES\n    if (doSort) {\n      base_DistGraph::sortEdgesByDestination();\n    }\n\n    if (setupGluon) {\n      galois::CondStatTimer<MORE_DIST_STATS> TfillMirrorsEdges(\n          \"FillMirrorsEdges\", GRNAME);\n      TfillMirrorsEdges.start();\n      // edges\n      mirrorEdges.resize(base_DistGraph::numHosts);\n      galois::gPrint(\"[\", base_DistGraph::id,\n                     \"] Filling mirrors and creating \"\n                     \"mirror map\\n\");\n      fillMirrorsEdgesAndCreateMirrorMap();\n      TfillMirrorsEdges.stop();\n    }\n\n    ////////////////////////////////////////////////////////////////////////////\n\n    galois::CondStatTimer<MORE_DIST_STATS> Tthread_ranges(\"ThreadRangesTime\",\n                                                          GRNAME);\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Determining thread ranges\\n\");\n\n    Tthread_ranges.start();\n    base_DistGraph::determineThreadRanges();\n    base_DistGraph::determineThreadRangesMaster();\n    base_DistGraph::determineThreadRangesWithEdges();\n    base_DistGraph::initializeSpecificRanges();\n    Tthread_ranges.stop();\n\n    Tgraph_construct.stop();\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Graph construction 
complete.\\n\");\n\n    galois::DGAccumulator<uint64_t> accumer;\n    accumer.reset();\n    accumer += base_DistGraph::sizeEdges();\n    totalEdgeProxies = accumer.reduce();\n\n    uint64_t totalNodeProxies;\n    accumer.reset();\n    accumer += base_DistGraph::size();\n    totalNodeProxies = accumer.reduce();\n\n    // report some statistics\n    if (base_DistGraph::id == 0) {\n      galois::runtime::reportStat_Single(\n          GRNAME, std::string(\"TotalNodeProxies\"), totalNodeProxies);\n      galois::runtime::reportStat_Single(\n          GRNAME, std::string(\"TotalEdgeProxies\"), totalEdgeProxies);\n      galois::runtime::reportStat_Single(GRNAME,\n                                         std::string(\"OriginalNumberEdges\"),\n                                         base_DistGraph::globalSizeEdges());\n      galois::runtime::reportStat_Single(GRNAME, std::string(\"TotalKeptEdges\"),\n                                         globalKeptEdges);\n      GALOIS_ASSERT(globalKeptEdges * 2 == base_DistGraph::globalSizeEdges());\n      galois::runtime::reportStat_Single(\n          GRNAME, std::string(\"ReplicationFactorNodes\"),\n          (totalNodeProxies) / (double)base_DistGraph::globalSize());\n      galois::runtime::reportStat_Single(\n          GRNAME, std::string(\"ReplicatonFactorEdges\"),\n          (totalEdgeProxies) / (double)globalKeptEdges);\n    }\n  }\n\nprivate:\n  galois::DynamicBitSet\n  edgeInspectionRound1(galois::graphs::BufferedGraph<void>& bufGraph,\n                       galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    galois::DynamicBitSet incomingMirrors;\n    incomingMirrors.resize(base_DistGraph::numGlobalNodes);\n    incomingMirrors.reset();\n\n    uint32_t myID         = base_DistGraph::id;\n    uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first;\n\n    // already set before this is called\n    base_DistGraph::localToGlobalVector.resize(base_DistGraph::numOwned);\n\n    galois::DGAccumulator<uint64_t> 
keptEdges;\n    keptEdges.reset();\n\n    galois::GAccumulator<uint64_t> allEdges;\n    allEdges.reset();\n\n    auto& ltgv = base_DistGraph::localToGlobalVector;\n    galois::do_all(\n        galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,\n                        base_DistGraph::gid2host[base_DistGraph::id].second),\n        [&](size_t n) {\n          uint64_t edgeCount = 0;\n          auto ii            = bufGraph.edgeBegin(n);\n          auto ee            = bufGraph.edgeEnd(n);\n          allEdges += std::distance(ii, ee);\n          for (; ii < ee; ++ii) {\n            uint32_t dst = bufGraph.edgeDestination(*ii);\n\n            if (graphPartitioner->keepEdge(n, dst)) {\n              edgeCount++;\n              keptEdges += 1;\n              // which mirrors do I have\n              if (graphPartitioner->retrieveMaster(dst) != myID) {\n                incomingMirrors.set(dst);\n              }\n            }\n          }\n          prefixSumOfEdges[n - globalOffset] = edgeCount;\n          ltgv[n - globalOffset]             = n;\n        },\n#if MORE_DIST_STATS\n        galois::loopname(\"EdgeInspectionLoop\"),\n#endif\n        galois::steal(), galois::no_stats());\n\n    myKeptEdges     = keptEdges.read_local();\n    myReadEdges     = allEdges.reduce();\n    globalKeptEdges = keptEdges.reduce();\n\n    // get incoming mirrors ready for creation\n    uint32_t additionalMirrorCount = incomingMirrors.count();\n    base_DistGraph::localToGlobalVector.resize(\n        base_DistGraph::localToGlobalVector.size() + additionalMirrorCount);\n\n    // note prefix sum will get finalized in a later step\n    if (base_DistGraph::numOwned > 0) {\n      prefixSumOfEdges.resize(prefixSumOfEdges.size() + additionalMirrorCount,\n                              0);\n    } else {\n      prefixSumOfEdges.resize(additionalMirrorCount, 0);\n    }\n\n    // map creation: lid to gid\n    if (additionalMirrorCount > 0) {\n      uint32_t totalNumNodes = 
base_DistGraph::numGlobalNodes;\n      uint32_t activeThreads = galois::getActiveThreads();\n      std::vector<uint64_t> threadPrefixSums(activeThreads);\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n        size_t beginNode;\n        size_t endNode;\n        std::tie(beginNode, endNode) =\n            galois::block_range(0u, totalNumNodes, tid, nthreads);\n        uint64_t count = 0;\n        for (size_t i = beginNode; i < endNode; i++) {\n          if (incomingMirrors.test(i))\n            ++count;\n        }\n        threadPrefixSums[tid] = count;\n      });\n      // get prefix sums\n      for (unsigned int i = 1; i < threadPrefixSums.size(); i++) {\n        threadPrefixSums[i] += threadPrefixSums[i - 1];\n      }\n\n      assert(threadPrefixSums.back() == additionalMirrorCount);\n\n      uint32_t startingNodeIndex = base_DistGraph::numOwned;\n      // do actual work, second on_each\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n        size_t beginNode;\n        size_t endNode;\n        std::tie(beginNode, endNode) =\n            galois::block_range(0u, totalNumNodes, tid, nthreads);\n        // start location to start adding things into prefix sums/vectors\n        uint32_t threadStartLocation = 0;\n        if (tid != 0) {\n          threadStartLocation = threadPrefixSums[tid - 1];\n        }\n        uint32_t handledNodes = 0;\n        for (size_t i = beginNode; i < endNode; i++) {\n          if (incomingMirrors.test(i)) {\n            base_DistGraph::localToGlobalVector[startingNodeIndex +\n                                                threadStartLocation +\n                                                handledNodes] = i;\n            handledNodes++;\n          }\n        }\n      });\n    }\n\n    base_DistGraph::numNodes = base_DistGraph::numOwned + additionalMirrorCount;\n    base_DistGraph::numNodesWithEdges = base_DistGraph::numNodes;\n    assert(base_DistGraph::localToGlobalVector.size() ==\n           
base_DistGraph::numNodes);\n\n    // g2l mapping\n    base_DistGraph::globalToLocalMap.reserve(base_DistGraph::numNodes);\n    for (unsigned i = 0; i < base_DistGraph::numNodes; i++) {\n      // global to local map construction\n      base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[i]] =\n          i;\n    }\n    assert(base_DistGraph::globalToLocalMap.size() == base_DistGraph::numNodes);\n\n    return incomingMirrors;\n  }\n\n  /**\n   * Communicate to other hosts which proxies exist on this host.\n   *\n   * @param presentProxies Bitset marking which proxies are present on this host\n   * @param proxiesOnOtherHosts Vector to deserialize received bitsets into\n   */\n  void communicateProxyInfo(\n      galois::DynamicBitSet& presentProxies,\n      std::vector<galois::DynamicBitSet>& proxiesOnOtherHosts) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    // Send proxies on this host to other hosts\n    for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) {\n      if (h != base_DistGraph::id) {\n        galois::runtime::SendBuffer bitsetBuffer;\n        galois::runtime::gSerialize(bitsetBuffer, presentProxies);\n        net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer);\n      }\n    }\n\n    // receive loop\n    for (unsigned h = 0; h < net.Num - 1; h++) {\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n      uint32_t sendingHost = p->first;\n      // deserialize proxiesOnOtherHosts\n      galois::runtime::gDeserialize(p->second,\n                                    proxiesOnOtherHosts[sendingHost]);\n    }\n\n    base_DistGraph::increment_evilPhase();\n  }\n\n  void edgeInspectionRound2(\n      galois::graphs::BufferedGraph<void>& bufGraph,\n      std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n      std::vector<galois::DynamicBitSet>& proxiesOnOtherHosts) {\n    auto& net = 
galois::runtime::getSystemNetworkInterface();\n    // allocate vectors for counting edges that must be sent\n    // number of nodes that this host has read from disk\n    uint32_t numRead = base_DistGraph::gid2host[base_DistGraph::id].second -\n                       base_DistGraph::gid2host[base_DistGraph::id].first;\n    // allocate space for outgoing edges\n    for (uint32_t i = 0; i < base_DistGraph::numHosts; ++i) {\n      numOutgoingEdges[i].assign(numRead, 0);\n    }\n    uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first;\n\n    galois::DynamicBitSet hostHasOutgoing;\n    hostHasOutgoing.resize(base_DistGraph::numHosts);\n    hostHasOutgoing.reset();\n\n    // flip loop order, this can be optimized\n    // for each host, loop over my local nodes\n    galois::do_all(\n        galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,\n                        base_DistGraph::gid2host[base_DistGraph::id].second),\n        [&](size_t n) {\n          auto ii = bufGraph.edgeBegin(n);\n          auto ee = bufGraph.edgeEnd(n);\n\n          for (; ii < ee; ++ii) {\n            uint32_t dst = bufGraph.edgeDestination(*ii);\n            // make sure this edge is going to be kept and not dropped\n            if (graphPartitioner->keepEdge(n, dst)) {\n              for (unsigned h = 0; h < net.Num; h++) {\n                if (h != net.ID) {\n                  if (proxiesOnOtherHosts[h].test(n)) {\n                    // if kept, make sure destination exists on that host\n                    if (proxiesOnOtherHosts[h].test(dst)) {\n                      // if it does, this edge must be duplicated on that host;\n                      // increment count\n                      numOutgoingEdges[h][n - globalOffset] += 1;\n                      hostHasOutgoing.set(h);\n                    }\n                  }\n                }\n              }\n            }\n          }\n        },\n#if MORE_DIST_STATS\n        
galois::loopname(\"EdgeInspectionRound2Loop\"),\n#endif\n        galois::steal(), galois::no_stats());\n\n    // send data off, then receive it\n    sendInspectionData(numOutgoingEdges, hostHasOutgoing);\n    recvInspectionData(numOutgoingEdges);\n    base_DistGraph::increment_evilPhase();\n  }\n\n  /**\n   * Send data out from inspection to other hosts.\n   *\n   * @param[in,out] numOutgoingEdges specifies which nodes on a host will have\n   * outgoing edges\n   * @param[in] hostHasOutgoing bitset tracking which hosts have outgoing\n   * edges from this host\n   */\n  void sendInspectionData(std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n                          galois::DynamicBitSet& hostHasOutgoing) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    galois::GAccumulator<uint64_t> bytesSent;\n    bytesSent.reset();\n\n    for (unsigned h = 0; h < net.Num; h++) {\n      if (h == net.ID) {\n        continue;\n      }\n\n      // send outgoing edges data off to comm partner\n      galois::runtime::SendBuffer b;\n\n      // only send if non-zeros exist\n      if (hostHasOutgoing.test(h)) {\n        galois::runtime::gSerialize(b, 1); // token saying data exists\n        galois::runtime::gSerialize(b, numOutgoingEdges[h]);\n      } else {\n        galois::runtime::gSerialize(b, 0); // token saying no data exists\n      }\n      numOutgoingEdges[h].clear();\n\n      bytesSent.update(b.size());\n\n      // send buffer and free memory\n      net.sendTagged(h, galois::runtime::evilPhase, b);\n      b.getVec().clear();\n    }\n    galois::runtime::reportStat_Tsum(\n        GRNAME, std::string(\"EdgeInspectionBytesSent\"), bytesSent.reduce());\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Inspection sends complete.\\n\");\n  }\n\n  /**\n   * Receive data from inspection from other hosts. 
Processes the incoming\n   * edge bitsets/offsets.\n   *\n   * @param[in,out] numOutgoingEdges specifies which nodes on a host will have\n   * outgoing edges\n   */\n  void\n  recvInspectionData(std::vector<std::vector<uint64_t>>& numOutgoingEdges) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    for (unsigned h = 0; h < net.Num - 1; h++) {\n      // expect data from comm partner back\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n\n      uint32_t sendingHost = p->first;\n\n      // get outgoing edges; first get status var\n      uint32_t outgoingExists = 2;\n      galois::runtime::gDeserialize(p->second, outgoingExists);\n\n      if (outgoingExists == 1) {\n        // actual data sent\n        galois::runtime::gDeserialize(p->second, numOutgoingEdges[sendingHost]);\n      } else if (outgoingExists == 0) {\n        // no data sent; just clear again\n        numOutgoingEdges[sendingHost].clear();\n      } else {\n        GALOIS_DIE(\"unreachable: \", outgoingExists);\n      }\n    }\n\n    galois::gPrint(\"[\", base_DistGraph::id,\n                   \"] Inspection receives complete.\\n\");\n  }\n\n  /**\n   * Take inspection metadata and begin mapping nodes/creating prefix sums,\n   * return the prefix sum.\n   */\n  galois::gstl::Vector<uint64_t>\n  finalizePrefixSum(std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n                    galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    base_DistGraph::numEdges = 0;\n\n    inspectOutgoingNodes(numOutgoingEdges, prefixSumOfEdges);\n    finalizeInspection(prefixSumOfEdges);\n    galois::gDebug(\"[\", base_DistGraph::id,\n                   \"] To receive this many nodes: \", nodesToReceive);\n    galois::gPrint(\"[\", base_DistGraph::id,\n                   \"] Inspection allocation complete.\\n\");\n    return prefixSumOfEdges;\n  }\n\n  /**\n   * 
Outgoing inspection: loop over proxy nodes, determnine if need to receive\n   * edges.\n   */\n  void\n  inspectOutgoingNodes(std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n                       galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    galois::GAccumulator<uint32_t> toReceive;\n    toReceive.reset();\n\n    uint32_t proxyStart = base_DistGraph::numOwned;\n    uint32_t proxyEnd   = base_DistGraph::numNodes;\n    assert(proxyEnd == prefixSumOfEdges.size());\n\n    galois::GAccumulator<uint64_t> edgesToReceive;\n    edgesToReceive.reset();\n\n    // loop over proxy nodes, see if edges need to be sent from another host\n    // by looking at results of edge inspection\n    galois::do_all(\n        galois::iterate(proxyStart, proxyEnd),\n        [&](uint32_t lid) {\n          uint64_t gid = base_DistGraph::localToGlobalVector[lid];\n          assert(gid < base_DistGraph::numGlobalNodes);\n          unsigned hostReader = getHostReader(gid);\n          assert(hostReader < base_DistGraph::numHosts);\n          assert(hostReader != base_DistGraph::id); // self shouldn't be proxy\n\n          uint64_t nodeOffset = base_DistGraph::gid2host[hostReader].first;\n          if (numOutgoingEdges[hostReader].size()) {\n            if (numOutgoingEdges[hostReader][gid - nodeOffset]) {\n              // if this host is going to send me edges, note it for future use\n              prefixSumOfEdges[lid] =\n                  numOutgoingEdges[hostReader][gid - nodeOffset];\n              edgesToReceive += numOutgoingEdges[hostReader][gid - nodeOffset];\n              toReceive += 1;\n            }\n          }\n        },\n        galois::loopname(\"OutgoingNodeInspection\"), galois::steal(),\n        galois::no_stats());\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Need receive \",\n                   edgesToReceive.reduce(), \" edges; self is \", myKeptEdges,\n                   \"\\n\");\n    // get memory back\n    numOutgoingEdges.clear();\n    
nodesToReceive = toReceive.reduce();\n  }\n\n  /**\n   * finalize metadata maps\n   */\n  void finalizeInspection(galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    for (unsigned i = 1; i < base_DistGraph::numNodes; i++) {\n      // finalize prefix sum\n      prefixSumOfEdges[i] += prefixSumOfEdges[i - 1];\n    }\n    if (prefixSumOfEdges.size() != 0) {\n      base_DistGraph::numEdges = prefixSumOfEdges.back();\n    } else {\n      base_DistGraph::numEdges = 0;\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\npublic:\n  galois::GAccumulator<uint64_t> lgMapAccesses;\n  /**\n   * Construct a map from local edge GIDs to LID\n   */\n  void constructLocalEdgeGIDMap() {\n    lgMapAccesses.reset();\n    galois::StatTimer mapConstructTimer(\"GID2LIDMapConstructTimer\", GRNAME);\n    mapConstructTimer.start();\n\n    localEdgeGIDToLID.reserve(base_DistGraph::sizeEdges());\n\n    uint64_t count = 0;\n    for (unsigned src = 0; src < base_DistGraph::size(); src++) {\n      for (auto edge = base_DistGraph::edge_begin(src);\n           edge != base_DistGraph::edge_end(src); edge++) {\n        assert((*edge) == count);\n        unsigned dst      = base_DistGraph::getEdgeDst(edge);\n        uint64_t localGID = getEdgeGIDFromSD(src, dst);\n        // insert into map\n        localEdgeGIDToLID.insert(std::make_pair(localGID, count));\n        count++;\n      }\n    }\n\n    GALOIS_ASSERT(localEdgeGIDToLID.size() == base_DistGraph::sizeEdges());\n    GALOIS_ASSERT(count == base_DistGraph::sizeEdges());\n\n    mapConstructTimer.stop();\n  }\n\n  void reportAccessBefore() {\n    galois::runtime::reportStat_Single(GRNAME, std::string(\"MapAccessesBefore\"),\n                                       lgMapAccesses.reduce());\n  }\n\n  void reportAccess() {\n    galois::runtime::reportStat_Single(GRNAME, std::string(\"MapAccesses\"),\n                                       lgMapAccesses.reduce());\n  }\n\n  /**\n   * checks map 
constructed above to see which local id corresponds\n   * to a node/edge (if it exists)\n   *\n   * assumes map is generated\n   */\n  std::pair<uint64_t, bool> getLIDFromMap(unsigned src, unsigned dst) {\n    lgMapAccesses += 1;\n    // try to find gid in map\n    uint64_t localGID = getEdgeGIDFromSD(src, dst);\n    auto findResult   = localEdgeGIDToLID.find(localGID);\n\n    // return if found, else return a false\n    if (findResult != localEdgeGIDToLID.end()) {\n      return std::make_pair(findResult->second, true);\n    } else {\n      // not found\n      return std::make_pair((uint64_t)-1, false);\n    }\n  }\n\n  uint64_t getEdgeLID(uint64_t gid) {\n    uint64_t sourceNodeGID = edgeGIDToSource(gid);\n    uint64_t sourceNodeLID = base_DistGraph::getLID(sourceNodeGID);\n    uint64_t destNodeLID   = base_DistGraph::getLID(edgeGIDToDest(gid));\n\n    for (auto edge : base_DistGraph::edges(sourceNodeLID)) {\n      uint64_t edgeDst = base_DistGraph::getEdgeDst(edge);\n      if (edgeDst == destNodeLID) {\n        return *edge;\n      }\n    }\n    GALOIS_DIE(\"unreachable\");\n    return (uint64_t)-1;\n  }\n\n  uint32_t findSourceFromEdge(uint64_t lid) {\n    // TODO binary search\n    // uint32_t left = 0;\n    // uint32_t right = base_DistGraph::numNodes;\n    // uint32_t mid = (left + right) / 2;\n\n    for (uint32_t mid = 0; mid < base_DistGraph::numNodes; mid++) {\n      uint64_t edge_left  = *(base_DistGraph::edge_begin(mid));\n      uint64_t edge_right = *(base_DistGraph::edge_begin(mid + 1));\n\n      if (edge_left <= lid && lid < edge_right) {\n        return mid;\n      }\n    }\n\n    GALOIS_DIE(\"unreachable\");\n    return (uint32_t)-1;\n  }\n\n  uint64_t getEdgeGID(uint64_t lid) {\n    uint32_t src = base_DistGraph::getGID(findSourceFromEdge(lid));\n    uint32_t dst = base_DistGraph::getGID(base_DistGraph::getEdgeDst(lid));\n    return getEdgeGIDFromSD(src, dst);\n  }\n\nprivate:\n  // https://www.quora.com/\n  // 
Is-there-a-mathematical-function-that-converts-two-numbers-into-one-so-\n  // that-the-two-numbers-can-always-be-extracted-again\n  // GLOBAL IDS ONLY\n  uint64_t getEdgeGIDFromSD(uint32_t source, uint32_t dest) {\n    return source + (dest % base_DistGraph::numGlobalNodes) *\n                        base_DistGraph::numGlobalNodes;\n  }\n\n  uint64_t edgeGIDToSource(uint64_t gid) {\n    return gid % base_DistGraph::numGlobalNodes;\n  }\n\n  uint64_t edgeGIDToDest(uint64_t gid) {\n    // assuming this floors\n    return gid / base_DistGraph::numGlobalNodes;\n  }\n\n  /**\n   * Fill up mirror arrays.\n   * TODO make parallel?\n   */\n  void fillMirrors() {\n    base_DistGraph::mirrorNodes.reserve(base_DistGraph::numNodes -\n                                        base_DistGraph::numOwned);\n    for (uint32_t i = base_DistGraph::numOwned; i < base_DistGraph::numNodes;\n         i++) {\n      uint32_t globalID = base_DistGraph::localToGlobalVector[i];\n      base_DistGraph::mirrorNodes[graphPartitioner->retrieveMaster(globalID)]\n          .push_back(globalID);\n    }\n  }\n\n  void fillMirrorsEdgesAndCreateMirrorMap() {\n    for (uint32_t src = base_DistGraph::numOwned;\n         src < base_DistGraph::numNodes; src++) {\n      auto ee               = base_DistGraph::edge_begin(src);\n      auto ee_end           = base_DistGraph::edge_end(src);\n      uint32_t globalSource = base_DistGraph::getGID(src);\n      unsigned sourceOwner  = graphPartitioner->retrieveMaster(globalSource);\n\n      for (; ee != ee_end; ++ee) {\n        // create mirror array\n        uint64_t edgeGID = getEdgeGIDFromSD(\n            globalSource,\n            base_DistGraph::getGID(base_DistGraph::getEdgeDst(ee)));\n        mirrorEdges[sourceOwner].push_back(edgeGID);\n      }\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n\n  template <typename GraphTy>\n  void loadEdges(GraphTy& graph, galois::graphs::BufferedGraph<void>& bufGraph,\n         
        std::vector<galois::DynamicBitSet>& proxiesOnOtherHosts) {\n    galois::StatTimer loadEdgeTimer(\"EdgeLoading\", GRNAME);\n    loadEdgeTimer.start();\n\n    bufGraph.resetReadCounters();\n    std::atomic<uint32_t> receivedNodes;\n    receivedNodes.store(0);\n\n    // sends data\n    sendEdges(graph, bufGraph, receivedNodes, proxiesOnOtherHosts);\n    // uint64_t bufBytesRead = bufGraph.getBytesRead();\n    // get data from graph back (don't need it after sending things out)\n    bufGraph.resetAndFree();\n\n    // receives data\n    galois::on_each(\n        [&](unsigned GALOIS_UNUSED(tid), unsigned GALOIS_UNUSED(nthreads)) {\n          receiveEdges(graph, receivedNodes);\n        });\n    base_DistGraph::increment_evilPhase();\n    loadEdgeTimer.stop();\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Edge loading time: \",\n                   loadEdgeTimer.get_usec() / 1000000.0f, \" seconds\\n\");\n  }\n\n  // no edge data version\n  template <typename GraphTy>\n  void sendEdges(GraphTy& graph, galois::graphs::BufferedGraph<void>& bufGraph,\n                 std::atomic<uint32_t>& receivedNodes,\n                 std::vector<galois::DynamicBitSet>& proxiesOnOtherHosts) {\n    using DstVecType      = std::vector<std::vector<uint64_t>>;\n    using SendBufferVecTy = std::vector<galois::runtime::SendBuffer>;\n\n    galois::substrate::PerThreadStorage<DstVecType> gdst_vecs(\n        base_DistGraph::numHosts);\n    galois::substrate::PerThreadStorage<SendBufferVecTy> sendBuffers(\n        base_DistGraph::numHosts);\n\n    auto& net                = galois::runtime::getSystemNetworkInterface();\n    const unsigned& id       = this->base_DistGraph::id;\n    const unsigned& numHosts = this->base_DistGraph::numHosts;\n\n    galois::GAccumulator<uint64_t> messagesSent;\n    galois::GAccumulator<uint64_t> bytesSent;\n    galois::GReduceMax<uint64_t> maxBytesSent;\n    messagesSent.reset();\n    bytesSent.reset();\n    maxBytesSent.reset();\n\n    // Go over 
assigned nodes and distribute edges.\n    galois::do_all(\n        galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,\n                        base_DistGraph::gid2host[base_DistGraph::id].second),\n        [&](uint64_t src) {\n          uint32_t lsrc    = 0;\n          uint64_t curEdge = 0;\n          if (base_DistGraph::isLocal(src)) {\n            lsrc    = this->G2L(src);\n            curEdge = *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED);\n          }\n\n          auto ee        = bufGraph.edgeBegin(src);\n          auto ee_end    = bufGraph.edgeEnd(src);\n          auto& gdst_vec = *gdst_vecs.getLocal();\n\n          for (unsigned i = 0; i < numHosts; ++i) {\n            gdst_vec[i].clear();\n          }\n\n          for (; ee != ee_end; ++ee) {\n            uint32_t gdst = bufGraph.edgeDestination(*ee);\n            // make sure this edge is going to be kept and not dropped\n            if (graphPartitioner->keepEdge(src, gdst)) {\n              assert(base_DistGraph::isLocal(src));\n              uint32_t ldst = this->G2L(gdst);\n              graph.constructEdge(curEdge++, ldst);\n\n              for (unsigned h = 0; h < net.Num; h++) {\n                if (h != net.ID) {\n                  if (proxiesOnOtherHosts[h].test(src)) {\n                    // if kept, make sure destination exists on that host\n                    if (proxiesOnOtherHosts[h].test(gdst)) {\n                      // if it does, this edge must be duplicated on that host;\n                      // increment count\n                      gdst_vec[h].push_back(gdst);\n                    }\n                  }\n                }\n              }\n            }\n          }\n\n          // make sure all edges accounted for if local\n          if (base_DistGraph::isLocal(src)) {\n            assert(curEdge == (*graph.edge_end(lsrc)));\n          }\n\n          // send\n          for (uint32_t h = 0; h < numHosts; ++h) {\n            if (h == id)\n              
continue;\n\n            if (gdst_vec[h].size() > 0) {\n              auto& b = (*sendBuffers.getLocal())[h];\n              galois::runtime::gSerialize(b, src);\n              galois::runtime::gSerialize(b, gdst_vec[h]);\n\n              // send if over limit\n              if (b.size() > edgePartitionSendBufSize) {\n                messagesSent += 1;\n                bytesSent.update(b.size());\n                maxBytesSent.update(b.size());\n\n                net.sendTagged(h, galois::runtime::evilPhase, b);\n                b.getVec().clear();\n                b.getVec().reserve(edgePartitionSendBufSize * 1.25);\n              }\n            }\n          }\n\n          // overlap receives\n          auto buffer = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n          this->processReceivedEdgeBuffer(buffer, graph, receivedNodes);\n        },\n#if MORE_DIST_STATS\n        galois::loopname(\"EdgeLoading\"),\n#endif\n        galois::steal(), galois::no_stats());\n\n    // flush buffers\n    for (unsigned threadNum = 0; threadNum < sendBuffers.size(); ++threadNum) {\n      auto& sbr = *sendBuffers.getRemote(threadNum);\n      for (unsigned h = 0; h < this->base_DistGraph::numHosts; ++h) {\n        if (h == this->base_DistGraph::id)\n          continue;\n        auto& sendBuffer = sbr[h];\n        if (sendBuffer.size() > 0) {\n          messagesSent += 1;\n          bytesSent.update(sendBuffer.size());\n          maxBytesSent.update(sendBuffer.size());\n\n          net.sendTagged(h, galois::runtime::evilPhase, sendBuffer);\n          sendBuffer.getVec().clear();\n        }\n      }\n    }\n\n    net.flush();\n\n    galois::runtime::reportStat_Tsum(\n        GRNAME, std::string(\"EdgeLoadingMessagesSent\"), messagesSent.reduce());\n    galois::runtime::reportStat_Tsum(\n        GRNAME, std::string(\"EdgeLoadingBytesSent\"), bytesSent.reduce());\n    galois::runtime::reportStat_Tmax(\n        GRNAME, std::string(\"EdgeLoadingMaxBytesSent\"), 
maxBytesSent.reduce());\n  }\n\n  //! @copydoc DistGraphHybridCut::processReceivedEdgeBuffer\n  template <typename GraphTy>\n  void processReceivedEdgeBuffer(\n      std::optional<std::pair<uint32_t, galois::runtime::RecvBuffer>>& buffer,\n      GraphTy& graph, std::atomic<uint32_t>& receivedNodes) {\n    if (buffer) {\n      auto& rb = buffer->second;\n      while (rb.r_size() > 0) {\n        uint64_t n;\n        std::vector<uint64_t> gdst_vec;\n        galois::runtime::gDeserialize(rb, n);\n        galois::runtime::gDeserialize(rb, gdst_vec);\n        assert(base_DistGraph::isLocal(n));\n        uint32_t lsrc = this->G2L(n);\n        uint64_t cur = *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED);\n        uint64_t cur_end = *graph.edge_end(lsrc);\n        assert((cur_end - cur) == gdst_vec.size());\n        deserializeEdges(graph, gdst_vec, cur, cur_end);\n        ++receivedNodes;\n      }\n    }\n  }\n\n  /**\n   * Receive the edge dest/data assigned to this host from other hosts\n   * that were responsible for reading them.\n   */\n  template <typename GraphTy>\n  void receiveEdges(GraphTy& graph, std::atomic<uint32_t>& receivedNodes) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    // receive edges for all mirror nodes\n    while (receivedNodes < nodesToReceive) {\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      processReceivedEdgeBuffer(p, graph, receivedNodes);\n    }\n  }\n\n  template <typename GraphTy>\n  void deserializeEdges(GraphTy& graph, std::vector<uint64_t>& gdst_vec,\n                        uint64_t& cur, uint64_t& cur_end) {\n    uint64_t i = 0;\n    while (cur < cur_end) {\n      uint64_t gdst = gdst_vec[i++];\n      uint32_t ldst = this->G2L(gdst);\n      graph.constructEdge(cur++, ldst);\n    }\n  }\n};\n\n// make GRNAME visible to public\ntemplate <typename NodeTy, typename EdgeTy, typename 
Partitioner>\nconstexpr const char* const\n    galois::graphs::MiningGraph<NodeTy, EdgeTy, Partitioner>::GRNAME;\n\n} // end namespace graphs\n} // end namespace galois\n#endif\n"
  },
  {
    "path": "libcusp/include/galois/graphs/NewGeneric.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2020, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file NewGeneric.h\n *\n * Contains the main graph class as well as the partitioning logic that CuSP\n * uses.\n */\n\n#ifndef _GALOIS_DIST_NEWGENERIC_H\n#define _GALOIS_DIST_NEWGENERIC_H\n\n#include \"galois/graphs/DistributedGraph.h\"\n#include \"galois/DReducible.h\"\n#include <optional>\n#include <sstream>\n\n#define CUSP_PT_TIMER 0\n\nnamespace galois {\nnamespace graphs {\n/**\n * @tparam NodeTy type of node data for the graph\n * @tparam EdgeTy type of edge data for the graph\n *\n * @todo fully document and clean up code\n * @warning not meant for public use + not fully documented yet\n */\ntemplate <typename NodeTy, typename EdgeTy, typename Partitioner>\nclass NewDistGraphGeneric : public DistGraph<NodeTy, EdgeTy> {\n  //! 
size used to buffer edge sends during partitioning\n  constexpr static unsigned edgePartitionSendBufSize = 8388608;\n  constexpr static const char* const GRNAME          = \"dGraph_Generic\";\n  std::unique_ptr<Partitioner> graphPartitioner;\n\n  //! How many rounds to sync state during edge assignment phase\n  uint32_t _edgeStateRounds;\n  std::vector<galois::DGAccumulator<uint64_t>> hostLoads;\n  std::vector<uint64_t> old_hostLoads;\n\n  uint32_t G2LEdgeCut(uint64_t gid, uint32_t globalOffset) const {\n    assert(base_DistGraph::isLocal(gid));\n    // optimized for edge cuts\n    if (gid >= globalOffset && gid < globalOffset + base_DistGraph::numOwned)\n      return gid - globalOffset;\n\n    return base_DistGraph::globalToLocalMap.at(gid);\n  }\n\n  /**\n   * Free memory of a vector by swapping an empty vector with it\n   */\n  template <typename V>\n  void freeVector(V& vectorToKill) {\n    V dummyVector;\n    vectorToKill.swap(dummyVector);\n  }\n\n  uint32_t nodesToReceive;\n\npublic:\n  //! 
typedef for base DistGraph class\n  using base_DistGraph = DistGraph<NodeTy, EdgeTy>;\n\nprivate:\n  virtual unsigned getHostIDImpl(uint64_t gid) const {\n    assert(gid < base_DistGraph::numGlobalNodes);\n    return graphPartitioner->retrieveMaster(gid);\n  }\n\n  virtual bool isOwnedImpl(uint64_t gid) const {\n    assert(gid < base_DistGraph::numGlobalNodes);\n    return (graphPartitioner->retrieveMaster(gid) == base_DistGraph::id);\n  }\n\n  virtual bool isLocalImpl(uint64_t gid) const {\n    assert(gid < base_DistGraph::numGlobalNodes);\n    return (base_DistGraph::globalToLocalMap.find(gid) !=\n            base_DistGraph::globalToLocalMap.end());\n  }\n\n  // TODO currently uses graph partitioner\n  // TODO make it so user doesn't have to specify; can be done by tracking\n  // if an outgoing mirror is marked as having an incoming edge on any\n  // host\n  virtual bool isVertexCutImpl() const {\n    return graphPartitioner->isVertexCut();\n  }\n  virtual std::pair<unsigned, unsigned> cartesianGridImpl() const {\n    return graphPartitioner->cartesianGrid();\n  }\n\npublic:\n  /**\n   * Reset load balance on host reducibles.\n   */\n  void resetEdgeLoad() {\n    if (_edgeStateRounds > 1) {\n      if (!graphPartitioner->noCommunication()) {\n        for (unsigned i = 0; i < base_DistGraph::numHosts; i++) {\n          hostLoads[i].reset();\n          old_hostLoads[i] = 0;\n        }\n      }\n    }\n  }\n\n  /**\n   * Sync load balance on hosts using reducibles.\n   */\n  void syncEdgeLoad() {\n    if (_edgeStateRounds > 1) {\n      if (!graphPartitioner->noCommunication()) {\n        for (unsigned i = 0; i < base_DistGraph::numHosts; i++) {\n          old_hostLoads[i] += hostLoads[i].reduce();\n          hostLoads[i].reset();\n        }\n      }\n    }\n  }\n\n  /**\n   * Debug function: prints host loads.\n   */\n  void printEdgeLoad() {\n    if (_edgeStateRounds > 1) {\n      if (!graphPartitioner->noCommunication()) {\n        if (base_DistGraph::id == 0) {\n    
      for (unsigned i = 0; i < base_DistGraph::numHosts; i++) {\n            galois::gDebug(\"[\", base_DistGraph::id, \"] \", i, \" \",\n                           old_hostLoads[i], \"\\n\");\n          }\n        }\n      }\n    }\n  }\n\n  /**\n   * Constructor\n   */\n  NewDistGraphGeneric(\n      const std::string& filename, unsigned host, unsigned _numHosts,\n      bool cuspAsync = true, uint32_t stateRounds = 100, bool transpose = false,\n      galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS,\n      uint32_t nodeWeight = 0, uint32_t edgeWeight = 0,\n      std::string masterBlockFile = \"\", bool readFromFile = false,\n      std::string localGraphFileName = \"local_graph\",\n      uint32_t edgeStateRounds       = 1)\n      : base_DistGraph(host, _numHosts), _edgeStateRounds(edgeStateRounds) {\n    galois::runtime::reportParam(\"dGraph\", \"GenericPartitioner\", \"0\");\n    galois::CondStatTimer<MORE_DIST_STATS> Tgraph_construct(\n        \"GraphPartitioningTime\", GRNAME);\n    Tgraph_construct.start();\n\n    if (readFromFile) {\n      galois::gPrint(\"[\", base_DistGraph::id,\n                     \"] Reading local graph from file \", localGraphFileName,\n                     \"\\n\");\n      base_DistGraph::read_local_graph_from_file(localGraphFileName);\n      Tgraph_construct.stop();\n      return;\n    }\n\n    galois::graphs::OfflineGraph g(filename);\n    base_DistGraph::numGlobalNodes = g.size();\n    base_DistGraph::numGlobalEdges = g.sizeEdges();\n    std::vector<unsigned> dummy;\n    // not actually getting masters, but getting assigned readers for nodes\n    if (masterBlockFile == \"\") {\n      base_DistGraph::computeMasters(md, g, dummy, nodeWeight, edgeWeight);\n    } else {\n      galois::gInfo(\"Getting reader assignment from file\");\n      base_DistGraph::readersFromFile(g, masterBlockFile);\n    }\n\n    graphPartitioner = std::make_unique<Partitioner>(\n        host, _numHosts, base_DistGraph::numGlobalNodes,\n       
 base_DistGraph::numGlobalEdges);\n    // TODO abstract this away somehow\n    graphPartitioner->saveGIDToHost(base_DistGraph::gid2host);\n\n    uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first;\n    typename galois::graphs::OfflineGraph::edge_iterator edgeBegin =\n        g.edge_begin(nodeBegin);\n    uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second;\n    typename galois::graphs::OfflineGraph::edge_iterator edgeEnd =\n        g.edge_begin(nodeEnd);\n\n    // signifies how many outgoing edges a particular host should expect from\n    // this host\n    std::vector<std::vector<uint64_t>> numOutgoingEdges;\n    // signifies if a host should create a node because it has an incoming edge\n    std::vector<galois::DynamicBitSet> hasIncomingEdge;\n\n    // only need to use for things that need communication\n    if (!graphPartitioner->noCommunication()) {\n      if (_edgeStateRounds > 1) {\n        hostLoads.resize(base_DistGraph::numHosts);\n        old_hostLoads.resize(base_DistGraph::numHosts);\n        resetEdgeLoad();\n      }\n      numOutgoingEdges.resize(base_DistGraph::numHosts);\n      hasIncomingEdge.resize(base_DistGraph::numHosts);\n    }\n\n    // phase 0\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Starting graph reading.\\n\");\n    galois::graphs::BufferedGraph<EdgeTy> bufGraph;\n    bufGraph.resetReadCounters();\n    galois::StatTimer graphReadTimer(\"GraphReading\", GRNAME);\n    graphReadTimer.start();\n    bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin,\n                              *edgeEnd, base_DistGraph::numGlobalNodes,\n                              base_DistGraph::numGlobalEdges);\n    graphReadTimer.stop();\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Reading graph complete.\\n\");\n\n    if (graphPartitioner->masterAssignPhase()) {\n      // loop over all nodes, determine where neighbors are, assign masters\n      galois::StatTimer phase0Timer(\"Phase0\", GRNAME);\n 
     galois::gPrint(\"[\", base_DistGraph::id,\n                     \"] Starting master assignment.\\n\");\n      phase0Timer.start();\n      phase0(bufGraph, cuspAsync, stateRounds);\n      phase0Timer.stop();\n      galois::gPrint(\"[\", base_DistGraph::id,\n                     \"] Master assignment complete.\\n\");\n    }\n\n    galois::StatTimer inspectionTimer(\"EdgeInspection\", GRNAME);\n    inspectionTimer.start();\n    bufGraph.resetReadCounters();\n    galois::gstl::Vector<uint64_t> prefixSumOfEdges;\n\n    // assign edges to other nodes\n    if (!graphPartitioner->noCommunication()) {\n      edgeInspection(bufGraph, numOutgoingEdges, hasIncomingEdge,\n                     inspectionTimer);\n      galois::DynamicBitSet& finalIncoming =\n          hasIncomingEdge[base_DistGraph::id];\n\n      galois::StatTimer mapTimer(\"NodeMapping\", GRNAME);\n      mapTimer.start();\n      nodeMapping(numOutgoingEdges, finalIncoming, prefixSumOfEdges);\n      mapTimer.stop();\n\n      finalIncoming.resize(0);\n    } else {\n      base_DistGraph::numOwned = nodeEnd - nodeBegin;\n      uint64_t edgeOffset      = *bufGraph.edgeBegin(nodeBegin);\n      // edge prefix sum, no comm required\n      edgeCutInspection(bufGraph, inspectionTimer, edgeOffset,\n                        prefixSumOfEdges);\n    }\n    // inspection timer is stopped in edgeInspection function\n\n    // flip partitioners that have a master assignment phase to stage 2\n    // (meaning all nodes and masters that will be on this host are present in\n    // the partitioner's metadata)\n    if (graphPartitioner->masterAssignPhase()) {\n      graphPartitioner->enterStage2();\n    }\n\n    // get memory back from inspection metadata\n    numOutgoingEdges.clear();\n    hasIncomingEdge.clear();\n    // doubly make sure the data is cleared\n    freeVector(numOutgoingEdges); // should no longer use this variable\n    freeVector(hasIncomingEdge);  // should no longer use this variable\n\n    // Graph construction 
related calls\n\n    base_DistGraph::beginMaster = 0;\n    // Allocate and construct the graph\n    base_DistGraph::graph.allocateFrom(base_DistGraph::numNodes,\n                                       base_DistGraph::numEdges);\n    base_DistGraph::graph.constructNodes();\n\n    // edge end fixing\n    auto& base_graph = base_DistGraph::graph;\n    galois::do_all(\n        galois::iterate((uint32_t)0, base_DistGraph::numNodes),\n        [&](uint64_t n) { base_graph.fixEndEdge(n, prefixSumOfEdges[n]); },\n#if MORE_DIST_STATS\n        galois::loopname(\"FixEndEdgeLoop\"),\n#endif\n        galois::no_stats());\n    // get memory from prefix sum back\n    prefixSumOfEdges.clear();\n    freeVector(prefixSumOfEdges); // should no longer use this variable\n    galois::CondStatTimer<MORE_DIST_STATS> TfillMirrors(\"FillMirrors\", GRNAME);\n\n    TfillMirrors.start();\n    fillMirrors();\n    TfillMirrors.stop();\n\n    if (_edgeStateRounds > 1) {\n      // reset edge load since we need exact same answers again\n      resetEdgeLoad();\n    }\n\n    // Edge loading\n    if (!graphPartitioner->noCommunication()) {\n      loadEdges(base_DistGraph::graph, bufGraph);\n    } else {\n      // Edge cut construction\n      edgeCutLoad(base_DistGraph::graph, bufGraph);\n      bufGraph.resetAndFree();\n    }\n\n    // Finalization\n\n    // TODO this is a hack; fix it somehow\n    // if vertex cut but not a cart cut is the condition\n    if (graphPartitioner->isVertexCut() &&\n        graphPartitioner->cartesianGrid().first == 0) {\n      base_DistGraph::numNodesWithEdges = base_DistGraph::numNodes;\n    }\n\n    if (transpose) {\n      base_DistGraph::transposed        = true;\n      base_DistGraph::numNodesWithEdges = base_DistGraph::numNodes;\n      if (base_DistGraph::numNodes > 0) {\n        // consider all nodes to have outgoing edges (TODO better way to do\n        // this?) 
for now it's fine I guess\n        base_DistGraph::graph.transpose(GRNAME);\n      }\n    }\n\n    galois::CondStatTimer<MORE_DIST_STATS> Tthread_ranges(\"ThreadRangesTime\",\n                                                          GRNAME);\n\n    Tthread_ranges.start();\n    base_DistGraph::determineThreadRanges();\n    Tthread_ranges.stop();\n\n    base_DistGraph::determineThreadRangesMaster();\n    base_DistGraph::determineThreadRangesWithEdges();\n    base_DistGraph::initializeSpecificRanges();\n\n    Tgraph_construct.stop();\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Graph construction complete.\\n\");\n\n    // report state rounds\n    if (base_DistGraph::id == 0) {\n      galois::runtime::reportStat_Single(GRNAME, \"CuSPStateRounds\",\n                                         (uint32_t)stateRounds);\n    }\n  }\n\nprivate:\n  galois::runtime::SpecificRange<boost::counting_iterator<size_t>>\n  getSpecificThreadRange(galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                         std::vector<uint32_t>& assignedThreadRanges,\n                         uint64_t startNode, uint64_t endNode) {\n    galois::StatTimer threadRangeTime(\"Phase0ThreadRangeTime\");\n    threadRangeTime.start();\n    uint64_t numLocalNodes = endNode - startNode;\n    galois::PODResizeableArray<uint64_t> edgePrefixSum;\n    edgePrefixSum.resize(numLocalNodes);\n\n    // get thread ranges with a prefix sum\n    galois::do_all(\n        galois::iterate(startNode, endNode),\n        [&](unsigned n) {\n          uint64_t offset       = n - startNode;\n          edgePrefixSum[offset] = bufGraph.edgeEnd(n) - bufGraph.edgeBegin(n);\n        },\n        galois::no_stats());\n\n    for (unsigned i = 1; i < numLocalNodes; i++) {\n      edgePrefixSum[i] += edgePrefixSum[i - 1];\n    }\n\n    assignedThreadRanges = galois::graphs::determineUnitRangesFromPrefixSum(\n        galois::runtime::activeThreads, edgePrefixSum);\n\n    for (unsigned i = 0; i < 
galois::runtime::activeThreads + 1; i++) {\n      assignedThreadRanges[i] += startNode;\n    }\n\n    auto toReturn = galois::runtime::makeSpecificRange(\n        boost::counting_iterator<size_t>(startNode),\n        boost::counting_iterator<size_t>(startNode + numLocalNodes),\n        assignedThreadRanges.data());\n\n    threadRangeTime.stop();\n    return toReturn;\n  }\n\n  /**\n   * For each other host, determine which nodes that this host needs to get\n   * info from\n   *\n   * @param bufGraph Buffered graph used to loop over edges\n   * @param ghosts bitset; at end\n   * of execution, marked bits signify neighbors on this host that that other\n   * host has read (and therefore must sync with me)\n   */\n  // steps 1 and 2 of neighbor location setup: memory allocation, bitset setting\n  void phase0BitsetSetup(galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                         galois::DynamicBitSet& ghosts) {\n    galois::StatTimer bitsetSetupTimer(\"Phase0BitsetSetup\", GRNAME);\n    bitsetSetupTimer.start();\n\n    ghosts.resize(bufGraph.size());\n    ghosts.reset();\n\n    std::vector<uint32_t> rangeVector;\n    auto start = base_DistGraph::gid2host[base_DistGraph::id].first;\n    auto end   = base_DistGraph::gid2host[base_DistGraph::id].second;\n\n    galois::runtime::SpecificRange<boost::counting_iterator<size_t>> work =\n        getSpecificThreadRange(bufGraph, rangeVector, start, end);\n\n    // Step 2: loop over all local nodes, determine neighbor locations\n    galois::do_all(\n        galois::iterate(work),\n        // galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,\n        //                base_DistGraph::gid2host[base_DistGraph::id].second),\n        [&](unsigned n) {\n          // ptt.start();\n          // galois::gPrint(\"[\", base_DistGraph::id, \" \",\n          // galois::substrate::getThreadPool().getTID(), \"] \", n, \"\\n\");\n          auto ii = bufGraph.edgeBegin(n);\n          auto ee = 
bufGraph.edgeEnd(n);\n          for (; ii < ee; ++ii) {\n            uint32_t dst = bufGraph.edgeDestination(*ii);\n            if ((dst < start) || (dst >= end)) { // not owned by this host\n              // set on bitset\n              ghosts.set(dst);\n            }\n          }\n          // ptt.stop();\n        },\n        galois::loopname(\"Phase0BitsetSetup_DetermineNeighborLocations\"),\n        galois::steal(), galois::no_stats());\n\n    bitsetSetupTimer.stop();\n  }\n\n  // sets up the gid to lid mapping for phase 0\n  /**\n   * Set up the GID to LID mapping for phase 0: In the mapping vector,\n   * read nodes occupy the first chunk, and nodes read by other hosts follow.\n   *\n   * @param ghosts\n   * @param gid2offsets mapping vector: element at an offset corresponds to a\n   * particular GID (and its master)\n   * @param syncNodes one vector of nodes for each host: at the end of\n   * execution, will contain mirrors on this host whose master is on that host\n   * @returns Number of set bits\n   */\n  uint64_t phase0MapSetup(\n      galois::DynamicBitSet& ghosts,\n      std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n      galois::gstl::Vector<galois::gstl::Vector<uint32_t>>& syncNodes) {\n    galois::StatTimer mapSetupTimer(\"Phase0MapSetup\", GRNAME);\n    mapSetupTimer.start();\n\n    uint32_t numLocal = base_DistGraph::gid2host[base_DistGraph::id].second -\n                        base_DistGraph::gid2host[base_DistGraph::id].first;\n    uint32_t lid = numLocal;\n\n    uint64_t numToReserve = ghosts.count();\n    gid2offsets.reserve(numToReserve);\n\n    // TODO: parallelize using prefix sum?\n    for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) {\n      if (h == base_DistGraph::id)\n        continue;\n      auto start = base_DistGraph::gid2host[h].first;\n      auto end   = base_DistGraph::gid2host[h].second;\n      for (uint64_t gid = start; gid < end; ++gid) {\n        if (ghosts.test(gid)) {\n          gid2offsets[gid] = lid;\n      
    syncNodes[h].push_back(gid - start);\n          lid++;\n        }\n      }\n      galois::gDebug(\"[\", base_DistGraph::id, \" -> \", h, \"] bitset size \",\n                     (end - start) / 64, \" vs. vector size \",\n                     syncNodes[h].size() / 2);\n    }\n    lid -= numLocal;\n\n    assert(lid == numToReserve);\n    galois::gDebug(\"[\", base_DistGraph::id, \"] total bitset size \",\n                   (ghosts.size() - numLocal) / 64, \" vs. total vector size \",\n                   numToReserve / 2);\n\n    // TODO: should not be used after this - refactor to make this clean\n    ghosts.resize(0);\n\n    mapSetupTimer.stop();\n\n    return lid;\n  }\n\n  // steps 4 and 5 of neighbor location setup\n  /**\n   * Let other hosts know which nodes they need to send to me by giving them\n   * the bitset marked with nodes I am interested in on the other host.\n   *\n   * @param syncNodes one vector of nodes for each host: at the begin of\n   * execution, will contain mirrors on this host whose master is on that host;\n   * at the end of execution, will contain masters on this host whose mirror\n   * is on that host\n   */\n  void phase0SendRecv(\n      galois::gstl::Vector<galois::gstl::Vector<uint32_t>>& syncNodes) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    galois::StatTimer p0BitsetCommTimer(\"Phase0SendRecvBitsets\", GRNAME);\n    p0BitsetCommTimer.start();\n    uint64_t bytesSent = 0;\n\n    // Step 4: send bitset to other hosts\n    for (unsigned h = 0; h < base_DistGraph::numHosts; h++) {\n      galois::runtime::SendBuffer bitsetBuffer;\n\n      if (h != base_DistGraph::id) {\n        galois::runtime::gSerialize(bitsetBuffer, syncNodes[h]);\n        bytesSent += bitsetBuffer.size();\n        net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer);\n      }\n    }\n\n    // Step 5: recv bitset to other hosts; this indicates which local nodes each\n    // other host needs to be informed of updates of\n    for 
(unsigned h = 0; h < net.Num - 1; h++) {\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n      uint32_t sendingHost = p->first;\n      // deserialize into neighbor bitsets\n      galois::runtime::gDeserialize(p->second, syncNodes[sendingHost]);\n    }\n\n    p0BitsetCommTimer.stop();\n\n    galois::runtime::reportStat_Tsum(\n        GRNAME, std::string(\"Phase0SendRecvBitsetsBytesSent\"), bytesSent);\n\n    // comm phase complete\n    base_DistGraph::increment_evilPhase();\n  }\n\n  /**\n   * Given a set of loads in a vector and the accumulation to those loads,\n   * synchronize them across hosts and do the accumulation into the vector\n   * of loads.\n   *\n   * @param loads Vector of loads to accumulate to\n   * @param accums Vector of accumulations to loads that occurred since last\n   * sync\n   */\n  void syncLoad(std::vector<uint64_t>& loads,\n                std::vector<galois::CopyableAtomic<uint64_t>>& accums) {\n    assert(loads.size() == accums.size());\n    // use DG accumulator to force barrier on all hosts to sync this data\n    galois::DGAccumulator<uint64_t> syncer;\n    // sync accum for each host one by one\n    for (unsigned i = 0; i < loads.size(); i++) {\n      syncer.reset();\n      syncer += (accums[i].load());\n      accums[i].store(0);\n      uint64_t accumulation = syncer.reduce();\n      loads[i] += accumulation;\n    }\n  }\n\n  /**\n   * Given a copyable atomic vector, get data from it, save to a\n   * PODResizeableArray, and reset value in the atomic array.\n   *\n   * @param atomic Atomic vector to extract and reset\n   * @param nonAtomic PODarray to extract data into\n   */\n  template <typename VType>\n  void\n  extractAtomicToPODArray(std::vector<galois::CopyableAtomic<VType>>& atomic,\n                          galois::PODResizeableArray<VType>& nonAtomic) {\n    nonAtomic.resize(atomic.size());\n\n    
galois::do_all(\n        galois::iterate((size_t)0, atomic.size()),\n        [&](size_t i) {\n          nonAtomic[i] = atomic[i].load();\n          atomic[i].store(0);\n        },\n        galois::no_stats());\n  }\n\n  /**\n   * Send newly accumulated node and edge loads to all other hosts and reset\n   * the accumulated values. No DG accumulator used.\n   *\n   * @param nodeAccum new node accumulation for each host in system\n   * @param edgeAccum new edge accumulation for each host in system\n   */\n  void asyncSendLoad(galois::PODResizeableArray<uint64_t>& nodeAccum,\n                     galois::PODResizeableArray<uint64_t>& edgeAccum) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    unsigned bytesSent = 0;\n    galois::StatTimer sendTimer(\"Phase0AsyncSendLoadTime\", GRNAME);\n\n    sendTimer.start();\n    for (unsigned h = 0; h < base_DistGraph::numHosts; h++) {\n      if (h != base_DistGraph::id) {\n        // serialize node and edge accumulations with tag 4 (to avoid\n        // conflict with other tags being used) and send\n        galois::runtime::SendBuffer b;\n\n        galois::runtime::gSerialize(b, 4);\n        galois::runtime::gSerialize(b, nodeAccum);\n        galois::runtime::gSerialize(b, edgeAccum);\n        bytesSent += b.size();\n\n        // note the +1 on evil phase; load messages are sent using a different\n        // phase to avoid conflicts\n        net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b);\n      }\n    }\n    sendTimer.stop();\n\n    galois::runtime::reportStat_Tsum(GRNAME, \"Phase0AsyncSendLoadBytesSent\",\n                                     bytesSent);\n  }\n\n  /**\n   * Receive (if it exists) new node/edge loads from other hosts and add it to\n   * our own loads.\n   *\n   * @param nodeLoads current node load information for each host in system\n   * @param edgeLoads current edge load information for each host in system\n   */\n  void asyncRecvLoad(std::vector<uint64_t>& nodeLoads,\n                 
    std::vector<uint64_t>& edgeLoads,\n                     galois::DynamicBitSet& loadsClear) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr)) p;\n\n    galois::StatTimer recvTimer(\"Phase0AsyncRecvLoadTime\", GRNAME);\n    recvTimer.start();\n    do {\n      // note the +1\n      p = net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr);\n\n      if (p) {\n        unsigned messageType = (unsigned)-1;\n        // deserialize message type\n        galois::runtime::gDeserialize(p->second, messageType);\n\n        if (messageType == 4) {\n          galois::PODResizeableArray<uint64_t> recvNodeAccum;\n          galois::PODResizeableArray<uint64_t> recvEdgeAccum;\n          // loads to add\n          galois::runtime::gDeserialize(p->second, recvNodeAccum);\n          galois::runtime::gDeserialize(p->second, recvEdgeAccum);\n\n          assert(recvNodeAccum.size() == recvEdgeAccum.size());\n          assert(recvNodeAccum.size() == nodeLoads.size());\n          assert(recvEdgeAccum.size() == edgeLoads.size());\n\n          galois::do_all(\n              galois::iterate((size_t)0, recvNodeAccum.size()),\n              [&](size_t i) {\n                nodeLoads[i] += recvNodeAccum[i];\n                edgeLoads[i] += recvEdgeAccum[i];\n              },\n              galois::no_stats());\n        } else if (messageType == 3) {\n          // all clear message from host\n          uint32_t sendingHost = p->first;\n          assert(!loadsClear.test(sendingHost));\n          loadsClear.set(sendingHost);\n        } else {\n          GALOIS_DIE(\"unexpected message type in async load synchronization: \",\n                     messageType);\n        }\n      }\n    } while (p);\n\n    recvTimer.stop();\n  }\n\n  /**\n   * Send out accumulated loads from a round of node assignments to all other\n   * hosts and also receive loads from other hosts if they exist\n   * (non-blocking).\n  
 *\n   * @param nodeLoads current known node loads on this host\n   * @param nodeAccum newly accumulated node loads from a prior round of node\n   * assignments\n   * @param edgeLoads current known edge loads on this host\n   * @param edgeAccum newly accumulated edge loads from a prior round of node\n   * assignments\n   * @param loadsClear Bitset tracking if we have received all loads from\n   * a particular host\n   */\n  void asyncSyncLoad(std::vector<uint64_t>& nodeLoads,\n                     std::vector<galois::CopyableAtomic<uint64_t>>& nodeAccum,\n                     std::vector<uint64_t>& edgeLoads,\n                     std::vector<galois::CopyableAtomic<uint64_t>>& edgeAccum,\n                     galois::DynamicBitSet& loadsClear) {\n    assert(nodeLoads.size() == base_DistGraph::numHosts);\n    assert(nodeAccum.size() == base_DistGraph::numHosts);\n    assert(edgeLoads.size() == base_DistGraph::numHosts);\n    assert(edgeAccum.size() == base_DistGraph::numHosts);\n\n    galois::StatTimer syncTimer(\"Phase0AsyncSyncLoadTime\", GRNAME);\n    syncTimer.start();\n\n    // extract out data to send\n    galois::PODResizeableArray<uint64_t> nonAtomicNodeAccum;\n    galois::PODResizeableArray<uint64_t> nonAtomicEdgeAccum;\n    extractAtomicToPODArray(nodeAccum, nonAtomicNodeAccum);\n    extractAtomicToPODArray(edgeAccum, nonAtomicEdgeAccum);\n\n    assert(nonAtomicNodeAccum.size() == base_DistGraph::numHosts);\n    assert(nonAtomicEdgeAccum.size() == base_DistGraph::numHosts);\n\n    // apply loads to self\n    galois::do_all(\n        galois::iterate((uint32_t)0, base_DistGraph::numHosts),\n        [&](size_t i) {\n          nodeLoads[i] += nonAtomicNodeAccum[i];\n          edgeLoads[i] += nonAtomicEdgeAccum[i];\n        },\n        galois::no_stats());\n\n#ifndef NDEBUG\n    for (unsigned i = 0; i < nodeAccum.size(); i++) {\n      assert(nodeAccum[i].load() == 0);\n      assert(edgeAccum[i].load() == 0);\n    }\n#endif\n\n    // send both nodes and edges 
accumulation at once\n    asyncSendLoad(nonAtomicNodeAccum, nonAtomicEdgeAccum);\n    asyncRecvLoad(nodeLoads, edgeLoads, loadsClear);\n\n    syncTimer.stop();\n  }\n\n  /**\n   * Debug function: simply prints loads and accumulations\n   *\n   * @param loads Vector of loads to accumulate to\n   * @param accums Vector of accumulations to loads that occurred since last\n   * sync\n   */\n  void printLoad(std::vector<uint64_t>& loads,\n                 std::vector<galois::CopyableAtomic<uint64_t>>& accums) {\n    assert(loads.size() == accums.size());\n    for (unsigned i = 0; i < loads.size(); i++) {\n      galois::gDebug(\"[\", base_DistGraph::id, \"] \", i, \" total \", loads[i],\n                     \" accum \", accums[i].load());\n    }\n  }\n\n  /**\n   * Given a vector of data and a vector of offsets into it, extract the\n   * elements at those offsets into a new vector.\n   *\n   * @param offsetVector Offsets of the elements in the data vector that need\n   * to be extracted.\n   * @param dataVector Data vector to extract data from according to the offsets\n   * @return Vector of extracted elements\n   */\n  template <typename T>\n  std::vector<T> getDataFromOffsets(std::vector<uint32_t>& offsetVector,\n                                    const std::vector<T>& dataVector) {\n    std::vector<T> toReturn;\n    toReturn.resize(offsetVector.size());\n\n    galois::do_all(\n        galois::iterate((size_t)0, offsetVector.size()),\n        [&](unsigned i) { toReturn[i] = dataVector[offsetVector[i]]; },\n        galois::no_stats());\n\n    return toReturn;\n  }\n\n  /**\n   * Given a host, a bitset that marks offsets, and a vector,\n   * send the data located at the offsets from the vector to the\n   * specified host. 
If bitset is unmarked, send a no-op.\n   *\n   * @param targetHost Host to send data to\n   * @param toSync Bitset that specifies which offsets in the data vector\n   * to send\n   * @param dataVector Data to be sent to the target host\n   */\n  void sendOffsets(unsigned targetHost, galois::DynamicBitSet& toSync,\n                   std::vector<uint32_t>& dataVector,\n                   std::string timerName = std::string()) {\n    auto& net              = galois::runtime::getSystemNetworkInterface();\n    std::string statString = std::string(\"Phase0SendOffsets_\") + timerName;\n    uint64_t bytesSent     = 0;\n\n    galois::StatTimer sendOffsetsTimer(statString.c_str(), GRNAME);\n\n    sendOffsetsTimer.start();\n\n    // this means there are updates to send\n    if (toSync.count()) {\n      std::vector<uint32_t> offsetVector = toSync.getOffsets();\n      // get masters to send into a vector\n      std::vector<uint32_t> mastersToSend =\n          getDataFromOffsets(offsetVector, dataVector);\n\n      assert(mastersToSend.size());\n\n      size_t num_selected = toSync.count();\n      size_t num_total    = toSync.size();\n      // figure out how to send (most efficient method; either bitset\n      // and data or offsets + data)\n      size_t bitset_alloc_size =\n          ((num_total + 63) / 64) * sizeof(uint64_t) + (2 * sizeof(size_t));\n      size_t bitsetDataSize = (num_selected * sizeof(uint32_t)) +\n                              bitset_alloc_size + sizeof(num_selected);\n      size_t offsetsDataSize = (num_selected * sizeof(uint32_t)) +\n                               (num_selected * sizeof(unsigned int)) +\n                               sizeof(uint32_t) + sizeof(num_selected);\n\n      galois::runtime::SendBuffer b;\n      // tag with send method and do send\n      if (bitsetDataSize < offsetsDataSize) {\n        // send bitset, tag 1\n        galois::runtime::gSerialize(b, 1u);\n        galois::runtime::gSerialize(b, toSync);\n        
galois::runtime::gSerialize(b, mastersToSend);\n      } else {\n        // send offsets, tag 2\n        galois::runtime::gSerialize(b, 2u);\n        galois::runtime::gSerialize(b, offsetVector);\n        galois::runtime::gSerialize(b, mastersToSend);\n      }\n      bytesSent += b.size();\n      net.sendTagged(targetHost, galois::runtime::evilPhase, b);\n    } else {\n      // send empty no-op message, tag 0\n      galois::runtime::SendBuffer b;\n      galois::runtime::gSerialize(b, 0u);\n      bytesSent += b.size();\n      net.sendTagged(targetHost, galois::runtime::evilPhase, b);\n    }\n    sendOffsetsTimer.stop();\n\n    galois::runtime::reportStat_Tsum(GRNAME, statString + \"BytesSent\",\n                                     bytesSent);\n  }\n\n  /**\n   * Send new master assignment updates to other hosts based on syncNodes\n   * for each host prepared in advance.\n   *\n   * @param begin to end: which nodes on this host have been updated\n   * @param numLocalNodes: number of owned nodes\n   * @param localNodeToMaster Vector map: an offset corresponds to a particular\n   * GID; indicates masters of GIDs\n   * @param syncNodes one vector of nodes for each host: contains mirrors on\n   * this host whose master is on that host\n   */\n  void syncAssignmentSends(\n      uint32_t begin, uint32_t end, uint32_t numLocalNodes,\n      std::vector<uint32_t>& localNodeToMaster,\n      galois::gstl::Vector<galois::gstl::Vector<uint32_t>>& syncNodes) {\n    galois::StatTimer p0assignSendTime(\"Phase0AssignmentSendTime\", GRNAME);\n    p0assignSendTime.start();\n\n    galois::DynamicBitSet toSync;\n    toSync.resize(numLocalNodes);\n\n    // send loop\n    for (unsigned h = 0; h < base_DistGraph::numHosts; h++) {\n      if (h != base_DistGraph::id) {\n        toSync.reset();\n        // send if in [start,end) and present in syncNodes[h]\n        galois::do_all(\n            galois::iterate(syncNodes[h]),\n            [&](uint32_t lid) {\n              if ((lid >= begin) && 
(lid < end)) {\n                toSync.set(lid);\n              }\n            },\n            galois::no_stats());\n        // do actual send based on sync bitset\n        sendOffsets(h, toSync, localNodeToMaster, \"NewAssignments\");\n      }\n    }\n\n    p0assignSendTime.stop();\n  }\n\n  /**\n   * Send message to all hosts saying we're done with assignments. Can\n   * specify a phase to distinguish between all clears for assignments\n   * and loads\n   */\n  void sendAllClears(unsigned phase = 0) {\n    unsigned bytesSent = 0;\n    auto& net          = galois::runtime::getSystemNetworkInterface();\n    galois::StatTimer allClearTimer(\"Phase0SendAllClearTime\", GRNAME);\n    allClearTimer.start();\n\n    // send loop\n    for (unsigned h = 0; h < base_DistGraph::numHosts; h++) {\n      if (h != base_DistGraph::id) {\n        galois::runtime::SendBuffer b;\n        galois::runtime::gSerialize(b, 3u);\n        bytesSent += b.size();\n        // assumes phase is 0 or 1\n        if (phase == 1) {\n          net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b);\n        } else if (phase == 0) {\n          net.sendTagged(h, galois::runtime::evilPhase, b);\n        } else {\n          GALOIS_DIE(\"unexpected phase: \", phase);\n        }\n      }\n    }\n    allClearTimer.stop();\n\n    galois::runtime::reportStat_Tsum(GRNAME, \"Phase0SendAllClearBytesSent\",\n                                     bytesSent);\n  }\n\n  void saveReceivedMappings(std::vector<uint32_t>& localNodeToMaster,\n                            std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n                            unsigned sendingHost,\n                            std::vector<uint32_t>& receivedOffsets,\n                            std::vector<uint32_t>& receivedMasters) {\n    uint64_t hostOffset = base_DistGraph::gid2host[sendingHost].first;\n    galois::gDebug(\"[\", base_DistGraph::id, \"] host \", sendingHost, \" offset \",\n                   hostOffset);\n\n    // if execution 
gets here, messageType was 1 or 2\n    assert(receivedMasters.size() == receivedOffsets.size());\n\n    galois::do_all(\n        galois::iterate((size_t)0, receivedMasters.size()),\n        [&](size_t i) {\n          uint64_t curGID       = hostOffset + receivedOffsets[i];\n          uint32_t indexIntoMap = gid2offsets[curGID];\n          galois::gDebug(\"[\", base_DistGraph::id, \"] gid \", curGID, \" offset \",\n                         indexIntoMap);\n          localNodeToMaster[indexIntoMap] = receivedMasters[i];\n        },\n        galois::no_stats());\n  }\n\n  /**\n   * Receive offsets and masters into the provided vectors and return sending\n   * host and the message type.\n   *\n   * @param receivedOffsets vector to receive offsets into\n   * @param receivedMasters vector to receive masters mappings into\n   * @returns sending host and message type of received data\n   */\n  std::pair<unsigned, unsigned>\n  recvOffsetsAndMasters(std::vector<uint32_t>& receivedOffsets,\n                        std::vector<uint32_t>& receivedMasters) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n    do {\n      p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n    } while (!p);\n\n    uint32_t sendingHost = p->first;\n    unsigned messageType = (unsigned)-1;\n\n    // deserialize message type\n    galois::runtime::gDeserialize(p->second, messageType);\n\n    if (messageType == 1) {\n      // bitset; deserialize, then get offsets\n      galois::DynamicBitSet receivedSet;\n      galois::runtime::gDeserialize(p->second, receivedSet);\n      receivedOffsets = receivedSet.getOffsets();\n      galois::runtime::gDeserialize(p->second, receivedMasters);\n    } else if (messageType == 2) {\n      // offsets\n      galois::runtime::gDeserialize(p->second, receivedOffsets);\n      galois::runtime::gDeserialize(p->second, receivedMasters);\n    } else if (messageType != 0) {\n      
GALOIS_DIE(\"invalid message type for sync of master assignments: \",\n                 messageType);\n    }\n\n    galois::gDebug(\"[\", base_DistGraph::id, \"] host \", sendingHost,\n                   \" send message type \", messageType);\n\n    return std::make_pair(sendingHost, messageType);\n  }\n\n  /**\n   * Receive offsets and masters into the provided vectors and return sending\n   * host and the message type, async (i.e. does not have to receive anything\n   * to exit function.\n   *\n   * @param receivedOffsets vector to receive offsets into\n   * @param receivedMasters vector to receive masters mappings into\n   */\n  void recvOffsetsAndMastersAsync(\n      std::vector<uint32_t>& localNodeToMaster,\n      std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n      galois::DynamicBitSet& hostFinished) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n\n    // repeat loop until no message\n    do {\n      p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      if (p) {\n        uint32_t sendingHost = p->first;\n        unsigned messageType = (unsigned)-1;\n\n        std::vector<uint32_t> receivedOffsets;\n        std::vector<uint32_t> receivedMasters;\n\n        // deserialize message type\n        galois::runtime::gDeserialize(p->second, messageType);\n\n        if (messageType == 1) {\n          // bitset; deserialize, then get offsets\n          galois::DynamicBitSet receivedSet;\n          galois::runtime::gDeserialize(p->second, receivedSet);\n          receivedOffsets = receivedSet.getOffsets();\n          galois::runtime::gDeserialize(p->second, receivedMasters);\n          saveReceivedMappings(localNodeToMaster, gid2offsets, sendingHost,\n                               receivedOffsets, receivedMasters);\n        } else if (messageType == 2) {\n          // offsets\n          galois::runtime::gDeserialize(p->second, receivedOffsets);\n          
galois::runtime::gDeserialize(p->second, receivedMasters);\n          saveReceivedMappings(localNodeToMaster, gid2offsets, sendingHost,\n                               receivedOffsets, receivedMasters);\n        } else if (messageType == 3) {\n          // host indicating that it is done with all assignments from its\n          // end; mark as such in bitset\n          assert(!hostFinished.test(sendingHost));\n          hostFinished.set(sendingHost);\n        } else if (messageType != 0) {\n          GALOIS_DIE(\"invalid message type for sync of master assignments: \",\n                     messageType);\n        }\n\n        galois::gDebug(\"[\", base_DistGraph::id, \"] host \", sendingHost,\n                       \" send message type \", messageType);\n      }\n    } while (p);\n  }\n\n  /**\n   * Receive new master assignment updates from other hosts and update local\n   * mappings.\n   *\n   * @param localNodeToMaster Vector map: an offset corresponds to a particular\n   * GID; indicates masters of GIDs\n   * @param gid2offsets Map of GIDs to the offset into the vector map that\n   * corresponds to it\n   */\n  void\n  syncAssignmentReceives(std::vector<uint32_t>& localNodeToMaster,\n                         std::unordered_map<uint64_t, uint32_t>& gid2offsets) {\n    galois::StatTimer p0assignReceiveTime(\"Phase0AssignmentReceiveTime\",\n                                          GRNAME);\n    p0assignReceiveTime.start();\n\n    // receive loop\n    for (unsigned h = 0; h < base_DistGraph::numHosts - 1; h++) {\n      unsigned sendingHost;\n      unsigned messageType;\n      std::vector<uint32_t> receivedOffsets;\n      std::vector<uint32_t> receivedMasters;\n\n      std::tie(sendingHost, messageType) =\n          recvOffsetsAndMasters(receivedOffsets, receivedMasters);\n\n      if (messageType == 1 || messageType == 2) {\n        saveReceivedMappings(localNodeToMaster, gid2offsets, sendingHost,\n                             receivedOffsets, receivedMasters);\n  
    }\n    }\n\n    p0assignReceiveTime.stop();\n  }\n\n  void syncAssignmentReceivesAsync(\n      std::vector<uint32_t>& localNodeToMaster,\n      std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n      galois::DynamicBitSet& hostFinished) {\n    galois::StatTimer p0assignReceiveTime(\"Phase0AssignmentReceiveTimeAsync\",\n                                          GRNAME);\n    p0assignReceiveTime.start();\n\n    recvOffsetsAndMastersAsync(localNodeToMaster, gid2offsets, hostFinished);\n\n    p0assignReceiveTime.stop();\n  }\n\n  /**\n   * Send/receive new master assignment updates to other hosts.\n   *\n   * @param begin to end: which nodes on this host have been updated\n   * @param numLocalNodes: number of owned nodes\n   * @param localNodeToMaster Vector map: an offset corresponds to a particular\n   * GID; indicates masters of GIDs\n   * @param syncNodes one vector of nodes for each host: contains mirrors on\n   * this host whose master is on that host\n   * @param gid2offsets Map of GIDs to the offset into the vector map that\n   * corresponds to it\n   */\n  void syncAssignment(\n      uint32_t begin, uint32_t end, uint32_t numLocalNodes,\n      std::vector<uint32_t>& localNodeToMaster,\n      galois::gstl::Vector<galois::gstl::Vector<uint32_t>>& syncNodes,\n      std::unordered_map<uint64_t, uint32_t>& gid2offsets) {\n    galois::StatTimer syncAssignmentTimer(\"Phase0SyncAssignmentTime\", GRNAME);\n    syncAssignmentTimer.start();\n\n    syncAssignmentSends(begin, end, numLocalNodes, localNodeToMaster,\n                        syncNodes);\n    syncAssignmentReceives(localNodeToMaster, gid2offsets);\n\n    syncAssignmentTimer.stop();\n  }\n\n  void syncAssignmentAsync(\n      uint32_t begin, uint32_t end, uint32_t numLocalNodes,\n      std::vector<uint32_t>& localNodeToMaster,\n      galois::gstl::Vector<galois::gstl::Vector<uint32_t>>& syncNodes,\n      std::unordered_map<uint64_t, uint32_t>& gid2offsets,\n      galois::DynamicBitSet& hostFinished) {\n  
  galois::StatTimer syncAssignmentTimer(\"Phase0SyncAssignmentAsyncTime\",\n                                          GRNAME);\n    syncAssignmentTimer.start();\n\n    syncAssignmentSends(begin, end, numLocalNodes, localNodeToMaster,\n                        syncNodes);\n    syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets, hostFinished);\n\n    syncAssignmentTimer.stop();\n  }\n\n  /**\n   * Send masters mappings that were read on this host to their appropirate\n   * owners\n   *\n   * @param localNodeToMaster local id to master mapping map\n   * @param ghosts bitsets specifying which hosts have which neighbors\n   * that this host has read\n   */\n  void sendMastersToOwners(\n      std::vector<uint32_t>& localNodeToMaster,\n      galois::gstl::Vector<galois::gstl::Vector<uint32_t>>& syncNodes) {\n    uint32_t begin = base_DistGraph::gid2host[base_DistGraph::id].first;\n    uint32_t end   = base_DistGraph::gid2host[base_DistGraph::id].second;\n    // for each host, determine which master assignments still need to be sent\n    // (if a host is a master of a node, but that node is not present as a\n    // neighbor on the host, then this host needs to send the master assignment)\n    galois::DynamicBitSet toSend;\n    toSend.resize(end - begin);\n\n    for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) {\n      if (h != base_DistGraph::id) {\n        toSend.reset();\n        // send if present in localNodeToMaster but not present in syncNodes\n        galois::do_all(\n            galois::iterate((uint32_t)0, end - begin),\n            [&](uint32_t lid) {\n              if (localNodeToMaster[lid] == h) {\n                toSend.set(lid);\n              }\n            },\n            galois::no_stats());\n        galois::do_all(\n            galois::iterate(syncNodes[h]),\n            [&](uint32_t lid) { toSend.reset(lid); }, galois::no_stats());\n\n        sendOffsets(h, toSend, localNodeToMaster, \"MastersToOwners\");\n      }\n    }\n  }\n\n  /**\n   
* Receive master mapping messages from hosts and add it to the graph\n   * partitioner's map.\n   */\n  void recvMastersToOwners() {\n    for (unsigned h = 0; h < base_DistGraph::numHosts - 1; h++) {\n      unsigned sendingHost;\n      unsigned messageType;\n      std::vector<uint32_t> receivedOffsets;\n      std::vector<uint32_t> receivedMasters;\n\n      std::tie(sendingHost, messageType) =\n          recvOffsetsAndMasters(receivedOffsets, receivedMasters);\n\n      if (messageType == 1 || messageType == 2) {\n        assert(receivedMasters.size() == receivedOffsets.size());\n        uint64_t hostOffset = base_DistGraph::gid2host[sendingHost].first;\n\n        // must be single threaded as map updating isn't thread-safe\n        for (unsigned i = 0; i < receivedMasters.size(); i++) {\n          uint64_t gidToMap = hostOffset + receivedOffsets[i];\n#ifndef NDEBUG\n          bool newMapped =\n#endif\n              graphPartitioner->addMasterMapping(gidToMap, receivedMasters[i]);\n          assert(newMapped);\n        }\n      }\n    }\n  }\n\n  /**\n   * Phase responsible for initial master assignment.\n   *\n   * @param bufGraph Locally read graph on this host\n   * @param async Specifies whether or not do synchronization of node\n   * assignments BSP style or asynchronous style. 
Note regardless of which\n   * is chosen there is a barrier at the end of master assignment.\n   */\n  void phase0(galois::graphs::BufferedGraph<EdgeTy>& bufGraph, bool async,\n              const uint32_t stateRounds) {\n    galois::DynamicBitSet ghosts;\n    galois::gstl::Vector<galois::gstl::Vector<uint32_t>>\n        syncNodes; // masterNodes\n    syncNodes.resize(base_DistGraph::numHosts);\n\n    // determine on which hosts that this host's read nodes havs neighbors on\n    phase0BitsetSetup(bufGraph, ghosts);\n    // gid to vector offset setup\n    std::unordered_map<uint64_t, uint32_t> gid2offsets;\n    uint64_t neighborCount = phase0MapSetup(ghosts, gid2offsets, syncNodes);\n    galois::gDebug(\"[\", base_DistGraph::id, \"] num neighbors found is \",\n                   neighborCount);\n    // send off neighbor metadata\n    phase0SendRecv(syncNodes);\n\n    galois::StatTimer p0allocTimer(\"Phase0AllocationTime\", GRNAME);\n\n    p0allocTimer.start();\n\n    // setup other partitioning metadata: nodes on each host, edges on each\n    // host (as determined by edge cut)\n    std::vector<uint64_t> nodeLoads;\n    std::vector<uint64_t> edgeLoads;\n    std::vector<galois::CopyableAtomic<uint64_t>> nodeAccum;\n    std::vector<galois::CopyableAtomic<uint64_t>> edgeAccum;\n    nodeLoads.assign(base_DistGraph::numHosts, 0);\n    edgeLoads.assign(base_DistGraph::numHosts, 0);\n    nodeAccum.assign(base_DistGraph::numHosts, 0);\n    edgeAccum.assign(base_DistGraph::numHosts, 0);\n\n    uint32_t numLocalNodes =\n        base_DistGraph::gid2host[base_DistGraph::id].second -\n        base_DistGraph::gid2host[base_DistGraph::id].first;\n\n    std::vector<uint32_t> localNodeToMaster;\n    localNodeToMaster.assign(numLocalNodes + neighborCount, (uint32_t)-1);\n\n    // bitsets tracking termination of assignments and partitioning loads\n    galois::DynamicBitSet hostFinished;\n    galois::DynamicBitSet loadsClear;\n\n    if (async) {\n      if (base_DistGraph::id == 0) {\n  
      galois::gPrint(\"Using asynchronous master determination sends.\\n\");\n      }\n\n      hostFinished.resize(base_DistGraph::numHosts);\n      loadsClear.resize(base_DistGraph::numHosts);\n    }\n\n    p0allocTimer.stop();\n\n    uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first;\n\n#ifndef NDEBUG\n    for (uint32_t i : localNodeToMaster) {\n      assert(i == (uint32_t)-1);\n    }\n#endif\n\n    if (base_DistGraph::id == 0) {\n      galois::gPrint(\"Number of BSP sync rounds in master assignment: \",\n                     stateRounds, \"\\n\");\n    }\n\n    // galois::PerThreadTimer<CUSP_PT_TIMER> ptt(\n    //  GRNAME, \"Phase0DetermineMaster_\" + std::string(base_DistGraph::id)\n    //);\n    for (unsigned syncRound = 0; syncRound < stateRounds; syncRound++) {\n      uint32_t beginNode;\n      uint32_t endNode;\n      std::tie(beginNode, endNode) = galois::block_range(\n          globalOffset, base_DistGraph::gid2host[base_DistGraph::id].second,\n          syncRound, stateRounds);\n\n      // create specific range for this block\n      std::vector<uint32_t> rangeVec;\n      auto work =\n          getSpecificThreadRange(bufGraph, rangeVec, beginNode, endNode);\n\n      // debug print\n      // galois::on_each([&] (unsigned i, unsigned j) {\n      //  galois::gDebug(\"[\", base_DistGraph::id, \" \", i, \"] sync round \",\n      //  syncRound, \" local range \",\n      //                 *work.local_begin(), \" \", *work.local_end());\n      //});\n\n      galois::do_all(\n          // iterate over my read nodes\n          galois::iterate(work),\n          // galois::iterate(beginNode, endNode),\n          [&](uint32_t node) {\n            // ptt.start();\n            // determine master function takes source node, iterator of\n            // neighbors\n            uint32_t assignedHost = graphPartitioner->getMaster(\n                node, bufGraph, localNodeToMaster, gid2offsets, nodeLoads,\n                nodeAccum, edgeLoads, 
edgeAccum);\n            // != -1 means it was assigned a host\n            assert(assignedHost != (uint32_t)-1);\n            // update mapping; this is a local node, so can get position\n            // on map with subtraction\n            localNodeToMaster[node - globalOffset] = assignedHost;\n\n            // galois::gDebug(\"[\", base_DistGraph::id, \"] state round \",\n            // syncRound,\n            //               \" set \", node, \" \", node - globalOffset);\n\n            // ptt.stop();\n          },\n          galois::loopname(\"Phase0DetermineMasters\"), galois::steal(),\n          galois::no_stats());\n\n      // do synchronization of master assignment of neighbors\n      if (!async) {\n        syncAssignment(beginNode - globalOffset, endNode - globalOffset,\n                       numLocalNodes, localNodeToMaster, syncNodes,\n                       gid2offsets);\n      } else {\n        // don't need to send anything if there is nothing to send unlike sync\n        if (beginNode != endNode) {\n          syncAssignmentAsync(beginNode - globalOffset, endNode - globalOffset,\n                              numLocalNodes, localNodeToMaster, syncNodes,\n                              gid2offsets, hostFinished);\n        }\n      }\n\n      // sync node/edge loads\n      galois::StatTimer loadSyncTimer(\"Phase0LoadSyncTime\", GRNAME);\n\n      loadSyncTimer.start();\n      if (!async) {\n        syncLoad(nodeLoads, nodeAccum);\n        syncLoad(edgeLoads, edgeAccum);\n      } else {\n        asyncSyncLoad(nodeLoads, nodeAccum, edgeLoads, edgeAccum, loadsClear);\n      }\n      loadSyncTimer.stop();\n\n#ifndef NDEBUG\n      if (async) {\n        galois::gDebug(\"[\", base_DistGraph::id, \"] host count \",\n                       hostFinished.count());\n      }\n#endif\n    }\n\n    // if asynchronous, don't move on until everything is done\n    if (async) {\n      galois::StatTimer waitTime(\"Phase0AsyncWaitTime\", GRNAME);\n      // assignment clears\n 
     sendAllClears();\n      // load clears\n      sendAllClears(1);\n\n      hostFinished.set(base_DistGraph::id);\n      loadsClear.set(base_DistGraph::id);\n\n      waitTime.start();\n      while (hostFinished.count() != base_DistGraph::numHosts ||\n             loadsClear.count() != base_DistGraph::numHosts) {\n        //#ifndef NDEBUG\n        // galois::gDebug(\"[\", base_DistGraph::id, \"] waiting for all hosts to\n        // finish, \",\n        //               hostFinished.count());\n        // galois::gDebug(\"[\", base_DistGraph::id, \"] waiting for all hosts\n        // loads \"\n        //               \"syncs to finish, \", loadsClear.count());\n        //#endif\n        // make sure all assignments are done and all loads are done\n        syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets,\n                                    hostFinished);\n        asyncRecvLoad(nodeLoads, edgeLoads, loadsClear);\n      }\n      waitTime.stop();\n    }\n\n#ifndef NDEBUG\n    printLoad(nodeLoads, nodeAccum);\n    printLoad(edgeLoads, edgeAccum);\n#endif\n\n    // sanity check for correctness (all should be assigned)\n    for (uint32_t i = 0; i < localNodeToMaster.size(); i++) {\n      if (localNodeToMaster[i] == (uint32_t)-1) {\n        // galois::gDebug(\"[\", base_DistGraph::id, \"] bad index \", i);\n        assert(localNodeToMaster[i] != (uint32_t)-1);\n      }\n    }\n\n    base_DistGraph::increment_evilPhase();\n    // increment twice if async is used as async uses 2 phases\n    if (async) {\n      base_DistGraph::increment_evilPhase();\n    }\n\n    galois::gPrint(\"[\", base_DistGraph::id,\n                   \"] Local master assignment \"\n                   \"complete.\\n\");\n\n    // one more step: let masters know of nodes they own (if they don't\n    // have the node locally then this is the only way they will learn about\n    // it)\n    galois::StatTimer p0master2ownerTimer(\"Phase0MastersToOwners\", GRNAME);\n\n    
p0master2ownerTimer.start();\n    sendMastersToOwners(localNodeToMaster, syncNodes);\n    recvMastersToOwners();\n    p0master2ownerTimer.stop();\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Received my master mappings.\\n\");\n\n    base_DistGraph::increment_evilPhase();\n\n    graphPartitioner->saveGID2HostInfo(gid2offsets, localNodeToMaster,\n                                       bufGraph.getNodeOffset());\n  }\n\n  void edgeCutInspection(galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                         galois::StatTimer& inspectionTimer,\n                         uint64_t edgeOffset,\n                         galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    galois::DynamicBitSet incomingMirrors;\n    incomingMirrors.resize(base_DistGraph::numGlobalNodes);\n    incomingMirrors.reset();\n    uint32_t myID         = base_DistGraph::id;\n    uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first;\n\n    // already set before this is called\n    base_DistGraph::localToGlobalVector.resize(base_DistGraph::numOwned);\n    prefixSumOfEdges.resize(base_DistGraph::numOwned);\n\n    auto& ltgv = base_DistGraph::localToGlobalVector;\n    galois::do_all(\n        galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,\n                        base_DistGraph::gid2host[base_DistGraph::id].second),\n        [&](size_t n) {\n          auto ii = bufGraph.edgeBegin(n);\n          auto ee = bufGraph.edgeEnd(n);\n          for (; ii < ee; ++ii) {\n            uint32_t dst = bufGraph.edgeDestination(*ii);\n            if (graphPartitioner->retrieveMaster(dst) != myID) {\n              incomingMirrors.set(dst);\n            }\n          }\n          prefixSumOfEdges[n - globalOffset] = (*ee) - edgeOffset;\n          ltgv[n - globalOffset]             = n;\n        },\n#if MORE_DIST_STATS\n        galois::loopname(\"EdgeInspectionLoop\"),\n#endif\n        galois::steal(), galois::no_stats());\n    inspectionTimer.stop();\n\n    
uint64_t allBytesRead = bufGraph.getBytesRead();\n    galois::gPrint(\n        \"[\", base_DistGraph::id,\n        \"] Edge inspection time: \", inspectionTimer.get_usec() / 1000000.0f,\n        \" seconds to read \", allBytesRead, \" bytes (\",\n        allBytesRead / (float)inspectionTimer.get_usec(), \" MBPS)\\n\");\n\n    // get incoming mirrors ready for creation\n    uint32_t additionalMirrorCount = incomingMirrors.count();\n    base_DistGraph::localToGlobalVector.resize(\n        base_DistGraph::localToGlobalVector.size() + additionalMirrorCount);\n    if (base_DistGraph::numOwned > 0) {\n      // fill prefix sum with last number (incomings have no edges)\n      prefixSumOfEdges.resize(prefixSumOfEdges.size() + additionalMirrorCount,\n                              prefixSumOfEdges.back());\n    } else {\n      prefixSumOfEdges.resize(additionalMirrorCount);\n    }\n\n    if (additionalMirrorCount > 0) {\n      // TODO move this part below into separate function\n      uint32_t totalNumNodes = base_DistGraph::numGlobalNodes;\n      uint32_t activeThreads = galois::getActiveThreads();\n      std::vector<uint64_t> threadPrefixSums(activeThreads);\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n        size_t beginNode;\n        size_t endNode;\n        std::tie(beginNode, endNode) =\n            galois::block_range(0u, totalNumNodes, tid, nthreads);\n        uint64_t count = 0;\n        for (size_t i = beginNode; i < endNode; i++) {\n          if (incomingMirrors.test(i))\n            ++count;\n        }\n        threadPrefixSums[tid] = count;\n      });\n      // get prefix sums\n      for (unsigned int i = 1; i < threadPrefixSums.size(); i++) {\n        threadPrefixSums[i] += threadPrefixSums[i - 1];\n      }\n\n      assert(threadPrefixSums.back() == additionalMirrorCount);\n\n      uint32_t startingNodeIndex = base_DistGraph::numOwned;\n      // do actual work, second on_each\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n     
   size_t beginNode;\n        size_t endNode;\n        std::tie(beginNode, endNode) =\n            galois::block_range(0u, totalNumNodes, tid, nthreads);\n        // start location to start adding things into prefix sums/vectors\n        uint32_t threadStartLocation = 0;\n        if (tid != 0) {\n          threadStartLocation = threadPrefixSums[tid - 1];\n        }\n        uint32_t handledNodes = 0;\n        for (size_t i = beginNode; i < endNode; i++) {\n          if (incomingMirrors.test(i)) {\n            base_DistGraph::localToGlobalVector[startingNodeIndex +\n                                                threadStartLocation +\n                                                handledNodes] = i;\n            handledNodes++;\n          }\n        }\n      });\n    }\n\n    base_DistGraph::numNodes = base_DistGraph::numOwned + additionalMirrorCount;\n    if (prefixSumOfEdges.size() != 0) {\n      base_DistGraph::numEdges = prefixSumOfEdges.back();\n    } else {\n      base_DistGraph::numEdges = 0;\n    }\n    assert(base_DistGraph::localToGlobalVector.size() ==\n           base_DistGraph::numNodes);\n    assert(prefixSumOfEdges.size() == base_DistGraph::numNodes);\n\n    // g2l mapping\n    base_DistGraph::globalToLocalMap.reserve(base_DistGraph::numNodes);\n    for (unsigned i = 0; i < base_DistGraph::numNodes; i++) {\n      // global to local map construction\n      base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[i]] =\n          i;\n    }\n    assert(base_DistGraph::globalToLocalMap.size() == base_DistGraph::numNodes);\n\n    base_DistGraph::numNodesWithEdges = base_DistGraph::numOwned;\n  }\n\n  /**\n   * Given a loaded graph, construct the edges in the DistGraph graph.\n   * Variant that constructs edge data as well.\n   *\n   * @tparam GraphTy type of graph to construct\n   *\n   * @param [in,out] graph Graph to construct edges in\n   * @param bGraph Buffered graph that has edges to write into graph in memory\n   */\n  template 
<typename GraphTy,\n            typename std::enable_if<!std::is_void<\n                typename GraphTy::edge_data_type>::value>::type* = nullptr>\n  void edgeCutLoad(GraphTy& graph,\n                   galois::graphs::BufferedGraph<EdgeTy>& bGraph) {\n    if (base_DistGraph::id == 0) {\n      galois::gPrint(\"Loading edge-data while creating edges\\n\");\n    }\n\n    uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first;\n    bGraph.resetReadCounters();\n    galois::StatTimer timer(\"EdgeLoading\", GRNAME);\n    timer.start();\n\n    galois::do_all(\n        galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,\n                        base_DistGraph::gid2host[base_DistGraph::id].second),\n        [&](size_t n) {\n          auto ii       = bGraph.edgeBegin(n);\n          auto ee       = bGraph.edgeEnd(n);\n          uint32_t lsrc = this->G2LEdgeCut(n, globalOffset);\n          uint64_t cur =\n              *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED);\n          for (; ii < ee; ++ii) {\n            auto gdst           = bGraph.edgeDestination(*ii);\n            decltype(gdst) ldst = this->G2LEdgeCut(gdst, globalOffset);\n            auto gdata          = bGraph.edgeData(*ii);\n            graph.constructEdge(cur++, ldst, gdata);\n          }\n          assert(cur == (*graph.edge_end(lsrc)));\n        },\n#if MORE_DIST_STATS\n        galois::loopname(\"EdgeLoadingLoop\"),\n#endif\n        galois::steal(), galois::no_stats());\n\n    timer.stop();\n    galois::gPrint(\"[\", base_DistGraph::id,\n                   \"] Edge loading time: \", timer.get_usec() / 1000000.0f,\n                   \" seconds to read \", bGraph.getBytesRead(), \" bytes (\",\n                   bGraph.getBytesRead() / (float)timer.get_usec(), \" MBPS)\\n\");\n  }\n\n  /**\n   * Given a loaded graph, construct the edges in the DistGraph graph.\n   * No edge data.\n   *\n   * @tparam GraphTy type of graph to construct\n   *\n   * @param [in,out] 
graph Graph to construct edges in\n   * @param bGraph Buffered graph that has edges to write into graph in memory\n   */\n  template <typename GraphTy,\n            typename std::enable_if<std::is_void<\n                typename GraphTy::edge_data_type>::value>::type* = nullptr>\n  void edgeCutLoad(GraphTy& graph,\n                   galois::graphs::BufferedGraph<EdgeTy>& bGraph) {\n    if (base_DistGraph::id == 0) {\n      galois::gPrint(\"Loading edge-data while creating edges\\n\");\n    }\n\n    uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first;\n    bGraph.resetReadCounters();\n    galois::StatTimer timer(\"EdgeLoading\", GRNAME);\n    timer.start();\n\n    galois::do_all(\n        galois::iterate(base_DistGraph::gid2host[base_DistGraph::id].first,\n                        base_DistGraph::gid2host[base_DistGraph::id].second),\n        [&](size_t n) {\n          auto ii       = bGraph.edgeBegin(n);\n          auto ee       = bGraph.edgeEnd(n);\n          uint32_t lsrc = this->G2LEdgeCut(n, globalOffset);\n          uint64_t cur =\n              *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED);\n          for (; ii < ee; ++ii) {\n            auto gdst           = bGraph.edgeDestination(*ii);\n            decltype(gdst) ldst = this->G2LEdgeCut(gdst, globalOffset);\n            graph.constructEdge(cur++, ldst);\n          }\n          assert(cur == (*graph.edge_end(lsrc)));\n        },\n#if MORE_DIST_STATS\n        galois::loopname(\"EdgeLoadingLoop\"),\n#endif\n        galois::steal(), galois::no_stats());\n\n    timer.stop();\n    galois::gPrint(\"[\", base_DistGraph::id,\n                   \"] Edge loading time: \", timer.get_usec() / 1000000.0f,\n                   \" seconds to read \", bGraph.getBytesRead(), \" bytes (\",\n                   bGraph.getBytesRead() / (float)timer.get_usec(), \" MBPS)\\n\");\n  }\n\n  /**\n   * Assign edges to hosts (but don't actually send), and send this information\n   * out to all hosts\n   
* @param[in] bufGraph local graph to read\n   * @param[in,out] numOutgoingEdges specifies which nodes on a host will have\n   * outgoing edges\n   * @param[in,out] hasIncomingEdge indicates which nodes (that need to be\n   * created)on a host have incoming edges\n   */\n  void edgeInspection(galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                      std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n                      std::vector<galois::DynamicBitSet>& hasIncomingEdge,\n                      galois::StatTimer& inspectionTimer) {\n    // number of nodes that this host has read from disk\n    uint32_t numRead = base_DistGraph::gid2host[base_DistGraph::id].second -\n                       base_DistGraph::gid2host[base_DistGraph::id].first;\n\n    // allocate space for outgoing edges\n    for (uint32_t i = 0; i < base_DistGraph::numHosts; ++i) {\n      numOutgoingEdges[i].assign(numRead, 0);\n    }\n\n    galois::DynamicBitSet hostHasOutgoing;\n    hostHasOutgoing.resize(base_DistGraph::numHosts);\n    hostHasOutgoing.reset();\n    assignEdges(bufGraph, numOutgoingEdges, hasIncomingEdge, hostHasOutgoing);\n\n    inspectionTimer.stop();\n    // report edge inspection time\n    uint64_t allBytesRead = bufGraph.getBytesRead();\n    galois::gPrint(\n        \"[\", base_DistGraph::id,\n        \"] Edge inspection time: \", inspectionTimer.get_usec() / 1000000.0f,\n        \" seconds to read \", allBytesRead, \" bytes (\",\n        allBytesRead / (float)inspectionTimer.get_usec(), \" MBPS)\\n\");\n\n    // old inspection barrier\n    // galois::runtime::getHostBarrier().wait();\n\n    sendInspectionData(numOutgoingEdges, hasIncomingEdge, hostHasOutgoing);\n\n    // setup a single hasIncomingEdge bitvector\n\n    uint32_t myHostID = base_DistGraph::id;\n    if (hasIncomingEdge[myHostID].size() == 0) {\n      hasIncomingEdge[myHostID].resize(base_DistGraph::numGlobalNodes);\n      hasIncomingEdge[myHostID].reset();\n    }\n    
recvInspectionData(numOutgoingEdges, hasIncomingEdge[myHostID]);\n    base_DistGraph::increment_evilPhase();\n  }\n\n  /**\n   * Inspect read edges and determine where to send them. Mark metadata as\n   * necessary.\n   *\n   * @param[in] bufGraph local graph to read\n   * @param[in,out] numOutgoingEdges specifies which nodes on a host will have\n   * outgoing edges\n   * @param[in,out] hasIncomingEdge indicates which nodes (that need to be\n   * created)on a host have incoming edges\n   * @param[in,out] hostHasOutgoing bitset tracking which hosts have outgoing\n   * edges from this host\n   */\n  void assignEdges(galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                   std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n                   std::vector<galois::DynamicBitSet>& hasIncomingEdge,\n                   galois::DynamicBitSet& hostHasOutgoing) {\n    std::vector<galois::CopyableAtomic<char>> indicatorVars(\n        base_DistGraph::numHosts);\n    // initialize indicators of initialized bitsets to 0\n    for (unsigned i = 0; i < base_DistGraph::numHosts; i++) {\n      indicatorVars[i] = 0;\n    }\n\n    // global offset into my read nodes\n    uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first;\n    uint32_t globalNodes  = base_DistGraph::numGlobalNodes;\n\n    for (unsigned syncRound = 0; syncRound < _edgeStateRounds; syncRound++) {\n      uint32_t beginNode;\n      uint32_t endNode;\n      std::tie(beginNode, endNode) = galois::block_range(\n          globalOffset, base_DistGraph::gid2host[base_DistGraph::id].second,\n          syncRound, _edgeStateRounds);\n      // TODO maybe edge range this?\n\n      galois::do_all(\n          // iterate over my read nodes\n          galois::iterate(beginNode, endNode),\n          [&](size_t src) {\n            auto ee            = bufGraph.edgeBegin(src);\n            auto ee_end        = bufGraph.edgeEnd(src);\n            uint64_t numEdgesL = std::distance(ee, ee_end);\n\n        
    for (; ee != ee_end; ee++) {\n              uint32_t dst         = bufGraph.edgeDestination(*ee);\n              uint32_t hostBelongs = -1;\n              hostBelongs = graphPartitioner->getEdgeOwner(src, dst, numEdgesL);\n              if (_edgeStateRounds > 1) {\n                hostLoads[hostBelongs] += 1;\n              }\n\n              numOutgoingEdges[hostBelongs][src - globalOffset] += 1;\n              hostHasOutgoing.set(hostBelongs);\n              bool hostIsMasterOfDest =\n                  (hostBelongs == graphPartitioner->retrieveMaster(dst));\n\n              // this means a mirror must be created for destination node on\n              // that host since it will not be created otherwise\n              if (!hostIsMasterOfDest) {\n                auto& bitsetStatus = indicatorVars[hostBelongs];\n\n                // initialize the bitset if necessary\n                if (bitsetStatus == 0) {\n                  char expected = 0;\n                  bool result =\n                      bitsetStatus.compare_exchange_strong(expected, 1);\n                  // i swapped successfully, therefore do allocation\n                  if (result) {\n                    hasIncomingEdge[hostBelongs].resize(globalNodes);\n                    hasIncomingEdge[hostBelongs].reset();\n                    bitsetStatus = 2;\n                  }\n                }\n                // until initialized, loop\n                while (indicatorVars[hostBelongs] != 2)\n                  ;\n                hasIncomingEdge[hostBelongs].set(dst);\n              }\n            }\n          },\n#if MORE_DIST_STATS\n          galois::loopname(\"AssignEdges\"),\n#endif\n          galois::steal(), galois::no_stats());\n      syncEdgeLoad();\n    }\n  }\n\n  /**\n   * Given a vector specifying which nodes have edges for an unspecified\n   * receiver host, save the masters of those nodes (which are known on this\n   * host but not necessarily other hosts) into a vector and serialize it 
for\n   * the receiver to update their master node mapping.\n   *\n   * @param b Send buffer\n   * @param hostOutgoingEdges Number of edges that the receiver of this\n   * vector should expect for each node on this host\n   */\n  void\n  serializeOutgoingMasterMap(galois::runtime::SendBuffer& b,\n                             const std::vector<uint64_t>& hostOutgoingEdges) {\n    // 2 phase: one phase determines amount of work each thread does,\n    // second has threads actually do copies\n    uint32_t activeThreads = galois::getActiveThreads();\n    std::vector<uint64_t> threadPrefixSums(activeThreads);\n    size_t hostSize = base_DistGraph::gid2host[base_DistGraph::id].second -\n                      base_DistGraph::gid2host[base_DistGraph::id].first;\n    assert(hostSize == hostOutgoingEdges.size());\n\n    // for each thread, figure out how many items it will work with\n    // (non-zero outgoing edges)\n    galois::on_each([&](unsigned tid, unsigned nthreads) {\n      size_t beginNode;\n      size_t endNode;\n      std::tie(beginNode, endNode) =\n          galois::block_range((size_t)0, hostSize, tid, nthreads);\n      uint64_t count = 0;\n      for (size_t i = beginNode; i < endNode; i++) {\n        if (hostOutgoingEdges[i] > 0) {\n          count++;\n        }\n      }\n      threadPrefixSums[tid] = count;\n    });\n\n    // get prefix sums\n    for (unsigned int i = 1; i < threadPrefixSums.size(); i++) {\n      threadPrefixSums[i] += threadPrefixSums[i - 1];\n    }\n\n    uint32_t numNonZero = threadPrefixSums[activeThreads - 1];\n    std::vector<uint32_t> masterLocation;\n    masterLocation.resize(numNonZero, (uint32_t)-1);\n    // should only be in here if there's something to send in first place\n    assert(numNonZero > 0);\n\n    uint64_t startNode = base_DistGraph::gid2host[base_DistGraph::id].first;\n\n    // do actual work, second on_each; find non-zeros again, get master\n    // corresponding to that non-zero and send to other end\n    
galois::on_each([&](unsigned tid, unsigned nthreads) {\n      size_t beginNode;\n      size_t endNode;\n      std::tie(beginNode, endNode) =\n          galois::block_range((size_t)0, hostSize, tid, nthreads);\n      // start location to start adding things into prefix sums/vectors\n      uint32_t threadStartLocation = 0;\n      if (tid != 0) {\n        threadStartLocation = threadPrefixSums[tid - 1];\n      }\n\n      uint32_t handledNodes = 0;\n      for (size_t i = beginNode; i < endNode; i++) {\n        if (hostOutgoingEdges[i] > 0) {\n          // get master of i\n          masterLocation[threadStartLocation + handledNodes] =\n              graphPartitioner->retrieveMaster(i + startNode);\n          handledNodes++;\n        }\n      }\n    });\n\n#ifndef NDEBUG\n    for (uint32_t i : masterLocation) {\n      assert(i != (uint32_t)-1);\n    }\n#endif\n\n    // serialize into buffer; since this is sent along with vector receiver end\n    // will know how to deal with it\n    galois::runtime::gSerialize(b, masterLocation);\n  }\n\n  void\n  serializeIncomingMasterMap(galois::runtime::SendBuffer& b,\n                             const galois::DynamicBitSet& hostIncomingEdges) {\n    size_t numOfNodes = hostIncomingEdges.count();\n    std::vector<uint32_t> masterMap;\n    masterMap.resize(numOfNodes, (uint32_t)-1);\n\n    std::vector<uint32_t> bitsetOffsets = hostIncomingEdges.getOffsets();\n\n    // size_t firstBound = base_DistGraph::gid2host[h].first;\n    // size_t secondBound = base_DistGraph::gid2host[h].second;\n\n    // galois::do_all(\n    //  galois::iterate((size_t)0, firstBound),\n    //  [&] (size_t offset) {\n    //    masterMap[offset] =\n    //    graphPartitioner->retrieveMaster(bitsetOffsets[offset]);\n    //  },\n    //  galois::no_stats()\n    //);\n\n    galois::do_all(\n        // galois::iterate((size_t)secondBound, numOfNodes),\n        galois::iterate((size_t)0, numOfNodes),\n        [&](size_t offset) {\n          masterMap[offset] =\n      
        graphPartitioner->retrieveMaster(bitsetOffsets[offset]);\n        },\n        galois::no_stats());\n\n#ifndef NDEBUG\n    for (uint32_t i : masterMap) {\n      assert(i != (uint32_t)-1);\n      assert(i < base_DistGraph::numHosts);\n    }\n#endif\n\n    // serialize into buffer; since this is sent along with vector receiver end\n    // will know how to deal with it\n    galois::runtime::gSerialize(b, masterMap);\n  }\n\n  void deserializeOutgoingMasterMap(\n      uint32_t senderHost, const std::vector<uint64_t>& hostOutgoingEdges,\n      const std::vector<uint32_t>& recvMasterLocations) {\n    uint64_t hostOffset = base_DistGraph::gid2host[senderHost].first;\n    size_t hostSize     = base_DistGraph::gid2host[senderHost].second -\n                      base_DistGraph::gid2host[senderHost].first;\n    assert(hostSize == hostOutgoingEdges.size());\n    galois::DynamicBitSet offsetsToConsider;\n    offsetsToConsider.resize(hostSize);\n    offsetsToConsider.reset();\n\n    // step 1: figure out offsets that need to be handled (i.e. 
non-zero): only\n    // handle if not already in map\n    galois::do_all(\n        galois::iterate((size_t)0, hostOutgoingEdges.size()),\n        [&](size_t offset) {\n          if (hostOutgoingEdges[offset] > 0) {\n            offsetsToConsider.set(offset);\n          }\n        },\n        galois::no_stats(), galois::steal());\n    assert(offsetsToConsider.count() == recvMasterLocations.size());\n\n    // step 2: using bitset that tells which offsets are set, add\n    // to already master map in partitioner (this is single threaded\n    // since map is not a concurrent data structure)\n    size_t curCount = 0;\n    // size_t actuallySet = 0;\n    for (uint32_t offset : offsetsToConsider.getOffsets()) {\n      // galois::gDebug(\"[\", base_DistGraph::id, \"] \", \" setting \",\n      //               offset + hostOffset, \" from host \", senderHost,\n      //               \" to \", recvMasterLocations[curCount]);\n      graphPartitioner->addMasterMapping(offset + hostOffset,\n                                         recvMasterLocations[curCount]);\n      // bool set = graphPartitioner->addMasterMapping(offset + hostOffset,\n      //                                          recvMasterLocations[curCount]);\n      // if (set) { actuallySet++; }\n      curCount++;\n    }\n\n    // galois::gDebug(\"[\", base_DistGraph::id, \"] host \", senderHost, \": set \",\n    //               actuallySet, \" out of \", recvMasterLocations.size());\n  }\n\n  /**\n   * Map GIDs to masters from incoming master map sent from hosts.\n   *\n   * @param senderHost host that sent the data\n   * @param gids GIDs corresponding to the received master locations\n   * @param recvMasterLocations masters of GIDs in the gids vector\n   */\n  void deserializeIncomingMasterMap(\n      const std::vector<uint32_t>& gids,\n      const std::vector<uint32_t>& recvMasterLocations) {\n    assert(gids.size() == recvMasterLocations.size());\n    size_t curCount = 0;\n    for (uint64_t gid : gids) {\n      
assert(gid < base_DistGraph::numGlobalNodes);\n      // galois::gDebug(\"[\", base_DistGraph::id, \"] \", \" in-setting \", gid, \" to\n      // \",\n      //               recvMasterLocations[curCount]);\n      graphPartitioner->addMasterMapping(gid, recvMasterLocations[curCount]);\n      curCount++;\n    }\n  }\n\n  /**\n   * Send data out from inspection to other hosts.\n   *\n   * @param[in,out] numOutgoingEdges specifies which nodes on a host will have\n   * outgoing edges\n   * @param[in,out] hasIncomingEdge indicates which nodes (that need to be\n   * created)on a host have incoming edges\n   * @param[in] hostHasOutgoing bitset tracking which hosts have outgoing\n   * edges from this host\n   */\n  void sendInspectionData(std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n                          std::vector<galois::DynamicBitSet>& hasIncomingEdge,\n                          galois::DynamicBitSet& hostHasOutgoing) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    galois::GAccumulator<uint64_t> bytesSent;\n    bytesSent.reset();\n\n    for (unsigned h = 0; h < net.Num; h++) {\n      if (h == net.ID) {\n        // i have no outgoing edges i will keep; go ahead and clear\n        if (!hostHasOutgoing.test(h)) {\n          numOutgoingEdges[h].clear();\n        }\n        continue;\n      }\n      // send outgoing edges data off to comm partner\n      galois::runtime::SendBuffer b;\n\n      // only send if non-zeros exist\n      if (hostHasOutgoing.test(h)) {\n        galois::runtime::gSerialize(b, 1); // token saying data exists\n        galois::runtime::gSerialize(b, numOutgoingEdges[h]);\n        if (graphPartitioner->masterAssignPhase()) {\n          serializeOutgoingMasterMap(b, numOutgoingEdges[h]);\n        }\n      } else {\n        galois::runtime::gSerialize(b, 0); // token saying no data exists\n      }\n      numOutgoingEdges[h].clear();\n\n      // determine form to send bitset in\n      galois::DynamicBitSet& curBitset = 
hasIncomingEdge[h];\n      uint64_t bitsetSize              = curBitset.size(); // num bits\n      uint64_t onlyOffsetsSize         = curBitset.count() * 32;\n      if (bitsetSize == 0) {\n        // there was nothing there to send in first place\n        galois::runtime::gSerialize(b, 0);\n      } else if (onlyOffsetsSize <= bitsetSize) {\n        // send only offsets\n        std::vector<uint32_t> offsets = curBitset.getOffsets();\n        galois::runtime::gSerialize(b, 2); // 2 = only offsets\n        galois::runtime::gSerialize(b, offsets);\n\n        if (graphPartitioner->masterAssignPhase()) {\n          // galois::gDebug(\"incoming master map serialization\");\n          // serializeIncomingMasterMap(b, curBitset, h);\n          serializeIncomingMasterMap(b, curBitset);\n        }\n      } else {\n        // send entire bitset\n        galois::runtime::gSerialize(b, 1);\n        galois::runtime::gSerialize(b, curBitset);\n        if (graphPartitioner->masterAssignPhase()) {\n          // galois::gDebug(\"incoming master map serialization\");\n          // serializeIncomingMasterMap(b, curBitset, h);\n          serializeIncomingMasterMap(b, curBitset);\n        }\n      }\n      // get memory from bitset back\n      curBitset.resize(0);\n\n      bytesSent.update(b.size());\n\n      // send buffer and free memory\n      net.sendTagged(h, galois::runtime::evilPhase, b);\n      b.getVec().clear();\n    }\n\n    galois::runtime::reportStat_Tsum(\n        GRNAME, std::string(\"EdgeInspectionBytesSent\"), bytesSent.reduce());\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Inspection sends complete.\\n\");\n  }\n\n  /**\n   * Receive data from inspection from other hosts. 
Processes the incoming\n   * edge bitsets/offsets.\n   *\n   * @param[in,out] numOutgoingEdges specifies which nodes on a host will have\n   * outgoing edges\n   * @param[in,out] hasIncomingEdge indicates which nodes (that need to be\n   * created) on this host have incoming edges\n   */\n  void recvInspectionData(std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n                          galois::DynamicBitSet& hasIncomingEdge) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    for (unsigned h = 0; h < net.Num - 1; h++) {\n      // expect data from comm partner back\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n\n      uint32_t sendingHost = p->first;\n\n      // get outgoing edges; first get status var\n      uint32_t outgoingExists = 2;\n      galois::runtime::gDeserialize(p->second, outgoingExists);\n\n      if (outgoingExists == 1) {\n        // actual data sent\n        galois::runtime::gDeserialize(p->second, numOutgoingEdges[sendingHost]);\n\n        if (graphPartitioner->masterAssignPhase()) {\n          std::vector<uint32_t> recvMasterLocations;\n          galois::runtime::gDeserialize(p->second, recvMasterLocations);\n          deserializeOutgoingMasterMap(\n              sendingHost, numOutgoingEdges[sendingHost], recvMasterLocations);\n        }\n      } else if (outgoingExists == 0) {\n        // no data sent; just clear again\n        numOutgoingEdges[sendingHost].clear();\n      } else {\n        GALOIS_DIE(\"invalid recv inspection data metadata mode, outgoing\");\n      }\n\n      uint32_t bitsetMetaMode = 3; // initialize to invalid mode\n      galois::runtime::gDeserialize(p->second, bitsetMetaMode);\n      if (bitsetMetaMode == 1) {\n        // sent as bitset; deserialize then or with main bitset\n        galois::DynamicBitSet recvSet;\n        galois::runtime::gDeserialize(p->second, 
recvSet);\n        hasIncomingEdge.bitwise_or(recvSet);\n\n        if (graphPartitioner->masterAssignPhase()) {\n          std::vector<uint32_t> recvMasterLocations;\n          galois::runtime::gDeserialize(p->second, recvMasterLocations);\n          deserializeIncomingMasterMap(recvSet.getOffsets(),\n                                       recvMasterLocations);\n        }\n      } else if (bitsetMetaMode == 2) {\n        // sent as vector of offsets\n        std::vector<uint32_t> recvOffsets;\n        galois::runtime::gDeserialize(p->second, recvOffsets);\n        for (uint32_t offset : recvOffsets) {\n          hasIncomingEdge.set(offset);\n        }\n\n        if (graphPartitioner->masterAssignPhase()) {\n          std::vector<uint32_t> recvMasterLocations;\n          galois::runtime::gDeserialize(p->second, recvMasterLocations);\n          deserializeIncomingMasterMap(recvOffsets, recvMasterLocations);\n        }\n      } else if (bitsetMetaMode == 0) {\n        // do nothing; there was nothing to receive\n      } else {\n        GALOIS_DIE(\"invalid recv inspection data metadata mode\");\n      }\n    }\n\n    galois::gPrint(\"[\", base_DistGraph::id,\n                   \"] Inspection receives complete.\\n\");\n  }\n\n  /**\n   * Take inspection metadata and being mapping nodes/creating prefix sums,\n   * return the prefix sum.\n   */\n  galois::gstl::Vector<uint64_t>\n  nodeMapping(std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n              galois::DynamicBitSet& hasIncomingEdge,\n              galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    base_DistGraph::numNodes = 0;\n    base_DistGraph::numEdges = 0;\n    nodesToReceive           = 0;\n\n    // reserve overestimation of nodes\n    prefixSumOfEdges.reserve(base_DistGraph::numGlobalNodes /\n                             base_DistGraph::numHosts * 1.15);\n    base_DistGraph::localToGlobalVector.reserve(\n        base_DistGraph::numGlobalNodes / base_DistGraph::numHosts * 1.15);\n\n    
inspectMasterNodes(numOutgoingEdges, prefixSumOfEdges);\n    inspectOutgoingNodes(numOutgoingEdges, prefixSumOfEdges);\n    createIntermediateMetadata(prefixSumOfEdges, hasIncomingEdge.count());\n    inspectIncomingNodes(hasIncomingEdge, prefixSumOfEdges);\n    finalizeInspection(prefixSumOfEdges);\n\n    galois::gDebug(\"[\", base_DistGraph::id,\n                   \"] To receive this many nodes: \", nodesToReceive);\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Inspection mapping complete.\\n\");\n    return prefixSumOfEdges;\n  }\n\n  /**\n   * Inspect master nodes; loop over all nodes, determine if master; if is,\n   * create mapping + get num edges\n   */\n  void inspectMasterNodes(std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n                          galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    uint32_t myHID = base_DistGraph::id;\n\n    galois::GAccumulator<uint32_t> toReceive;\n    toReceive.reset();\n\n    for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) {\n      uint32_t activeThreads = galois::getActiveThreads();\n      std::vector<uint64_t> threadPrefixSums(activeThreads);\n      uint64_t startNode = base_DistGraph::gid2host[h].first;\n      uint64_t lastNode  = base_DistGraph::gid2host[h].second;\n      size_t hostSize    = lastNode - startNode;\n\n      if (numOutgoingEdges[h].size() != 0) {\n        assert(hostSize == numOutgoingEdges[h].size());\n      }\n\n      // for each thread, figure out how many items it will work with (only\n      // owned nodes)\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n        size_t beginNode;\n        size_t endNode;\n        // loop over all nodes that host h has read\n        std::tie(beginNode, endNode) =\n            galois::block_range((size_t)0, hostSize, tid, nthreads);\n        uint64_t count = 0;\n        for (size_t i = beginNode; i < endNode; i++) {\n          // galois::gDebug(\"[\", base_DistGraph::id, \"] \", i + startNode,\n          //               
\" mapped to \",\n          //               graphPartitioner->retrieveMaster(i+startNode));\n          if (graphPartitioner->retrieveMaster(i + startNode) == myHID) {\n            count++;\n          }\n        }\n        threadPrefixSums[tid] = count;\n      });\n\n      // get prefix sums\n      for (unsigned int i = 1; i < threadPrefixSums.size(); i++) {\n        threadPrefixSums[i] += threadPrefixSums[i - 1];\n      }\n\n      assert(prefixSumOfEdges.size() == base_DistGraph::numNodes);\n      assert(base_DistGraph::localToGlobalVector.size() ==\n             base_DistGraph::numNodes);\n\n      uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1];\n      galois::gDebug(\"[\", base_DistGraph::id, \"] This many masters from host \",\n                     h, \": \", newMasterNodes);\n      uint32_t startingNodeIndex = base_DistGraph::numNodes;\n      // increase size of prefix sum + mapping vector\n      prefixSumOfEdges.resize(base_DistGraph::numNodes + newMasterNodes);\n      base_DistGraph::localToGlobalVector.resize(base_DistGraph::numNodes +\n                                                 newMasterNodes);\n\n      if (newMasterNodes > 0) {\n        // do actual work, second on_each\n        galois::on_each([&](unsigned tid, unsigned nthreads) {\n          size_t beginNode;\n          size_t endNode;\n          std::tie(beginNode, endNode) =\n              galois::block_range((size_t)0, hostSize, tid, nthreads);\n\n          // start location to start adding things into prefix sums/vectors\n          uint32_t threadStartLocation = 0;\n          if (tid != 0) {\n            threadStartLocation = threadPrefixSums[tid - 1];\n          }\n\n          uint32_t handledNodes = 0;\n          for (size_t i = beginNode; i < endNode; i++) {\n            uint32_t globalID = startNode + i;\n            // if this node is master, get outgoing edges + save mapping\n            if (graphPartitioner->retrieveMaster(globalID) == myHID) {\n              // check 
size\n              if (numOutgoingEdges[h].size() > 0) {\n                uint64_t myEdges       = numOutgoingEdges[h][i];\n                numOutgoingEdges[h][i] = 0; // set to 0; does not need to be\n                                            // handled later\n                prefixSumOfEdges[startingNodeIndex + threadStartLocation +\n                                 handledNodes] = myEdges;\n                if (myEdges > 0 && h != myHID) {\n                  toReceive += 1;\n                }\n              } else {\n                prefixSumOfEdges[startingNodeIndex + threadStartLocation +\n                                 handledNodes] = 0;\n              }\n\n              base_DistGraph::localToGlobalVector[startingNodeIndex +\n                                                  threadStartLocation +\n                                                  handledNodes] = globalID;\n              handledNodes++;\n            }\n          }\n        });\n        base_DistGraph::numNodes += newMasterNodes;\n      }\n    }\n\n    nodesToReceive += toReceive.reduce();\n    // masters have been handled\n    base_DistGraph::numOwned = base_DistGraph::numNodes;\n  }\n\n  /**\n   * Outgoing inspection: loop over all nodes, determnine if outgoing exists;\n   * if does, create mapping, get edges\n   */\n  void\n  inspectOutgoingNodes(std::vector<std::vector<uint64_t>>& numOutgoingEdges,\n                       galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    uint32_t myHID = base_DistGraph::id;\n\n    galois::GAccumulator<uint32_t> toReceive;\n    toReceive.reset();\n\n    for (unsigned h = 0; h < base_DistGraph::numHosts; ++h) {\n      size_t hostSize = numOutgoingEdges[h].size();\n      // if i got no outgoing info from this host, safely continue to next one\n      if (hostSize == 0) {\n        continue;\n      }\n\n      uint32_t activeThreads = galois::getActiveThreads();\n      std::vector<uint64_t> threadPrefixSums(activeThreads);\n\n      // for each thread, 
figure out how many items it will work with (only\n      // owned nodes)\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n        size_t beginNode;\n        size_t endNode;\n        std::tie(beginNode, endNode) =\n            galois::block_range((size_t)0, hostSize, tid, nthreads);\n        uint64_t count = 0;\n        for (size_t i = beginNode; i < endNode; i++) {\n          if (numOutgoingEdges[h][i] > 0) {\n            count++;\n          }\n        }\n        threadPrefixSums[tid] = count;\n      });\n\n      // get prefix sums\n      for (unsigned int i = 1; i < threadPrefixSums.size(); i++) {\n        threadPrefixSums[i] += threadPrefixSums[i - 1];\n      }\n\n      assert(prefixSumOfEdges.size() == base_DistGraph::numNodes);\n      assert(base_DistGraph::localToGlobalVector.size() ==\n             base_DistGraph::numNodes);\n\n      uint32_t newOutgoingNodes = threadPrefixSums[activeThreads - 1];\n      // increase size of prefix sum + mapping vector\n      prefixSumOfEdges.resize(base_DistGraph::numNodes + newOutgoingNodes);\n      base_DistGraph::localToGlobalVector.resize(base_DistGraph::numNodes +\n                                                 newOutgoingNodes);\n\n      uint64_t startNode         = base_DistGraph::gid2host[h].first;\n      uint32_t startingNodeIndex = base_DistGraph::numNodes;\n\n      if (newOutgoingNodes > 0) {\n        // do actual work, second on_each\n        galois::on_each([&](unsigned tid, unsigned nthreads) {\n          size_t beginNode;\n          size_t endNode;\n          std::tie(beginNode, endNode) =\n              galois::block_range((size_t)0, hostSize, tid, nthreads);\n\n          // start location to start adding things into prefix sums/vectors\n          uint32_t threadStartLocation = 0;\n          if (tid != 0) {\n            threadStartLocation = threadPrefixSums[tid - 1];\n          }\n\n          uint32_t handledNodes = 0;\n\n          for (size_t i = beginNode; i < endNode; i++) {\n            
uint64_t myEdges = numOutgoingEdges[h][i];\n            if (myEdges > 0) {\n              prefixSumOfEdges[startingNodeIndex + threadStartLocation +\n                               handledNodes]                    = myEdges;\n              base_DistGraph::localToGlobalVector[startingNodeIndex +\n                                                  threadStartLocation +\n                                                  handledNodes] = startNode + i;\n              handledNodes++;\n\n              if (myEdges > 0 && h != myHID) {\n                toReceive += 1;\n              }\n            }\n          }\n        });\n        base_DistGraph::numNodes += newOutgoingNodes;\n      }\n      // don't need anymore after this point; get memory back\n      numOutgoingEdges[h].clear();\n    }\n\n    nodesToReceive += toReceive.reduce();\n    base_DistGraph::numNodesWithEdges = base_DistGraph::numNodes;\n  }\n\n  /**\n   * Create a part of the global to local map (it's missing the incoming\n   * mirrors with no edges) + part of prefix sum\n   *\n   * @param[in, out] prefixSumOfEdges edge prefix sum to build\n   * @param[in] incomingEstimate estimate of number of incoming nodes to build\n   */\n  void\n  createIntermediateMetadata(galois::gstl::Vector<uint64_t>& prefixSumOfEdges,\n                             const uint64_t incomingEstimate) {\n    if (base_DistGraph::numNodes == 0) {\n      return;\n    }\n    base_DistGraph::globalToLocalMap.reserve(base_DistGraph::numNodesWithEdges +\n                                             incomingEstimate);\n    base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[0]] =\n        0;\n    // global to local map construction using num nodes with edges\n    for (unsigned i = 1; i < base_DistGraph::numNodesWithEdges; i++) {\n      prefixSumOfEdges[i] += prefixSumOfEdges[i - 1];\n      base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[i]] =\n          i;\n    }\n  }\n\n  /**\n   * incoming node creation if 
is doesn't already exist + if actually amrked\n   * as having incoming node\n   */\n  void inspectIncomingNodes(galois::DynamicBitSet& hasIncomingEdge,\n                            galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    uint32_t totalNumNodes = base_DistGraph::numGlobalNodes;\n\n    uint32_t activeThreads = galois::getActiveThreads();\n    std::vector<uint64_t> threadPrefixSums(activeThreads);\n\n    galois::on_each([&](unsigned tid, unsigned nthreads) {\n      size_t beginNode;\n      size_t endNode;\n      std::tie(beginNode, endNode) =\n          galois::block_range(0u, totalNumNodes, tid, nthreads);\n      uint64_t count = 0;\n      for (size_t i = beginNode; i < endNode; i++) {\n        // only count if doesn't exist in global/local map + is incoming\n        // edge\n        if (hasIncomingEdge.test(i) &&\n            !base_DistGraph::globalToLocalMap.count(i))\n          ++count;\n      }\n      threadPrefixSums[tid] = count;\n    });\n    // get prefix sums\n    for (unsigned int i = 1; i < threadPrefixSums.size(); i++) {\n      threadPrefixSums[i] += threadPrefixSums[i - 1];\n    }\n\n    assert(prefixSumOfEdges.size() == base_DistGraph::numNodes);\n    assert(base_DistGraph::localToGlobalVector.size() ==\n           base_DistGraph::numNodes);\n\n    uint32_t newIncomingNodes = threadPrefixSums[activeThreads - 1];\n    // increase size of prefix sum + mapping vector\n    prefixSumOfEdges.resize(base_DistGraph::numNodes + newIncomingNodes);\n    base_DistGraph::localToGlobalVector.resize(base_DistGraph::numNodes +\n                                               newIncomingNodes);\n\n    uint32_t startingNodeIndex = base_DistGraph::numNodes;\n\n    if (newIncomingNodes > 0) {\n      // do actual work, second on_each\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n        size_t beginNode;\n        size_t endNode;\n        std::tie(beginNode, endNode) =\n            galois::block_range(0u, totalNumNodes, tid, nthreads);\n\n    
    // start location to start adding things into prefix sums/vectors\n        uint32_t threadStartLocation = 0;\n        if (tid != 0) {\n          threadStartLocation = threadPrefixSums[tid - 1];\n        }\n\n        uint32_t handledNodes = 0;\n\n        for (size_t i = beginNode; i < endNode; i++) {\n          if (hasIncomingEdge.test(i) &&\n              !base_DistGraph::globalToLocalMap.count(i)) {\n            prefixSumOfEdges[startingNodeIndex + threadStartLocation +\n                             handledNodes]                    = 0;\n            base_DistGraph::localToGlobalVector[startingNodeIndex +\n                                                threadStartLocation +\n                                                handledNodes] = i;\n            handledNodes++;\n          }\n        }\n      });\n      base_DistGraph::numNodes += newIncomingNodes;\n    }\n  }\n\n  /**\n   * finalize metadata maps\n   */\n  void finalizeInspection(galois::gstl::Vector<uint64_t>& prefixSumOfEdges) {\n    // reserve rest of memory needed\n    base_DistGraph::globalToLocalMap.reserve(base_DistGraph::numNodes);\n    for (unsigned i = base_DistGraph::numNodesWithEdges;\n         i < base_DistGraph::numNodes; i++) {\n      // finalize prefix sum\n      prefixSumOfEdges[i] += prefixSumOfEdges[i - 1];\n      // global to local map construction\n      base_DistGraph::globalToLocalMap[base_DistGraph::localToGlobalVector[i]] =\n          i;\n    }\n    if (prefixSumOfEdges.size() != 0) {\n      base_DistGraph::numEdges = prefixSumOfEdges.back();\n    } else {\n      base_DistGraph::numEdges = 0;\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Fill up mirror arrays.\n   * TODO make parallel?\n   */\n  void fillMirrors() {\n    base_DistGraph::mirrorNodes.reserve(base_DistGraph::numNodes -\n                                        base_DistGraph::numOwned);\n    for (uint32_t i = base_DistGraph::numOwned; i < 
base_DistGraph::numNodes;\n         i++) {\n      uint32_t globalID = base_DistGraph::localToGlobalVector[i];\n      base_DistGraph::mirrorNodes[graphPartitioner->retrieveMaster(globalID)]\n          .push_back(globalID);\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n\n  template <typename GraphTy>\n  void loadEdges(GraphTy& graph,\n                 galois::graphs::BufferedGraph<EdgeTy>& bufGraph) {\n    if (base_DistGraph::id == 0) {\n      if (std::is_void<typename GraphTy::edge_data_type>::value) {\n        fprintf(stderr, \"Loading void edge-data while creating edges.\\n\");\n      } else {\n        fprintf(stderr, \"Loading edge-data while creating edges.\\n\");\n      }\n    }\n\n    bufGraph.resetReadCounters();\n\n    std::atomic<uint32_t> receivedNodes;\n    receivedNodes.store(0);\n\n    galois::StatTimer loadEdgeTimer(\"EdgeLoading\", GRNAME);\n    loadEdgeTimer.start();\n\n    // sends data\n    sendEdges(graph, bufGraph, receivedNodes);\n    uint64_t bufBytesRead = bufGraph.getBytesRead();\n    // get data from graph back (don't need it after sending things out)\n    bufGraph.resetAndFree();\n\n    // receives data\n    galois::on_each(\n        [&](unsigned, unsigned) { receiveEdges(graph, receivedNodes); });\n    base_DistGraph::increment_evilPhase();\n\n    loadEdgeTimer.stop();\n\n    galois::gPrint(\"[\", base_DistGraph::id, \"] Edge loading time: \",\n                   loadEdgeTimer.get_usec() / 1000000.0f, \" seconds to read \",\n                   bufBytesRead, \" bytes (\",\n                   bufBytesRead / (float)loadEdgeTimer.get_usec(), \" MBPS)\\n\");\n  }\n\n  // Edge type is not void. (i.e. 
edge data exists)\n  template <typename GraphTy,\n            typename std::enable_if<!std::is_void<\n                typename GraphTy::edge_data_type>::value>::type* = nullptr>\n  void sendEdges(GraphTy& graph,\n                 galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                 std::atomic<uint32_t>& receivedNodes) {\n    using DstVecType = std::vector<std::vector<uint64_t>>;\n    using DataVecType =\n        std::vector<std::vector<typename GraphTy::edge_data_type>>;\n    using SendBufferVecTy = std::vector<galois::runtime::SendBuffer>;\n\n    galois::substrate::PerThreadStorage<DstVecType> gdst_vecs(\n        base_DistGraph::numHosts);\n    galois::substrate::PerThreadStorage<DataVecType> gdata_vecs(\n        base_DistGraph::numHosts);\n    galois::substrate::PerThreadStorage<SendBufferVecTy> sendBuffers(\n        base_DistGraph::numHosts);\n\n    auto& net                = galois::runtime::getSystemNetworkInterface();\n    const unsigned& id       = this->base_DistGraph::id;\n    const unsigned& numHosts = this->base_DistGraph::numHosts;\n\n    galois::GAccumulator<uint64_t> messagesSent;\n    galois::GAccumulator<uint64_t> bytesSent;\n    galois::GReduceMax<uint64_t> maxBytesSent;\n    messagesSent.reset();\n    bytesSent.reset();\n    maxBytesSent.reset();\n\n    for (unsigned syncRound = 0; syncRound < _edgeStateRounds; syncRound++) {\n      uint32_t beginNode;\n      uint32_t endNode;\n      std::tie(beginNode, endNode) = galois::block_range(\n          base_DistGraph::gid2host[base_DistGraph::id].first,\n          base_DistGraph::gid2host[base_DistGraph::id].second, syncRound,\n          _edgeStateRounds);\n\n      // Go over assigned nodes and distribute edges.\n      galois::do_all(\n          galois::iterate(beginNode, endNode),\n          [&](uint64_t src) {\n            uint32_t lsrc    = 0;\n            uint64_t curEdge = 0;\n            if (base_DistGraph::isLocal(src)) {\n              lsrc = this->G2L(src);\n              curEdge 
=\n                  *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED);\n            }\n\n            auto ee            = bufGraph.edgeBegin(src);\n            auto ee_end        = bufGraph.edgeEnd(src);\n            uint64_t numEdgesL = std::distance(ee, ee_end);\n            auto& gdst_vec     = *gdst_vecs.getLocal();\n            auto& gdata_vec    = *gdata_vecs.getLocal();\n\n            for (unsigned i = 0; i < numHosts; ++i) {\n              gdst_vec[i].clear();\n              gdata_vec[i].clear();\n              gdst_vec[i].reserve(numEdgesL);\n              // gdata_vec[i].reserve(numEdgesL);\n            }\n\n            for (; ee != ee_end; ++ee) {\n              uint32_t gdst = bufGraph.edgeDestination(*ee);\n              auto gdata    = bufGraph.edgeData(*ee);\n\n              uint32_t hostBelongs =\n                  graphPartitioner->getEdgeOwner(src, gdst, numEdgesL);\n              if (_edgeStateRounds > 1) {\n                hostLoads[hostBelongs] += 1;\n              }\n\n              if (hostBelongs == id) {\n                // edge belongs here, construct on self\n                assert(base_DistGraph::isLocal(src));\n                uint32_t ldst = this->G2L(gdst);\n                graph.constructEdge(curEdge++, ldst, gdata);\n                // TODO\n                // if ldst is an outgoing mirror, this is vertex cut\n              } else {\n                // add to host vector to send out later\n                gdst_vec[hostBelongs].push_back(gdst);\n                gdata_vec[hostBelongs].push_back(gdata);\n              }\n            }\n\n            // make sure all edges accounted for if local\n            if (base_DistGraph::isLocal(src)) {\n              assert(curEdge == (*graph.edge_end(lsrc)));\n            }\n\n            // send\n            for (uint32_t h = 0; h < numHosts; ++h) {\n              if (h == id)\n                continue;\n\n              if (gdst_vec[h].size() > 0) {\n                auto& b = 
(*sendBuffers.getLocal())[h];\n                galois::runtime::gSerialize(b, src);\n                galois::runtime::gSerialize(b, gdst_vec[h]);\n                galois::runtime::gSerialize(b, gdata_vec[h]);\n\n                // send if over limit\n                if (b.size() > edgePartitionSendBufSize) {\n                  messagesSent += 1;\n                  bytesSent.update(b.size());\n                  maxBytesSent.update(b.size());\n\n                  net.sendTagged(h, galois::runtime::evilPhase, b);\n                  b.getVec().clear();\n                  b.getVec().reserve(edgePartitionSendBufSize * 1.25);\n                }\n              }\n            }\n\n            // overlap receives\n            auto buffer =\n                net.recieveTagged(galois::runtime::evilPhase, nullptr);\n            this->processReceivedEdgeBuffer(buffer, graph, receivedNodes);\n          },\n#if MORE_DIST_STATS\n          galois::loopname(\"EdgeLoadingLoop\"),\n#endif\n          galois::steal(), galois::no_stats());\n      syncEdgeLoad();\n      // printEdgeLoad();\n    }\n\n    // flush buffers\n    for (unsigned threadNum = 0; threadNum < sendBuffers.size(); ++threadNum) {\n      auto& sbr = *sendBuffers.getRemote(threadNum);\n      for (unsigned h = 0; h < this->base_DistGraph::numHosts; ++h) {\n        if (h == this->base_DistGraph::id)\n          continue;\n        auto& sendBuffer = sbr[h];\n        if (sendBuffer.size() > 0) {\n          messagesSent += 1;\n          bytesSent.update(sendBuffer.size());\n          maxBytesSent.update(sendBuffer.size());\n\n          net.sendTagged(h, galois::runtime::evilPhase, sendBuffer);\n          sendBuffer.getVec().clear();\n        }\n      }\n    }\n\n    net.flush();\n\n    galois::runtime::reportStat_Tsum(\n        GRNAME, std::string(\"EdgeLoadingMessagesSent\"), messagesSent.reduce());\n    galois::runtime::reportStat_Tsum(\n        GRNAME, std::string(\"EdgeLoadingBytesSent\"), bytesSent.reduce());\n    
galois::runtime::reportStat_Tmax(\n        GRNAME, std::string(\"EdgeLoadingMaxBytesSent\"), maxBytesSent.reduce());\n  }\n\n  // no edge data version\n  template <typename GraphTy,\n            typename std::enable_if<std::is_void<\n                typename GraphTy::edge_data_type>::value>::type* = nullptr>\n  void sendEdges(GraphTy& graph,\n                 galois::graphs::BufferedGraph<EdgeTy>& bufGraph,\n                 std::atomic<uint32_t>& receivedNodes) {\n    using DstVecType      = std::vector<std::vector<uint64_t>>;\n    using SendBufferVecTy = std::vector<galois::runtime::SendBuffer>;\n\n    galois::substrate::PerThreadStorage<DstVecType> gdst_vecs(\n        base_DistGraph::numHosts);\n    galois::substrate::PerThreadStorage<SendBufferVecTy> sendBuffers(\n        base_DistGraph::numHosts);\n\n    auto& net                = galois::runtime::getSystemNetworkInterface();\n    const unsigned& id       = this->base_DistGraph::id;\n    const unsigned& numHosts = this->base_DistGraph::numHosts;\n\n    galois::GAccumulator<uint64_t> messagesSent;\n    galois::GAccumulator<uint64_t> bytesSent;\n    galois::GReduceMax<uint64_t> maxBytesSent;\n    messagesSent.reset();\n    bytesSent.reset();\n    maxBytesSent.reset();\n\n    for (unsigned syncRound = 0; syncRound < _edgeStateRounds; syncRound++) {\n      uint64_t beginNode;\n      uint64_t endNode;\n      std::tie(beginNode, endNode) = galois::block_range(\n          base_DistGraph::gid2host[base_DistGraph::id].first,\n          base_DistGraph::gid2host[base_DistGraph::id].second, syncRound,\n          _edgeStateRounds);\n\n      // Go over assigned nodes and distribute edges.\n      galois::do_all(\n          galois::iterate(beginNode, endNode),\n          [&](uint64_t src) {\n            uint32_t lsrc    = 0;\n            uint64_t curEdge = 0;\n            if (base_DistGraph::isLocal(src)) {\n              lsrc = this->G2L(src);\n              curEdge =\n                  *graph.edge_begin(lsrc, 
galois::MethodFlag::UNPROTECTED);\n            }\n\n            auto ee            = bufGraph.edgeBegin(src);\n            auto ee_end        = bufGraph.edgeEnd(src);\n            uint64_t numEdgesL = std::distance(ee, ee_end);\n            auto& gdst_vec     = *gdst_vecs.getLocal();\n\n            for (unsigned i = 0; i < numHosts; ++i) {\n              gdst_vec[i].clear();\n              // gdst_vec[i].reserve(numEdgesL);\n            }\n\n            for (; ee != ee_end; ++ee) {\n              uint32_t gdst = bufGraph.edgeDestination(*ee);\n              uint32_t hostBelongs =\n                  graphPartitioner->getEdgeOwner(src, gdst, numEdgesL);\n              if (_edgeStateRounds > 1) {\n                hostLoads[hostBelongs] += 1;\n              }\n\n              if (hostBelongs == id) {\n                // edge belongs here, construct on self\n                assert(base_DistGraph::isLocal(src));\n                uint32_t ldst = this->G2L(gdst);\n                graph.constructEdge(curEdge++, ldst);\n                // TODO\n                // if ldst is an outgoing mirror, this is vertex cut\n              } else {\n                // add to host vector to send out later\n                gdst_vec[hostBelongs].push_back(gdst);\n              }\n            }\n\n            // make sure all edges accounted for if local\n            if (base_DistGraph::isLocal(src)) {\n              assert(curEdge == (*graph.edge_end(lsrc)));\n            }\n\n            // send\n            for (uint32_t h = 0; h < numHosts; ++h) {\n              if (h == id)\n                continue;\n\n              if (gdst_vec[h].size() > 0) {\n                auto& b = (*sendBuffers.getLocal())[h];\n                galois::runtime::gSerialize(b, src);\n                galois::runtime::gSerialize(b, gdst_vec[h]);\n\n                // send if over limit\n                if (b.size() > edgePartitionSendBufSize) {\n                  messagesSent += 1;\n                  
bytesSent.update(b.size());\n                  maxBytesSent.update(b.size());\n\n                  net.sendTagged(h, galois::runtime::evilPhase, b);\n                  b.getVec().clear();\n                  b.getVec().reserve(edgePartitionSendBufSize * 1.25);\n                }\n              }\n            }\n\n            // overlap receives\n            auto buffer =\n                net.recieveTagged(galois::runtime::evilPhase, nullptr);\n            this->processReceivedEdgeBuffer(buffer, graph, receivedNodes);\n          },\n#if MORE_DIST_STATS\n          galois::loopname(\"EdgeLoading\"),\n#endif\n          galois::steal(), galois::no_stats());\n      syncEdgeLoad();\n      // printEdgeLoad();\n    }\n\n    // flush buffers\n    for (unsigned threadNum = 0; threadNum < sendBuffers.size(); ++threadNum) {\n      auto& sbr = *sendBuffers.getRemote(threadNum);\n      for (unsigned h = 0; h < this->base_DistGraph::numHosts; ++h) {\n        if (h == this->base_DistGraph::id)\n          continue;\n        auto& sendBuffer = sbr[h];\n        if (sendBuffer.size() > 0) {\n          messagesSent += 1;\n          bytesSent.update(sendBuffer.size());\n          maxBytesSent.update(sendBuffer.size());\n\n          net.sendTagged(h, galois::runtime::evilPhase, sendBuffer);\n          sendBuffer.getVec().clear();\n        }\n      }\n    }\n\n    net.flush();\n\n    galois::runtime::reportStat_Tsum(\n        GRNAME, std::string(\"EdgeLoadingMessagesSent\"), messagesSent.reduce());\n    galois::runtime::reportStat_Tsum(\n        GRNAME, std::string(\"EdgeLoadingBytesSent\"), bytesSent.reduce());\n    galois::runtime::reportStat_Tmax(\n        GRNAME, std::string(\"EdgeLoadingMaxBytesSent\"), maxBytesSent.reduce());\n  }\n\n  //! 
@copydoc DistGraphHybridCut::processReceivedEdgeBuffer\n  template <typename GraphTy>\n  void processReceivedEdgeBuffer(\n      std::optional<std::pair<uint32_t, galois::runtime::RecvBuffer>>& buffer,\n      GraphTy& graph, std::atomic<uint32_t>& receivedNodes) {\n    if (buffer) {\n      auto& rb = buffer->second;\n      while (rb.r_size() > 0) {\n        uint64_t n;\n        std::vector<uint64_t> gdst_vec;\n        galois::runtime::gDeserialize(rb, n);\n        galois::runtime::gDeserialize(rb, gdst_vec);\n        assert(base_DistGraph::isLocal(n));\n        uint32_t lsrc = this->G2L(n);\n        uint64_t cur = *graph.edge_begin(lsrc, galois::MethodFlag::UNPROTECTED);\n        uint64_t cur_end = *graph.edge_end(lsrc);\n        assert((cur_end - cur) == gdst_vec.size());\n        deserializeEdges(graph, rb, gdst_vec, cur, cur_end);\n        ++receivedNodes;\n      }\n    }\n  }\n\n  /**\n   * Receive the edge dest/data assigned to this host from other hosts\n   * that were responsible for reading them.\n   */\n  template <typename GraphTy>\n  void receiveEdges(GraphTy& graph, std::atomic<uint32_t>& receivedNodes) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    // receive edges for all mirror nodes\n    while (receivedNodes < nodesToReceive) {\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      processReceivedEdgeBuffer(p, graph, receivedNodes);\n    }\n  }\n\n  template <typename GraphTy,\n            typename std::enable_if<!std::is_void<\n                typename GraphTy::edge_data_type>::value>::type* = nullptr>\n  void deserializeEdges(GraphTy& graph, galois::runtime::RecvBuffer& b,\n                        std::vector<uint64_t>& gdst_vec, uint64_t& cur,\n                        uint64_t& cur_end) {\n    std::vector<typename GraphTy::edge_data_type> gdata_vec;\n    galois::runtime::gDeserialize(b, gdata_vec);\n    uint64_t i = 0;\n    while 
(cur < cur_end) {\n      auto gdata    = gdata_vec[i];\n      uint64_t gdst = gdst_vec[i++];\n      uint32_t ldst = this->G2L(gdst);\n      graph.constructEdge(cur++, ldst, gdata);\n      // TODO if ldst is an outgoing mirror, this is vertex cut\n    }\n  }\n\n  template <typename GraphTy,\n            typename std::enable_if<std::is_void<\n                typename GraphTy::edge_data_type>::value>::type* = nullptr>\n  void deserializeEdges(GraphTy& graph, galois::runtime::RecvBuffer&,\n                        std::vector<uint64_t>& gdst_vec, uint64_t& cur,\n                        uint64_t& cur_end) {\n    uint64_t i = 0;\n    while (cur < cur_end) {\n      uint64_t gdst = gdst_vec[i++];\n      uint32_t ldst = this->G2L(gdst);\n      graph.constructEdge(cur++, ldst);\n      // TODO if ldst is an outgoing mirror, this is vertex cut\n    }\n  }\n};\n\n// make GRNAME visible to public\ntemplate <typename NodeTy, typename EdgeTy, typename Partitioner>\nconstexpr const char* const\n    galois::graphs::NewDistGraphGeneric<NodeTy, EdgeTy, Partitioner>::GRNAME;\n\n} // end namespace graphs\n} // end namespace galois\n#endif\n"
  },
  {
    "path": "libdist/CMakeLists.txt",
    "content": "add_library(galois_dist_async STATIC)\nadd_library(Galois::dist_async ALIAS galois_dist_async)\nadd_dependencies(lib galois_dist_async)\nset_target_properties(galois_dist_async PROPERTIES EXPORT_NAME dist_async)\n\ntarget_sources(galois_dist_async PRIVATE\n        src/Barrier.cpp\n        src/DistGalois.cpp\n        src/DistStats.cpp\n        src/Network.cpp\n        src/NetworkBuffered.cpp\n        src/NetworkIOMPI.cpp\n        src/NetworkLCI.cpp\n)\n\ntarget_include_directories(galois_dist_async PUBLIC\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  $<INSTALL_INTERFACE:include>\n)\n\ntarget_link_libraries(galois_dist_async PUBLIC MPI::MPI_CXX)\ntarget_link_libraries(galois_dist_async PUBLIC galois_shmem)\n\ntarget_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1)\n\nif (GALOIS_USE_BARE_MPI)\n  target_compile_definitions(galois_dist_async PRIVATE GALOIS_USE_BARE_MPI=1)\nendif()\n\nif (GALOIS_USE_LCI)\n  add_definitions(-DGALOIS_USE_LCI)\n  set(LCI_ROOT \"${CMAKE_BINARY_DIR}/libdist/external/src/lci\")\n  set(LCI_INCLUDE \"${LCI_ROOT}/include\")\n  set(LCI_LIBRARY \"${LCI_ROOT}/liblci.a\")\n\n  include(ExternalProject)\n  # do not clone submodules for external projects\n  cmake_policy(SET CMP0097 NEW)\n\n  ExternalProject_Add(lci\n    PREFIX external\n    BUILD_IN_SOURCE 1\n    CONFIGURE_COMMAND \"\"\n    INSTALL_COMMAND \"\"\n    LOG_OUTPUT_ON_FAILURE 1\n    GIT_REPOSITORY \"https://github.com/uiuc-hpc/LC.git\"\n    GIT_SUBMODULES \"\"\n    GIT_TAG \"9bf912829339879e1132614c6d24cd032c32366b\")\n\n  add_dependencies(galois_dist_async lci)\n  target_link_libraries(galois_dist_async PRIVATE ${LCI_LIBRARY} -lpsm2)\n  target_include_directories(galois_dist_async PUBLIC \n    $<BUILD_INTERFACE:${LCI_INCLUDE}>\n    $<INSTALL_INTERFACE:include>\n  )\nendif(GALOIS_USE_LCI)\n\ninstall(\n  DIRECTORY include/\n  DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n  COMPONENT dev\n  FILES_MATCHING PATTERN \"*.h\"\n)\n\ninstall(TARGETS 
galois_dist_async\n  EXPORT GaloisTargets\n  LIBRARY\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT shlib\n  ARCHIVE\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT lib\n  INCLUDES DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n)\n"
  },
  {
    "path": "libdist/include/galois/DReducible.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file DReducible.h\n *\n * Implements distributed reducible objects for easy reduction of values\n * across a distributed system.\n */\n#ifndef GALOIS_DISTACCUMULATOR_H\n#define GALOIS_DISTACCUMULATOR_H\n\n#include <limits>\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/runtime/LWCI.h\"\n#include \"galois/runtime/DistStats.h\"\n\nnamespace galois {\n\n/**\n * Distributed sum-reducer for getting the sum of some value across multiple\n * hosts.\n *\n * @tparam Ty type of value to sum-reduce\n */\ntemplate <typename Ty>\nclass DGAccumulator {\n  galois::runtime::NetworkInterface& net =\n      galois::runtime::getSystemNetworkInterface();\n\n  galois::GAccumulator<Ty> mdata;\n  Ty local_mdata, global_mdata;\n\n#ifdef GALOIS_USE_LCI\n  /**\n   
* Sum reduction using LWCI\n   */\n  inline void reduce_lwci() {\n    lc_alreduce(&local_mdata, &global_mdata, sizeof(Ty),\n                &galois::runtime::internal::ompi_op_sum<Ty>, lc_col_ep);\n  }\n#else\n  /**\n   * Sum reduction using MPI\n   */\n  inline void reduce_mpi() {\n    if (typeid(Ty) == typeid(int32_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_INT, MPI_SUM,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(int64_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG, MPI_SUM,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(uint32_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED, MPI_SUM,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(uint64_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED_LONG, MPI_SUM,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(float)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_FLOAT, MPI_SUM,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(double)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_DOUBLE, MPI_SUM,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(long double)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG_DOUBLE, MPI_SUM,\n                    MPI_COMM_WORLD);\n    } else {\n      static_assert(true,\n                    \"Type of DGAccumulator not supported for MPI reduction\");\n    }\n  }\n#endif\n\npublic:\n  //! 
Default constructor\n  DGAccumulator() {}\n\n  /**\n   * Adds to accumulated value\n   *\n   * @param rhs Value to add\n   * @returns reference to this object\n   */\n  DGAccumulator& operator+=(const Ty& rhs) {\n    mdata += rhs;\n    return *this;\n  }\n\n  /**\n   * Sets current value stored in accumulator.\n   *\n   * @param rhs Value to set\n   */\n  void operator=(const Ty rhs) {\n    mdata.reset();\n    mdata += rhs;\n  }\n\n  /**\n   * Sets current value stored in accumulator.\n   *\n   * @param rhs Value to set\n   */\n  void set(const Ty rhs) {\n    mdata.reset();\n    mdata += rhs;\n  }\n\n  /**\n   * Read local accumulated value.\n   *\n   * @returns locally accumulated value\n   */\n  Ty read_local() {\n    if (local_mdata == 0)\n      local_mdata = mdata.reduce();\n    return local_mdata;\n  }\n\n  /**\n   * Read the value returned by the last reduce call.\n   * Should call reduce before calling this function if an up to date\n   * value is required\n   *\n   * @returns the value of the last reduce call\n   */\n  Ty read() { return global_mdata; }\n\n  /**\n   * Reset the entire accumulator.\n   *\n   * @returns the value of the last reduce call\n   */\n  Ty reset() {\n    Ty retval = global_mdata;\n    mdata.reset();\n    local_mdata = global_mdata = 0;\n    return retval;\n  }\n\n  /**\n   * Reduce data across all hosts, saves the value, and returns the\n   * reduced value\n   *\n   * @param runID optional argument used to create a statistics timer\n   * for later reporting\n   *\n   * @returns The reduced value\n   */\n  Ty reduce(std::string runID = std::string()) {\n    std::string timer_str(\"ReduceDGAccum_\" + runID);\n\n    galois::CondStatTimer<GALOIS_COMM_STATS> reduceTimer(timer_str.c_str(),\n                                                         \"DGReducible\");\n    reduceTimer.start();\n\n    if (local_mdata == 0)\n      local_mdata = mdata.reduce();\n\n#ifdef GALOIS_USE_LCI\n    reduce_lwci();\n#else\n    reduce_mpi();\n#endif\n\n    
reduceTimer.stop();\n\n    return global_mdata;\n  }\n};\n\n////////////////////////////////////////////////////////////////////////////////\n\n/**\n * Distributed max-reducer for getting the max of some value across multiple\n * hosts.\n *\n * @tparam Ty type of value to max-reduce\n */\ntemplate <typename Ty>\nclass DGReduceMax {\n  galois::runtime::NetworkInterface& net =\n      galois::runtime::getSystemNetworkInterface();\n\n  galois::GReduceMax<Ty> mdata; // local max reducer\n  Ty local_mdata, global_mdata;\n\n#ifdef GALOIS_USE_LCI\n  /**\n   * Use LWCI to reduce max across hosts\n   */\n  inline void reduce_lwci() {\n    lc_alreduce(&local_mdata, &global_mdata, sizeof(Ty),\n                &galois::runtime::internal::ompi_op_max<Ty>, lc_col_ep);\n  }\n#else\n  /**\n   * Use MPI to reduce max across hosts\n   */\n  inline void reduce_mpi() {\n    if (typeid(Ty) == typeid(int32_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_INT, MPI_MAX,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(int64_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG, MPI_MAX,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(uint32_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED, MPI_MAX,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(uint64_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED_LONG, MPI_MAX,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(float)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_FLOAT, MPI_MAX,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(double)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_DOUBLE, MPI_MAX,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(long double)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG_DOUBLE, MPI_MAX,\n                    MPI_COMM_WORLD);\n    } 
else {\n      static_assert(true, \"Type of DGReduceMax not supported for MPI \"\n                          \"reduction\");\n    }\n  }\n#endif\n\npublic:\n  /**\n   * Default constructor; initializes everything to 0.\n   */\n  DGReduceMax() {\n    local_mdata  = 0;\n    global_mdata = 0;\n  }\n\n  /**\n   * Update the local max-reduced value.\n   *\n   * @param rhs Value to max-reduce locally with\n   */\n  void update(const Ty rhs) { mdata.update(rhs); }\n\n  /**\n   * Read the local reduced max value; if it has never been reduced, it will\n   * attempt get the global value through a reduce (i.e. all other hosts\n   * should call reduce as well).\n   *\n   * @returns the local value stored in the accumulator or a global value if\n   * reduce has never been called\n   */\n  Ty read_local() {\n    if (local_mdata == 0)\n      local_mdata = mdata.reduce();\n    return local_mdata;\n  }\n\n  /**\n   * Read the global reduced max value. For accurate results, you should\n   * call reduce before calling this.\n   *\n   * @returns the global value stored in the accumulator\n   */\n  Ty read() { return global_mdata; }\n\n  /**\n   * Reset this accumulator.\n   *\n   * @returns the previous global value stored in this accumulator (note if\n   * never reduced, it will be 0\n   */\n  Ty reset() {\n    Ty retval = global_mdata;\n    mdata.reset();\n    local_mdata = global_mdata = 0;\n    return retval;\n  }\n\n  /**\n   * Do a max reduction across all hosts by sending data to all other hosts\n   * and reducing received data.\n   *\n   * @returns the max-reduced value after reducing from all hosts.\n   */\n  Ty reduce(std::string runID = std::string()) {\n    std::string timer_str(\"ReduceDGReduceMax_\" + runID);\n\n    galois::CondStatTimer<GALOIS_COMM_STATS> reduceTimer(timer_str.c_str(),\n                                                         \"DGReduceMax\");\n\n    reduceTimer.start();\n    if (local_mdata == 0)\n      local_mdata = mdata.reduce();\n\n#ifdef 
GALOIS_USE_LCI\n    reduce_lwci();\n#else\n    reduce_mpi();\n#endif\n    reduceTimer.stop();\n\n    return global_mdata;\n  }\n};\n\n////////////////////////////////////////////////////////////////////////////////\n\n/**\n * Distributed min-reducer for getting the min of some value across multiple\n * hosts.\n *\n * @tparam Ty type of value to min-reduce\n */\ntemplate <typename Ty>\nclass DGReduceMin {\n  galois::runtime::NetworkInterface& net =\n      galois::runtime::getSystemNetworkInterface();\n\n  galois::GReduceMin<Ty> mdata; // local min reducer\n  Ty local_mdata, global_mdata;\n\n#ifdef GALOIS_USE_LCI\n  /**\n   * Use LWCI to reduce min across hosts\n   */\n  inline void reduce_lwci() {\n    lc_alreduce(&local_mdata, &global_mdata, sizeof(Ty),\n                &galois::runtime::internal::ompi_op_min<Ty>, lc_col_ep);\n  }\n#else\n  /**\n   * Use MPI to reduce min across hosts\n   */\n  inline void reduce_mpi() {\n    if (typeid(Ty) == typeid(int32_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_INT, MPI_MIN,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(int64_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_LONG, MPI_MIN,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(uint32_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED, MPI_MIN,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(uint64_t)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_UNSIGNED_LONG, MPI_MIN,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(float)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_FLOAT, MPI_MIN,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(double)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 1, MPI_DOUBLE, MPI_MIN,\n                    MPI_COMM_WORLD);\n    } else if (typeid(Ty) == typeid(long double)) {\n      MPI_Allreduce(&local_mdata, &global_mdata, 
1, MPI_LONG_DOUBLE, MPI_MIN,\n                    MPI_COMM_WORLD);\n    } else {\n      static_assert(true, \"Type of DGReduceMin not supported for MPI \"\n                          \"reduction\");\n    }\n  }\n#endif\n\npublic:\n  /**\n   * Default constructor; initializes everything to the max value of the type.\n   */\n  DGReduceMin() {\n    local_mdata  = std::numeric_limits<Ty>::max();\n    global_mdata = std::numeric_limits<Ty>::max();\n    ;\n  }\n\n  /**\n   * Update the local min-reduced value.\n   *\n   * @param rhs Value to min-reduce locally with\n   */\n  void update(const Ty rhs) { mdata.update(rhs); }\n\n  /**\n   * Read the local reduced min value; if it has never been reduced, it will\n   * attempt to get the global value through a reduce (i.e. all other hosts\n   * should call reduce as well).\n   *\n   * @returns the local value stored in the accumulator or a global value if\n   * reduce has never been called\n   */\n  Ty read_local() {\n    if (local_mdata == std::numeric_limits<Ty>::max())\n      local_mdata = mdata.reduce();\n    return local_mdata;\n  }\n\n  /**\n   * Read the global reduced min value. 
For accurate results, you should\n   * call reduce before calling this.\n   *\n   * @returns the global value stored in the accumulator\n   */\n  Ty read() { return global_mdata; }\n\n  /**\n   * Reset this accumulator.\n   *\n   * @returns the previous global value stored in this accumulator (note: if\n   * never reduced, it will be the max value of the type)\n   */\n  Ty reset() {\n    Ty retval = global_mdata;\n    mdata.reset();\n    local_mdata = global_mdata = std::numeric_limits<Ty>::max();\n    return retval;\n  }\n\n  /**\n   * Do a min reduction across all hosts by sending data to all other hosts\n   * and reducing received data.\n   *\n   * @returns the min-reduced value after reducing from all hosts.\n   */\n  Ty reduce(std::string runID = std::string()) {\n    std::string timer_str(\"ReduceDGReduceMin_\" + runID);\n\n    galois::CondStatTimer<GALOIS_COMM_STATS> reduceTimer(timer_str.c_str(),\n                                                         \"DGReduceMin\");\n\n    reduceTimer.start();\n    if (local_mdata == std::numeric_limits<Ty>::max())\n      local_mdata = mdata.reduce();\n\n#ifdef GALOIS_USE_LCI\n    reduce_lwci();\n#else\n    reduce_mpi();\n#endif\n    reduceTimer.stop();\n\n    return global_mdata;\n  }\n};\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libdist/include/galois/DTerminationDetector.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file DTerminationDetector.h\n *\n * Implements a distributed termination detector (DGTerminator) for deciding\n * when work across a distributed system has finished.\n */\n#ifndef GALOIS_DISTTERMINATOR_H\n#define GALOIS_DISTTERMINATOR_H\n\n#include <limits>\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/runtime/LWCI.h\"\n#include \"galois/runtime/DistStats.h\"\n\nnamespace galois {\n\n/**\n * Distributed sum-reducer for getting the sum of some value across multiple\n * hosts.\n *\n * @tparam Ty type of value to sum-reduce\n */\ntemplate <typename Ty>\nclass DGTerminator {\n  galois::runtime::NetworkInterface& net =\n      galois::runtime::getSystemNetworkInterface();\n\n  galois::GAccumulator<Ty> mdata;\n  Ty local_mdata, global_mdata;\n\n  uint64_t prev_snapshot;\n  
uint64_t snapshot;\n  uint64_t global_snapshot;\n  bool work_done;\n#ifndef GALOIS_USE_LCI\n  MPI_Request snapshot_request;\n#else\n  lc_colreq snapshot_request;\n#endif\n\npublic:\n  //! Default constructor\n  DGTerminator() {\n    reinitialize();\n    initiate_snapshot();\n    reset();\n  }\n\n  void reinitialize() {\n    prev_snapshot   = 0;\n    snapshot        = 1;\n    global_snapshot = 1;\n    work_done       = false;\n  }\n\n  /**\n   * Adds to accumulated value\n   *\n   * @param rhs Value to add\n   * @returns reference to this object\n   */\n  DGTerminator& operator+=(const Ty& rhs) {\n    mdata += rhs;\n    return *this;\n  }\n\n  /**\n   * Sets current value stored in accumulator.\n   *\n   * @param rhs Value to set\n   */\n  void operator=(const Ty rhs) {\n    mdata.reset();\n    mdata += rhs;\n  }\n\n  /**\n   * Sets current value stored in accumulator.\n   *\n   * @param rhs Value to set\n   */\n  void set(const Ty rhs) {\n    mdata.reset();\n    mdata += rhs;\n  }\n\n  /**\n   * Read local accumulated value.\n   *\n   * @returns locally accumulated value\n   */\n  Ty read_local() {\n    if (local_mdata == 0)\n      local_mdata = mdata.reduce();\n    return local_mdata;\n  }\n\n  /**\n   * Read the value returned by the last reduce call.\n   * Should call reduce before calling this function if an up to date\n   * value is required\n   *\n   * @returns the value of the last reduce call\n   */\n  Ty read() { return global_mdata; }\n\n  /**\n   * Reset the entire accumulator.\n   *\n   * @returns the value of the last reduce call\n   */\n  Ty reset() {\n    Ty retval = global_mdata;\n    mdata.reset();\n    local_mdata = global_mdata = 0;\n    return retval;\n  }\n\n  void initiate_snapshot() {\n#ifdef GALOIS_USE_LCI\n    lc_ialreduce(&snapshot, &global_snapshot, sizeof(Ty),\n                 &galois::runtime::internal::ompi_op_max<Ty>, lc_col_ep,\n                 &snapshot_request);\n#else\n    MPI_Iallreduce(&snapshot, &global_snapshot, 1, 
MPI_UNSIGNED_LONG, MPI_MAX,\n                   MPI_COMM_WORLD, &snapshot_request);\n#endif\n  }\n\n  bool terminate() {\n    bool active = (local_mdata != 0);\n    if (!active) {\n      active = net.anyPendingSends();\n    }\n    int snapshot_ended = 0;\n    if (!active) {\n#ifndef GALOIS_USE_LCI\n      MPI_Test(&snapshot_request, &snapshot_ended, MPI_STATUS_IGNORE);\n#else\n      lc_col_progress(&snapshot_request);\n      snapshot_ended = snapshot_request.flag;\n#endif\n    }\n    if (!active) { // check pending receives after checking snapshot\n      active = net.anyPendingReceives();\n      if (active)\n        galois::gDebug(\"[\", net.ID, \"] pending receive\");\n    }\n    if (active) {\n      work_done = true;\n    } else {\n      if (snapshot_ended != 0) {\n        snapshot = global_snapshot;\n        if (work_done) {\n          work_done     = false;\n          prev_snapshot = snapshot;\n          ++snapshot;\n          galois::gDebug(\"[\", net.ID, \"] work done, taking snapshot \",\n                         snapshot);\n          initiate_snapshot();\n        } else if (prev_snapshot != snapshot) {\n          prev_snapshot = snapshot;\n          galois::gDebug(\"[\", net.ID, \"] no work done, taking snapshot \",\n                         snapshot);\n          initiate_snapshot();\n        } else {\n          galois::gDebug(\"[\", net.ID, \"] terminating \", snapshot);\n          // an explicit barrier may be required here\n          // so that the next async phase begins on all hosts at the same time\n          // however, this may add overheads when it is not required\n          // (depending on when the next async phase actually begins), so\n          // ASSUME: caller will call getHostBarrier().wait() if required\n          reinitialize(); // for next async phase\n          return true;\n        }\n      }\n    }\n    return false;\n  }\n\n  /**\n   * Reduce data across all hosts, saves the value, and returns the\n   * reduced value\n   *\n   * @param 
runID optional argument used to create a statistics timer\n   * for later reporting\n   *\n   * @returns The reduced value\n   */\n  Ty reduce(std::string runID = std::string()) {\n    std::string timer_str(\"ReduceDGAccum_\" + runID);\n\n    galois::CondStatTimer<GALOIS_COMM_STATS> reduceTimer(timer_str.c_str(),\n                                                         \"DGReducible\");\n    reduceTimer.start();\n\n    if (local_mdata == 0)\n      local_mdata = mdata.reduce();\n\n    bool halt    = terminate();\n    global_mdata = !halt;\n    if (halt) {\n      galois::runtime::evilPhase += 2; // one for reduce and one for broadcast\n      if (galois::runtime::evilPhase >=\n          static_cast<uint32_t>(\n              std::numeric_limits<int16_t>::max())) { // limit defined by MPI or\n                                                      // LCI\n        galois::runtime::evilPhase = 1;\n      }\n    }\n\n    reduceTimer.stop();\n\n    return global_mdata;\n  }\n};\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libdist/include/galois/DistGalois.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file DistGalois.h\n *\n * Contains the declaration of DistMemSys, a way to explicitly initiate the\n * Galois runtime.\n */\n#ifndef GALOIS_DIST_GALOIS_H\n#define GALOIS_DIST_GALOIS_H\n\n#include \"galois/runtime/SharedMem.h\"\n#include \"galois/runtime/DistStats.h\"\n\n#include <string>\n#include <utility>\n#include <tuple>\n\nnamespace galois {\n/**\n * Explicit class to initialize the Galois Runtime.\n * The runtime is destroyed when this object is destroyed\n */\nclass DistMemSys : public runtime::SharedMem<runtime::DistStatManager> {\npublic:\n  explicit DistMemSys();\n\n  ~DistMemSys();\n\n  DistMemSys(const DistMemSys&) = delete;\n  DistMemSys& operator=(const DistMemSys&) = delete;\n\n  DistMemSys(DistMemSys&&) = delete;\n  DistMemSys& operator=(DistMemSys&&) = delete;\n};\n\n} // namespace 
galois\n#endif\n"
  },
  {
    "path": "libdist/include/galois/runtime/BareMPI.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n */\n\n/**\n * @file BareMPI.h\n *\n * Contains the BareMPI enum and the command line option that controls bare\n * MPI usage.\n */\n#pragma once\n#ifdef GALOIS_USE_BARE_MPI\n#include \"mpi.h\"\n\n//! Defines types of bare MPI to use\nenum BareMPI {\n  noBareMPI,          //!< do not use bare MPI; use our network layer\n  nonBlockingBareMPI, //!< non blocking bare MPI\n  oneSidedBareMPI     //!< one sided bare MPI\n};\n#endif\n"
  },
  {
    "path": "libdist/include/galois/runtime/DistStats.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file DistStats.h\n *\n * Contains declaration of DistStatManager, which reports runtime statistics of\n * a distributed application in Galois.\n */\n\n#ifndef GALOIS_RUNTIME_DIST_STATS_H\n#define GALOIS_RUNTIME_DIST_STATS_H\n\n//! Turn on if you want more distributed stats to be printed\n#ifndef MORE_DIST_STATS\n#define MORE_DIST_STATS 0\n#endif\n//! Turn on if you want more communication statistics to be printed\n#ifndef GALOIS_COMM_STATS\n#define GALOIS_COMM_STATS 0\n#endif\n//! Turn on if you want per-bulk-synchronous parallel timers to be printed\n//! 
(otherwise all rounds are under 1 timer)\n#ifndef GALOIS_PER_ROUND_STATS\n#define GALOIS_PER_ROUND_STATS 0\n#endif\n\n#include \"galois/runtime/Statistics.h\"\n#include \"galois/runtime/Network.h\"\n\n#include <string>\n\nnamespace galois {\nnamespace runtime {\n\n/**\n * Helper class for the DistStatManager that aids in receiving statistics\n */\nclass StatRecvHelper;\n\n/**\n * Class responsible for tracking all statistics of a running distributed\n * Galois program and reporting them at the end of program execution.\n */\nclass DistStatManager : public galois::runtime::StatManager {\n  //! Friend class that helps with receiving stats\n  friend class galois::runtime::StatRecvHelper;\n  using Base = galois::runtime::StatManager;\n  using Str  = galois::gstl::Str;\n  using Base::SEP;\n\n  static constexpr const char* const HSTAT_SEP     = Base::TSTAT_SEP;\n  static constexpr const char* const HSTAT_NAME    = \"HostValues\";\n  static constexpr const char* const HSTAT_ENV_VAR = \"PRINT_PER_HOST_STATS\";\n\n  static bool printingHostVals(void);\n\n  template <typename _UNUSED = void>\n  struct HostTotalTypesImpl {\n    struct DummyStat {\n      StatTotal::Type m_totalTy;\n\n      explicit DummyStat(StatTotal::Type total) : m_totalTy(total) {}\n\n      template <typename _U>\n      void add(const _U&) const {}\n\n      const StatTotal::Type& totalTy(void) const { return m_totalTy; }\n    };\n\n    using TMap = internal::BasicStatMap<DummyStat>;\n\n    bool merged = false;\n    substrate::PerThreadStorage<TMap> perThrdMap;\n\n    void addToStat(const Str& region, const Str& category,\n                   const StatTotal::Type& hTotalTy) {\n      perThrdMap.getLocal()->addToStat(region, category, 0, hTotalTy);\n    }\n\n    void mergeStats(void) {\n      if (merged) {\n        return;\n      }\n      GALOIS_ASSERT(perThrdMap.getLocal() == perThrdMap.getRemote(0),\n                    \"Must call from Thread 0\");\n\n      auto* t0Map = perThrdMap.getRemote(0);\n\n      
for (unsigned t = 1; t < perThrdMap.size(); ++t) {\n        const auto* manager = perThrdMap.getRemote(t);\n\n        for (auto i = manager->cbegin(), end_i = manager->cend(); i != end_i;\n             ++i) {\n          t0Map->addToStat(manager->region(i), manager->category(i), 0,\n                           manager->stat(i).totalTy());\n        }\n      }\n\n      merged = true;\n    }\n\n    const TMap& mergedMap(void) const {\n      assert(merged && \"Must merge first\");\n      return *perThrdMap.getRemote(0);\n    }\n  };\n\n  using HostTotalTypes = HostTotalTypesImpl<>;\n\n  template <typename T>\n  using ThrdVals = galois::gstl::Vector<T>;\n\n  template <typename T>\n  using HostStatVal =\n      std::tuple<unsigned, T, StatTotal::Type, const ThrdVals<T>&>;\n\n  template <typename T>\n  struct HostStat : public internal::VecStat<T> {\n    using Base             = internal::VecStat<T>;\n    using ThrdStats        = internal::VecStat<T>;\n    using PerHostThrdStats = galois::gstl::Map<unsigned, ThrdStats>;\n\n    PerHostThrdStats perHostThrdStats;\n\n    explicit HostStat(const StatTotal::Type& hTotalTy) : Base(hTotalTy) {}\n\n    void add(const HostStatVal<T>& val) {\n      const auto& hostID      = std::get<0>(val);\n      const auto& thrdTotal   = std::get<1>(val);\n      const auto& thrdTotalTy = std::get<2>(val);\n      const auto& thrdVals    = std::get<3>(val);\n\n      Base::add(thrdTotal);\n\n      auto p      = perHostThrdStats.emplace(hostID, ThrdStats(thrdTotalTy));\n      auto& tstat = p.first->second;\n\n      for (const auto& i : thrdVals) {\n        tstat.add(i);\n      }\n    }\n\n    void printHostVals(std::ostream& out, const Str& region,\n                       const Str& category) const {\n      out << StatManager::statKind<T>() << SEP << galois::runtime::getHostID()\n          << SEP;\n      out << region << SEP << category << SEP;\n      out << HSTAT_NAME << SEP;\n\n      const char* sep = \"\";\n\n      for (const auto& v : 
Base::values()) {\n        out << sep << v;\n        sep = HSTAT_SEP;\n      }\n\n      out << std::endl;\n    }\n\n    void printThreadVals(std::ostream& out, const Str& region,\n                         const Str& category) const {\n      for (const auto& p : perHostThrdStats) {\n        out << StatManager::statKind<T>() << SEP << p.first << SEP;\n        out << region << SEP << category << SEP;\n        out << StatTotal::str(p.second.totalTy()) << SEP << p.second.total();\n        out << std::endl;\n\n        out << StatManager::statKind<T>() << SEP << p.first << SEP;\n        out << region << SEP << category << SEP;\n        out << StatManager::TSTAT_NAME << SEP;\n\n        const char* sep = \"\";\n        for (const auto& v : p.second.values()) {\n          out << sep << v;\n          sep = StatManager::TSTAT_SEP;\n        }\n\n        out << std::endl;\n      }\n    }\n  };\n\n  template <typename T>\n  struct DistStatCombiner : public internal::BasicStatMap<HostStat<T>> {\n    using Base = internal::BasicStatMap<HostStat<T>>;\n\n#if __GNUC__ < 5\n    static const char* htotalName(const StatTotal::Type& type){\n#else\n    static constexpr const char* htotalName(const StatTotal::Type& type) {\n#endif\n        switch (type) {\n          case StatTotal::SINGLE : return \"HOST_0\";\n  case StatTotal::TSUM:\n    return \"HSUM\";\n  case StatTotal::TAVG:\n    return \"HAVG\";\n  case StatTotal::TMIN:\n    return \"HMIN\";\n  case StatTotal::TMAX:\n    return \"HMAX\";\n  default:\n    std::abort();\n    return nullptr;\n  }\n}\n\n    void print(std::ostream& out) const {\n  for (auto i = Base::cbegin(), end_i = Base::cend(); i != end_i; ++i) {\n    out << StatManager::statKind<T>() << SEP << galois::runtime::getHostID()\n        << SEP;\n    out << Base::region(i) << SEP << Base::category(i) << SEP;\n\n    const HostStat<T>& hs = Base::stat(i);\n\n    out << htotalName(hs.totalTy()) << SEP << hs.total();\n    out << std::endl;\n\n    if 
(DistStatManager::printingHostVals()) {\n      hs.printHostVals(out, Base::region(i), Base::category(i));\n    }\n\n    if (StatManager::printingThreadVals()) {\n      hs.printThreadVals(out, Base::region(i), Base::category(i));\n    }\n  }\n}\n}; // namespace runtime\n\nDistStatCombiner<int64_t> intDistStats;\nDistStatCombiner<double> fpDistStats;\nDistStatCombiner<Str> strDistStats;\nHostTotalTypes hostTotalTypes;\n\nprotected:\n/**\n * Merge all stats from each individual thread as well as each individual\n * host as prescribed by the reduction (Total) type specified for each\n * statistic.\n */\nvoid mergeStats(void);\n\n/**\n * Print the header of the stats file output.\n *\n * @param out File to print header out to\n */\nvoid printHeader(std::ostream& out) const;\n\n/**\n * Merge all stats. Host 0 will then print out all collected stats.\n */\nvirtual void printStats(std::ostream& out);\n\npublic:\n//! Dist stat manager constructor\nDistStatManager(const std::string& outfile = \"\");\n~DistStatManager();\n\n/**\n * Adds a statistic to the statistics manager.\n *\n * @param region Region name to give statistic\n * @param category Category of statistic\n * @param val Value of the statistic\n * @param thrdTotalTy The type of reduction used to combine thread statistics\n * of the same kind\n * @param hTotalTy The type of reduction used to combine host statistics\n * of the same kind\n */\ntemplate <typename T>\nvoid addToStat(const Str& region, const Str& category, const T& val,\n               const StatTotal::Type& thrdTotalTy,\n               const StatTotal::Type& hTotalTy) {\n  Base::addToStat(region, category, val, thrdTotalTy);\n  hostTotalTypes.addToStat(region, category, hTotalTy);\n}\n\nprivate:\nvoid combineAtHost_0_helper(void);\nvoid combineAtHost_0_helper2(void);\nvoid receiveAtHost_0_helper(void);\nvoid receiveAtHost_0_helper2(void);\nvoid combineAtHost_0(void);\nStatTotal::Type findHostTotalTy(const Str& region, const Str& category,\n             
                   const StatTotal::Type& thrdTotalTy) const;\nvoid addRecvdHostTotalTy(const Str& region, const Str& category,\n                         const StatTotal::Type& totalTy);\nvoid addRecvdStat(unsigned hostID, const Str& region, const Str& category,\n                  int64_t thrdTotal, const StatTotal::Type& thrdTotalTy,\n                  const ThrdVals<int64_t>& thrdVals);\nvoid addRecvdStat(unsigned hostID, const Str& region, const Str& category,\n                  double thrdTotal, const StatTotal::Type& thrdTotalTy,\n                  const ThrdVals<double>& thrdVals);\nvoid addRecvdParam(unsigned hostID, const Str& region, const Str& category,\n                   const Str& thrdTotal, const StatTotal::Type& thrdTotalTy,\n                   const ThrdVals<Str>& thrdVals);\n}; // namespace galois\n\nnamespace internal {\n/**\n * Gets a pointer to the distributed stat manager.\n *\n * @returns Pointer to distributed statistics manager\n */\nDistStatManager* distSysStatManager(void);\n} // namespace internal\n\n/**\n * Adds a statistic to the statistics manager. 
Calls addToStat in\n * DistStatManager.\n *\n * @param region Region name to give statistic\n * @param category Category of statistic\n * @param value Value of the statistic\n * @param thrdTotalTy The type of reduction used to combine thread statistics\n * of the same kind\n * @param hTotalTy The type of reduction used to combine host statistics\n * of the same kind\n */\ntemplate <typename S1, typename S2, typename T>\ninline void reportDistStat(const S1& region, const S2& category, const T& value,\n                           const StatTotal::Type& thrdTotalTy,\n                           const StatTotal::Type& hTotalTy) {\n  internal::distSysStatManager()->addToStat(gstl::makeStr(region),\n                                            gstl::makeStr(category), value,\n                                            thrdTotalTy, hTotalTy);\n}\n\n} // end namespace runtime\n} // end namespace galois\n\n#endif // GALOIS_RUNTIME_DIST_STATS_H\n"
  },
  {
    "path": "libdist/include/galois/runtime/LWCI.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file LWCI.h\n *\n * LWCI header that includes lc.h (LCI library) and internal helper functions\n * on arrays.\n */\n\n#pragma once\n#ifdef GALOIS_USE_LCI\nGALOIS_IGNORE_UNUSED_PARAMETERS\n#include \"lc.h\"\nGALOIS_END_IGNORE_UNUSED_PARAMETERS\n\nextern lc_ep lc_col_ep;\nextern lc_ep lc_p2p_ep[3];\n\nnamespace galois {\nnamespace runtime {\nnamespace internal {\n\n/**\n * Element-wise sum of 2 arrays.\n *\n * @tparam Ty type of elements contained in the arrays\n *\n * @param dst destination array to write to\n * @param src source array to read from\n * @param count Size of array in bytes\n */\ntemplate <typename Ty>\nvoid ompi_op_sum(void* dst, void* src, size_t count) {\n  Ty* dst_ty = (Ty*)dst;\n  Ty* src_ty = (Ty*)src;\n  for (size_t i = 0; i < (count / sizeof(Ty)); ++i) {\n    dst_ty[i] += src_ty[i];\n  
}\n}\n\n/**\n * Element-wise max of 2 arrays.\n *\n * @tparam Ty type of elements contained in the arrays\n *\n * @param dst destination array to write to\n * @param src source array to read from\n * @param count Size of array in bytes\n */\ntemplate <typename Ty>\nvoid ompi_op_max(void* dst, void* src, size_t count) {\n  Ty* dst_ty = (Ty*)dst;\n  Ty* src_ty = (Ty*)src;\n  for (size_t i = 0; i < (count / sizeof(Ty)); ++i) {\n    if (dst_ty[i] < src_ty[i]) {\n      dst_ty[i] = src_ty[i];\n    }\n  }\n}\n\n/**\n * Element-wise min of 2 arrays.\n *\n * @tparam Ty type of elements contained in the arrays\n *\n * @param dst destination array to write to\n * @param src source array to read from\n * @param count Size of array in bytes\n */\ntemplate <typename Ty>\nvoid ompi_op_min(void* dst, void* src, size_t count) {\n  Ty* dst_ty = (Ty*)dst;\n  Ty* src_ty = (Ty*)src;\n  for (size_t i = 0; i < (count / sizeof(Ty)); ++i) {\n    if (dst_ty[i] > src_ty[i]) {\n      dst_ty[i] = src_ty[i];\n    }\n  }\n}\n\n} // namespace internal\n} // namespace runtime\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libdist/include/galois/runtime/MemUsage.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n */\n\n/**\n * @file MemUsage.h\n *\n * Contains MemUsageTracker, a class that tracks memory usage throughout\n * runtime of a program of send/receive buffers.\n */\n\n#pragma once\n#include <atomic>\nnamespace galois {\nnamespace runtime {\n\n/**\n * Class that tracks memory usage (mainly of send and receive buffers).\n */\nclass MemUsageTracker {\n  std::atomic<int64_t>\n      currentMemUsage; //!< mem usage of send and receive buffers\n  int64_t maxMemUsage; //!< max mem usage of send and receive buffers\n\npublic:\n  //! 
Default constructor initializes everything to 0.\n  MemUsageTracker() : currentMemUsage(0), maxMemUsage(0) {}\n\n  /**\n   * Increment memory usage.\n   *\n   * @param size amount to increment mem usage by\n   */\n  inline void incrementMemUsage(uint64_t size) {\n    currentMemUsage += size;\n    if (currentMemUsage > maxMemUsage)\n      maxMemUsage = currentMemUsage;\n  }\n\n  /**\n   * Decrement memory usage.\n   *\n   * @param size amount to decrement mem usage by\n   */\n  inline void decrementMemUsage(uint64_t size) { currentMemUsage -= size; }\n\n  /**\n   * Reset mem usage and max mem usage to 0.\n   */\n  inline void resetMemUsage() {\n    currentMemUsage = 0;\n    maxMemUsage     = 0;\n  }\n\n  /**\n   * Get max mem usage.\n   *\n   * @returns maximum memory usage tracked so far\n   */\n  inline int64_t getMaxMemUsage() const { return maxMemUsage; }\n};\n\n} // namespace runtime\n} // namespace galois\n"
  },
  {
    "path": "libdist/include/galois/runtime/Network.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file Network.h\n *\n * Contains the network interface class which is the base class for all\n * network layer implementations.\n */\n\n#ifndef GALOIS_RUNTIME_NETWORK_H\n#define GALOIS_RUNTIME_NETWORK_H\n\n#include \"galois/runtime/Serialize.h\"\n#include \"galois/runtime/MemUsage.h\"\n#include \"galois/substrate/Barrier.h\"\n\n#include <mpi.h>\n\n#include <cstdint>\n#include <optional>\n#include <tuple>\n\nnamespace galois::runtime {\n\n//! typedef for buffer that stores data to be sent out\nusing SendBuffer = SerializeBuffer;\n//! typedef for buffer that received data is saved into\nusing RecvBuffer = DeSerializeBuffer;\n\n/**\n * A class that defines functions that a network interface in Galois should\n * have. 
How the sends/recvs/stat-collecting happens as well\n * as the network layer itself is up to the implementation of the class.\n */\nclass NetworkInterface {\nprotected:\n  //! Initialize the MPI system. Should only be called once per process.\n  void initializeMPI();\n\n  //! Finalize the MPI system. Should only be called once per process.\n  void finalizeMPI();\n\n  //! Memory usage tracker\n  MemUsageTracker memUsageTracker;\n\n  //! Number of inflight sends and receives\n  std::atomic<size_t> inflightSends;\n  std::atomic<size_t> inflightRecvs;\n\n#ifdef GALOIS_USE_BARE_MPI\npublic:\n  //! Wrapper that calls into increment mem usage on the memory usage tracker\n  inline void incrementMemUsage(uint64_t size) {\n    memUsageTracker.incrementMemUsage(size);\n  }\n  //! Wrapper that calls into decrement mem usage on the memory usage tracker\n  inline void decrementMemUsage(uint64_t size) {\n    memUsageTracker.decrementMemUsage(size);\n  }\n#endif\n\npublic:\n  //! This machine's host ID\n  static uint32_t ID;\n  //! The total number of machines in the current program\n  static uint32_t Num;\n\n  /**\n   * Constructor for interface.\n   */\n  NetworkInterface();\n\n  /**\n   * Destructor destroys MPI (if it exists).\n   */\n  virtual ~NetworkInterface();\n\n  //! Send a message to a given (dest) host.  A message is simply a\n  //! landing pad (recv, function pointer) and some data (buf)\n  //! on the receiver, recv(buf) will be called during handleReceives()\n  //! buf is invalidated by this operation\n  void sendMsg(uint32_t dest, void (*recv)(uint32_t, RecvBuffer&),\n               SendBuffer& buf);\n\n  //! Send a message letting the network handle the serialization and\n  //! deserialization slightly slower\n  template <typename... Args>\n  void sendSimple(uint32_t dest, void (*recv)(uint32_t, Args...),\n                  Args... param);\n\n  //! Send a message to a given (dest) host.  A message is simply a\n  //! tag (tag) and some data (buf)\n  //! 
on the receiver, buf will be returned on a receiveTagged(tag)\n  //! buf is invalidated by this operation\n  virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf,\n                          int type = 0) = 0;\n\n  //! Send a message to all hosts.  A message is simply a\n  //! landing pad (recv) and some data (buf)\n  //! buf is invalidated by this operation\n  void broadcast(void (*recv)(uint32_t, RecvBuffer&), SendBuffer& buf,\n                 bool self = false);\n\n  //! Broadcast a message allowing the network to handle serialization and\n  //! deserialization\n  template <typename... Args>\n  void broadcastSimple(void (*recv)(uint32_t, Args...), Args... param);\n\n  //! Receive and dispatch messages\n  void handleReceives();\n\n  //! Wrapper to reset the mem usage tracker's stats\n  inline void resetMemUsage() { memUsageTracker.resetMemUsage(); }\n\n  //! Reports the memory usage tracker's statistics to the stat manager\n  void reportMemUsage() const;\n\n  //! Receive a tagged message\n  virtual std::optional<std::pair<uint32_t, RecvBuffer>>\n  recieveTagged(uint32_t tag, std::unique_lock<substrate::SimpleLock>* rlg,\n                int type = 0) = 0;\n\n  //! move send buffers out to network\n  virtual void flush() = 0;\n\n  //! @returns true if any send is in progress or is pending to be enqueued\n  virtual bool anyPendingSends() = 0;\n\n  //! @returns true if any receive is in progress or is pending to be dequeued\n  virtual bool anyPendingReceives() = 0;\n\n  //! Get how many bytes were sent\n  //! @returns num bytes sent\n  virtual unsigned long reportSendBytes() const = 0;\n  //! Get how many messages were sent\n  //! @returns num messages sent\n  virtual unsigned long reportSendMsgs() const = 0;\n  //! Get how many bytes were received\n  //! @returns num bytes received\n  virtual unsigned long reportRecvBytes() const = 0;\n  //! Get how many messages were received\n  //! 
@returns num messages received\n  virtual unsigned long reportRecvMsgs() const = 0;\n  //! Get any other extra statistics that might need to be reported; varies\n  //! depending on implementation\n  //! @returns vector of extra things to be reported\n  virtual std::vector<unsigned long> reportExtra() const = 0;\n  //! Get the names of the extra things that are returned by reportExtra\n  //! @returns vector of the names of the reported extra things\n  virtual std::vector<std::pair<std::string, unsigned long>>\n  reportExtraNamed() const = 0;\n};\n\n//! Variable that keeps track of which network send/recv phase a program is\n//! currently on. Can be seen as a count of send/recv rounds that have occurred.\nextern uint32_t evilPhase;\n\n//! Get the network interface\n//! @returns network interface\nNetworkInterface& getSystemNetworkInterface();\n\nnamespace internal {\n//! Deletes the system network interface (if it exists).\nvoid destroySystemNetworkInterface();\n} // namespace internal\n\n//! Gets this host's ID\n//! @returns ID of this host\nuint32_t getHostID();\n\n//! Returns a BufferedNetwork interface\nNetworkInterface& makeNetworkBuffered();\n\n//! Returns a LCINetwork interface\nNetworkInterface& makeNetworkLCI();\n\n//! Returns a host barrier, which is a regular MPI-Like Barrier for all hosts.\n//! @warning Should not be called within a parallel region; assumes only one\n//! thread is calling it\nsubstrate::Barrier& getHostBarrier();\n//! Returns a fence that ensures all pending messages are delivered, acting\n//! like a memory-barrier\nsubstrate::Barrier& getHostFence();\n\n////////////////////////////////////////////////////////////////////////////////\n// Implementations\n////////////////////////////////////////////////////////////////////////////////\nnamespace { // anon\ntemplate <typename... 
Args>\nstatic void genericLandingPad(uint32_t src, RecvBuffer& buf) {\n  void (*fp)(uint32_t, Args...);\n  std::tuple<Args...> args;\n  gDeserialize(buf, fp, args);\n  std::apply([fp, src](Args... params) { fp(src, params...); }, args);\n}\n\n} // namespace\n\ntemplate <typename... Args>\nvoid NetworkInterface::sendSimple(uint32_t dest,\n                                  void (*recv)(uint32_t, Args...),\n                                  Args... param) {\n  SendBuffer buf;\n  gSerialize(buf, (uintptr_t)recv, param...,\n             (uintptr_t)genericLandingPad<Args...>);\n  sendTagged(dest, 0, buf);\n}\n\ntemplate <typename... Args>\nvoid NetworkInterface::broadcastSimple(void (*recv)(uint32_t, Args...),\n                                       Args... param) {\n  SendBuffer buf;\n  gSerialize(buf, (uintptr_t)recv, param...);\n  broadcast(genericLandingPad<Args...>, buf, false);\n}\n\n} // namespace galois::runtime\n#endif\n"
  },
  {
    "path": "libdist/include/galois/runtime/NetworkIO.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file NetworkIO.h\n *\n * Contains NetworkIO, a base class that is inherited by classes that want to\n * implement the communication layer of Galois. (e.g. NetworkIOMPI and\n * NetworkIOLWCI)\n */\n\n#ifndef GALOIS_RUNTIME_NETWORKTHREAD_H\n#define GALOIS_RUNTIME_NETWORKTHREAD_H\n\n#include <cstdint>\n#include <vector>\n#include <tuple>\n#include <memory>\n#include <cassert>\n#include <cstring>\n#include <deque>\n#include <string>\n#include <fstream>\n#include <unistd.h>\n#include <mpi.h>\n#include \"galois/runtime/MemUsage.h\"\n#include \"galois/PODResizeableArray.h\"\n\nnamespace galois {\nnamespace runtime {\n\n/**\n * Class for the network IO layer which is responsible for doing sends/receives\n * of data. 
Used by the network interface to do the actual communication.\n */\nclass NetworkIO {\nprotected:\n  /**\n   * Wrapper for dealing with MPI error codes. Program dies if the error code\n   * isn't MPI_SUCCESS.\n   *\n   * @param rc Error code to check for success\n   */\n  static void handleError(int rc) {\n    if (rc != MPI_SUCCESS) {\n      MPI_Abort(MPI_COMM_WORLD, rc);\n    }\n  }\n\n  //! memory usage tracker\n  MemUsageTracker& memUsageTracker;\n\n  //! Number of inflight sends and receives\n  std::atomic<size_t>& inflightSends;\n  std::atomic<size_t>& inflightRecvs;\n\n  // using vTy = std::vector<uint8_t>;\n  using vTy = galois::PODResizeableArray<uint8_t>;\n\npublic:\n  /**\n   * Message structure for sending data across the network.\n   */\n  struct message {\n    uint32_t host; //!< destination of this message\n    uint32_t tag;  //!< tag on message indicating distinct communication phases\n    vTy data;      //!< data portion of message\n\n    //! Default constructor initializes host and tag to large numbers.\n    message() : host(~0), tag(~0) {}\n    //! @param h Host to send message to\n    //! @param t Tag to associate with message\n    //! @param d Data to save in message\n    message(uint32_t h, uint32_t t, vTy&& d)\n        : host(h), tag(t), data(std::move(d)) {}\n\n    //! A message is valid if there is data to be sent\n    //! @returns true if data is non-empty\n    bool valid() const { return !data.empty(); }\n  };\n\n  //! The default constructor takes a memory usage tracker and saves it\n  //! @param tracker reference to a memory usage tracker used by the system\n  //! @param sends\n  //! @param recvs\n  NetworkIO(MemUsageTracker& tracker, std::atomic<size_t>& sends,\n            std::atomic<size_t>& recvs)\n      : memUsageTracker(tracker), inflightSends(sends), inflightRecvs(recvs) {}\n\n  //! Default destructor does nothing.\n  virtual ~NetworkIO();\n  //! Queues a message for sending out. 
Takes ownership of data buffer.\n  virtual void enqueue(message m) = 0;\n  //! Checks to see if a message is here for this host to receive. If so, take\n  //! and return it\n  //! @returns an empty message if no message\n  virtual message dequeue() = 0;\n  //! Make progress. Other functions don't have to make progress.\n  virtual void progress() = 0;\n};\n\n/**\n * Creates/returns a network IO layer that uses MPI to do communication.\n *\n * @returns tuple with pointer to the MPI IO layer, this host's ID, and the\n * total number of hosts in the system\n */\nstd::tuple<std::unique_ptr<NetworkIO>, uint32_t, uint32_t>\nmakeNetworkIOMPI(galois::runtime::MemUsageTracker& tracker,\n                 std::atomic<size_t>& sends, std::atomic<size_t>& recvs);\n// #ifdef GALOIS_USE_LCI\n// /**\n//  * Creates/returns a network IO layer that uses LWCI to do communication.\n//  *\n//  * @returns tuple with pointer to the LWCI IO layer, this host's ID, and the\n//  * total number of hosts in the system\n//  */\n// std::tuple<std::unique_ptr<NetworkIO>, uint32_t, uint32_t>\n// makeNetworkIOLWCI(galois::runtime::MemUsageTracker& tracker,\n//                   std::atomic<size_t>& sends, std::atomic<size_t>& recvs);\n// #endif\n\n} // namespace runtime\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libdist/include/galois/runtime/Serialize.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file Serialize.h\n *\n * Contains functions that serialize/deserialize data, mainly for sending\n * out serialized data over the network and deserializing it on the other end.\n */\n\n#ifndef GALOIS_RUNTIME_SERIALIZE_H\n#define GALOIS_RUNTIME_SERIALIZE_H\n\n#include <type_traits>\n#include <ostream>\n#include <vector>\n#include <deque>\n#include <string>\n#include <cassert>\n#include <tuple>\n\n#include <boost/mpl/has_xxx.hpp>\n#include \"galois/runtime/ExtraTraits.h\"\n\n#include <galois/gdeque.h>\n#include <galois/DynamicBitset.h>\n#include <galois/AtomicWrapper.h>\n#include <galois/PODResizeableArray.h>\n#include \"galois/CopyableTuple.h\"\n#include \"galois/Bag.h\"\n\nnamespace galois {\nnamespace runtime {\n\nclass DeSerializeBuffer; // forward declaration for friend declaration\n\n/**\n * Buffer for 
serialization of data. Mainly used during network communication.\n */\nclass SerializeBuffer {\n  //! Access to a deserialize buffer\n  friend DeSerializeBuffer;\n\n  //! type of data buffer\n  // using vTy = std::vector<uint8_t>;\n  using vTy = galois::PODResizeableArray<uint8_t>;\n  //! the actual data stored in this buffer\n  vTy bufdata;\n\npublic:\n  //! default constructor\n  SerializeBuffer() = default;\n  //! move constructor (defaulted)\n  SerializeBuffer(SerializeBuffer&& rhs) = default;\n  //! Creates a buffer from another buffer\n  //! @param d buffer to create from\n  //! @param len number of bytes to copy from buffer d\n  SerializeBuffer(const char* d, unsigned len) : bufdata(d, d + len) {}\n\n  //! Push a character onto the serialize buffer\n  inline void push(const char c) { bufdata.push_back(c); }\n\n  //! Insert characters from a buffer into the serialize buffer\n  void insert(const uint8_t* c, size_t bytes) {\n    bufdata.insert(bufdata.end(), c, c + bytes);\n  }\n\n  //! Insert characters from a buffer into the serialize buffer at a particular\n  //! offset\n  void insertAt(const uint8_t* c, size_t bytes, size_t offset) {\n    std::copy_n(c, bytes, bufdata.begin() + offset);\n  }\n\n  /**\n   * Reserve space at the end for inserting new data into the serialize\n   * buffer\n   *\n   * @param bytes number of bytes to reserve at the end\n   * @returns offset to the end of the buffer before new space was reserved\n   */\n  size_t encomber(size_t bytes) {\n    size_t retval = bufdata.size();\n    bufdata.resize(retval + bytes);\n    return retval;\n  }\n\n  void resize(size_t bytes) { bufdata.resize(bytes); }\n\n  /**\n   * Reserve more space in the serialize buffer.\n   *\n   * @param s extra space to reserve\n   */\n  void reserve(size_t s) { bufdata.reserve(bufdata.size() + s); }\n\n  //! Returns a pointer to the data stored in this serialize buffer\n  const uint8_t* linearData() const { return bufdata.data(); }\n  //! 
Returns vector of data stored in this serialize buffer\n  vTy& getVec() { return bufdata; }\n\n  //! Returns an iterator to the beginning of the data in this serialize buffer\n  vTy::const_iterator begin() const { return bufdata.cbegin(); }\n  //! Returns an iterator to the end of the data in this serialize buffer\n  vTy::const_iterator end() const { return bufdata.cend(); }\n\n  using size_type = vTy::size_type;\n\n  //! Returns the size of the serialize buffer\n  size_type size() const { return bufdata.size(); }\n\n  //! Utility print function for the serialize buffer\n  //! @param o stream to print to\n  void print(std::ostream& o) const {\n    o << \"<{\" << std::hex;\n    for (auto& i : bufdata)\n      o << (unsigned int)i << \" \";\n    o << std::dec << \"}>\";\n  }\n\n  //! Operator that calls the print function of the serialize buffer\n  friend std::ostream& operator<<(std::ostream& os, const SerializeBuffer& b) {\n    b.print(os);\n    return os;\n  }\n};\n\n/**\n * Buffer for deserialization of data. Mainly used during network\n * communication.\n */\nclass DeSerializeBuffer {\n  //! Access to serialize buffer\n  friend SerializeBuffer;\n  //! type of data buffer\n  // using vTy = std::vector<uint8_t>;\n  using vTy = galois::PODResizeableArray<uint8_t>;\n  //! the actual data stored in this buffer\n  vTy bufdata;\n  int offset;\n\npublic:\n  //! Constructor initializes offset into buffer to 0\n  DeSerializeBuffer() : offset(0) {}\n  //! Disable copy constructor\n  DeSerializeBuffer(DeSerializeBuffer&&) = default;\n  //! Move constructor\n  //! @param v vector to act as deserialize buffer\n  //! @param start offset to start saving data into\n  DeSerializeBuffer(vTy&& v, uint32_t start = 0)\n      : bufdata(std::move(v)), offset(start) {}\n\n  //! Constructor that takes an existing vector to use as the deserialize\n  //! 
buffer\n  explicit DeSerializeBuffer(vTy& data) {\n    bufdata.swap(data);\n    offset = 0;\n  }\n\n  /**\n   * Initializes the deserialize buffer with a certain size\n   * @param [in] count size to initialize buffer to\n   */\n  explicit DeSerializeBuffer(int count) : bufdata(count), offset(0) {}\n\n  /**\n   * Initializes the deserialize buffer using vector initialization from\n   * 2 iterators.\n   */\n  template <typename Iter>\n  DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{0} {}\n\n  /**\n   * Initialize a deserialize buffer from a serialize buffer\n   */\n  explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(0) {\n    bufdata.swap(buf.bufdata);\n  }\n\n  /**\n   * Move assignment operator (defaulted)\n   */\n  DeSerializeBuffer& operator=(DeSerializeBuffer&& buf) = default;\n\n  /**\n   * Reset deserialize buffer\n   * @param count new size of buffer\n   */\n  void reset(int count) {\n    offset = 0;\n    bufdata.resize(count);\n  }\n\n  //! Gets the current offset into the deserialize buffer\n  unsigned getOffset() const { return offset; }\n  //! Sets the offset into the deserialize buffer\n  void setOffset(unsigned off) {\n    assert(off <= size());\n    offset = off;\n  }\n\n  //! Gets the size of the deserialize buffer\n  unsigned size() const { return bufdata.size(); }\n\n  //! Returns true if the deserialize buffer is empty\n  //! @returns true if the deserialize buffer is empty\n  bool empty() const { return bufdata.empty(); }\n\n  //! Get the next character in the deserialize buffer\n  unsigned char pop() { return bufdata.at(offset++); }\n\n  //! Clears the last x bytes of the deserialize buffer, resizing it as well\n  //! 
@param x How many bytes from the end to clear\n  void pop_back(unsigned x) { bufdata.resize(bufdata.size() - x); }\n\n  /**\n   * Extracts a certain amount of data from the deserialize buffer\n   *\n   * @param dst buffer to copy data from deserialize buffer into\n   * @param num Amount of data to get from deserialize buffer\n   */\n  void extract(uint8_t* dst, size_t num) {\n    if (num > 0) {\n      memcpy(dst, &bufdata[offset], num);\n      offset += num;\n    }\n  }\n\n  //! Get the underlying vector storing the data of the deserialize\n  //! buffer\n  vTy& getVec() { return bufdata; }\n\n  //! Get a pointer to the underlying data of the deserialize buffer\n  void* linearData() { return &bufdata[0]; }\n\n  //! Get a pointer to the remaining data of the deserialize buffer\n  //! (as determined by offset)\n  const uint8_t* r_linearData() const { return &bufdata[offset]; }\n  //! Get the remaining size of the deserialize buffer (as determined\n  //! by offset)\n  size_t r_size() const { return bufdata.size() - offset; }\n\n  //! Checks if the current location in the deserialize buffer is aligned\n  //! to some size a\n  bool atAlignment(size_t a) { return (uintptr_t)r_linearData() % a == 0; }\n\n  //! Utility print of deserialize buffer\n  //! @param o stream to print to\n  void print(std::ostream& o) const {\n    o << \"<{(\" << offset << \") \" << std::hex;\n    for (auto ii = bufdata.begin(), ee = bufdata.end(); ii != ee; ++ii)\n      o << (unsigned int)*ii << \" \";\n    o << std::dec << \"}>\";\n  }\n\n  //! 
Operator for printing deserialize buffer\n  friend std::ostream& operator<<(std::ostream& os,\n                                  const DeSerializeBuffer& buf) {\n    buf.print(os);\n    return os;\n  }\n};\n\nnamespace internal {\n\n/**\n * Returns the size necessary for an object in a buffer.\n * This version runs if the data is memory copyable; uses sizeof.\n *\n * @tparam T type of data to get size of\n */\ntemplate <typename T>\n__attribute__((always_inline)) constexpr size_t\ngSizedObj(const T&,\n          typename std::enable_if<is_memory_copyable<T>::value>::type* = 0) {\n  return sizeof(T);\n}\n\n/**\n * Returns the size necessary for an object in a buffer.\n * This version runs if the data is not memory copyable but is serializable.\n * It returns the size of a uintptr_t.\n *\n * @tparam T type of data to get size of\n * @returns size of uintptr_t\n */\ntemplate <typename T>\n__attribute__((always_inline)) constexpr size_t\ngSizedObj(const T&,\n          typename std::enable_if<!is_memory_copyable<T>::value>::type* = 0,\n          typename std::enable_if<has_serialize<T>::value>::type*       = 0) {\n  return sizeof(uintptr_t);\n}\n\n/**\n * Returns the size necessary for storing 2 elements of a pair into a\n * serialize buffer.\n *\n * @param data pair of 2 elements\n */\ntemplate <typename T1, typename T2>\ninline size_t gSizedObj(const std::pair<T1, T2>& data) {\n  return gSizedObj(data.first) + gSizedObj(data.second);\n}\n\n/**\n * Returns the size necessary to store a sequence in a serialize buffer.\n * This depends on if the sequence is memory copyable.\n */\ntemplate <typename Seq>\nsize_t gSizedSeq(const Seq& seq) {\n  typename Seq::size_type size = seq.size();\n  typedef typename Seq::value_type T;\n  size_t tsize = std::conditional<\n      is_memory_copyable<T>::value, std::integral_constant<size_t, sizeof(T)>,\n      std::integral_constant<size_t, sizeof(uintptr_t)>>::type::value;\n  return sizeof(size) + tsize * size;\n}\n\n/**\n * Returns the 
size needed to store the elements of a vector in a serialize\n * buffer.\n *\n * @returns size needed to store a vector into a serialize buffer\n */\ntemplate <typename T, typename Alloc>\ninline size_t gSizedObj(const std::vector<T, Alloc>& data) {\n  return gSizedSeq(data);\n}\n\n/**\n * Returns the size needed to store the elements of a PODResizeableArray in a\n * serialize buffer.\n *\n * @returns size needed to store a PODResizeableArray into a serialize buffer\n */\ntemplate <typename T>\ninline size_t gSizedObj(const galois::PODResizeableArray<T>& data) {\n  return gSizedSeq(data);\n}\n\n/**\n * Returns the size needed to store the elements of a deque into a serialize\n * buffer.\n *\n * @returns size needed to store a deque into a serialize buffer\n */\ntemplate <typename T, typename Alloc>\ninline size_t gSerializeObj(const std::deque<T, Alloc>& data) {\n  return gSizedSeq(data);\n}\n\n/**\n * Returns the size needed to store the elements of a Galois deque into a\n * serialize buffer.\n *\n * @returns size needed to store a Galois deque into a serialize buffer\n */\ntemplate <typename T, unsigned CS>\ninline size_t gSizedObj(const galois::gdeque<T, CS>& data) {\n  return gSizedSeq(data);\n}\n\n/**\n * Returns the size needed to store a string into a serialize\n * buffer.\n *\n * @returns size needed to store a string into a serialize buffer\n */\ntemplate <typename A>\ninline size_t\ngSizedObj(const std::basic_string<char, std::char_traits<char>, A>& data) {\n  return data.length() + 1;\n}\n\n/**\n * Returns the size of the passed in serialize buffer\n *\n * @returns size of the serialize buffer passed into it\n */\ninline size_t gSizedObj(const SerializeBuffer& data) { return data.size(); }\n\n/**\n * Returns the size of the passed in deserialize buffer\n *\n * @returns size of the deserialize buffer passed into it\n */\ninline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.r_size(); }\n\n/**\n * Returns the size of the passed in insert bag.\n *\n * 
@returns size of the insert bag passed into it\n */\ntemplate <typename T>\ninline size_t gSizedObj(const galois::InsertBag<T>& bag) {\n  return bag.size();\n}\n\n/**\n * Returns 0.\n * @returns 0\n */\ninline size_t adder() { return 0; }\n/**\n * Returns the passed in argument.\n * @param a a number\n * @returns a\n */\ninline size_t adder(size_t a) { return a; }\n/**\n * Returns the sum of all passed in arguments.\n * @returns sum of all arguments\n */\ntemplate <typename... Args>\ninline size_t adder(size_t a, size_t b, Args&&... args) {\n  return a + b + adder(args...);\n}\n\n} // namespace internal\n\n/**\n * Gets the total size necessary for storing all of the passed in arguments into\n * a serialize buffer.\n *\n * @returns size necessary for storing all arguments into a serialize buffer\n */\ntemplate <typename... Args>\nstatic inline size_t gSized(Args&&... args) {\n  return internal::adder(internal::gSizedObj(args)...);\n}\n\n////////////////////////////////////////////////////////////////////////////////\n// Serialize support\n////////////////////////////////////////////////////////////////////////////////\n\nnamespace internal {\n\n/**\n * Serialize a memory copyable object into a serialize buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data Data to serialize\n */\ntemplate <typename T>\ninline void gSerializeObj(\n    SerializeBuffer& buf, const T& data,\n    typename std::enable_if<is_memory_copyable<T>::value>::type* = 0) {\n  uint8_t* pdata = (uint8_t*)&data;\n  buf.insert(pdata, sizeof(T));\n}\n\n/**\n * Serialize a non-memory copyable but serializable object into a serialize\n * buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data Data to serialize\n */\ntemplate <typename T>\ninline void\ngSerializeObj(SerializeBuffer& buf, const T& data,\n              typename std::enable_if<!is_memory_copyable<T>::value>::type* = 0,\n              typename 
std::enable_if<has_serialize<T>::value>::type* = 0) {\n  data.serialize(buf);\n}\n\n/**\n * Serialize a pair into a serialize buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data Pair to serialize\n */\ntemplate <typename T1, typename T2>\ninline void gSerializeObj(SerializeBuffer& buf, const std::pair<T1, T2>& data) {\n  gSerialize(buf, data.first, data.second);\n}\n\n/**\n * Serialize a pair. Either memcpys entire struct or serializes\n * each element individually.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data Pair to serialize\n */\ntemplate <typename T1, typename T2>\ninline void gSerializeObj(SerializeBuffer& buf,\n                          const galois::Pair<T1, T2>& data) {\n  if (is_memory_copyable<T1>::value && is_memory_copyable<T2>::value) {\n    // do memcpy\n    buf.insert((uint8_t*)&data, sizeof(data));\n  } else {\n    // serialize each individually\n    gSerialize(buf, data.first, data.second);\n  }\n}\n\n/**\n * Serialize a tuple of 3. Either memcpys entire struct or serializes\n * each element individually.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data Tuple of 3 to serialize\n * @todo This specialization isn't being used as expected. 
Figure out why.\n */\ntemplate <typename T1, typename T2, typename T3>\ninline void gSerializeObj(SerializeBuffer& buf,\n                          const galois::TupleOfThree<T1, T2, T3>& data) {\n  if (is_memory_copyable<T1>::value && is_memory_copyable<T2>::value &&\n      is_memory_copyable<T3>::value) {\n    // do memcpy\n    buf.insert((uint8_t*)&data, sizeof(data));\n  } else {\n    // serialize each individually\n    gSerialize(buf, data.first, data.second, data.third);\n  }\n}\n\n/**\n * Serialize a copyable atomic: load atomic data as a plain old\n * datatype (POD) and mem copy it to the buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data copyable atomic to serialize\n */\ntemplate <typename T>\ninline void gSerializeObj(SerializeBuffer& buf,\n                          const galois::CopyableAtomic<T>& data) {\n  T temp = data.load();\n  buf.insert((uint8_t*)(&temp), sizeof(T));\n}\n\n/**\n * Serialize a string into a buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data String\n */\ntemplate <typename A>\ninline void\ngSerializeObj(SerializeBuffer& buf,\n              const std::basic_string<char, std::char_traits<char>, A>& data) {\n  buf.insert((uint8_t*)data.data(), data.length() + 1);\n}\n\n// Forward declaration of vector serialize\ntemplate <typename T, typename Alloc>\ninline void gSerializeObj(SerializeBuffer& buf,\n                          const std::vector<T, Alloc>& data);\n\n/**\n * Serialize a sequence type into a buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] seq sequence to serialize\n * @todo specialize for Sequences with consecutive PODS\n */\ntemplate <typename Seq>\nvoid gSerializeSeq(SerializeBuffer& buf, const Seq& seq) {\n  typename Seq::size_type size = seq.size();\n  gSerializeObj(buf, size);\n  for (auto& o : seq)\n    gSerializeObj(buf, o);\n}\n\n/**\n * Serialize a linear sequence type (i.e. 
memcopyable) into a buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] seq sequence to serialize\n */\ntemplate <typename Seq>\nvoid gSerializeLinearSeq(SerializeBuffer& buf, const Seq& seq) {\n  typename Seq::size_type size = seq.size();\n  typedef typename Seq::value_type T;\n  size_t tsize = sizeof(T);\n  //  buf.reserve(size * tsize + sizeof(size));\n  gSerializeObj(buf, size);\n  buf.insert((uint8_t*)seq.data(), size * tsize);\n}\n\n/**\n * Serialize a vector into a buffer, choosing to do a memcopy or\n * to serialize each element individually depending on data.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data vector to serialize\n */\ntemplate <typename T, typename Alloc>\ninline void gSerializeObj(SerializeBuffer& buf,\n                          const std::vector<T, Alloc>& data) {\n  if (is_memory_copyable<T>::value)\n    gSerializeLinearSeq(buf, data);\n  else\n    gSerializeSeq(buf, data);\n}\n\n/**\n * Serialize a PODResizeableArray into a buffer, choosing to do a memcopy or\n * to serialize each element individually depending on data.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data PODResizeableArray to serialize\n */\ntemplate <typename T>\ninline void gSerializeObj(SerializeBuffer& buf,\n                          const galois::PODResizeableArray<T>& data) {\n  gSerializeLinearSeq(buf, data);\n}\n\n/**\n * Serialize a deque into a buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data deque to serialize\n */\ntemplate <typename T, typename Alloc>\ninline void gSerializeObj(SerializeBuffer& buf,\n                          const std::deque<T, Alloc>& data) {\n  gSerializeSeq(buf, data);\n}\n\n/**\n * Serialize a Galois deque into a buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data deque to serialize\n */\ntemplate <typename T, unsigned CS>\ninline void 
gSerializeObj(SerializeBuffer& buf,\n                          const galois::gdeque<T, CS>& data) {\n  gSerializeSeq(buf, data);\n}\n\n/**\n * Serialize data in another serialize buffer into a buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data serialize buffer to get data from\n */\ninline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) {\n  buf.insert(data.linearData(), data.size());\n}\n\n/**\n * Serialize data in a deserialize buffer into a buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] rbuf deserialize buffer to get data from\n */\ninline void gSerializeObj(SerializeBuffer& buf, const DeSerializeBuffer& rbuf) {\n  //  buf.reserve(rbuf.r_size());\n  buf.insert(rbuf.r_linearData(), rbuf.r_size());\n}\n\n/**\n * Serialize a dynamic bitset into a buffer.\n *\n * @param [in,out] buf Serialize buffer to serialize into\n * @param [in] data dynamic bitset to serialize\n */\ninline void gSerializeObj(SerializeBuffer& buf,\n                          const galois::DynamicBitSet& data) {\n  gSerializeObj(buf, data.size());\n  gSerializeObj(buf, data.get_vec());\n}\n\n// we removed the functions in Bag.h that this function requires, so this\n// won't work\n#if 0\n/**\n * For serializing insertBag.\n * Insert contigous memory chunks for each thread\n * and clear it.\n * Can not be const.\n * Implemention below makes sure that it can be deserialized\n * into a linear sequence like vector or deque.\n */\ntemplate<typename T>\ninline void gSerializeObj(SerializeBuffer& buf, galois::InsertBag<T>& bag){\n  gSerializeObj(buf, bag.size());\n  auto headerVec = bag.getHeads();\n  size_t totalSize = 0;\n  for(auto h : headerVec){\n    size_t localSize = (h->dend - h->dbegin);\n    buf.insert((uint8_t*)h->dbegin, localSize*sizeof(T));\n    totalSize += (h->dend - h->dbegin);\n  }\n\n  assert(totalSize == bag.size());\n  bag.clear();\n}\n#endif\n} // namespace internal\n\n/**\n * 
LazyRef structure; used to store both a type and an offset to begin\n * saving data into\n */\ntemplate <typename T>\nstruct LazyRef {\n  size_t off;\n};\n\n/**\n * Lazy serialize: doesn't actually serialize the data itself, but only\n * reserves space for it in the serialize buffer + serializes the\n * passed in num.\n */\ntemplate <typename Seq>\nstatic inline LazyRef<typename Seq::value_type>\ngSerializeLazySeq(SerializeBuffer& buf, unsigned num, Seq*) {\n  static_assert(is_memory_copyable<typename Seq::value_type>::value,\n                \"Not POD Sequence\");\n  typename Seq::size_type size = num;\n  internal::gSerializeObj(buf, size);\n  size_t tsize = sizeof(typename Seq::value_type);\n  return LazyRef<typename Seq::value_type>{buf.encomber(tsize * num)};\n}\n\n/**\n * Lazy serialize: given an offset and type through a LazyRef object,\n * serializes a certain amount from the passed in data array.\n *\n * @param buf Buffer to serialize into\n * @param r struct with info on where to start saving data and the type\n * of the data that needs to be saved\n * @param item Number of items that need to be serialized\n * @param data Data array containing data that needs to be serialized\n */\ntemplate <typename Ty>\nstatic inline void gSerializeLazy(SerializeBuffer& buf, LazyRef<Ty> r,\n                                  unsigned item, Ty&& data) {\n  size_t off     = r.off + sizeof(Ty) * item;\n  uint8_t* pdata = (uint8_t*)&data;\n  buf.insertAt(pdata, sizeof(Ty), off);\n}\n\n/**\n * Serialize an entire series of datatypes into a provided serialize buffer\n */\ntemplate <typename T1, typename... Args>\nstatic inline void gSerialize(SerializeBuffer& buf, T1&& t1, Args&&... args) {\n  buf.reserve(gSized(t1, args...));\n  internal::gSerializeObj(buf, std::forward<T1>(t1));\n  gSerialize(buf, std::forward<Args>(args)...);\n}\n\n/**\n * No-op function. 
\"Base case\" for recursive gSerialize function.\n */\nstatic inline void gSerialize(SerializeBuffer&) {}\n\n////////////////////////////////////////////////////////////////////////////////\n// Deserialize support\n////////////////////////////////////////////////////////////////////////////////\n\nnamespace internal {\n\n/**\n * Deserialize a memcopyable object from a buffer.\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] Data to deserialize into\n */\ntemplate <typename T>\nvoid gDeserializeObj(\n    DeSerializeBuffer& buf, T& data,\n    typename std::enable_if<is_memory_copyable<T>::value>::type* = 0) {\n  uint8_t* pdata = (uint8_t*)&data;\n  buf.extract(pdata, sizeof(T));\n}\n\n/**\n * Deserialize a non-memcopyable but seralizable object from a buffer.\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] Data to deserialize into\n */\ntemplate <typename T>\nvoid gDeserializeObj(\n    DeSerializeBuffer& buf, T& data,\n    typename std::enable_if<!is_memory_copyable<T>::value>::type* = 0,\n    typename std::enable_if<has_serialize<T>::value>::type*       = 0) {\n  data.deserialize(buf);\n}\n\n/**\n * Deserialize a pair from a buffer.\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] pair to deserialize into\n */\ntemplate <typename T1, typename T2>\nvoid gDeserializeObj(DeSerializeBuffer& buf, std::pair<T1, T2>& data) {\n  gDeserialize(buf, data.first, data.second);\n}\n\n/**\n * Deserialize into a pair. 
Either memcpys from buffer or deserializes\n * each element individually.\n *\n * @param [in,out] buf Buffer to deserialize from\n * @param [in] data Pair to deserialize into\n */\ntemplate <typename T1, typename T2>\ninline void gDeserializeObj(DeSerializeBuffer& buf,\n                            galois::Pair<T1, T2>& data) {\n  if (is_memory_copyable<T1>::value && is_memory_copyable<T2>::value) {\n    // do memcpy\n    buf.extract((uint8_t*)&data, sizeof(data));\n  } else {\n    // deserialize each individually\n    gDeserialize(buf, data.first, data.second);\n  }\n}\n\n/**\n * Deserialize into a tuple of 3. Either memcpys from buffer or deserializes\n * each element individually.\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] triple to deserialize into\n * @todo This specialization isn't being used as expected. Figure out why.\n */\ntemplate <typename T1, typename T2, typename T3>\ninline void gDeserializeObj(DeSerializeBuffer& buf,\n                            galois::TupleOfThree<T1, T2, T3>& data) {\n  if (is_memory_copyable<T1>::value && is_memory_copyable<T2>::value &&\n      is_memory_copyable<T3>::value) {\n    // do memcpy straight to data\n    buf.extract((uint8_t*)&data, sizeof(data));\n  } else {\n    // deserialize each individually\n    gDeserialize(buf, data.first, data.second, data.third);\n  }\n}\n\n/**\n * Deserialize into a CopyableAtomic. Loads the POD from the DeserializeBuffer\n * then stores it into the atomic.\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] copyable atomic to deserialize into\n */\ntemplate <typename T>\nvoid gDeserializeObj(DeSerializeBuffer& buf, galois::CopyableAtomic<T>& data) {\n  T tempData;\n  uint8_t* pointerToTemp = (uint8_t*)&tempData;\n  buf.extract(pointerToTemp, sizeof(T));\n  data.store(tempData);\n}\n\nnamespace {\ntemplate <int...>\nstruct seq {};\ntemplate <int N, int... S>\nstruct gens : gens<N - 1, N - 1, S...> {};\ntemplate <int... 
S>\nstruct gens<0, S...> {\n  typedef seq<S...> type;\n};\n} // namespace\n\n/**\n * Deserialize into a tuple.\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] tuple to serialize into\n */\ntemplate <typename... T, int... S>\nvoid gDeserializeTuple(DeSerializeBuffer& buf, std::tuple<T...>& data,\n                       seq<S...>) {\n  gDeserialize(buf, std::get<S>(data)...);\n}\n\n/**\n * Wrapper for deserialization into a tuple.\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] tuple to serialize into\n */\ntemplate <typename... T>\nvoid gDeserializeObj(DeSerializeBuffer& buf, std::tuple<T...>& data) {\n  return gDeserializeTuple(buf, data, typename gens<sizeof...(T)>::type());\n}\n\n/**\n * Deserialize into a string.\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] string to serialize into\n */\ntemplate <typename A>\ninline void\ngDeserializeObj(DeSerializeBuffer& buf,\n                std::basic_string<char, std::char_traits<char>, A>& data) {\n  char c = buf.pop();\n  while (c != '\\0') {\n    data.push_back(c);\n    c = buf.pop();\n  };\n}\n\n// Forward declaration of vector deserialize\ntemplate <typename T, typename Alloc>\nvoid gDeserializeObj(DeSerializeBuffer& buf, std::vector<T, Alloc>& data);\n\n/**\n * Deserialize into a sequence object\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param seq [in,out] sequence to deserialize into\n */\ntemplate <typename Seq>\nvoid gDeserializeSeq(DeSerializeBuffer& buf, Seq& seq) {\n  seq.clear();\n  typename Seq::size_type size;\n  gDeserializeObj(buf, size);\n  while (size--) {\n    typename Seq::value_type v;\n    gDeserializeObj(buf, v);\n    seq.push_back(v);\n  }\n}\n\n/**\n * Deserialize into a linear sequence object (i.e. 
one that is mem-copyable)\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param seq [in,out] sequence to deserialize into\n */\ntemplate <typename Seq>\nvoid gDeserializeLinearSeq(DeSerializeBuffer& buf, Seq& seq) {\n  typedef typename Seq::value_type T;\n  //  seq.clear();\n  typename Seq::size_type size;\n  gDeserializeObj(buf, size);\n  // If the alignment is right, cast to a T array and insert\n  if (buf.atAlignment(alignof(T))) {\n    T* src = (T*)buf.r_linearData();\n    seq.assign(src, &src[size]);\n    buf.setOffset(buf.getOffset() + size * sizeof(T));\n  } else {\n    seq.resize(size);\n    buf.extract((uint8_t*)seq.data(), size * sizeof(T));\n  }\n}\n\n/**\n * Deserialize into a deque\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] deque to deserialize into\n */\ntemplate <typename T, typename Alloc>\nvoid gDeserializeObj(DeSerializeBuffer& buf, std::deque<T, Alloc>& data) {\n  gDeserializeSeq(buf, data);\n}\n\n/**\n * Deserialize into a vector; implementation depends on whether or not data in\n * vector is mem-copyable\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] vector to deserialize into\n */\ntemplate <typename T, typename Alloc>\nvoid gDeserializeObj(DeSerializeBuffer& buf, std::vector<T, Alloc>& data) {\n  if (is_memory_copyable<T>::value)\n    gDeserializeLinearSeq(buf, data);\n  else\n    gDeserializeSeq(buf, data);\n}\n\n/**\n * Deserialize into a PODResizeableArray\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] PODResizeableArray to deserialize into\n */\ntemplate <typename T>\nvoid gDeserializeObj(DeSerializeBuffer& buf,\n                     galois::PODResizeableArray<T>& data) {\n  gDeserializeLinearSeq(buf, data);\n}\n\n/**\n * Deserialize into a galois deque\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] galois deque to deserialize into\n */\ntemplate <typename T, unsigned CS>\nvoid 
gDeserializeObj(DeSerializeBuffer& buf, galois::gdeque<T, CS>& data) {\n  gDeserializeSeq(buf, data);\n}\n\n/**\n * Deserialize into a dynamic bitset\n *\n * @param buf [in,out] Buffer to deserialize from\n * @param data [in,out] bitset to deserialize into\n */\ninline void gDeserializeObj(DeSerializeBuffer& buf,\n                            galois::DynamicBitSet& data) {\n  size_t size = 0;\n  gDeserializeObj(buf, size);\n  data.resize(size);\n  gDeserializeObj(buf, data.get_vec());\n}\n\n} // namespace internal\n\n/**\n * Deserialize data in a buffer into a series of objects\n */\ntemplate <typename T1, typename... Args>\nvoid gDeserialize(DeSerializeBuffer& buf, T1&& t1, Args&&... args) {\n  internal::gDeserializeObj(buf, std::forward<T1>(t1));\n  gDeserialize(buf, std::forward<Args>(args)...);\n}\n\n/**\n * Base case for regular gDeserialize recursive call.\n */\ninline void gDeserialize(DeSerializeBuffer&) {}\n\n/**\n * \"Deserialize\" data in an iterator type into a data object.\n *\n * @tparam Iter iterator type that has objects of type T\n * @tparam T type of data to deserialize into\n * @param iter Iterator containing data that we want to save into the passed in\n * data reference\n * @param data Object to save data in the iterator type into\n */\ntemplate <typename Iter, typename T>\nauto gDeserializeRaw(Iter iter, T& data) -> decltype(\n    std::declval<typename std::enable_if<is_memory_copyable<T>::value>::type>(),\n    Iter()) {\n  unsigned char* pdata = (unsigned char*)&data;\n  for (size_t i = 0; i < sizeof(T); ++i)\n    pdata[i] = *iter++;\n  return iter;\n}\n\n} // namespace runtime\n} // namespace galois\n\n#endif // SERIALIZE DEF end\n"
  },
  {
    "path": "libdist/src/Barrier.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file libdist/src/Barrier.cpp\n *\n * Contains implementation of HostFence and HostBarrier as well as functions\n * that get static singletons of the 2.\n *\n * A fence flushes out and receives all messages in the network while a barrier\n * simply acts as a barrier in the code for all hosts.\n */\n\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/runtime/Substrate.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n#include \"galois/runtime/Network.h\"\n#include \"galois/runtime/LWCI.h\"\n\n#include <cstdlib>\n#include <cstdio>\n#include <limits>\n\n#include <iostream>\n#include \"galois/runtime/BareMPI.h\"\n\nnamespace {\nclass HostFence : public galois::substrate::Barrier {\npublic:\n  virtual const char* name() const { return \"HostFence\"; }\n\n  virtual void reinit(unsigned) {}\n\n 
 //! control-flow barrier across distributed hosts\n  //! acts as a distributed-memory fence as well (flushes send and receives)\n  virtual void wait() {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    if (galois::runtime::evilPhase == 0) {\n      galois::gWarn(\"evilPhase is 0, implying loop-around or no use: fence \"\n                    \"may not work correctly!\");\n    }\n\n    for (unsigned h = 0; h < net.Num; ++h) {\n      if (h == net.ID)\n        continue;\n      galois::runtime::SendBuffer b;\n      galois::runtime::gSerialize(b, net.ID + 1); // non-zero message\n      net.sendTagged(h, galois::runtime::evilPhase, b);\n    }\n    net.flush(); // flush all sends\n\n    unsigned received = 1; // self\n    while (received < net.Num) {\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        net.handleReceives(); // flush all receives from net.sendMsg() or\n                              // net.sendSimple()\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n      assert(p->first != net.ID);\n      // ignore received data\n      ++received;\n    }\n    ++galois::runtime::evilPhase;\n    if (galois::runtime::evilPhase >=\n        static_cast<uint32_t>(\n            std::numeric_limits<int16_t>::max())) { // limit defined by MPI or\n                                                    // LCI\n      galois::runtime::evilPhase = 1;\n    }\n  }\n};\n\nclass HostBarrier : public galois::substrate::Barrier {\npublic:\n  virtual const char* name() const { return \"HostBarrier\"; }\n\n  virtual void reinit(unsigned) {}\n\n  //! 
Control-flow barrier across distributed hosts\n  virtual void wait() {\n#ifdef GALOIS_USE_LCI\n    lc_barrier(lc_col_ep);\n#else\n    MPI_Barrier(MPI_COMM_WORLD); // assumes MPI_THREAD_MULTIPLE\n#endif\n  }\n};\n\n} // end anonymous namespace\n\ngalois::substrate::Barrier& galois::runtime::getHostBarrier() {\n  static HostBarrier b;\n  return b;\n}\n\ngalois::substrate::Barrier& galois::runtime::getHostFence() {\n  static HostFence b;\n  return b;\n}\n"
  },
  {
    "path": "libdist/src/DistGalois.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file DistGalois.cpp\n *\n * Includes the definitions for DistMemSys's constructor and destructor\n */\n\n#include \"galois/DistGalois.h\"\n#include \"galois/runtime/Network.h\"\n\n//! DistMemSys constructor which calls the shared memory runtime constructor\n//! with the distributed stats manager\ngalois::DistMemSys::DistMemSys()\n    : galois::runtime::SharedMem<galois::runtime::DistStatManager>() {}\n\n//! DistMemSys destructor which reports memory usage from the network\ngalois::DistMemSys::~DistMemSys() {\n  if (MORE_DIST_STATS) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    net.reportMemUsage();\n  }\n}\n"
  },
  {
    "path": "libdist/src/DistStats.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file DistStats.cpp\n *\n * Contains implementations for DistStats.h.\n */\n#include \"galois/runtime/DistStats.h\"\n#include \"galois/runtime/Serialize.h\"\n#include \"galois/DTerminationDetector.h\"\n\nusing namespace galois::runtime;\n\nDistStatManager* internal::distSysStatManager(void) {\n  galois::runtime::StatManager* sm = internal::sysStatManager();\n\n  assert(sm && \"StatManager not initialized\");\n\n  DistStatManager* d = dynamic_cast<DistStatManager*>(sm);\n\n  assert(d && \"dynamic_cast<DistStatManager*> failed\");\n\n  return d;\n}\n\ninline static DistStatManager* dsm(void) {\n  return internal::distSysStatManager();\n}\n\nDistStatManager::DistStatManager(const std::string& outfile)\n    : StatManager(outfile) {}\nDistStatManager::~DistStatManager() {\n  
galois::runtime::internal::destroySystemNetworkInterface();\n}\n\nclass galois::runtime::StatRecvHelper {\n\npublic:\n  static void recvAtHost_0_hostTotalTy(galois::gstl::Str region,\n                                       galois::gstl::Str category,\n                                       StatTotal::Type totalTy) {\n\n    dsm()->addRecvdHostTotalTy(region, category, totalTy);\n  }\n\n  static void recvAtHost_0_int(uint32_t hostID, galois::gstl::Str region,\n                               galois::gstl::Str category, int64_t thrdTotal,\n                               StatTotal::Type totalTy,\n                               const galois::gstl::Vector<int64_t> thrdVals) {\n\n    dsm()->addRecvdStat(hostID, region, category, thrdTotal, totalTy, thrdVals);\n  }\n\n  static void recvAtHost_0_fp(uint32_t hostID, galois::gstl::Str region,\n                              galois::gstl::Str category, double thrdTotal,\n                              StatTotal::Type totalTy,\n                              const galois::gstl::Vector<double> thrdVals) {\n\n    dsm()->addRecvdStat(hostID, region, category, thrdTotal, totalTy, thrdVals);\n  }\n\n  static void\n  recvAtHost_0_str(uint32_t hostID, galois::gstl::Str region,\n                   galois::gstl::Str category, galois::gstl::Str thrdTotal,\n                   StatTotal::Type totalTy,\n                   const galois::gstl::Vector<galois::gstl::Str> thrdVals) {\n\n    dsm()->addRecvdParam(hostID, region, category, thrdTotal, totalTy,\n                         thrdVals);\n  }\n};\n\nvoid DistStatManager::mergeStats(void) {\n  Base::mergeStats();\n  hostTotalTypes.mergeStats();\n  combineAtHost_0();\n}\n\nvoid DistStatManager::combineAtHost_0_helper(void) {\n  const bool IS_HOST0 = getHostID() == 0;\n\n  const auto& hTotalMap = hostTotalTypes.mergedMap();\n\n  size_t syncTypePhase = 0;\n  if (!IS_HOST0) {\n    for (auto i = hTotalMap.cbegin(), end_i = hTotalMap.cend(); i != end_i;\n         ++i) {\n      SendBuffer b;\n      
gSerialize(b, hTotalMap.region(i), hTotalMap.category(i),\n                 hTotalMap.stat(i).totalTy());\n      getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b,\n                                             syncTypePhase);\n    }\n  }\n\n  ++syncTypePhase;\n  for (auto i = Base::intBegin(), end_i = Base::intEnd(); i != end_i; ++i) {\n    Str ln;\n    Str cat;\n    int64_t thrdTotal;\n    StatTotal::Type totalTy;\n    galois::gstl::Vector<int64_t> thrdVals;\n\n    Base::readIntStat(i, ln, cat, thrdTotal, totalTy, thrdVals);\n\n    if (IS_HOST0) {\n      addRecvdStat(0, ln, cat, thrdTotal, totalTy, thrdVals);\n\n    } else {\n      SendBuffer b;\n      gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals);\n      getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b,\n                                             syncTypePhase);\n    }\n  }\n}\n\nvoid DistStatManager::combineAtHost_0_helper2(void) {\n  const bool IS_HOST0 = getHostID() == 0;\n\n  size_t syncTypePhase = 0;\n  for (auto i = Base::fpBegin(), end_i = Base::fpEnd(); i != end_i; ++i) {\n    Str ln;\n    Str cat;\n    double thrdTotal;\n    StatTotal::Type totalTy;\n    galois::gstl::Vector<double> thrdVals;\n\n    Base::readFPstat(i, ln, cat, thrdTotal, totalTy, thrdVals);\n\n    if (IS_HOST0) {\n      addRecvdStat(0, ln, cat, thrdTotal, totalTy, thrdVals);\n\n    } else {\n      SendBuffer b;\n      gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals);\n      getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b,\n                                             syncTypePhase);\n    }\n  }\n\n  ++syncTypePhase;\n  for (auto i = Base::paramBegin(), end_i = Base::paramEnd(); i != end_i; ++i) {\n    Str ln;\n    Str cat;\n    Str thrdTotal;\n    StatTotal::Type totalTy;\n    galois::gstl::Vector<Str> thrdVals;\n\n    Base::readParam(i, ln, cat, thrdTotal, totalTy, thrdVals);\n\n    if (IS_HOST0) {\n      addRecvdParam(0, ln, cat, thrdTotal, totalTy, 
thrdVals);\n\n    } else {\n      SendBuffer b;\n      gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals);\n      getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b,\n                                             syncTypePhase);\n    }\n  }\n}\n\nvoid DistStatManager::receiveAtHost_0_helper(void) {\n  size_t syncTypePhase = 0;\n  {\n    decltype(getSystemNetworkInterface().recieveTagged(\n        galois::runtime::evilPhase, nullptr, syncTypePhase)) p;\n    do {\n      p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase,\n                                                    nullptr, syncTypePhase);\n\n      if (p) {\n        RecvBuffer& b = p->second;\n\n        galois::gstl::Str region;\n        galois::gstl::Str category;\n        StatTotal::Type totalTy;\n        gDeserialize(b, region, category, totalTy);\n\n        StatRecvHelper::recvAtHost_0_hostTotalTy(region, category, totalTy);\n      }\n    } while (p);\n  }\n\n  ++syncTypePhase;\n  {\n    decltype(getSystemNetworkInterface().recieveTagged(\n        galois::runtime::evilPhase, nullptr, syncTypePhase)) p;\n    do {\n      p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase,\n                                                    nullptr, syncTypePhase);\n\n      if (p) {\n        uint32_t hostID = p->first;\n        RecvBuffer& b   = p->second;\n\n        Str ln;\n        Str cat;\n        int64_t thrdTotal;\n        StatTotal::Type totalTy;\n        galois::gstl::Vector<int64_t> thrdVals;\n        gDeserialize(b, ln, cat, thrdTotal, totalTy, thrdVals);\n\n        StatRecvHelper::recvAtHost_0_int(hostID, ln, cat, thrdTotal, totalTy,\n                                         thrdVals);\n      }\n    } while (p);\n  }\n}\n\nvoid DistStatManager::receiveAtHost_0_helper2(void) {\n  size_t syncTypePhase = 0;\n  {\n    decltype(getSystemNetworkInterface().recieveTagged(\n        galois::runtime::evilPhase, nullptr, syncTypePhase)) p;\n    do {\n      p 
= getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase,\n                                                    nullptr, syncTypePhase);\n\n      if (p) {\n        uint32_t hostID = p->first;\n        RecvBuffer& b   = p->second;\n\n        Str ln;\n        Str cat;\n        double thrdTotal;\n        StatTotal::Type totalTy;\n        galois::gstl::Vector<double> thrdVals;\n        gDeserialize(b, ln, cat, thrdTotal, totalTy, thrdVals);\n\n        StatRecvHelper::recvAtHost_0_fp(hostID, ln, cat, thrdTotal, totalTy,\n                                        thrdVals);\n      }\n    } while (p);\n  }\n\n  ++syncTypePhase;\n  {\n    decltype(getSystemNetworkInterface().recieveTagged(\n        galois::runtime::evilPhase, nullptr, syncTypePhase)) p;\n    do {\n      p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase,\n                                                    nullptr, syncTypePhase);\n\n      if (p) {\n        uint32_t hostID = p->first;\n        RecvBuffer& b   = p->second;\n\n        Str ln;\n        Str cat;\n        Str thrdTotal;\n        StatTotal::Type totalTy;\n        galois::gstl::Vector<Str> thrdVals;\n        gDeserialize(b, ln, cat, thrdTotal, totalTy, thrdVals);\n\n        StatRecvHelper::recvAtHost_0_str(hostID, ln, cat, thrdTotal, totalTy,\n                                         thrdVals);\n      }\n    } while (p);\n  }\n}\n\nvoid DistStatManager::combineAtHost_0(void) {\n  galois::DGTerminator<unsigned int> td;\n\n  // host 0 reads stats from Base class\n  // other hosts send stats to host 0\n  combineAtHost_0_helper();\n  getSystemNetworkInterface().flush();\n\n  // work done before check\n  td += 1;\n\n  // barrier\n  while (td.reduce()) {\n    td.reset();\n    if (getHostID() == 0) {\n      // receive from other hosts\n      receiveAtHost_0_helper();\n    }\n  }\n\n  // explicit barrier after logical barrier is required\n  // as next async phase begins immediately\n  getHostBarrier().wait();\n\n  // host 
0 reads stats from Base class\n  // other hosts send stats to host 0\n  combineAtHost_0_helper2();\n  getSystemNetworkInterface().flush();\n\n  td += 1;\n\n  // barrier\n  while (td.reduce()) {\n    td.reset();\n\n    if (getHostID() == 0) {\n      // receive from other hosts\n      receiveAtHost_0_helper2();\n    }\n  }\n\n  // explicit barrier after logical barrier is required\n  // as next async phase begins immediately\n  getHostBarrier().wait();\n}\n\nbool DistStatManager::printingHostVals(void) {\n  return galois::substrate::EnvCheck(DistStatManager::HSTAT_ENV_VAR);\n}\n\nStatTotal::Type\nDistStatManager::findHostTotalTy(const Str& region, const Str& category,\n                                 const StatTotal::Type& thrdTotalTy) const {\n\n  StatTotal::Type hostTotalTy = thrdTotalTy;\n\n  auto& mrgMap = hostTotalTypes.mergedMap();\n\n  auto i = mrgMap.findStat(region, category);\n  if (i != mrgMap.cend()) {\n    hostTotalTy = mrgMap.stat(i).totalTy();\n  }\n\n  return hostTotalTy;\n}\n\nvoid DistStatManager::addRecvdHostTotalTy(const Str& region,\n                                          const Str& category,\n                                          const StatTotal::Type& totalTy) {\n  hostTotalTypes.addToStat(region, category, totalTy);\n}\n\nvoid DistStatManager::addRecvdStat(\n    unsigned hostID, const Str& region, const Str& category, int64_t thrdTotal,\n    const StatTotal::Type& thrdTotalTy,\n    const DistStatManager::ThrdVals<int64_t>& thrdVals) {\n\n  intDistStats.addToStat(\n      region, category,\n      std::make_tuple(hostID, thrdTotal, thrdTotalTy, thrdVals),\n      findHostTotalTy(region, category, thrdTotalTy));\n}\n\nvoid DistStatManager::addRecvdStat(\n    unsigned hostID, const Str& region, const Str& category, double thrdTotal,\n    const StatTotal::Type& thrdTotalTy,\n    const DistStatManager::ThrdVals<double>& thrdVals) {\n\n  fpDistStats.addToStat(\n      region, category,\n      std::make_tuple(hostID, thrdTotal, thrdTotalTy, 
thrdVals),\n      findHostTotalTy(region, category, thrdTotalTy));\n}\n\nvoid DistStatManager::addRecvdParam(\n    unsigned hostID, const Str& region, const Str& category,\n    const Str& thrdTotal, const StatTotal::Type& thrdTotalTy,\n    const DistStatManager::ThrdVals<Str>& thrdVals) {\n\n  strDistStats.addToStat(\n      region, category,\n      std::make_tuple(hostID, thrdTotal, thrdTotalTy, thrdVals),\n      findHostTotalTy(region, category, thrdTotalTy));\n}\n\nvoid DistStatManager::printHeader(std::ostream& out) const {\n  out << \"STAT_TYPE\" << SEP;\n  out << \"HOST_ID\" << SEP;\n  out << \"REGION\" << SEP << \"CATEGORY\" << SEP;\n  out << \"TOTAL_TYPE\" << SEP << \"TOTAL\";\n\n  out << std::endl;\n}\n\nvoid DistStatManager::printStats(std::ostream& out) {\n  mergeStats();\n\n  galois::DGTerminator<unsigned int> td;\n  if (getHostID() == 0) {\n    printHeader(out);\n\n    intDistStats.print(out);\n    fpDistStats.print(out);\n    strDistStats.print(out);\n  }\n  // all hosts must wait for host 0 to finish printing stats\n  while (td.reduce()) {\n  };\n}\n"
  },
  {
    "path": "libdist/src/Network.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file Network.cpp\n *\n * Contains implementations for basic NetworkInterface functions and\n * initializations of some NetworkInterface variables.\n */\n\n#include \"galois/runtime/Tracer.h\"\n#include \"galois/runtime/Network.h\"\n#include \"galois/runtime/NetworkIO.h\"\n\n#include <iostream>\n#include <mutex>\n\nusing namespace galois::runtime;\n\nuint32_t galois::runtime::evilPhase = 1;\n\nuint32_t galois::runtime::NetworkInterface::ID  = 0;\nuint32_t galois::runtime::NetworkInterface::Num = 1;\n\nuint32_t galois::runtime::getHostID() { return NetworkInterface::ID; }\n\ngalois::runtime::NetworkIO::~NetworkIO() {}\n\nvoid NetworkInterface::initializeMPI() {\n  int supportProvided;\n  int initSuccess =\n      MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &supportProvided);\n  if (initSuccess != 
MPI_SUCCESS) {\n    MPI_Abort(MPI_COMM_WORLD, initSuccess);\n  }\n\n  if (supportProvided != MPI_THREAD_MULTIPLE) {\n    GALOIS_DIE(\"MPI_THREAD_MULTIPLE not supported.\");\n  }\n}\n\nvoid NetworkInterface::finalizeMPI() {\n  int finalizeSuccess = MPI_Finalize();\n\n  if (finalizeSuccess != MPI_SUCCESS) {\n    MPI_Abort(MPI_COMM_WORLD, finalizeSuccess);\n  }\n\n  galois::gDebug(\"[\", NetworkInterface::ID, \"] MPI finalized\");\n}\n\nNetworkInterface::NetworkInterface() {}\n\nNetworkInterface::~NetworkInterface() {}\n\nvoid NetworkInterface::reportMemUsage() const {\n  std::string str(\"CommunicationMemUsage\");\n  galois::runtime::reportStat_Tmin(\"dGraph\", str + \"Min\",\n                                   memUsageTracker.getMaxMemUsage());\n  galois::runtime::reportStat_Tmax(\"dGraph\", str + \"Max\",\n                                   memUsageTracker.getMaxMemUsage());\n}\n\n// forward decl\n//! Receive broadcasted messages over the network\nstatic void bcastLandingPad(uint32_t src, ::RecvBuffer& buf);\n\nstatic void bcastLandingPad(uint32_t src, RecvBuffer& buf) {\n  uintptr_t fp;\n  gDeserialize(buf, fp);\n  auto recv = (void (*)(uint32_t, RecvBuffer&))fp;\n  trace(\"NetworkInterface::bcastLandingPad\", (void*)recv);\n  recv(src, buf);\n}\n\nvoid NetworkInterface::sendMsg(uint32_t dest,\n                               void (*recv)(uint32_t, RecvBuffer&),\n                               SendBuffer& buf) {\n  gSerialize(buf, recv);\n  sendTagged(dest, 0, buf);\n}\n\nvoid NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&),\n                                 SendBuffer& buf, bool self) {\n  trace(\"NetworkInterface::broadcast\", (void*)recv);\n  auto fp = (uintptr_t)recv;\n  for (unsigned x = 0; x < Num; ++x) {\n    if (x != ID) {\n      SendBuffer b;\n      gSerialize(b, fp, buf, (uintptr_t)&bcastLandingPad);\n      sendTagged(x, 0, b);\n    } else if (self) {\n      RecvBuffer rb(buf.begin(), buf.end());\n      recv(ID, rb);\n    }\n  }\n}\n\nvoid 
NetworkInterface::handleReceives() {\n  std::unique_lock<substrate::SimpleLock> lg;\n  auto opt = recieveTagged(0, &lg);\n  while (opt) {\n    uint32_t src    = std::get<0>(*opt);\n    RecvBuffer& buf = std::get<1>(*opt);\n    uintptr_t fp    = 0;\n    gDeserializeRaw(buf.r_linearData() + buf.r_size() - sizeof(uintptr_t), fp);\n    buf.pop_back(sizeof(uintptr_t));\n    assert(fp);\n    auto f = (void (*)(uint32_t, RecvBuffer&))fp;\n    f(src, buf);\n    opt = recieveTagged(0, &lg);\n  }\n}\n\nNetworkInterface& galois::runtime::getSystemNetworkInterface() {\n#ifndef GALOIS_USE_LCI\n  return makeNetworkBuffered();\n#else\n  return makeNetworkLCI();\n#endif\n}\n\nvoid galois::runtime::internal::destroySystemNetworkInterface() {\n  // get net interface, then delete it\n  NetworkInterface& netInterface = getSystemNetworkInterface();\n  delete &netInterface;\n}\n"
  },
  {
    "path": "libdist/src/NetworkBuffered.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file NetworkBuffered.cpp\n *\n * Contains NetworkInterfaceBuffered, an implementation of a network interface\n * that buffers messages before sending them out.\n *\n * @todo document this file more\n */\n\n#include \"galois/runtime/Network.h\"\n#include \"galois/runtime/NetworkIO.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#ifdef GALOIS_USE_LCI\n#define NO_AGG\n#endif\n\n#include <thread>\n#include <mutex>\n#include <iostream>\n#include <limits>\n\nusing namespace galois::runtime;\nusing namespace galois::substrate;\n\nnamespace {\n\n/**\n * @class NetworkInterfaceBuffered\n *\n * Buffered network interface: messages are buffered before they are sent out.\n * A single worker thread is initialized to send/receive messages from/to\n * buffers.\n */\nclass NetworkInterfaceBuffered : public NetworkInterface {\n  
static const int COMM_MIN =\n      1400; //! bytes (sligtly smaller than an ethernet packet)\n  static const int COMM_DELAY = 100; //! microseconds delay\n\n  unsigned long statSendNum;\n  unsigned long statSendBytes;\n  unsigned long statSendEnqueued;\n  unsigned long statRecvNum;\n  unsigned long statRecvBytes;\n  unsigned long statRecvDequeued;\n  bool anyReceivedMessages;\n\n  // using vTy = std::vector<uint8_t>;\n  using vTy = galois::PODResizeableArray<uint8_t>;\n\n  /**\n   * Receive buffers for the buffered network interface\n   */\n  class recvBuffer {\n    std::deque<NetworkIO::message> data;\n    size_t frontOffset;\n    SimpleLock qlock;\n    // tag of head of queue\n    std::atomic<uint32_t> dataPresent;\n\n    bool sizeAtLeast(size_t n, uint32_t tag) {\n      size_t tot = -frontOffset;\n      for (auto& v : data) {\n        if (v.tag == tag) {\n          tot += v.data.size();\n          if (tot >= n)\n            return true;\n        } else {\n          return false;\n        }\n      }\n      return false;\n    }\n\n    template <typename IterTy>\n    void copyOut(IterTy it, size_t n) {\n      // assert(sizeAtLeast(n));\n      // fast path is first buffer\n      { // limit scope\n        auto& f0data = data[0].data;\n        for (int k = frontOffset, ke = f0data.size(); k < ke && n; ++k, --n)\n          *it++ = f0data[k];\n      }\n      if (n) { // more data (slow path)\n        for (int j = 1, je = data.size(); j < je && n; ++j) {\n          auto& vdata = data[j].data;\n          for (int k = 0, ke = vdata.size(); k < ke && n; ++k, --n) {\n            *it++ = vdata[k];\n          }\n        }\n      }\n    }\n\n    /**\n     * Return a (moved) vector if the len bytes requested are the last len\n     * bytes of the front of the buffer queue\n     */\n    std::optional<vTy> popVec(uint32_t len,\n                              std::atomic<size_t>& inflightRecvs) {\n      if (data[0].data.size() == frontOffset + len) {\n        vTy 
retval(std::move(data[0].data));\n        data.pop_front();\n        --inflightRecvs;\n        frontOffset = 0;\n        if (data.size()) {\n          dataPresent = data.front().tag;\n        } else {\n          dataPresent = ~0;\n        }\n        return std::optional<vTy>(std::move(retval));\n      } else {\n        return std::optional<vTy>();\n      }\n    }\n\n    void erase(size_t n, std::atomic<size_t>& inflightRecvs) {\n      frontOffset += n;\n      while (frontOffset && frontOffset >= data.front().data.size()) {\n        frontOffset -= data.front().data.size();\n        data.pop_front();\n        --inflightRecvs;\n      }\n      if (data.size()) {\n        dataPresent = data.front().tag;\n      } else {\n        dataPresent = ~0;\n      }\n    }\n\n    uint32_t getLenFromFront(uint32_t tag) {\n      if (sizeAtLeast(sizeof(uint32_t), tag)) {\n        union {\n          uint8_t a[sizeof(uint32_t)];\n          uint32_t b;\n        } c;\n        copyOut(&c.a[0], sizeof(uint32_t));\n        return c.b;\n      } else {\n        return ~0;\n      }\n    }\n\n  public:\n    std::optional<RecvBuffer> popMsg(uint32_t tag,\n                                     std::atomic<size_t>& inflightRecvs) {\n      std::lock_guard<SimpleLock> lg(qlock);\n#ifndef NO_AGG\n      uint32_t len = getLenFromFront(tag);\n      //      assert(len);\n      if (len == ~0U || len == 0)\n        return std::optional<RecvBuffer>();\n      if (!sizeAtLeast(sizeof(uint32_t) + len, tag))\n        return std::optional<RecvBuffer>();\n      erase(4, inflightRecvs);\n\n      // Try just using the buffer\n      if (auto r = popVec(len, inflightRecvs)) {\n        auto start = r->size() - len;\n        //        std::cerr << \"FP \" << r->size() << \" \" << len << \" \" << start\n        //        << \"\\n\";\n        return std::optional<RecvBuffer>(RecvBuffer(std::move(*r), start));\n      }\n\n      RecvBuffer buf(len);\n      // FIXME: This is slows things down 25%\n      
copyOut((char*)buf.linearData(), len);\n      erase(len, inflightRecvs);\n      // std::cerr << \"p \" << tag << \" \" << len << \"\\n\";\n      return std::optional<RecvBuffer>(std::move(buf));\n#else\n      if (data.empty() || data.front().tag != tag)\n        return std::optional<RecvBuffer>();\n\n      vTy vec(std::move(data.front().data));\n\n      data.pop_front();\n      --inflightRecvs;\n      if (!data.empty()) {\n        dataPresent = data.front().tag;\n      } else {\n        dataPresent = ~0;\n      }\n\n      return std::optional<RecvBuffer>(RecvBuffer(std::move(vec), 0));\n#endif\n    }\n\n    // Worker thread interface\n    void add(NetworkIO::message m) {\n      std::lock_guard<SimpleLock> lg(qlock);\n      if (data.empty()) {\n        galois::runtime::trace(\"ADD LATEST \", m.tag);\n        dataPresent = m.tag;\n      }\n\n      // std::cerr << m.data.size() << \" \" <<\n      //              std::count(m.data.begin(), m.data.end(), 0) << \"\\n\";\n      // for (auto x : m.data) {\n      //   std::cerr << (int) x << \" \";\n      // }\n      // std::cerr << \"\\n\";\n      // std::cerr << \"A \" << m.host << \" \" << m.tag << \" \" << m.data.size() <<\n      // \"\\n\";\n\n      data.push_back(std::move(m));\n\n      assert(data.back().data.size() !=\n             (unsigned int)std::count(data.back().data.begin(),\n                                      data.back().data.end(), 0));\n    }\n\n    bool hasData(uint32_t tag) { return dataPresent == tag; }\n\n    size_t size() { return data.size(); }\n\n    uint32_t getPresentTag() { return dataPresent; }\n  }; // end recv buffer class\n\n  std::vector<recvBuffer> recvData;\n  std::vector<SimpleLock> recvLock;\n\n  /**\n   * Send buffers for the buffered network interface\n   */\n  class sendBuffer {\n    struct msg {\n      uint32_t tag;\n      vTy data;\n      msg(uint32_t t, vTy& _data) : tag(t), data(std::move(_data)) {}\n    };\n\n    std::deque<msg> messages;\n    std::atomic<size_t> numBytes;\n   
 std::atomic<unsigned> urgent;\n    //! @todo FIXME track time since some epoch in an atomic.\n    std::chrono::high_resolution_clock::time_point time;\n    SimpleLock lock, timelock;\n\n  public:\n    unsigned long statSendTimeout;\n    unsigned long statSendOverflow;\n    unsigned long statSendUrgent;\n\n    size_t size() { return messages.size(); }\n\n    void markUrgent() {\n      if (numBytes) {\n        std::lock_guard<SimpleLock> lg(lock);\n        urgent = messages.size();\n      }\n    }\n\n    bool ready() {\n#ifndef NO_AGG\n      if (numBytes == 0)\n        return false;\n      if (urgent) {\n        ++statSendUrgent;\n        return true;\n      }\n      if (numBytes > COMM_MIN) {\n        ++statSendOverflow;\n        return true;\n      }\n      auto n = std::chrono::high_resolution_clock::now();\n      decltype(n) mytime;\n      {\n        std::lock_guard<SimpleLock> lg(timelock);\n        mytime = time;\n      }\n      auto elapsed =\n          std::chrono::duration_cast<std::chrono::microseconds>(n - mytime);\n      if (elapsed.count() > COMM_DELAY) {\n        ++statSendTimeout;\n        return true;\n      }\n      return false;\n#else\n      return messages.size() > 0;\n#endif\n    }\n\n    std::pair<uint32_t, vTy>\n    assemble(std::atomic<size_t>& GALOIS_UNUSED(inflightSends)) {\n      std::unique_lock<SimpleLock> lg(lock);\n      if (messages.empty())\n        return std::make_pair(~0, vTy());\n#ifndef NO_AGG\n      // compute message size\n      uint32_t len = 0;\n      int num      = 0;\n      uint32_t tag = messages.front().tag;\n      for (auto& m : messages) {\n        if (m.tag != tag) {\n          break;\n        } else {\n          // do not let it go over the integer limit because MPI_Isend cannot\n          // deal with it\n          if ((m.data.size() + sizeof(uint32_t) + len + num) >\n              static_cast<size_t>(std::numeric_limits<int>::max())) {\n            break;\n          }\n          len += m.data.size();\n          num 
+= sizeof(uint32_t);\n        }\n      }\n      lg.unlock();\n      // construct message\n      vTy vec;\n      vec.reserve(len + num);\n      // go out of our way to avoid locking out senders when making messages\n      lg.lock();\n      do {\n        auto& m = messages.front();\n        lg.unlock();\n        union {\n          uint32_t a;\n          uint8_t b[sizeof(uint32_t)];\n        } foo;\n        foo.a = m.data.size();\n        vec.insert(vec.end(), &foo.b[0], &foo.b[sizeof(uint32_t)]);\n        vec.insert(vec.end(), m.data.begin(), m.data.end());\n        if (urgent)\n          --urgent;\n        lg.lock();\n        messages.pop_front();\n        --inflightSends;\n      } while (vec.size() < len + num);\n      ++inflightSends;\n      numBytes -= len;\n#else\n      uint32_t tag = messages.front().tag;\n      vTy vec(std::move(messages.front().data));\n      messages.pop_front();\n#endif\n      return std::make_pair(tag, std::move(vec));\n    }\n\n    void add(uint32_t tag, vTy& b) {\n      std::lock_guard<SimpleLock> lg(lock);\n      if (messages.empty()) {\n        std::lock_guard<SimpleLock> lg(timelock);\n        time = std::chrono::high_resolution_clock::now();\n      }\n      unsigned oldNumBytes = numBytes;\n      numBytes += b.size();\n      galois::runtime::trace(\"BufferedAdd\", oldNumBytes, numBytes, tag,\n                             galois::runtime::printVec(b));\n      messages.emplace_back(tag, b);\n    }\n  }; // end send buffer class\n\n  std::vector<sendBuffer> sendData;\n\n  void workerThread() {\n    initializeMPI();\n    int rank;\n    int hostSize;\n\n    int rankSuccess = MPI_Comm_rank(MPI_COMM_WORLD, &rank);\n    if (rankSuccess != MPI_SUCCESS) {\n      MPI_Abort(MPI_COMM_WORLD, rankSuccess);\n    }\n\n    int sizeSuccess = MPI_Comm_size(MPI_COMM_WORLD, &hostSize);\n    if (sizeSuccess != MPI_SUCCESS) {\n      MPI_Abort(MPI_COMM_WORLD, sizeSuccess);\n    }\n\n    galois::gDebug(\"[\", NetworkInterface::ID, \"] MPI initialized\");\n    
std::tie(netio, ID, Num) =\n        makeNetworkIOMPI(memUsageTracker, inflightSends, inflightRecvs);\n\n    assert(ID == (unsigned)rank);\n    assert(Num == (unsigned)hostSize);\n\n    ready = 1;\n    while (ready < 2) { /*fprintf(stderr, \"[WaitOnReady-2]\");*/\n    };\n    while (ready != 3) {\n      for (unsigned i = 0; i < sendData.size(); ++i) {\n        netio->progress();\n        // handle send queue i\n        auto& sd = sendData[i];\n        if (sd.ready()) {\n          NetworkIO::message msg;\n          msg.host                    = i;\n          std::tie(msg.tag, msg.data) = sd.assemble(inflightSends);\n          galois::runtime::trace(\"BufferedSending\", msg.host, msg.tag,\n                                 galois::runtime::printVec(msg.data));\n          ++statSendEnqueued;\n          netio->enqueue(std::move(msg));\n        }\n        // handle receive\n        NetworkIO::message rdata = netio->dequeue();\n        if (rdata.data.size()) {\n          ++statRecvDequeued;\n          assert(rdata.data.size() !=\n                 (unsigned int)std::count(rdata.data.begin(), rdata.data.end(),\n                                          0));\n          galois::runtime::trace(\"BufferedRecieving\", rdata.host, rdata.tag,\n                                 galois::runtime::printVec(rdata.data));\n          recvData[rdata.host].add(std::move(rdata));\n        }\n      }\n    }\n    finalizeMPI();\n  }\n\n  std::thread worker;\n  std::atomic<int> ready;\n\npublic:\n  using NetworkInterface::ID;\n  using NetworkInterface::Num;\n\n  NetworkInterfaceBuffered() {\n    inflightSends       = 0;\n    inflightRecvs       = 0;\n    ready               = 0;\n    anyReceivedMessages = false;\n    worker = std::thread(&NetworkInterfaceBuffered::workerThread, this);\n    while (ready != 1) {\n    };\n    recvData = decltype(recvData)(Num);\n    recvLock.resize(Num);\n    sendData = decltype(sendData)(Num);\n    ready    = 2;\n  }\n\n  virtual ~NetworkInterfaceBuffered() {\n    
ready = 3;\n    worker.join();\n  }\n\n  std::unique_ptr<galois::runtime::NetworkIO> netio;\n\n  virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf,\n                          int phase) {\n    ++inflightSends;\n    tag += phase;\n    statSendNum += 1;\n    statSendBytes += buf.size();\n    galois::runtime::trace(\"sendTagged\", dest, tag,\n                           galois::runtime::printVec(buf.getVec()));\n    auto& sd = sendData[dest];\n    sd.add(tag, buf.getVec());\n  }\n\n  virtual std::optional<std::pair<uint32_t, RecvBuffer>>\n  recieveTagged(uint32_t tag,\n                std::unique_lock<galois::substrate::SimpleLock>* rlg,\n                int phase) {\n    tag += phase;\n    for (unsigned h = 0; h < recvData.size(); ++h) {\n      auto& rq = recvData[h];\n      if (rq.hasData(tag)) {\n        if (recvLock[h].try_lock()) {\n          std::unique_lock<galois::substrate::SimpleLock> lg(recvLock[h],\n                                                             std::adopt_lock);\n          auto buf = rq.popMsg(tag, inflightRecvs);\n          if (buf) {\n            ++statRecvNum;\n            statRecvBytes += buf->size();\n            memUsageTracker.decrementMemUsage(buf->size());\n            if (rlg)\n              *rlg = std::move(lg);\n            galois::runtime::trace(\"recvTagged\", h, tag,\n                                   galois::runtime::printVec(buf->getVec()));\n            anyReceivedMessages = true;\n            return std::optional<std::pair<uint32_t, RecvBuffer>>(\n                std::make_pair(h, std::move(*buf)));\n          }\n        }\n      }\n      galois::runtime::trace(\"recvTagged BLOCKED this by that\", tag,\n                             rq.getPresentTag());\n    }\n\n    return std::optional<std::pair<uint32_t, RecvBuffer>>();\n  }\n\n  virtual void flush() {\n    for (auto& sd : sendData)\n      sd.markUrgent();\n  }\n\n  virtual bool anyPendingSends() { return (inflightSends > 0); }\n\n  virtual bool 
anyPendingReceives() {\n    if (anyReceivedMessages) { // might not be acted on by the computation yet\n      anyReceivedMessages = false;\n      // galois::gDebug(\"[\", ID, \"] receive out of buffer \\n\");\n      return true;\n    }\n    // if (inflightRecvs > 0) {\n    // galois::gDebug(\"[\", ID, \"] inflight receive: \", inflightRecvs, \" \\n\");\n    // }\n    return (inflightRecvs > 0);\n  }\n\n  virtual unsigned long reportSendBytes() const { return statSendBytes; }\n  virtual unsigned long reportSendMsgs() const { return statSendNum; }\n  virtual unsigned long reportRecvBytes() const { return statRecvBytes; }\n  virtual unsigned long reportRecvMsgs() const { return statRecvNum; }\n\n  virtual std::vector<unsigned long> reportExtra() const {\n    std::vector<unsigned long> retval(5);\n    for (auto& sd : sendData) {\n      retval[0] += sd.statSendTimeout;\n      retval[1] += sd.statSendOverflow;\n      retval[2] += sd.statSendUrgent;\n    }\n    retval[3] = statSendEnqueued;\n    retval[4] = statRecvDequeued;\n    return retval;\n  }\n\n  virtual std::vector<std::pair<std::string, unsigned long>>\n  reportExtraNamed() const {\n    std::vector<std::pair<std::string, unsigned long>> retval(5);\n    retval[0].first = \"SendTimeout\";\n    retval[1].first = \"SendOverflow\";\n    retval[2].first = \"SendUrgent\";\n    retval[3].first = \"SendEnqueued\";\n    retval[4].first = \"RecvDequeued\";\n    for (auto& sd : sendData) {\n      retval[0].second += sd.statSendTimeout;\n      retval[1].second += sd.statSendOverflow;\n      retval[2].second += sd.statSendUrgent;\n    }\n    retval[3].second = statSendEnqueued;\n    retval[4].second = statRecvDequeued;\n    return retval;\n  }\n};\n\n} // namespace\n\n/**\n * Create a buffered network interface, or return one if already\n * created.\n */\nNetworkInterface& galois::runtime::makeNetworkBuffered() {\n  static std::atomic<NetworkInterfaceBuffered*> net;\n  static substrate::SimpleLock m_mutex;\n\n  // create the 
interface if it doesn't yet exist in the static variable\n  auto* tmp = net.load();\n  if (tmp == nullptr) {\n    std::lock_guard<substrate::SimpleLock> lock(m_mutex);\n    tmp = net.load();\n    if (tmp == nullptr) {\n      tmp = new NetworkInterfaceBuffered();\n      net.store(tmp);\n    }\n  }\n\n  return *tmp;\n}\n"
  },
  {
    "path": "libdist/src/NetworkIOMPI.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file NetworkIOMPI.cpp\n *\n * Contains an implementation of network IO that uses MPI.\n */\n\n#include \"galois/runtime/NetworkIO.h\"\n#include \"galois/runtime/Tracer.h\"\n#include \"galois/substrate/SimpleLock.h\"\n\n/**\n * MPI implementation of network IO. 
ASSUMES THAT MPI IS INITIALIZED\n * UPON CREATION OF THIS OBJECT.\n */\nclass NetworkIOMPI : public galois::runtime::NetworkIO {\nprivate:\n  /**\n   * Get the host id of the caller.\n   *\n   * @returns host id of the caller with regard to the MPI setup\n   */\n  static int getID() {\n    int taskRank;\n    handleError(MPI_Comm_rank(MPI_COMM_WORLD, &taskRank));\n    return taskRank;\n  }\n\n  /**\n   * Get the total number of hosts in the system.\n   *\n   * @returns number of hosts with regard to the MPI setup\n   */\n  static int getNum() {\n    int numTasks;\n    handleError(MPI_Comm_size(MPI_COMM_WORLD, &numTasks));\n    return numTasks;\n  }\n\n  /**\n   * Get both the ID of the caller + number of hosts.\n   */\n  std::pair<int, int> getIDAndHostNum() {\n    return std::make_pair(getID(), getNum());\n  }\n\n  /**\n   * Message type to send/recv in this network IO layer.\n   */\n  struct mpiMessage {\n    uint32_t host;\n    uint32_t tag;\n    vTy data;\n    MPI_Request req;\n    // mpiMessage(message&& _m, MPI_Request _req) : m(std::move(_m)), req(_req)\n    // {}\n    mpiMessage(uint32_t host, uint32_t tag, vTy&& data)\n        : host(host), tag(tag), data(std::move(data)) {}\n    mpiMessage(uint32_t host, uint32_t tag, size_t len)\n        : host(host), tag(tag), data(len) {}\n  };\n\n  /**\n   * Send queue structure.\n   */\n  struct sendQueueTy {\n    std::deque<mpiMessage> inflight;\n\n    galois::runtime::MemUsageTracker& memUsageTracker;\n\n    std::atomic<size_t>& inflightSends;\n\n    sendQueueTy(galois::runtime::MemUsageTracker& tracker,\n                std::atomic<size_t>& sends)\n        : memUsageTracker(tracker), inflightSends(sends) {}\n\n    void complete() {\n      while (!inflight.empty()) {\n        int flag = 0;\n        MPI_Status status;\n        auto& f = inflight.front();\n        int rv  = MPI_Test(&f.req, &flag, &status);\n        handleError(rv);\n        if (flag) {\n          memUsageTracker.decrementMemUsage(f.data.size());\n    
      inflight.pop_front();\n          --inflightSends;\n        } else\n          break;\n      }\n    }\n\n    void send(message m) {\n      inflight.emplace_back(m.host, m.tag, std::move(m.data));\n      auto& f = inflight.back();\n      galois::runtime::trace(\"MPI SEND\", f.host, f.tag, f.data.size(),\n                             galois::runtime::printVec(f.data));\n#ifdef GALOIS_SUPPORT_ASYNC\n      int rv = MPI_Issend(f.data.data(), f.data.size(), MPI_BYTE, f.host, f.tag,\n                          MPI_COMM_WORLD, &f.req);\n#else\n      int rv = MPI_Isend(f.data.data(), f.data.size(), MPI_BYTE, f.host, f.tag,\n                         MPI_COMM_WORLD, &f.req);\n#endif\n      handleError(rv);\n    }\n  };\n\n  /**\n   * Receive queue structure\n   */\n  struct recvQueueTy {\n    std::deque<message> done;\n    std::deque<mpiMessage> inflight;\n\n    galois::runtime::MemUsageTracker& memUsageTracker;\n\n    std::atomic<size_t>& inflightRecvs;\n\n    recvQueueTy(galois::runtime::MemUsageTracker& tracker,\n                std::atomic<size_t>& recvs)\n        : memUsageTracker(tracker), inflightRecvs(recvs) {}\n\n    // FIXME: Does synchronous recieves overly halt forward progress?\n    void probe() {\n      int flag = 0;\n      MPI_Status status;\n      // check for new messages\n      int rv = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag,\n                          &status);\n      handleError(rv);\n      if (flag) {\n#ifdef GALOIS_USE_BARE_MPI\n        assert(status.MPI_TAG <= 32767);\n        if (status.MPI_TAG != 32767) {\n#endif\n          ++inflightRecvs;\n          int nbytes;\n          rv = MPI_Get_count(&status, MPI_BYTE, &nbytes);\n          handleError(rv);\n          inflight.emplace_back(status.MPI_SOURCE, status.MPI_TAG, nbytes);\n          auto& m = inflight.back();\n          memUsageTracker.incrementMemUsage(m.data.size());\n          rv = MPI_Irecv(m.data.data(), nbytes, MPI_BYTE, status.MPI_SOURCE,\n                         
status.MPI_TAG, MPI_COMM_WORLD, &m.req);\n          handleError(rv);\n          galois::runtime::trace(\"MPI IRECV\", status.MPI_SOURCE, status.MPI_TAG,\n                                 m.data.size());\n#ifdef GALOIS_USE_BARE_MPI\n        }\n#endif\n      }\n\n      // complete messages\n      if (!inflight.empty()) {\n        auto& m  = inflight.front();\n        int flag = 0;\n        rv       = MPI_Test(&m.req, &flag, MPI_STATUS_IGNORE);\n        handleError(rv);\n        if (flag) {\n          done.emplace_back(m.host, m.tag, std::move(m.data));\n          inflight.pop_front();\n        }\n      }\n    }\n  };\n\n  sendQueueTy sendQueue;\n  recvQueueTy recvQueue;\n\npublic:\n  /**\n   * Constructor.\n   *\n   * @param tracker memory usage tracker\n   * @param sends\n   * @param recvs\n   * @param [out] ID this machine's host id\n   * @param [out] NUM total number of hosts in the system\n   */\n  NetworkIOMPI(galois::runtime::MemUsageTracker& tracker,\n               std::atomic<size_t>& sends, std::atomic<size_t>& recvs,\n               uint32_t& ID, uint32_t& NUM)\n      : NetworkIO(tracker, sends, recvs), sendQueue(tracker, inflightSends),\n        recvQueue(tracker, inflightRecvs) {\n    auto p = getIDAndHostNum();\n    ID     = p.first;\n    NUM    = p.second;\n  }\n\n  /**\n   * Adds a message to the send queue\n   */\n  virtual void enqueue(message m) {\n    memUsageTracker.incrementMemUsage(m.data.size());\n    sendQueue.send(std::move(m));\n  }\n\n  /**\n   * Attempts to get a message from the recv queue.\n   */\n  virtual message dequeue() {\n    if (!recvQueue.done.empty()) {\n      auto msg = std::move(recvQueue.done.front());\n      recvQueue.done.pop_front();\n      return msg;\n    }\n    return message{~0U, 0, vTy()};\n  }\n\n  /**\n   * Push progress forward in the system.\n   */\n  virtual void progress() {\n    sendQueue.complete();\n    recvQueue.probe();\n  }\n}; // end NetworkIOMPI 
class\n\nstd::tuple<std::unique_ptr<galois::runtime::NetworkIO>, uint32_t, uint32_t>\ngalois::runtime::makeNetworkIOMPI(galois::runtime::MemUsageTracker& tracker,\n                                  std::atomic<size_t>& sends,\n                                  std::atomic<size_t>& recvs) {\n  uint32_t ID, NUM;\n  std::unique_ptr<galois::runtime::NetworkIO> n{\n      new NetworkIOMPI(tracker, sends, recvs, ID, NUM)};\n  return std::make_tuple(std::move(n), ID, NUM);\n}\n"
  },
  {
    "path": "libdist/src/NetworkLCI.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file NetworkBuffered.cpp\n *\n * Contains NetworkInterfaceLCI, an implementation of a network interface\n * that buffers messages before sending them out.\n *\n * @todo document this file more\n */\n\n#ifdef GALOIS_USE_LCI\n#include \"galois/runtime/Network.h\"\n#include \"galois/runtime/NetworkIO.h\"\n#include \"galois/runtime/Tracer.h\"\n#include \"galois/runtime/LWCI.h\"\n\nusing vTy = galois::PODResizeableArray<uint8_t>;\n\n#include <thread>\n#include <mutex>\n#include <iostream>\n#include <limits>\n#include <queue>\n\n#include <boost/lockfree/queue.hpp>\n\nusing namespace galois::runtime;\nusing namespace galois::substrate;\n\n/* CRC-32C (iSCSI) polynomial in reversed bit order. 
*/\n#define POLY 0x82f63b78\ninline uint32_t crc32c(char* buf, size_t len) {\n  uint32_t crc = 0;\n  int k;\n\n  crc = ~crc;\n  while (len--) {\n    crc ^= *buf++;\n    for (k = 0; k < 8; k++)\n      crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;\n  }\n  return ~crc;\n}\n\nlc_ep lc_p2p_ep[3];\nlc_ep lc_col_ep;\n\nstruct pendingReq {\n  uint32_t dest;\n  uint32_t tag;\n  int phase;\n  vTy buf;\n  lc_req req;\n  std::atomic<size_t>& inflight;\n  pendingReq(uint32_t _d, uint32_t _t, int _p, vTy& _buf,\n             std::atomic<size_t>& s)\n      : dest(_d), tag(_t), phase(_p), buf(std::move(_buf)), inflight(s) {\n    s++;\n  }\n  ~pendingReq() { inflight--; }\n};\n\nstatic void* alloc_req(size_t size, void** ctx) {\n  vTy** vector = (vTy**)ctx;\n  *vector      = new vTy(size);\n  return (*vector)->data();\n}\n\nstatic void free_req(void* ctx) {\n  pendingReq* req = (pendingReq*)ctx;\n  delete req;\n}\n\nnamespace {\n\n/**\n * @class NetworkInterfaceLCI\n *\n * Buffered network interface: messages are buffered before they are sent out.\n * A single worker thread is initialized to send/receive messages from/to\n * buffers.\n */\nclass NetworkInterfaceLCI : public NetworkInterface {\n  unsigned long statSendNum;\n  unsigned long statSendBytes;\n  unsigned long statSendEnqueued;\n  unsigned long statRecvNum;\n  unsigned long statRecvBytes;\n  unsigned long statRecvDequeued;\n  bool anyReceivedMessages;\n\n  // using vTy = std::vector<uint8_t>;\n  using vTy = galois::PODResizeableArray<uint8_t>;\n\npublic:\n  void workerThread() {\n    // Initialize LWCI\n    // makeNetworkIOLWCI(memUsageTracker, inflightSends, inflightRecvs);\n    if (ID == 0)\n      fprintf(stderr, \"**Using LWCI Communication layer**\\n\");\n\n    ready = 1;\n    while (ready < 2) { /*fprintf(stderr, \"[WaitOnReady-2]\");*/\n    };\n    while (ready != 3) {\n      lc_progress(0);\n\n      lc_req* req_ptr;\n      for (int phase = 0; phase < 3; phase++) {\n        if (lc_cq_pop(lc_p2p_ep[phase], &req_ptr) 
== LC_OK) {\n          int bin = ((req_ptr->meta % 3) * 3) + phase;\n          bufferedRecv[bin].push(convertReq(req_ptr, phase));\n        }\n      }\n\n      sched_yield();\n    }\n  }\n\n  std::thread worker;\n  std::atomic<int> ready;\n\npublic:\n  using NetworkInterface::ID;\n  using NetworkInterface::Num;\n\n  NetworkInterfaceLCI() {\n    lc_init(1, &lc_col_ep);\n    lc_opt opt;\n    opt.dev   = 0;\n    opt.desc  = LC_DYN_CQ;\n    opt.alloc = alloc_req;\n    lc_ep_dup(&opt, lc_col_ep, &lc_p2p_ep[0]);\n    lc_ep_dup(&opt, lc_col_ep, &lc_p2p_ep[1]);\n    lc_ep_dup(&opt, lc_col_ep, &lc_p2p_ep[2]);\n\n    lc_get_proc_num((int*)&ID);\n    lc_get_num_proc((int*)&Num);\n\n    inflightSends       = 0;\n    inflightRecvs       = 0;\n    ready               = 0;\n    anyReceivedMessages = false;\n    worker              = std::thread(&NetworkInterfaceLCI::workerThread, this);\n    while (ready != 1) {\n    };\n    ready = 2;\n  }\n\n  virtual ~NetworkInterfaceLCI() {\n    ready = 3;\n    worker.join();\n  }\n\n  boost::lockfree::queue<pendingReq*>\n      bufferedRecv[9]; // [0, 1, 2] [0, 1, 2] 0: normal, 1: reduce, 2: AM\n\n  virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf,\n                          int phase) {\n    if (tag == 0)\n      phase = 2;\n\n    statSendNum += 1;\n    statSendBytes += buf.size();\n    // int count = 0;\n#ifndef GALOIS_SUPPORT_ASYNC\n    if (buf.getVec().size() < 8192) {\n      while (lc_sendm(buf.getVec().data(), buf.getVec().size(), dest, tag,\n                      lc_p2p_ep[phase]) != LC_OK) {\n        sched_yield();\n      }\n    } else\n#endif\n    {\n      pendingReq* msg =\n          new pendingReq(dest, tag, phase, buf.getVec(), inflightSends);\n      while (lc_sendl(msg->buf.data(), msg->buf.size(), dest, tag,\n                      lc_p2p_ep[phase], free_req, msg) != LC_OK) {\n        sched_yield();\n      }\n    }\n  }\n\n  inline pendingReq* convertReq(lc_req* req_ptr, int phase) {\n    // Need to drain LCI 
queue to allow more injection.\n    // Convert internal LCI request to a Galois pending request.\n    vTy buf  = std::move(*((vTy*)(req_ptr->ctx)));\n    int rank = req_ptr->rank;\n    int meta = req_ptr->meta;\n    delete (vTy*)req_ptr->ctx;\n    lc_cq_reqfree(lc_p2p_ep[phase], req_ptr);\n    return new pendingReq(rank, meta, phase, buf, inflightRecvs);\n  }\n\n  virtual std::optional<std::pair<uint32_t, RecvBuffer>>\n  recieveTagged(uint32_t tag,\n                std::unique_lock<galois::substrate::SimpleLock>* /*rlg*/,\n                int phase) {\n    if (tag == 0)\n      phase = 2;\n    // static int count = 0;\n\n    pendingReq* req;\n    int bin = ((tag % 3) * 3) + phase;\n    if (!bufferedRecv[bin].pop(req)) {\n      // if (count ++ == 10000) {\n      //  printf(\"[%d] WARNING possible lock out on RECV %d\\n\", ID, tag);\n      // }\n      return std::optional<std::pair<uint32_t, RecvBuffer>>();\n    }\n\n    if (req->tag == tag) {\n      vTy buf  = std::move(req->buf);\n      int dest = req->dest;\n      delete req;\n      return std::optional<std::pair<uint32_t, RecvBuffer>>(\n          std::make_pair(dest, std::move(buf)));\n    } else {\n      printf(\"[%d] WARNING possible lock out, wrong tag %d/%d.\\n\", ID, req->tag,\n             tag);\n      return std::optional<std::pair<uint32_t, RecvBuffer>>();\n    }\n  }\n\n  virtual void flush() {}\n\n  virtual bool anyPendingSends() {\n    // static int count = 0;\n    // if (count++ == 10000)\n    // printf(\"[%d] WARNING possible lock out terminate %d %d\\n\", ID,\n    // inflightSends.load(), inflightRecvs.load());\n    return (inflightSends > 0);\n  }\n\n  virtual bool anyPendingReceives() {\n    if (anyReceivedMessages) { // might not be acted on by the computation yet\n      anyReceivedMessages = false;\n      // galois::gDebug(\"[\", ID, \"] receive out of buffer \\n\");\n      return true;\n    }\n    // if (inflightRecvs > 0) {\n    // galois::gDebug(\"[\", ID, \"] inflight receive: \", 
inflightRecvs, \" \\n\");\n    // }\n    return (inflightRecvs > 0);\n  }\n\n  virtual unsigned long reportSendBytes() const { return statSendBytes; }\n  virtual unsigned long reportSendMsgs() const { return statSendNum; }\n  virtual unsigned long reportRecvBytes() const { return statRecvBytes; }\n  virtual unsigned long reportRecvMsgs() const { return statRecvNum; }\n\n  virtual std::vector<unsigned long> reportExtra() const {\n    std::vector<unsigned long> retval(5);\n    return retval;\n  }\n\n  virtual std::vector<std::pair<std::string, unsigned long>>\n  reportExtraNamed() const {\n    std::vector<std::pair<std::string, unsigned long>> retval(5);\n    retval[0].first  = \"SendTimeout\";\n    retval[1].first  = \"SendOverflow\";\n    retval[2].first  = \"SendUrgent\";\n    retval[3].first  = \"SendEnqueued\";\n    retval[4].first  = \"RecvDequeued\";\n    retval[3].second = statSendEnqueued;\n    retval[4].second = statRecvDequeued;\n    return retval;\n  }\n};\n\n} // namespace\n\n/**\n * Create the LCI network interface, or return the existing one if it was\n * already created.\n */\nNetworkInterface& galois::runtime::makeNetworkLCI() {\n  static std::atomic<NetworkInterfaceLCI*> net;\n  static substrate::SimpleLock m_mutex;\n\n  // create the interface if it doesn't yet exist in the static variable\n  auto* tmp = net.load();\n  if (tmp == nullptr) {\n    std::lock_guard<substrate::SimpleLock> lock(m_mutex);\n    tmp = net.load();\n    if (tmp == nullptr) {\n      tmp = new NetworkInterfaceLCI();\n      net.store(tmp);\n    }\n  }\n\n  return *tmp;\n}\n#endif\n"
  },
  {
    "path": "libgalois/CMakeLists.txt",
    "content": "add_library(galois_shmem)\nadd_library(Galois::shmem ALIAS galois_shmem)\nset_target_properties(galois_shmem PROPERTIES EXPORT_NAME shmem)\nadd_dependencies(lib galois_shmem)\n\nconfigure_file(src/Version.cpp.in Version.cpp @ONLY)\nconfigure_file(include/galois/config.h.in include/galois/config.h)\n\nset(sources\n        \"${CMAKE_CURRENT_BINARY_DIR}/Version.cpp\"\n        src/Barrier_Counting.cpp\n        src/Barrier.cpp\n        src/Barrier_Dissemination.cpp \n        src/Barrier_MCS.cpp\n        src/Barrier_Pthread.cpp\n        src/Barrier_Simple.cpp\n        src/Barrier_Topo.cpp\n        src/Context.cpp\n        src/Deterministic.cpp\n        src/DynamicBitset.cpp\n        src/EnvCheck.cpp\n        src/FileGraph.cpp\n        src/FileGraphParallel.cpp\n        src/gIO.cpp\n        src/GraphHelpers.cpp\n        src/HWTopo.cpp\n        src/Mem.cpp\n        src/NumaMem.cpp\n        src/OCFileGraph.cpp\n        src/PageAlloc.cpp\n        src/PagePool.cpp\n        src/PagePool.cpp\n        src/ParaMeter.cpp\n        src/PerThreadStorage.cpp\n        src/PreAlloc.cpp\n        src/Profile.cpp\n        src/PtrLock.cpp\n        src/SharedMem.cpp\n        src/SharedMemSys.cpp\n        src/SimpleLock.cpp\n        src/Statistics.cpp\n        src/Substrate.cpp\n        src/Support.cpp\n        src/Termination.cpp\n        src/ThreadPool.cpp\n        src/Threads.cpp\n        src/ThreadTimer.cpp\n        src/Timer.cpp\n        src/Tracer.cpp\n)\n\nif (${CMAKE_SYSTEM_NAME} MATCHES \"Darwin\")\n  list(APPEND sources src/HWTopoDarwin.cpp)\nelse()\n  include(CheckSchedSetAffinity)\n  if (NOT SCHED_SETAFFINITY_FOUND)\n    if (GALOIS_STRICT_CONFIG)\n      message(FATAL_ERROR \"Need sched_setaffinity\")\n    endif()\n  endif()\n  list(APPEND sources src/HWTopoLinux.cpp)\nendif()\n\ntarget_sources(galois_shmem PRIVATE ${sources})\n\ntarget_include_directories(galois_shmem PUBLIC\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>\n  $<INSTALL_INTERFACE:include>\n)\n\nif (TARGET Boost::Boost)\n  # Autogenerated conan module doesn't provide header-only target. Extract one\n  # manually.\n  get_target_property(include_dirs Boost::Boost INTERFACE_INCLUDE_DIRECTORIES)\n  target_include_directories(galois_shmem PUBLIC ${include_dirs})\nelse()\n  # Standard CMake Boost module\n  target_link_libraries(galois_shmem PUBLIC Boost::boost)\nendif()\n\nif (SCHED_SETAFFINITY_FOUND)\n  target_compile_definitions(galois_shmem PRIVATE GALOIS_USE_SCHED_SETAFFINITY)\n  target_link_libraries(galois_shmem PRIVATE ${SCHED_SETAFFINITY_LIBRARIES})\nendif()\n\ntarget_link_libraries(galois_shmem INTERFACE pygalois)\ntarget_link_libraries(galois_shmem PRIVATE Threads::Threads)\n\nif (CMAKE_HAVE_PTHREAD_H)\n  target_compile_definitions(galois_shmem PRIVATE GALOIS_HAVE_PTHREAD)\nendif()\n\nfind_package(NUMA)\nif (NUMA_FOUND)\n  target_compile_definitions(galois_shmem PRIVATE GALOIS_USE_NUMA)\n  target_link_libraries(galois_shmem PRIVATE ${NUMA_LIBRARY})\nelse()\n  message(WARNING \"No NUMA Support.  Likely poor performance for multi-socket systems.\")\nendif()\n\nif (VTune_FOUND)\n  target_link_libraries(galois_shmem PRIVATE ${VTune_LIBRARIES})\nendif()\n\n\nadd_subdirectory(test)\n\ninstall(\n  DIRECTORY include/\n  DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n  COMPONENT dev\n  FILES_MATCHING PATTERN \"*.h\"\n)\n\ninstall(\n  DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/\n  DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n  COMPONENT dev\n  FILES_MATCHING PATTERN \"*.h\"\n)\n\ninstall(\n  TARGETS galois_shmem\n  EXPORT GaloisTargets\n  LIBRARY\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT shlib\n  ARCHIVE\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT lib\n  INCLUDES DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n)\n"
  },
  {
    "path": "libgalois/include/galois/ArrayWrapper.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file ArrayWrapper.h\n *\n * Defines the CopyableArray subclass used to make arrays trivially copyable if\n * possible.\n */\n\n#ifndef _ARRAY_WRAPPER_H_\n#define _ARRAY_WRAPPER_H_\n\n#include <array>\n#include \"galois/config.h\"\n#include \"galois/runtime/ExtraTraits.h\"\n\nnamespace galois {\n/**\n * A subclass of std::array that is marked trivially copyable if the type is\n * also memory copyable. Useful when you need a trivially copyable type for\n * serialization.\n *\n * @tparam T type of the items to be stored in the array\n * @tparam N total number of items in the array\n */\ntemplate <class T, size_t N>\nclass CopyableArray : public std::array<T, N> {\npublic:\n  //! Only typedef tt_is_copyable if T is trivially copyable.\n  //! 
Allows the use of memcopy in serialize/deserialize.\n  using tt_is_copyable =\n      typename std::enable_if<galois::runtime::is_memory_copyable<T>::value,\n                              int>::type;\n};\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/AtomicHelpers.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#pragma once\n#include <atomic>\n#include <algorithm>\n#include <vector>\n\n#include \"galois/config.h\"\n\nnamespace galois {\n/** galois::atomicMax + non-atomic max calls **/\ntemplate <typename Ty>\nconst Ty atomicMax(std::atomic<Ty>& a, const Ty b) {\n  Ty old_a = a.load(std::memory_order_relaxed);\n  // if old value is less than new value, atomically exchange\n  while (old_a < b &&\n         !a.compare_exchange_weak(old_a, b, std::memory_order_relaxed))\n    ;\n  return old_a;\n}\n\ntemplate <typename Ty>\nconst Ty max(std::atomic<Ty>& a, const Ty& b) {\n  Ty old_a = a.load(std::memory_order_relaxed);\n\n  if (a < b) {\n    a.store(b, std::memory_order_relaxed);\n  }\n  return old_a;\n}\n\ntemplate <typename Ty>\nconst Ty max(Ty& a, const Ty& b) {\n  Ty old_a = a;\n\n  if (a < b) {\n    a = b;\n  }\n  return 
old_a;\n}\n\n/** galois::atomicMin **/\ntemplate <typename Ty>\nconst Ty atomicMin(std::atomic<Ty>& a, const Ty b) {\n  Ty old_a = a.load(std::memory_order_relaxed);\n  while (old_a > b &&\n         !a.compare_exchange_weak(old_a, b, std::memory_order_relaxed))\n    ;\n  return old_a;\n}\n\ntemplate <typename Ty>\nconst Ty min(std::atomic<Ty>& a, const Ty& b) {\n  Ty old_a = a.load(std::memory_order_relaxed);\n  if (a > b) {\n    a.store(b, std::memory_order_relaxed);\n  }\n  return old_a;\n}\n\ntemplate <typename Ty>\nconst Ty min(Ty& a, const Ty& b) {\n  Ty old_a = a;\n  if (a > b) {\n    a = b;\n  }\n  return old_a;\n}\n\n/** galois::atomicAdd **/\ntemplate <typename Ty>\nconst Ty atomicAdd(std::atomic<Ty>& val, Ty delta) {\n  Ty old_val = val.load(std::memory_order_relaxed);\n  while (!val.compare_exchange_weak(old_val, old_val + delta,\n                                    std::memory_order_relaxed))\n    ;\n  return old_val;\n}\n\ntemplate <typename Ty>\nconst Ty add(std::atomic<Ty>& a, const Ty& b) {\n  Ty old_a = a.load(std::memory_order_relaxed);\n  a.store(a + b, std::memory_order_relaxed);\n  return old_a;\n}\n\ntemplate <typename Ty>\nconst Ty add(Ty& a, std::atomic<Ty>& b) {\n  Ty old_a = a;\n  a        = a + b.load(std::memory_order_relaxed);\n  return old_a;\n}\n\ntemplate <typename Ty>\nconst Ty add(Ty& a, const Ty& b) {\n  Ty old_a = a;\n  a += b;\n  return old_a;\n}\n\n/**\n * atomic subtraction of delta (because atomicAdd with negative numbers implies\n * a signed integer cast)\n */\ntemplate <typename Ty>\nconst Ty atomicSubtract(std::atomic<Ty>& val, Ty delta) {\n  Ty old_val = val.load(std::memory_order_relaxed);\n  while (!val.compare_exchange_weak(old_val, old_val - delta,\n                                    std::memory_order_relaxed))\n    ;\n  return old_val;\n}\n\ntemplate <typename Ty>\nconst Ty set(Ty& a, const Ty& b) {\n  a = b;\n  return a;\n}\n\ntemplate <typename Ty>\nconst Ty set(std::atomic<Ty>& a, const Ty& b) {\n  a.store(b, 
std::memory_order_relaxed);\n  return a;\n}\n\n/** Pair Wise Average function **/\ntemplate <typename Ty>\nconst Ty pairWiseAvg(Ty a, Ty b) {\n  return (a + b) / 2.0;\n}\n\ntemplate <typename Ty>\nvoid pairWiseAvg_vec(std::vector<Ty>& a_vec, std::vector<Ty>& b_vec) {\n  for (unsigned i = 0; i < a_vec.size(); ++i) {\n    a_vec[i] = (a_vec[i] + b_vec[i]) / 2.0;\n  }\n}\n\ntemplate <typename Ty>\nvoid resetVec(Ty& a_arr) {\n  // std::for_each(a_arr.begin(), a_arr.end(),[](Ty &ele){ele = 0;} );\n  std::fill(a_arr.begin(), a_arr.end(), 0);\n}\n\ntemplate <typename Ty>\nvoid pairWiseAvg_vec(Ty& a_arr, Ty& b_arr) {\n  for (unsigned i = 0; i < a_arr.size(); ++i) {\n    a_arr[i] = (a_arr[i] + b_arr[i]) / 2.0;\n  }\n}\n\ntemplate <typename Ty>\nvoid addArray(Ty& a_arr, Ty& b_arr) {\n  for (unsigned i = 0; i < a_arr.size(); ++i) {\n    a_arr[i] = (a_arr[i] + b_arr[i]);\n  }\n}\n\ntemplate <typename Ty>\nvoid resetVec(std::vector<Ty>& a_vec) {\n  std::for_each(a_vec.begin(), a_vec.end(), [](Ty& ele) { ele = 0; });\n}\n\n// like std::inner_product\ntemplate <typename ItrTy, typename Ty>\nTy innerProduct(ItrTy a_begin, ItrTy a_end, ItrTy b_begin, Ty init_value) {\n  auto jj = b_begin;\n  for (auto ii = a_begin; ii != a_end; ++ii, ++jj) {\n    init_value += (*ii) * (*jj);\n  }\n  return init_value;\n}\n\n// like std::inner_product\ntemplate <typename ItrTy, typename Ty>\nTy innerProduct(ItrTy& a_arr, ItrTy& b_arr, Ty init_value) {\n  auto jj = b_arr.begin();\n  for (auto ii = a_arr.begin(); ii != a_arr.end(); ++ii, ++jj) {\n    init_value += (*ii) * (*jj);\n  }\n  return init_value;\n}\n\ntemplate <typename Ty>\nvoid reset(Ty& var, Ty val) {\n  var = val;\n}\n\ntemplate <typename Ty>\nvoid reset(std::atomic<Ty>& var, Ty val) {\n  var.store(val, std::memory_order_relaxed);\n}\n} // end namespace galois\n"
  },
  {
    "path": "libgalois/include/galois/AtomicWrapper.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file AtomicWrapper.h\n *\n * Contains a copyable atomics class.\n */\n#ifndef _ATOMIC_WRAPPER_H_\n#define _ATOMIC_WRAPPER_H_\n\n#include <atomic>\n\n#include \"galois/config.h\"\n\nnamespace galois {\n/**\n * Class that inherits from std::atomic to make it copyable by defining a copy\n * constructor.\n *\n * @tparam T type of the atomic\n */\ntemplate <class T>\nclass CopyableAtomic : public std::atomic<T> {\npublic:\n  //! Default constructor\n  CopyableAtomic() : std::atomic<T>(T{}) {}\n\n  //! Constructor initializing atomic to passed in data\n  constexpr CopyableAtomic(T desired) : std::atomic<T>(desired) {}\n\n  //! Copy constructor\n  constexpr CopyableAtomic(const CopyableAtomic<T>& other)\n      : CopyableAtomic(other.load(std::memory_order_relaxed)) {}\n\n  //! 
Copy assignment operator\n  CopyableAtomic& operator=(const CopyableAtomic<T>& other) {\n    this->store(other.load(std::memory_order_relaxed),\n                std::memory_order_relaxed);\n    return *this;\n  }\n};\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/Bag.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_BAG_H\n#define GALOIS_BAG_H\n\n#include <algorithm>\n#include <stdexcept>\n\n#include <boost/iterator/iterator_facade.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Executor_OnEach.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/gIO.h\"\n#include \"galois/runtime/Mem.h\"\n\nnamespace galois {\n\n/**\n * Unordered collection of elements. 
This data structure supports scalable\n * concurrent pushes but reading the bag can only be done serially.\n */\ntemplate <typename T, unsigned int BlockSize = 0>\nclass InsertBag {\n\n  struct header {\n    header* next;\n    T* dbegin; // start of interesting data\n    T* dend;   // end of valid data\n    T* dlast;  // end of storage\n  };\n\n  typedef std::pair<header*, header*> PerThread;\n\npublic:\n  template <typename U>\n  class Iterator : public boost::iterator_facade<Iterator<U>, U,\n                                                 boost::forward_traversal_tag> {\n    friend class boost::iterator_core_access;\n\n    galois::substrate::PerThreadStorage<std::pair<header*, header*>>* hd;\n    unsigned int thr;\n    header* p;\n    U* v;\n\n    bool init_thread() {\n      p = thr < hd->size() ? hd->getRemote(thr)->first : 0;\n      v = p ? p->dbegin : 0;\n      return p;\n    }\n\n    bool advance_local() {\n      if (p) {\n        ++v;\n        return v != p->dend;\n      }\n      return false;\n    }\n\n    bool advance_chunk() {\n      if (p) {\n        p = p->next;\n        v = p ? 
p->dbegin : 0;\n      }\n      return p;\n    }\n\n    void advance_thread() {\n      while (thr < hd->size()) {\n        ++thr;\n        if (init_thread())\n          return;\n      }\n    }\n\n    void increment() {\n      if (advance_local())\n        return;\n      if (advance_chunk())\n        return;\n      advance_thread();\n    }\n\n    template <typename OtherTy>\n    bool equal(const Iterator<OtherTy>& o) const {\n      return hd == o.hd && thr == o.thr && p == o.p && v == o.v;\n    }\n\n    U& dereference() const { return *v; }\n\n  public:\n    Iterator() : hd(0), thr(0), p(0), v(0) {}\n\n    template <typename OtherTy>\n    Iterator(const Iterator<OtherTy>& o)\n        : hd(o.hd), thr(o.thr), p(o.p), v(o.v) {}\n\n    Iterator(\n        galois::substrate::PerThreadStorage<std::pair<header*, header*>>* h,\n        unsigned t)\n        : hd(h), thr(t), p(0), v(0) {\n      // find first valid item\n      if (!init_thread())\n        advance_thread();\n    }\n  };\n\nprivate:\n  galois::runtime::FixedSizeHeap heap;\n  galois::substrate::PerThreadStorage<PerThread> heads;\n\n  void insHeader(header* h) {\n    PerThread& hpair = *heads.getLocal();\n    if (hpair.second) {\n      hpair.second->next = h;\n      hpair.second       = h;\n    } else {\n      hpair.first = hpair.second = h;\n    }\n  }\n\n  header* newHeaderFromHeap(void* m, unsigned size) {\n    header* H  = new (m) header();\n    int offset = 1;\n    if (sizeof(T) < sizeof(header))\n      offset += sizeof(header) / sizeof(T);\n    T* a      = reinterpret_cast<T*>(m);\n    H->dbegin = &a[offset];\n    H->dend   = H->dbegin;\n    H->dlast  = &a[(size / sizeof(T))];\n    H->next   = 0;\n    return H;\n  }\n\n  header* newHeader() {\n    if (BlockSize) {\n      return newHeaderFromHeap(heap.allocate(BlockSize), BlockSize);\n    } else {\n      return newHeaderFromHeap(galois::runtime::pagePoolAlloc(),\n                               galois::runtime::pagePoolSize());\n    }\n  }\n\n  void 
destruct_serial() {\n    for (unsigned x = 0; x < heads.size(); ++x) {\n      PerThread& hpair = *heads.getRemote(x);\n      header*& h       = hpair.first;\n      while (h) {\n        uninitialized_destroy(h->dbegin, h->dend);\n        header* h2 = h;\n        h          = h->next;\n        if (BlockSize)\n          heap.deallocate(h2);\n        else\n          galois::runtime::pagePoolFree(h2);\n      }\n      hpair.second = 0;\n    }\n  }\n\n  void destruct_parallel(void) {\n    galois::runtime::on_each_gen(\n        [this](const unsigned int tid, const unsigned int) {\n          PerThread& hpair = *heads.getLocal(tid);\n          header*& h       = hpair.first;\n          while (h) {\n            uninitialized_destroy(h->dbegin, h->dend);\n            header* h2 = h;\n            h          = h->next;\n            if (BlockSize)\n              heap.deallocate(h2);\n            else\n              galois::runtime::pagePoolFree(h2);\n          }\n          hpair.second = 0;\n        },\n        std::make_tuple(galois::no_stats()));\n  }\n\npublic:\n  // static_assert(BlockSize == 0 || BlockSize >= (2 * sizeof(T) +\n  // sizeof(header)),\n  //     \"BlockSize should larger than sizeof(T) + O(1)\");\n\n  InsertBag() : heap(BlockSize) {}\n  InsertBag(InsertBag&& o) : heap(BlockSize) {\n    std::swap(heap, o.heap);\n    std::swap(heads, o.heads);\n  }\n\n  InsertBag& operator=(InsertBag&& o) {\n    std::swap(heap, o.heap);\n    std::swap(heads, o.heads);\n    return *this;\n  }\n\n  InsertBag(const InsertBag&) = delete;\n  InsertBag& operator=(const InsertBag&) = delete;\n\n  ~InsertBag() { destruct_parallel(); }\n\n  void clear() { destruct_parallel(); }\n\n  void clear_serial() { destruct_serial(); }\n\n  void swap(InsertBag& o) {\n    std::swap(heap, o.heap);\n    std::swap(heads, o.heads);\n  }\n\n  typedef T value_type;\n  typedef T* pointer;\n  typedef const T* const_pointer;\n  typedef const T& const_reference;\n  typedef T& reference;\n  typedef Iterator<T> 
iterator;\n  typedef Iterator<const T> const_iterator;\n  typedef iterator local_iterator;\n\n  iterator begin() { return iterator(&heads, 0); }\n  iterator end() { return iterator(&heads, heads.size()); }\n  const_iterator begin() const { return const_iterator(&heads, 0); }\n  const_iterator end() const { return const_iterator(&heads, heads.size()); }\n\n  local_iterator local_begin() {\n    return local_iterator(&heads, galois::substrate::ThreadPool::getTID());\n  }\n  local_iterator local_end() {\n    return local_iterator(&heads, galois::substrate::ThreadPool::getTID() + 1);\n  }\n\n  bool empty() const {\n    for (unsigned x = 0; x < heads.size(); ++x) {\n      header* h = heads.getRemote(x)->first;\n      if (h)\n        return false;\n    }\n    return true;\n  }\n  //! Thread safe bag insertion\n  template <typename... Args>\n  reference emplace(Args&&... args) {\n    header* H = heads.getLocal()->second;\n    T* rv;\n    if (!H || H->dend == H->dlast) {\n      H = newHeader();\n      insHeader(H);\n    }\n    rv = new (H->dend) T(std::forward<Args>(args)...);\n    ++H->dend;\n    return *rv;\n  }\n\n  template <typename... Args>\n  reference emplace_back(Args&&... args) {\n    return emplace(std::forward<Args>(args)...);\n  }\n\n  /**\n   * Pop the last element pushed by this thread. The number of consecutive\n   * pops supported without intervening pushes is implementation dependent.\n   */\n  void pop() {\n    header* H = heads.getLocal()->second;\n    if (H->dbegin == H->dend) {\n      throw std::out_of_range(\"InsertBag::pop\");\n    }\n    uninitialized_destroy(H->dend - 1, H->dend);\n    --H->dend;\n  }\n\n  //! Thread safe bag insertion\n  template <typename ItemTy>\n  reference push(ItemTy&& val) {\n    return emplace(std::forward<ItemTy>(val));\n  }\n\n  //! Thread safe bag insertion\n  template <typename ItemTy>\n  reference push_back(ItemTy&& val) {\n    return emplace(std::forward<ItemTy>(val));\n  }\n};\n\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/CheckedObject.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_CHECKEDOBJECT_H\n#define GALOIS_CHECKEDOBJECT_H\n\n#include \"galois/config.h\"\n#include \"galois/runtime/Context.h\"\n\nnamespace galois {\n\n/**\n * Conflict-checking wrapper for any type.  Performs global conflict detection\n * on the enclosed object.  This enables arbitrary types to be managed by the\n * Galois runtime.\n */\ntemplate <typename T>\nclass GChecked : public galois::runtime::Lockable {\n  T val;\n\npublic:\n  template <typename... Args>\n  GChecked(Args&&... args) : val(std::forward<Args>(args)...) 
{}\n\n  T& get(galois::MethodFlag m = MethodFlag::WRITE) {\n    galois::runtime::acquire(this, m);\n    return val;\n  }\n\n  const T& get(galois::MethodFlag m = MethodFlag::WRITE) const {\n    galois::runtime::acquire(const_cast<GChecked*>(this), m);\n    return val;\n  }\n};\n\ntemplate <>\nclass GChecked<void> : public galois::runtime::Lockable {\npublic:\n  void get(galois::MethodFlag m = MethodFlag::WRITE) const {\n    galois::runtime::acquire(const_cast<GChecked*>(this), m);\n  }\n};\n\n} // namespace galois\n\n#endif // GALOIS_CHECKEDOBJECT_H\n"
  },
  {
    "path": "libgalois/include/galois/CopyableTuple.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file CopyableTuple.h\n *\n * Contains copyable tuple classes whose elements are contiguous in memory\n */\n#pragma once\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\n/**\n * Struct that contains 2 elements. Used over std::pair as std::pair memory\n * layout isn't guaranteed.\n *\n * @tparam T1 type of first element\n * @tparam T2 type of second element\n */\ntemplate <typename T1, typename T2>\nstruct Pair {\n  //! first element\n  T1 first;\n  //! second element\n  T2 second;\n\n  //! empty constructor\n  Pair() {}\n\n  //! Constructor that initializes 2 fields\n  Pair(T1 one, T2 two) {\n    first  = one;\n    second = two;\n  }\n};\n\n/**\n * Struct that contains 3 elements. 
Used over std::tuple as std::tuple memory\n * layout isn't guaranteed.\n *\n * @tparam T1 type of first element\n * @tparam T2 type of second element\n * @tparam T3 type of third element\n */\ntemplate <typename T1, typename T2, typename T3>\nstruct TupleOfThree {\n  //! first element\n  T1 first;\n  //! second element\n  T2 second;\n  //! third element\n  T3 third;\n\n  //! empty constructor\n  TupleOfThree() {}\n\n  //! Constructor that initializes 3 fields\n  TupleOfThree(T1 one, T2 two, T3 three) {\n    first  = one;\n    second = two;\n    third  = three;\n  }\n};\n\n} // namespace galois\n"
  },
  {
    "path": "libgalois/include/galois/DynamicBitset.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file galois/DynamicBitset.h\n *\n * Contains the DynamicBitSet class and most of its implementation.\n */\n\n#ifndef _GALOIS_DYNAMIC_BIT_SET_\n#define _GALOIS_DYNAMIC_BIT_SET_\n\n#include <climits>\n#include <vector>\n#include <cassert>\n\n#include <boost/iterator/counting_iterator.hpp>\n#include <boost/mpl/has_xxx.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/AtomicWrapper.h\"\n#include \"galois/PODResizeableArray.h\"\n#include \"galois/GaloisForwardDecl.h\"\n#include \"galois/Traits.h\"\n#include \"galois/Galois.h\"\n\nnamespace galois {\n/**\n * Concurrent dynamically allocated bitset\n **/\nclass DynamicBitSet {\nprotected:\n  galois::PODResizeableArray<galois::CopyableAtomic<uint64_t>> bitvec;\n  size_t num_bits;\n  static constexpr uint32_t bits_uint64 = sizeof(uint64_t) * 
CHAR_BIT;\n\npublic:\n  //! Constructor which initializes to an empty bitset.\n  DynamicBitSet() : num_bits(0) {}\n\n  /**\n   * Returns the underlying bitset representation to the user\n   *\n   * @returns constant reference vector of copyable atomics that represents\n   * the bitset\n   */\n  const auto& get_vec() const { return bitvec; }\n\n  /**\n   * Returns the underlying bitset representation to the user\n   *\n   * @returns reference to vector of copyable atomics that represents the\n   * bitset\n   */\n  auto& get_vec() { return bitvec; }\n\n  /**\n   * Resizes the bitset.\n   *\n   * @param n Size to change the bitset to\n   */\n  void resize(uint64_t n) {\n    assert(bits_uint64 == 64); // compatibility with other devices\n    num_bits = n;\n    bitvec.resize((n + bits_uint64 - 1) / bits_uint64);\n    reset();\n  }\n\n  /**\n   * Reserves capacity for the bitset.\n   *\n   * @param n Size to reserve the capacity of the bitset to\n   */\n  void reserve(uint64_t n) {\n    assert(bits_uint64 == 64); // compatibility with other devices\n    bitvec.reserve((n + bits_uint64 - 1) / bits_uint64);\n  }\n\n  /**\n   * Gets the size of the bitset\n   * @returns The number of bits held by the bitset\n   */\n  size_t size() const { return num_bits; }\n\n  /**\n   * Gets the space taken by the bitset\n   * @returns the space in bytes taken by this bitset\n   */\n  // size_t alloc_size() const { return bitvec.size() * sizeof(uint64_t); }\n\n  /**\n   * Unset every bit in the bitset.\n   */\n  void reset() { std::fill(bitvec.begin(), bitvec.end(), 0); }\n\n  /**\n   * Unset a range of bits given an inclusive range\n   *\n   * @param begin first bit in range to reset\n   * @param end last bit in range to reset\n   */\n  void reset(size_t begin, size_t end) {\n    if (num_bits == 0)\n      return;\n\n    assert(begin <= (num_bits - 1));\n    assert(end <= (num_bits - 1));\n\n    // 100% safe implementation, but slow\n    // for (unsigned long i = begin; i <= end; i++) {\n 
   //  size_t bit_index = i / bits_uint64;\n    //  uint64_t bit_offset = 1;\n    //  bit_offset <<= (i % bits_uint64);\n    //  uint64_t mask = ~bit_offset;\n    //  bitvec[bit_index] &= mask;\n    //}\n\n    // block which you are safe to clear\n    size_t vec_begin = (begin + bits_uint64 - 1) / bits_uint64;\n    size_t vec_end;\n\n    if (end == (num_bits - 1))\n      vec_end = bitvec.size();\n    else\n      vec_end = (end + 1) / bits_uint64; // floor\n\n    if (vec_begin < vec_end) {\n      std::fill(bitvec.begin() + vec_begin, bitvec.begin() + vec_end, 0);\n    }\n\n    vec_begin *= bits_uint64;\n    vec_end *= bits_uint64;\n\n    // at this point vec_begin -> vec_end-1 has been reset\n\n    if (vec_begin > vec_end) {\n      // no fill happened\n      if (begin < vec_begin) {\n        size_t diff = vec_begin - begin;\n        assert(diff < 64);\n        uint64_t mask = ((uint64_t)1 << (64 - diff)) - 1;\n\n        size_t end_diff  = end - vec_end + 1;\n        uint64_t or_mask = ((uint64_t)1 << end_diff) - 1;\n        mask |= ~or_mask;\n\n        size_t bit_index = begin / bits_uint64;\n        bitvec[bit_index] &= mask;\n      }\n    } else {\n      if (begin < vec_begin) {\n        size_t diff = vec_begin - begin;\n        assert(diff < 64);\n        uint64_t mask    = ((uint64_t)1 << (64 - diff)) - 1;\n        size_t bit_index = begin / bits_uint64;\n        bitvec[bit_index] &= mask;\n      }\n      if (end >= vec_end) {\n        size_t diff = end - vec_end + 1;\n        assert(diff < 64);\n        uint64_t mask    = ((uint64_t)1 << diff) - 1;\n        size_t bit_index = end / bits_uint64;\n        bitvec[bit_index] &= ~mask;\n      }\n    }\n  }\n\n  /**\n   * Check a bit to see if it is currently set.\n   * Using this is recommeneded only if set() and reset()\n   * are not being used in that parallel section/phase\n   *\n   * @param index Bit to check to see if set\n   * @returns true if index is set\n   */\n  bool test(size_t index) const {\n    size_t 
bit_index    = index / bits_uint64;\n    uint64_t bit_offset = 1;\n    bit_offset <<= (index % bits_uint64);\n    return ((bitvec[bit_index].load(std::memory_order_relaxed) & bit_offset) !=\n            0);\n  }\n\n  /**\n   * Set a bit in the bitset.\n   *\n   * @param index Bit to set\n   * @returns the old value\n   */\n  bool set(size_t index) {\n    size_t bit_index    = index / bits_uint64;\n    uint64_t bit_offset = 1;\n    bit_offset <<= (index % bits_uint64);\n    uint64_t old_val = bitvec[bit_index];\n    // test and set\n    // if old_bit is 0, then atomically set it\n    while (((old_val & bit_offset) == 0) &&\n           !bitvec[bit_index].compare_exchange_weak(\n               old_val, old_val | bit_offset, std::memory_order_relaxed))\n      ;\n    return (old_val & bit_offset);\n  }\n\n  /**\n   * Reset a bit in the bitset.\n   *\n   * @param index Bit to reset\n   * @returns the old value\n   */\n  bool reset(size_t index) {\n    size_t bit_index    = index / bits_uint64;\n    uint64_t bit_offset = 1;\n    bit_offset <<= (index % bits_uint64);\n    uint64_t old_val = bitvec[bit_index];\n    // test and reset\n    // if old_bit is 1, then atomically reset it\n    while (((old_val & bit_offset) != 0) &&\n           !bitvec[bit_index].compare_exchange_weak(\n               old_val, old_val & ~bit_offset, std::memory_order_relaxed))\n      ;\n    return (old_val & bit_offset);\n  }\n\n  // assumes bit_vector is not updated (set) in parallel\n  void bitwise_or(const DynamicBitSet& other) {\n    assert(size() == other.size());\n    auto& other_bitvec = other.get_vec();\n    galois::do_all(\n        galois::iterate(size_t{0}, bitvec.size()),\n        [&](size_t i) { bitvec[i] |= other_bitvec[i]; }, galois::no_stats());\n  }\n\n  // assumes bit_vector is not updated (set) in parallel\n\n  /**\n   * Does an IN-PLACE bitwise and of this bitset and another bitset\n   *\n   * @param other Other bitset to do bitwise and with\n   */\n  void bitwise_and(const 
DynamicBitSet& other) {\n    assert(size() == other.size());\n    auto& other_bitvec = other.get_vec();\n    galois::do_all(\n        galois::iterate(size_t{0}, bitvec.size()),\n        [&](size_t i) { bitvec[i] &= other_bitvec[i]; }, galois::no_stats());\n  }\n\n  /**\n   * Does an IN-PLACE bitwise and of 2 passed in bitsets and saves to this\n   * bitset\n   *\n   * @param other1 Bitset to and with other 2\n   * @param other2 Bitset to and with other 1\n   */\n  void bitwise_and(const DynamicBitSet& other1, const DynamicBitSet& other2) {\n    assert(size() == other1.size());\n    assert(size() == other2.size());\n    auto& other_bitvec1 = other1.get_vec();\n    auto& other_bitvec2 = other2.get_vec();\n\n    galois::do_all(\n        galois::iterate(size_t{0}, bitvec.size()),\n        [&](size_t i) { bitvec[i] = other_bitvec1[i] & other_bitvec2[i]; },\n        galois::no_stats());\n  }\n\n  /**\n   * Does an IN-PLACE bitwise xor of this bitset and another bitset\n   *\n   * @param other Other bitset to do bitwise xor with\n   */\n  void bitwise_xor(const DynamicBitSet& other) {\n    assert(size() == other.size());\n    auto& other_bitvec = other.get_vec();\n    galois::do_all(\n        galois::iterate(size_t{0}, bitvec.size()),\n        [&](size_t i) { bitvec[i] ^= other_bitvec[i]; }, galois::no_stats());\n  }\n\n  /**\n   * Does an IN-PLACE bitwise xor of 2 passed in bitsets and saves to this\n   * bitset\n   *\n   * @param other1 Bitset to xor with other 2\n   * @param other2 Bitset to xor with other 1\n   */\n  void bitwise_xor(const DynamicBitSet& other1, const DynamicBitSet& other2) {\n    assert(size() == other1.size());\n    assert(size() == other2.size());\n    auto& other_bitvec1 = other1.get_vec();\n    auto& other_bitvec2 = other2.get_vec();\n\n    galois::do_all(\n        galois::iterate(size_t{0}, bitvec.size()),\n        [&](size_t i) { bitvec[i] = other_bitvec1[i] ^ other_bitvec2[i]; },\n        galois::no_stats());\n  }\n\n  /**\n   * Count how many 
bits are set in the bitset\n   *\n   * @returns number of set bits in the bitset\n   */\n  uint64_t count() const {\n    galois::GAccumulator<uint64_t> ret;\n    galois::do_all(\n        galois::iterate(bitvec.begin(), bitvec.end()),\n        [&](uint64_t n) {\n#ifdef __GNUC__\n          ret += __builtin_popcountll(n);\n#else\n          n = n - ((n >> 1) & 0x5555555555555555UL);\n          n = (n & 0x3333333333333333UL) + ((n >> 2) & 0x3333333333333333UL);\n          ret +=\n              (((n + (n >> 4)) & 0xF0F0F0F0F0F0F0FUL) * 0x101010101010101UL) >>\n              56;\n#endif\n        },\n        galois::no_stats());\n    return ret.reduce();\n  }\n\n  /**\n   * Returns a vector containing the set bits in this bitset in order\n   * from left to right.\n   * Do NOT call in a parallel region as it uses galois::on_each.\n   *\n   * @returns vector with offsets into set bits\n   */\n  // TODO uint32_t is somewhat dangerous; change in the future\n  std::vector<uint32_t> getOffsets() const {\n    uint32_t activeThreads = galois::getActiveThreads();\n    std::vector<unsigned int> tPrefixBitCounts(activeThreads);\n\n    // count how many bits are set on each thread\n    galois::on_each([&](unsigned tid, unsigned nthreads) {\n      size_t start;\n      size_t end;\n      std::tie(start, end) =\n          galois::block_range((size_t)0, this->size(), tid, nthreads);\n\n      unsigned int count = 0;\n      for (unsigned int i = start; i < end; ++i) {\n        if (this->test(i))\n          ++count;\n      }\n\n      tPrefixBitCounts[tid] = count;\n    });\n\n    // calculate prefix sum of bits per thread\n    for (unsigned int i = 1; i < activeThreads; ++i) {\n      tPrefixBitCounts[i] += tPrefixBitCounts[i - 1];\n    }\n\n    // total num of set bits\n    uint64_t bitsetCount = tPrefixBitCounts[activeThreads - 1];\n    std::vector<uint32_t> offsets;\n\n    // calculate the indices of the set bits and save them to the offset\n    // vector\n    if (bitsetCount > 0) {\n      
offsets.resize(bitsetCount);\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n        size_t start;\n        size_t end;\n        std::tie(start, end) =\n            galois::block_range((size_t)0, this->size(), tid, nthreads);\n        unsigned int count = 0;\n        unsigned int tPrefixBitCount;\n        if (tid == 0) {\n          tPrefixBitCount = 0;\n        } else {\n          tPrefixBitCount = tPrefixBitCounts[tid - 1];\n        }\n\n        for (unsigned int i = start; i < end; ++i) {\n          if (this->test(i)) {\n            offsets[tPrefixBitCount + count] = i;\n            ++count;\n          }\n        }\n      });\n    }\n\n    return offsets;\n  }\n\n  //! this is defined to\n  using tt_is_copyable = int;\n};\n\n//! An empty bitset object; used mainly by InvalidBitsetFnTy\nstatic galois::DynamicBitSet EmptyBitset;\n\n//! A structure representing an empty bitset.\nstruct InvalidBitsetFnTy {\n  //! Returns false as this is an empty bitset\n  static constexpr bool is_vector_bitset() { return false; }\n\n  //! Returns false as this is an empty bitset (invalid)\n  static constexpr bool is_valid() { return false; }\n\n  //! Returns the empty bitset\n  static galois::DynamicBitSet& get() { return EmptyBitset; }\n\n  //! No-op since it's an empty bitset\n  static void reset_range(size_t, size_t) {}\n};\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/Endian.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_ENDIAN_H\n#define GALOIS_ENDIAN_H\n\n#include <cstdint>\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\nstatic inline uint32_t bswap32(uint32_t x) {\n#if defined(__GNUC__) || defined(__clang__)\n  return __builtin_bswap32(x);\n#else\n  return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) |\n         ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff);\n#endif\n}\n\nstatic inline uint64_t bswap64(uint64_t x) {\n#if defined(__GNUC__) || defined(__clang__)\n  return __builtin_bswap64(x);\n#else\n  return ((x << 56) & 0xff00000000000000UL) |\n         ((x << 40) & 0x00ff000000000000UL) |\n         ((x << 24) & 0x0000ff0000000000UL) |\n         ((x << 8) & 0x000000ff00000000UL) | ((x >> 8) & 0x00000000ff000000UL) |\n         ((x >> 24) & 0x0000000000ff0000UL) |\n         ((x >> 40) & 
0x000000000000ff00UL) |\n         ((x >> 56) & 0x00000000000000ffUL);\n#endif\n}\n\n// NB: Wrap these standard functions with different names because\n// sometimes le64toh and such are implemented as macros and we don't\n// want any nasty surprises.\nstatic inline uint64_t convert_le64toh(uint64_t x) {\n#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n  return x;\n#else\n  return bswap64(x);\n#endif\n}\n\nstatic inline uint32_t convert_le32toh(uint32_t x) {\n#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n  return x;\n#else\n  return bswap32(x);\n#endif\n}\n\nstatic inline uint64_t convert_htobe64(uint64_t x) {\n#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__\n  return x;\n#else\n  return bswap64(x);\n#endif\n}\n\nstatic inline uint32_t convert_htobe32(uint32_t x) {\n#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__\n  return x;\n#else\n  return bswap32(x);\n#endif\n}\n\nstatic inline uint64_t convert_htole64(uint64_t x) {\n#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n  return x;\n#else\n  return bswap64(x);\n#endif\n}\n\nstatic inline uint32_t convert_htole32(uint32_t x) {\n#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n  return x;\n#else\n  return bswap32(x);\n#endif\n}\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/FixedSizeRing.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_FIXEDSIZERING_H\n#define GALOIS_FIXEDSIZERING_H\n\n#include <atomic>\n#include <utility>\n\n#include <boost/mpl/if.hpp>\n#include <boost/iterator/iterator_facade.hpp>\n#include <boost/iterator/reverse_iterator.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/optional.h\"\n#include \"galois/LazyArray.h\"\n\nnamespace galois {\n\n//! 
Unordered collection of bounded size\ntemplate <typename T, unsigned ChunkSize, bool Concurrent>\nclass FixedSizeBagBase {\n  LazyArray<T, ChunkSize> datac;\n  typedef typename boost::mpl::if_c<Concurrent, std::atomic<unsigned>,\n                                    unsigned>::type Count;\n  Count count;\n\n  T* at(unsigned i) { return &datac[i]; }\n  const T* at(unsigned i) const { return &datac[i]; }\n\n  bool precondition() const { return count <= ChunkSize; }\n\npublic:\n  typedef T value_type;\n  typedef T* pointer;\n  typedef const T* const_pointer;\n  typedef T& reference;\n  typedef const T& const_reference;\n  typedef boost::reverse_iterator<pointer> iterator;\n  typedef boost::reverse_iterator<const_pointer> const_iterator;\n  typedef pointer reverse_iterator;\n  typedef const_pointer const_reverse_iterator;\n\n  FixedSizeBagBase() : count(0) {}\n\n  template <typename InputIterator>\n  FixedSizeBagBase(InputIterator first, InputIterator last) : count(0) {\n    while (first != last) {\n      assert(count < ChunkSize);\n      datac.emplace(count++, *first++);\n    }\n  }\n\n  FixedSizeBagBase(const FixedSizeBagBase& o) = delete;\n  FixedSizeBagBase& operator=(const FixedSizeBagBase& o) = delete;\n\n  ~FixedSizeBagBase() { clear(); }\n\n  unsigned size() const {\n    assert(precondition());\n    return count;\n  }\n\n  bool empty() const {\n    assert(precondition());\n    return count == 0;\n  }\n\n  bool full() const {\n    assert(precondition());\n    return count == ChunkSize;\n  }\n\n  void clear() {\n    assert(precondition());\n    for (unsigned x = 0; x < count; ++x)\n      datac.destroy(x);\n    count = 0;\n  }\n\n  template <typename U>\n  pointer push_back(U&& val) {\n    return push_front(std::forward<U>(val));\n  }\n\n  template <typename... Args>\n  pointer emplace_back(Args&&... 
args) {\n    return emplace_front(std::forward<Args>(args)...);\n  }\n\n  template <typename U, bool C = Concurrent>\n  auto push_front(U&& val) -> typename std::enable_if<!C, pointer>::type {\n    return emplace_front(std::forward<U>(val));\n  }\n\n  template <bool C = Concurrent>\n  auto push_front(const value_type& val) ->\n      typename std::enable_if<C, pointer>::type {\n    unsigned top;\n    do {\n      top = count.load(std::memory_order_relaxed);\n      if (top >= ChunkSize)\n        return nullptr;\n    } while (!count.compare_exchange_weak(top, top + 1));\n    return datac.emplace(top, val);\n  }\n\n  /**\n   * emplace_front is not available for concurrent versions because it is not\n   * possible for clients to know in advance whether insertion will succeed,\n   * which will leave xvalue arguments in indeterminate state.\n   */\n  template <typename... Args, bool C = Concurrent>\n  auto emplace_front(Args&&... args) ->\n      typename std::enable_if<!C, pointer>::type {\n    if (full())\n      return 0;\n    unsigned top = count++;\n    return datac.emplace(top, std::forward<Args>(args)...);\n  }\n\n  reference back() { return front(); }\n  const_reference back() const { return front(); }\n  galois::optional<value_type> extract_back() { return extract_front(); }\n\n  bool pop_back() { return pop_front(); }\n\n  reference front() {\n    assert(precondition());\n    assert(!empty());\n    return *at(count - 1);\n  }\n\n  const_reference front() const { return *at(count - 1); }\n\n  template <bool C = Concurrent>\n  auto extract_front() ->\n      typename std::enable_if<!C, galois::optional<value_type>>::type {\n    if (!empty()) {\n      galois::optional<value_type> retval(back());\n      pop_back();\n      return retval;\n    }\n    return galois::optional<value_type>();\n  }\n\n  //! 
returns true if something was popped\n  template <bool C = Concurrent>\n  auto pop_front() -> typename std::enable_if<C, bool>::type {\n    unsigned top;\n    do {\n      top = count.load(std::memory_order_relaxed);\n      if (top == 0)\n        return false;\n    } while (!count.compare_exchange_weak(top, top - 1));\n    datac.destroy(top);\n    return true;\n  }\n\n  //! returns true if something was popped\n  template <bool C = Concurrent>\n  auto pop_front() -> typename std::enable_if<!C, bool>::type {\n    if (count == 0)\n      return false;\n    datac.destroy(--count);\n    return true;\n  }\n\n  reverse_iterator rbegin() { return &datac[0]; }\n  reverse_iterator rend() { return &datac[count]; }\n  const_reverse_iterator rbegin() const { return &datac[0]; }\n  const_reverse_iterator rend() const { return &datac[count]; }\n\n  iterator begin() { return iterator(rend()); }\n  iterator end() { return iterator(rbegin()); }\n  const_iterator begin() const { return const_iterator(rend()); }\n  const_iterator end() const { return const_iterator(rbegin()); }\n};\n\n//! Unordered collection of bounded size\ntemplate <typename T, unsigned ChunkSize = 64>\nusing FixedSizeBag = FixedSizeBagBase<T, ChunkSize, false>;\n\n//! Unordered collection of bounded size with concurrent insertion or deletion\n//! but not both simultaneously\ntemplate <typename T, unsigned ChunkSize = 64>\nusing ConcurrentFixedSizeBag = FixedSizeBagBase<T, ChunkSize, true>;\n\n//! 
Ordered collection of bounded size\ntemplate <typename T, unsigned ChunkSize = 64>\nclass FixedSizeRing {\n  LazyArray<T, ChunkSize> datac;\n  unsigned start;\n  unsigned count;\n\n  T* at(unsigned i) { return &datac[i]; }\n  const T* at(unsigned i) const { return &datac[i]; }\n\n  bool precondition() const { return count <= ChunkSize && start <= ChunkSize; }\n\n  template <typename U>\n  class Iterator\n      : public boost::iterator_facade<Iterator<U>, U,\n                                      boost::random_access_traversal_tag> {\n    friend class boost::iterator_core_access;\n    U* base;\n    unsigned cur;\n    unsigned count;\n\n    template <typename OtherTy>\n    bool equal(const Iterator<OtherTy>& o) const {\n      assert(base && o.base);\n      return &base[cur] == &o.base[o.cur] && count == o.count;\n    }\n\n    U& dereference() const { return base[cur]; }\n\n    void increment() {\n      assert(base && count != 0);\n      count -= 1;\n      cur = (cur + 1) % ChunkSize;\n    }\n\n    void decrement() {\n      assert(base && count < ChunkSize);\n      count += 1;\n      cur = (cur + ChunkSize - 1) % ChunkSize;\n    }\n\n    void advance(ptrdiff_t x) {\n      count -= x;\n      cur = (cur + ChunkSize + x) % ChunkSize;\n    }\n\n    ptrdiff_t distance_to(const Iterator& o) const {\n      ptrdiff_t c  = count;\n      ptrdiff_t oc = o.count;\n      return c - oc;\n    }\n\n  public:\n    Iterator() : base(0), cur(0), count(0) {}\n\n    template <typename OtherTy>\n    Iterator(const Iterator<OtherTy>& o)\n        : base(o.base), cur(o.cur), count(o.count) {}\n\n    Iterator(U* b, unsigned c, unsigned co) : base(b), cur(c), count(co) {}\n  };\n\npublic:\n  typedef T value_type;\n  typedef T* pointer;\n  typedef T& reference;\n  typedef const T& const_reference;\n  typedef Iterator<T> iterator;\n  typedef Iterator<const T> const_iterator;\n  typedef boost::reverse_iterator<Iterator<T>> reverse_iterator;\n  typedef boost::reverse_iterator<Iterator<const T>> 
const_reverse_iterator;\n\n  FixedSizeRing() : start(0), count(0) {}\n\n  template <typename InputIterator>\n  FixedSizeRing(InputIterator first, InputIterator last) : start(0), count(0) {\n    while (first != last) {\n      assert(count < ChunkSize);\n      datac.emplace(count++, *first++);\n    }\n  }\n\n  FixedSizeRing(const FixedSizeRing& o) = delete;\n  FixedSizeRing& operator=(const FixedSizeRing& o) = delete;\n\n  ~FixedSizeRing() { clear(); }\n\n  unsigned size() const {\n    assert(precondition());\n    return count;\n  }\n\n  bool empty() const {\n    assert(precondition());\n    return count == 0;\n  }\n\n  bool full() const {\n    assert(precondition());\n    return count == ChunkSize;\n  }\n\n  reference getAt(unsigned x) {\n    assert(precondition());\n    assert(!empty());\n    return *at((start + x) % ChunkSize);\n  }\n\n  const_reference getAt(unsigned x) const {\n    assert(precondition());\n    assert(!empty());\n    return *at((start + x) % ChunkSize);\n  }\n\n  void clear() {\n    assert(precondition());\n    for (unsigned x = 0; x < count; ++x)\n      datac.destroy((start + x) % ChunkSize);\n    count = 0;\n    start = 0;\n  }\n\n  // NB(ddn): Keeping emplace_front/_back code paths separate to improve\n  // branch prediction etc\n  template <typename... Args>\n  pointer emplace(iterator pos, Args&&... 
args) {\n    if (full())\n      return 0;\n    unsigned i;\n    if (pos == begin()) {\n      i = start = (start + ChunkSize - 1) % ChunkSize;\n      ++count;\n    } else if (pos == end()) {\n      i = (start + count) % ChunkSize;\n      ++count;\n    } else {\n      auto d = std::distance(begin(), pos);\n      i      = (start + d) % ChunkSize;\n      emplace_back();\n      std::move_backward(begin() + d, end() - 1, end());\n      datac.destroy(i);\n    }\n    return datac.emplace(i, std::forward<Args>(args)...);\n  }\n\n  template <typename U>\n  pointer push_front(U&& val) {\n    return emplace_front(std::forward<U>(val));\n  }\n\n  template <typename... Args>\n  pointer emplace_front(Args&&... args) {\n    if (full())\n      return 0;\n    start = (start + ChunkSize - 1) % ChunkSize;\n    ++count;\n    return datac.emplace(start, std::forward<Args>(args)...);\n  }\n\n  template <typename U>\n  pointer push_back(U&& val) {\n    return emplace_back(std::forward<U>(val));\n  }\n\n  template <typename... Args>\n  pointer emplace_back(Args&&... 
args) {\n    if (full())\n      return 0;\n    unsigned end = (start + count) % ChunkSize;\n    ++count;\n    return datac.emplace(end, std::forward<Args>(args)...);\n  }\n\n  reference front() {\n    assert(precondition());\n    assert(!empty());\n    return *at(start);\n  }\n\n  const_reference front() const {\n    assert(precondition());\n    assert(!empty());\n    return *at(start);\n  }\n\n  galois::optional<value_type> extract_front() {\n    if (!empty()) {\n      galois::optional<value_type> retval(front());\n      pop_front();\n      return retval;\n    }\n    return galois::optional<value_type>();\n  }\n\n  void pop_front() {\n    assert(precondition());\n    assert(!empty());\n    datac.destroy(start);\n    start = (start + 1) % ChunkSize;\n    --count;\n  }\n\n  reference back() {\n    assert(precondition());\n    assert(!empty());\n    return *at((start + count - 1) % ChunkSize);\n  }\n\n  const_reference back() const {\n    assert(precondition());\n    assert(!empty());\n    return *at((start + count - 1) % ChunkSize);\n  }\n\n  galois::optional<value_type> extract_back() {\n    if (!empty()) {\n      galois::optional<value_type> retval(back());\n      pop_back();\n      return retval;\n    }\n    return galois::optional<value_type>();\n  }\n\n  void pop_back() {\n    assert(precondition());\n    assert(!empty());\n    datac.destroy((start + count - 1) % ChunkSize);\n    --count;\n  }\n\n  iterator begin() { return iterator(at(0), start, count); }\n  iterator end() { return iterator(at(0), (start + count) % ChunkSize, 0); }\n  const_iterator begin() const { return const_iterator(at(0), start, count); }\n  const_iterator end() const {\n    return const_iterator(at(0), (start + count) % ChunkSize, 0);\n  }\n\n  reverse_iterator rbegin() { return reverse_iterator(end()); }\n  reverse_iterator rend() { return reverse_iterator(begin()); }\n  const_iterator rbegin() const { const_reverse_iterator(this->end()); }\n  const_iterator rend() const { 
const_reverse_iterator(this->begin()); }\n};\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/FlatMap.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_FLATMAP_H\n#define GALOIS_FLATMAP_H\n\n#include <algorithm>\n#include <stdexcept>\n#include <type_traits>\n#include <vector>\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\n//! 
Simple map data structure, based off a single array.\ntemplate <class _Key, class _Tp, class _Compare = std::less<_Key>,\n          class _Alloc = std::allocator<std::pair<_Key, _Tp>>,\n          class _Store = std::vector<std::pair<_Key, _Tp>, _Alloc>>\nclass flat_map {\npublic:\n  typedef _Key key_type;\n  typedef _Tp mapped_type;\n  typedef std::pair<_Key, _Tp> value_type;\n  typedef _Compare key_compare;\n  typedef _Alloc allocator_type;\n\n  class value_compare {\n    friend class flat_map<_Key, _Tp, _Compare, _Alloc, _Store>;\n\n  protected:\n    _Compare comp;\n\n    value_compare(_Compare __c) : comp(__c) {}\n\n  public:\n    bool operator()(const value_type& __x, const value_type& __y) const {\n      return comp(__x.first, __y.first);\n    }\n  };\n\nprivate:\n  /// This turns...\n  typedef typename _Alloc::template rebind<value_type>::other _Pair_alloc_type;\n\n  typedef _Store _VectTy;\n  _VectTy _data;\n  _Compare _comp;\n\n  class value_key_compare {\n    friend class flat_map<_Key, _Tp, _Compare, _Alloc, _Store>;\n\n  protected:\n    _Compare comp;\n\n    value_key_compare(_Compare __c) : comp(__c) {}\n\n  public:\n    bool operator()(const value_type& __x, const key_type& __y) const {\n      return comp(__x.first, __y);\n    }\n  };\n\n  value_key_compare value_key_comp() const {\n    return value_key_compare(key_comp());\n  }\n\n  bool key_eq(const key_type& k1, const key_type& k2) const {\n    return !key_comp()(k1, k2) && !key_comp()(k2, k1);\n  }\n\n  void resort() { std::sort(_data.begin(), _data.end(), value_comp()); }\n\npublic:\n  typedef typename _Pair_alloc_type::pointer pointer;\n  typedef typename _Pair_alloc_type::const_pointer const_pointer;\n  typedef typename _Pair_alloc_type::reference reference;\n  typedef typename _Pair_alloc_type::const_reference const_reference;\n  typedef typename _VectTy::iterator iterator;\n  typedef typename _VectTy::const_iterator const_iterator;\n  typedef typename _VectTy::size_type size_type;\n  typedef 
typename _VectTy::difference_type difference_type;\n  typedef typename _VectTy::reverse_iterator reverse_iterator;\n  typedef typename _VectTy::const_reverse_iterator const_reverse_iterator;\n\n  flat_map() : _data(), _comp() {}\n\n  explicit flat_map(const _Compare& __comp,\n                    const allocator_type& = allocator_type())\n      // XXX :_data(_Pair_alloc_type(__a)), _comp(__comp) {}\n      : _data(), _comp(__comp) {}\n\n  flat_map(const flat_map& __x) : _data(__x._data), _comp(__x._comp) {}\n\n  flat_map(flat_map&& __x)\n      /* noexcept(std::is_nothrow_copy_constructible<_Compare>::value) */\n      : _data(std::move(__x._data)), _comp(std::move(__x._comp)) {}\n\n  /*\n  flat_map(std::initializer_list<value_type> __l,\n       const _Compare& __comp = _Compare(),\n       const allocator_type& __a = allocator_type())\n    : _data(__l, _Pair_alloc_type(__a)), _comp(__comp) { resort(); }\n   */\n\n  template <typename _InputIterator>\n  flat_map(_InputIterator __first, _InputIterator __last)\n      : _data(__first, __last), _comp() {\n    resort();\n  }\n\n  template <typename _InputIterator>\n  flat_map(_InputIterator __first, _InputIterator __last, const _Compare&,\n           const allocator_type& __a = allocator_type())\n      : _data(__first, __last, _Pair_alloc_type(__a)) {\n    resort();\n  }\n\n  flat_map& operator=(const flat_map& __x) {\n    _data = __x._data;\n    _comp = __x._comp;\n    return *this;\n  }\n\n  flat_map& operator=(flat_map&& __x) {\n    clear();\n    swap(__x);\n    return *this;\n  }\n\n  /*\n  flat_map& operator=(std::initializer_list<value_type> __l) {\n    clear();\n    insert(__l.begin(), __l.end());\n    return *this;\n  }\n   */\n\n  allocator_type get_allocator() const /* noexcept */ {\n    return allocator_type(_data.get_allocator());\n  }\n\n  // iterators\n\n  iterator begin() /* noexcept */ { return _data.begin(); }\n  const_iterator begin() const /* noexcept */ { return _data.begin(); }\n  iterator end() /* 
noexcept */ { return _data.end(); }\n  const_iterator end() const /* noexcept */ { return _data.end(); }\n  reverse_iterator rbegin() /* noexcept */ { return _data.rbegin(); }\n  const_reverse_iterator rbegin() const /* noexcept */ {\n    return _data.rbegin();\n  }\n  reverse_iterator rend() /* noexcept */ { return _data.rend(); }\n  const_reverse_iterator rend() const /* noexcept */ { return _data.rend(); }\n  const_iterator cbegin() const /* noexcept */ { return _data.begin(); }\n  const_iterator cend() const /* noexcept */ { return _data.end(); }\n  const_reverse_iterator crbegin() const /* noexcept */ {\n    return _data.rbegin();\n  }\n  const_reverse_iterator crend() const /* noexcept */ { return _data.rend(); }\n\n  bool empty() const /* noexcept */ { return _data.empty(); }\n  size_type size() const /* noexcept */ { return _data.size(); }\n  size_type max_size() const /* noexcept */ { return _data.max_size(); }\n\n  template <typename... Args>\n  std::pair<iterator, bool> emplace(Args&&... 
args) {\n    // assert(std::adjacent_find(_data.begin(), _data.end(), [&](const\n    // value_type& a, const value_type& b) {\n    //    return key_comp()(b.first, a.first);\n    //}) == _data.end());\n    _data.emplace_back(std::forward<Args>(args)...);\n    value_type& v = _data.back();\n    auto ee       = _data.end();\n    --ee;\n    auto __i = std::lower_bound(_data.begin(), ee, v.first, value_key_comp());\n    // key < __i->first\n    bool retval = __i == ee || key_comp()(v.first, (*__i).first);\n    if (retval) {\n      if (__i != ee) {\n        value_type tmp = std::move(v);\n        __i            = _data.emplace(__i, std::move(tmp));\n        _data.pop_back();\n      }\n    } else {\n      // key == __i->first\n      _data.pop_back();\n    }\n    return std::make_pair(__i, retval);\n  }\n\n  mapped_type& operator[](const key_type& __k) {\n    iterator __i = lower_bound(__k);\n    // __i->first is greater than or equivalent to __k.\n    if (__i == end() || key_comp()(__k, (*__i).first))\n      __i = _data.emplace(__i, std::piecewise_construct,\n                          std::forward_as_tuple(__k), std::tuple<>());\n    return (*__i).second;\n  }\n\n  mapped_type& operator[](key_type&& __k) {\n    iterator __i = lower_bound(__k);\n    // __i->first is greater than or equivalent to __k.\n    if (__i == end() || key_comp()(__k, (*__i).first))\n      __i =\n          _data.emplace(__i, std::piecewise_construct,\n                        std::forward_as_tuple(std::move(__k)), std::tuple<>());\n    return (*__i).second;\n  }\n\n  mapped_type& at(const key_type& __k) {\n    iterator __i = lower_bound(__k);\n    if (__i == end() || key_comp()(__k, (*__i).first))\n      throw std::out_of_range(\"flat_map::at\");\n    return (*__i).second;\n  }\n\n  const mapped_type& at(const key_type& __k) const {\n    const_iterator __i = lower_bound(__k);\n    if (__i == end() || key_comp()(__k, (*__i).first))\n      throw std::out_of_range(\"flat_map::at\");\n    return 
(*__i).second;\n  }\n\n  template <typename PairTy,\n            typename = typename std::enable_if<\n                std::is_constructible<value_type, PairTy&&>::value>::type>\n  std::pair<iterator, bool> insert(PairTy&& __x) {\n    return emplace(std::forward<PairTy>(__x));\n  }\n\n  /*\n  void insert(std::initializer_list<value_type> __list) {\n    insert(__list.begin(), __list.end());\n  }\n   */\n\n  template <typename _InputIterator>\n  void insert(_InputIterator __first, _InputIterator __last) {\n    while (__first != __last)\n      insert(*__first++);\n  }\n\n  iterator erase(const_iterator __position) { return _data.erase(__position); }\n  iterator erase(iterator __position) { return _data.erase(__position); }\n\n  size_type erase(const key_type& __x) {\n    auto i = find(__x);\n    if (i != end()) {\n      _data.erase(i);\n      return 1;\n    }\n    return 0;\n  }\n\n  iterator erase(const_iterator __first, const_iterator __last) {\n    return _data.erase(__first, __last);\n  }\n\n  void swap(flat_map& __x) {\n    _data.swap(__x._data);\n    std::swap(_comp, __x._comp);\n  }\n\n  void clear() /* noexcept */ { _data.clear(); }\n\n  key_compare key_comp() const { return _comp; }\n  value_compare value_comp() const { return value_compare(key_comp()); }\n\n  iterator find(const key_type& __x) {\n    auto i = lower_bound(__x);\n    if (i != end() && key_eq(i->first, __x))\n      return i;\n    return end();\n  }\n\n  const_iterator find(const key_type& __x) const {\n    auto i = lower_bound(__x);\n    if (i != end() && key_eq(i->first, __x))\n      return i;\n    return end();\n  }\n\n  size_type count(const key_type& __x) const {\n    return find(__x) == end() ? 
0 : 1;\n  }\n\n  iterator lower_bound(const key_type& __x) {\n    return std::lower_bound(_data.begin(), _data.end(), __x, value_key_comp());\n  }\n  const_iterator lower_bound(const key_type& __x) const {\n    return std::lower_bound(_data.begin(), _data.end(), __x, value_key_comp());\n  }\n\n  iterator upper_bound(const key_type& __x) {\n    return std::upper_bound(_data.begin(), _data.end(), __x, value_key_comp());\n  }\n  const_iterator upper_bound(const key_type& __x) const {\n    return std::upper_bound(_data.begin(), _data.end(), __x, value_key_comp());\n  }\n\n  std::pair<iterator, iterator> equal_range(const key_type& __x) {\n    return std::make_pair(lower_bound(__x), upper_bound(__x));\n  }\n\n  std::pair<const_iterator, const_iterator>\n  equal_range(const key_type& __x) const {\n    return std::make_pair(lower_bound(__x), upper_bound(__x));\n  }\n};\n\ntemplate <typename _Key, typename _Tp, typename _Compare, typename _Alloc>\ninline bool operator==(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x,\n                       const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) {\n  return __x._data == __y._data;\n}\n\ntemplate <typename _Key, typename _Tp, typename _Compare, typename _Alloc>\ninline bool operator<(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x,\n                      const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) {\n  return __x._data < __y._data;\n}\n\n/// Based on operator==\ntemplate <typename _Key, typename _Tp, typename _Compare, typename _Alloc>\ninline bool operator!=(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x,\n                       const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) {\n  return !(__x == __y);\n}\n\n/// Based on operator<\ntemplate <typename _Key, typename _Tp, typename _Compare, typename _Alloc>\ninline bool operator>(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x,\n                      const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) {\n  return __y < __x;\n}\n\n/// Based on operator<\ntemplate 
<typename _Key, typename _Tp, typename _Compare, typename _Alloc>\ninline bool operator<=(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x,\n                       const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) {\n  return !(__y < __x);\n}\n\n/// Based on operator<\ntemplate <typename _Key, typename _Tp, typename _Compare, typename _Alloc>\ninline bool operator>=(const flat_map<_Key, _Tp, _Compare, _Alloc>& __x,\n                       const flat_map<_Key, _Tp, _Compare, _Alloc>& __y) {\n  return !(__x < __y);\n}\n\n} // namespace galois\n\nnamespace std {\n\n/// See galois::flat_map::swap().\ntemplate <typename _Key, typename _Tp, typename _Compare, typename _Alloc>\ninline void swap(galois::flat_map<_Key, _Tp, _Compare, _Alloc>& __x,\n                 galois::flat_map<_Key, _Tp, _Compare, _Alloc>& __y) {\n  __x.swap(__y);\n}\n\n} // namespace std\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/Galois.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GALOIS_H\n#define GALOIS_GALOIS_H\n\n#include \"galois/config.h\"\n#include \"galois/Loops.h\"\n#include \"galois/SharedMemSys.h\"\n#include \"galois/runtime/Mem.h\"\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/GaloisForwardDecl.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\ntemplate <typename RangeFunc, typename FunctionTy, typename... Args>\nvoid for_each(const RangeFunc& rangeMaker, FunctionTy&& fn,\n              const Args&... args);\n\ntemplate <typename RangeFunc, typename FunctionTy, typename... Args>\nvoid do_all(const RangeFunc& rangeMaker, FunctionTy&& fn, const Args&... args);\n\ntemplate <typename FunctionTy, typename... Args>\nvoid on_each(FunctionTy&& fn, const Args&... args);\n\n} // end namespace galois\n"
  },
  {
    "path": "libgalois/include/galois/LargeArray.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_LARGEARRAY_H\n#define GALOIS_LARGEARRAY_H\n\n#include <iostream>\n#include <utility>\n\n#include <boost/archive/binary_iarchive.hpp>\n#include <boost/archive/binary_oarchive.hpp>\n#include <boost/serialization/binary_object.hpp>\n#include <boost/serialization/array.hpp>\n#include <boost/serialization/serialization.hpp>\n#include <boost/serialization/split_member.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/Galois.h\"\n#include \"galois/gIO.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"galois/runtime/Mem.h\"\n#include \"galois/substrate/NumaMem.h\"\n\nnamespace galois {\n\nnamespace runtime {\nextern unsigned activeThreads;\n} // end namespace runtime\n\n/**\n * Large array of objects with proper specialization for void type and\n * supporting various allocation and construction 
policies.\n *\n * @tparam T value type of container\n */\ntemplate <typename T>\nclass LargeArray {\n  substrate::LAptr m_realdata;\n  T* m_data;\n  size_t m_size;\n\npublic:\n  typedef T raw_value_type;\n  typedef T value_type;\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef value_type& reference;\n  typedef const value_type& const_reference;\n  typedef value_type* pointer;\n  typedef const value_type* const_pointer;\n  typedef pointer iterator;\n  typedef const_pointer const_iterator;\n  const static bool has_value = true;\n\n  // Extra indirection to support incomplete T's\n  struct size_of {\n    const static size_t value = sizeof(T);\n  };\n\nprotected:\n  enum AllocType { Blocked, Local, Interleaved, Floating };\n  void allocate(size_type n, AllocType t) {\n    assert(!m_data);\n    m_size = n;\n    switch (t) {\n    case Blocked:\n      galois::gDebug(\"Block-alloc'd\");\n      m_realdata =\n          substrate::largeMallocBlocked(n * sizeof(T), runtime::activeThreads);\n      break;\n    case Interleaved:\n      galois::gDebug(\"Interleave-alloc'd\");\n      m_realdata = substrate::largeMallocInterleaved(n * sizeof(T),\n                                                     runtime::activeThreads);\n      break;\n    case Local:\n      galois::gDebug(\"Local-allocd\");\n      m_realdata = substrate::largeMallocLocal(n * sizeof(T));\n      break;\n    case Floating:\n      galois::gDebug(\"Floating-alloc'd\");\n      m_realdata = substrate::largeMallocFloating(n * sizeof(T));\n      break;\n    };\n    m_data = reinterpret_cast<T*>(m_realdata.get());\n  }\n\nprivate:\n  /*\n   * To support boost serialization\n   */\n  friend class boost::serialization::access;\n  template <typename Archive>\n  void save(Archive& ar, const unsigned int) const {\n\n    // TODO DON'T USE CERR\n    // std::cerr << \"save m_size : \" << m_size << \" Threads : \" <<\n    // runtime::activeThreads << \"\\n\";\n    ar << m_size;\n    // for(size_t i = 0; 
i < m_size; ++i){\n    // ar << m_data[i];\n    //}\n    ar << boost::serialization::make_binary_object(m_data, m_size * sizeof(T));\n    /*\n     * Cas use make_array too as shown below\n     * IMPORTANT: Use make_array as temp fix for benchmarks using non-trivial\n     * structures in nodeData (Eg. SGD) This also requires changes in\n     * libgalois/include/galois/graphs/Details.h (specified in the file).\n     */\n    // ar << boost::serialization::make_array<T>(m_data, m_size);\n  }\n  template <typename Archive>\n  void load(Archive& ar, const unsigned int) {\n    ar >> m_size;\n\n    // TODO DON'T USE CERR\n    // std::cerr << \"load m_size : \" << m_size << \" Threads : \" <<\n    // runtime::activeThreads << \"\\n\";\n\n    // TODO: For now, always use allocateInterleaved\n    // Allocates and sets m_data pointer\n    if (!m_data)\n      allocateInterleaved(m_size);\n\n    // for(size_t i = 0; i < m_size; ++i){\n    // ar >> m_data[i];\n    //}\n    ar >> boost::serialization::make_binary_object(m_data, m_size * sizeof(T));\n    /*\n     * Cas use make_array too as shown below\n     * IMPORTANT: Use make_array as temp fix for SGD\n     *            This also requires changes in\n     * libgalois/include/galois/graphs/Details.h (specified in the file).\n     */\n    // ar >> boost::serialization::make_array<T>(m_data, m_size);\n  }\n  // The macro BOOST_SERIALIZATION_SPLIT_MEMBER() generates code which invokes\n  // the save or load depending on whether the archive is used for saving or\n  // loading\n  BOOST_SERIALIZATION_SPLIT_MEMBER()\n\npublic:\n  /**\n   * Wraps existing buffer in LargeArray interface.\n   */\n  LargeArray(void* d, size_t s) : m_data(reinterpret_cast<T*>(d)), m_size(s) {}\n\n  LargeArray() : m_data(0), m_size(0) {}\n\n  LargeArray(LargeArray&& o) : m_data(0), m_size(0) {\n    std::swap(this->m_realdata, o.m_realdata);\n    std::swap(this->m_data, o.m_data);\n    std::swap(this->m_size, o.m_size);\n  }\n\n  LargeArray& 
operator=(LargeArray&& o) {\n    std::swap(this->m_realdata, o.m_realdata);\n    std::swap(this->m_data, o.m_data);\n    std::swap(this->m_size, o.m_size);\n    return *this;\n  }\n\n  LargeArray(const LargeArray&) = delete;\n  LargeArray& operator=(const LargeArray&) = delete;\n\n  ~LargeArray() {\n    destroy();\n    deallocate();\n  }\n\n  friend void swap(LargeArray& lhs, LargeArray& rhs) {\n    std::swap(lhs.m_realdata, rhs.m_realdata);\n    std::swap(lhs.m_data, rhs.m_data);\n    std::swap(lhs.m_size, rhs.m_size);\n  }\n\n  const_reference at(difference_type x) const { return m_data[x]; }\n  reference at(difference_type x) { return m_data[x]; }\n  const_reference operator[](size_type x) const { return m_data[x]; }\n  reference operator[](size_type x) { return m_data[x]; }\n  void set(difference_type x, const_reference v) { m_data[x] = v; }\n  size_type size() const { return m_size; }\n  iterator begin() { return m_data; }\n  const_iterator begin() const { return m_data; }\n  iterator end() { return m_data + m_size; }\n  const_iterator end() const { return m_data + m_size; }\n\n  //! [allocatefunctions]\n  //! 
Allocates interleaved across NUMA (memory) nodes.\n  void allocateInterleaved(size_type n) { allocate(n, Interleaved); }\n\n  /**\n   * Allocates using blocked memory policy\n   *\n   * @param  n         number of elements to allocate\n   */\n  void allocateBlocked(size_type n) { allocate(n, Blocked); }\n\n  /**\n   * Allocates using Thread Local memory policy\n   *\n   * @param  n         number of elements to allocate\n   */\n  void allocateLocal(size_type n) { allocate(n, Local); }\n\n  /**\n   * Allocates using no memory policy (no pre alloc)\n   *\n   * @param  n         number of elements to allocate\n   */\n  void allocateFloating(size_type n) { allocate(n, Floating); }\n\n  /**\n   * Allocate memory to threads based on a provided array specifying which\n   * threads receive which elements of data.\n   *\n   * @tparam RangeArrayTy The type of the threadRanges array; should either\n   * be uint32_t* or uint64_t*\n   * @param numberOfElements Number of elements to allocate space for\n   * @param threadRanges An array specifying how elements should be split\n   * among threads\n   */\n  template <typename RangeArrayTy>\n  void allocateSpecified(size_type numberOfElements,\n                         RangeArrayTy& threadRanges) {\n    assert(!m_data);\n\n    m_realdata = substrate::largeMallocSpecified(numberOfElements * sizeof(T),\n                                                 runtime::activeThreads,\n                                                 threadRanges, sizeof(T));\n\n    m_size = numberOfElements;\n    m_data = reinterpret_cast<T*>(m_realdata.get());\n  }\n  //! [allocatefunctions]\n\n  template <typename... Args>\n  void construct(Args&&... args) {\n    for (T *ii = m_data, *ei = m_data + m_size; ii != ei; ++ii)\n      new (ii) T(std::forward<Args>(args)...);\n  }\n\n  template <typename... Args>\n  void constructAt(size_type n, Args&&... args) {\n    new (&m_data[n]) T(std::forward<Args>(args)...);\n  }\n\n  //! 
Allocate and construct\n  template <typename... Args>\n  void create(size_type n, Args&&... args) {\n    allocateInterleaved(n);\n    construct(std::forward<Args>(args)...);\n  }\n\n  void deallocate() {\n    m_realdata.reset();\n    m_data = 0;\n    m_size = 0;\n  }\n\n  void destroy() {\n    if (!m_data)\n      return;\n    galois::ParallelSTL::destroy(m_data, m_data + m_size);\n  }\n\n  template <typename U = T>\n  std::enable_if_t<!std::is_scalar<U>::value> destroyAt(size_type n) {\n    (&m_data[n])->~T();\n  }\n\n  template <typename U = T>\n  std::enable_if_t<std::is_scalar<U>::value> destroyAt(size_type) {}\n\n  // The following methods are not shared with void specialization\n  const_pointer data() const { return m_data; }\n  pointer data() { return m_data; }\n};\n\n//! Void specialization\ntemplate <>\nclass LargeArray<void> {\n\nprivate:\n  /*\n   * To support boost serialization\n   * Can use single function serialize instead of save and load, since both save\n   * and load have identical code.\n   */\n  friend class boost::serialization::access;\n  template <typename Archive>\n  void serialize(Archive&, const unsigned int) const {}\n\npublic:\n  LargeArray(void*, size_t) {}\n  LargeArray()                  = default;\n  LargeArray(const LargeArray&) = delete;\n  LargeArray& operator=(const LargeArray&) = delete;\n\n  friend void swap(LargeArray&, LargeArray&) {}\n\n  typedef void raw_value_type;\n  typedef void* value_type;\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef value_type reference;\n  typedef value_type const_reference;\n  typedef value_type* pointer;\n  typedef value_type* const_pointer;\n  typedef pointer iterator;\n  typedef const_pointer const_iterator;\n  const static bool has_value = false;\n  struct size_of {\n    const static size_t value = 0;\n  };\n\n  const_reference at(difference_type) const { return 0; }\n  reference at(difference_type) { return 0; }\n  const_reference operator[](size_type) const { 
return 0; }\n  template <typename AnyTy>\n  void set(difference_type, AnyTy) {}\n  size_type size() const { return 0; }\n  iterator begin() { return 0; }\n  const_iterator begin() const { return 0; }\n  iterator end() { return 0; }\n  const_iterator end() const { return 0; }\n\n  void allocateInterleaved(size_type) {}\n  void allocateBlocked(size_type) {}\n  void allocateLocal(size_type, bool = true) {}\n  void allocateFloating(size_type) {}\n  template <typename RangeArrayTy>\n  void allocateSpecified(size_type, RangeArrayTy) {}\n\n  template <typename... Args>\n  void construct(Args&&...) {}\n  template <typename... Args>\n  void constructAt(size_type, Args&&...) {}\n  template <typename... Args>\n  void create(size_type, Args&&...) {}\n\n  void deallocate() {}\n  void destroy() {}\n  void destroyAt(size_type) {}\n\n  const_pointer data() const { return 0; }\n  pointer data() { return 0; }\n};\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/LazyArray.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_LAZYARRAY_H\n#define GALOIS_LAZYARRAY_H\n\n#include <algorithm>\n#include <cstddef>\n#include <iterator>\n#include <new>\n#include <stdexcept>\n#include <type_traits>\n#include <utility>\n\n#include \"galois/config.h\"\n#include \"galois/LazyObject.h\"\n\nnamespace galois {\n\n/**\n * This is a container that encapsulates space for a constant size array.  The\n * initialization and destruction of items is explicitly under the control of\n * the user.\n */\ntemplate <typename _Tp, unsigned _Size>\nclass LazyArray {\n  typedef typename std::aligned_storage<\n      sizeof(_Tp), std::alignment_of<_Tp>::value>::type CharData;\n\n  LazyObject<_Tp> data_[(_Size > 0 ? _Size : 1)];\n\n  _Tp* get(size_t __n) { return &data_[__n].get(); }\n  const _Tp* get(size_t __n) const { return &data_[__n].get(); }\n\npublic:\n  typedef _Tp value_type;\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef value_type& reference;\n  typedef const value_type& const_reference;\n  typedef value_type* pointer;\n  typedef const value_type* const_pointer;\n  typedef pointer iterator;\n  typedef const_pointer const_iterator;\n  typedef std::reverse_iterator<iterator> reverse_iterator;\n  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;\n\n  // iterators:\n  iterator begin() { return iterator(get(0)); }\n  const_iterator begin() const { return const_iterator(get(0)); }\n  iterator end() { return iterator(get(_Size)); }\n  const_iterator end() const { return const_iterator(get(_Size)); }\n\n  reverse_iterator rbegin() { return reverse_iterator(end()); }\n  const_reverse_iterator rbegin() const {\n    return const_reverse_iterator(end());\n  }\n  reverse_iterator rend() { return reverse_iterator(begin()); }\n  const_reverse_iterator rend() const {\n    return const_reverse_iterator(begin());\n  }\n\n  const_iterator cbegin() const { return begin(); }\n  const_iterator cend() const { return end(); }\n  const_reverse_iterator crbegin() const { return rbegin(); }\n  const_reverse_iterator crend() const { return rend(); }\n\n  // capacity:\n  size_type size() const { return _Size; }\n  size_type max_size() const { return _Size; }\n  bool empty() const { return _Size == 0; }\n\n  // element access:\n  reference operator[](size_type __n) { return *get(__n); }\n  const_reference operator[](size_type __n) const { return *get(__n); }\n\n  // Bounds-checked access: throws std::out_of_range when __n >= _Size.\n  // get() yields a pointer, so it must be dereferenced to form the reference.\n  reference at(size_type __n) {\n    if (__n >= _Size)\n      throw std::out_of_range(\"lazyArray::at\");\n    return *get(__n);\n  }\n  const_reference at(size_type __n) const {\n    if (__n >= _Size)\n      throw std::out_of_range(\"lazyArray::at\");\n    return *get(__n);\n  }\n\n  reference front() { return *get(0); }\n  const_reference front() const { return *get(0); }\n  reference back() { return *get(_Size > 0 ? _Size - 1 : 0); }\n  const_reference back() const { return *get(_Size > 0 ? _Size - 1 : 0); }\n\n  pointer data() { return get(0); }\n  const_pointer data() const { return get(0); }\n\n  // missing: fill swap\n\n  template <typename... Args>\n  pointer emplace(size_type __n, Args&&... args) {\n    return new (get(__n)) _Tp(std::forward<Args>(args)...);\n  }\n\n  pointer construct(size_type __n, const _Tp& val) { return emplace(__n, val); }\n  pointer construct(size_type __n, _Tp&& val) {\n    return emplace(__n, std::move(val));\n  }\n\n  void destroy(size_type __n) { (get(__n))->~_Tp(); }\n};\n\n} // namespace galois\n#endif // GALOIS_LAZYARRAY_H\n"
  },
  {
    "path": "libgalois/include/galois/LazyObject.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_LAZYOBJECT_H\n#define GALOIS_LAZYOBJECT_H\n\n#include <type_traits>\n#include <utility>\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n\nnamespace galois {\n\n/**\n * Single object with specialization for void type. 
To take advantage of empty\n * member optimization, users should subclass this class, otherwise the\n * compiler will insert non-zero padding for fields (even when empty).\n */\ntemplate <typename T>\nclass StrictObject {\n  T data;\n\npublic:\n  typedef T value_type;\n  typedef T& reference;\n  typedef const T& const_reference;\n  const static bool has_value = true;\n\n  StrictObject() {}\n  StrictObject(const_reference t) : data(t) {}\n  const_reference get() const { return data; }\n  reference get() { return data; }\n};\n\ntemplate <>\nstruct StrictObject<void> {\n  typedef void* value_type;\n  typedef void* reference;\n  typedef void* const_reference;\n  const static bool has_value = false;\n\n  StrictObject() {}\n  StrictObject(const_reference) {}\n  reference get() const { return 0; }\n};\n\n/**\n * Single (uninitialized) object with specialization for void type. To take\n * advantage of empty member optimization, users should subclass this class,\n * otherwise the compiler will insert non-zero padding for fields (even when\n * empty).\n */\ntemplate <typename T>\nclass LazyObject {\n  typedef\n      typename std::aligned_storage<sizeof(T),\n                                    std::alignment_of<T>::value>::type CharData;\n\n  union Data {\n    CharData buf;\n    T value_;\n\n    // Declare constructor explicitly because Data must be default\n    // constructable regardless of the constructability of T.\n    Data() {}  // NOLINT(modernize-use-equals-default)\n    ~Data() {} // NOLINT(modernize-use-equals-default)\n\n    T& value() { return value_; }\n    const T& value() const { return value_; }\n  };\n\n  Data data_;\n\n  T* cast() { return &data_.value(); }\n  const T* cast() const { return &data_.value(); }\n\npublic:\n  typedef T value_type;\n  typedef T& reference;\n  typedef const T& const_reference;\n  const static bool has_value = true;\n  // Can't support incomplete T's but provide same interface as\n  // {@link galois::LargeArray} for consistency\n  
struct size_of {\n    const static size_t value = sizeof(T);\n  };\n\n  void destroy() { cast()->~T(); }\n  void construct(const_reference x) { new (cast()) T(x); }\n\n  template <typename... Args>\n  void construct(Args&&... args) {\n    new (cast()) T(std::forward<Args>(args)...);\n  }\n\n  const_reference get() const { return *cast(); }\n  reference get() { return *cast(); }\n};\n\ntemplate <>\nstruct LazyObject<void> {\n  typedef void* value_type;\n  typedef void* reference;\n  typedef void* const_reference;\n  const static bool has_value = false;\n  struct size_of {\n    const static size_t value = 0;\n  };\n\n  void destroy() {}\n  void construct(const_reference) {}\n\n  template <typename... Args>\n  void construct(Args&&...) {}\n\n  const_reference get() const { return 0; }\n};\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/Loops.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_LOOPS_H\n#define GALOIS_LOOPS_H\n\n#include \"galois/config.h\"\n#include \"galois/runtime/Executor_Deterministic.h\"\n#include \"galois/runtime/Executor_DoAll.h\"\n#include \"galois/runtime/Executor_ForEach.h\"\n#include \"galois/runtime/Executor_OnEach.h\"\n#include \"galois/runtime/Executor_Ordered.h\"\n#include \"galois/runtime/Executor_ParaMeter.h\"\n#include \"galois/worklists/WorkList.h\"\n\nnamespace galois {\n\n////////////////////////////////////////////////////////////////////////////////\n// Foreach\n////////////////////////////////////////////////////////////////////////////////\n\n/**\n * Galois unordered set iterator.\n * Operator should conform to <code>fn(item, UserContext<T>&)</code> where item\n * is a value from the iteration range and T is the type of item.\n *\n * @param rangeMaker an iterate range maker typically returned by\n * <code>galois::iterate(...)</code>\n * (@see galois::iterate()). rangeMaker is a functor which when called returns a\n * range object\n * @param fn operator\n * @param args optional arguments to loop, e.g., {@see loopname}, {@see wl}\n */\n\ntemplate <typename RangeFunc, typename FunctionTy, typename... Args>\nvoid for_each(const RangeFunc& rangeMaker, FunctionTy&& fn,\n              const Args&... args) {\n  auto tpl = std::make_tuple(args...);\n  runtime::for_each_gen(rangeMaker(tpl), std::forward<FunctionTy>(fn), tpl);\n}\n\n/**\n * Standard do-all loop. All iterations should be independent.\n * Operator should conform to <code>fn(item)</code> where item is a value from\n * the iteration range.\n *\n * @param rangeMaker an iterate range maker typically returned by\n * <code>galois::iterate(...)</code>\n * (@see galois::iterate()). rangeMaker is a functor which when called returns a\n * range object\n * @param fn operator\n * @param args optional arguments to loop\n */\ntemplate <typename RangeFunc, typename FunctionTy, typename... Args>\nvoid do_all(const RangeFunc& rangeMaker, FunctionTy&& fn, const Args&... args) {\n  auto tpl = std::make_tuple(args...);\n  runtime::do_all_gen(rangeMaker(tpl), std::forward<FunctionTy>(fn), tpl);\n}\n\n/**\n * Low-level parallel loop. Operator is applied for each running thread.\n * Operator should conform to <code>fn(tid, numThreads)</code> where tid is\n * the id of the current thread and numThreads is the total number of running\n * threads.\n *\n * @param fn operator, which is never copied\n * @param args optional arguments to loop\n */\ntemplate <typename FunctionTy, typename... Args>\nvoid on_each(FunctionTy&& fn, const Args&... args) {\n  runtime::on_each_gen(std::forward<FunctionTy>(fn), std::make_tuple(args...));\n}\n\n/**\n * Preallocates hugepages on each thread.\n *\n * @param num number of pages to allocate of size {@link\n * galois::runtime::MM::hugePageSize}\n */\nstatic inline void preAlloc(int num) {\n  static const bool DISABLE_PREALLOC = false;\n  if (DISABLE_PREALLOC) {\n    galois::gWarn(\"preAlloc disabled\");\n\n  } else {\n    runtime::preAlloc_impl(num);\n  }\n}\n\n/**\n * Reports number of hugepages allocated by the Galois system so far. The value\n * is printed using the statistics infrastructure.\n *\n * @param label Label to associate with report at this program point\n */\nstatic inline void reportPageAlloc(const char* label) {\n  runtime::reportPageAlloc(label);\n}\n\n/**\n * Galois ordered set iterator for stable source algorithms.\n *\n * Operator should conform to <code>fn(item, UserContext<T>&)</code> where item\n * is a value from the iteration range and T is the type of item. Comparison\n * function should conform to <code>bool r = cmp(item1, item2)</code> where r is\n * true if item1 is less than or equal to item2. Neighborhood function should\n * conform to <code>nhFunc(item)</code> and should visit every element in the\n * neighborhood of active element item.\n *\n * @param b beginning of range of initial items\n * @param e end of range of initial items\n * @param cmp comparison function\n * @param nhFunc neighborhood function\n * @param fn operator\n * @param loopname string to identify loop in statistics output\n */\ntemplate <typename Iter, typename Cmp, typename NhFunc, typename OpFunc>\nvoid for_each_ordered(Iter b, Iter e, const Cmp& cmp, const NhFunc& nhFunc,\n                      const OpFunc& fn, const char* loopname = 0) {\n  runtime::for_each_ordered_impl(b, e, cmp, nhFunc, fn, loopname);\n}\n\n/**\n * Galois ordered set iterator for unstable source algorithms.\n *\n * Operator should conform to <code>fn(item, UserContext<T>&)</code> where item\n * is a value from the iteration range and T is the type of item. Comparison\n * function should conform to <code>bool r = cmp(item1, item2)</code> where r is\n * true if item1 is less than or equal to item2. Neighborhood function should\n * conform to <code>nhFunc(item)</code> and should visit every element in the\n * neighborhood of active element item. The stability test should conform to\n * <code>bool r = stabilityTest(item)</code> where r is true if item is a stable\n * source.\n *\n * @param b beginning of range of initial items\n * @param e end of range of initial items\n * @param cmp comparison function\n * @param nhFunc neighborhood function\n * @param fn operator\n * @param stabilityTest stability test\n * @param loopname string to identify loop in statistics output\n */\ntemplate <typename Iter, typename Cmp, typename NhFunc, typename OpFunc,\n          typename StableTest>\nvoid for_each_ordered(Iter b, Iter e, const Cmp& cmp, const NhFunc& nhFunc,\n                      const OpFunc& fn, const StableTest& stabilityTest,\n                      const char* loopname = 0) {\n  runtime::for_each_ordered_impl(b, e, cmp, nhFunc, fn, stabilityTest,\n                                 loopname);\n}\n\n/**\n * Helper functor class to invoke galois::do_all on provided args.\n * Can be used to choose between galois::do_all and other equivalents such as\n * std::for_each\n */\nstruct DoAll {\n  template <typename RangeFunc, typename F, typename... Args>\n  void operator()(const RangeFunc& rangeMaker, const F& f,\n                  Args&&... args) const {\n    galois::do_all(rangeMaker, f, std::forward<Args>(args)...);\n  }\n};\n\n/**\n * Helper functor to invoke std::for_each with the same interface as\n * galois::do_all\n */\n\nstruct StdForEach {\n  template <typename RangeFunc, typename F, typename... Args>\n  void operator()(const RangeFunc& rangeMaker, const F& f,\n                  Args&&... args) const {\n    auto range = rangeMaker(std::make_tuple(args...));\n    std::for_each(range.begin(), range.end(), f);\n  }\n};\n\nstruct ForEach {\n  template <typename RangeFunc, typename F, typename... Args>\n  void operator()(const RangeFunc& rangeMaker, const F& f,\n                  Args&&... args) const {\n    galois::for_each(rangeMaker, f, std::forward<Args>(args)...);\n  }\n};\n\ntemplate <typename Q>\nstruct WhileQ {\n  Q m_q;\n\n  WhileQ(Q&& q = Q()) : m_q(std::move(q)) {}\n\n  template <typename RangeFunc, typename F, typename... Args>\n  void operator()(const RangeFunc& rangeMaker, const F& f, Args&&... args) {\n\n    auto range = rangeMaker(std::make_tuple(args...));\n\n    m_q.push(range.begin(), range.end());\n\n    while (!m_q.empty()) {\n      auto val = m_q.pop();\n\n      f(val, m_q);\n    }\n  }\n};\n\n} // namespace galois\n\n#endif // GALOIS_LOOPS_H\n"
  },
  {
    "path": "libgalois/include/galois/Mem.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_MEM_H\n#define GALOIS_MEM_H\n\n#include \"galois/config.h\"\n#include \"galois/runtime/Mem.h\"\n\nnamespace galois {\n\n//! [PerIterAllocTy example]\n//! Base allocator for per-iteration allocator\ntypedef galois::runtime::BumpWithMallocHeap<\n    galois::runtime::FreeListHeap<galois::runtime::SystemHeap>>\n    IterAllocBaseTy;\n\n//! Per-iteration allocator that conforms to STL allocator interface\ntypedef galois::runtime::ExternalHeapAllocator<char, IterAllocBaseTy>\n    PerIterAllocTy;\n//! [PerIterAllocTy example]\n\n//! Scalable fixed-sized allocator for Ty that conforms to STL allocator\n//! interface but does not support variable sized allocations\ntemplate <typename Ty>\nusing FixedSizeAllocator = galois::runtime::FixedSizeAllocator<Ty>;\n\n//! Scalable variable-sized allocator for T that allocates blocks of sizes in\n//! powers of 2. Useful for small and medium sized allocations, e.g. small or\n//! medium vectors, strings, deques.\ntemplate <typename T>\nusing Pow_2_VarSizeAlloc = typename runtime::Pow_2_BlockAllocator<T>;\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/MethodFlags.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_METHODFLAGS_H\n#define GALOIS_METHODFLAGS_H\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\n/**\n * What should the runtime do when executing a method.\n *\n * Various methods take an optional parameter indicating what actions\n * the runtime should do on the user's behalf: (1) checking for conflicts,\n * and/or (2) saving undo information. By default, both are performed (ALL).\n */\nenum class MethodFlag : char {\n  UNPROTECTED   = 0,\n  WRITE         = 1,\n  READ          = 2,\n  INTERNAL_MASK = 3,\n  PREVIOUS      = 4,\n};\n\n//! Bitwise & for method flags\ninline MethodFlag operator&(MethodFlag x, MethodFlag y) {\n  return (MethodFlag)(((int)x) & ((int)y));\n}\n\n//! 
Bitwise | for method flags\ninline MethodFlag operator|(MethodFlag x, MethodFlag y) {\n  return (MethodFlag)(((int)x) | ((int)y));\n}\n} // namespace galois\n\n#endif // GALOIS_METHODFLAGS_H\n"
  },
  {
    "path": "libgalois/include/galois/NoDerefIterator.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_NODEREFITERATOR_H\n#define GALOIS_NODEREFITERATOR_H\n\n#include \"boost/iterator/iterator_adaptor.hpp\"\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\n//! 
Modify an iterator so that *it == it\ntemplate <typename Iterator>\nstruct NoDerefIterator\n    : public boost::iterator_adaptor<NoDerefIterator<Iterator>, Iterator,\n                                     Iterator, boost::use_default,\n                                     const Iterator&> {\n  NoDerefIterator() : NoDerefIterator::iterator_adaptor_() {}\n  explicit NoDerefIterator(Iterator it)\n      : NoDerefIterator::iterator_adaptor_(it) {}\n  const Iterator& dereference() const {\n    return NoDerefIterator::iterator_adaptor_::base_reference();\n  }\n  Iterator& dereference() {\n    return NoDerefIterator::iterator_adaptor_::base_reference();\n  }\n};\n\n//! Convenience function to create {@link NoDerefIterator}.\ntemplate <typename Iterator>\nNoDerefIterator<Iterator> make_no_deref_iterator(Iterator it) {\n  return NoDerefIterator<Iterator>(it);\n}\n\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/PODResizeableArray.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_PODRESIZEABLEARRAY_H\n#define GALOIS_PODRESIZEABLEARRAY_H\n\n#include <algorithm>\n#include <cassert>\n#include <cstddef>\n#include <cstdlib>\n#include <cstring>\n#include <iterator>\n#include <new>\n#include <stdexcept>\n#include <type_traits>\n#include <utility>\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\n/**\n * This is a container that encapsulates a resizeable array\n * of plain-old-datatype (POD) elements.\n * There is no initialization or destruction of elements.\n * Storage is obtained with realloc and released with free.\n */\ntemplate <typename _Tp>\nclass PODResizeableArray {\n  _Tp* data_;\n  size_t capacity_;\n  size_t size_;\n\npublic:\n  typedef _Tp value_type;\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef value_type& reference;\n  typedef const value_type& const_reference;\n  typedef value_type* pointer;\n  typedef const value_type* const_pointer;\n  typedef pointer iterator;\n  typedef const_pointer const_iterator;\n  typedef std::reverse_iterator<iterator> reverse_iterator;\n  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;\n\n  PODResizeableArray() : data_(NULL), capacity_(0), size_(0) {}\n\n  template <class InputIterator>\n  PODResizeableArray(InputIterator first, InputIterator last)\n      : data_(NULL), capacity_(0), size_(0) {\n    size_t to_add = last - first;\n    resize(to_add);\n    std::copy_n(first, to_add, begin());\n  }\n\n  PODResizeableArray(size_t n) : data_(NULL), capacity_(0), size_(0) {\n    resize(n);\n  }\n\n  //! disabled (shallow) copy constructor\n  PODResizeableArray(const PODResizeableArray&) = delete;\n\n  //! move constructor\n  PODResizeableArray(PODResizeableArray&& v)\n      : data_(v.data_), capacity_(v.capacity_), size_(v.size_) {\n    v.data_     = NULL;\n    v.capacity_ = 0;\n    v.size_     = 0;\n  }\n\n  //! disabled (shallow) copy assignment operator\n  PODResizeableArray& operator=(const PODResizeableArray&) = delete;\n\n  //! move assignment operator; moving an array onto itself is a no-op\n  PODResizeableArray& operator=(PODResizeableArray&& v) {\n    if (this != &v) {\n      std::free(data_); // free(NULL) is a no-op\n      data_       = v.data_;\n      capacity_   = v.capacity_;\n      size_       = v.size_;\n      v.data_     = NULL;\n      v.capacity_ = 0;\n      v.size_     = 0;\n    }\n    return *this;\n  }\n\n  ~PODResizeableArray() { std::free(data_); }\n\n  // iterators:\n  // Plain pointer arithmetic is used instead of &data_[i] so that an\n  // unallocated (NULL) array is never dereferenced.\n  iterator begin() { return data_; }\n  const_iterator begin() const { return data_; }\n  iterator end() { return data_ + size_; }\n  const_iterator end() const { return data_ + size_; }\n\n  reverse_iterator rbegin() { return reverse_iterator(end()); }\n  const_reverse_iterator rbegin() const {\n    return const_reverse_iterator(end());\n  }\n  reverse_iterator rend() { return reverse_iterator(begin()); }\n  const_reverse_iterator rend() const {\n    return const_reverse_iterator(begin());\n  }\n\n  const_iterator cbegin() const { return begin(); }\n  const_iterator cend() const { return end(); }\n  const_reverse_iterator crbegin() const { return rbegin(); }\n  const_reverse_iterator crend() const { return rend(); }\n\n  // size:\n  size_type size() const { return size_; }\n  size_type max_size() const { return capacity_; }\n  bool empty() const { return size_ == 0; }\n\n  /**\n   * Ensures storage for at least n elements; capacity grows by doubling.\n   *\n   * @param n minimum number of elements the array must be able to hold\n   * @throws std::bad_alloc if the byte count overflows size_t or realloc\n   * fails; the array is left unchanged in either case\n   */\n  void reserve(size_t n) {\n    if (n <= capacity_)\n      return;\n\n    const size_t max_elems = static_cast<size_t>(-1) / sizeof(_Tp);\n    if (n > max_elems)\n      throw std::bad_alloc();\n\n    size_t new_capacity = (capacity_ == 0) ? 1 : capacity_;\n    while (new_capacity < n) {\n      // Doubling again would exceed the addressable element count, so settle\n      // for exactly what was asked.\n      if (new_capacity > max_elems / 2) {\n        new_capacity = n;\n        break;\n      }\n      new_capacity <<= 1;\n    }\n\n    // Keep the old pointer until realloc succeeds so a failure neither leaks\n    // the buffer nor leaves data_ NULL with a non-zero capacity_.\n    void* new_data = std::realloc(reinterpret_cast<void*>(data_),\n                                  new_capacity * sizeof(_Tp));\n    if (new_data == NULL)\n      throw std::bad_alloc();\n\n    data_     = static_cast<_Tp*>(new_data);\n    capacity_ = new_capacity;\n  }\n\n  void resize(size_t n) {\n    reserve(n);\n    size_ = n;\n  }\n\n  void clear() { size_ = 0; }\n\n  // element access:\n  reference operator[](size_type __n) { return data_[__n]; }\n  const_reference operator[](size_type __n) const { return data_[__n]; }\n  reference at(size_type __n) {\n    if (__n >= size_)\n      throw std::out_of_range(\"PODResizeableArray::at\");\n    return data_[__n];\n  }\n  const_reference at(size_type __n) const {\n    if (__n >= size_)\n      throw std::out_of_range(\"PODResizeableArray::at\");\n    return data_[__n];\n  }\n\n  void assign(iterator first, iterator last) {\n    size_t n = last - first;\n    resize(n);\n    // memmove tolerates a source range that overlaps this array's own\n    // storage; the call is skipped for empty ranges, whose pointers may be\n    // NULL.\n    if (n != 0)\n      std::memmove(reinterpret_cast<void*>(data_), first, n * sizeof(_Tp));\n  }\n\n  reference front() { return data_[0]; }\n  const_reference front() const { return data_[0]; }\n  reference back() { return data_[size_ - 1]; }\n  const_reference back() const { return data_[size_ - 1]; }\n\n  pointer data() { return data_; }\n  const_pointer data() const { return data_; }\n\n  void push_back(const _Tp& value) {\n    // Copy first: value may refer to an element of this array, and resize()\n    // can reallocate the storage it lives in.\n    _Tp copy = value;\n    resize(size_ + 1);\n    data_[size_ - 1] = copy;\n  }\n\n  template <class InputIterator>\n  void insert(iterator GALOIS_USED_ONLY_IN_DEBUG(position), InputIterator first,\n              InputIterator last) {\n    assert(position == end());\n    size_t old_size = size_;\n    size_t to_add   = last - first;\n    resize(old_size + to_add);\n    std::copy_n(first, to_add, begin() + old_size);\n  }\n\n  void swap(PODResizeableArray& v) {\n    std::swap(data_, v.data_);\n    std::swap(size_, v.size_);\n    std::swap(capacity_, v.capacity_);\n  }\n};\n\n} // namespace galois\n#endif // GALOIS_PODRESIZEABLEARRAY_H\n"
  },
  {
    "path": "libgalois/include/galois/ParallelSTL.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_PARALLELSTL_H\n#define GALOIS_PARALLELSTL_H\n\n#include \"galois/config.h\"\n#include \"galois/GaloisForwardDecl.h\"\n#include \"galois/NoDerefIterator.h\"\n#include \"galois/runtime/Range.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Traits.h\"\n#include \"galois/UserContext.h\"\n#include \"galois/Threads.h\"\n#include \"galois/worklists/Chunk.h\"\n\nnamespace galois {\n//! 
Parallel versions of STL library algorithms.\n// TODO: rename to gstl?\nnamespace ParallelSTL {\n\ntemplate <class InputIterator, class Predicate>\nsize_t count_if(InputIterator first, InputIterator last, Predicate pred) {\n\n  galois::GAccumulator<size_t> count;\n\n  galois::do_all(galois::iterate(first, last), [&](const auto& v) {\n    if (pred(v)) {\n      count += 1;\n    }\n  });\n\n  return count.reduce();\n}\n\ntemplate <typename InputIterator, class Predicate>\nstruct find_if_helper {\n\n  typedef galois::optional<InputIterator> ElementTy;\n  typedef substrate::PerThreadStorage<ElementTy> AccumulatorTy;\n  AccumulatorTy& accum;\n  Predicate& f;\n\n  find_if_helper(AccumulatorTy& a, Predicate& p) : accum(a), f(p) {}\n  void operator()(const InputIterator& v, UserContext<InputIterator>& ctx) {\n    if (f(*v)) {\n      *accum.getLocal() = v;\n      ctx.breakLoop();\n    }\n  }\n};\n\ntemplate <class InputIterator, class Predicate>\nInputIterator find_if(InputIterator first, InputIterator last, Predicate pred) {\n  typedef find_if_helper<InputIterator, Predicate> HelperTy;\n  typedef typename HelperTy::AccumulatorTy AccumulatorTy;\n  typedef galois::worklists::PerSocketChunkFIFO<256> WL;\n  AccumulatorTy accum;\n  HelperTy helper(accum, pred);\n  for_each(galois::iterate(make_no_deref_iterator(first),\n                           make_no_deref_iterator(last)),\n           helper, galois::disable_conflict_detection(), galois::no_pushes(),\n           galois::parallel_break(), galois::wl<WL>());\n  for (unsigned i = 0; i < accum.size(); ++i) {\n    if (*accum.getRemote(i))\n      return **accum.getRemote(i);\n  }\n  return last;\n}\n\ntemplate <class Iterator>\nIterator choose_rand(Iterator first, Iterator last) {\n  size_t dist = std::distance(first, last);\n  if (dist)\n    std::advance(first, rand() % dist);\n  return first;\n}\n\ntemplate <class Compare>\nstruct sort_helper {\n  Compare comp;\n\n  //! 
Not equal in terms of less-than\n  template <class value_type>\n  struct neq_to {\n    Compare comp;\n    neq_to(Compare c) : comp(c) {}\n    bool operator()(const value_type& a, const value_type& b) const {\n      return comp(a, b) || comp(b, a);\n    }\n  };\n\n  sort_helper(Compare c) : comp(c) {}\n\n  template <class RandomAccessIterator, class Context>\n  void operator()(std::pair<RandomAccessIterator, RandomAccessIterator> bounds,\n                  Context& ctx) {\n    if (std::distance(bounds.first, bounds.second) <= 1024) {\n      std::sort(bounds.first, bounds.second, comp);\n    } else {\n      typedef\n          typename std::iterator_traits<RandomAccessIterator>::value_type VT;\n      RandomAccessIterator pivot = choose_rand(bounds.first, bounds.second);\n      VT pv                      = *pivot;\n      pivot                      = std::partition(bounds.first, bounds.second,\n                             std::bind(comp, std::placeholders::_1, pv));\n      // push the lower bit\n      if (bounds.first != pivot)\n        ctx.push(std::make_pair(bounds.first, pivot));\n      // adjust the upper bit\n      pivot =\n          std::find_if(pivot, bounds.second,\n                       std::bind(neq_to<VT>(comp), std::placeholders::_1, pv));\n      // push the upper bit\n      if (bounds.second != pivot)\n        ctx.push(std::make_pair(pivot, bounds.second));\n    }\n  }\n};\n\ntemplate <typename RandomAccessIterator, class Predicate>\nstd::pair<RandomAccessIterator, RandomAccessIterator>\ndual_partition(RandomAccessIterator first1, RandomAccessIterator last1,\n               RandomAccessIterator first2, RandomAccessIterator last2,\n               Predicate pred) {\n  typedef std::reverse_iterator<RandomAccessIterator> RI;\n  RI first3(last2), last3(first2);\n  while (true) {\n    while (first1 != last1 && pred(*first1))\n      ++first1;\n    if (first1 == last1)\n      break;\n    while (first3 != last3 && !pred(*first3))\n      ++first3;\n    if (first3 
== last3)\n      break;\n    std::swap(*first1++, *first3++);\n  }\n  return std::make_pair(first1, first3.base());\n}\n\ntemplate <typename RandomAccessIterator, class Predicate>\nstruct partition_helper {\n  typedef std::pair<RandomAccessIterator, RandomAccessIterator> RP;\n  struct partition_helper_state {\n    RandomAccessIterator first, last;\n    RandomAccessIterator rfirst, rlast;\n    substrate::SimpleLock Lock;\n    Predicate pred;\n    typename std::iterator_traits<RandomAccessIterator>::difference_type\n    BlockSize() {\n      return 1024;\n    }\n\n    partition_helper_state(RandomAccessIterator f, RandomAccessIterator l,\n                           Predicate p)\n        : first(f), last(l), rfirst(l), rlast(f), pred(p) {}\n    RP takeHigh() {\n      Lock.lock();\n      unsigned BS = std::min(BlockSize(), std::distance(first, last));\n      last -= BS;\n      RandomAccessIterator rv = last;\n      Lock.unlock();\n      return std::make_pair(rv, rv + BS);\n    }\n    RP takeLow() {\n      Lock.lock();\n      unsigned BS = std::min(BlockSize(), std::distance(first, last));\n      RandomAccessIterator rv = first;\n      first += BS;\n      Lock.unlock();\n      return std::make_pair(rv, rv + BS);\n    }\n    void update(RP low, RP high) {\n      Lock.lock();\n      if (low.first != low.second) {\n        rfirst = std::min(rfirst, low.first);\n        rlast  = std::max(rlast, low.second);\n      }\n      if (high.first != high.second) {\n        rfirst = std::min(rfirst, high.first);\n        rlast  = std::max(rlast, high.second);\n      }\n      Lock.unlock();\n    }\n  };\n\n  partition_helper(partition_helper_state* s) : state(s) {}\n\n  partition_helper_state* state;\n\n  void operator()(unsigned, unsigned) {\n    RP high, low;\n    do {\n      RP parts  = dual_partition(low.first, low.second, high.first, high.second,\n                                state->pred);\n      low.first = parts.first;\n      high.second = parts.second;\n      if (low.first 
== low.second)\n        low = state->takeLow();\n      if (high.first == high.second)\n        high = state->takeHigh();\n    } while (low.first != low.second && high.first != high.second);\n    state->update(low, high);\n  }\n};\n\ntemplate <class RandomAccessIterator, class Predicate>\nRandomAccessIterator partition(RandomAccessIterator first,\n                               RandomAccessIterator last, Predicate pred) {\n  if (std::distance(first, last) <= 1024)\n    return std::partition(first, last, pred);\n  typedef partition_helper<RandomAccessIterator, Predicate> P;\n  typename P::partition_helper_state s(first, last, pred);\n  on_each(P(&s));\n  if (s.rfirst == first && s.rlast == last) { // perfect !\n    // abort();\n    return s.first;\n  }\n  return std::partition(s.rfirst, s.rlast, pred);\n}\n\nstruct pair_dist {\n  template <typename RP>\n  bool operator()(const RP& x, const RP& y) {\n    return std::distance(x.first, x.second) > std::distance(y.first, y.second);\n  }\n};\n\ntemplate <class RandomAccessIterator, class Compare>\nvoid sort(RandomAccessIterator first, RandomAccessIterator last, Compare comp) {\n  if (std::distance(first, last) <= 1024) {\n    std::sort(first, last, comp);\n    return;\n  }\n  typedef galois::worklists::PerSocketChunkFIFO<1> WL;\n\n  for_each(galois::iterate({std::make_pair(first, last)}),\n           sort_helper<Compare>(comp), galois::disable_conflict_detection(),\n           galois::wl<WL>());\n}\n\ntemplate <class RandomAccessIterator>\nvoid sort(RandomAccessIterator first, RandomAccessIterator last) {\n  galois::ParallelSTL::sort(\n      first, last,\n      std::less<\n          typename std::iterator_traits<RandomAccessIterator>::value_type>());\n}\n\ntemplate <class InputIterator, class T, typename BinaryOperation>\nT accumulate(InputIterator first, InputIterator last, const T& identity,\n             const BinaryOperation& binary_op) {\n\n  auto id_fn = [=]() { return identity; };\n\n  auto r = 
make_reducible(binary_op, id_fn);\n\n  do_all(galois::iterate(first, last), [&](const T& v) { r.update(v); });\n\n  return r.reduce();\n}\n\ntemplate <class InputIterator, class T>\nT accumulate(InputIterator first, InputIterator last, const T& identity = T()) {\n  return accumulate(first, last, identity, std::plus<T>());\n}\n\ntemplate <class InputIterator, class MapFn, class T, class ReduceFn>\nT map_reduce(InputIterator first, InputIterator last, MapFn map_fn,\n             ReduceFn reduce_fn, const T& identity) {\n\n  auto id_fn = [=]() { return identity; };\n\n  auto r = make_reducible(reduce_fn, id_fn);\n\n  galois::do_all(galois::iterate(first, last),\n                 [&](const auto& v) { r.update(map_fn(v)); });\n\n  return r.reduce();\n}\n\ntemplate <typename I>\nstd::enable_if_t<!std::is_scalar<internal::Val_ty<I>>::value> destroy(I first,\n                                                                      I last) {\n  using T = internal::Val_ty<I>;\n  do_all(iterate(first, last), [=](T& i) { (&i)->~T(); });\n}\n\ntemplate <class I>\nstd::enable_if_t<std::is_scalar<internal::Val_ty<I>>::value> destroy(I, I) {}\n\n/**\n * Does a partial sum from first -> last and writes the results to the d_first\n * iterator.\n */\ntemplate <class InputIt, class OutputIt>\nOutputIt partial_sum(InputIt first, InputIt last, OutputIt d_first) {\n  using ValueType = typename std::iterator_traits<InputIt>::value_type;\n\n  size_t sizeOfVector = std::distance(first, last);\n\n  // only bother with parallel execution if vector is larger than some size\n  if (sizeOfVector >= 1024) {\n    const size_t numBlocks = galois::getActiveThreads();\n    const size_t blockSize = (sizeOfVector + numBlocks - 1) / numBlocks;\n    assert(numBlocks * blockSize >= sizeOfVector);\n\n    std::vector<ValueType> localSums(numBlocks);\n\n    // get the block sums\n    galois::do_all(\n        galois::iterate((size_t)0, numBlocks), [&](const size_t& block) {\n          // block start can extend 
past sizeOfVector if doesn't divide evenly\n          size_t blockStart = std::min(block * blockSize, sizeOfVector);\n          size_t blockEnd   = std::min((block + 1) * blockSize, sizeOfVector);\n          assert(blockStart <= blockEnd);\n\n          // partial accumulation of each block done now\n          std::partial_sum(first + blockStart, first + blockEnd,\n                           d_first + blockStart);\n          // save the last number in this block: used for block prefix sum\n          if (blockEnd > 0) {\n            localSums[block] = *(d_first + blockEnd - 1);\n          } else {\n            localSums[block] = 0;\n          }\n        });\n\n    // bulkPrefix[i] holds the starting sum of a particular block i\n    std::vector<ValueType> bulkPrefix(numBlocks);\n    // exclusive scan on local sums to get number to add to each block's\n    // set of indices\n    // Not using std::exclusive_scan because apparently it doesn't work for\n    // some compilers\n    ValueType runningSum = 0;\n    for (size_t i = 0; i < numBlocks; i++) {\n      bulkPrefix[i] = runningSum;\n      runningSum += localSums[i];\n    }\n\n    galois::do_all(\n        galois::iterate((size_t)0, numBlocks), [&](const size_t& block) {\n          // add the sums of previous elements to blocks\n          ValueType numToAdd = bulkPrefix[block];\n          size_t blockStart  = std::min(block * blockSize, sizeOfVector);\n          size_t blockEnd    = std::min((block + 1) * blockSize, sizeOfVector);\n          assert(blockStart <= blockEnd);\n\n          // transform applies addition to appropriate range\n          std::transform(d_first + blockStart, d_first + blockEnd,\n                         d_first + blockStart,\n                         [&](ValueType& val) { return val + numToAdd; });\n        });\n\n    // return the iterator past the last element written\n    return d_first + sizeOfVector;\n  } else {\n    // vector is small; do it serially using standard library\n    return 
std::partial_sum(first, last, d_first);\n  }\n}\n\n} // end namespace ParallelSTL\n} // end namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/PerThreadContainer.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_PERTHREADCONTAINER_H\n#define GALOIS_PERTHREADCONTAINER_H\n\n#include <cstdio>\n#include <vector>\n#include <deque>\n#include <list>\n#include <map>\n#include <set>\n#include <limits>\n#include <iterator>\n\n#include <boost/iterator/counting_iterator.hpp>\n#include <boost/iterator/iterator_facade.hpp>\n#include <boost/iterator/transform_iterator.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/gdeque.h\"\n#include \"galois/gIO.h\"\n#include \"galois/gstl.h\"\n#include \"galois/PriorityQueue.h\"\n#include \"galois/runtime/Executor_DoAll.h\"\n#include \"galois/runtime/Executor_OnEach.h\"\n#include \"galois/runtime/Mem.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/Threads.h\"\n#include \"galois/TwoLevelIterator.h\"\n\nnamespace 
galois {\n\nnamespace {\n\nenum GlobalPos { GLOBAL_BEGIN, GLOBAL_END };\n\n#define ADAPTOR_BASED_OUTER_ITER\n\n// XXX: use a combination of boost::transform_iterator and\n// boost::counting_iterator to implement the following OuterPerThreadWLIter\n#ifdef ADAPTOR_BASED_OUTER_ITER\n\ntemplate <typename PerThrdCont>\nstruct WLindexer {\n  typedef typename PerThrdCont::container_type Ret_ty;\n\n  PerThrdCont* wl;\n\n  WLindexer() : wl(NULL) {}\n\n  WLindexer(PerThrdCont& _wl) : wl(&_wl) {}\n\n  Ret_ty& operator()(unsigned i) const {\n    assert(wl != NULL);\n    assert(i < wl->numRows());\n    return const_cast<Ret_ty&>(wl->get(i));\n  }\n};\n\ntemplate <typename PerThrdCont>\nstruct TypeFactory {\n  typedef typename boost::transform_iterator<WLindexer<PerThrdCont>,\n                                             boost::counting_iterator<unsigned>>\n      OuterIter;\n  typedef typename std::reverse_iterator<OuterIter> RvrsOuterIter;\n};\n\ntemplate <typename PerThrdCont>\ntypename TypeFactory<PerThrdCont>::OuterIter make_outer_begin(PerThrdCont& wl) {\n  return boost::make_transform_iterator(boost::counting_iterator<unsigned>(0),\n                                        WLindexer<PerThrdCont>(wl));\n}\n\ntemplate <typename PerThrdCont>\ntypename TypeFactory<PerThrdCont>::OuterIter make_outer_end(PerThrdCont& wl) {\n  return boost::make_transform_iterator(\n      boost::counting_iterator<unsigned>(wl.numRows()),\n      WLindexer<PerThrdCont>(wl));\n}\n\ntemplate <typename PerThrdCont>\ntypename TypeFactory<PerThrdCont>::RvrsOuterIter\nmake_outer_rbegin(PerThrdCont& wl) {\n  return typename TypeFactory<PerThrdCont>::RvrsOuterIter(make_outer_end(wl));\n}\n\ntemplate <typename PerThrdCont>\ntypename TypeFactory<PerThrdCont>::RvrsOuterIter\nmake_outer_rend(PerThrdCont& wl) {\n  return typename TypeFactory<PerThrdCont>::RvrsOuterIter(make_outer_begin(wl));\n}\n\n#else\n\ntemplate <typename PerThrdCont>\nclass OuterPerThreadWLIter\n    : public 
boost::iterator_facade<OuterPerThreadWLIter<PerThrdCont>,\n                                    typename PerThrdCont::container_type,\n                                    boost::random_access_traversal_tag> {\n\n  using container_type = typename PerThrdCont::container_type;\n  using Diff_ty        = ptrdiff_t;\n\n  friend class boost::iterator_core_access;\n\n  PerThrdCont* workList;\n  // using Diff_ty due to reverse iterator, whose\n  // end is -1, and,  begin is numRows - 1\n  Diff_ty row;\n\n  void assertInRange() const {\n    assert((row >= 0) && (row < workList->numRows()));\n  }\n\n  // container_type& getWL() {\n  // assertInRange();\n  // return (*workList)[row];\n  // }\n\n  container_type& getWL() const {\n    assertInRange();\n    return (*workList)[row];\n  }\n\npublic:\n  OuterPerThreadWLIter() : workList(NULL), row(0) {}\n\n  OuterPerThreadWLIter(PerThrdCont& wl, const GlobalPos& pos)\n      : workList(&wl), row(0) {\n\n    switch (pos) {\n    case GLOBAL_BEGIN:\n      row = 0;\n      break;\n    case GLOBAL_END:\n      row = wl.numRows();\n      break;\n    default:\n      std::abort();\n    }\n  }\n\n  container_type& dereference(void) const { return getWL(); }\n\n  // const container_type& dereference (void) const {\n  // getWL ();\n  // }\n\n  void increment(void) { ++row; }\n\n  void decrement(void) { --row; }\n\n  bool equal(const OuterPerThreadWLIter& that) const {\n    assert(this->workList == that.workList);\n    return this->row == that.row;\n  }\n\n  void advance(ptrdiff_t n) { row += n; }\n\n  Diff_ty distance_to(const OuterPerThreadWLIter& that) const {\n    assert(this->workList == that.workList);\n    return that.row - this->row;\n  }\n};\n\ntemplate <typename PerThrdCont>\nOuterPerThreadWLIter<PerThrdCont> make_outer_begin(PerThrdCont& wl) {\n  return OuterPerThreadWLIter<PerThrdCont>(wl, GLOBAL_BEGIN);\n}\n\ntemplate <typename PerThrdCont>\nOuterPerThreadWLIter<PerThrdCont> make_outer_end(PerThrdCont& wl) {\n  return 
OuterPerThreadWLIter<PerThrdCont>(wl, GLOBAL_END);\n}\n\ntemplate <typename PerThrdCont>\nstd::reverse_iterator<OuterPerThreadWLIter<PerThrdCont>>\nmake_outer_rbegin(PerThrdCont& wl) {\n  typedef typename std::reverse_iterator<OuterPerThreadWLIter<PerThrdCont>>\n      Ret_ty;\n  return Ret_ty(make_outer_end(wl));\n}\n\ntemplate <typename PerThrdCont>\nstd::reverse_iterator<OuterPerThreadWLIter<PerThrdCont>>\nmake_outer_rend(PerThrdCont& wl) {\n  typedef typename std::reverse_iterator<OuterPerThreadWLIter<PerThrdCont>>\n      Ret_ty;\n  return Ret_ty(make_outer_begin(wl));\n}\n\n#endif\n\n} // end namespace\n\ntemplate <typename Cont_tp>\nclass PerThreadContainer {\npublic:\n  typedef Cont_tp container_type;\n  typedef typename container_type::value_type value_type;\n  typedef typename container_type::reference reference;\n  typedef typename container_type::pointer pointer;\n  typedef typename container_type::size_type size_type;\n\n  typedef typename container_type::iterator local_iterator;\n  typedef typename container_type::const_iterator local_const_iterator;\n  typedef typename container_type::reverse_iterator local_reverse_iterator;\n  typedef typename container_type::const_reverse_iterator\n      local_const_reverse_iterator;\n\n  typedef PerThreadContainer This_ty;\n\n#ifdef ADAPTOR_BASED_OUTER_ITER\n  typedef typename TypeFactory<This_ty>::OuterIter OuterIter;\n  typedef typename TypeFactory<This_ty>::RvrsOuterIter RvrsOuterIter;\n#else\n  typedef OuterPerThreadWLIter<This_ty> OuterIter;\n  typedef typename std::reverse_iterator<OuterIter> RvrsOuterIter;\n#endif\n  typedef typename galois::ChooseStlTwoLevelIterator<\n      OuterIter, typename container_type::iterator>::type global_iterator;\n  typedef typename galois::ChooseStlTwoLevelIterator<\n      OuterIter, typename container_type::const_iterator>::type\n      global_const_iterator;\n  typedef typename galois::ChooseStlTwoLevelIterator<\n      RvrsOuterIter, typename 
container_type::reverse_iterator>::type\n      global_reverse_iterator;\n  typedef typename galois::ChooseStlTwoLevelIterator<\n      RvrsOuterIter, typename container_type::const_reverse_iterator>::type\n      global_const_reverse_iterator;\n\n  typedef global_iterator iterator;\n  typedef global_const_iterator const_iterator;\n  typedef global_reverse_iterator reverse_iterator;\n  typedef global_const_reverse_iterator const_reverse_iterator;\n\nprivate:\n  // XXX: for testing only\n\n#if 0\n  struct FakePTS {\n    std::vector<container_type*> v;\n\n    FakePTS () {\n      v.resize (size ());\n    }\n\n    container_type** getLocal () const {\n      return getRemote (galois::runtime::LL::getTID ());\n    }\n\n    container_type** getRemote (size_t i) const {\n      assert (i < v.size ());\n      return const_cast<container_type**> (&v[i]);\n    }\n\n    size_t size () const { return galois::runtime::LL::getMaxThreads(); }\n\n  };\n#endif\n  // typedef FakePTS PerThrdCont_ty;\n  typedef galois::substrate::PerThreadStorage<container_type*> PerThrdCont_ty;\n  PerThrdCont_ty perThrdCont;\n\n  void destroy() {\n    for (unsigned i = 0; i < perThrdCont.size(); ++i) {\n      delete *perThrdCont.getRemote(i);\n      *perThrdCont.getRemote(i) = NULL;\n    }\n  }\n\nprotected:\n  PerThreadContainer() : perThrdCont() {\n    for (unsigned i = 0; i < perThrdCont.size(); ++i) {\n      *perThrdCont.getRemote(i) = NULL;\n    }\n  }\n\n  template <typename... Args>\n  void init(Args&&... 
args) {\n    for (unsigned i = 0; i < perThrdCont.size(); ++i) {\n      *perThrdCont.getRemote(i) =\n          new container_type(std::forward<Args>(args)...);\n    }\n  }\n\n  ~PerThreadContainer() {\n    clear_all_parallel();\n    destroy();\n  }\n\npublic:\n  unsigned numRows() const { return perThrdCont.size(); }\n\n  container_type& get() { return **(perThrdCont.getLocal()); }\n\n  const container_type& get() const { return **(perThrdCont.getLocal()); }\n\n  container_type& get(unsigned i) { return **(perThrdCont.getRemote(i)); }\n\n  const container_type& get(unsigned i) const {\n    return **(perThrdCont.getRemote(i));\n  }\n\n  container_type& operator[](unsigned i) { return get(i); }\n\n  const container_type& operator[](unsigned i) const { return get(i); }\n\n  global_iterator begin_all() {\n    return galois::stl_two_level_begin(make_outer_begin(*this),\n                                       make_outer_end(*this));\n  }\n\n  global_iterator end_all() {\n    return galois::stl_two_level_end(make_outer_begin(*this),\n                                     make_outer_end(*this));\n  }\n\n  global_const_iterator begin_all() const { return cbegin_all(); }\n\n  global_const_iterator end_all() const { return cend_all(); }\n\n  // for compatibility with Range.h\n  global_iterator begin() { return begin_all(); }\n\n  global_iterator end() { return end_all(); }\n\n  global_const_iterator begin() const { return begin_all(); }\n\n  global_const_iterator end() const { return end_all(); }\n\n  global_const_iterator cbegin() const { return cbegin_all(); }\n\n  global_const_iterator cend() const { return cend_all(); }\n\n  global_const_iterator cbegin_all() const {\n    return galois::stl_two_level_cbegin(make_outer_begin(*this),\n                                        make_outer_end(*this));\n  }\n\n  global_const_iterator cend_all() const {\n    return galois::stl_two_level_cend(make_outer_begin(*this),\n                                      make_outer_end(*this));\n  
}\n\n  global_reverse_iterator rbegin_all() {\n    return galois::stl_two_level_rbegin(make_outer_rbegin(*this),\n                                        make_outer_rend(*this));\n  }\n\n  global_reverse_iterator rend_all() {\n    return galois::stl_two_level_rend(make_outer_rbegin(*this),\n                                      make_outer_rend(*this));\n  }\n\n  global_const_reverse_iterator rbegin_all() const { return crbegin_all(); }\n\n  global_const_reverse_iterator rend_all() const { return crend_all(); }\n\n  global_const_reverse_iterator crbegin_all() const {\n    return galois::stl_two_level_crbegin(make_outer_rbegin(*this),\n                                         make_outer_rend(*this));\n  }\n\n  global_const_reverse_iterator crend_all() const {\n    return galois::stl_two_level_crend(make_outer_rbegin(*this),\n                                       make_outer_rend(*this));\n  }\n\n  local_iterator local_begin() { return get().begin(); }\n  local_iterator local_end() { return get().end(); }\n\n  // legacy STL\n  local_const_iterator local_begin() const { return get().begin(); }\n  local_const_iterator local_end() const { return get().end(); }\n\n  local_const_iterator local_cbegin() const { return get().cbegin(); }\n  local_const_iterator local_cend() const { return get().cend(); }\n\n  local_reverse_iterator local_rbegin() { return get().rbegin(); }\n  local_reverse_iterator local_rend() { return get().rend(); }\n\n  local_const_reverse_iterator local_crbegin() const { return get().crbegin(); }\n  local_const_reverse_iterator local_crend() const { return get().crend(); }\n\n  size_type size_all() const {\n    size_type sz = 0;\n\n    for (unsigned i = 0; i < perThrdCont.size(); ++i) {\n      sz += get(i).size();\n    }\n\n    return sz;\n  }\n\n  // XXX: disabling because of per thread memory allocators\n  // void clear_all() {\n  // for (unsigned i = 0; i < perThrdCont.size(); ++i) {\n  // get(i).clear();\n  // }\n  // }\n\n  void 
clear_all_parallel(void) {\n    galois::runtime::on_each_gen(\n        [this](const unsigned, const unsigned) { get().clear(); },\n        std::make_tuple());\n  }\n\n  bool empty_all() const {\n    bool res = true;\n    for (unsigned i = 0; i < perThrdCont.size(); ++i) {\n      res = res && get(i).empty();\n    }\n\n    return res;\n  }\n\n  template <typename Range, typename Ret>\n  void fill_parallel(const Range& range,\n                     Ret (container_type::*pushFn)(const value_type&) =\n                         &container_type::push_back) {\n    galois::runtime::do_all_gen(\n        range,\n        [this, pushFn](const typename Range::value_type& v) {\n          container_type& my = get();\n          (my.*pushFn)(v);\n          // (get ().*pushFn)(v);\n        },\n        std::make_tuple());\n  }\n};\n\ntemplate <typename T>\nclass PerThreadVector\n    : public PerThreadContainer<typename gstl::template Vector<T>> {\npublic:\n  typedef typename gstl::template Pow2Alloc<T> Alloc_ty;\n  typedef typename gstl::template Vector<T> container_type;\n\nprotected:\n  typedef PerThreadContainer<container_type> Super_ty;\n\n  Alloc_ty alloc;\n\npublic:\n  PerThreadVector() : Super_ty(), alloc() { Super_ty::init(alloc); }\n\n  void reserve_all(size_t sz) {\n    size_t numT = galois::getActiveThreads();\n    size_t perT = (sz + numT - 1) / numT; // round up\n\n    for (unsigned i = 0; i < numT; ++i) {\n      Super_ty::get(i).reserve(perT);\n    }\n  }\n};\n\ntemplate <typename T>\nclass PerThreadDeque\n    : public PerThreadContainer<typename gstl::template Deque<T>> {\n\npublic:\n  typedef typename gstl::template Pow2Alloc<T> Alloc_ty;\n\nprotected:\n  typedef typename gstl::template Deque<T> container_type;\n  typedef PerThreadContainer<container_type> Super_ty;\n\n  Alloc_ty alloc;\n\npublic:\n  PerThreadDeque() : Super_ty(), alloc() { Super_ty::init(alloc); }\n};\n\ntemplate <typename T, unsigned ChunkSize = 64>\nclass PerThreadGdeque\n    : public 
PerThreadContainer<galois::gdeque<T, ChunkSize>> {\n\n  using Super_ty = PerThreadContainer<galois::gdeque<T, ChunkSize>>;\n\npublic:\n  PerThreadGdeque() : Super_ty() { Super_ty::init(); }\n};\n\ntemplate <typename T>\nclass PerThreadList\n    : public PerThreadContainer<typename gstl::template List<T>> {\n\npublic:\n  typedef typename gstl::template FixedSizeAlloc<T> Alloc_ty;\n\nprotected:\n  typedef typename gstl::template List<T> container_type;\n  typedef PerThreadContainer<container_type> Super_ty;\n\n  Alloc_ty alloc;\n\npublic:\n  PerThreadList() : Super_ty(), alloc() { Super_ty::init(alloc); }\n};\n\ntemplate <typename K, typename V, typename C = std::less<K>>\nclass PerThreadMap\n    : public PerThreadContainer<typename gstl::template Map<K, V, C>> {\n\npublic:\n  typedef typename gstl::template Map<K, V, C> container_type;\n  typedef typename gstl::template FixedSizeAlloc<\n      typename container_type::value_type>\n      Alloc_ty;\n\nprotected:\n  typedef PerThreadContainer<container_type> Super_ty;\n\n  Alloc_ty alloc;\n\npublic:\n  explicit PerThreadMap(const C& cmp = C()) : Super_ty(), alloc() {\n    Super_ty::init(cmp, alloc);\n  }\n\n  typedef typename Super_ty::global_const_iterator global_const_iterator;\n  typedef typename Super_ty::global_const_reverse_iterator\n      global_const_reverse_iterator;\n\n  // hiding non-const (and const) versions in Super_ty\n  global_const_iterator begin_all() const { return Super_ty::cbegin_all(); }\n  global_const_iterator end_all() const { return Super_ty::cend_all(); }\n\n  // hiding non-const (and const) versions in Super_ty\n  global_const_reverse_iterator rbegin_all() const {\n    return Super_ty::crbegin_all();\n  }\n  global_const_reverse_iterator rend_all() const {\n    return Super_ty::crend_all();\n  }\n};\n\ntemplate <typename T, typename C = std::less<T>>\nclass PerThreadSet\n    : public PerThreadContainer<typename gstl::template Set<T, C>> {\n\npublic:\n  typedef typename gstl::template 
FixedSizeAlloc<T> Alloc_ty;\n\nprotected:\n  typedef typename gstl::template Set<T, C> container_type;\n  typedef PerThreadContainer<container_type> Super_ty;\n\n  Alloc_ty alloc;\n\npublic:\n  explicit PerThreadSet(const C& cmp = C()) : Super_ty(), alloc() {\n    Super_ty::init(cmp, alloc);\n  }\n\n  typedef typename Super_ty::global_const_iterator global_const_iterator;\n  typedef typename Super_ty::global_const_reverse_iterator\n      global_const_reverse_iterator;\n\n  // hiding non-const (and const) versions in Super_ty\n  global_const_iterator begin_all() const { return Super_ty::cbegin_all(); }\n  global_const_iterator end_all() const { return Super_ty::cend_all(); }\n\n  // hiding non-const (and const) versions in Super_ty\n  global_const_reverse_iterator rbegin_all() const {\n    return Super_ty::crbegin_all();\n  }\n  global_const_reverse_iterator rend_all() const {\n    return Super_ty::crend_all();\n  }\n};\n\ntemplate <typename T, typename C = std::less<T>>\nclass PerThreadMinHeap\n    : public PerThreadContainer<typename gstl::template PQ<T, C>> {\n\npublic:\n  typedef typename gstl::template Pow2Alloc<T> Alloc_ty;\n\nprotected:\n  typedef typename gstl::template Vector<T> Vec_ty;\n  typedef typename gstl::template PQ<T, C> container_type;\n  typedef PerThreadContainer<container_type> Super_ty;\n\n  Alloc_ty alloc;\n\npublic:\n  explicit PerThreadMinHeap(const C& cmp = C()) : Super_ty(), alloc() {\n    Super_ty::init(cmp, Vec_ty(alloc));\n  }\n\n  typedef typename Super_ty::global_const_iterator global_const_iterator;\n  typedef typename Super_ty::global_const_reverse_iterator\n      global_const_reverse_iterator;\n\n  // hiding non-const (and const) versions in Super_ty\n  global_const_iterator begin_all() const { return Super_ty::cbegin_all(); }\n  global_const_iterator end_all() const { return Super_ty::cend_all(); }\n\n  // hiding non-const (and const) versions in Super_ty\n  global_const_reverse_iterator rbegin_all() const {\n    return 
Super_ty::crbegin_all();\n  }\n  global_const_reverse_iterator rend_all() const {\n    return Super_ty::crend_all();\n  }\n};\n\n} // end namespace galois\n#endif // GALOIS_PERTHREADCONTAINER_H\n"
  },
  {
    "path": "libgalois/include/galois/PriorityQueue.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_PRIORITYQUEUE_H\n#define GALOIS_PRIORITYQUEUE_H\n\n#include <algorithm>\n#include <set>\n#include <vector>\n\n#include \"galois/config.h\"\n#include \"galois/Mem.h\"\n#include \"galois/substrate/PaddedLock.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n\nnamespace galois {\n\n/**\n * Thread-safe ordered set. 
Faster than STL heap operations (about 10%-15%\n * faster on serially) and can use scalable allocation, e.g., {@link\n * FixedSizeAllocator}.\n */\ntemplate <typename T, typename Cmp = std::less<T>,\n          typename Alloc = galois::FixedSizeAllocator<T>>\nclass ThreadSafeOrderedSet {\n  typedef std::set<T, Cmp, Alloc> Set;\n\npublic:\n  typedef Set container_type;\n  typedef typename container_type::value_type value_type;\n  typedef typename container_type::reference reference;\n  typedef typename container_type::const_reference const_reference;\n  typedef typename container_type::pointer pointer;\n  typedef typename container_type::size_type size_type;\n  typedef typename container_type::const_iterator iterator;\n  typedef typename container_type::const_iterator const_iterator;\n  typedef typename container_type::const_reverse_iterator reverse_iterator;\n  typedef\n      typename container_type::const_reverse_iterator const_reverse_iterator;\n  typedef galois::substrate::SimpleLock Lock_ty;\n\nprivate:\n  alignas(substrate::GALOIS_CACHE_LINE_SIZE) Lock_ty mutex;\n  Set orderedSet;\n\npublic:\n  template <typename _T, typename _Cmp = std::less<_T>,\n            typename _Alloc = galois::FixedSizeAllocator<_T>>\n  using retype =\n      ThreadSafeOrderedSet<_T, _Cmp,\n                           _Alloc>; // FIXME: loses Alloc and Cmp types\n\n  explicit ThreadSafeOrderedSet(const Cmp& cmp     = Cmp(),\n                                const Alloc& alloc = Alloc())\n      : orderedSet(cmp, alloc) {}\n\n  template <typename Iter>\n  ThreadSafeOrderedSet(Iter b, Iter e, const Cmp& cmp = Cmp(),\n                       const Alloc& alloc = Alloc())\n      : orderedSet(cmp, alloc) {\n    for (; b != e; ++b) {\n      orderedSet.insert(*b);\n    }\n  }\n\n  bool empty() const {\n    mutex.lock();\n    bool ret = orderedSet.empty();\n    mutex.unlock();\n\n    return ret;\n  }\n\n  size_type size() const {\n    mutex.lock();\n    size_type sz = orderedSet.size();\n    
mutex.unlock();\n\n    return sz;\n  }\n\n  value_type top() const {\n    mutex.lock();\n    value_type x = *orderedSet.begin();\n    mutex.unlock();\n    return x;\n  }\n\n  bool find(const value_type& x) const {\n    mutex.lock();\n    bool ret = (orderedSet.find(x) != orderedSet.end());\n    mutex.unlock();\n    return ret;\n  }\n\n  // for compatibility with various stl types\n  inline void push_back(const value_type& x) { this->push(x); }\n  inline void insert(const value_type& x) { this->push(x); }\n\n  bool push(const value_type& x) {\n    mutex.lock();\n    auto p = orderedSet.insert(x);\n    mutex.unlock();\n    return p.second;\n  }\n\n  value_type pop() {\n    mutex.lock();\n    value_type x = *orderedSet.begin();\n    orderedSet.erase(orderedSet.begin());\n    mutex.unlock();\n    return x;\n  }\n\n  bool remove(const value_type& x) {\n    mutex.lock();\n    bool ret = false;\n\n    if (x == *orderedSet.begin()) {\n      orderedSet.erase(orderedSet.begin());\n      ret = true;\n    } else {\n      size_type s = orderedSet.erase(x);\n      ret         = (s > 0);\n    }\n    mutex.unlock();\n\n    return ret;\n  }\n\n  void clear() {\n    mutex.lock();\n    orderedSet.clear();\n    mutex.unlock();\n  }\n\n  const_iterator begin() const { return orderedSet.begin(); }\n  const_iterator end() const { return orderedSet.end(); }\n};\n\ntemplate <typename T, typename Cmp = std::less<T>,\n          typename Cont = std::vector<T, runtime::Pow_2_BlockAllocator<T>>>\nclass MinHeap {\npublic:\n  typedef runtime::Pow_2_BlockAllocator<T> alloc_type;\n  typedef Cont container_type;\n\n  typedef typename container_type::value_type value_type;\n  typedef typename container_type::reference reference;\n  typedef typename container_type::const_reference const_reference;\n  typedef typename container_type::pointer pointer;\n  typedef typename container_type::size_type size_type;\n  typedef typename container_type::const_iterator iterator;\n  typedef typename 
container_type::const_iterator const_iterator;\n  typedef typename container_type::const_reverse_iterator reverse_iterator;\n  typedef\n      typename container_type::const_reverse_iterator const_reverse_iterator;\n  // typedef typename container_type::const_iterator iterator;\n\nprotected:\n  struct RevCmp {\n    Cmp cmp;\n\n    explicit RevCmp(const Cmp& cmp) : cmp(cmp) {}\n\n    bool operator()(const T& left, const T& right) const {\n      return cmp(right, left);\n    }\n  };\n\n  Cont container;\n  RevCmp revCmp;\n\n  const_reference top_internal() const {\n    assert(!container.empty());\n    return container.front();\n  }\n\n  value_type pop_internal() {\n    assert(!container.empty());\n    std::pop_heap(container.begin(), container.end(), revCmp);\n\n    value_type x = container.back();\n    container.pop_back();\n\n    return x;\n  }\n\npublic:\n  explicit MinHeap(const Cmp& cmp = Cmp(), const Cont& container = Cont())\n      : container(container), revCmp(cmp) {}\n\n  template <typename Iter>\n  MinHeap(Iter b, Iter e, const Cmp& cmp = Cmp())\n      : container(b, e), revCmp(cmp) {\n    std::make_heap(container.begin(), container.end());\n  }\n\n  bool empty() const { return container.empty(); }\n\n  size_type size() const { return container.size(); }\n\n  const_reference top() const { return container.front(); }\n\n  // for compatibility with various stl types\n  inline void push_back(const value_type& x) { this->push(x); }\n  inline void insert(const value_type& x) { this->push(x); }\n\n  void push(const value_type& x) {\n    container.push_back(x);\n    std::push_heap(container.begin(), container.end(), revCmp);\n  }\n\n  value_type pop() {\n    assert(!container.empty());\n    std::pop_heap(container.begin(), container.end(), revCmp);\n\n    value_type x = container.back();\n    container.pop_back();\n    return x;\n  }\n\n  bool remove(const value_type& x) {\n    bool ret = false;\n\n    // TODO: write a better remove method\n    if (x == top()) {\n 
     pop();\n      ret = true;\n    } else {\n      typename container_type::iterator nend =\n          std::remove(container.begin(), container.end(), x);\n\n      ret = (nend != container.end());\n      container.erase(nend, container.end());\n\n      std::make_heap(container.begin(), container.end(), revCmp);\n    }\n\n    return ret;\n  }\n\n  bool find(const value_type& x) const {\n    return (std::find(begin(), end(), x) != end());\n  }\n\n  void clear() { container.clear(); }\n\n  const_iterator begin() const { return container.begin(); }\n  const_iterator end() const { return container.end(); }\n\n  void reserve(size_type s) { container.reserve(s); }\n};\n\n/**\n * Thread-safe min heap.\n */\ntemplate <typename T, typename Cmp = std::less<T>>\nclass ThreadSafeMinHeap {\npublic:\n  typedef MinHeap<T, Cmp> container_type;\n\n  typedef typename container_type::value_type value_type;\n  typedef typename container_type::reference reference;\n  typedef typename container_type::const_reference const_reference;\n  typedef typename container_type::pointer pointer;\n  typedef typename container_type::size_type size_type;\n  typedef typename container_type::const_iterator iterator;\n  typedef typename container_type::const_iterator const_iterator;\n  typedef typename container_type::const_reverse_iterator reverse_iterator;\n  typedef\n      typename container_type::const_reverse_iterator const_reverse_iterator;\n\nprotected:\n  typedef galois::substrate::SimpleLock Lock_ty;\n\n  alignas(substrate::GALOIS_CACHE_LINE_SIZE) Lock_ty mutex;\n  container_type heap;\n\npublic:\n  explicit ThreadSafeMinHeap(const Cmp& cmp = Cmp()) : heap(cmp) {}\n\n  template <typename Iter>\n  ThreadSafeMinHeap(Iter b, Iter e, const Cmp& cmp = Cmp()) : heap(b, e, cmp) {}\n\n  bool empty() const {\n    mutex.lock();\n    bool ret = heap.empty();\n    mutex.unlock();\n\n    return ret;\n  }\n\n  size_type size() const {\n    mutex.lock();\n    size_type sz = heap.size();\n    
mutex.unlock();\n\n    return sz;\n  }\n\n  // can't return a reference, because the reference may not be pointing\n  // to a valid location due to vector doubling in size and moving to\n  // another memory location\n  value_type top() const {\n    mutex.lock();\n    value_type x = heap.top();\n    mutex.unlock();\n\n    return x;\n  }\n\n  // for compatibility with various stl types\n  inline void push_back(const value_type& x) { this->push(x); }\n  inline void insert(const value_type& x) { this->push(x); }\n\n  void push(const value_type& x) {\n    mutex.lock();\n    heap.push(x);\n    mutex.unlock();\n  }\n\n  value_type pop() {\n    mutex.lock();\n    value_type x = heap.pop();\n    mutex.unlock();\n    return x;\n  }\n\n  bool remove(const value_type& x) {\n    // TODO: write a better remove method\n    mutex.lock();\n    bool ret = heap.remove(x);\n    mutex.unlock();\n\n    return ret;\n  }\n\n  bool find(const value_type& x) const {\n    mutex.lock();\n    bool ret = heap.find(x);\n    mutex.unlock();\n\n    return ret;\n  }\n\n  void clear() {\n    mutex.lock();\n    heap.clear();\n    mutex.unlock();\n  }\n\n  // TODO: can't use in parallel context\n  const_iterator begin() const { return heap.begin(); }\n  const_iterator end() const { return heap.end(); }\n\n  void reserve(size_type s) { heap.reserve(s); }\n};\n\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/Reduction.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_REDUCTION_H\n#define GALOIS_REDUCTION_H\n\n#include <functional>\n#include <limits>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n\nnamespace galois {\n\n/**\n * A Reducible stores per-thread values of a variable of type T and merges\n * multiple values into one.\n *\n * The reduced value is obtained by merging per thread values using the binary\n * functor MergeFunc. 
MergeFunc takes two values of type T and produces the\n * resulting merged value:\n *\n *   T operator()(T lhs, T rhs)\n *\n * If T is expensive to copy, a moving merge function is more appropriate:\n *\n *   T& operator()(T& lhs, T&& rhs)\n *\n * IdFunc returns the identity element, which is used to initialize and reset\n * the per thread values.\n *\n * Both MergeFunc and IdFunc should be copy constructable.\n *\n * The MergeFunc and IdFunc should be related as follows:\n *\n *   MergeFunc(x, IdFunc()) == x    for all x in X\n *\n * An example of using a move merge function:\n *\n *   auto merge_func = [](T& lhs, T&& rhs) -> T& { ... }\n *   auto identity_func = []() -> T { ... }\n *\n *   auto r = make_reducible(merge_func, identity_func);\n *   T u = ...\n *   r.update(std::move(u));\n *   T& result = r.reduce();\n */\ntemplate <typename T, typename MergeFunc, typename IdFunc>\nclass Reducible : public MergeFunc, public IdFunc {\n\n  galois::substrate::PerThreadStorage<T> data_;\n\n  void merge(T& lhs, T&& rhs) {\n    T v{std::move(MergeFunc::operator()(lhs, std::move(rhs)))};\n    lhs = std::move(v);\n  }\n\n  void merge(T& lhs, const T& rhs) { lhs = MergeFunc::operator()(lhs, rhs); }\n\npublic:\n  using value_type = T;\n\n  Reducible(MergeFunc merge_func, IdFunc id_func)\n      : MergeFunc(merge_func), IdFunc(id_func) {\n    for (unsigned i = 0; i < data_.size(); ++i) {\n      *(data_.getRemote(i)) = IdFunc::operator()();\n    }\n  }\n\n  /**\n   * Updates the thread local value by applying the reduction operator to\n   * current and newly provided value\n   */\n  void update(T&& rhs) { merge(*data_.getLocal(), std::move(rhs)); }\n\n  void update(const T& rhs) { merge(*data_.getLocal(), rhs); }\n\n  /**\n   * Returns a reference to the local value of T.\n   */\n  T& getLocal() { return *data_.getLocal(); }\n\n  /**\n   * Returns the final reduction value. 
Only valid outside the parallel region.\n   */\n  T& reduce() {\n    T& lhs = *data_.getLocal();\n    for (unsigned int i = 1; i < data_.size(); ++i) {\n      T& rhs = *data_.getRemote(i);\n      merge(lhs, std::move(rhs));\n      rhs = IdFunc::operator()();\n    }\n\n    return lhs;\n  }\n\n  void reset() {\n    for (unsigned int i = 0; i < data_.size(); ++i) {\n      *data_.getRemote(i) = IdFunc::operator()();\n    }\n  }\n};\n\n/**\n * make_reducible creates a Reducible from a merge function and identity\n * function.\n */\ntemplate <typename MergeFn, typename IdFn>\nauto make_reducible(const MergeFn& mergeFn, const IdFn& idFn) {\n  return Reducible<std::invoke_result_t<IdFn>, MergeFn, IdFn>(mergeFn, idFn);\n}\n\n//! gmax is the functional form of std::max\ntemplate <typename T>\nstruct gmax {\n  constexpr T operator()(const T& lhs, const T& rhs) const {\n    return std::max<T>(lhs, rhs);\n  }\n};\n\n//! gmin is the functional form of std::min\ntemplate <typename T>\nstruct gmin {\n  constexpr T operator()(const T& lhs, const T& rhs) const {\n    return std::min<T>(lhs, rhs);\n  }\n};\n\ntemplate <typename T, T value>\nstruct identity_value {\n  constexpr T operator()() const { return T{value}; }\n};\n\n// The following identity_value specializations exist because floating point\n// numbers cannot be template arguments.\n\ntemplate <typename T>\nstruct identity_value_zero {\n  constexpr T operator()() const { return T{0}; }\n};\n\ntemplate <typename T>\nstruct identity_value_min {\n  constexpr T operator()() const { return std::numeric_limits<T>::min(); }\n};\n\ntemplate <typename T>\nstruct identity_value_max {\n  constexpr T operator()() const { return std::numeric_limits<T>::max(); }\n};\n\n//! 
Accumulator for T where accumulation is plus\ntemplate <typename T>\nclass GAccumulator : public Reducible<T, std::plus<T>, identity_value_zero<T>> {\n  using base_type = Reducible<T, std::plus<T>, identity_value_zero<T>>;\n\npublic:\n  GAccumulator() : base_type(std::plus<T>(), identity_value_zero<T>()) {}\n\n  GAccumulator& operator+=(const T& rhs) {\n    base_type::update(rhs);\n    return *this;\n  }\n\n  GAccumulator& operator-=(const T& rhs) {\n    base_type::update(rhs);\n    return *this;\n  }\n};\n\n//! Accumulator for T where accumulation is max\ntemplate <typename T>\nclass GReduceMax : public Reducible<T, gmax<T>, identity_value_min<T>> {\n  using base_type = Reducible<T, gmax<T>, identity_value_min<T>>;\n\npublic:\n  GReduceMax() : base_type(gmax<T>(), identity_value_min<T>()) {}\n};\n\n//! Accumulator for T where accumulation is min\ntemplate <typename T>\nclass GReduceMin : public Reducible<T, gmin<T>, identity_value_max<T>> {\n  using base_type = Reducible<T, gmin<T>, identity_value_max<T>>;\n\npublic:\n  GReduceMin() : base_type(gmin<T>(), identity_value_max<T>()) {}\n};\n\n//! logical AND reduction\nclass GReduceLogicalAnd : public Reducible<bool, std::logical_and<bool>,\n                                           identity_value<bool, true>> {\n  using base_type =\n      Reducible<bool, std::logical_and<bool>, identity_value<bool, true>>;\n\npublic:\n  GReduceLogicalAnd()\n      : base_type(std::logical_and<bool>(), identity_value<bool, true>()) {}\n};\n\n//! logical OR reduction\nclass GReduceLogicalOr : public Reducible<bool, std::logical_or<bool>,\n                                          identity_value<bool, false>> {\n  using base_type =\n      Reducible<bool, std::logical_or<bool>, identity_value<bool, false>>;\n\npublic:\n  GReduceLogicalOr()\n      : base_type(std::logical_or<bool>(), identity_value<bool, false>()) {}\n};\n\n} // namespace galois\n#endif // GALOIS_REDUCTION_H\n"
  },
  {
    "path": "libgalois/include/galois/SharedMemSys.h",
    "content": "#ifndef GALOIS_SHAREDMEMSYS_H\n#define GALOIS_SHAREDMEMSYS_H\n\n#include \"galois/config.h\"\n#include \"galois/runtime/SharedMem.h\"\n\nnamespace galois {\n\n/**\n * SharedMemSys is an explicit class to initialize the Galois runtime. The\n * runtime is destroyed when this object is destroyed.\n */\nclass SharedMemSys : public runtime::SharedMem<runtime::StatManager> {\n\npublic:\n  explicit SharedMemSys();\n  ~SharedMemSys();\n\n  SharedMemSys(const SharedMemSys&) = delete;\n  SharedMemSys& operator=(const SharedMemSys&) = delete;\n\n  SharedMemSys(SharedMemSys&&) = delete;\n  SharedMemSys& operator=(SharedMemSys&&) = delete;\n};\n\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/Threads.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_THREADS_H\n#define GALOIS_THREADS_H\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\n/**\n * Sets the number of threads to use when running any Galois iterator. Returns\n * the actual value of threads used, which could be less than the requested\n * value. System behavior is undefined if this function is called during\n * parallel execution or after the first parallel execution.\n */\nunsigned int setActiveThreads(unsigned int num) noexcept;\n\n/**\n * Returns the number of threads in use.\n */\nunsigned int getActiveThreads() noexcept;\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/Timer.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_TIMER_H\n#define GALOIS_TIMER_H\n\n#include <chrono>\n\n#include \"galois/config.h\"\n#include \"galois/gstl.h\"\n\nnamespace galois {\n\n//! A simple timer\nclass Timer {\n  typedef std::chrono::steady_clock clockTy;\n  // typedef std::chrono::high_resolution_clock clockTy;\n  std::chrono::time_point<clockTy> startT, stopT;\n\npublic:\n  void start();\n  void stop();\n  uint64_t get() const;\n  uint64_t get_usec() const;\n};\n\n//! A multi-start time accumulator.\n//! Gives the final runtime for a series of intervals\nclass TimeAccumulator {\n  Timer ltimer;\n  uint64_t acc;\n\npublic:\n  TimeAccumulator();\n\n  void start();\n  //! 
adds the current timed interval to the total\n  void stop();\n  uint64_t get() const;\n  uint64_t get_usec() const;\n  TimeAccumulator& operator+=(const TimeAccumulator& rhs);\n  TimeAccumulator& operator+=(const Timer& rhs);\n};\n\n//! Galois Timer that automatically reports stats upon destruction\n//! Provides statistic interface around timer\nclass StatTimer : public TimeAccumulator {\n  gstl::Str name_;\n  gstl::Str region_;\n  bool valid_;\n\npublic:\n  StatTimer(const char* name, const char* region);\n\n  StatTimer(const char* const n) : StatTimer(n, nullptr) {}\n\n  StatTimer() : StatTimer(nullptr, nullptr) {}\n\n  StatTimer(const StatTimer&) = delete;\n  StatTimer(StatTimer&&)      = delete;\n  StatTimer& operator=(const StatTimer&) = delete;\n  StatTimer& operator=(StatTimer&&) = delete;\n\n  ~StatTimer();\n\n  void start();\n  void stop();\n  uint64_t get_usec() const;\n};\n\ntemplate <bool Enable>\nclass CondStatTimer : public StatTimer {\npublic:\n  CondStatTimer(const char* const n, const char* region)\n      : StatTimer(n, region) {}\n\n  CondStatTimer(const char* region) : CondStatTimer(\"Time\", region) {}\n};\n\ntemplate <>\nclass CondStatTimer<false> {\npublic:\n  CondStatTimer(const char*) {}\n  CondStatTimer(const char* const, const char*) {}\n\n  void start() const {}\n  void stop() const {}\n  uint64_t get_usec() const { return 0; }\n};\n\ntemplate <typename F>\nvoid timeThis(const F& f, const char* const name) {\n  StatTimer t(\"Time\", name);\n\n  t.start();\n\n  f();\n\n  t.stop();\n}\n\n} // end namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/Traits.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_TRAITS_H\n#define GALOIS_TRAITS_H\n\n#include <tuple>\n#include <type_traits>\n\n#include \"galois/config.h\"\n#include \"galois/worklists/WorkList.h\"\n\nnamespace galois {\n\n// Trait classifications\n\ntemplate <typename T>\nstruct trait_has_type {\n  typedef T type;\n};\n\ntemplate <typename T>\nstruct trait_has_value {\n  typedef T type;\n  type value;\n  trait_has_value(const type& v) : value(v) {}\n  trait_has_value(type&& v) : value(std::move(v)) {}\n  T getValue() const { return value; }\n};\n\ntemplate <typename T, T V>\nstruct trait_has_svalue {\n  typedef T type;\n  static const type value = V;\n  T getValue() const { return V; }\n};\n\n/**\n * Utility function to simplify creating traits that take unnamed functions\n * (i.e., lambdas).\n */\ntemplate <template <typename...> class TT, 
typename... Args>\nauto make_trait_with_args(Args... args) -> TT<Args...> {\n  return TT<Args...>(args...);\n}\n\n/**\n * True if Derived is derived from Base or is Base itself.\n *\n * A matching trait is any type that inherits from a trait.\n */\ntemplate <typename Base, typename Derived>\nconstexpr bool at_least_base_of =\n    std::is_base_of<Base, Derived>::value || std::is_same<Base, Derived>::value;\n\n/**\n * Returns index of first matching trait in Tuple.\n *\n * This function is not well-defined if there is no matching trait.\n */\ntemplate <typename T, typename Tuple, size_t Int, size_t... Ints>\nconstexpr size_t find_trait(std::index_sequence<Int, Ints...> /*seq*/) {\n  if constexpr (at_least_base_of<\n                    T, typename std::tuple_element<Int, Tuple>::type>) {\n    return Int;\n  } else {\n    return find_trait<T, Tuple>(std::index_sequence<Ints...>{});\n  }\n}\n\ntemplate <typename T, typename Tuple>\nconstexpr size_t find_trait() {\n  constexpr std::make_index_sequence<std::tuple_size<Tuple>::value> seq{};\n  return find_trait<T, Tuple>(seq);\n}\n\n/**\n * Returns true if the tuple type contains the given trait T.\n */\ntemplate <typename T, typename... Ts>\nconstexpr bool has_trait(std::tuple<Ts...>* /*tpl*/) {\n  return (... 
|| at_least_base_of<T, Ts>);\n}\n\ntemplate <typename T, typename Tuple>\nconstexpr bool has_trait() {\n  return has_trait<T>(static_cast<Tuple*>(nullptr));\n}\n\n/**\n * Returns the value associated with the given trait T in a tuple.\n *\n * This function is not well-defined when there is no matching trait.\n */\ntemplate <typename T, typename Tuple>\nconstexpr auto get_trait_value(Tuple tpl) {\n  constexpr size_t match(find_trait<T, Tuple>());\n  return std::get<match>(tpl);\n}\n\n/**\n * Returns the type associated with the given trait in a tuple.\n */\ntemplate <typename T, typename Tuple>\nstruct get_trait_type {\n  using type = typename std::tuple_element<find_trait<T, Tuple>(), Tuple>::type;\n};\n\n// Fallback to enable_if tricks over if constexpr to play more nicely with\n// unused parameter warnings.\n\ntemplate <typename S, typename T, typename D>\nconstexpr auto get_default_trait_value(\n    S /*source*/, T /*tag*/, D /*def*/,\n    typename std::enable_if<has_trait<T, S>()>::type* = nullptr) {\n  return std::make_tuple();\n}\n\ntemplate <typename S, typename T, typename D>\nconstexpr auto get_default_trait_value(\n    S GALOIS_UNUSED(source), T GALOIS_UNUSED(tags), D defaults,\n    typename std::enable_if<!has_trait<T, S>()>::type* = nullptr) {\n  return std::make_tuple(defaults);\n}\n\n/**\n * Returns a tuple that has an element from defaults[i] for every type\n * from tags[i] missing in source.\n */\ntemplate <typename S, typename T, typename D>\nconstexpr auto\nget_default_trait_values(std::index_sequence<> GALOIS_UNUSED(seq),\n                         S GALOIS_UNUSED(source), T GALOIS_UNUSED(tags),\n                         D GALOIS_UNUSED(defaults)) {\n  return std::make_tuple();\n}\n\ntemplate <size_t... 
Ints, typename S, typename T, typename D>\nconstexpr auto\nget_default_trait_values(std::index_sequence<Ints...> GALOIS_UNUSED(seq),\n                         S source, T tags, D defaults) {\n  return std::tuple_cat(get_default_trait_value(source, std::get<Ints>(tags),\n                                                std::get<Ints>(defaults))...);\n}\n\ntemplate <typename S, typename T, typename D>\nconstexpr auto get_default_trait_values(S source, T tags, D defaults) {\n  constexpr std::make_index_sequence<std::tuple_size<T>::value> seq{};\n  return get_default_trait_values(seq, source, tags, defaults);\n}\n\ntemplate <typename T>\nconstexpr auto has_function_traits(int)\n    -> decltype(std::declval<typename T::function_traits>(), bool()) {\n  return true;\n}\n\ntemplate <typename>\nconstexpr auto has_function_traits(...) -> bool {\n  return false;\n}\n\ntemplate <typename T, typename Enable = void>\nstruct function_traits {\n  typedef std::tuple<> type;\n};\n\ntemplate <typename T>\nstruct function_traits<\n    T, typename std::enable_if<has_function_traits<T>(0)>::type> {\n  typedef typename T::function_traits type;\n};\n\n// Traits\n\n/**\n * Indicate name to appear in statistics. Optional argument to {@link do_all()}\n * and {@link for_each()} loops.\n */\nstruct loopname_tag {};\nstruct loopname : public trait_has_value<const char*>, loopname_tag {\n  loopname(const char* p = \"ANON_LOOP\") : trait_has_value<const char*>(p) {}\n};\n\n/**\n * Indicate whether @{link do_all()} loops should perform work-stealing.\n * Optional argument to {@link do_all()} loops.\n */\nstruct steal_tag {};\nstruct steal : public trait_has_type<bool>, steal_tag {};\n\n/**\n * Indicates worklist to use. Optional argument to {@link for_each()} loops.\n */\nstruct wl_tag {};\ntemplate <typename T, typename... Args>\nstruct s_wl : public trait_has_type<T>, wl_tag {\n  std::tuple<Args...> args;\n  s_wl(Args&&... a) : args(std::forward<Args>(a)...) 
{}\n};\n\ntemplate <typename T, typename... Args>\ns_wl<T, Args...> wl(Args&&... args) {\n  return s_wl<T, Args...>(std::forward<Args>(args)...);\n}\n\n//\n/**\n * Indicates the operator may request the parallel loop to be suspended and a\n * given function run in serial\n */\nstruct parallel_break_tag {};\nstruct parallel_break : public trait_has_type<bool>, parallel_break_tag {};\n\n/**\n * Indicates the operator does not generate new work and push it on the worklist\n */\nstruct no_pushes_tag {};\nstruct no_pushes : public trait_has_type<bool>, no_pushes_tag {};\n\n/**\n * Indicates the operator may request the access to a per-iteration allocator\n */\nstruct per_iter_alloc_tag {};\nstruct per_iter_alloc : public trait_has_type<bool>, per_iter_alloc_tag {};\n\n/**\n * Indicates the operator doesn't need its execution stats recorded\n */\nstruct no_stats_tag {};\nstruct no_stats : public trait_has_type<bool>, no_stats_tag {};\n\n/**\n * Indicates the operator needs detailed stats\n * Must provide loopname to enable this flag\n */\nstruct more_stats_tag {};\nstruct more_stats : public trait_has_type<bool>, more_stats_tag {};\n\n/**\n * Indicates the operator doesn't need abort support\n */\nstruct disable_conflict_detection_tag {};\nstruct disable_conflict_detection : public trait_has_type<bool>,\n                                    disable_conflict_detection_tag {};\n\n/**\n * Indicates that the neighborhood set does not change through out i.e. is not\n * dependent on computed values. 
Examples of such fixed neighborhood is e.g.\n * the neighborhood being all the neighbors of a node in the input graph,\n * while the counter example is the neighborhood being some of the neighbors\n * based on some predicate.\n */\nstruct fixed_neighborhood_tag {};\nstruct fixed_neighborhood : public trait_has_type<bool>,\n                            fixed_neighborhood_tag {};\n\n/**\n * Indicates that the operator uses the intent to read flag.\n */\nstruct intent_to_read_tag {};\nstruct intent_to_read : public trait_has_type<bool>, intent_to_read_tag {};\n\n/**\n * Indicates the operator has a function that visits the neighborhood of the\n * operator without modifying it.\n */\nstruct neighborhood_visitor_tag {};\ntemplate <typename T>\nstruct neighborhood_visitor : public trait_has_value<T>,\n                              neighborhood_visitor_tag {\n  neighborhood_visitor(const T& t = T{}) : trait_has_value<T>(t) {}\n  neighborhood_visitor(T&& t) : trait_has_value<T>(std::move(t)) {}\n};\n\n/**\n * Indicates the operator has a function that allows a {@link\n * galois::for_each} loop to be exited deterministically.\n *\n * The function should have the signature <code>bool()</code>.\n *\n * It will be periodically called by the deterministic scheduler.  If it\n * returns true, the loop ends as if calling {@link UserContext::breakLoop},\n * but unlike that function, these breaks are deterministic.\n */\nstruct det_parallel_break_tag {};\ntemplate <typename T>\nstruct det_parallel_break : public trait_has_value<T>, det_parallel_break_tag {\n  static_assert(std::is_same<typename std::result_of<T()>::type, bool>::value,\n                \"signature must be bool()\");\n  det_parallel_break(const T& t = T()) : trait_has_value<T>(t) {}\n  det_parallel_break(T&& t) : trait_has_value<T>(std::move(t)) {}\n};\n\n/**\n * Indicates the operator has a function that optimizes the generation of\n * unique ids for active elements. 
This function should be thread-safe.\n *\n * The function should have the signature <code>uintptr_t (A)</code> where\n * A is the type of active elements.\n */\nstruct det_id_tag {};\ntemplate <typename T>\nstruct det_id : public trait_has_value<T>, det_id_tag {\n  det_id(const T& t = T()) : trait_has_value<T>(t) {}\n  det_id(T&& t) : trait_has_value<T>(std::move(t)) {}\n};\n\n/**\n * Indicates the operator has a type that encapsulates state that is passed\n * between the suspension and resumpsion of an operator during deterministic\n * scheduling.\n */\nstruct local_state_tag {};\ntemplate <typename T>\nstruct local_state : public trait_has_type<T>, local_state_tag {};\n\n// TODO: separate to libdist\n/** For distributed Galois **/\nstruct op_tag {};\n\nstruct chunk_size_tag {\n  enum { MIN = 1, MAX = 4096 };\n};\n\n/**\n * Specify chunk size for do_all_coupled & do_all_choice at compile time or at\n * runtime.\n *\n * For compile time, use the template argument, e.g., galois::chunk_size<16> ()\n * Additionally, user may provide a runtime argument, e.g,\n * galois::chunk_size<16> (8)\n *\n * Currently, only do_all_coupled can take advantage of the runtime argument.\n * TODO: allow runtime provision/tuning of chunk_size in other loop executors\n *\n * chunk size is clamped to within [chunk_size_tag::MIN, chunk_size_tag::MAX]\n */\ntemplate <unsigned SZ = 32>\nstruct chunk_size : public trait_has_value<unsigned>, chunk_size_tag {\nprivate:\n  constexpr static unsigned clamp(unsigned int v) {\n    return std::min(std::max(v, unsigned{chunk_size_tag::MIN}),\n                    unsigned{chunk_size_tag::MAX});\n  }\n\npublic:\n  constexpr static unsigned value = clamp(SZ);\n\n  chunk_size(unsigned cs = SZ) : trait_has_value(clamp(cs)) {}\n};\n\ntypedef worklists::PerSocketChunkFIFO<chunk_size<>::value> defaultWL;\n\nnamespace internal {\n\ntemplate <typename Tup>\nstruct NeedStats {\n  constexpr static const bool value =\n      !has_trait<no_stats_tag, Tup>() && 
has_trait<loopname_tag, Tup>();\n};\n\ntemplate <typename Tup>\nstd::enable_if_t<has_trait<loopname_tag, Tup>(), const char*>\ngetLoopName(const Tup& t) {\n  return get_trait_value<loopname_tag>(t).value;\n}\n\ntemplate <typename Tup>\nstd::enable_if_t<!has_trait<loopname_tag, Tup>(), const char*>\ngetLoopName(const Tup&) {\n  return \"ANON_LOOP\";\n}\n} // namespace internal\n\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/TwoLevelIterator.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_TWO_LEVEL_ITER_H\n#define GALOIS_TWO_LEVEL_ITER_H\n\n#include <cassert>\n#include <cstdlib>\n#include <functional>\n#include <iterator>\n#include <type_traits>\n\n#include <cstdlib>\n#include <cassert>\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\nnamespace internal {\ntemplate <typename Iter>\nvoid safe_decrement(Iter& it, const Iter& beg,\n                    const Iter& GALOIS_USED_ONLY_IN_DEBUG(end),\n                    std::forward_iterator_tag) {\n\n  Iter next = beg;\n  Iter curr(next);\n\n  while (next != it) {\n    curr = next;\n    assert(next != end);\n    ++next;\n  }\n\n  assert(next == it);\n  assert(curr != it);\n\n  it = curr;\n}\n\ntemplate <typename Iter>\nvoid safe_decrement(Iter& it, const Iter& GALOIS_USED_ONLY_IN_DEBUG(beg),\n                    const Iter&, 
std::bidirectional_iterator_tag) {\n  assert(it != beg);\n  --it;\n}\n\ntemplate <typename Iter>\nvoid safe_decrement(Iter& it, const Iter& beg, const Iter& end) {\n  safe_decrement(it, beg, end,\n                 typename std::iterator_traits<Iter>::iterator_category());\n}\n} // namespace internal\n\n//! Common functionality of TwoLevelIterators\ntemplate <typename Outer, typename Inner, typename InnerBegFn,\n          typename InnerEndFn>\nclass TwoLevelIterBase {\n\nprotected:\n  // TODO: make begin and end const\n  Outer m_beg_outer;\n  Outer m_end_outer;\n  Outer m_outer;\n\n  Inner m_beg_inner;\n  Inner m_end_inner;\n  Inner m_inner;\n\n  InnerBegFn innerBegFn;\n  InnerEndFn innerEndFn;\n\n  inline bool outerAtBegin() const { return m_outer == m_beg_outer; }\n\n  inline bool outerAtEnd() const { return m_outer == m_end_outer; }\n\n  inline bool outerEmpty() const { return m_beg_outer == m_end_outer; }\n\n  inline const Inner& getInnerBegin() const { return m_beg_inner; }\n\n  inline const Inner& getInnerEnd() const { return m_end_inner; }\n\n  inline void setInnerAtBegin(void) {\n    assert(!outerAtEnd());\n    m_inner = m_beg_inner = innerBegFn(*m_outer);\n    m_end_inner           = innerEndFn(*m_outer);\n  }\n\n  inline void setInnerAtEnd(void) {\n    assert(!outerAtEnd());\n    m_beg_inner = innerBegFn(*m_outer);\n    m_inner = m_end_inner = innerEndFn(*m_outer);\n  }\n\n  inline bool innerAtBegin() const {\n    assert(m_beg_inner == innerBegFn(*m_outer));\n    return m_inner == m_beg_inner;\n  }\n\n  inline bool innerAtEnd() const {\n    assert(m_end_inner == innerEndFn(*m_outer));\n    return m_inner == m_end_inner;\n  }\n\n  TwoLevelIterBase()\n      : m_beg_outer(), m_end_outer(), m_outer(), m_beg_inner(), m_end_inner(),\n        m_inner(), innerBegFn(), innerEndFn() {}\n\n  TwoLevelIterBase(Outer beg_outer, Outer end_outer, Outer outer_pos,\n                   InnerBegFn innerBegFn, InnerEndFn innerEndFn)\n      : m_beg_outer(beg_outer), 
m_end_outer(end_outer), m_outer(outer_pos),\n        m_beg_inner(), m_end_inner(), m_inner(), innerBegFn(innerBegFn),\n        innerEndFn(innerEndFn) {}\n};\n\n//! Two-Level forward iterator\ntemplate <typename Outer, typename Inner, typename InnerBegFn,\n          typename InnerEndFn>\nclass TwoLevelFwdIter\n    : public std::iterator_traits<Inner>,\n      public TwoLevelIterBase<Outer, Inner, InnerBegFn, InnerEndFn> {\n\nprotected:\n  typedef std::iterator_traits<Inner> Traits;\n  typedef TwoLevelIterBase<Outer, Inner, InnerBegFn, InnerEndFn> Base;\n\n  void nextOuter() {\n    assert(!Base::outerAtEnd());\n    assert(!Base::outerEmpty());\n    ++Base::m_outer;\n    if (!Base::outerAtEnd()) {\n\n      Base::setInnerAtBegin();\n      // Base::m_inner = Base::innerBegin ();\n    }\n  }\n\n  void seekValidBegin() {\n    while (!Base::outerAtEnd() && Base::innerAtEnd()) {\n      nextOuter();\n    }\n  }\n\n  void step_forward() {\n    assert(!Base::innerAtEnd());\n    ++Base::m_inner;\n\n    if (Base::innerAtEnd()) {\n      seekValidBegin();\n    }\n  }\n\n  bool is_equal(const TwoLevelFwdIter& that) const {\n    // the outer iterators of 'this' and 'that' have been initialized\n    // with either (beg,end), or, (end, end)\n    //  - for two level begin, outer is initialized to (beg,end)\n    //  - for two level end, outer is initialized to (end, end)\n    assert(this->m_end_outer == that.m_end_outer);\n\n    return (this->m_outer == that.m_outer) &&\n           (Base::outerAtEnd() || (this->m_inner == that.m_inner));\n  }\n\npublic:\n  TwoLevelFwdIter() : Base() {}\n\n  TwoLevelFwdIter(Outer beg_outer, Outer end_outer, Outer outer_pos,\n                  InnerBegFn innerBegFn, InnerEndFn innerEndFn)\n      : Base(beg_outer, end_outer, outer_pos, innerBegFn, innerEndFn) {\n\n    if (!Base::outerAtEnd()) {\n      // Base::m_inner = Base::innerBegin ();\n      Base::setInnerAtBegin();\n      seekValidBegin();\n    }\n  }\n\n  typename Traits::reference operator*() const 
{ return *Base::m_inner; }\n\n  typename Traits::pointer operator->() const {\n    return Base::m_inner->operator->();\n  }\n\n  TwoLevelFwdIter& operator++() {\n    step_forward();\n    return *this;\n  }\n\n  TwoLevelFwdIter operator++(int) {\n    TwoLevelFwdIter tmp(*this);\n    step_forward();\n    return tmp;\n  }\n\n  friend bool operator==(const TwoLevelFwdIter& left,\n                         const TwoLevelFwdIter& right) {\n    return left.is_equal(right);\n  }\n\n  friend bool operator!=(const TwoLevelFwdIter& left,\n                         const TwoLevelFwdIter& right) {\n    return !left.is_equal(right);\n  }\n};\n\n//! Two-Level bidirectional iterator\ntemplate <typename Outer, typename Inner, typename InnerBegFn,\n          typename InnerEndFn>\nclass TwoLevelBiDirIter\n    : public TwoLevelFwdIter<Outer, Inner, InnerBegFn, InnerEndFn> {\n\nprotected:\n  typedef TwoLevelFwdIter<Outer, Inner, InnerBegFn, InnerEndFn> FwdBase;\n\nprotected:\n  void prevOuter() {\n    assert(!FwdBase::outerAtBegin());\n    assert(!FwdBase::outerEmpty());\n\n    internal::safe_decrement(FwdBase::m_outer, FwdBase::m_beg_outer,\n                             FwdBase::m_end_outer);\n\n    // FwdBase::m_inner = FwdBase::innerEnd ();\n    FwdBase::setInnerAtEnd();\n  }\n\n  void step_backward() {\n    assert(!FwdBase::outerEmpty());\n\n    // assert (!FwdBase::outerAtBegin ());\n\n    // calling innerBegin when m_outer == m_end_outer is invalid\n    // so call prevOuter first, and check for innerBegin afterwards\n\n    if (FwdBase::outerAtEnd()) {\n      prevOuter();\n    }\n\n    while (FwdBase::innerAtBegin()) {\n      assert(!FwdBase::outerAtBegin());\n      prevOuter();\n    }\n\n    assert(FwdBase::innerAtBegin() ? 
FwdBase::outerAtBegin() : true);\n\n    --FwdBase::m_inner;\n  }\n\npublic:\n  TwoLevelBiDirIter() : FwdBase() {}\n\n  TwoLevelBiDirIter(Outer beg_outer, Outer end_outer, Outer outer_pos,\n                    InnerBegFn innerBegFn, InnerEndFn innerEndFn)\n      : FwdBase(beg_outer, end_outer, outer_pos, innerBegFn, innerEndFn) {}\n\n  TwoLevelBiDirIter& operator--() {\n    step_backward();\n    return *this;\n  }\n\n  TwoLevelBiDirIter operator--(int) {\n    TwoLevelBiDirIter tmp(*this);\n    step_backward();\n    return tmp;\n  }\n};\n\n//! Two-Level random access iterator\ntemplate <typename Outer, typename Inner, typename InnerBegFn,\n          typename InnerEndFn>\nclass TwoLevelRandIter\n    : public TwoLevelBiDirIter<Outer, Inner, InnerBegFn, InnerEndFn> {\n\nprotected:\n  typedef TwoLevelBiDirIter<Outer, Inner, InnerBegFn, InnerEndFn> BiDirBase;\n\n  typedef typename BiDirBase::Traits::difference_type Diff_ty;\n\n  void jump_forward(const Diff_ty d) {\n    assert(!BiDirBase::outerEmpty());\n\n    if (d < 0) {\n      jump_backward(-d);\n\n    } else {\n      Diff_ty rem(d);\n\n      while (rem > 0) {\n        assert(!BiDirBase::outerAtEnd());\n\n        Diff_ty avail =\n            std::distance(BiDirBase::m_inner, BiDirBase::getInnerEnd());\n        assert(avail >= 0);\n\n        if (rem > avail) {\n          rem -= avail;\n          assert(!BiDirBase::outerAtEnd());\n          BiDirBase::nextOuter();\n\n        } else {\n          BiDirBase::m_inner += rem;\n          rem = 0;\n        }\n\n        BiDirBase::seekValidBegin();\n      }\n    }\n  }\n\n  void jump_backward(const Diff_ty d) {\n    assert(!BiDirBase::outerEmpty());\n\n    if (d < 0) {\n      jump_forward(-d);\n\n    } else {\n\n      Diff_ty rem(d);\n\n      if ((rem > 0) && BiDirBase::outerAtEnd()) {\n        BiDirBase::prevOuter();\n      }\n\n      while (rem > 0) {\n        Diff_ty avail =\n            std::distance(BiDirBase::getInnerBegin(), BiDirBase::m_inner);\n        assert(avail >= 
0);\n\n        if (rem > avail) {\n          rem -= avail;\n          assert(!BiDirBase::outerAtBegin());\n          BiDirBase::prevOuter();\n\n        } else {\n\n          BiDirBase::m_inner -= rem;\n          rem = 0;\n          break;\n        }\n      }\n    }\n  }\n\n  Diff_ty compute_dist(const TwoLevelRandIter& that) const {\n\n    if (std::distance(this->m_outer, that.m_outer) <\n        0) { // this->m_outer > that.m_outer\n      return -(that.compute_dist(*this));\n\n    } else if (this->m_outer == that.m_outer) {\n      if (!BiDirBase::outerAtEnd()) {\n        return std::distance(this->m_inner, that.m_inner);\n\n      } else {\n        return 0;\n      }\n\n    } else {\n\n      assert(std::distance(this->m_outer, that.m_outer) >\n             0); // this->m_outer < that.m_outer;\n      assert(!BiDirBase::outerAtEnd());\n\n      TwoLevelRandIter tmp(*this);\n\n      Diff_ty d = tmp.m_inner - tmp.m_inner; // 0\n\n      while (tmp.m_outer != that.m_outer) {\n        d += std::distance(tmp.m_inner, tmp.getInnerEnd());\n        tmp.nextOuter();\n      }\n\n      assert(tmp.m_outer == that.m_outer);\n\n      if (tmp.m_outer != tmp.m_end_outer) {\n        d += std::distance(tmp.m_inner, that.m_inner);\n      }\n\n      assert(d >= 0);\n\n      return d;\n    }\n  }\n\npublic:\n  TwoLevelRandIter() : BiDirBase() {}\n\n  TwoLevelRandIter(Outer beg_outer, Outer end_outer, Outer outer_pos,\n                   InnerBegFn innerBegFn, InnerEndFn innerEndFn)\n      : BiDirBase(beg_outer, end_outer, outer_pos, innerBegFn, innerEndFn) {}\n\n  TwoLevelRandIter& operator+=(Diff_ty d) {\n    jump_forward(d);\n    return *this;\n  }\n\n  TwoLevelRandIter& operator-=(Diff_ty d) {\n    jump_backward(d);\n    return *this;\n  }\n\n  friend TwoLevelRandIter operator+(const TwoLevelRandIter& it, Diff_ty d) {\n    TwoLevelRandIter tmp(it);\n    tmp += d;\n    return tmp;\n  }\n\n  friend TwoLevelRandIter operator+(Diff_ty d, const TwoLevelRandIter& it) {\n    return (it + d);\n 
 }\n\n  friend TwoLevelRandIter operator-(const TwoLevelRandIter& it, Diff_ty d) {\n    TwoLevelRandIter tmp(it);\n    tmp -= d;\n    return tmp;\n  }\n\n  friend Diff_ty operator-(const TwoLevelRandIter& left,\n                           const TwoLevelRandIter& right) {\n\n    return right.compute_dist(left);\n  }\n\n  typename BiDirBase::Traits::reference operator[](Diff_ty d) const {\n    return *((*this) + d);\n  }\n\n  friend bool operator<(const TwoLevelRandIter& left,\n                        const TwoLevelRandIter& right) {\n    return ((left.m_outer == right.m_outer) ? (left.m_inner < right.m_inner)\n                                            : (left.m_outer < right.m_outer));\n  }\n\n  friend bool operator<=(const TwoLevelRandIter& left,\n                         const TwoLevelRandIter& right) {\n    return (left < right) || (left == right);\n  }\n\n  friend bool operator>(const TwoLevelRandIter& left,\n                        const TwoLevelRandIter& right) {\n    return !(left <= right);\n  }\n\n  friend bool operator>=(const TwoLevelRandIter& left,\n                         const TwoLevelRandIter& right) {\n    return !(left < right);\n  }\n};\n\nnamespace internal {\n\ntemplate <typename Outer, typename Inner, typename InnerBegFn,\n          typename InnerEndFn, typename Cat>\nstruct ByCategory {};\n\ntemplate <typename Outer, typename Inner, typename InnerBegFn,\n          typename InnerEndFn>\nstruct ByCategory<Outer, Inner, InnerBegFn, InnerEndFn,\n                  std::forward_iterator_tag> {\n  typedef TwoLevelFwdIter<Outer, Inner, InnerBegFn, InnerEndFn> type;\n};\n\ntemplate <typename Outer, typename Inner, typename InnerBegFn,\n          typename InnerEndFn>\nstruct ByCategory<Outer, Inner, InnerBegFn, InnerEndFn,\n                  std::bidirectional_iterator_tag> {\n  typedef TwoLevelBiDirIter<Outer, Inner, InnerBegFn, InnerEndFn> type;\n};\n\ntemplate <typename Outer, typename Inner, typename InnerBegFn,\n          typename 
InnerEndFn>\nstruct ByCategory<Outer, Inner, InnerBegFn, InnerEndFn,\n                  std::random_access_iterator_tag> {\n  typedef TwoLevelRandIter<Outer, Inner, InnerBegFn, InnerEndFn> type;\n};\n\n// template <typename Outer, typename Inner>\n// struct IsRvrsIter {\n//\n// template <typename O, typename I>\n// struct IsRev {\n// static const bool VAL = false;\n// };\n//\n// template <typename O>\n// struct IsRev<O, typename O::value_type::reverse_iterator> {\n// static const bool VAL = true;\n// };\n//\n// template <typename O, typename I>\n// struct IsConstRev {\n// static const bool VAL = false;\n// };\n//\n// template <typename O>\n// struct IsConstRev<O, typename O::value_type::const_reverse_iterator> {\n// static const bool VAL = true;\n// };\n//\n//\n// static const bool VAL =\n// IsRev<Outer, Inner>::VAL || IsConstRev<Outer, Inner>::VAL;\n// };\n\n} // namespace internal\n\n//! Type function to select appropriate two-level iterator\ntemplate <typename Outer, typename Inner, typename InnerBegFn,\n          typename InnerEndFn>\nstruct ChooseTwoLevelIterator {\nprivate:\n  // typedef typename std::iterator_traits<Outer>::iterator_category CatOuter;\n  typedef typename std::iterator_traits<Inner>::iterator_category CatInner;\n\npublic:\n  typedef typename internal::ByCategory<Outer, Inner, InnerBegFn, InnerEndFn,\n                                        CatInner>::type type;\n};\n\n//! 
Creates two level iterator\ntemplate <typename Outer, typename InnerBegFn, typename InnerEndFn>\ntypename ChooseTwoLevelIterator<Outer, typename InnerBegFn::result_type,\n                                InnerBegFn, InnerEndFn>::type\nmake_two_level_begin(Outer beg, Outer end, InnerBegFn innerBegFn,\n                     InnerEndFn innerEndFn) {\n#ifndef NDEBUG\n  const bool V = std::is_same<typename InnerBegFn::result_type,\n                              typename InnerEndFn::result_type>::value;\n  assert(V);\n#endif\n\n  typedef typename InnerBegFn::result_type Inner;\n  typedef typename ChooseTwoLevelIterator<Outer, Inner, InnerBegFn,\n                                          InnerEndFn>::type Ret_ty;\n\n  return Ret_ty(beg, end, beg, innerBegFn, innerEndFn);\n}\n\n//! Creates two level iterator\ntemplate <typename Outer, typename InnerBegFn, typename InnerEndFn>\ntypename ChooseTwoLevelIterator<Outer, typename InnerBegFn::result_type,\n                                InnerBegFn, InnerEndFn>::type\nmake_two_level_end(Outer beg, Outer end, InnerBegFn innerBegFn,\n                   InnerEndFn innerEndFn) {\n  // const bool V = std::is_same<typename InnerBegFn::result_type, typename\n  // InnerEndFn::result_type>::value; static_assert (V);\n\n  typedef typename InnerBegFn::result_type Inner;\n  typedef typename ChooseTwoLevelIterator<Outer, Inner, InnerBegFn,\n                                          InnerEndFn>::type Ret_ty;\n\n  return Ret_ty(beg, end, end, innerBegFn, innerEndFn);\n}\n\nnamespace internal {\ntemplate <typename C>\nstruct GetBegin {\n  inline typename C::iterator operator()(C& c) const { return c.begin(); }\n};\n\ntemplate <typename C>\nstruct GetEnd {\n  inline typename C::iterator operator()(C& c) const { return c.end(); }\n};\n\n// TODO: update to c++11 names\ntemplate <typename C>\nstruct GetCbegin {\n  inline typename C::const_iterator operator()(const C& c) const {\n    return c.begin();\n  }\n};\n\ntemplate <typename C>\nstruct GetCend 
{\n  inline typename C::const_iterator operator()(const C& c) const {\n    return c.end();\n  }\n};\n\ntemplate <typename C>\nstruct GetRbegin {\n  inline typename C::reverse_iterator operator()(C& c) const {\n    return c.rbegin();\n  }\n};\n\ntemplate <typename C>\nstruct GetRend {\n  inline typename C::reverse_iterator operator()(C& c) const {\n    return c.rend();\n  }\n};\n\n// TODO: update to c++11 names\ntemplate <typename C>\nstruct GetCRbegin {\n  inline typename C::const_reverse_iterator operator()(const C& c) const {\n    return c.rbegin();\n  }\n};\n\ntemplate <typename C>\nstruct GetCRend {\n  inline typename C::const_reverse_iterator operator()(const C& c) const {\n    return c.rend();\n  }\n};\n\nenum StlIterKind { NORMAL, _CONST, REVERSE, _CONST_REVERSE };\n\ntemplate <typename C, typename I>\nstruct IsConstIter {\n  static const bool value = false;\n};\n\ntemplate <typename C>\nstruct IsConstIter<C, typename C::const_iterator> {\n  static const bool value = true;\n};\n\ntemplate <typename C, typename I>\nstruct IsRvrsIter {\n  static const bool value = false;\n};\n\ntemplate <typename C>\nstruct IsRvrsIter<C, typename C::reverse_iterator> {\n  static const bool value = true;\n};\n\ntemplate <typename C, typename I>\nstruct IsRvrsConstIter {\n  static const bool value = false;\n};\n\ntemplate <typename C>\nstruct IsRvrsConstIter<C, typename C::const_reverse_iterator> {\n  static const bool value = true;\n};\n\ntemplate <typename C, typename I>\nstruct GetStlIterKind {\n  static const bool isRvrs =\n      IsRvrsIter<C, I>::value || IsRvrsConstIter<C, I>::value;\n  static const bool isConst =\n      IsConstIter<C, I>::value || IsRvrsConstIter<C, I>::value;\n\n  static const StlIterKind value = isRvrs ? (isConst ? _CONST_REVERSE : REVERSE)\n                                          : (isConst ? 
_CONST : NORMAL);\n};\n\ntemplate <typename C, typename I, enum StlIterKind>\nstruct ChooseStlIter {\n  typedef void Inner;\n};\n\ntemplate <typename C, typename I>\nstruct ChooseStlIter<C, I, NORMAL> {\n\n  typedef typename C::iterator Inner;\n  typedef GetBegin<C> InnerBegFn;\n  typedef GetEnd<C> InnerEndFn;\n};\n\ntemplate <typename C, typename I>\nstruct ChooseStlIter<C, I, _CONST> {\n\n  typedef typename C::const_iterator Inner;\n  typedef GetCbegin<C> InnerBegFn;\n  typedef GetCend<C> InnerEndFn;\n};\n\ntemplate <typename C, typename I>\nstruct ChooseStlIter<C, I, REVERSE> {\n\n  typedef typename C::reverse_iterator Inner;\n  typedef GetRbegin<C> InnerBegFn;\n  typedef GetRend<C> InnerEndFn;\n};\n\ntemplate <typename C, typename I>\nstruct ChooseStlIter<C, I, _CONST_REVERSE> {\n\n  typedef typename C::const_reverse_iterator Inner;\n  typedef GetCRbegin<C> InnerBegFn;\n  typedef GetCRend<C> InnerEndFn;\n};\n\ntemplate <typename Outer, typename Inner>\nstruct ChooseStlTwoLevelIterImpl {\n\n  typedef typename std::iterator_traits<Outer>::value_type C;\n  static const internal::StlIterKind KIND =\n      internal::GetStlIterKind<C, Inner>::value;\n  typedef internal::ChooseStlIter<C, Inner, KIND> CStl;\n  typedef typename CStl::InnerBegFn InnerBegFn;\n  typedef typename CStl::InnerEndFn InnerEndFn;\n  typedef typename ChooseTwoLevelIterator<Outer, Inner, InnerBegFn,\n                                          InnerEndFn>::type type;\n\n  static type make(Outer beg, Outer end, Outer outer_pos) {\n    return type(beg, end, outer_pos, InnerBegFn(), InnerEndFn());\n  }\n};\n\ntemplate <typename Outer>\nstruct StlInnerIsIterator\n    : public ChooseStlTwoLevelIterImpl<\n          Outer, typename std::iterator_traits<Outer>::value_type::iterator> {};\n\ntemplate <typename Outer>\nstruct StlInnerIsConstIterator\n    : public ChooseStlTwoLevelIterImpl<\n          Outer,\n          typename std::iterator_traits<Outer>::value_type::const_iterator> {};\n\ntemplate <typename 
Outer>\nstruct StlInnerIsRvrsIterator\n    : public ChooseStlTwoLevelIterImpl<\n          Outer,\n          typename std::iterator_traits<Outer>::value_type::reverse_iterator> {\n};\n\ntemplate <typename Outer>\nstruct StlInnerIsConstRvrsIterator\n    : public ChooseStlTwoLevelIterImpl<\n          Outer, typename std::iterator_traits<\n                     Outer>::value_type::const_reverse_iterator> {};\n\n} // namespace internal\n\n//! Type function to select appropriate two-level iterator\ntemplate <typename Outer, typename Inner>\nstruct ChooseStlTwoLevelIterator {\n  typedef typename internal::ChooseStlTwoLevelIterImpl<Outer, Inner>::type type;\n};\n\ntemplate <typename Outer>\ntypename internal::StlInnerIsIterator<Outer>::type\nstl_two_level_begin(Outer beg, Outer end) {\n  return internal::StlInnerIsIterator<Outer>::make(beg, end, beg);\n}\n\ntemplate <typename Outer>\ntypename internal::StlInnerIsIterator<Outer>::type\nstl_two_level_end(Outer beg, Outer end) {\n  return internal::StlInnerIsIterator<Outer>::make(beg, end, end);\n}\n\ntemplate <typename Outer>\ntypename internal::StlInnerIsConstIterator<Outer>::type\nstl_two_level_cbegin(Outer beg, Outer end) {\n  return internal::StlInnerIsConstIterator<Outer>::make(beg, end, beg);\n}\n\ntemplate <typename Outer>\ntypename internal::StlInnerIsConstIterator<Outer>::type\nstl_two_level_cend(Outer beg, Outer end) {\n  return internal::StlInnerIsConstIterator<Outer>::make(beg, end, end);\n}\n\ntemplate <typename Outer>\ntypename internal::StlInnerIsRvrsIterator<Outer>::type\nstl_two_level_rbegin(Outer beg, Outer end) {\n  return internal::StlInnerIsRvrsIterator<Outer>::make(beg, end, beg);\n}\n\ntemplate <typename Outer>\ntypename internal::StlInnerIsRvrsIterator<Outer>::type\nstl_two_level_rend(Outer beg, Outer end) {\n  return internal::StlInnerIsRvrsIterator<Outer>::make(beg, end, end);\n}\n\ntemplate <typename Outer>\ntypename internal::StlInnerIsConstRvrsIterator<Outer>::type\nstl_two_level_crbegin(Outer 
beg, Outer end) {\n  return internal::StlInnerIsConstRvrsIterator<Outer>::make(beg, end, beg);\n}\n\ntemplate <typename Outer>\ntypename internal::StlInnerIsConstRvrsIterator<Outer>::type\nstl_two_level_crend(Outer beg, Outer end) {\n  return internal::StlInnerIsConstRvrsIterator<Outer>::make(beg, end, end);\n}\n\n} // end namespace galois\n\n#endif // GALOIS_TWO_LEVEL_ITER_H\n"
  },
  {
    "path": "libgalois/include/galois/TwoLevelIteratorA.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_TWOLEVELITERATORA_H\n#define GALOIS_TWOLEVELITERATORA_H\n\n#include <cassert>\n#include <iterator>\n#include <type_traits>\n#include <utility>\n\n#include <boost/iterator/iterator_adaptor.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n\nnamespace galois {\n\n/**\n * Alternate implementation of {@link ChooseTwoLevelIterator}.\n */\ntemplate <class OuterIter, class InnerIter, class CategoryOrTraversal,\n          class InnerBeginFn, class InnerEndFn>\nclass TwoLevelIteratorA\n    : public boost::iterator_adaptor<\n          TwoLevelIteratorA<OuterIter, InnerIter, CategoryOrTraversal,\n                            InnerBeginFn, InnerEndFn>,\n          InnerIter, boost::use_default, CategoryOrTraversal> {\npublic:\n  typedef typename TwoLevelIteratorA::iterator_adaptor_::difference_type\n   
   difference_type;\n\nprivate:\n  OuterIter m_outer_begin; // TODO could skip this field when modeling a forward\n                           // iterator\n  OuterIter m_outer_end;\n  OuterIter m_outer;\n  InnerBeginFn m_inner_begin_fn;\n  InnerEndFn m_inner_end_fn;\n\n#if __cplusplus >= 201103L\n  static_assert(\n      std::is_convertible<typename std::result_of<InnerBeginFn(\n                              decltype(*std::declval<OuterIter>()))>::type,\n                          InnerIter>::value,\n      \"Result of InnerBeginFn(*OuterIter) should be convertable to InnerIter\");\n  static_assert(\n      std::is_convertible<typename std::result_of<InnerEndFn(\n                              decltype(*std::declval<OuterIter>()))>::type,\n                          InnerIter>::value,\n      \"Result of InnerEndFn(*OuterIter) should be convertable to InnerIter\");\n#endif\n\n  friend class boost::iterator_core_access;\n\n  /**\n   * Update base iterator to beginning of first non-empty inner range after\n   * current one. Also update outer iterators appropriately.\n   */\n  void seek_forward() {\n    if (this->base_reference() != m_inner_end_fn(*m_outer))\n      return;\n\n    ++m_outer;\n\n    for (; m_outer != m_outer_end; ++m_outer) {\n      this->base_reference() = m_inner_begin_fn(*m_outer);\n\n      if (this->base_reference() != m_inner_end_fn(*m_outer))\n        break;\n    }\n  }\n\n  template <class Iter>\n  void safe_decrement_dispatch(std::forward_iterator_tag, Iter& it,\n                               Iter begin) {\n    Iter prev = begin;\n\n    for (; begin != it; ++begin)\n      prev = begin;\n  }\n\n  template <class Iter>\n  void safe_decrement_dispatch(std::bidirectional_iterator_tag, Iter& it,\n                               const Iter&) {\n    --it;\n  }\n\n  //! 
Decrement iterator or return true if it == begin.\n  template <class Iter>\n  bool safe_decrement(Iter& it, const Iter& begin) {\n    if (it == begin)\n      return true;\n    safe_decrement_dispatch(\n        typename std::iterator_traits<Iter>::iterator_category(), it, begin);\n    return false;\n  }\n\n  template <class Iter>\n  typename std::iterator_traits<Iter>::difference_type\n  safe_difference_dispatch(Iter it1, Iter it2, Iter end,\n                           std::input_iterator_tag) const {\n    if (it1 == it2)\n      return 0;\n\n    Iter it1_orig(it1);\n    Iter it2_orig(it2);\n\n    typename std::iterator_traits<Iter>::difference_type count1 = 0;\n    typename std::iterator_traits<Iter>::difference_type count2 = 0;\n\n    while (true) {\n      if (it1 != end) {\n        ++count1;\n        if (++it1 == it2_orig)\n          return count1;\n      }\n      if (it2 != end) {\n        ++count2;\n        if (++it2 == it1_orig)\n          return -count2;\n      }\n    }\n  }\n\n  template <class Iter>\n  typename std::iterator_traits<Iter>::difference_type\n  safe_difference_dispatch(Iter it1, Iter it2, Iter,\n                           std::random_access_iterator_tag) const {\n    return std::distance(it1, it2);\n  }\n\n  /**\n   * Returns correct distances even for forward iterators when it2 is not\n   * reachable from it1.\n   */\n  template <class Iter>\n  typename std::iterator_traits<Iter>::difference_type\n  safe_distance(Iter it1, Iter it2, Iter end) const {\n    return safe_difference_dispatch(\n        it1, it2, end,\n        typename std::iterator_traits<Iter>::iterator_category());\n  }\n\n  /**\n   * Update base iterator to end of first non-empty inner range before current\n   * one. 
Also update outer iterators appropriately.\n   */\n  void seek_backward() {\n    InnerIter end;\n\n    for (end = m_inner_end_fn(*m_outer); m_inner_begin_fn(*m_outer) == end;) {\n      bool too_far __attribute__((unused)) =\n          safe_decrement(m_outer, m_outer_begin);\n      assert(!too_far);\n      end = m_inner_end_fn(*m_outer);\n    }\n\n    this->base_reference() = end;\n  }\n\n  void increment() {\n    ++this->base_reference();\n    seek_forward();\n  }\n\n  void decrement() {\n    if (m_outer == m_outer_end) {\n      bool too_far __attribute__((unused)) =\n          safe_decrement(m_outer, m_outer_begin);\n      assert(!too_far);\n      seek_backward();\n    } else if (!safe_decrement(this->base_reference(),\n                               m_inner_begin_fn(*m_outer))) {\n      // Common case\n      return;\n    } else {\n      // Reached end of inner range\n      bool too_far __attribute__((unused)) =\n          safe_decrement(m_outer, m_outer_begin);\n      assert(!too_far);\n      seek_backward();\n    }\n\n    bool too_far __attribute__((unused)) =\n        safe_decrement(this->base_reference(), m_inner_begin_fn(*m_outer));\n    assert(!too_far);\n  }\n\n  template <class DiffType = difference_type>\n  void advance_dispatch(DiffType n, std::input_iterator_tag) {\n    if (n < 0) {\n      for (; n; ++n)\n        decrement();\n    } else if (n > 0) {\n      for (; n; --n)\n        increment();\n    }\n  }\n\n  template <class DiffType = difference_type>\n  void jump_forward(DiffType n) {\n    assert(n >= 0);\n    while (n) {\n      difference_type k =\n          std::distance(this->base_reference(), m_inner_end_fn(*m_outer));\n      difference_type m = std::min(k, n);\n      n -= m;\n      std::advance(this->base_reference(), m);\n      if (m == k)\n        seek_forward();\n    }\n  }\n\n  template <class DiffType = difference_type>\n  void jump_backward(DiffType n) {\n    // Note: not the same as jump_forward due to difference between beginning\n    // 
and end of ranges\n    assert(n >= 0);\n    if (n && m_outer == m_outer_end) {\n      decrement();\n      --n;\n    }\n\n    while (n) {\n      difference_type k =\n          std::distance(m_inner_begin_fn(*m_outer), this->base_reference()) + 1;\n      if (k == 1) {\n        decrement();\n        --n;\n      } else if (k < n) {\n        seek_backward();\n        n -= k;\n      } else {\n        std::advance(this->base_reference(), -n);\n        n = 0;\n      }\n    }\n  }\n\n  template <class DiffType = difference_type>\n  void advance_dispatch(DiffType n, std::random_access_iterator_tag) {\n    if (n == 1)\n      increment();\n    else if (n == -1)\n      decrement();\n    else if (n < 0)\n      jump_backward(-n);\n    else if (n > 0)\n      jump_forward(n);\n  }\n\n  void advance(difference_type n) {\n    advance_dispatch(\n        n, typename std::iterator_traits<InnerIter>::iterator_category());\n  }\n\n  template <class Other>\n  difference_type distance_to_dispatch(Other it2,\n                                       std::input_iterator_tag) const {\n    // Inline safe_distance here otherwise there is a cyclic dependency:\n    // std::distance -> iterator_adaptor -> distance_to -> safe_distance ->\n    // std::distance\n    if (*this == it2)\n      return 0;\n\n    TwoLevelIteratorA it1(*this);\n    TwoLevelIteratorA it2_orig(it2);\n\n    difference_type count1 = 0;\n    difference_type count2 = 0;\n\n    while (true) {\n      if (it1.m_outer != it1.m_outer_end) {\n        ++count1;\n        if (++it1 == it2_orig)\n          return count1;\n      }\n      if (it2.m_outer != it2.m_outer_end) {\n        ++count2;\n        if (++it2 == *this)\n          return -count2;\n      }\n    }\n  }\n\n  template <class Other>\n  difference_type distance_to_dispatch(const Other& x,\n                                       std::random_access_iterator_tag) const {\n    if (*this == x)\n      return 0;\n    else if (m_outer == x.m_outer)\n      return 
safe_distance(this->base_reference(), x.base_reference(),\n                           m_inner_end_fn(*m_outer));\n    else if (safe_distance(m_outer, x.m_outer, m_outer_end) < 0)\n      return -x.distance_to(*this);\n\n    difference_type me_count = 0;\n\n    TwoLevelIteratorA me(*this);\n\n    while (me.m_outer != me.m_outer_end) {\n      difference_type d;\n      if (me.m_outer != x.m_outer)\n        d = std::distance(me.base_reference(), me.m_inner_end_fn(*me.m_outer));\n      else\n        d = std::distance(me.base_reference(), x.base_reference());\n      me_count += d;\n      std::advance(me, d);\n      if (me == x)\n        return me_count;\n    }\n\n    GALOIS_DIE(\"invalid iterator \", std::distance(m_outer, x.m_outer));\n    return 0;\n  }\n\n  template <class OtherOuterIter, class OtherInnerIter, class C, class BF,\n            class EF>\n  difference_type distance_to(\n      const TwoLevelIteratorA<OtherOuterIter, OtherInnerIter, C, BF, EF>& x)\n      const {\n    return distance_to_dispatch(\n        x, typename std::iterator_traits<InnerIter>::iterator_category());\n  }\n\n  template <class OtherOuterIter, class OtherInnerIter, class C, class BF,\n            class EF>\n  bool\n  equal(const TwoLevelIteratorA<OtherOuterIter, OtherInnerIter, C, BF, EF>& x)\n      const {\n    if (m_outer == m_outer_end && m_outer == x.m_outer)\n      return true;\n\n    return m_outer == x.m_outer && this->base_reference() == x.base_reference();\n  }\n\npublic:\n  TwoLevelIteratorA() {}\n\n  TwoLevelIteratorA(OuterIter outer_begin, OuterIter outer_end, OuterIter outer,\n                    InnerBeginFn inner_begin_fn, InnerEndFn inner_end_fn)\n      : m_outer_begin(outer_begin), m_outer_end(outer_end), m_outer(outer),\n        m_inner_begin_fn(inner_begin_fn), m_inner_end_fn(inner_end_fn) {\n    if (m_outer != m_outer_end) {\n      this->base_reference() = m_inner_begin_fn(*m_outer);\n      seek_forward();\n    }\n  }\n\n  TwoLevelIteratorA(OuterIter outer_begin, 
OuterIter outer_end, OuterIter outer,\n                    InnerIter inner, InnerBeginFn inner_begin_fn,\n                    InnerEndFn inner_end_fn)\n      : m_outer_begin(outer_begin), m_outer_end(outer_end), m_outer(outer),\n        m_inner_begin_fn(inner_begin_fn), m_inner_end_fn(inner_end_fn) {\n    this->base_reference() = inner;\n  }\n\n  const OuterIter& get_outer_reference() const { return m_outer; }\n\n  const InnerIter& get_inner_reference() const {\n    return this->base_reference();\n  }\n};\n\n//! Helper functor, returns <code>t.begin()</code>\nstruct GetBegin {\n  template <class T>\n  auto operator()(T&& x) const -> decltype(std::forward<T>(x).begin()) {\n    return std::forward<T>(x).begin();\n  }\n};\n\n//! Helper functor, returns <code>t.end()</code>\nstruct GetEnd {\n  template <class T>\n  auto operator()(T&& x) const -> decltype(std::forward<T>(x).end()) {\n    return std::forward<T>(x).end();\n  }\n};\n\n#if __cplusplus >= 201103L\ntemplate <\n    class CategoryOrTraversal = std::forward_iterator_tag, class OuterIter,\n    class InnerIter           = decltype(std::declval<OuterIter>()->begin()),\n    class InnerBeginFn = GetBegin, class InnerEndFn = GetEnd,\n    class Iter = TwoLevelIteratorA<OuterIter, InnerIter, CategoryOrTraversal,\n                                   InnerBeginFn, InnerEndFn>>\nstd::pair<Iter, Iter> make_two_level_iterator(OuterIter outer_begin,\n                                              OuterIter outer_end) {\n  return std::make_pair(\n      Iter(outer_begin, outer_end, outer_begin, InnerBeginFn(), InnerEndFn()),\n      Iter(outer_begin, outer_end, outer_end, InnerBeginFn(), InnerEndFn()));\n}\n#else\n// XXX(ddn): More direct encoding crashes XL 12.1, so lean towards more verbose\n// types\ntemplate <class CategoryOrTraversal, class OuterIter, class InnerIter,\n          class InnerBeginFn, class InnerEndFn>\nstd::pair<TwoLevelIteratorA<OuterIter, InnerIter, CategoryOrTraversal,\n                            
InnerBeginFn, InnerEndFn>,\n          TwoLevelIteratorA<OuterIter, InnerIter, CategoryOrTraversal,\n                            InnerBeginFn, InnerEndFn>>\nmake_two_level_iterator(OuterIter outer_begin, OuterIter outer_end) {\n  return std::make_pair(\n      TwoLevelIteratorA<OuterIter, InnerIter, CategoryOrTraversal, InnerBeginFn,\n                        InnerEndFn>(outer_begin, outer_end, outer_begin,\n                                    InnerBeginFn(), InnerEndFn()),\n      TwoLevelIteratorA<OuterIter, InnerIter, CategoryOrTraversal, InnerBeginFn,\n                        InnerEndFn>(outer_begin, outer_end, outer_end,\n                                    InnerBeginFn(), InnerEndFn()));\n}\n#endif\n\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/UnionFind.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_UNIONFIND_H\n#define GALOIS_UNIONFIND_H\n\n#include <atomic>\n\n#include \"galois/config.h\"\n\nnamespace galois {\n/**\n * Intrusive union-find implementation. 
Users subclass this to get disjoint-set\n * functionality for the subclass object.\n */\ntemplate <typename T>\nclass UnionFindNode {\n  T* findImpl() const {\n    if (isRep())\n      return m_component.load(std::memory_order_relaxed);\n\n    T* rep = m_component;\n    while (rep->m_component != rep) {\n      T* next = rep->m_component.load(std::memory_order_relaxed);\n      rep     = next;\n    }\n    return rep;\n  }\n\nprotected:\n  std::atomic<T*> m_component;\n\n  UnionFindNode(T* s) : m_component(s) {}\n\npublic:\n  typedef UnionFindNode<T> SuperTy;\n\n  bool isRep() const {\n    return m_component.load(std::memory_order_relaxed) == this;\n  }\n\n  T* get() const { return m_component.load(std::memory_order_relaxed); }\n\n  const T* find() const { return findImpl(); }\n\n  T* find() { return findImpl(); }\n\n  //! Compress ONLY this node to point directly to the root of the tree;\n  //! nodes on path are not altered\n  void compress() {\n    if (isRep())\n      return;\n\n    // my current component\n    T* rep = m_component;\n\n    // loop until rep == itself; i.e. get root\n    while (rep->m_component.load(std::memory_order_relaxed) != rep) {\n      // get next parent\n      T* next = rep->m_component.load(std::memory_order_relaxed);\n      rep     = next;\n    }\n\n    // at this point rep is the root: save as my parent\n    m_component.store(rep, std::memory_order_relaxed);\n  }\n\n  T* findAndCompress() {\n    // Basic outline of race in synchronous path compression is that two path\n    // compressions along two different paths to the root can create a cycle\n    // in the union-find tree. 
Prevent that from happening by compressing\n    // incrementally.\n    if (isRep())\n      return m_component.load(std::memory_order_relaxed);\n\n    T* rep  = m_component;\n    T* prev = 0;\n    while (rep->m_component.load(std::memory_order_relaxed) != rep) {\n      T* next = rep->m_component.load(std::memory_order_relaxed);\n\n      if (prev && prev->m_component.load(std::memory_order_relaxed) == rep) {\n        prev->m_component.store(next, std::memory_order_relaxed);\n      }\n      prev = rep;\n      rep  = next;\n    }\n\n    return rep;\n  }\n\n  //! Lock-free merge. Returns the representative of the merged set if a merge\n  //! was done, or null (0) if both nodes were already in the same set.\n  T* merge(T* b) {\n    T* a = m_component.load(std::memory_order_relaxed);\n    while (true) {\n      a = a->findAndCompress();\n      b = b->findAndCompress();\n      if (a == b)\n        return 0;\n      // Avoid cycles by directing edges consistently\n      if (a < b)\n        std::swap(a, b);\n      if (a->m_component.compare_exchange_strong(a, b)) {\n        return b;\n      }\n    }\n  }\n};\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/UserContext.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_USERCONTEXT_H\n#define GALOIS_USERCONTEXT_H\n\n#include <functional>\n\n#include \"galois/config.h\"\n#include \"galois/gdeque.h\"\n#include \"galois/Mem.h\"\n#include \"galois/runtime/Context.h\"\n\nnamespace galois {\n\n/**\n * This is the object passed to the user's parallel loop.  This\n * provides the in-loop api.\n */\ntemplate <typename T>\nclass UserContext : private boost::noncopyable {\nprotected:\n  //! push stuff\n  typedef gdeque<T> PushBufferTy;\n  static const unsigned int fastPushBackLimit = 64;\n  typedef std::function<void(PushBufferTy&)> FastPushBack;\n\n  PushBufferTy pushBuffer;\n  //! Allocator stuff\n  IterAllocBaseTy IterationAllocatorBase;\n  PerIterAllocTy PerIterationAllocator;\n\n  //! used by all\n  bool* didBreak = nullptr;\n  FastPushBack fastPushBack;\n\n  //! 
some flags used by deterministic\n  bool firstPassFlag = false;\n  void* localState   = nullptr;\n\n  void __resetAlloc() { IterationAllocatorBase.clear(); }\n\n  void __setFirstPass(void) { firstPassFlag = true; }\n\n  void __resetFirstPass(void) { firstPassFlag = false; }\n\n  PushBufferTy& __getPushBuffer() { return pushBuffer; }\n\n  void __resetPushBuffer() { pushBuffer.clear(); }\n\n  void __setLocalState(void* p) { localState = p; }\n\n  void __setFastPushBack(FastPushBack f) { fastPushBack = f; }\n\npublic:\n  UserContext()\n      : IterationAllocatorBase(),\n        PerIterationAllocator(&IterationAllocatorBase), didBreak(0) {}\n\n  //! Signal break in parallel loop, current iteration continues\n  //! until natural termination\n  void breakLoop() { *didBreak = true; }\n\n  //! Acquire a per-iteration allocator\n  PerIterAllocTy& getPerIterAlloc() { return PerIterationAllocator; }\n\n  //! Push new work\n  template <typename... Args>\n  void push(Args&&... args) {\n    // galois::runtime::checkWrite(MethodFlag::WRITE, true);\n    pushBuffer.emplace_back(std::forward<Args>(args)...);\n    if (fastPushBack && pushBuffer.size() > fastPushBackLimit)\n      fastPushBack(pushBuffer);\n  }\n\n  //! Push new work\n  template <typename... Args>\n  inline void push_back(Args&&... args) {\n    this->push(std::forward<Args>(args)...);\n  }\n\n  //! Push new work\n  template <typename... Args>\n  inline void insert(Args&&... args) {\n    this->push(std::forward<Args>(args)...);\n  }\n\n  //! Force the abort of this iteration\n  void abort() { galois::runtime::signalConflict(); }\n\n  //! Store and retrieve local state for deterministic\n  template <typename LS>\n  LS* getLocalState(void) {\n    return reinterpret_cast<LS*>(localState);\n  }\n\n  template <typename LS, typename... Args>\n  LS* createLocalState(Args&&... args) {\n    new (localState) LS(std::forward<Args>(args)...);\n    return getLocalState<LS>();\n  }\n\n  //! used by deterministic and ordered\n  //! 
@returns true when the operator is invoked for the first time. The\n  //! operator can use this information and choose to expand the neighborhood\n  //! only in the first pass.\n  bool isFirstPass(void) const { return firstPassFlag; }\n\n  //! declare that the operator has crossed the cautious point.  This\n  //! implies all data has been touched thus no new locks will be\n  //! acquired.\n  void cautiousPoint() {\n    if (isFirstPass()) {\n      galois::runtime::signalFailSafe();\n    }\n  }\n};\n\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/Version.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_VERSION_H\n#define GALOIS_VERSION_H\n\n#include <string>\n\n#include \"galois/config.h\"\n\nnamespace galois {\n\nstd::string getVersion();\nstd::string getRevision();\nint getVersionMajor();\nint getVersionMinor();\nint getVersionPatch();\nint getCopyrightYear();\n\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/config.h.in",
    "content": "#ifndef GALOIS_CONFIG_H\n#define GALOIS_CONFIG_H\n\n#if !(defined(GALOIS_USE_LONGJMP_ABORT) || defined(GALOIS_USE_EXCEPTION_ABORT))\n#define GALOIS_USE_LONGJMP_ABORT\n#endif\n#if !(defined(GALOIS_USE_LONGJMP_ABORT) ^ defined(GALOIS_USE_EXCEPTION_ABORT))\n#error Exactly one of GALOIS_USE_LONGJMP_ABORT or GALOIS_USE_EXCEPTION_ABORT must be defined.\n#endif\n\n#if defined(__GNUC__)\n#define GALOIS_ALLOW_WARNINGS                \\\n_Pragma(\"GCC diagnostic push\")               \\\n_Pragma(\"GCC diagnostic warning \\\"-Wall\\\"\")  \\\n_Pragma(\"GCC diagnostic warning \\\"-Wextra\\\"\")\n#define GALOIS_END_ALLOW_WARNINGS _Pragma(\"GCC diagnostic pop\")\n#else\n#define GALOIS_ALLOW_WARNINGS\n#define GALOIS_END_ALLOW_WARNINGS\n#endif\n\n#if defined(__GNUC__)\n#define GALOIS_IGNORE_WARNINGS               \\\n_Pragma(\"GCC diagnostic push\")               \\\n_Pragma(\"GCC diagnostic ignored \\\"-Wall\\\"\")  \\\n_Pragma(\"GCC diagnostic ignored \\\"-Wextra\\\"\")\n#define GALOIS_END_IGNORE_WARNINGS _Pragma(\"GCC diagnostic pop\")\n#else\n#define GALOIS_IGNORE_WARNINGS\n#define GALOIS_END_IGNORE_WARNINGS\n#endif\n\n#if defined(__GNUC__)\n#define GALOIS_IGNORE_UNUSED_PARAMETERS                 \\\n_Pragma(\"GCC diagnostic push\")                          \\\n_Pragma(\"GCC diagnostic ignored \\\"-Wunused-parameter\\\"\")\n#define GALOIS_END_IGNORE_UNUSED_PARAMETERS _Pragma(\"GCC diagnostic pop\")\n#else\n#define GALOIS_IGNORE_UNUSED_PARAMETERS\n#define GALOIS_END_IGNORE_UNUSED_PARAMETERS\n#endif\n\n#if defined(__GNUC__) && !defined(__clang__)\n#define GALOIS_IGNORE_MAYBE_UNINITIALIZED                  \\\n_Pragma(\"GCC diagnostic push\")                             \\\n_Pragma(\"GCC diagnostic ignored \\\"-Wmaybe-uninitialized\\\"\")\n#define GALOIS_END_IGNORE_MAYBE_UNINITIALIZED _Pragma(\"GCC diagnostic pop\")\n#else\n#define GALOIS_IGNORE_MAYBE_UNINITIALIZED\n#define GALOIS_END_IGNORE_MAYBE_UNINITIALIZED\n#endif\n\n#if defined(__GNUC__)\n#define 
GALOIS_IGNORE_UNUSED_BUT_SET                           \\\n_Pragma(\"GCC diagnostic push\")                                 \\\n_Pragma(\"GCC diagnostic ignored \\\"-Wunused-but-set-variable\\\"\")\n#define GALOIS_END_IGNORE_UNUSED_BUT_SET _Pragma(\"GCC diagnostic pop\")\n#else\n#define GALOIS_IGNORE_UNUSED_BUT_SET\n#define GALOIS_END_IGNORE_UNUSED_BUT_SET\n#endif\n\n// Macro to suppress compiler warnings that a variable is set but unused.\n// This warning is buggy in gcc 7.\n#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 8\n#define GALOIS_GCC7_IGNORE_UNUSED_BUT_SET                      \\\n_Pragma(\"GCC diagnostic push\")                                 \\\n_Pragma(\"GCC diagnostic ignored \\\"-Wunused-but-set-variable\\\"\")\n#define GALOIS_END_GCC7_IGNORE_UNUSED_BUT_SET _Pragma(\"GCC diagnostic pop\")\n#else\n#define GALOIS_GCC7_IGNORE_UNUSED_BUT_SET\n#define GALOIS_END_GCC7_IGNORE_UNUSED_BUT_SET\n#endif\n\n#if defined(NDEBUG)\n#define GALOIS_USED_ONLY_IN_DEBUG(NAME) NAME [[maybe_unused]]\n#else\n#define GALOIS_USED_ONLY_IN_DEBUG(NAME) NAME\n#endif\n\n#define GALOIS_UNUSED(NAME) NAME [[maybe_unused]]\n\n#if defined(__GNUC__)\n#define GALOIS_IGNORE_EXTERNAL_UNUSED_PARAMETERS         \\\n_Pragma(\"GCC diagnostic push\")                           \\\n_Pragma(\"GCC diagnostic ignored \\\"-Wunused-parameter\\\"\")\n#define GALOIS_END_IGNORE_EXTERNAL_UNUSED_PARAMETERS _Pragma(\"GCC diagnostic pop\")\n#else\n#define GALOIS_IGNORE_EXTERNAL_UNUSED_PARAMETERS\n#define GALOIS_END_IGNORE_EXTERNAL_UNUSED_PARAMETERS\n#endif\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/gIO.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GIO_H\n#define GALOIS_GIO_H\n\n#include <sstream>\n#include <cerrno>\n#include <cstdlib>\n#include <string.h>\n\n#include \"galois/config.h\"\n\n// FIXME: move to Runtime\n\nnamespace galois {\n\n//! Prints a string\nvoid gPrintStr(const std::string&);\n//! Prints an info string (for easy parsing)\nvoid gInfoStr(const std::string&);\n//! Prints a warning string (for easy parsing)\nvoid gWarnStr(const std::string&);\n//! Prints a debug string (for easy parsing)\nvoid gDebugStr(const std::string&);\n//! Prints an error string (for easy parsing)\nvoid gErrorStr(const std::string&);\n\n//! Prints a sequence of things\ntemplate <typename... Args>\nvoid gPrint(Args&&... args) {\n  std::ostringstream os;\n  (os << ... << args);\n  gPrintStr(os.str());\n}\n\n//! 
Prints an info string from a sequence of things\ntemplate <typename... Args>\nvoid gInfo(Args&&... args) {\n  std::ostringstream os;\n  (os << ... << args);\n  gInfoStr(os.str());\n}\n\n//! Prints a warning string from a sequence of things\ntemplate <typename... Args>\nvoid gWarn(Args&&... args) {\n  std::ostringstream os;\n  (os << ... << args);\n  gWarnStr(os.str());\n}\n\n//! Prints a debug string from a sequence of things; prints nothing if NDEBUG\n//! is defined.\ntemplate <typename... Args>\nvoid gDebug(Args&&... GALOIS_USED_ONLY_IN_DEBUG(args)) {\n#ifndef NDEBUG\n  std::ostringstream os;\n  (os << ... << args);\n  gDebugStr(os.str());\n#endif\n}\n\n//! Prints error message\ntemplate <typename... Args>\nvoid gError(Args&&... args) {\n  std::ostringstream os;\n  (os << ... << args);\n  gErrorStr(os.str());\n}\n\nvoid gFlush();\n\n#define GALOIS_SYS_DIE(...)                                                    \\\n  do {                                                                         \\\n    galois::gError(__FILE__, \":\", __LINE__, \": \", strerror(errno), \": \",       \\\n                   ##__VA_ARGS__);                                             \\\n    abort();                                                                   \\\n  } while (0)\n#define GALOIS_DIE(...)                                                        \\\n  do {                                                                         \\\n    galois::gError(__FILE__, \":\", __LINE__, \": \", ##__VA_ARGS__);              \\\n    abort();                                                                   \\\n  } while (0)\n//! Like assert but unconditionally executed\n#define GALOIS_ASSERT(cond, ...)                                               
\\\n  do {                                                                         \\\n    bool b = (cond);                                                           \\\n    if (!b) {                                                                  \\\n      galois::gError(__FILE__, \":\", __LINE__, \": assertion failed: \", #cond,   \\\n                     \" \", ##__VA_ARGS__);                                      \\\n      abort();                                                                 \\\n    }                                                                          \\\n  } while (0)\n\ntemplate <unsigned ENABLE>\nstruct debug {\n  template <typename... Args>\n  static void print(const Args&... args) {\n    gDebug(args...);\n  }\n};\n\ntemplate <>\nstruct debug<0> {\n  template <typename... Args>\n  inline static void print(const Args&...) {}\n};\n\n} // end namespace galois\n\n#endif //_GIO_H\n"
  },
  {
    "path": "libgalois/include/galois/gdeque.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GDEQUE_H\n#define GALOIS_GDEQUE_H\n\n#include \"galois/config.h\"\n#include \"galois/FixedSizeRing.h\"\n#include \"galois/Mem.h\"\n#include \"galois/TwoLevelIteratorA.h\"\n\n#include <boost/iterator/iterator_facade.hpp>\n#include <boost/iterator/reverse_iterator.hpp>\n\n#include <algorithm>\n#include <utility>\n\nnamespace galois {\n\n// Experimental random access iterator. Slower than old iterator for simple\n// traversals, so disable for now\n//#define _NEW_ITERATOR\n\n//! 
Like std::deque but use Galois memory management functionality\ntemplate <typename T, unsigned ChunkSize = 64,\n          typename ContainerTy = FixedSizeRing<T, ChunkSize>>\nclass gdeque {\n\nprotected:\n  struct Block : ContainerTy {\n    Block* next;\n    Block* prev;\n\n    Block() : next(), prev() {}\n\n    template <typename InputIterator>\n    Block(InputIterator first, InputIterator second)\n        : ContainerTy(first, second), next(), prev() {}\n  };\n\n#ifdef _NEW_ITERATOR\n  template <typename U>\n  class outer_iterator\n      : public boost::iterator_facade<outer_iterator<U>, U,\n                                      boost::bidirectional_traversal_tag> {\n    friend class boost::iterator_core_access;\n    template <typename, unsigned, typename>\n    friend class gdeque;\n    Block* cur;\n    Block* last;\n\n    void increment() { cur = cur->next; }\n    void decrement() {\n      if (cur) {\n        cur = cur->prev;\n      } else {\n        cur = last;\n      }\n    }\n\n    template <typename OtherTy>\n    bool equal(const outer_iterator<OtherTy>& o) const {\n      return cur == o.cur;\n    }\n\n    U& dereference() const { return *cur; }\n\n  public:\n    outer_iterator(Block* b = 0, Block* l = 0) : cur(b), last(l) {}\n\n    template <typename OtherTy>\n    outer_iterator(const outer_iterator<OtherTy>& o)\n        : cur(o.cur), last(o.last) {}\n  };\n\n  typedef typename Block::iterator inner_iterator;\n  typedef typename Block::const_iterator const_inner_iterator;\n#endif\n\n  Block* first;\n\nprivate:\n  Block* last;\n  unsigned num;\n\n  //! [Example Fixed Size Allocator]\n  galois::FixedSizeAllocator<Block> heap;\n\n  template <typename... Args>\n  Block* alloc_block(Args&&... args) {\n    // Fixed size allocator can only allocate 1 object at a time of size\n    // sizeof(Block). 
Argument to allocate is always 1.\n    Block* b = heap.allocate(1);\n    return new (b) Block(std::forward<Args>(args)...);\n  }\n\n  void free_block(Block* b) {\n    b->~Block();\n    heap.deallocate(b, 1);\n  }\n  //! [Example Fixed Size Allocator]\n\n  bool precondition() const {\n    return (num == 0 && first == NULL && last == NULL) ||\n           (num > 0 && first != NULL && last != NULL);\n  }\n\n  Block* extend_first() {\n    Block* b = alloc_block();\n    b->next  = first;\n    if (b->next)\n      b->next->prev = b;\n    first = b;\n    if (!last)\n      last = b;\n    return b;\n  }\n\n  Block* extend_last() {\n    Block* b = alloc_block();\n    b->prev  = last;\n    if (b->prev)\n      b->prev->next = b;\n    last = b;\n    if (!first)\n      first = b;\n    return b;\n  }\n\n  void shrink(Block* b) {\n    if (b->next)\n      b->next->prev = b->prev;\n    if (b->prev)\n      b->prev->next = b->next;\n    if (b == first)\n      first = b->next;\n    if (b == last)\n      last = b->prev;\n    free_block(b);\n  }\n\n  template <typename... Args>\n  std::pair<Block*, typename Block::iterator>\n  emplace(Block* b, typename Block::iterator ii, Args&&... 
args) {\n    ++num;\n    if (!b) {\n      // gdeque is empty or iteration == end\n      b = last;\n      if (!b || b->full())\n        b = extend_last();\n      ii = b->end();\n    } else if (b == first && ii == b->begin()) {\n      // iteration == begin\n      b = first;\n      if (!b || b->full())\n        b = extend_first();\n      ii = b->begin();\n    } else if (b->full()) {\n      auto d   = std::distance(ii, b->end());\n      Block* n = alloc_block(std::make_move_iterator(ii),\n                             std::make_move_iterator(b->end()));\n      for (; d > 0; --d)\n        b->pop_back();\n      ii      = b->end();\n      n->next = b->next;\n      n->prev = b;\n      b->next = n;\n      if (b == last)\n        last = n;\n    }\n    unsigned boff = std::distance(b->begin(), ii);\n    b->emplace(ii, std::forward<Args>(args)...);\n    return std::make_pair(b, b->begin() + boff);\n  }\n\npublic:\n#ifdef _NEW_ITERATOR\n  typedef galois::TwoLevelIteratorA<outer_iterator<Block>, inner_iterator,\n                                    std::random_access_iterator_tag,\n                                    GetBegin<Block>, GetEnd<Block>>\n      iterator;\n  typedef galois::TwoLevelIteratorA<outer_iterator<const Block>,\n                                    const_inner_iterator,\n                                    std::random_access_iterator_tag,\n                                    GetBegin<const Block>, GetEnd<const Block>>\n      const_iterator;\n#endif\n#ifndef _NEW_ITERATOR\n  template <typename U>\n  struct Iterator\n      : public boost::iterator_facade<Iterator<U>, U,\n                                      boost::bidirectional_traversal_tag> {\n    friend class boost::iterator_core_access;\n\n    Block* b;\n    Block* last;\n    unsigned offset;\n\n  private:\n    void increment() {\n      ++offset;\n      if (offset == b->size()) {\n        b      = b->next;\n        offset = 0;\n      }\n    }\n\n    void decrement() {\n      if (!b) {\n        b      = last;\n 
       offset = b->size() - 1;\n        return;\n      } else if (offset == 0) {\n        b      = b->prev;\n        offset = b->size() - 1;\n      } else {\n        --offset;\n      }\n    }\n\n    template <typename OtherTy>\n    bool equal(const Iterator<OtherTy>& o) const {\n      return b == o.b && offset == o.offset;\n    }\n\n    U& dereference() const { return b->getAt(offset); }\n\n  public:\n    Iterator(Block* _b = 0, Block* _l = 0, unsigned _off = 0)\n        : b(_b), last(_l), offset(_off) {}\n\n    template <typename OtherTy>\n    Iterator(const Iterator<OtherTy>& o)\n        : b(o.b), last(o.last), offset(o.offset) {}\n  };\n  typedef Iterator<T> iterator;\n  typedef Iterator<const T> const_iterator;\n#endif\n\n  typedef boost::reverse_iterator<iterator> reverse_iterator;\n  typedef boost::reverse_iterator<const_iterator> const_reverse_iterator;\n  typedef typename iterator::value_type value_type;\n  typedef typename iterator::pointer pointer;\n  typedef typename iterator::reference reference;\n  typedef typename const_iterator::reference const_reference;\n  typedef typename iterator::difference_type difference_type;\n  typedef size_t size_type;\n\n  gdeque() : first(), last(), num(), heap() {}\n\n  gdeque(gdeque&& o) : first(), last(), num(), heap() {\n    std::swap(first, o.first);\n    std::swap(last, o.last);\n    std::swap(num, o.num);\n  }\n\n  gdeque& operator=(gdeque&& o) {\n    std::swap(first, o.first);\n    std::swap(last, o.last);\n    std::swap(num, o.num);\n    return *this;\n  }\n\n  gdeque(const gdeque&) = delete;\n  gdeque& operator=(const gdeque&) = delete;\n\n  ~gdeque() { clear(); }\n\n  iterator begin() {\n    assert(precondition());\n\n#ifdef _NEW_ITERATOR\n    return iterator{outer_iterator<Block>{first, last},\n                    outer_iterator<Block>{nullptr, last},\n                    outer_iterator<Block>{first, last}, GetBegin<Block>{},\n                    GetEnd<Block>{}};\n#else\n    return iterator{first, last, 
0};\n#endif\n  }\n\n  iterator end() {\n    assert(precondition());\n#ifdef _NEW_ITERATOR\n    return iterator{outer_iterator<Block>{first, last},\n                    outer_iterator<Block>{nullptr, last},\n                    outer_iterator<Block>{nullptr, last}, GetBegin<Block>{},\n                    GetEnd<Block>{}};\n#else\n    return iterator{nullptr, last, 0};\n#endif\n  }\n\n  const_iterator begin() const {\n    assert(precondition());\n\n#ifdef _NEW_ITERATOR\n    return const_iterator{outer_iterator<const Block>{first, last},\n                          outer_iterator<const Block>{nullptr, last},\n                          outer_iterator<const Block>{first, last},\n                          GetBegin<const Block>{},\n                          GetEnd<const Block, const_inner_iterator>{}};\n#else\n    return const_iterator{first, last, 0};\n#endif\n  }\n\n  const_iterator end() const {\n#ifdef _NEW_ITERATOR\n    return const_iterator{outer_iterator<const Block>{first, last},\n                          outer_iterator<const Block>{nullptr, last},\n                          outer_iterator<const Block>{nullptr, last},\n                          GetBegin<const Block>{},\n                          GetEnd<const Block, const_inner_iterator>{}};\n#else\n    return const_iterator{nullptr, last, 0};\n#endif\n  }\n\n  reverse_iterator rbegin() { return reverse_iterator{end()}; }\n\n  reverse_iterator rend() { return reverse_iterator{begin()}; }\n\n  const_reverse_iterator rbegin() const {\n    return const_reverse_iterator{end()};\n  }\n\n  const_reverse_iterator rend() const {\n    return const_reverse_iterator{begin()};\n  }\n\n  size_t size() const {\n    assert(precondition());\n    return num;\n  }\n\n  bool empty() const {\n    assert(precondition());\n    return num == 0;\n  }\n\n  reference front() {\n    assert(!empty());\n    return first->front();\n  }\n\n  const_reference front() const {\n    assert(!empty());\n    return first->front();\n  }\n\n  reference 
back() {\n    assert(!empty());\n    return last->back();\n  }\n\n  const_reference back() const {\n    assert(!empty());\n    return last->back();\n  }\n\n  void pop_back() {\n    assert(!empty());\n    --num;\n    last->pop_back();\n    if (last->empty())\n      shrink(last);\n  }\n\n  void pop_front() {\n    assert(!empty());\n    --num;\n    first->pop_front();\n    if (first->empty())\n      shrink(first);\n  }\n\n  void clear() {\n    assert(precondition());\n    Block* b = first;\n    while (b) {\n      b->clear();\n      Block* old = b;\n      b          = b->next;\n      free_block(old);\n    }\n    first = last = NULL;\n    num          = 0;\n  }\n\n  //! Invalidates pointers\n  template <typename... Args>\n  iterator emplace(iterator pos, Args&&... args) {\n#ifdef _NEW_ITERATOR\n    Block* b          = pos.get_outer_reference().cur;\n    inner_iterator ii = pos.get_inner_reference();\n#else\n    Block* b = pos.b;\n    typename Block::iterator ii;\n    if (b)\n      ii = b->begin() + pos.offset;\n#endif\n    auto p = emplace(b, ii, std::forward<Args>(args)...);\n#ifdef _NEW_ITERATOR\n    return iterator{outer_iterator<Block>{first, last},\n                    outer_iterator<Block>{nullptr, last},\n                    outer_iterator<Block>{p.first, last},\n                    p.second,\n                    GetBegin<Block>{},\n                    GetEnd<Block>{}};\n#else\n    return iterator(p.first, last, std::distance(p.first->begin(), p.second));\n#endif\n  }\n\n  iterator erase(iterator pos) {\n    GALOIS_DIE(\"not yet implemented\");\n    return pos;\n  }\n\n#ifdef _NEW_ITERATOR\n  //! Not truly constant time\n  reference operator[](size_t x) {\n    if (x == 0)\n      return front();\n    else if (x == num)\n      return back();\n    auto ii = begin();\n    std::advance(ii, x);\n    return *ii;\n  }\n\n  //! 
Not truly constant time\n  const_reference operator[](size_t x) const {\n    if (x == 0)\n      return front();\n    else if (x == num)\n      return back();\n    auto ii = begin();\n    std::advance(ii, x);\n    return *ii;\n  }\n#endif\n\n  template <typename... Args>\n  void emplace_back(Args&&... args) {\n    assert(precondition());\n    ++num;\n    if (!last || last->full())\n      extend_last();\n#ifndef NDEBUG\n    pointer p = last->emplace_back(std::forward<Args>(args)...);\n    assert(p);\n#else\n    last->emplace_back(std::forward<Args>(args)...);\n#endif\n  }\n\n  template <typename ValueTy>\n  void push_back(ValueTy&& v) {\n    emplace_back(std::forward<ValueTy>(v));\n  }\n\n  template <typename... Args>\n  void emplace_front(Args&&... args) {\n    assert(precondition());\n    ++num;\n    if (!first || first->full())\n      extend_first();\n    pointer p = first->emplace_front(std::forward<Args>(args)...);\n    assert(p);\n  }\n\n  template <typename ValueTy>\n  void push_front(ValueTy&& v) {\n    emplace_front(std::forward<ValueTy>(v));\n  }\n};\n\n#undef _NEW_ITERATOR\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/BufferedGraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file BufferedGraph.h\n *\n * Contains the implementation of BufferedGraph\n */\n\n#ifndef GALOIS_GRAPHS_BUFGRAPH_H\n#define GALOIS_GRAPHS_BUFGRAPH_H\n\n#include <fstream>\n\n#include <boost/iterator/counting_iterator.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/Reduction.h\"\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Class that loads a portion of a Galois graph from disk directly into\n * memory buffers for access.\n *\n * @tparam EdgeDataType type of the edge data\n * @todo version 2 Galois binary graph support; currently only suppports\n * version 1\n */\ntemplate <typename EdgeDataType>\nclass BufferedGraph {\nprivate:\n  // buffers that you load data into\n  //! 
buffer that tells you where a particular node's edges begin\n  uint64_t* outIndexBuffer = nullptr;\n  //! buffer that tells the destination of edges\n  uint32_t* edgeDestBuffer = nullptr;\n  //! buffer that stores edge data\n  EdgeDataType* edgeDataBuffer = nullptr;\n\n  //! size of the entire graph (not just locally loaded portion)\n  uint32_t globalSize = 0;\n  //! number of edges in the entire graph (not just locally loaded portion)\n  uint64_t globalEdgeSize = 0;\n\n  //! number of nodes loaded into this graph\n  uint32_t numLocalNodes = 0;\n  //! number of edges loaded into this graph\n  uint64_t numLocalEdges = 0;\n\n  //! specifies how many nodes are skipped from the beginning of the graph\n  //! in this loaded portion of it\n  uint64_t nodeOffset = 0;\n  //! specifies how many edges are skipped from the beginning of the graph\n  //! in this loaded portion of it\n  uint64_t edgeOffset = 0;\n  //! specifies whether or not the graph is loaded\n  bool graphLoaded = false;\n\n  // accumulators for tracking bytes read\n  //! number of bytes read related to the out index buffer\n  galois::GAccumulator<uint64_t> numBytesReadOutIndex;\n  //! number of bytes read related to the edge dest buffer\n  galois::GAccumulator<uint64_t> numBytesReadEdgeDest;\n  //! number of bytes read related to the edge data buffer\n  galois::GAccumulator<uint64_t> numBytesReadEdgeData;\n\n  /**\n   * Load the out indices (i.e. 
where a particular node's edges begin in the\n   * array of edges) from the file.\n   *\n   * @param graphFile loaded file for the graph\n   * @param nodeStart the first node to load\n   * @param numNodesToLoad number of nodes to load\n   */\n  void loadOutIndex(std::ifstream& graphFile, uint64_t nodeStart,\n                    uint64_t numNodesToLoad) {\n    if (numNodesToLoad == 0) {\n      return;\n    }\n    assert(outIndexBuffer == nullptr);\n    outIndexBuffer = (uint64_t*)malloc(sizeof(uint64_t) * numNodesToLoad);\n\n    if (outIndexBuffer == nullptr) {\n      GALOIS_DIE(\"Failed to allocate memory for out index buffer.\");\n    }\n\n    // position to start of contiguous chunk of nodes to read\n    uint64_t readPosition = (4 + nodeStart) * sizeof(uint64_t);\n    graphFile.seekg(readPosition);\n\n    uint64_t numBytesToLoad = numNodesToLoad * sizeof(uint64_t);\n    uint64_t bytesRead      = 0;\n\n    while (numBytesToLoad > 0) {\n      graphFile.read(((char*)this->outIndexBuffer) + bytesRead, numBytesToLoad);\n      size_t numRead = graphFile.gcount();\n      numBytesToLoad -= numRead;\n      bytesRead += numRead;\n    }\n\n    assert(numBytesToLoad == 0);\n\n    nodeOffset = nodeStart;\n  }\n\n  /**\n   * Load the edge destination information from the file.\n   *\n   * @param graphFile loaded file for the graph\n   * @param edgeStart the first edge to load\n   * @param numEdgesToLoad number of edges to load\n   * @param numGlobalNodes total number of nodes in the graph file; needed\n   * to determine offset into the file\n   */\n  void loadEdgeDest(std::ifstream& graphFile, uint64_t edgeStart,\n                    uint64_t numEdgesToLoad, uint64_t numGlobalNodes) {\n    if (numEdgesToLoad == 0) {\n      return;\n    }\n\n    assert(edgeDestBuffer == nullptr);\n    edgeDestBuffer = (uint32_t*)malloc(sizeof(uint32_t) * numEdgesToLoad);\n\n    if (edgeDestBuffer == nullptr) {\n      GALOIS_DIE(\"Failed to allocate memory for edge dest buffer.\");\n    }\n\n    
// position to start of contiguous chunk of edges to read\n    uint64_t readPosition = (4 + numGlobalNodes) * sizeof(uint64_t) +\n                            (sizeof(uint32_t) * edgeStart);\n    graphFile.seekg(readPosition);\n\n    uint64_t numBytesToLoad = numEdgesToLoad * sizeof(uint32_t);\n    uint64_t bytesRead      = 0;\n    while (numBytesToLoad > 0) {\n      graphFile.read(((char*)this->edgeDestBuffer) + bytesRead, numBytesToLoad);\n      size_t numRead = graphFile.gcount();\n      numBytesToLoad -= numRead;\n      bytesRead += numRead;\n    }\n\n    assert(numBytesToLoad == 0);\n    // save edge offset of this graph for later use\n    edgeOffset = edgeStart;\n  }\n\n  /**\n   * Load the edge data information from the file.\n   *\n   * @tparam EdgeType must be non-void in order to call this function\n   *\n   * @param edgeStart the first edge to load\n   * @param numEdgesToLoad number of edges to load\n   * @param numGlobalNodes total number of nodes in the graph file; needed\n   * to determine offset into the file\n   * @param numGlobalEdges total number of edges in the graph file; needed\n   * to determine offset into the file\n   */\n  template <\n      typename EdgeType,\n      typename std::enable_if<!std::is_void<EdgeType>::value>::type* = nullptr>\n  void loadEdgeData(std::ifstream& graphFile, uint64_t edgeStart,\n                    uint64_t numEdgesToLoad, uint64_t numGlobalNodes,\n                    uint64_t numGlobalEdges) {\n    galois::gDebug(\"Loading edge data\");\n\n    if (numEdgesToLoad == 0) {\n      return;\n    }\n\n    assert(edgeDataBuffer == nullptr);\n    edgeDataBuffer =\n        (EdgeDataType*)malloc(sizeof(EdgeDataType) * numEdgesToLoad);\n\n    if (edgeDataBuffer == nullptr) {\n      GALOIS_DIE(\"Failed to allocate memory for edge data buffer.\");\n    }\n\n    // position after nodes + edges\n    uint64_t baseReadPosition = (4 + numGlobalNodes) * sizeof(uint64_t) +\n                                (sizeof(uint32_t) * 
numGlobalEdges);\n\n    // version 1 padding TODO make version agnostic\n    if (numGlobalEdges % 2) {\n      baseReadPosition += sizeof(uint32_t);\n    }\n\n    // jump to first byte of edge data\n    uint64_t readPosition =\n        baseReadPosition + (sizeof(EdgeDataType) * edgeStart);\n    graphFile.seekg(readPosition);\n    uint64_t numBytesToLoad = numEdgesToLoad * sizeof(EdgeDataType);\n    uint64_t bytesRead      = 0;\n\n    while (numBytesToLoad > 0) {\n      graphFile.read(((char*)this->edgeDataBuffer) + bytesRead, numBytesToLoad);\n      size_t numRead = graphFile.gcount();\n      numBytesToLoad -= numRead;\n      bytesRead += numRead;\n    }\n\n    assert(numBytesToLoad == 0);\n  }\n\n  /**\n   * Load edge data function for when the edge data type is void, i.e.\n   * no edge data to load.\n   *\n   * Does nothing of importance.\n   *\n   * @tparam EdgeType if EdgeType is void, this function will be used\n   */\n  template <\n      typename EdgeType,\n      typename std::enable_if<std::is_void<EdgeType>::value>::type* = nullptr>\n  void loadEdgeData(std::ifstream&, uint64_t, uint64_t, uint64_t, uint64_t) {\n    galois::gDebug(\"Not loading edge data\");\n    // do nothing (edge data is void, i.e. no edge data)\n  }\n\n  /**\n   * Resets graph metadata to default values. 
Does NOT touch the buffers.\n   */\n  void resetGraphStatus() {\n    graphLoaded    = false;\n    globalSize     = 0;\n    globalEdgeSize = 0;\n    nodeOffset     = 0;\n    edgeOffset     = 0;\n    numLocalNodes  = 0;\n    numLocalEdges  = 0;\n    resetReadCounters();\n  }\n\n  /**\n   * Free all of the buffers in memory.\n   */\n  void freeMemory() {\n    free(outIndexBuffer);\n    outIndexBuffer = nullptr;\n    free(edgeDestBuffer);\n    edgeDestBuffer = nullptr;\n    free(edgeDataBuffer);\n    edgeDataBuffer = nullptr;\n  }\n\npublic:\n  /**\n   * Class vars should be initialized by in-class initialization; all\n   * left is to reset read counters.\n   */\n  BufferedGraph() { resetReadCounters(); }\n\n  /**\n   * On destruction, free allocated buffers (if necessary).\n   */\n  ~BufferedGraph() noexcept { freeMemory(); }\n\n  // copy not allowed\n  //! disabled copy constructor\n  BufferedGraph(const BufferedGraph&) = delete;\n  //! disabled copy assignment operator\n  BufferedGraph& operator=(const BufferedGraph&) = delete;\n  // move not allowed\n  //! disabled move constructor\n  BufferedGraph(BufferedGraph&&) = delete;\n  //! disabled move assignment operator\n  BufferedGraph& operator=(BufferedGraph&&) = delete;\n\n  /**\n   * Gets the number of global nodes in the graph\n   * @returns the total number of nodes in the graph (not just local loaded\n   * nodes)\n   */\n  uint32_t size() const { return globalSize; }\n\n  /**\n   * Gets the number of global edges in the graph\n   * @returns the total number of edges in the graph (not just local loaded\n   * edges)\n   */\n  uint32_t sizeEdges() const { return globalEdgeSize; }\n\n  //! 
@returns node offset of this buffered graph\n  uint64_t getNodeOffset() const { return nodeOffset; }\n\n  /**\n   * Loads given Galois CSR graph into memory.\n   *\n   * @param filename name of graph to load; should be in Galois binary graph\n   * format\n   */\n  void loadGraph(const std::string& filename) {\n    if (graphLoaded) {\n      GALOIS_DIE(\"Cannot load an buffered graph more than once.\");\n    }\n\n    std::ifstream graphFile(filename.c_str());\n    uint64_t header[4];\n    graphFile.read(((char*)header), sizeof(uint64_t) * 4);\n\n    numLocalNodes = globalSize = header[2];\n    numLocalEdges = globalEdgeSize = header[3];\n\n    loadOutIndex(graphFile, 0, globalSize);\n    loadEdgeDest(graphFile, 0, globalEdgeSize, globalSize);\n    // may or may not do something depending on EdgeDataType\n    loadEdgeData<EdgeDataType>(graphFile, 0, globalEdgeSize, globalSize,\n                               globalEdgeSize);\n    graphLoaded = true;\n\n    graphFile.close();\n  }\n\n  /**\n   * Given a node/edge range to load, loads the specified portion of the graph\n   * into memory buffers using read.\n   *\n   * @param filename name of graph to load; should be in Galois binary graph\n   * format\n   * @param nodeStart First node to load\n   * @param nodeEnd Last node to load, non-inclusive\n   * @param edgeStart First edge to load; should correspond to first edge of\n   * first node\n   * @param edgeEnd Last edge to load, non-inclusive\n   * @param numGlobalNodes Total number of nodes in the graph\n   * @param numGlobalEdges Total number of edges in the graph\n   */\n  void loadPartialGraph(const std::string& filename, uint64_t nodeStart,\n                        uint64_t nodeEnd, uint64_t edgeStart, uint64_t edgeEnd,\n                        uint64_t numGlobalNodes, uint64_t numGlobalEdges) {\n    if (graphLoaded) {\n      GALOIS_DIE(\"Cannot load an buffered graph more than once.\");\n    }\n\n    std::ifstream graphFile(filename.c_str());\n\n    globalSize     
= numGlobalNodes;\n    globalEdgeSize = numGlobalEdges;\n\n    assert(nodeEnd >= nodeStart);\n    numLocalNodes = nodeEnd - nodeStart;\n    loadOutIndex(graphFile, nodeStart, numLocalNodes);\n\n    assert(edgeEnd >= edgeStart);\n    numLocalEdges = edgeEnd - edgeStart;\n    loadEdgeDest(graphFile, edgeStart, numLocalEdges, numGlobalNodes);\n\n    // may or may not do something depending on EdgeDataType\n    loadEdgeData<EdgeDataType>(graphFile, edgeStart, numLocalEdges,\n                               numGlobalNodes, numGlobalEdges);\n    graphLoaded = true;\n\n    graphFile.close();\n  }\n\n  //! Edge iterator typedef\n  using EdgeIterator = boost::counting_iterator<uint64_t>;\n  /**\n   * Get the index to the first edge of the provided node THAT THIS GRAPH\n   * HAS LOADED (not necessarily the first edge of it globally).\n   *\n   * @param globalNodeID the global node id of the node to get the edge\n   * for\n   * @returns a GLOBAL edge id iterator\n   */\n  EdgeIterator edgeBegin(uint64_t globalNodeID) {\n    if (!graphLoaded) {\n      GALOIS_DIE(\"Graph hasn't been loaded yet.\");\n    }\n\n    if (numLocalNodes == 0) {\n      return EdgeIterator(0);\n    }\n    assert(nodeOffset <= globalNodeID);\n    assert(globalNodeID < (nodeOffset + numLocalNodes));\n\n    uint64_t localNodeID = globalNodeID - nodeOffset;\n\n    if (localNodeID != 0) {\n      numBytesReadOutIndex += sizeof(uint64_t);\n      return EdgeIterator(outIndexBuffer[localNodeID - 1]);\n    } else {\n      return EdgeIterator(edgeOffset);\n    }\n  }\n\n  /**\n   * Get the index to the first edge of the node after the provided node.\n   *\n   * @param globalNodeID the global node id of the node to get the edge\n   * for\n   * @returns a GLOBAL edge id iterator\n   */\n  EdgeIterator edgeEnd(uint64_t globalNodeID) {\n    if (!graphLoaded) {\n      GALOIS_DIE(\"Graph hasn't been loaded yet.\");\n    }\n\n    if (numLocalNodes == 0) {\n      return EdgeIterator(0);\n    }\n    assert(nodeOffset <= 
globalNodeID);\n    assert(globalNodeID < (nodeOffset + numLocalNodes));\n\n    numBytesReadOutIndex += sizeof(uint64_t);\n\n    uint64_t localNodeID = globalNodeID - nodeOffset;\n    return EdgeIterator(outIndexBuffer[localNodeID]);\n  }\n\n  /**\n   * Get the global node id of the destination of the provided edge.\n   *\n   * @param globalEdgeID the global edge id of the edge to get the destination\n   * for (should obtain from edgeBegin/End)\n   */\n  uint64_t edgeDestination(uint64_t globalEdgeID) {\n    if (!graphLoaded) {\n      GALOIS_DIE(\"Graph hasn't been loaded yet.\");\n    }\n\n    if (numLocalEdges == 0) {\n      return 0;\n    }\n    assert(edgeOffset <= globalEdgeID);\n    assert(globalEdgeID < (edgeOffset + numLocalEdges));\n\n    numBytesReadEdgeDest += sizeof(uint32_t);\n\n    uint64_t localEdgeID = globalEdgeID - edgeOffset;\n    return edgeDestBuffer[localEdgeID];\n  }\n\n  /**\n   * Get the edge data of some edge.\n   *\n   * @param globalEdgeID the global edge id of the edge to get the data of\n   * @returns the edge data of the requested edge id\n   */\n  template <typename K = EdgeDataType,\n            typename std::enable_if<!std::is_void<K>::value>::type* = nullptr>\n  EdgeDataType edgeData(uint64_t globalEdgeID) {\n    if (!graphLoaded) {\n      GALOIS_DIE(\"Graph hasn't been loaded yet.\");\n    }\n    if (edgeDataBuffer == nullptr) {\n      GALOIS_DIE(\"Trying to get edge data when graph has no edge data.\");\n    }\n\n    if (numLocalEdges == 0) {\n      return 0;\n    }\n\n    assert(edgeOffset <= globalEdgeID);\n    assert(globalEdgeID < (edgeOffset + numLocalEdges));\n\n    numBytesReadEdgeData += sizeof(EdgeDataType);\n\n    uint64_t localEdgeID = globalEdgeID - edgeOffset;\n    return edgeDataBuffer[localEdgeID];\n  }\n\n  /**\n   * Version of above function when edge data type is void.\n   */\n  template <typename K = EdgeDataType,\n            typename std::enable_if<std::is_void<K>::value>::type* = nullptr>\n  unsigned 
edgeData(uint64_t) {\n    galois::gWarn(\"Getting edge data on graph when it doesn't exist\\n\");\n    return 0;\n  }\n\n  /**\n   * Reset reading counters.\n   */\n  void resetReadCounters() {\n    numBytesReadOutIndex.reset();\n    numBytesReadEdgeDest.reset();\n    numBytesReadEdgeData.reset();\n  }\n\n  /**\n   * Returns the total number of bytes read from this graph so far.\n   *\n   * @returns Total number of bytes read using the \"get\" functions on\n   * out indices, edge destinations, and edge data.\n   */\n  uint64_t getBytesRead() {\n    return numBytesReadOutIndex.reduce() + numBytesReadEdgeDest.reduce() +\n           numBytesReadEdgeData.reduce();\n  }\n\n  /**\n   * Free all of the in memory buffers in this object and reset graph status.\n   */\n  void resetAndFree() {\n    freeMemory();\n    resetGraphStatus();\n  }\n};\n} // namespace graphs\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/Details.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_DETAILS_H\n#define GALOIS_GRAPHS_DETAILS_H\n\n#include <algorithm>\n#include <boost/mpl/if.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/LazyObject.h\"\n#include \"galois/NoDerefIterator.h\"\n#include \"galois/Threads.h\"\n#include \"galois/runtime/Iterable.h\"\n#include \"galois/runtime/Context.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n\n// Forward declarations\n\nnamespace galois::graphs {\n\nstruct read_default_graph_tag {};\nstruct read_with_aux_graph_tag {};\nstruct read_lc_inout_graph_tag {};\nstruct read_with_aux_first_graph_tag {};\n\n} // namespace galois::graphs\n\nnamespace galois::graphs::internal {\n\ntemplate <typename, typename, typename, typename, typename>\nstruct EdgeSortReference;\n} // namespace 
galois::graphs::internal\n\nnamespace galois::graphs {\n\n//! Proxy object for internal EdgeSortReference\ntemplate <typename GraphNode, typename EdgeTy>\nclass EdgeSortValue : public StrictObject<EdgeTy> {\n  template <typename, typename, typename, typename, typename>\n  friend struct internal::EdgeSortReference;\n\n  GraphNode rawDst;\n\npublic:\n  GraphNode dst;\n  typedef StrictObject<EdgeTy> Super;\n  typedef typename Super::value_type value_type;\n\n  EdgeSortValue(GraphNode d, GraphNode rd, const value_type& v)\n      : Super(v), rawDst(rd), dst(d) {}\n\n  template <typename ER>\n  EdgeSortValue(const ER& ref) {\n    ref.initialize(*this);\n  }\n};\n\n} // namespace galois::graphs\n\nnamespace galois::graphs::internal {\n\ntemplate <bool Enable>\nclass LocalIteratorFeature {\n  typedef std::pair<uint64_t, uint64_t> Range;\n  substrate::PerThreadStorage<Range> localIterators;\n\npublic:\n  uint64_t localBegin(uint64_t numNodes) const {\n    return std::min(localIterators.getLocal()->first, numNodes);\n  }\n\n  uint64_t localEnd(uint64_t numNodes) const {\n    return std::min(localIterators.getLocal()->second, numNodes);\n  }\n\n  void setLocalRange(uint64_t begin, uint64_t end) {\n    Range& r = *localIterators.getLocal();\n    r.first  = begin;\n    r.second = end;\n  }\n};\n\ntemplate <>\nstruct LocalIteratorFeature<false> {\n  uint64_t localBegin(uint64_t numNodes) const {\n    unsigned int id  = substrate::ThreadPool::getTID();\n    unsigned int num = galois::getActiveThreads();\n    uint64_t begin   = (numNodes + num - 1) / num * id;\n    return std::min(begin, numNodes);\n  }\n\n  uint64_t localEnd(uint64_t numNodes) const {\n    unsigned int id  = substrate::ThreadPool::getTID();\n    unsigned int num = galois::getActiveThreads();\n    uint64_t end     = (numNodes + num - 1) / num * (id + 1);\n    return std::min(end, numNodes);\n  }\n\n  void setLocalRange(uint64_t, uint64_t) {}\n};\n\n//! 
Proxy object for {@link EdgeSortIterator}\ntemplate <typename GraphNode, typename EdgeIndex, typename EdgeDst,\n          typename EdgeData, typename GraphNodeConverter>\nstruct EdgeSortReference {\n  typedef typename EdgeData::raw_value_type EdgeTy;\n  EdgeIndex at;\n  EdgeDst* edgeDst;\n  EdgeData* edgeData;\n\n  EdgeSortReference(EdgeIndex x, EdgeDst* dsts, EdgeData* data)\n      : at(x), edgeDst(dsts), edgeData(data) {}\n\n  // Explicitly declare what the implicit copy constructor\n  // would do since using the implicit copy constructor\n  // from a class with a non-defaulted copy assignment\n  // operator is deprecated.\n  EdgeSortReference(EdgeSortReference const& x) {\n    at       = x.at;\n    edgeDst  = x.edgeDst;\n    edgeData = x.edgeData;\n  }\n\n  EdgeSortReference operator=(const EdgeSortValue<GraphNode, EdgeTy>& x) {\n    edgeDst->set(at, x.rawDst);\n    edgeData->set(at, x.get());\n    return *this;\n  }\n\n  EdgeSortReference operator=(const EdgeSortReference& x) {\n    edgeDst->set(at, edgeDst->at(x.at));\n    edgeData->set(at, edgeData->at(x.at));\n    return *this;\n  }\n\n  EdgeSortValue<GraphNode, EdgeTy> operator*() const {\n    return EdgeSortValue<GraphNode, EdgeTy>(\n        GraphNodeConverter()(edgeDst->at(at)), edgeDst->at(at),\n        edgeData->at(at));\n  }\n\n  void initialize(EdgeSortValue<GraphNode, EdgeTy>& value) const {\n    value = *(*this);\n  }\n};\n\n/**\n * Converts comparison functions over EdgeTy to be over {@link EdgeSortValue}.\n */\ntemplate <typename EdgeSortValueTy, typename CompTy>\nstruct EdgeSortCompWrapper {\n  const CompTy& comp;\n\n  EdgeSortCompWrapper(const CompTy& c) : comp(c) {}\n  bool operator()(const EdgeSortValueTy& a, const EdgeSortValueTy& b) const {\n    return comp(a.get(), b.get());\n  }\n};\n\nstruct Identity {\n  template <typename T>\n  T operator()(const T& x) const {\n    return x;\n  }\n};\n\n/**\n * Iterator to facilitate sorting of CSR-like graphs. 
Converts random access\n * operations on iterator to appropriate computations on edge destinations and\n * edge data.\n *\n * @tparam GraphNode Graph node pointer\n * @tparam EdgeIndex Integer-like value that is passed to EdgeDst and EdgeData\n * @tparam EdgeDst {@link LargeArray}-like container of edge destinations\n * @tparam EdgeData {@link LargeArray}-like container of edge data\n * @tparam GraphNodeConverter A functor to apply when returning values of\n *   EdgeDst when dereferencing this iterator; assignment uses untransformed\n *   EdgeDst values\n */\ntemplate <typename GraphNode, typename EdgeIndex, typename EdgeDst,\n          typename EdgeData, typename GraphNodeConverter = Identity>\nclass EdgeSortIterator\n    : public boost::iterator_facade<\n          EdgeSortIterator<GraphNode, EdgeIndex, EdgeDst, EdgeData,\n                           GraphNodeConverter>,\n          EdgeSortValue<GraphNode, typename EdgeData::raw_value_type>,\n          boost::random_access_traversal_tag,\n          EdgeSortReference<GraphNode, EdgeIndex, EdgeDst, EdgeData,\n                            GraphNodeConverter>> {\n  typedef EdgeSortIterator<GraphNode, EdgeIndex, EdgeDst, EdgeData,\n                           GraphNodeConverter>\n      Self;\n  typedef EdgeSortReference<GraphNode, EdgeIndex, EdgeDst, EdgeData,\n                            GraphNodeConverter>\n      Reference;\n\n  EdgeIndex at;\n  EdgeDst* edgeDst;\n  EdgeData* edgeData;\n\npublic:\n  EdgeSortIterator() : at(0) {}\n  EdgeSortIterator(EdgeIndex x, EdgeDst* dsts, EdgeData* data)\n      : at(x), edgeDst(dsts), edgeData(data) {}\n\nprivate:\n  friend class boost::iterator_core_access;\n\n  bool equal(const Self& other) const { return at == other.at; }\n  Reference dereference() const { return Reference(at, edgeDst, edgeData); }\n  ptrdiff_t distance_to(const Self& other) const {\n    return other.at - (ptrdiff_t)at;\n  }\n  void increment() { ++at; }\n  void decrement() { --at; }\n  void advance(ptrdiff_t n) 
{ at += n; }\n};\n\ntemplate <typename IdTy>\nclass IntrusiveId {\n  IdTy id;\n\npublic:\n  IdTy& getId() { return id; }\n  void setId(size_t n) { id = n; }\n};\n\ntemplate <>\nclass IntrusiveId<void> {\npublic:\n  char getId() { return 0; }\n  void setId(size_t) {}\n};\n\n//! Empty class for HasLockable optimization\nclass NoLockable {};\n\n//! Separate types from definitions to allow incomplete types as NodeTy\ntemplate <typename NodeTy, bool HasLockable>\nstruct NodeInfoBaseTypes {\n  typedef NodeTy& reference;\n};\n\ntemplate <bool HasLockable>\nstruct NodeInfoBaseTypes<void, HasLockable> {\n  typedef void* reference;\n};\n\n//! Specializations for void node data\ntemplate <typename NodeTy, bool HasLockable>\nclass NodeInfoBase\n    : public boost::mpl::if_c<HasLockable, galois::runtime::Lockable,\n                              NoLockable>::type,\n      public NodeInfoBaseTypes<NodeTy, HasLockable> {\n  NodeTy data;\n\npublic:\n  template <typename... Args>\n  NodeInfoBase(Args&&... args) : data(std::forward<Args>(args)...) 
{}\n\n  typename NodeInfoBase::reference getData() { return data; }\n};\n\ntemplate <bool HasLockable>\nstruct NodeInfoBase<void, HasLockable>\n    : public boost::mpl::if_c<HasLockable, galois::runtime::Lockable,\n                              NoLockable>::type,\n      public NodeInfoBaseTypes<void, HasLockable> {\n  typename NodeInfoBase::reference getData() { return 0; }\n};\n\ntemplate <bool Enable>\nclass OutOfLineLockableFeature {\n  typedef NodeInfoBase<void, true> OutOfLineLock;\n  LargeArray<OutOfLineLock> outOfLineLocks;\n\npublic:\n  struct size_of_out_of_line {\n    static const size_t value = sizeof(OutOfLineLock);\n  };\n\n  void outOfLineAcquire(size_t n, MethodFlag mflag) {\n    galois::runtime::acquire(&outOfLineLocks[n], mflag);\n  }\n  void outOfLineAllocateLocal(size_t numNodes) {\n    outOfLineLocks.allocateLocal(numNodes);\n  }\n  void outOfLineAllocateInterleaved(size_t numNodes) {\n    outOfLineLocks.allocateInterleaved(numNodes);\n  }\n  void outOfLineAllocateBlocked(size_t numNodes) {\n    outOfLineLocks.allocateBlocked(numNodes);\n  }\n  void outOfLineAllocateFloating(size_t numNodes) {\n    outOfLineLocks.allocateFloating(numNodes);\n  }\n\n  template <typename RangeArrayType>\n  void outOfLineAllocateSpecified(size_t n, RangeArrayType threadRanges) {\n    outOfLineLocks.allocateSpecified(n, threadRanges);\n  }\n\n  void outOfLineConstructAt(size_t n) { outOfLineLocks.constructAt(n); }\n};\n\ntemplate <>\nclass OutOfLineLockableFeature<false> {\npublic:\n  struct size_of_out_of_line {\n    static const size_t value = 0;\n  };\n  void outOfLineAcquire(size_t, MethodFlag) {}\n  void outOfLineAllocateLocal(size_t) {}\n  void outOfLineAllocateInterleaved(size_t) {}\n  void outOfLineAllocateBlocked(size_t) {}\n  void outOfLineAllocateFloating(size_t) {}\n  void outOfLineConstructAt(size_t) {}\n  template <typename RangeArrayType>\n  void outOfLineAllocateSpecified(size_t, RangeArrayType) {}\n};\n\n//! 
Edge specialization for void edge data\ntemplate <typename NodeInfoPtrTy, typename EdgeTy>\nstruct EdgeInfoBase : public LazyObject<EdgeTy> {\n  NodeInfoPtrTy dst;\n};\n\n/**\n * Convenience wrapper around Graph.edge_begin and Graph.edge_end to allow\n * C++11 foreach iteration of edges.\n */\ntemplate <typename GraphTy>\nclass EdgesIterator {\n  typename GraphTy::edge_iterator ii, ee;\n\npublic:\n  typedef NoDerefIterator<typename GraphTy::edge_iterator> iterator;\n\n  EdgesIterator(GraphTy& g, typename GraphTy::GraphNode n, MethodFlag f)\n      : ii(g.edge_begin(n, f)), ee(g.edge_end(n, f)) {}\n  EdgesIterator(typename GraphTy::edge_iterator _ii,\n                typename GraphTy::edge_iterator _ee)\n      : ii(_ii), ee(_ee) {}\n\n  iterator begin() { return make_no_deref_iterator(ii); }\n  iterator end() { return make_no_deref_iterator(ee); }\n};\n\ntemplate <typename ItTy>\nruntime::iterable<NoDerefIterator<ItTy>> make_no_deref_range(ItTy ii, ItTy ee) {\n  return runtime::make_iterable(make_no_deref_iterator(ii),\n                                make_no_deref_iterator(ee));\n}\n\n/**\n * Convenience wrapper around Graph.in_edge_begin and Graph.in_edge_end to allow\n * C++11 foreach iteration of in edges.\n */\ntemplate <typename GraphTy>\nclass InEdgesIterator {\n  GraphTy& g;\n  typename GraphTy::GraphNode n;\n  MethodFlag flag;\n\npublic:\n  typedef NoDerefIterator<typename GraphTy::in_edge_iterator> iterator;\n\n  InEdgesIterator(GraphTy& g, typename GraphTy::GraphNode n, MethodFlag f)\n      : g(g), n(n), flag(f) {}\n\n  iterator begin() { return make_no_deref_iterator(g.in_edge_begin(n, flag)); }\n  iterator end() { return make_no_deref_iterator(g.in_edge_end(n, flag)); }\n};\n\ntemplate <typename GraphTy>\nclass EdgesWithNoFlagIterator {\n  GraphTy& g;\n  typename GraphTy::GraphNode n;\n\npublic:\n  typedef NoDerefIterator<typename GraphTy::edge_iterator> iterator;\n\n  EdgesWithNoFlagIterator(GraphTy& g, typename GraphTy::GraphNode n)\n      : g(g), n(n) 
{}\n\n  iterator begin() { return make_no_deref_iterator(g.edge_begin(n)); }\n  iterator end() { return make_no_deref_iterator(g.edge_end(n)); }\n};\n\ntemplate <typename A, typename B, typename C, typename D, typename E>\nvoid swap(EdgeSortReference<A, B, C, D, E> a,\n          EdgeSortReference<A, B, C, D, E> b) {\n  auto aa = *a;\n  auto bb = *b;\n  a       = bb;\n  b       = aa;\n}\n\n} // namespace galois::graphs::internal\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/FileGraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file FileGraph.h\n *\n * Contains FileGraph and FileGraphWriter class declarations.\n *\n * @todo finish up doxygen\n */\n\n#ifndef GALOIS_GRAPHS_FILEGRAPH_H\n#define GALOIS_GRAPHS_FILEGRAPH_H\n\n#include <cstring>\n#include <deque>\n#include <type_traits>\n#include <vector>\n\n#include <boost/iterator/counting_iterator.hpp>\n#include <boost/iterator/transform_iterator.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/Endian.h\"\n#include \"galois/MethodFlags.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/graphs/GraphHelpers.h\"\n#include \"galois/runtime/Context.h\"\n#include \"galois/substrate/CacheLineStorage.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n#include \"galois/substrate/NumaMem.h\"\n#include \"galois/Reduction.h\"\n\nnamespace 
galois {\nnamespace graphs {\n\n// XXX(ddn): Refactor to eliminate OCFileGraph\n\n//! Graph that mmaps Galois gr files for access\nclass FileGraph {\npublic:\n  //! type of a node\n  using GraphNode = uint64_t;\n\nprivate:\n  struct Convert32 {\n    uint32_t operator()(uint32_t x) const { return convert_le32toh(x); }\n  };\n\n  struct Convert64 {\n    uint64_t operator()(uint64_t x) const { return convert_le64toh(x); }\n  };\n\n  struct mapping {\n    void* ptr;\n    size_t len;\n  };\n\nprotected:\n  std::deque<mapping> mappings;\n  std::deque<int> fds;\n\n  //! The size of edge data (on 1 edge)\n  uint64_t sizeofEdge;\n  //! Number of nodes in this (sub)graph\n  uint64_t numNodes;\n  //! Number of edges in this (sub)graph\n  uint64_t numEdges;\n\n  //! Array specifying where a node's edges begin in memory\n  uint64_t* outIdx;\n  //! Array storing outgoing edge destinations\n  void* outs;\n  //! Array storing edge data (if it exists)\n  char* edgeData;\n\n  //! Galois gr version of read in graph\n  int graphVersion;\n\n  //! adjustments to node index when we load only part of a graph\n  uint64_t nodeOffset;\n  //! adjustments to edge index when we load only part of a graph\n  uint64_t edgeOffset;\n\nprivate:\n  //! If initialized, this array stores node degrees in memory for fast access\n  //! 
via the getDegree function\n  LargeArray<uint64_t> node_degrees;\n\n  // graph reading speed variables\n  galois::GAccumulator<uint64_t> numBytesReadIndex, numBytesReadEdgeDst,\n      numBytesReadEdgeData;\n\n  /**\n   * Construct a file graph by moving in structures from the passed in file\n   * graph.\n   */\n  void move_assign(FileGraph&&);\n  /**\n   * Get the local edge id of the edge with a specific source and destination\n   * if it exists.\n   *\n   * @param src Global source id of edge to find\n   * @param dst Global destination id of edge to find\n   * @returns the local edge id of the edge (src, dst) if it exists, otherwise\n   * return ~0\n   */\n  uint64_t getEdgeIdx(GraphNode src, GraphNode dst);\n\n  /**\n   * Gets a pointer to the first neighbor of node N.\n   *\n   * @param N global node id of neighbor begin to get\n   * @returns pointer to global id of first neighbor of node N\n   */\n  void* raw_neighbor_begin(GraphNode N);\n  /**\n   * Gets a pointer to the end of node N's neighbors in the edge destination\n   * array.\n   *\n   * @param N global node id of neighbor end to get\n   * @returns pointer to end of node N's neighbors in edge destination array.\n   */\n  void* raw_neighbor_end(GraphNode N);\n\n  /**\n   * Given an mmap'd version of the graph, initialize graph from that block of\n   * memory\n   */\n  void fromMem(void* m, uint64_t nodeOffset, uint64_t edgeOffset, uint64_t);\n\n  /**\n   * Loads a graph from another file graph\n   *\n   * @param g FileGraph to load from\n   * @param sizeofEdgeData Size of edge data (for 1 edge)\n   *\n   * @returns Pointer to the edge data array of the newly loaded file graph\n   */\n  void* fromGraph(FileGraph& g, size_t sizeofEdgeData);\n\n  /**\n   * Finds the first node N such that\n   *\n   *  N * nodeSize +\n   *  (sum_{i=0}^{N-1} E[i]) * edgeSize\n   *    >=\n   *  targetSize\n   *\n   *  in range [lb, ub). 
Returns ub if unsuccessful.\n   *\n   * @param nodeSize Weight of nodes\n   * @param edgeSize Weight of edges\n   * @param targetSize Size that returned node id should attempt to hit\n   * @param lb Lower bound of nodes to consider\n   * @param ub Upper bound of nodes to consider\n   *\n   * @returns A node id that hits the target size (or gets close to it)\n   */\n  size_t findIndex(size_t nodeSize, size_t edgeSize, size_t targetSize,\n                   size_t lb, size_t ub);\n\n  void fromFileInterleaved(const std::string& filename, size_t sizeofEdgeData);\n\n  /**\n   * Page in a portion of the loaded graph data based on division of labor\n   * by nodes.\n   *\n   * @param id ID of unit of thread/unit of execution that will page in pages\n   * @param total Total number of threads/units of execution to split page in\n   * work among\n   * @param sizeofEdgeData Size of the loaded edge data\n   */\n  void pageInByNode(size_t id, size_t total, size_t sizeofEdgeData);\n\n  /**\n   * Copies graph connectivity information from arrays. 
Returns a pointer to\n   * array to populate with edge data.\n   *\n   * @param outIdx Out index information in an array\n   * @param numNodes number of nodes\n   * @param outs edge destination array\n   * @param numEdges number of edges\n   * @param edgeData array of edge data\n   * @param sizeofEdgeData The size of the edge data\n   * @param nodeOffset how many nodes from the beginning will this graph start\n   * from\n   * @param edgeOffset how many edges from the beginning will this edge start\n   * from\n   * @param converted whether values in arrays are in host byte ordering\n   * (false) or in FileGraph byte ordering (true)\n   * @param oGraphVersion Galois graph version to use\n   * @return pointer to beginning of edgeData in graph\n   */\n  void* fromArrays(uint64_t* outIdx, uint64_t numNodes, void* outs,\n                   uint64_t numEdges, char* edgeData, size_t sizeofEdgeData,\n                   uint64_t nodeOffset, uint64_t edgeOffset, bool converted,\n                   int oGraphVersion = 1);\n\npublic:\n  /**\n   * Reset the num bytes counters\n   */\n  void reset_byte_counters() {\n    numBytesReadEdgeDst.reset();\n    numBytesReadIndex.reset();\n    numBytesReadEdgeData.reset();\n  }\n\n  /**\n   * Return all bytes read\n   */\n  uint64_t num_bytes_read() {\n    return numBytesReadEdgeDst.reduce() + numBytesReadEdgeData.reduce() +\n           numBytesReadIndex.reduce();\n  }\n\n  // Node Handling\n\n  //! Checks if a node is in the graph (already added)\n  bool containsNode(const GraphNode n) const {\n    return n + nodeOffset < numNodes;\n  }\n\n  // Edge Handling\n\n  //! Get edge data of an edge between 2 nodes\n  template <typename EdgeTy>\n  EdgeTy& getEdgeData(GraphNode src, GraphNode dst) {\n    assert(sizeofEdge == sizeof(EdgeTy));\n    numBytesReadEdgeData += sizeof(EdgeTy);\n    return reinterpret_cast<EdgeTy*>(edgeData)[getEdgeIdx(src, dst)];\n  }\n\n  //! 
Edge iterators (boost iterator)\n  using edge_iterator = boost::counting_iterator<uint64_t>;\n\n  /**\n   * Returns the index to the beginning of global node N's outgoing edges\n   * in the outgoing edges array.\n   *\n   * @param N global node id of edge begin to get\n   * @returns Iterator to first edge of node N\n   */\n  edge_iterator edge_begin(GraphNode N);\n  /**\n   * Returns the index to the end of global node N's outgoing edges\n   * in the outgoing edges array.\n   *\n   * @param N global node id of edge end to get\n   * @returns Iterator to end of node N's edges\n   */\n  edge_iterator edge_end(GraphNode N);\n\n  /**\n   * Returns the edges of node N as a range that can be iterated through\n   * by C++ foreach.\n   */\n  runtime::iterable<NoDerefIterator<edge_iterator>> edges(GraphNode N) {\n    return internal::make_no_deref_range(edge_begin(N), edge_end(N));\n  }\n\n  /**\n   * Returns the edges of node N as a range that can be iterated through\n   * by C++ foreach.\n   */\n  runtime::iterable<NoDerefIterator<edge_iterator>> out_edges(GraphNode N) {\n    return edges(N);\n  }\n\n  /**\n   * Sorts outgoing edges of a node. 
Comparison function is over EdgeTy.\n   */\n  template <typename EdgeTy, typename CompTy>\n  void sortEdgesByEdgeData(GraphNode N,\n                           const CompTy& comp = std::less<EdgeTy>()) {\n    if (graphVersion == 1) {\n      typedef LargeArray<uint32_t> EdgeDst;\n      typedef LargeArray<EdgeTy> EdgeData;\n\n      typedef internal::EdgeSortIterator<GraphNode, uint64_t, EdgeDst, EdgeData,\n                                         Convert32>\n          edge_sort_iterator;\n\n      EdgeDst edgeDst(outs, numEdges);\n      EdgeData ed(edgeData, numEdges);\n\n      edge_sort_iterator begin(\n          std::distance((uint32_t*)outs, (uint32_t*)raw_neighbor_begin(N)),\n          &edgeDst, &ed);\n      edge_sort_iterator end(\n          std::distance((uint32_t*)outs, (uint32_t*)raw_neighbor_end(N)),\n          &edgeDst, &ed);\n      std::sort(begin, end,\n                internal::EdgeSortCompWrapper<EdgeSortValue<GraphNode, EdgeTy>,\n                                              CompTy>(comp));\n    } else if (graphVersion == 2) {\n      typedef LargeArray<uint64_t> EdgeDst;\n      typedef LargeArray<EdgeTy> EdgeData;\n\n      typedef internal::EdgeSortIterator<GraphNode, uint64_t, EdgeDst, EdgeData,\n                                         Convert64>\n          edge_sort_iterator;\n\n      EdgeDst edgeDst(outs, numEdges);\n      EdgeData ed(edgeData, numEdges);\n\n      edge_sort_iterator begin(\n          std::distance((uint64_t*)outs, (uint64_t*)raw_neighbor_begin(N)),\n          &edgeDst, &ed);\n      edge_sort_iterator end(\n          std::distance((uint64_t*)outs, (uint64_t*)raw_neighbor_end(N)),\n          &edgeDst, &ed);\n      std::sort(begin, end,\n                internal::EdgeSortCompWrapper<EdgeSortValue<GraphNode, EdgeTy>,\n                                              CompTy>(comp));\n    } else {\n      GALOIS_DIE(\"unknown file version: \", graphVersion);\n    }\n  }\n\n  /**\n   * Sorts outgoing edges of a node.\n   * Comparison function 
is over <code>EdgeSortValue<EdgeTy></code>.\n   */\n  template <typename EdgeTy, typename CompTy>\n  void sortEdges(GraphNode N, const CompTy& comp) {\n    if (graphVersion == 1) {\n      typedef LargeArray<uint32_t> EdgeDst;\n      typedef LargeArray<EdgeTy> EdgeData;\n      typedef internal::EdgeSortIterator<GraphNode, uint64_t, EdgeDst, EdgeData,\n                                         Convert32>\n          edge_sort_iterator;\n\n      EdgeDst edgeDst(outs, numEdges);\n      EdgeData ed(edgeData, numEdges);\n\n      edge_sort_iterator begin(\n          std::distance((uint32_t*)outs, (uint32_t*)raw_neighbor_begin(N)),\n          &edgeDst, &ed);\n      edge_sort_iterator end(\n          std::distance((uint32_t*)outs, (uint32_t*)raw_neighbor_end(N)),\n          &edgeDst, &ed);\n      std::sort(begin, end, comp);\n    } else if (graphVersion == 2) {\n      typedef LargeArray<uint64_t> EdgeDst;\n      typedef LargeArray<EdgeTy> EdgeData;\n      typedef internal::EdgeSortIterator<GraphNode, uint64_t, EdgeDst, EdgeData,\n                                         Convert64>\n          edge_sort_iterator;\n\n      EdgeDst edgeDst(outs, numEdges);\n      EdgeData ed(edgeData, numEdges);\n\n      edge_sort_iterator begin(\n          std::distance((uint64_t*)outs, (uint64_t*)raw_neighbor_begin(N)),\n          &edgeDst, &ed);\n      edge_sort_iterator end(\n          std::distance((uint64_t*)outs, (uint64_t*)raw_neighbor_end(N)),\n          &edgeDst, &ed);\n      std::sort(begin, end, comp);\n    } else {\n      GALOIS_DIE(\"unknown file version: \", graphVersion);\n    }\n  }\n\n  // template<typename EdgeTy>\n  // const EdgeTy& getEdgeData(edge_iterator it) const {\n  //  assert(edgeData);\n  //  return reinterpret_cast<const EdgeTy*>(edgeData)[*it];\n  //}\n\n  //! 
Get edge data given an edge iterator\n  template <typename EdgeTy>\n  EdgeTy& getEdgeData(edge_iterator it) {\n    assert(edgeData);\n    numBytesReadEdgeData += sizeof(EdgeTy);\n    return reinterpret_cast<EdgeTy*>(edgeData)[*it];\n  }\n\n  /**\n   * Gets the destination of some edge.\n   *\n   * @param it local edge id of edge destination to get\n   * @returns a global node id representing the destination of the edge\n   */\n  GraphNode getEdgeDst(edge_iterator it);\n\n  //! iterator over neighbors\n  typedef boost::transform_iterator<Convert32, uint32_t*> neighbor_iterator;\n  //! iterator over node ids\n  typedef boost::transform_iterator<Convert32, uint32_t*> node_id_iterator;\n  //! edge iterator\n  typedef boost::transform_iterator<Convert64, uint64_t*> edge_id_iterator;\n  //! uint64 boost counting iterator\n  typedef boost::counting_iterator<uint64_t> iterator;\n\n  /**\n   * Gets an iterator to the first neighbor of node N\n   *\n   * @warning only version 1 support, do not use with version 2\n   */\n  neighbor_iterator neighbor_begin(GraphNode N) {\n    return boost::make_transform_iterator((uint32_t*)raw_neighbor_begin(N),\n                                          Convert32());\n  }\n\n  /**\n   * Gets an iterator to the end of node N's neighbors\n   *\n   * @warning only version 1 support, do not use with version 2\n   */\n  neighbor_iterator neighbor_end(GraphNode N) {\n    return boost::make_transform_iterator((uint32_t*)raw_neighbor_end(N),\n                                          Convert32());\n  }\n\n  template <typename EdgeTy>\n  EdgeTy* edge_data_begin() const {\n    assert(edgeData);\n    return reinterpret_cast<EdgeTy*>(edgeData);\n  }\n\n  template <typename EdgeTy>\n  EdgeTy* edge_data_end() const {\n    assert(edgeData);\n    assert(sizeof(EdgeTy) == sizeofEdge);\n    EdgeTy* r = reinterpret_cast<EdgeTy*>(edgeData);\n    return &r[numEdges];\n  }\n\n  //! Calculates node degrees and saves them to a class variable for\n  //! 
access by getDegree.\n  void initNodeDegrees();\n\n  /**\n   * Gets the degree of a particular node. Assumes that initNodeDegrees has\n   * been called to initialize the array of degrees; if not, it is very likely\n   * going to segfault as it will attempt to access uninitialized memory.\n   *\n   * @param node_id ID of a node to get the degree of.\n   * @returns Degree of a specified node\n   */\n  uint64_t getDegree(uint32_t node_id) const;\n\n  /**\n   * Gets the first node of the loaded graph.\n   *\n   * @returns An iterator to the first node of the graph. Note it is a GLOBAL\n   * id.\n   */\n  iterator begin() const;\n  /**\n   * Gets the end of the nodes of the loaded graph.\n   *\n   * @returns An iterator to the end of the nodes of the graph (of the\n   * loaded part of the graph).\n   */\n  iterator end() const;\n\n  //! pair specifying a node range\n  typedef std::pair<iterator, iterator> NodeRange;\n  //! pair specifying an edge range\n  typedef std::pair<edge_iterator, edge_iterator> EdgeRange;\n  //! pair of a NodeRange and an EdgeRange\n  typedef std::pair<NodeRange, EdgeRange> GraphRange;\n\n  /**\n   * Given a division and a total number of divisions, return a range for that\n   * particular division to work on. (i.e. 
this divides labor among divisions\n   * depending on how much weight is given to nodes/edges).\n   *\n   * @param nodeSize Weight of nodes\n   * @param edgeSize Weight of edges\n   * @param id Division id\n   * @param total Total number of divisions\n   *\n   * @returns A node range and an edge range specifying division \"id\"'s assigned\n   * nodes/edges\n   */\n  GraphRange divideByNode(size_t nodeSize, size_t edgeSize, size_t id,\n                          size_t total);\n\n  /**\n   * Divides nodes only considering edges.\n   *\n   * IMPORTANT: Note that it may potentially not return all nodes in the graph\n   * (it will return up to the last node with edges).\n   *\n   * @param nodeSize Weight of nodes\n   * @param edgeSize Weight of edges\n   * @param id Division id\n   * @param total Total number of divisions\n   *\n   * @returns A node range and an edge range specifying division \"id\"'s assigned\n   * nodes/edges\n   */\n  GraphRange divideByEdge(size_t nodeSize, size_t edgeSize, size_t id,\n                          size_t total);\n  /**\n   * Returns an iterator to the beginning of the node destination\n   * array.\n   *\n   * @returns iterator to beginning of the node destination array of the\n   * loaded graph (local)\n   * @todo implement version 2 support\n   */\n  node_id_iterator node_id_begin() const;\n  /**\n   * Returns an iterator to the end of the node destination\n   * array.\n   *\n   * @returns iterator to end of the node destination array of the loaded\n   * graph (local)\n   * @todo implement version 2 support\n   */\n  node_id_iterator node_id_end() const;\n  /**\n   * Returns an iterator to the beginning of the array specifying\n   * the index into the destination array where a particular node's\n   * edges begin.\n   *\n   * @return iterator to beginning of edge index array of the loaded graph\n   */\n  edge_id_iterator edge_id_begin() const;\n  /**\n   * Returns an iterator to the end of the array specifying\n   * the index into the 
destination array where a particular node's\n   * edges begin.\n   *\n   * @return iterator to end of edge index array of the loaded graph\n   */\n  edge_id_iterator edge_id_end() const;\n\n  /**\n   * Determines if an edge with source N1 and destination N2 existed\n   * in the currently loaded (local) graph.\n   *\n   * @param N1 global node id of neighbor 1 (source)\n   * @param N2 global node id of neighbor 2 (destination)\n   *\n   * @returns true if edge (N1, N2) exists locally, false otherwise\n   */\n  bool hasNeighbor(GraphNode N1, GraphNode N2);\n\n  //! Returns the number of nodes in the (sub)graph\n  size_t size() const { return numNodes; }\n\n  //! Returns the number of edges in the (sub)graph\n  size_t sizeEdges() const { return numEdges; }\n\n  //! Returns the size of an edge\n  size_t edgeSize() const { return sizeofEdge; }\n\n  /**\n   * Default file graph constructor which initializes fields to null values.\n   */\n  FileGraph();\n\n  /**\n   * Construct graph from another FileGraph\n   *\n   * @param o Other filegraph to initialize from.\n   */\n  FileGraph(const FileGraph&);\n  /**\n   * Copy assignment operator for FileGraph\n   */\n  FileGraph& operator=(const FileGraph&);\n  /**\n   * Move constructor for FileGraph\n   */\n  FileGraph(FileGraph&&);\n  /**\n   * Move assignment operator for FileGraph\n   */\n  FileGraph& operator=(FileGraph&&);\n  /**\n   * Destructor. Un-mmaps mmap'd things and closes opened files.\n   */\n  ~FileGraph();\n\n  /**\n   * Given a file name, mmap the entire file into memory. Should\n   * be a graph with some specific layout.\n   *\n   * @param filename Graph file to load\n   */\n  void fromFile(const std::string& filename);\n\n  /**\n   * Loads/mmaps particular portions of a graph corresponding to a node\n   * range and edge range into memory.\n   *\n   * Note that it makes the object work on a LOCAL scale (i.e. there are\n   * now local ids corresponding to the subgraph). 
Most functions will\n   * still handle global ids, though. (see below)\n   *\n   * @param filename File to load\n   * @param nrange Node range to load\n   * @param erange Edge range to load\n   * @param numaMap if true, does interleaved numa allocation for data\n   * structures\n   */\n  void partFromFile(const std::string& filename, NodeRange nrange,\n                    EdgeRange erange, bool numaMap = false);\n\n  /**\n   * Reads graph connectivity information from file. Tries to balance memory\n   * evenly across system.  Cannot be called during parallel execution.\n   *\n   * Edge data version.\n   */\n  template <typename EdgeTy>\n  void fromFileInterleaved(\n      const std::string& filename,\n      typename std::enable_if<!std::is_void<EdgeTy>::value>::type* = 0) {\n    fromFileInterleaved(filename, sizeof(EdgeTy));\n  }\n\n  /**\n   * Reads graph connectivity information from file. Tries to balance memory\n   * evenly across system.  Cannot be called during parallel execution.\n   *\n   * No edge data version.\n   */\n  template <typename EdgeTy>\n  void fromFileInterleaved(\n      const std::string& filename,\n      typename std::enable_if<std::is_void<EdgeTy>::value>::type* = 0) {\n    fromFileInterleaved(filename, 0);\n  }\n\n  /**\n   * Reads graph connectivity information from graph but not edge data. 
Returns\n   * a pointer to array to populate with edge data.\n   */\n  template <typename T>\n  T* fromGraph(FileGraph& g) {\n    return reinterpret_cast<T*>(fromGraph(g, sizeof(T)));\n  }\n\n  /**\n   * Write current contents of mappings to a file\n   *\n   * @param file File to write to\n   * @todo perform host -> le on data\n   */\n  void toFile(const std::string& file);\n};\n\n/**\n * Simplifies writing graphs.\n *\n * Write your file in rounds:\n * <ol>\n *  <li>setNumNodes(), setNumEdges<EdgeTy>()</li>\n *  <li>phase1(), for each node, incrementDegree(Node x)</li>\n *  <li>phase2(), add neighbors for each node, addNeighbor(Node src, Node dst),\n *    or add neighbors and corresponding data, addNeighbor<EdgeTy>(Node src,\n *    Node dst, EdgeTy data)</li>\n *  <li>finish(), use as FileGraph</li>\n * </ol>\n */\nclass FileGraphWriter : public FileGraph {\n  std::unique_ptr<uint64_t[]> starts;\n\npublic:\n  //! Set number of nodes to write to n\n  //! @param n number of nodes to set to\n  void setNumNodes(size_t n) { numNodes = n; }\n  //! Set number of edges to write to n\n  //! @tparam EdgeTy edge data type\n  //! @param n number of edges to set to\n  template <typename EdgeTy, typename std::enable_if<\n                                 std::is_void<EdgeTy>::value>::type* = nullptr>\n  void setNumEdges(size_t n) {\n    numEdges   = n;\n    sizeofEdge = 0;\n  }\n  template <typename EdgeTy, typename std::enable_if<\n                                 !std::is_void<EdgeTy>::value>::type* = nullptr>\n  void setNumEdges(size_t n) {\n    numEdges   = n;\n    sizeofEdge = sizeof(EdgeTy);\n  }\n\n  //! Marks the transition to next phase of parsing: counting the degree of\n  //! nodes\n  void phase1();\n\n  //! Increments degree of id by delta\n  void incrementDegree(size_t id, uint64_t delta = 1) {\n    assert(id < numNodes);\n    outIdx[id] += delta;\n  }\n\n  //! Marks the transition to next phase of parsing, adding edges\n  void phase2();\n\n  //! 
Adds a neighbor between src and dst\n  size_t addNeighbor(size_t src, size_t dst) {\n    size_t base = src ? outIdx[src - 1] : 0;\n    size_t idx  = base + starts[src]++;\n    assert(idx < outIdx[src]);\n\n    if (numNodes <= std::numeric_limits<uint32_t>::max())\n      reinterpret_cast<uint32_t*>(outs)[idx] = dst; // version 1\n    else\n      reinterpret_cast<uint64_t*>(outs)[idx] = dst; // version 2\n    return idx;\n  }\n\n  //! Adds a neighbor between src and dst w/ corresponding data\n  template <typename T>\n  size_t addNeighbor(\n      size_t src, size_t dst,\n      const typename std::enable_if<!std::is_void<T>::value, T>::type& data) {\n    assert(edgeData);\n    size_t idx                          = addNeighbor(src, dst);\n    reinterpret_cast<T*>(edgeData)[idx] = data;\n    return idx;\n  }\n\n  /**\n   * Finish making graph.\n   */\n  void finish() { starts.reset(nullptr); } // free reserved memory asap\n\n  /**\n   * Finish making graph. Returns pointer to block of memory that should be\n   * used to store edge data.\n   */\n  template <typename T>\n  T* finish() {\n    starts.reset(nullptr); // free reserved memory asap\n    return reinterpret_cast<T*>(edgeData);\n  }\n};\n\n/**\n * Adds reverse edges to a graph. Reverse edges have edge data copied from the\n * original edge. The new graph is placed in the out parameter.  
The previous\n * out is destroyed.\n */\ntemplate <typename EdgeTy>\nvoid makeSymmetric(FileGraph& in_graph, FileGraph& out) {\n  typedef FileGraph::GraphNode GNode;\n\n  FileGraphWriter g;\n\n  size_t numEdges = 0;\n\n  for (FileGraph::iterator ii = in_graph.begin(), ei = in_graph.end(); ii != ei;\n       ++ii) {\n    GNode src = *ii;\n    for (FileGraph::edge_iterator jj = in_graph.edge_begin(src),\n                                  ej = in_graph.edge_end(src);\n         jj != ej; ++jj) {\n      GNode dst = in_graph.getEdgeDst(jj);\n      numEdges += 1;\n      if (src != dst)\n        numEdges += 1;\n    }\n  }\n\n  g.setNumNodes(in_graph.size());\n  g.setNumEdges<EdgeTy>(numEdges);\n\n  g.phase1();\n  for (FileGraph::iterator ii = in_graph.begin(), ei = in_graph.end(); ii != ei;\n       ++ii) {\n    GNode src = *ii;\n    for (FileGraph::edge_iterator jj = in_graph.edge_begin(src),\n                                  ej = in_graph.edge_end(src);\n         jj != ej; ++jj) {\n      GNode dst = in_graph.getEdgeDst(jj);\n      g.incrementDegree(src);\n      if (src != dst)\n        g.incrementDegree(dst);\n    }\n  }\n\n  g.phase2();\n  for (FileGraph::iterator ii = in_graph.begin(), ei = in_graph.end(); ii != ei;\n       ++ii) {\n    GNode src = *ii;\n    for (FileGraph::edge_iterator jj = in_graph.edge_begin(src),\n                                  ej = in_graph.edge_end(src);\n         jj != ej; ++jj) {\n      GNode dst = in_graph.getEdgeDst(jj);\n      if constexpr (std::is_void<EdgeTy>::value) {\n        g.addNeighbor(src, dst);\n        if (src != dst)\n          g.addNeighbor(dst, src);\n      } else {\n        EdgeTy& data = in_graph.getEdgeData<EdgeTy>(jj);\n        g.addNeighbor<EdgeTy>(src, dst, data);\n        if (src != dst)\n          g.addNeighbor<EdgeTy>(dst, src, data);\n      }\n    }\n  }\n\n  g.finish();\n\n  out = std::move(g);\n}\n\n/**\n * Permutes a graph.\n *\n * Permutation array, P, conforms to: P[i] = j where i is a node index from the\n * 
original graph and j is a node index in the permuted graph. New, permuted\n * graph is placed in the out parameter. The previous out is destroyed.\n *\n * @param in_graph original graph\n * @param p permutation array\n * @param out permuted graph\n */\ntemplate <typename EdgeTy, typename PTy>\nvoid permute(FileGraph& in_graph, const PTy& p, FileGraph& out) {\n  typedef FileGraph::GraphNode GNode;\n\n  FileGraphWriter g;\n\n  size_t numEdges = in_graph.sizeEdges();\n  g.setNumNodes(in_graph.size());\n  g.setNumEdges<EdgeTy>(numEdges);\n\n  g.phase1();\n  for (FileGraph::iterator ii = in_graph.begin(), ei = in_graph.end(); ii != ei;\n       ++ii) {\n    GNode src = *ii;\n    for (FileGraph::edge_iterator jj = in_graph.edge_begin(src),\n                                  ej = in_graph.edge_end(src);\n         jj != ej; ++jj) {\n      g.incrementDegree(p[src]);\n    }\n  }\n\n  g.phase2();\n  for (FileGraph::iterator ii = in_graph.begin(), ei = in_graph.end(); ii != ei;\n       ++ii) {\n    GNode src = *ii;\n    for (FileGraph::edge_iterator jj = in_graph.edge_begin(src),\n                                  ej = in_graph.edge_end(src);\n         jj != ej; ++jj) {\n      GNode dst = in_graph.getEdgeDst(jj);\n      if constexpr (std::is_void<EdgeTy>::value) {\n        g.addNeighbor(p[src], p[dst]);\n      } else {\n        EdgeTy& data = in_graph.getEdgeData<EdgeTy>(jj);\n        g.addNeighbor<EdgeTy>(p[src], p[dst], data);\n      }\n    }\n  }\n\n  g.finish();\n\n  out = std::move(g);\n}\n\n} // namespace graphs\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_GRAPH_H\n#define GALOIS_GRAPHS_GRAPH_H\n\n#include \"galois/config.h\"\n#include \"galois/graphs/MorphGraph.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/ReadGraph.h\"\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/GraphHelpers.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#pragma once\n\n#include <cassert>\n#include <vector>\n\n#include <boost/iterator/counting_iterator.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n\nnamespace galois {\nnamespace graphs {\n\nnamespace internal {\n/**\n * Return a suitable index between an upper bound and a lower bound that\n * attempts to get close to the target size (i.e. 
find a good chunk that\n * corresponds to some size) using a prefix sum.\n *\n * @tparam PrefixSumType type of the object that holds the edge prefix sum\n *\n * @param nodeWeight weight to give to a node in division\n * @param edgeWeight weight to give to an edge in division\n * @param targetWeight The amount of weight we want from the returned index\n * @param lb lower bound to start search from\n * @param ub upper bound to start search from\n * @param edgePrefixSum prefix sum of edges; may be full or partial prefix\n * sum of the object you are attempting to split\n * @param edgeOffset number of edges to subtract from edge count retrieved\n * from prefix sum; used if array is a partial prefix sum\n * @param nodeOffset number of nodes to skip over when looking in the\n * prefix sum: useful if the prefix sum is over the entire graph while you\n * just want to divide the nodes for a particular region (jump to the region\n * with the nodeOffset)\n *\n * @returns The node id that hits (or gets close to) the target size\n */\n// Note: \"inline\" may be required if PrefixSumType is exactly the same type\n// in 2 different translation units; otherwise it should be fine\ntemplate <typename PrefixSumType>\nsize_t findIndexPrefixSum(size_t nodeWeight, size_t edgeWeight,\n                          size_t targetWeight, uint64_t lb, uint64_t ub,\n                          PrefixSumType& edgePrefixSum, uint64_t edgeOffset,\n                          uint64_t nodeOffset) {\n  assert(nodeWeight != 0 || edgeWeight != 0);\n\n  while (lb < ub) {\n    size_t mid = lb + (ub - lb) / 2;\n    size_t num_edges;\n\n    if ((mid + nodeOffset) != 0) {\n      num_edges = edgePrefixSum[mid - 1 + nodeOffset] - edgeOffset;\n    } else {\n      num_edges = 0;\n    }\n\n    size_t weight = num_edges * edgeWeight + mid * nodeWeight;\n\n    if (weight < targetWeight) {\n      lb = mid + 1;\n    } else if (weight >= targetWeight) {\n      ub = mid;\n    }\n  }\n\n  return lb;\n}\n\n/**\n * Given a 
number of divisions and a scale factor specifying how much of a\n * chunk of blocks each division should get, determine the total number\n * of blocks to split among all divisions + calculate the prefix sum and\n * save it in-place to the scaleFactor variable.\n *\n * @param numDivisions number of divisions to split blocks among\n * @param scaleFactor vector specifying how much a particular division should\n * get\n *\n * @returns The total number of blocks to split among all divisions\n */\nuint32_t determine_block_division(uint32_t numDivisions,\n                                  std::vector<unsigned>& scaleFactor);\n\n} // end namespace internal\n\n/**\n * Returns 2 ranges (one for nodes, one for edges) for a particular division.\n * The ranges specify the nodes/edges that a division is responsible for. The\n * function attempts to split them evenly among units given some kind of\n * weighting for both nodes and edges.\n *\n * Assumes the parameters passed in apply to a local portion of whatever\n * is being divided (i.e. 
concept of a \"global\" object is abstracted away in\n * some sense)\n *\n * @tparam PrefixSumType type of the object that holds the edge prefix sum\n * @tparam NodeType size of the type representing the node\n *\n * @param numNodes Total number of nodes included in prefix sum\n * @param numEdges Total number of edges included in prefix sum\n * @param nodeWeight weight to give to a node in division\n * @param edgeWeight weight to give to an edge in division\n * @param id Division number you want the range for\n * @param total Total number of divisions to divide nodes among\n * @param edgePrefixSum Prefix sum of the edges in the graph\n * @param scaleFactor Vector specifying if certain divisions should get more\n * than other divisions\n * @param edgeOffset number of edges to subtract from numbers in edgePrefixSum\n * @param nodeOffset number of nodes to skip over when looking in the\n * prefix sum: useful if the prefix sum is over the entire graph while you\n * just want to divide the nodes for a particular region (jump to the region\n * with the nodeOffset)\n *\n * @returns A node pair and an edge pair specifying the assigned nodes/edges\n * to division \"id\"; returns LOCAL ids, not global ids (i.e. 
if node offset\n * was used, it is up to the caller to add the offset to the numbers)\n */\n// Note: \"inline\" may be required if PrefixSumType is exactly the same type\n// in 2 different translation units; otherwise it should be fine\n// If inline is used, then apparently you cannot use typedefs, so get rid\n// of those if the need arises.\ntemplate <typename PrefixSumType, typename NodeType = uint64_t>\nauto divideNodesBinarySearch(\n    NodeType numNodes, uint64_t numEdges, size_t nodeWeight, size_t edgeWeight,\n    size_t id, size_t total, PrefixSumType& edgePrefixSum,\n    std::vector<unsigned> scaleFactor = std::vector<unsigned>(),\n    uint64_t edgeOffset = 0, uint64_t nodeOffset = 0) {\n  typedef boost::counting_iterator<NodeType> iterator;\n  typedef boost::counting_iterator<uint64_t> edge_iterator;\n  typedef std::pair<iterator, iterator> NodeRange;\n  typedef std::pair<edge_iterator, edge_iterator> EdgeRange;\n  typedef std::pair<NodeRange, EdgeRange> GraphRange;\n\n  // numNodes = 0 corner case\n  if (numNodes == 0) {\n    return GraphRange(NodeRange(iterator(0), iterator(0)),\n                      EdgeRange(edge_iterator(0), edge_iterator(0)));\n  }\n\n  assert(nodeWeight != 0 || edgeWeight != 0);\n  assert(total >= 1);\n  assert(id < total);\n\n  // weight of all data\n  uint64_t weight = numNodes * nodeWeight + (numEdges + 1) * edgeWeight;\n  // determine the number of blocks to divide among total divisions + setup the\n  // scale factor vector if necessary\n  uint32_t numBlocks = internal::determine_block_division(total, scaleFactor);\n  // weight of a block (one block for each division by default; if scale\n  // factor specifies something different, then use that instead)\n  uint64_t blockWeight = (weight + numBlocks - 1) / numBlocks;\n  // galois::gDebug(\"weight \", weight, \" numblock \", numBlocks, \" blockwegith \",\n  //               blockWeight);\n\n  // lower and upper blocks that this division should use determined\n  // using 
scaleFactor\n  uint32_t blockLower;\n  if (id != 0) {\n    blockLower = scaleFactor[id - 1];\n  } else {\n    blockLower = 0;\n  }\n\n  uint32_t blockUpper = scaleFactor[id];\n\n  assert(blockLower <= blockUpper);\n  // galois::gDebug(\"Unit \", id, \" block \", blockLower, \" to \",\n  //               blockUpper, \"; \", blockLower * blockWeight, \" \",\n  //               blockUpper * blockWeight);\n\n  uint64_t nodesLower;\n  // use prefix sum to find node bounds\n  if (blockLower == 0) {\n    nodesLower = 0;\n  } else {\n    nodesLower = internal::findIndexPrefixSum(\n        nodeWeight, edgeWeight, blockWeight * blockLower, 0, numNodes,\n        edgePrefixSum, edgeOffset, nodeOffset);\n  }\n\n  uint64_t nodesUpper;\n  nodesUpper = internal::findIndexPrefixSum(\n      nodeWeight, edgeWeight, blockWeight * blockUpper, nodesLower, numNodes,\n      edgePrefixSum, edgeOffset, nodeOffset);\n\n  // get the edges bounds using node lower/upper bounds\n  uint64_t edgesLower = numEdges;\n  uint64_t edgesUpper = numEdges;\n\n  if (nodesLower != nodesUpper) {\n    if ((nodesLower + nodeOffset) != 0) {\n      edgesLower = edgePrefixSum[nodesLower - 1 + nodeOffset] - edgeOffset;\n    } else {\n      edgesLower = 0;\n    }\n\n    edgesUpper = edgePrefixSum[nodesUpper - 1 + nodeOffset] - edgeOffset;\n  }\n\n  // galois::gDebug(\"Unit \", id, \" nodes \", nodesLower, \" to \",\n  //               nodesUpper, \" edges \", edgesLower, \" \",\n  //               edgesUpper);\n\n  return GraphRange(\n      NodeRange(iterator(nodesLower), iterator(nodesUpper)),\n      EdgeRange(edge_iterator(edgesLower), edge_iterator(edgesUpper)));\n}\n\n// second internal namespace\nnamespace internal {\n\n/**\n * Checks the begin/end node and number of units to split to for corner cases\n * (e.g. 
only one unit to split to, only 1 node, etc.).\n *\n * @param unitsToSplit number of units to split nodes among\n * @param beginNode Beginning of range\n * @param endNode End of range, non-inclusive\n * @param returnRanges vector to store result in\n * @returns true if a corner case was found (indicates that returnRanges has\n * been finalized)\n */\nbool unitRangeCornerCaseHandle(uint32_t unitsToSplit, uint32_t beginNode,\n                               uint32_t endNode,\n                               std::vector<uint32_t>& returnRanges);\n\n/**\n * Helper function used by determineUnitRangesGraph that consists of the main\n * loop over all units and calls to divide by node to determine the\n * division of nodes to units.\n *\n * Saves the ranges to an argument vector provided by the caller.\n *\n * @tparam GraphTy type of the graph object\n *\n * @param graph The graph object to get prefix sum information from\n * @param unitsToSplit number of units to split nodes among\n * @param beginNode Beginning of range\n * @param endNode End of range, non-inclusive\n * @param returnRanges Vector to store unit offsets for ranges in\n * @param nodeAlpha The higher the number, the more weight nodes have in\n * determining division of nodes (edges have weight 1).\n */\ntemplate <typename GraphTy>\nvoid determineUnitRangesLoopGraph(GraphTy& graph, uint32_t unitsToSplit,\n                                  uint32_t beginNode, uint32_t endNode,\n                                  std::vector<uint32_t>& returnRanges,\n                                  uint32_t nodeAlpha) {\n  assert(beginNode != endNode);\n\n  uint32_t numNodesInRange = endNode - beginNode;\n  uint64_t numEdgesInRange =\n      graph.edge_end(endNode - 1) - graph.edge_begin(beginNode);\n  uint64_t edgeOffset = *graph.edge_begin(beginNode);\n\n  returnRanges[0] = beginNode;\n  std::vector<unsigned int> dummyScaleFactor;\n\n  for (uint32_t i = 0; i < unitsToSplit; i++) {\n    // determine division for unit i\n    auto 
nodeSplits =\n        divideNodesBinarySearch<GraphTy, uint32_t>(\n            numNodesInRange, numEdgesInRange, nodeAlpha, 1, i, unitsToSplit,\n            graph, dummyScaleFactor, edgeOffset, beginNode)\n            .first;\n\n    // i.e. if there are actually assigned nodes\n    if (nodeSplits.first != nodeSplits.second) {\n      if (i != 0) {\n        assert(returnRanges[i] == *(nodeSplits.first) + beginNode);\n      } else { // i == 0\n        assert(returnRanges[i] == beginNode);\n      }\n      returnRanges[i + 1] = *(nodeSplits.second) + beginNode;\n    } else {\n      // unit assinged no nodes, copy last one\n      returnRanges[i + 1] = returnRanges[i];\n    }\n\n    galois::gDebug(\"LoopGraph Unit \", i, \" gets nodes \", returnRanges[i],\n                   \" to \", returnRanges[i + 1], \", num edges is \",\n                   graph.edge_end(returnRanges[i + 1] - 1) -\n                       graph.edge_begin(returnRanges[i]));\n  }\n}\n\n/**\n * Helper function used by determineUnitRangesPrefixSum that consists of the\n * main loop over all units and calls to divide by node to determine the\n * division of nodes to units.\n *\n * Saves the ranges to an argument vector provided by the caller.\n *\n * @tparam VectorTy type of the prefix sum object\n *\n * @param prefixSum Holds prefix sum information\n * @param unitsToSplit number of units to split nodes among\n * @param beginNode Beginning of range\n * @param endNode End of range, non-inclusive\n * @param returnRanges Vector to store unit offsets for ranges in\n * @param nodeAlpha The higher the number, the more weight nodes have in\n * determining division of nodes (edges have weight 1).\n */\ntemplate <typename VectorTy>\nvoid determineUnitRangesLoopPrefixSum(VectorTy& prefixSum,\n                                      uint32_t unitsToSplit, uint32_t beginNode,\n                                      uint32_t endNode,\n                                      std::vector<uint32_t>& returnRanges,\n           
                           uint32_t nodeAlpha) {\n  assert(beginNode != endNode);\n\n  uint32_t numNodesInRange = endNode - beginNode;\n\n  uint64_t numEdgesInRange;\n  uint64_t edgeOffset;\n  if (beginNode != 0) {\n    numEdgesInRange = prefixSum[endNode - 1] - prefixSum[beginNode - 1];\n    edgeOffset      = prefixSum[beginNode - 1];\n  } else {\n    numEdgesInRange = prefixSum[endNode - 1];\n    edgeOffset      = 0;\n  }\n\n  returnRanges[0] = beginNode;\n  std::vector<unsigned int> dummyScaleFactor;\n\n  for (uint32_t i = 0; i < unitsToSplit; i++) {\n    // determine division for unit i\n    auto nodeSplits =\n        divideNodesBinarySearch<VectorTy, uint32_t>(\n            numNodesInRange, numEdgesInRange, nodeAlpha, 1, i, unitsToSplit,\n            prefixSum, dummyScaleFactor, edgeOffset, beginNode)\n            .first;\n\n    // i.e. if there are actually assigned nodes\n    if (nodeSplits.first != nodeSplits.second) {\n      if (i != 0) {\n        assert(returnRanges[i] == *(nodeSplits.first) + beginNode);\n      } else { // i == 0\n        assert(returnRanges[i] == beginNode);\n      }\n      returnRanges[i + 1] = *(nodeSplits.second) + beginNode;\n    } else {\n      // unit assinged no nodes\n      returnRanges[i + 1] = returnRanges[i];\n    }\n\n    galois::gDebug(\"Unit \", i, \" gets nodes \", returnRanges[i], \" to \",\n                   returnRanges[i + 1]);\n  }\n}\n\n/**\n * Sanity checks a finalized unit range vector.\n *\n * @param unitsToSplit number of units to split nodes among\n * @param beginNode Beginning of range\n * @param endNode End of range, non-inclusive\n * @param returnRanges Ranges to sanity check\n */\nvoid unitRangeSanity(uint32_t unitsToSplit, uint32_t beginNode,\n                     uint32_t endNode, std::vector<uint32_t>& returnRanges);\n\n} // namespace internal\n\n/**\n * Determines node division ranges for all nodes in a graph and returns it in\n * an offset vector. 
(node ranges = assigned nodes that a particular unit\n * of execution should work on)\n *\n * Checks for corner cases, then calls the main loop function.\n *\n * ONLY CALL AFTER GRAPH IS CONSTRUCTED as it uses functions that assume\n * the graph is already constructed.\n *\n * @tparam GraphTy type of the graph object\n *\n * @param graph The graph object to get prefix sum information from\n * @param unitsToSplit number of units to split nodes among\n * @param nodeAlpha The higher the number, the more weight nodes have in\n * determining division of nodes (edges have weight 1).\n * @returns vector that indirectly specifies which units get which nodes\n */\ntemplate <typename GraphTy>\nstd::vector<uint32_t> determineUnitRangesFromGraph(GraphTy& graph,\n                                                   uint32_t unitsToSplit,\n                                                   uint32_t nodeAlpha = 0) {\n  uint32_t totalNodes = graph.size();\n\n  std::vector<uint32_t> returnRanges;\n  returnRanges.resize(unitsToSplit + 1);\n\n  // check corner cases\n  if (internal::unitRangeCornerCaseHandle(unitsToSplit, 0, totalNodes,\n                                          returnRanges)) {\n    return returnRanges;\n  }\n\n  // no corner cases: onto main loop over nodes that determines\n  // node ranges\n  internal::determineUnitRangesLoopGraph(graph, unitsToSplit, 0, totalNodes,\n                                         returnRanges, nodeAlpha);\n\n  internal::unitRangeSanity(unitsToSplit, 0, totalNodes, returnRanges);\n\n  return returnRanges;\n}\n\n/**\n * Determines node division ranges for a given range of nodes and returns it\n * as an offset vector. 
(node ranges = assigned nodes that a particular unit\n * of execution should work on)\n *\n * Checks for corner cases, then calls the main loop function.\n *\n * ONLY CALL AFTER GRAPH IS CONSTRUCTED as it uses functions that assume\n * the graph is already constructed.\n *\n * @tparam GraphTy type of the graph object\n *\n * @param graph The graph object to get prefix sum information from\n * @param unitsToSplit number of units to split nodes among\n * @param beginNode Beginning of range\n * @param endNode End of range, non-inclusive\n * @param nodeAlpha The higher the number, the more weight nodes have in\n * determining division of nodes (edges have weight 1).\n * @returns vector that indirectly specifies which units get which nodes\n */\ntemplate <typename GraphTy>\nstd::vector<uint32_t>\ndetermineUnitRangesFromGraph(GraphTy& graph, uint32_t unitsToSplit,\n                             uint32_t beginNode, uint32_t endNode,\n                             uint32_t nodeAlpha = 0) {\n  std::vector<uint32_t> returnRanges;\n  returnRanges.resize(unitsToSplit + 1);\n\n  if (internal::unitRangeCornerCaseHandle(unitsToSplit, beginNode, endNode,\n                                          returnRanges)) {\n    return returnRanges;\n  }\n\n  // no corner cases: onto main loop over nodes that determines\n  // node ranges\n  internal::determineUnitRangesLoopGraph(graph, unitsToSplit, beginNode,\n                                         endNode, returnRanges, nodeAlpha);\n\n  internal::unitRangeSanity(unitsToSplit, beginNode, endNode, returnRanges);\n\n  return returnRanges;\n}\n\n/**\n * Uses the divideByNode function (which is binary search based) to\n * divide nodes among units using a provided prefix sum.\n *\n * @tparam VectorTy type of the prefix sum object\n *\n * @param unitsToSplit number of units to split nodes among\n * @param edgePrefixSum A prefix sum of edges\n * @param nodeAlpha amount of weight to give to nodes when dividing work among\n * threads\n * @returns 
vector that indirectly specifies how nodes are split among units\n * of execution\n */\ntemplate <typename VectorTy>\nstd::vector<uint32_t> determineUnitRangesFromPrefixSum(uint32_t unitsToSplit,\n                                                       VectorTy& edgePrefixSum,\n                                                       uint32_t nodeAlpha = 0) {\n  assert(unitsToSplit > 0);\n\n  std::vector<uint32_t> nodeRanges;\n  nodeRanges.resize(unitsToSplit + 1);\n\n  nodeRanges[0] = 0;\n\n  uint32_t numNodes = edgePrefixSum.size();\n  // handle corner case TODO there are better ways to do this, i.e. call helper\n  if (numNodes == 0) {\n    nodeRanges[0] = 0;\n\n    for (uint32_t i = 0; i < unitsToSplit; i++) {\n      nodeRanges[i + 1] = 0;\n    }\n    return nodeRanges;\n  }\n\n  uint64_t numEdges = edgePrefixSum[numNodes - 1];\n\n  for (uint32_t i = 0; i < unitsToSplit; i++) {\n    auto nodeSplits =\n        divideNodesBinarySearch<VectorTy, uint32_t>(\n            numNodes, numEdges, nodeAlpha, 1, i, unitsToSplit, edgePrefixSum)\n            .first;\n\n    // i.e. if there are actually assigned nodes\n    if (nodeSplits.first != nodeSplits.second) {\n      if (i != 0) {\n        assert(nodeRanges[i] == *(nodeSplits.first));\n      } else { // i == 0\n        assert(nodeRanges[i] == 0);\n      }\n      nodeRanges[i + 1] = *(nodeSplits.second);\n    } else {\n      // unit assigned no nodes\n      nodeRanges[i + 1] = nodeRanges[i];\n    }\n\n    galois::gDebug(\"Unit \", i, \" gets nodes \", nodeRanges[i], \" to \",\n                   nodeRanges[i + 1]);\n  }\n\n  return nodeRanges;\n}\n\n/**\n * Uses the divideByNode function (which is binary search based) to\n * divide nodes among units using a provided prefix sum. 
Provide a node range\n * so that the prefix sum is only calculated using that range.\n *\n * @tparam VectorTy type of the prefix sum object\n *\n * @param unitsToSplit number of units to split nodes among\n * @param edgePrefixSum A prefix sum of edges\n * @param beginNode Beginning of range\n * @param endNode End of range, non-inclusive\n * @param nodeAlpha amount of weight to give to nodes when dividing work among\n * threads\n * @returns vector that indirectly specifies how nodes are split among units\n * of execution\n */\ntemplate <typename VectorTy>\nstd::vector<uint32_t>\ndetermineUnitRangesFromPrefixSum(uint32_t unitsToSplit, VectorTy& edgePrefixSum,\n                                 uint32_t beginNode, uint32_t endNode,\n                                 uint32_t nodeAlpha = 0) {\n  std::vector<uint32_t> returnRanges;\n  returnRanges.resize(unitsToSplit + 1);\n\n  if (internal::unitRangeCornerCaseHandle(unitsToSplit, beginNode, endNode,\n                                          returnRanges)) {\n    return returnRanges;\n  }\n\n  // no corner cases: onto main loop over nodes that determines\n  // node ranges\n  internal::determineUnitRangesLoopPrefixSum(\n      edgePrefixSum, unitsToSplit, beginNode, endNode, returnRanges, nodeAlpha);\n\n  internal::unitRangeSanity(unitsToSplit, beginNode, endNode, returnRanges);\n\n  return returnRanges;\n}\n\n} // end namespace graphs\n} // end namespace galois\n"
  },
  {
    "path": "libgalois/include/galois/graphs/LCGraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_LCGRAPH_H\n#define GALOIS_GRAPHS_LCGRAPH_H\n\n#include \"galois/config.h\"\n#include \"galois/graphs/LC_CSR_Graph.h\"\n#include \"galois/graphs/LC_InlineEdge_Graph.h\"\n#include \"galois/graphs/LC_Linear_Graph.h\"\n#include \"galois/graphs/LC_Morph_Graph.h\"\n#include \"galois/graphs/LC_InOut_Graph.h\"\n#include \"galois/graphs/LC_Adaptor_Graph.h\"\n#include \"galois/graphs/ReadGraph.h\"\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/LC_Adaptor_Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_LC_ADAPTOR_GRAPH_H\n#define GALOIS_GRAPHS_LC_ADAPTOR_GRAPH_H\n\n#include \"galois/config.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/LargeArray.h\"\n\nnamespace galois {\nnamespace graphs {\n\ntemplate <typename NodeTy, typename EdgeTy, typename DerivedTy,\n          typename GraphNodeTy, typename IteratorTy, typename EdgeIteratorTy,\n          bool HasNoLockable = false>\nclass LC_Adaptor_Graph\n    : private internal::OutOfLineLockableFeature<HasNoLockable>,\n      private internal::LocalIteratorFeature<false> {\npublic:\n  //! 
If true, do not use abstract locks in graph\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    typedef LC_Adaptor_Graph<NodeTy, EdgeTy, DerivedTy, GraphNodeTy, IteratorTy,\n                             EdgeIteratorTy, _has_no_lockable>\n        type;\n  };\n\n  typedef GraphNodeTy GraphNode;\n  typedef EdgeTy edge_data_type;\n  typedef NodeTy node_data_type;\n  typedef typename internal::EdgeInfoBase<void*, EdgeTy>::reference\n      edge_data_reference;\n  typedef typename internal::NodeInfoBase<NodeTy, false>::reference\n      node_data_reference;\n  typedef EdgeIteratorTy edge_iterator;\n  typedef IteratorTy iterator;\n  typedef iterator const_iterator;\n  typedef iterator local_iterator;\n\nprotected:\n  template <bool _A1 = HasNoLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<!_A1>::type* = 0) {\n    this->outOfLineAcquire(getId(N), mflag);\n  }\n\n  template <bool _A1 = HasNoLockable>\n  void acquireNode(GraphNode, MethodFlag,\n                   typename std::enable_if<_A1>::type* = 0) {}\n\n  const DerivedTy& derived() const {\n    return *static_cast<const DerivedTy*>(this);\n  }\n\n  DerivedTy& derived() { return *static_cast<DerivedTy*>(this); }\n\n  size_t getId(GraphNode n) { return derived().get_id(n); }\n\npublic:\n  node_data_reference getData(GraphNode N,\n                              MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, false);\n    acquireNode(N, mflag);\n    return derived().get_data(N);\n  }\n\n  edge_data_reference\n  getEdgeData(edge_iterator ni,\n              MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::UNPROTECTED) {\n    // galois::runtime::checkWrite(mflag, false);\n    return derived().get_edge_data(ni);\n  }\n\n  GraphNode getEdgeDst(edge_iterator ni) { return derived().get_edge_dst(ni); }\n\n  uint64_t size() const { return derived().get_size(); }\n  uint64_t sizeEdges() const { return derived().get_size_edges(); 
}\n\n  iterator begin() const { return derived().get_begin(); }\n  iterator end() const { return derived().get_end(); }\n  local_iterator local_begin() {\n    return local_iterator(this->localBegin(size()));\n  }\n  local_iterator local_end() { return local_iterator(this->localEnd(size())); }\n\n  edge_iterator edge_begin(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    if (galois::runtime::shouldLock(mflag)) {\n      for (edge_iterator ii = derived().get_edge_begin(N),\n                         ee = derived().get_edge_end(N);\n           ii != ee; ++ii) {\n        acquireNode(getEdgeDst(ii), mflag);\n      }\n    }\n    return derived().get_edge_begin(N);\n  }\n\n  edge_iterator edge_end(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    return derived().get_edge_end(N);\n  }\n\n  internal::EdgesIterator<LC_Adaptor_Graph>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::EdgesIterator<LC_Adaptor_Graph>(*this, N, mflag);\n  }\n};\n\n} // namespace graphs\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file LC_CSR_CSC_Graph.h\n *\n * Contains the implementation of a bidirectional LC_CS_Graph.\n */\n#ifndef GALOIS_GRAPHS_LC_CSR_CSC_GRAPH_H\n#define GALOIS_GRAPHS_LC_CSR_CSC_GRAPH_H\n\n#include \"galois/config.h\"\n\n#include \"galois/graphs/LC_CSR_Graph.h\"\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * An bidirectional LC_CSR_Graph that allows the construction of in-edges from\n * its outedges.\n *\n * @tparam NodeTy type of the node data\n * @tparam EdgeTy type of the edge data\n * @tparam EdgeDataByValue If set to true, the in-edges will have their own\n * copy of the edge data. Otherwise, the in-edge edge data will be shared with\n * its corresponding out-edge.\n * @tparam HasNoLockable If set to true, then node accesses will cannot acquire\n * an abstract lock. 
Otherwise, accessing nodes can get a lock.\n * @tparam UseNumaAlloc If set to true, allocate data in a possibly more NUMA\n * friendly way.\n * @tparam HasOutOfLineLockable\n * @tparam FileEdgeTy\n */\ntemplate <typename NodeTy, typename EdgeTy, bool EdgeDataByValue = false,\n          bool HasNoLockable = false, bool UseNumaAlloc = false,\n          bool HasOutOfLineLockable = false, typename FileEdgeTy = EdgeTy>\nclass LC_CSR_CSC_Graph\n    : public LC_CSR_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                          HasOutOfLineLockable, FileEdgeTy> {\n  // typedef to make it easier to read\n  //! Typedef referring to base LC_CSR_Graph\n  using BaseGraph = LC_CSR_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                                 HasOutOfLineLockable, FileEdgeTy>;\n  //! Typedef referring to this class itself\n  using ThisGraph =\n      LC_CSR_CSC_Graph<NodeTy, EdgeTy, EdgeDataByValue, HasNoLockable,\n                       UseNumaAlloc, HasOutOfLineLockable, FileEdgeTy>;\n\npublic:\n  //! Graph node typedef\n  using GraphNode = uint32_t;\n\nprotected:\n  // retypedefs of base class\n  //! large array for edge data\n  using EdgeData = LargeArray<EdgeTy>;\n  //! large array for edge destinations\n  using EdgeDst = LargeArray<uint32_t>;\n  //! large array for edge index data\n  using EdgeIndData = LargeArray<uint64_t>;\n\npublic:\n  //! iterator for edges\n  using edge_iterator =\n      boost::counting_iterator<typename EdgeIndData::value_type>;\n  //! reference to edge data\n  using edge_data_reference = typename EdgeData::reference;\n\nprotected:\n  //! edge index data for the reverse edges\n  EdgeIndData inEdgeIndData;\n  //! edge destination data for the reverse edges\n  EdgeDst inEdgeDst;\n  //! Edge data of inedges can be a value copy of the outedges (i.e. in and\n  //! out edges have separate edge values) or inedges can refer to the same\n  //! 
data as its corresponding outedge; this is what this typedef is for\n  using EdgeDataRep =\n      typename std::conditional<EdgeDataByValue, EdgeData, EdgeIndData>::type;\n  //! The data for the reverse edges\n  EdgeDataRep inEdgeData;\n\n  //! redefinition of the edge sort iterator in LC_CSR_Graph\n  using edge_sort_iterator =\n      internal::EdgeSortIterator<GraphNode, typename EdgeIndData::value_type,\n                                 EdgeDst, EdgeDataRep>;\n\n  //! beginning iterator to an edge sorter for in-edges\n  edge_sort_iterator in_edge_sort_begin(GraphNode N) {\n    return edge_sort_iterator(*in_raw_begin(N), &inEdgeDst, &inEdgeData);\n  }\n\n  //! ending iterator to an edge sorter for in-edges\n  edge_sort_iterator in_edge_sort_end(GraphNode N) {\n    return edge_sort_iterator(*in_raw_end(N), &inEdgeDst, &inEdgeData);\n  }\n\n  /**\n   * Copy the data of outedge by value to inedge.\n   *\n   * @param e_new position of out-edge to copy as an in-edge\n   * @param e position of in-edge\n   */\n  template <bool A                            = EdgeDataByValue,\n            typename std::enable_if<A>::type* = nullptr>\n  void createEdgeData(const uint64_t e_new, const uint64_t e) {\n    BaseGraph::edgeDataCopy(inEdgeData, BaseGraph::edgeData, e_new, e);\n  }\n\n  /**\n   * Save a pointer to an outedge (i.e. map an in-edge to an out-edge). 
Done\n   * to share edge data.\n   *\n   * @param e_new position of in-edge that records the mapping\n   * @param e position of out-edge to save\n   */\n  template <bool A                             = EdgeDataByValue,\n            typename std::enable_if<!A>::type* = nullptr>\n  void createEdgeData(const uint64_t e_new, const uint64_t e) {\n    if (!std::is_void<EdgeTy>::value) {\n      inEdgeData[e_new] = e;\n    }\n  }\n\n  /**\n   * Determine the in-edge indices for every node by accumulating how many\n   * in-edges each node has, getting a prefix sum, and saving it to the\n   * in edge index data array.\n   *\n   * @param dataBuffer temporary buffer that is used to accumulate in-edge\n   * counts; at the end of this function, it will contain a prefix sum of\n   * in-edges\n   */\n  void determineInEdgeIndices(EdgeIndData& dataBuffer) {\n    // counting outgoing edges in the transpose graph by\n    // counting incoming edges in the original graph\n    galois::do_all(galois::iterate(UINT64_C(0), BaseGraph::numEdges),\n                   [&](uint64_t e) {\n                     auto dst = BaseGraph::edgeDst[e];\n                     __sync_add_and_fetch(&(dataBuffer[dst]), 1);\n                   });\n\n    // prefix sum calculation of the edge index array\n    for (uint32_t n = 1; n < BaseGraph::numNodes; ++n) {\n      dataBuffer[n] += dataBuffer[n - 1];\n    }\n\n    // copy over the new transposed edge index data\n    inEdgeIndData.allocateInterleaved(BaseGraph::numNodes);\n    galois::do_all(galois::iterate(UINT64_C(0), BaseGraph::numNodes),\n                   [&](uint64_t n) { inEdgeIndData[n] = dataBuffer[n]; });\n  }\n\n  /**\n   * Determine the destination of each in-edge and copy the data associated\n   * with an edge (or point to it).\n   *\n   * @param dataBuffer A prefix sum of in-edges\n   */\n  void determineInEdgeDestAndData(EdgeIndData& dataBuffer) {\n    // after this block dataBuffer[i] will now hold number of edges that all\n    // nodes before the ith node have; used to 
determine where to start\n    // saving an edge for a node\n    if (BaseGraph::numNodes >= 1) {\n      dataBuffer[0] = 0;\n      galois::do_all(galois::iterate(UINT64_C(1), BaseGraph::numNodes),\n                     [&](uint64_t n) { dataBuffer[n] = inEdgeIndData[n - 1]; });\n    }\n\n    // allocate edge dests and data\n    inEdgeDst.allocateInterleaved(BaseGraph::numEdges);\n\n    if (!std::is_void<EdgeTy>::value) {\n      inEdgeData.allocateInterleaved(BaseGraph::numEdges);\n    }\n\n    galois::do_all(\n        galois::iterate(UINT64_C(0), BaseGraph::numNodes), [&](uint64_t src) {\n          // e = start index into edge array for a particular node\n          uint64_t e = (src == 0) ? 0 : BaseGraph::edgeIndData[src - 1];\n\n          // get all outgoing edges of a particular node in the non-transpose\n          // and convert to incoming\n          while (e < BaseGraph::edgeIndData[src]) {\n            // destination node\n            auto dst = BaseGraph::edgeDst[e];\n            // location to save edge\n            auto e_new = __sync_fetch_and_add(&(dataBuffer[dst]), 1);\n            // save src as destination\n            inEdgeDst[e_new] = src;\n            // edge data to \"new\" array\n            createEdgeData(e_new, e);\n            e++;\n          }\n        });\n  }\n\npublic:\n  //! default constructor\n  LC_CSR_CSC_Graph() = default;\n  //! default move constructor\n  LC_CSR_CSC_Graph(LC_CSR_CSC_Graph&& rhs) = default;\n  //! 
default = operator\n  LC_CSR_CSC_Graph& operator=(LC_CSR_CSC_Graph&&) = default;\n\n  /////////////////////////////////////////////////////////////////////////////\n  // Construction functions\n  /////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Call only after the LC_CSR_Graph part of this class is fully constructed.\n   * Creates the in edge data by reading from the out edge data.\n   */\n  void constructIncomingEdges() {\n    galois::StatTimer incomingEdgeConstructTimer(\"IncomingEdgeConstruct\");\n    incomingEdgeConstructTimer.start();\n\n    // initialize the temp array\n    EdgeIndData dataBuffer;\n    dataBuffer.allocateInterleaved(BaseGraph::numNodes);\n    galois::do_all(galois::iterate(UINT64_C(0), BaseGraph::numNodes),\n                   [&](uint64_t n) { dataBuffer[n] = 0; });\n\n    determineInEdgeIndices(dataBuffer);\n    determineInEdgeDestAndData(dataBuffer);\n\n    incomingEdgeConstructTimer.stop();\n  }\n\n  /////////////////////////////////////////////////////////////////////////////\n  // Access functions\n  /////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Grabs in edge beginning without lock/safety.\n   *\n   * @param N node to get edge beginning of\n   * @returns Iterator to first in edge of node N\n   */\n  edge_iterator in_raw_begin(GraphNode N) const {\n    return edge_iterator((N == 0) ? 0 : inEdgeIndData[N - 1]);\n  }\n\n  /**\n   * Grabs in edge end without lock/safety.\n   *\n   * @param N node to get edge end of\n   * @returns Iterator to end of in edges of node N (i.e. 
first edge of\n   * node N+1)\n   */\n  edge_iterator in_raw_end(GraphNode N) const {\n    return edge_iterator(inEdgeIndData[N]);\n  }\n\n  /**\n   * Wrapper to get the in edge beginning of a node; lock if necessary.\n   *\n   * @param N node to get edge beginning of\n   * @param mflag how safe the acquire should be\n   * @returns Iterator to first in edge of node N\n   */\n  edge_iterator in_edge_begin(GraphNode N,\n                              MethodFlag mflag = MethodFlag::WRITE) {\n    BaseGraph::acquireNode(N, mflag);\n    if (!HasNoLockable && galois::runtime::shouldLock(mflag)) {\n      for (edge_iterator ii = in_raw_begin(N), ee = in_raw_end(N); ii != ee;\n           ++ii) {\n        BaseGraph::acquireNode(inEdgeDst[*ii], mflag);\n      }\n    }\n    return in_raw_begin(N);\n  }\n\n  /**\n   * Wrapper to get the in edge end of a node; lock if necessary.\n   *\n   * @param N node to get in edge end of\n   * @param mflag how safe the acquire should be\n   * @returns Iterator to end of in edges of node N (i.e. 
first in edge of N+1)\n   */\n  edge_iterator in_edge_end(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    BaseGraph::acquireNode(N, mflag);\n    return in_raw_end(N);\n  }\n\n  uint64_t getInDegree(GraphNode N) const {\n    return (in_raw_end(N) - in_raw_begin(N));\n  }\n\n  /**\n   * @param N node to get in edges for\n   * @param mflag how safe the acquire should be\n   * @returns Range to in edges of node N\n   */\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  in_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::make_no_deref_range(in_edge_begin(N, mflag),\n                                         in_edge_end(N, mflag));\n  }\n\n  /**\n   * Given an edge id for in edges, get the destination of the edge\n   *\n   * @param ni edge id\n   * @returns destination for that in edge\n   */\n  GraphNode getInEdgeDst(edge_iterator ni) const { return inEdgeDst[*ni]; }\n\n  /**\n   * Given an edge id for in edge, get the data associated with that edge.\n   * Returns a constant reference.\n   *\n   * In-edge has own copy of edge-data version.\n   *\n   * @param ni in-edge id\n   * @returns data of the edge\n   */\n  template <bool A                            = EdgeDataByValue,\n            typename std::enable_if<A>::type* = nullptr>\n  edge_data_reference\n  getInEdgeData(edge_iterator ni, MethodFlag = MethodFlag::UNPROTECTED) const {\n    return inEdgeData[*ni];\n  }\n\n  /**\n   * Given an edge id for in edge, get the data associated with that edge.\n   * Returns a non-constant reference.\n   *\n   * In-edge has own copy of edge-data version.\n   *\n   * @param ni in-edge id\n   * @returns data of the edge\n   */\n  template <bool A                            = EdgeDataByValue,\n            typename std::enable_if<A>::type* = nullptr>\n  edge_data_reference getInEdgeData(edge_iterator ni,\n                                    MethodFlag = MethodFlag::UNPROTECTED) {\n    return inEdgeData[*ni];\n  }\n\n  /**\n   * Given an 
edge id for in edge, get the data associated with that edge.\n   * Returns a constant reference.\n   *\n   * In-edge and out-edge share edge data version.\n   *\n   * @param ni in-edge id\n   * @returns data of the edge\n   */\n  template <bool A                             = EdgeDataByValue,\n            typename std::enable_if<!A>::type* = nullptr>\n  edge_data_reference\n  getInEdgeData(edge_iterator ni, MethodFlag = MethodFlag::UNPROTECTED) const {\n    return BaseGraph::edgeData[inEdgeData[*ni]];\n  }\n\n  /**\n   * Given an edge id for in edge, get the data associated with that edge.\n   * Returns a non-constant reference.\n   *\n   * In-edge and out-edge share edge data version.\n   *\n   * @param ni in-edge id\n   * @returns data of the edge\n   */\n  template <bool A                             = EdgeDataByValue,\n            typename std::enable_if<!A>::type* = nullptr>\n  edge_data_reference getInEdgeData(edge_iterator ni,\n                                    MethodFlag = MethodFlag::UNPROTECTED) {\n    return BaseGraph::edgeData[inEdgeData[*ni]];\n  }\n\n  /**\n   * @returns the prefix sum of in-edges\n   */\n  const EdgeIndData& getInEdgePrefixSum() const { return inEdgeIndData; }\n\n  /////////////////////////////////////////////////////////////////////////////\n  // Utility\n  /////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Sorts outgoing edges of a node. 
Comparison is over getInEdgeDst(e).\n   */\n  void sortInEdgesByDst(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    BaseGraph::acquireNode(N, mflag);\n    // depending on value/ref the type of EdgeSortValue changes\n    using EdgeSortVal = EdgeSortValue<\n        GraphNode,\n        typename std::conditional<EdgeDataByValue, EdgeTy, uint64_t>::type>;\n\n    std::sort(in_edge_sort_begin(N), in_edge_sort_end(N),\n              [=](const EdgeSortVal& e1, const EdgeSortVal& e2) {\n                return e1.dst < e2.dst;\n              });\n  }\n\n  /**\n   * Sorts all incoming edges of all nodes in parallel. Comparison is over\n   * getInEdgeDst(e).\n   */\n  void sortAllInEdgesByDst(MethodFlag mflag = MethodFlag::WRITE) {\n    galois::do_all(\n        galois::iterate((size_t)0, this->size()),\n        [=](GraphNode N) { this->sortInEdgesByDst(N, mflag); },\n        galois::no_stats(), galois::steal());\n  }\n\n  /**\n   * Directly reads the GR file to construct CSR graph\n   * and then constructs reverse edges based on that.\n   */\n  void readAndConstructBiGraphFromGRFile(const std::string& filename) {\n    this->readGraphFromGRFile(filename);\n    constructIncomingEdges();\n  }\n};\n\n} // namespace graphs\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/LC_CSR_Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_LC_CSR_GRAPH_H\n#define GALOIS_GRAPHS_LC_CSR_GRAPH_H\n\n#include <fstream>\n#include <type_traits>\n\n#include <boost/archive/binary_oarchive.hpp>\n#include <boost/archive/binary_iarchive.hpp>\n#include <boost/serialization/split_member.hpp>\n#include <boost/serialization/binary_object.hpp>\n#include <boost/serialization/serialization.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/graphs/GraphHelpers.h\"\n#include \"galois/PODResizeableArray.h\"\n\nnamespace galois::graphs {\n/**\n * Local computation graph (i.e., graph structure does not change). 
The data\n * representation is the traditional compressed-sparse-row (CSR) format.\n *\n * The position of template parameters may change between Galois releases; the\n * most robust way to specify them is through the with_XXX nested templates.\n *\n * An example of use:\n *\n * \\snippet test/graph.cpp Using a graph\n *\n * And in C++11:\n *\n * \\snippet test/graph.cpp Using a graph cxx11\n *\n * @tparam NodeTy data on nodes\n * @tparam EdgeTy data on out edges\n */\n//! [doxygennuma]\ntemplate <typename NodeTy, typename EdgeTy, bool HasNoLockable = false,\n          bool UseNumaAlloc = false, bool HasOutOfLineLockable = false,\n          typename FileEdgeTy = EdgeTy>\nclass LC_CSR_Graph :\n    //! [doxygennuma]\n    private boost::noncopyable,\n    private internal::LocalIteratorFeature<UseNumaAlloc>,\n    private internal::OutOfLineLockableFeature<HasOutOfLineLockable &&\n                                               !HasNoLockable> {\n  template <typename Graph>\n  friend class LC_InOut_Graph;\n\npublic:\n  template <bool _has_id>\n  struct with_id {\n    typedef LC_CSR_Graph type;\n  };\n\n  template <typename _node_data>\n  struct with_node_data {\n    typedef LC_CSR_Graph<_node_data, EdgeTy, HasNoLockable, UseNumaAlloc,\n                         HasOutOfLineLockable, FileEdgeTy>\n        type;\n  };\n\n  template <typename _edge_data>\n  struct with_edge_data {\n    typedef LC_CSR_Graph<NodeTy, _edge_data, HasNoLockable, UseNumaAlloc,\n                         HasOutOfLineLockable, FileEdgeTy>\n        type;\n  };\n\n  template <typename _file_edge_data>\n  struct with_file_edge_data {\n    typedef LC_CSR_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                         HasOutOfLineLockable, _file_edge_data>\n        type;\n  };\n\n  //! 
If true, do not use abstract locks in graph\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    typedef LC_CSR_Graph<NodeTy, EdgeTy, _has_no_lockable, UseNumaAlloc,\n                         HasOutOfLineLockable, FileEdgeTy>\n        type;\n  };\n  template <bool _has_no_lockable>\n  using _with_no_lockable =\n      LC_CSR_Graph<NodeTy, EdgeTy, _has_no_lockable, UseNumaAlloc,\n                   HasOutOfLineLockable, FileEdgeTy>;\n\n  //! If true, use NUMA-aware graph allocation; otherwise, use NUMA interleaved\n  //! allocation.\n  template <bool _use_numa_alloc>\n  struct with_numa_alloc {\n    typedef LC_CSR_Graph<NodeTy, EdgeTy, HasNoLockable, _use_numa_alloc,\n                         HasOutOfLineLockable, FileEdgeTy>\n        type;\n  };\n  template <bool _use_numa_alloc>\n  using _with_numa_alloc =\n      LC_CSR_Graph<NodeTy, EdgeTy, HasNoLockable, _use_numa_alloc,\n                   HasOutOfLineLockable, FileEdgeTy>;\n\n  //! If true, store abstract locks separate from nodes\n  template <bool _has_out_of_line_lockable>\n  struct with_out_of_line_lockable {\n    typedef LC_CSR_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                         _has_out_of_line_lockable, FileEdgeTy>\n        type;\n  };\n\n  typedef read_default_graph_tag read_tag;\n\nprotected:\n  typedef LargeArray<EdgeTy> EdgeData;\n  typedef LargeArray<uint32_t> EdgeDst;\n  typedef internal::NodeInfoBaseTypes<NodeTy,\n                                      !HasNoLockable && !HasOutOfLineLockable>\n      NodeInfoTypes;\n  typedef internal::NodeInfoBase<NodeTy,\n                                 !HasNoLockable && !HasOutOfLineLockable>\n      NodeInfo;\n  typedef LargeArray<uint64_t> EdgeIndData;\n  typedef LargeArray<NodeInfo> NodeData;\n\npublic:\n  typedef uint32_t GraphNode;\n  typedef EdgeTy edge_data_type;\n  typedef FileEdgeTy file_edge_data_type;\n  typedef NodeTy node_data_type;\n  typedef typename EdgeData::reference edge_data_reference;\n  typedef 
typename NodeInfoTypes::reference node_data_reference;\n  using edge_iterator =\n      boost::counting_iterator<typename EdgeIndData::value_type>;\n  using iterator = boost::counting_iterator<typename EdgeDst::value_type>;\n  typedef iterator const_iterator;\n  typedef iterator local_iterator;\n  typedef iterator const_local_iterator;\n\nprotected:\n  NodeData nodeData;\n  EdgeIndData edgeIndData;\n  EdgeDst edgeDst;\n  EdgeData edgeData;\n\n  uint64_t numNodes;\n  uint64_t numEdges;\n\n  typedef internal::EdgeSortIterator<\n      GraphNode, typename EdgeIndData::value_type, EdgeDst, EdgeData>\n      edge_sort_iterator;\n\n  edge_iterator raw_begin(GraphNode N) const {\n    return edge_iterator((N == 0) ? 0 : edgeIndData[N - 1]);\n  }\n\n  edge_iterator raw_end(GraphNode N) const {\n    return edge_iterator(edgeIndData[N]);\n  }\n\n  edge_sort_iterator edge_sort_begin(GraphNode N) {\n    return edge_sort_iterator(*raw_begin(N), &edgeDst, &edgeData);\n  }\n\n  edge_sort_iterator edge_sort_end(GraphNode N) {\n    return edge_sort_iterator(*raw_end(N), &edgeDst, &edgeData);\n  }\n\n  template <bool _A1 = HasNoLockable, bool _A2 = HasOutOfLineLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<!_A1 && !_A2>::type* = 0) {\n    galois::runtime::acquire(&nodeData[N], mflag);\n  }\n\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    this->outOfLineAcquire(getId(N), mflag);\n  }\n\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode, MethodFlag,\n                   typename std::enable_if<_A2>::type* = 0) {}\n\n  template <bool _A1 = EdgeData::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph& graph,\n                          typename FileGraph::edge_iterator nn,\n        
                  typename std::enable_if<!_A1 || _A2>::type* = 0) {\n    typedef LargeArray<FileEdgeTy> FED;\n    if (EdgeData::has_value)\n      edgeData.set(*nn, graph.getEdgeData<typename FED::value_type>(nn));\n  }\n\n  template <bool _A1 = EdgeData::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator nn,\n                          typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    edgeData.set(*nn, {});\n  }\n\n  size_t getId(GraphNode N) { return N; }\n\n  GraphNode getNode(size_t n) { return n; }\n\nprivate:\n  friend class boost::serialization::access;\n\n  template <typename Archive>\n  void save(Archive& ar, const unsigned int) const {\n    ar << numNodes;\n    ar << numEdges;\n\n    // Large Arrays\n    ar << edgeIndData;\n    ar << edgeDst;\n    ar << edgeData;\n  }\n\n  template <typename Archive>\n  void load(Archive& ar, const unsigned int) {\n    ar >> numNodes;\n    ar >> numEdges;\n\n    // Large Arrays\n    ar >> edgeIndData;\n    ar >> edgeDst;\n    ar >> edgeData;\n\n    if (!nodeData.data()) {\n      if (UseNumaAlloc) {\n        nodeData.allocateBlocked(numNodes);\n        this->outOfLineAllocateBlocked(numNodes);\n      } else {\n        nodeData.allocateInterleaved(numNodes);\n        this->outOfLineAllocateInterleaved(numNodes);\n      }\n\n      // Construct nodeData largeArray\n      for (size_t n = 0; n < numNodes; ++n) {\n        nodeData.constructAt(n);\n      }\n    }\n  }\n\n  // The macro BOOST_SERIALIZATION_SPLIT_MEMBER() generates code which invokes\n  // the save or load depending on whether the archive is used for saving or\n  // loading\n  BOOST_SERIALIZATION_SPLIT_MEMBER()\n\npublic:\n  LC_CSR_Graph(LC_CSR_Graph&& rhs) = default;\n\n  LC_CSR_Graph() = default;\n\n  LC_CSR_Graph& operator=(LC_CSR_Graph&&) = default;\n\n  /**\n   * Serializes node data using Boost.\n   *\n   * @param ar Boost archive to serialize to.\n   */\n  void 
serializeNodeData(boost::archive::binary_oarchive& ar) const {\n    ar << nodeData;\n  }\n\n  /**\n   * Deserializes a Boost archive containing node data to the local node data\n   * variable.\n   *\n   * @param ar Boost archive to deserialize from.\n   */\n  void deSerializeNodeData(boost::archive::binary_iarchive& ar) {\n    ar >> nodeData;\n  }\n\n  /**\n   * Serializes graph using Boost.\n   *\n   * @param ar Boost archive to serialize to.\n   */\n  void serializeGraph(boost::archive::binary_oarchive& ar) const {\n    ar << numNodes;\n    ar << numEdges;\n\n    // Large Arrays\n    ar << nodeData;\n    ar << edgeIndData;\n    ar << edgeDst;\n    ar << edgeData;\n  }\n\n  /**\n   * Deserializes a Boost archive to the local graph.\n   *\n   * @param ar Boost archive to deserialize from.\n   */\n  void deSerializeGraph(boost::archive::binary_iarchive& ar) {\n    ar >> numNodes;\n    ar >> numEdges;\n\n    // Large Arrays\n    ar >> nodeData;\n    ar >> edgeIndData;\n    ar >> edgeDst;\n    ar >> edgeData;\n  }\n\n  /**\n   * Accesses the \"prefix sum\" of this graph; takes advantage of the fact\n   * that edge_end(n) is basically prefix_sum[n] (if a prefix sum existed +\n   * if prefix_sum[0] = number of edges in node 0).\n   *\n   * ONLY USE IF GRAPH HAS BEEN LOADED\n   *\n   * @param n Index into edge prefix sum\n   * @returns The value that would be located at index n in an edge prefix sum\n   * array\n   */\n  uint64_t operator[](uint64_t n) { return *(edge_end(n)); }\n\n  template <typename EdgeNumFnTy, typename EdgeDstFnTy, typename EdgeDataFnTy>\n  LC_CSR_Graph(uint32_t _numNodes, uint64_t _numEdges, EdgeNumFnTy edgeNum,\n               EdgeDstFnTy _edgeDst, EdgeDataFnTy _edgeData)\n      : numNodes(_numNodes), numEdges(_numEdges) {\n    if (UseNumaAlloc) {\n      //! 
[numaallocex]\n      nodeData.allocateBlocked(numNodes);\n      edgeIndData.allocateBlocked(numNodes);\n      edgeDst.allocateBlocked(numEdges);\n      edgeData.allocateBlocked(numEdges);\n      //! [numaallocex]\n      this->outOfLineAllocateBlocked(numNodes, false);\n    } else {\n      nodeData.allocateInterleaved(numNodes);\n      edgeIndData.allocateInterleaved(numNodes);\n      edgeDst.allocateInterleaved(numEdges);\n      edgeData.allocateInterleaved(numEdges);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n    for (size_t n = 0; n < numNodes; ++n) {\n      nodeData.constructAt(n);\n    }\n    uint64_t cur = 0;\n    for (size_t n = 0; n < numNodes; ++n) {\n      cur += edgeNum(n);\n      edgeIndData[n] = cur;\n    }\n    cur = 0;\n    for (size_t n = 0; n < numNodes; ++n) {\n      for (uint64_t e = 0, ee = edgeNum(n); e < ee; ++e) {\n        if (EdgeData::has_value)\n          edgeData.set(cur, _edgeData(n, e));\n        edgeDst[cur] = _edgeDst(n, e);\n        ++cur;\n      }\n    }\n  }\n\n  friend void swap(LC_CSR_Graph& lhs, LC_CSR_Graph& rhs) {\n    swap(lhs.nodeData, rhs.nodeData);\n    swap(lhs.edgeIndData, rhs.edgeIndData);\n    swap(lhs.edgeDst, rhs.edgeDst);\n    swap(lhs.edgeData, rhs.edgeData);\n    std::swap(lhs.numNodes, rhs.numNodes);\n    std::swap(lhs.numEdges, rhs.numEdges);\n  }\n\n  node_data_reference getData(GraphNode N,\n                              MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, false);\n    NodeInfo& NI = nodeData[N];\n    acquireNode(N, mflag);\n    return NI.getData();\n  }\n\n  edge_data_reference\n  getEdgeData(edge_iterator ni,\n              MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::UNPROTECTED) {\n    // galois::runtime::checkWrite(mflag, false);\n    return edgeData[*ni];\n  }\n\n  GraphNode getEdgeDst(edge_iterator ni) { return edgeDst[*ni]; }\n\n  size_t size() const { return numNodes; }\n  size_t sizeEdges() const { return numEdges; }\n\n  iterator 
begin() const { return iterator(0); }\n  iterator end() const { return iterator(numNodes); }\n\n  const_local_iterator local_begin() const {\n    return const_local_iterator(this->localBegin(numNodes));\n  }\n\n  const_local_iterator local_end() const {\n    return const_local_iterator(this->localEnd(numNodes));\n  }\n\n  local_iterator local_begin() {\n    return local_iterator(this->localBegin(numNodes));\n  }\n\n  local_iterator local_end() {\n    return local_iterator(this->localEnd(numNodes));\n  }\n\n  edge_iterator edge_begin(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    if (!HasNoLockable && galois::runtime::shouldLock(mflag)) {\n      for (edge_iterator ii = raw_begin(N), ee = raw_end(N); ii != ee; ++ii) {\n        acquireNode(edgeDst[*ii], mflag);\n      }\n    }\n    return raw_begin(N);\n  }\n\n  edge_iterator edge_end(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    return raw_end(N);\n  }\n\n  uint64_t getDegree(GraphNode N) const { return (raw_end(N) - raw_begin(N)); }\n\n  edge_iterator findEdge(GraphNode N1, GraphNode N2) {\n    return std::find_if(edge_begin(N1), edge_end(N1),\n                        [=](edge_iterator e) { return getEdgeDst(e) == N2; });\n  }\n\n  edge_iterator findEdgeSortedByDst(GraphNode N1, GraphNode N2) {\n    auto e = std::lower_bound(\n        edge_begin(N1), edge_end(N1), N2,\n        [=](edge_iterator e, GraphNode N) { return getEdgeDst(e) < N; });\n    return (getEdgeDst(e) == N2) ? 
e : edge_end(N1);\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::make_no_deref_range(edge_begin(N, mflag),\n                                         edge_end(N, mflag));\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return edges(N, mflag);\n  }\n\n  /**\n   * Sorts outgoing edges of a node. Comparison function is over EdgeTy.\n   */\n  template <typename CompTy>\n  void sortEdgesByEdgeData(GraphNode N,\n                           const CompTy& comp = std::less<EdgeTy>(),\n                           MethodFlag mflag   = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    std::sort(\n        edge_sort_begin(N), edge_sort_end(N),\n        internal::EdgeSortCompWrapper<EdgeSortValue<GraphNode, EdgeTy>, CompTy>(\n            comp));\n  }\n\n  /**\n   * Sorts outgoing edges of a node.\n   * Comparison function is over <code>EdgeSortValue<EdgeTy></code>.\n   */\n  template <typename CompTy>\n  void sortEdges(GraphNode N, const CompTy& comp,\n                 MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    std::sort(edge_sort_begin(N), edge_sort_end(N), comp);\n  }\n\n  /**\n   * Sorts outgoing edges of a node. Comparison is over getEdgeDst(e).\n   */\n  void sortEdgesByDst(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    typedef EdgeSortValue<GraphNode, EdgeTy> EdgeSortVal;\n    std::sort(edge_sort_begin(N), edge_sort_end(N),\n              [=](const EdgeSortVal& e1, const EdgeSortVal& e2) {\n                return e1.dst < e2.dst;\n              });\n  }\n\n  /**\n   * Sorts all outgoing edges of all nodes in parallel. 
Comparison is over\n   * getEdgeDst(e).\n   */\n  void sortAllEdgesByDst(MethodFlag mflag = MethodFlag::WRITE) {\n    galois::do_all(\n        galois::iterate(size_t{0}, this->size()),\n        [=](GraphNode N) { this->sortEdgesByDst(N, mflag); },\n        galois::no_stats(), galois::steal());\n  }\n\n  void allocateFrom(const FileGraph& graph) {\n    numNodes = graph.size();\n    numEdges = graph.sizeEdges();\n    if (UseNumaAlloc) {\n      nodeData.allocateBlocked(numNodes);\n      edgeIndData.allocateBlocked(numNodes);\n      edgeDst.allocateBlocked(numEdges);\n      edgeData.allocateBlocked(numEdges);\n      this->outOfLineAllocateBlocked(numNodes);\n    } else {\n      nodeData.allocateInterleaved(numNodes);\n      edgeIndData.allocateInterleaved(numNodes);\n      edgeDst.allocateInterleaved(numEdges);\n      edgeData.allocateInterleaved(numEdges);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n  }\n\n  void allocateFrom(uint32_t nNodes, uint64_t nEdges) {\n    numNodes = nNodes;\n    numEdges = nEdges;\n\n    if (UseNumaAlloc) {\n      nodeData.allocateBlocked(numNodes);\n      edgeIndData.allocateBlocked(numNodes);\n      edgeDst.allocateBlocked(numEdges);\n      edgeData.allocateBlocked(numEdges);\n      this->outOfLineAllocateBlocked(numNodes);\n    } else {\n      nodeData.allocateInterleaved(numNodes);\n      edgeIndData.allocateInterleaved(numNodes);\n      edgeDst.allocateInterleaved(numEdges);\n      edgeData.allocateInterleaved(numEdges);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n  }\n\n  void destroyAndAllocateFrom(uint32_t nNodes, uint64_t nEdges) {\n    numNodes = nNodes;\n    numEdges = nEdges;\n\n    deallocate();\n    if (UseNumaAlloc) {\n      nodeData.allocateBlocked(numNodes);\n      edgeIndData.allocateBlocked(numNodes);\n      edgeDst.allocateBlocked(numEdges);\n      edgeData.allocateBlocked(numEdges);\n      this->outOfLineAllocateBlocked(numNodes);\n    } else {\n      
nodeData.allocateInterleaved(numNodes);\n      edgeIndData.allocateInterleaved(numNodes);\n      edgeDst.allocateInterleaved(numEdges);\n      edgeData.allocateInterleaved(numEdges);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n  }\n\n  void constructNodes() {\n#ifndef GALOIS_GRAPH_CONSTRUCT_SERIAL\n    for (uint32_t x = 0; x < numNodes; ++x) {\n      nodeData.constructAt(x);\n      this->outOfLineConstructAt(x);\n    }\n#else\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numNodes),\n        [&](uint64_t x) {\n          nodeData.constructAt(x);\n          this->outOfLineConstructAt(x);\n        },\n        galois::no_stats(), galois::loopname(\"CONSTRUCT_NODES\"));\n#endif\n  }\n\n  void deallocate() {\n    nodeData.destroy();\n    nodeData.deallocate();\n\n    edgeIndData.deallocate();\n    edgeIndData.destroy();\n\n    edgeDst.deallocate();\n    edgeDst.destroy();\n\n    edgeData.deallocate();\n    edgeData.destroy();\n  }\n\n  void constructEdge(uint64_t e, uint32_t dst,\n                     const typename EdgeData::value_type& val) {\n    edgeData.set(e, val);\n    edgeDst[e] = dst;\n  }\n\n  void constructEdge(uint64_t e, uint32_t dst) { edgeDst[e] = dst; }\n\n  void fixEndEdge(uint32_t n, uint64_t e) { edgeIndData[n] = e; }\n\n  /**\n   * Perform an in-memory transpose of the graph, replacing the original\n   * CSR to CSC\n   */\n  void transpose(const char* regionName = NULL) {\n    galois::StatTimer timer(\"TIMER_GRAPH_TRANSPOSE\", regionName);\n    timer.start();\n\n    EdgeDst edgeDst_old;\n    EdgeData edgeData_new;\n    EdgeIndData edgeIndData_old;\n    EdgeIndData edgeIndData_temp;\n\n    if (UseNumaAlloc) {\n      edgeIndData_old.allocateBlocked(numNodes);\n      edgeIndData_temp.allocateBlocked(numNodes);\n      edgeDst_old.allocateBlocked(numEdges);\n      edgeData_new.allocateBlocked(numEdges);\n    } else {\n      edgeIndData_old.allocateInterleaved(numNodes);\n      edgeIndData_temp.allocateInterleaved(numNodes);\n   
   edgeDst_old.allocateInterleaved(numEdges);\n      edgeData_new.allocateInterleaved(numEdges);\n    }\n\n    // Copy old node->index location + initialize the temp array\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numNodes),\n        [&](uint64_t n) {\n          edgeIndData_old[n]  = edgeIndData[n];\n          edgeIndData_temp[n] = 0;\n        },\n        galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEINTDATA_COPY\"));\n\n    // get destination of edge, copy to array, and\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numEdges),\n        [&](uint64_t e) {\n          auto dst       = edgeDst[e];\n          edgeDst_old[e] = dst;\n          // counting outgoing edges in the tranpose graph by\n          // counting incoming edges in the original graph\n          __sync_add_and_fetch(&edgeIndData_temp[dst], 1);\n        },\n        galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEINTDATA_INC\"));\n\n    // TODO is it worth doing parallel prefix sum?\n    // prefix sum calculation of the edge index array\n    for (uint32_t n = 1; n < numNodes; ++n) {\n      edgeIndData_temp[n] += edgeIndData_temp[n - 1];\n    }\n\n    // copy over the new tranposed edge index data\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numNodes),\n        [&](uint64_t n) { edgeIndData[n] = edgeIndData_temp[n]; },\n        galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEINTDATA_SET\"));\n\n    // edgeIndData_temp[i] will now hold number of edges that all nodes\n    // before the ith node have\n    if (numNodes >= 1) {\n      edgeIndData_temp[0] = 0;\n      galois::do_all(\n          galois::iterate(UINT64_C(1), numNodes),\n          [&](uint64_t n) { edgeIndData_temp[n] = edgeIndData[n - 1]; },\n          galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEINTDATA_TEMP\"));\n    }\n\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numNodes),\n        [&](uint64_t src) {\n          // e = start index into edge array for a particular 
node\n          uint64_t e = (src == 0) ? 0 : edgeIndData_old[src - 1];\n\n          // get all outgoing edges of a particular node in the\n          // non-transpose and convert to incoming\n          while (e < edgeIndData_old[src]) {\n            // destination nodde\n            auto dst = edgeDst_old[e];\n            // location to save edge\n            auto e_new = __sync_fetch_and_add(&(edgeIndData_temp[dst]), 1);\n            // save src as destination\n            edgeDst[e_new] = src;\n            // copy edge data to \"new\" array\n            edgeDataCopy(edgeData_new, edgeData, e_new, e);\n            e++;\n          }\n        },\n        galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEDST\"));\n\n    // if edge weights, then overwrite edgeData with new edge data\n    if (EdgeData::has_value) {\n      galois::do_all(\n          galois::iterate(UINT64_C(0), numEdges),\n          [&](uint64_t e) { edgeDataCopy(edgeData, edgeData_new, e, e); },\n          galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEDATA_SET\"));\n    }\n\n    timer.stop();\n  }\n\n  template <bool is_non_void = EdgeData::has_value>\n  void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, uint64_t e_new,\n                    uint64_t e,\n                    typename std::enable_if<is_non_void>::type* = 0) {\n    edgeData_new[e_new] = edgeData[e];\n  }\n\n  template <bool is_non_void = EdgeData::has_value>\n  void edgeDataCopy(EdgeData&, EdgeData&, uint64_t, uint64_t,\n                    typename std::enable_if<!is_non_void>::type* = 0) {\n    // does nothing\n  }\n\n  template <typename E                                            = EdgeTy,\n            std::enable_if_t<!std::is_same<E, void>::value, int>* = nullptr>\n  void constructFrom(FileGraph& graph, unsigned tid, unsigned total,\n                     const bool readUnweighted = false) {\n    // at this point memory should already be allocated\n    auto r =\n        graph\n            .divideByNode(\n         
       NodeData::size_of::value + EdgeIndData::size_of::value +\n                    LC_CSR_Graph::size_of_out_of_line::value,\n                EdgeDst::size_of::value + EdgeData::size_of::value, tid, total)\n            .first;\n\n    this->setLocalRange(*r.first, *r.second);\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      nodeData.constructAt(*ii);\n      edgeIndData[*ii] = *graph.edge_end(*ii);\n\n      this->outOfLineConstructAt(*ii);\n\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        if (readUnweighted) {\n          edgeData.set(*nn, {});\n        } else {\n          constructEdgeValue(graph, nn);\n        }\n        edgeDst[*nn] = graph.getEdgeDst(nn);\n      }\n    }\n  }\n\n  template <typename E                                           = EdgeTy,\n            std::enable_if_t<std::is_same<E, void>::value, int>* = nullptr>\n  void constructFrom(FileGraph& graph, unsigned tid, unsigned total,\n                     const bool GALOIS_UNUSED(readUnweighted) = false) {\n    // at this point memory should already be allocated\n    auto r =\n        graph\n            .divideByNode(\n                NodeData::size_of::value + EdgeIndData::size_of::value +\n                    LC_CSR_Graph::size_of_out_of_line::value,\n                EdgeDst::size_of::value + EdgeData::size_of::value, tid, total)\n            .first;\n\n    this->setLocalRange(*r.first, *r.second);\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      nodeData.constructAt(*ii);\n      edgeIndData[*ii] = *graph.edge_end(*ii);\n\n      this->outOfLineConstructAt(*ii);\n\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        constructEdgeValue(graph, nn);\n        edgeDst[*nn] = graph.getEdgeDst(nn);\n     
 }\n    }\n  }\n\n  /**\n   * Returns the reference to the edgeIndData LargeArray\n   * (a prefix sum of edges)\n   *\n   * @returns reference to LargeArray edgeIndData\n   */\n  const EdgeIndData& getEdgePrefixSum() const { return edgeIndData; }\n\n  auto divideByNode(size_t nodeSize, size_t edgeSize, size_t id, size_t total) {\n    return galois::graphs::divideNodesBinarySearch(\n        numNodes, numEdges, nodeSize, edgeSize, id, total, edgeIndData);\n  }\n  /**\n   *\n   * custom allocator for vector<vector<>>\n   * Adding for Louvain clustering\n   * TODO: Find better way to do this\n   */\n  void constructFrom(uint32_t numNodes, uint64_t numEdges,\n                     std::vector<uint64_t>& prefix_sum,\n                     std::vector<std::vector<uint32_t>>& edges_id,\n                     std::vector<std::vector<EdgeTy>>& edges_data) {\n    // allocateFrom(numNodes, numEdges);\n    /*\n     * Deallocate if reusing the graph\n     */\n    destroyAndAllocateFrom(numNodes, numEdges);\n    constructNodes();\n\n    galois::do_all(galois::iterate((uint32_t)0, numNodes),\n                   [&](uint32_t n) { edgeIndData[n] = prefix_sum[n]; });\n\n    galois::do_all(galois::iterate((uint32_t)0, numNodes), [&](uint32_t n) {\n      if (n == 0) {\n        if (edgeIndData[n] > 0) {\n          std::copy(edges_id[n].begin(), edges_id[n].end(), edgeDst.begin());\n          std::copy(edges_data[n].begin(), edges_data[n].end(),\n                    edgeData.begin());\n        }\n      } else {\n        if (edgeIndData[n] - edgeIndData[n - 1] > 0) {\n          std::copy(edges_id[n].begin(), edges_id[n].end(),\n                    edgeDst.begin() + edgeIndData[n - 1]);\n          std::copy(edges_data[n].begin(), edges_data[n].end(),\n                    edgeData.begin() + edgeIndData[n - 1]);\n        }\n      }\n    });\n\n    initializeLocalRanges();\n  }\n  void constructFrom(\n      uint32_t numNodes, uint64_t numEdges, std::vector<uint64_t>& prefix_sum,\n      
galois::gstl::Vector<galois::PODResizeableArray<uint32_t>>& edges_id,\n      std::vector<std::vector<EdgeTy>>& edges_data) {\n    allocateFrom(numNodes, numEdges);\n    constructNodes();\n\n    galois::do_all(galois::iterate((uint32_t)0, numNodes),\n                   [&](uint32_t n) { edgeIndData[n] = prefix_sum[n]; });\n\n    galois::do_all(galois::iterate((uint32_t)0, numNodes), [&](uint32_t n) {\n      if (n == 0) {\n        if (edgeIndData[n] > 0) {\n          std::copy(edges_id[n].begin(), edges_id[n].end(), edgeDst.begin());\n          std::copy(edges_data[n].begin(), edges_data[n].end(),\n                    edgeData.begin());\n        }\n      } else {\n        if (edgeIndData[n] - edgeIndData[n - 1] > 0) {\n          std::copy(edges_id[n].begin(), edges_id[n].end(),\n                    edgeDst.begin() + edgeIndData[n - 1]);\n          std::copy(edges_data[n].begin(), edges_data[n].end(),\n                    edgeData.begin() + edgeIndData[n - 1]);\n        }\n      }\n    });\n\n    initializeLocalRanges();\n  }\n\n  /**\n   * Reads the GR files directly into in-memory\n   * data-structures of LC_CSR graphs using freads.\n   *\n   * Edge is not void.\n   *\n   */\n  template <\n      typename U                                                      = void,\n      typename std::enable_if<!std::is_void<EdgeTy>::value, U>::type* = nullptr>\n  void readGraphFromGRFile(const std::string& filename) {\n    std::ifstream graphFile(filename.c_str());\n    if (!graphFile.is_open()) {\n      GALOIS_DIE(\"failed to open file\");\n    }\n    uint64_t header[4];\n    graphFile.read(reinterpret_cast<char*>(header), sizeof(uint64_t) * 4);\n    uint64_t version = header[0];\n    numNodes         = header[2];\n    numEdges         = header[3];\n    galois::gPrint(\"Number of Nodes: \", numNodes,\n                   \", Number of Edges: \", numEdges, \"\\n\");\n    allocateFrom(numNodes, numEdges);\n    constructNodes();\n    /**\n     * Load outIndex array\n     **/\n    
assert(edgeIndData.data());\n    if (!edgeIndData.data()) {\n      GALOIS_DIE(\"out of memory\");\n    }\n\n    // start position to read index data\n    uint64_t readPosition = (4 * sizeof(uint64_t));\n    graphFile.seekg(readPosition);\n    graphFile.read(reinterpret_cast<char*>(edgeIndData.data()),\n                   sizeof(uint64_t) * numNodes);\n    /**\n     * Load edgeDst array\n     **/\n    assert(edgeDst.data());\n    if (!edgeDst.data()) {\n      GALOIS_DIE(\"out of memory\");\n    }\n\n    readPosition = ((4 + numNodes) * sizeof(uint64_t));\n    graphFile.seekg(readPosition);\n    if (version == 1) {\n      graphFile.read(reinterpret_cast<char*>(edgeDst.data()),\n                     sizeof(uint32_t) * numEdges);\n      readPosition =\n          ((4 + numNodes) * sizeof(uint64_t) + numEdges * sizeof(uint32_t));\n      // version 1 padding TODO make version agnostic\n      if (numEdges % 2) {\n        readPosition += sizeof(uint32_t);\n      }\n    } else if (version == 2) {\n      graphFile.read(reinterpret_cast<char*>(edgeDst.data()),\n                     sizeof(uint64_t) * numEdges);\n      readPosition =\n          ((4 + numNodes) * sizeof(uint64_t) + numEdges * sizeof(uint64_t));\n      if (numEdges % 2) {\n        readPosition += sizeof(uint64_t);\n      }\n    } else {\n      GALOIS_DIE(\"unknown file version: \", version);\n    }\n    /**\n     * Load edge data array\n     **/\n    assert(edgeData.data());\n    if (!edgeData.data()) {\n      GALOIS_DIE(\"out of memory\");\n    }\n    graphFile.seekg(readPosition);\n    graphFile.read(reinterpret_cast<char*>(edgeData.data()),\n                   sizeof(EdgeTy) * numEdges);\n\n    initializeLocalRanges();\n    graphFile.close();\n  }\n\n  /**\n   * Reads the GR files directly into in-memory\n   * data-structures of LC_CSR graphs using freads.\n   *\n   * Edge is void.\n   *\n   */\n  template <\n      typename U                                                     = void,\n      typename 
std::enable_if<std::is_void<EdgeTy>::value, U>::type* = nullptr>\n  void readGraphFromGRFile(const std::string& filename) {\n    std::ifstream graphFile(filename.c_str());\n    if (!graphFile.is_open()) {\n      GALOIS_DIE(\"failed to open file\");\n    }\n    uint64_t header[4];\n    graphFile.read(reinterpret_cast<char*>(header), sizeof(uint64_t) * 4);\n    uint64_t version = header[0];\n    numNodes         = header[2];\n    numEdges         = header[3];\n    galois::gPrint(\"Number of Nodes: \", numNodes,\n                   \", Number of Edges: \", numEdges, \"\\n\");\n    allocateFrom(numNodes, numEdges);\n    constructNodes();\n    /**\n     * Load outIndex array\n     **/\n    assert(edgeIndData.data());\n    if (!edgeIndData.data()) {\n      GALOIS_DIE(\"out of memory\");\n    }\n    // start position to read index data\n    uint64_t readPosition = (4 * sizeof(uint64_t));\n    graphFile.seekg(readPosition);\n    graphFile.read(reinterpret_cast<char*>(edgeIndData.data()),\n                   sizeof(uint64_t) * numNodes);\n    /**\n     * Load edgeDst array\n     **/\n    assert(edgeDst.data());\n    if (!edgeDst.data()) {\n      GALOIS_DIE(\"out of memory\");\n    }\n    readPosition = ((4 + numNodes) * sizeof(uint64_t));\n    graphFile.seekg(readPosition);\n    if (version == 1) {\n      graphFile.read(reinterpret_cast<char*>(edgeDst.data()),\n                     sizeof(uint32_t) * numEdges);\n    } else if (version == 2) {\n      graphFile.read(reinterpret_cast<char*>(edgeDst.data()),\n                     sizeof(uint64_t) * numEdges);\n    } else {\n      GALOIS_DIE(\"unknown file version: \", version);\n    }\n\n    initializeLocalRanges();\n    graphFile.close();\n  }\n\n  /**\n   * Given a manually created graph, initialize the local ranges on this graph\n   * so that threads can iterate over a balanced number of vertices.\n   */\n  void initializeLocalRanges() {\n    galois::on_each([&](unsigned tid, unsigned total) {\n      auto r = divideByNode(0, 
1, tid, total).first;\n      this->setLocalRange(*r.first, *r.second);\n    });\n  }\n};\n\n} // namespace galois::graphs\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/LC_CSR_Hypergraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_LC_CSR_HYPERGRAPH_H\n#define GALOIS_GRAPHS_LC_CSR_HYPERGRAPH_H\n\n#include <type_traits>\n\n#include <boost/archive/binary_oarchive.hpp>\n#include <boost/archive/binary_iarchive.hpp>\n#include <boost/serialization/split_member.hpp>\n#include <boost/serialization/binary_object.hpp>\n#include <boost/serialization/serialization.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/graphs/GraphHelpers.h\"\n#include \"galois/PODResizeableArray.h\"\n\nnamespace galois {\nnamespace graphs {\n/**\n * Local computation graph (i.e., graph structure does not change). 
The data\n * representation is the traditional compressed-sparse-row (CSR) format.\n *\n * The position of template parameters may change between Galois releases; the\n * most robust way to specify them is through the with_XXX nested templates.\n *\n * An example of use:\n *\n * \\snippet test/graph.cpp Using a graph\n *\n * And in C++11:\n *\n * \\snippet test/graph.cpp Using a graph cxx11\n *\n * @tparam NodeTy data on nodes\n * @tparam EdgeTy data on out edges\n */\n//! [doxygennuma]\ntemplate <typename NodeTy, typename EdgeTy, bool HasNoLockable = false,\n          bool UseNumaAlloc =\n              false, // true => numa-blocked, false => numa-interleaved\n          bool HasOutOfLineLockable = false, typename FileEdgeTy = EdgeTy>\nclass LC_CSR_Hypergraph :\n    //! [doxygennuma]\n    private boost::noncopyable,\n    private internal::LocalIteratorFeature<UseNumaAlloc>,\n    private internal::OutOfLineLockableFeature<HasOutOfLineLockable &&\n                                               !HasNoLockable> {\n  template <typename Graph>\n  friend class LC_InOut_Graph;\n\npublic:\n  template <bool _has_id>\n  struct with_id {\n    typedef LC_CSR_Hypergraph type;\n  };\n\n  template <typename _node_data>\n  struct with_node_data {\n    typedef LC_CSR_Hypergraph<_node_data, EdgeTy, HasNoLockable, UseNumaAlloc,\n                              HasOutOfLineLockable, FileEdgeTy>\n        type;\n  };\n\n  template <typename _edge_data>\n  struct with_edge_data {\n    typedef LC_CSR_Hypergraph<NodeTy, _edge_data, HasNoLockable, UseNumaAlloc,\n                              HasOutOfLineLockable, FileEdgeTy>\n        type;\n  };\n\n  template <typename _file_edge_data>\n  struct with_file_edge_data {\n    typedef LC_CSR_Hypergraph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                              HasOutOfLineLockable, _file_edge_data>\n        type;\n  };\n\n  //! 
If true, do not use abstract locks in graph\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    typedef LC_CSR_Hypergraph<NodeTy, EdgeTy, _has_no_lockable, UseNumaAlloc,\n                              HasOutOfLineLockable, FileEdgeTy>\n        type;\n  };\n  template <bool _has_no_lockable>\n  using _with_no_lockable =\n      LC_CSR_Hypergraph<NodeTy, EdgeTy, _has_no_lockable, UseNumaAlloc,\n                        HasOutOfLineLockable, FileEdgeTy>;\n\n  //! If true, use NUMA-aware graph allocation\n  template <bool _use_numa_alloc>\n  struct with_numa_alloc {\n    typedef LC_CSR_Hypergraph<NodeTy, EdgeTy, HasNoLockable, _use_numa_alloc,\n                              HasOutOfLineLockable, FileEdgeTy>\n        type;\n  };\n  template <bool _use_numa_alloc>\n  using _with_numa_alloc =\n      LC_CSR_Hypergraph<NodeTy, EdgeTy, HasNoLockable, _use_numa_alloc,\n                        HasOutOfLineLockable, FileEdgeTy>;\n\n  //! If true, store abstract locks separate from nodes\n  template <bool _has_out_of_line_lockable>\n  struct with_out_of_line_lockable {\n    typedef LC_CSR_Hypergraph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                              _has_out_of_line_lockable, FileEdgeTy>\n        type;\n  };\n\n  typedef read_default_graph_tag read_tag;\n\nprotected:\n  typedef LargeArray<EdgeTy> EdgeData;\n  typedef LargeArray<uint32_t> EdgeDst;\n  typedef internal::NodeInfoBaseTypes<NodeTy,\n                                      !HasNoLockable && !HasOutOfLineLockable>\n      NodeInfoTypes;\n  typedef internal::NodeInfoBase<NodeTy,\n                                 !HasNoLockable && !HasOutOfLineLockable>\n      NodeInfo;\n  typedef LargeArray<uint64_t> EdgeIndData;\n  typedef LargeArray<NodeInfo> NodeData;\n\npublic:\n  typedef uint32_t GraphNode;\n  typedef EdgeTy edge_data_type;\n  typedef FileEdgeTy file_edge_data_type;\n  typedef NodeTy node_data_type;\n  typedef typename EdgeData::reference edge_data_reference;\n  typedef typename 
NodeInfoTypes::reference node_data_reference;\n  using edge_iterator =\n      boost::counting_iterator<typename EdgeIndData::value_type>;\n  using iterator = boost::counting_iterator<typename EdgeDst::value_type>;\n  typedef iterator const_iterator;\n  typedef iterator local_iterator;\n  typedef iterator const_local_iterator;\n  // for hypergraphs\n  size_t hedges;\n  size_t hnodes;\n\nprotected:\n  NodeData nodeData;\n  EdgeIndData edgeIndData;\n  EdgeDst edgeDst;\n  EdgeData edgeData;\n\n  uint64_t numNodes;\n  uint64_t numEdges;\n\n  typedef internal::EdgeSortIterator<\n      GraphNode, typename EdgeIndData::value_type, EdgeDst, EdgeData>\n      edge_sort_iterator;\n\n  edge_iterator raw_begin(GraphNode N) const {\n    return edge_iterator((N == 0) ? 0 : edgeIndData[N - 1]);\n  }\n\n  edge_iterator raw_end(GraphNode N) const {\n    return edge_iterator(edgeIndData[N]);\n  }\n\n  edge_sort_iterator edge_sort_begin(GraphNode N) {\n    return edge_sort_iterator(*raw_begin(N), &edgeDst, &edgeData);\n  }\n\n  edge_sort_iterator edge_sort_end(GraphNode N) {\n    return edge_sort_iterator(*raw_end(N), &edgeDst, &edgeData);\n  }\n\n  template <bool _A1 = HasNoLockable, bool _A2 = HasOutOfLineLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<!_A1 && !_A2>::type* = 0) {\n    galois::runtime::acquire(&nodeData[N], mflag);\n  }\n\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    this->outOfLineAcquire(getId(N), mflag);\n  }\n\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode, MethodFlag,\n                   typename std::enable_if<_A2>::type* = 0) {}\n\n  template <bool _A1 = EdgeData::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph& graph,\n                        
  typename FileGraph::edge_iterator nn,\n                          typename std::enable_if<!_A1 || _A2>::type* = 0) {\n    typedef LargeArray<FileEdgeTy> FED;\n    if (EdgeData::has_value)\n      edgeData.set(*nn, graph.getEdgeData<typename FED::value_type>(nn));\n  }\n\n  template <bool _A1 = EdgeData::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator nn,\n                          typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    edgeData.set(*nn, {});\n  }\n\n  size_t getId(GraphNode N) { return N; }\n\n  GraphNode getNode(size_t n) { return n; }\n\nprivate:\n  friend class boost::serialization::access;\n  template <typename Archive>\n  void save(Archive& ar, const unsigned int) const {\n    ar << numNodes;\n    ar << numEdges;\n\n    // Large Arrays\n    ar << edgeIndData;\n    ar << edgeDst;\n    ar << edgeData;\n  }\n\n  template <typename Archive>\n  void load(Archive& ar, const unsigned int) {\n    ar >> numNodes;\n    ar >> numEdges;\n\n    // Large Arrays\n    ar >> edgeIndData;\n    ar >> edgeDst;\n    ar >> edgeData;\n\n    if (!nodeData.data()) {\n      if (UseNumaAlloc) {\n        nodeData.allocateBlocked(numNodes);\n        this->outOfLineAllocateBlocked(numNodes);\n      } else {\n        nodeData.allocateInterleaved(numNodes);\n        this->outOfLineAllocateInterleaved(numNodes);\n      }\n\n      // Construct nodeData largeArray\n      for (size_t n = 0; n < numNodes; ++n) {\n        nodeData.constructAt(n);\n      }\n    }\n  }\n  // The macro BOOST_SERIALIZATION_SPLIT_MEMBER() generates code which invokes\n  // the save or load depending on whether the archive is used for saving or\n  // loading\n  BOOST_SERIALIZATION_SPLIT_MEMBER()\n\npublic:\n  LC_CSR_Hypergraph(LC_CSR_Hypergraph&& rhs) = default;\n  LC_CSR_Hypergraph()                        = default;\n  LC_CSR_Hypergraph& operator=(LC_CSR_Hypergraph&&) = default;\n\n  /**\n   * Serializes 
node data using Boost.\n   *\n   * @param ar Boost archive to serialize to.\n   */\n  void serializeNodeData(boost::archive::binary_oarchive& ar) const {\n    ar << nodeData;\n  }\n\n  /**\n   * Deserializes a Boost archive containing node data to the local node data\n   * variable.\n   *\n   * @param ar Boost archive to deserialize from.\n   */\n  void deSerializeNodeData(boost::archive::binary_iarchive& ar) {\n    ar >> nodeData;\n  }\n\n  /**\n   * Serializes graph using Boost.\n   *\n   * @param ar Boost archive to serialize to.\n   */\n  void serializeGraph(boost::archive::binary_oarchive& ar) const {\n    ar << numNodes;\n    ar << numEdges;\n\n    // Large Arrays\n    ar << nodeData;\n    ar << edgeIndData;\n    ar << edgeDst;\n    ar << edgeData;\n  }\n\n  /**\n   * Deserializes a Boost archive to the local graph.\n   *\n   * @param ar Boost archive to deserialize from.\n   */\n  void deSerializeGraph(boost::archive::binary_iarchive& ar) {\n    ar >> numNodes;\n    ar >> numEdges;\n\n    // Large Arrays\n    ar >> nodeData;\n    ar >> edgeIndData;\n    ar >> edgeDst;\n    ar >> edgeData;\n  }\n\n  /**\n   * Accesses the \"prefix sum\" of this graph; takes advantage of the fact\n   * that edge_end(n) is basically prefix_sum[n] (if a prefix sum existed +\n   * if prefix_sum[0] = number of edges in node 0).\n   *\n   * ONLY USE IF GRAPH HAS BEEN LOADED\n   *\n   * @param n Index into edge prefix sum\n   * @returns The value that would be located at index n in an edge prefix sum\n   * array\n   */\n  uint64_t operator[](uint64_t n) { return *(edge_end(n)); }\n\n  template <typename EdgeNumFnTy, typename EdgeDstFnTy, typename EdgeDataFnTy>\n  LC_CSR_Hypergraph(uint32_t _numNodes, uint64_t _numEdges, EdgeNumFnTy edgeNum,\n                    EdgeDstFnTy _edgeDst, EdgeDataFnTy _edgeData)\n      : numNodes(_numNodes), numEdges(_numEdges) {\n    // std::cerr << \"\\n**\" << numNodes << \" \" << numEdges << \"\\n\\n\";\n    if (UseNumaAlloc) {\n      //! 
[numaallocex]\n      nodeData.allocateBlocked(numNodes);\n      edgeIndData.allocateBlocked(numNodes);\n      edgeDst.allocateBlocked(numEdges);\n      edgeData.allocateBlocked(numEdges);\n      //! [numaallocex]\n      this->outOfLineAllocateBlocked(numNodes, false);\n    } else {\n      nodeData.allocateInterleaved(numNodes);\n      edgeIndData.allocateInterleaved(numNodes);\n      edgeDst.allocateInterleaved(numEdges);\n      edgeData.allocateInterleaved(numEdges);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n    // std::cerr << \"Done Alloc\\n\";\n    for (size_t n = 0; n < numNodes; ++n) {\n      nodeData.constructAt(n);\n    }\n    // std::cerr << \"Done Node Construct\\n\";\n    uint64_t cur = 0;\n    for (size_t n = 0; n < numNodes; ++n) {\n      cur += edgeNum(n);\n      edgeIndData[n] = cur;\n    }\n    // std::cerr << \"Done Edge Reserve\\n\";\n    cur = 0;\n    for (size_t n = 0; n < numNodes; ++n) {\n      // if (n % (1024*128) == 0)\n      //  std::cout << n << \" \" << cur << \"\\n\";\n      for (uint64_t e = 0, ee = edgeNum(n); e < ee; ++e) {\n        if (EdgeData::has_value)\n          edgeData.set(cur, _edgeData(n, e));\n        edgeDst[cur] = _edgeDst(n, e);\n        ++cur;\n      }\n    }\n\n    // std::cerr << \"Done Construct\\n\";\n  }\n\n  friend void swap(LC_CSR_Hypergraph& lhs, LC_CSR_Hypergraph& rhs) {\n    swap(lhs.nodeData, rhs.nodeData);\n    swap(lhs.edgeIndData, rhs.edgeIndData);\n    swap(lhs.edgeDst, rhs.edgeDst);\n    swap(lhs.edgeData, rhs.edgeData);\n    std::swap(lhs.numNodes, rhs.numNodes);\n    std::swap(lhs.numEdges, rhs.numEdges);\n  }\n\n  node_data_reference getData(GraphNode N,\n                              MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, false);\n    NodeInfo& NI = nodeData[N];\n    acquireNode(N, mflag);\n    return NI.getData();\n  }\n\n  edge_data_reference\n  getEdgeData(edge_iterator ni,\n              MethodFlag GALOIS_UNUSED(mflag) = 
MethodFlag::UNPROTECTED) {\n    // galois::runtime::checkWrite(mflag, false);\n    return edgeData[*ni];\n  }\n\n  GraphNode getEdgeDst(edge_iterator ni) { return edgeDst[*ni]; }\n\n  size_t size() const { return numNodes; }\n  size_t sizeEdges() const { return numEdges; }\n\n  iterator begin() const { return iterator(0); }\n  iterator end() const { return iterator(numNodes); }\n\n  const_local_iterator local_begin() const {\n    return const_local_iterator(this->localBegin(numNodes));\n  }\n\n  const_local_iterator local_end() const {\n    return const_local_iterator(this->localEnd(numNodes));\n  }\n\n  local_iterator local_begin() {\n    return local_iterator(this->localBegin(numNodes));\n  }\n\n  local_iterator local_end() {\n    return local_iterator(this->localEnd(numNodes));\n  }\n\n  edge_iterator edge_begin(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    if (!HasNoLockable && galois::runtime::shouldLock(mflag)) {\n      for (edge_iterator ii = raw_begin(N), ee = raw_end(N); ii != ee; ++ii) {\n        acquireNode(edgeDst[*ii], mflag);\n      }\n    }\n    return raw_begin(N);\n  }\n\n  edge_iterator edge_end(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    return raw_end(N);\n  }\n\n  edge_iterator findEdge(GraphNode N1, GraphNode N2) {\n    return std::find_if(edge_begin(N1), edge_end(N1),\n                        [=](edge_iterator e) { return getEdgeDst(e) == N2; });\n  }\n\n  edge_iterator findEdgeSortedByDst(GraphNode N1, GraphNode N2) {\n    auto e = std::lower_bound(\n        edge_begin(N1), edge_end(N1), N2,\n        [=](edge_iterator e, GraphNode N) { return getEdgeDst(e) < N; });\n    return (getEdgeDst(e) == N2) ? 
e : edge_end(N1);\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::make_no_deref_range(edge_begin(N, mflag),\n                                         edge_end(N, mflag));\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return edges(N, mflag);\n  }\n\n  /**\n   * Sorts outgoing edges of a node. Comparison function is over EdgeTy.\n   */\n  template <typename CompTy>\n  void sortEdgesByEdgeData(GraphNode N,\n                           const CompTy& comp = std::less<EdgeTy>(),\n                           MethodFlag mflag   = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    std::sort(\n        edge_sort_begin(N), edge_sort_end(N),\n        internal::EdgeSortCompWrapper<EdgeSortValue<GraphNode, EdgeTy>, CompTy>(\n            comp));\n  }\n\n  /**\n   * Sorts outgoing edges of a node.\n   * Comparison function is over <code>EdgeSortValue<EdgeTy></code>.\n   */\n  template <typename CompTy>\n  void sortEdges(GraphNode N, const CompTy& comp,\n                 MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    std::sort(edge_sort_begin(N), edge_sort_end(N), comp);\n  }\n\n  /**\n   * Sorts outgoing edges of a node. Comparison is over getEdgeDst(e).\n   */\n  void sortEdgesByDst(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    typedef EdgeSortValue<GraphNode, EdgeTy> EdgeSortVal;\n    std::sort(edge_sort_begin(N), edge_sort_end(N),\n              [=](const EdgeSortVal& e1, const EdgeSortVal& e2) {\n                return e1.dst < e2.dst;\n              });\n  }\n\n  /**\n   * Sorts all outgoing edges of all nodes in parallel. 
Comparison is over\n   * getEdgeDst(e).\n   */\n  void sortAllEdgesByDst(MethodFlag mflag = MethodFlag::WRITE) {\n    galois::do_all(\n        galois::iterate(size_t{0}, this->size()),\n        [=](GraphNode N) { this->sortEdgesByDst(N, mflag); },\n        galois::no_stats(), galois::steal());\n  }\n\n  void allocateFrom(FileGraph& graph) {\n    numNodes = graph.size();\n    numEdges = graph.sizeEdges();\n    if (UseNumaAlloc) {\n      nodeData.allocateBlocked(numNodes);\n      edgeIndData.allocateBlocked(numNodes);\n      edgeDst.allocateBlocked(numEdges);\n      edgeData.allocateBlocked(numEdges);\n      this->outOfLineAllocateBlocked(numNodes);\n    } else {\n      nodeData.allocateInterleaved(numNodes);\n      edgeIndData.allocateInterleaved(numNodes);\n      edgeDst.allocateInterleaved(numEdges);\n      edgeData.allocateInterleaved(numEdges);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n  }\n\n  void allocateFrom(uint32_t nNodes, uint64_t nEdges) {\n    numNodes = nNodes;\n    numEdges = nEdges;\n\n    if (UseNumaAlloc) {\n      nodeData.allocateBlocked(numNodes);\n      edgeIndData.allocateBlocked(numNodes);\n      edgeDst.allocateBlocked(numEdges);\n      edgeData.allocateBlocked(numEdges);\n      this->outOfLineAllocateBlocked(numNodes);\n    } else {\n      nodeData.allocateInterleaved(numNodes);\n      edgeIndData.allocateInterleaved(numNodes);\n      edgeDst.allocateInterleaved(numEdges);\n      edgeData.allocateInterleaved(numEdges);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n  }\n\n  void constructNodes() {\n#ifndef GALOIS_GRAPH_CONSTRUCT_SERIAL\n    for (uint32_t x = 0; x < numNodes; ++x) {\n      nodeData.constructAt(x);\n      this->outOfLineConstructAt(x);\n    }\n#else\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numNodes),\n        [&](uint64_t x) {\n          nodeData.constructAt(x);\n          this->outOfLineConstructAt(x);\n        },\n        galois::no_stats(), 
galois::loopname(\"CONSTRUCT_NODES\"));\n#endif\n  }\n\n  void deallocate() {\n    nodeData.destroy();\n    nodeData.deallocate();\n\n    edgeIndData.deallocate();\n    edgeIndData.destroy();\n\n    edgeDst.deallocate();\n    edgeDst.destroy();\n\n    edgeData.deallocate();\n    edgeData.destroy();\n  }\n\n  void constructEdge(uint64_t e, uint32_t dst,\n                     const typename EdgeData::value_type& val) {\n    edgeData.set(e, val);\n    edgeDst[e] = dst;\n  }\n\n  void constructEdge(uint64_t e, uint32_t dst) { edgeDst[e] = dst; }\n\n  void fixEndEdge(uint32_t n, uint64_t e) { edgeIndData[n] = e; }\n\n  /**\n   * Perform an in-memory transpose of the graph, replacing the original\n   * CSR to CSC\n   */\n  void transpose(const char* regionName = NULL) {\n    galois::StatTimer timer(\"TIMER_GRAPH_TRANSPOSE\", regionName);\n    timer.start();\n\n    EdgeDst edgeDst_old;\n    EdgeData edgeData_new;\n    EdgeIndData edgeIndData_old;\n    EdgeIndData edgeIndData_temp;\n\n    if (UseNumaAlloc) {\n      edgeIndData_old.allocateBlocked(numNodes);\n      edgeIndData_temp.allocateBlocked(numNodes);\n      edgeDst_old.allocateBlocked(numEdges);\n      edgeData_new.allocateBlocked(numEdges);\n    } else {\n      edgeIndData_old.allocateInterleaved(numNodes);\n      edgeIndData_temp.allocateInterleaved(numNodes);\n      edgeDst_old.allocateInterleaved(numEdges);\n      edgeData_new.allocateInterleaved(numEdges);\n    }\n\n    // Copy old node->index location + initialize the temp array\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numNodes),\n        [&](uint64_t n) {\n          edgeIndData_old[n]  = edgeIndData[n];\n          edgeIndData_temp[n] = 0;\n        },\n        galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEINTDATA_COPY\"));\n\n    // get destination of edge, copy to array, and\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numEdges),\n        [&](uint64_t e) {\n          auto dst       = edgeDst[e];\n          edgeDst_old[e] 
= dst;\n          // counting outgoing edges in the transpose graph by\n          // counting incoming edges in the original graph\n          __sync_add_and_fetch(&(edgeIndData_temp[dst]), 1);\n        },\n        galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEINTDATA_INC\"));\n\n    // TODO is it worth doing parallel prefix sum?\n    // prefix sum calculation of the edge index array\n    for (uint32_t n = 1; n < numNodes; ++n) {\n      edgeIndData_temp[n] += edgeIndData_temp[n - 1];\n    }\n\n    // copy over the new transposed edge index data\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numNodes),\n        [&](uint64_t n) { edgeIndData[n] = edgeIndData_temp[n]; },\n        galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEINTDATA_SET\"));\n\n    // edgeIndData_temp[i] will now hold number of edges that all nodes\n    // before the ith node have\n    if (numNodes >= 1) {\n      edgeIndData_temp[0] = 0;\n      galois::do_all(\n          galois::iterate(UINT64_C(1), numNodes),\n          [&](uint64_t n) { edgeIndData_temp[n] = edgeIndData[n - 1]; },\n          galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEINTDATA_TEMP\"));\n    }\n\n    galois::do_all(\n        galois::iterate(UINT64_C(0), numNodes),\n        [&](uint64_t src) {\n          // e = start index into edge array for a particular node\n          uint64_t e = (src == 0) ? 
0 : edgeIndData_old[src - 1];\n\n          // get all outgoing edges of a particular node in the\n          // non-transpose and convert to incoming\n          while (e < edgeIndData_old[src]) {\n            // destination node\n            auto dst = edgeDst_old[e];\n            // location to save edge\n            auto e_new = __sync_fetch_and_add(&(edgeIndData_temp[dst]), 1);\n            // save src as destination\n            edgeDst[e_new] = src;\n            // copy edge data to \"new\" array\n            edgeDataCopy(edgeData_new, edgeData, e_new, e);\n            e++;\n          }\n        },\n        galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEDST\"));\n\n    // if edge weights, then overwrite edgeData with new edge data\n    if (EdgeData::has_value) {\n      galois::do_all(\n          galois::iterate(UINT64_C(0), numEdges),\n          [&](uint64_t e) { edgeDataCopy(edgeData, edgeData_new, e, e); },\n          galois::no_stats(), galois::loopname(\"TRANSPOSE_EDGEDATA_SET\"));\n    }\n\n    timer.stop();\n  }\n\n  template <bool is_non_void = EdgeData::has_value>\n  void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, uint64_t e_new,\n                    uint64_t e,\n                    typename std::enable_if<is_non_void>::type* = 0) {\n    edgeData_new[e_new] = edgeData[e];\n  }\n\n  template <bool is_non_void = EdgeData::has_value>\n  void edgeDataCopy(EdgeData&, EdgeData&, uint64_t, uint64_t,\n                    typename std::enable_if<!is_non_void>::type* = 0) {\n    // does nothing\n  }\n\n  void constructFrom(FileGraph& graph, unsigned tid, unsigned total) {\n    // at this point memory should already be allocated\n    auto r =\n        graph\n            .divideByNode(\n                NodeData::size_of::value + EdgeIndData::size_of::value +\n                    LC_CSR_Hypergraph::size_of_out_of_line::value,\n                EdgeDst::size_of::value + EdgeData::size_of::value, tid, total)\n            .first;\n\n    
this->setLocalRange(*r.first, *r.second);\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      nodeData.constructAt(*ii);\n      edgeIndData[*ii] = *graph.edge_end(*ii);\n\n      this->outOfLineConstructAt(*ii);\n\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        constructEdgeValue(graph, nn);\n        edgeDst[*nn] = graph.getEdgeDst(nn);\n      }\n    }\n  }\n\n  /**\n   * Returns the reference to the edgeIndData LargeArray\n   * (a prefix sum of edges)\n   *\n   * @returns reference to LargeArray edgeIndData\n   */\n  const EdgeIndData& getEdgePrefixSum() const { return edgeIndData; }\n\n  auto divideByNode(size_t nodeSize, size_t edgeSize, size_t id, size_t total) {\n    return galois::graphs::divideNodesBinarySearch(\n        numNodes, numEdges, nodeSize, edgeSize, id, total, edgeIndData);\n  }\n  /**\n   *\n   * custom allocator for vector<vector<>>\n   * Adding for Louvain clustering\n   * TODO: Find better way to do this\n   */\n  void constructFrom(\n      uint32_t numNodes, uint64_t numEdges, std::vector<uint64_t>& prefix_sum,\n      std::vector<std::vector<uint32_t>>&\n          edges_id) { //, std::vector<std::vector<EdgeTy>>& edges_data) {\n    allocateFrom(numNodes, numEdges);\n    constructNodes();\n\n    galois::do_all(galois::iterate((uint32_t)0, numNodes),\n                   [&](uint32_t n) { edgeIndData[n] = prefix_sum[n]; });\n\n    galois::do_all(galois::iterate((uint32_t)0, numNodes), [&](uint32_t n) {\n      if (n == 0) {\n        if (edgeIndData[n] > 0) {\n          std::copy(edges_id[n].begin(), edges_id[n].end(), edgeDst.begin());\n          // std::copy(edges_data[n].begin(), edges_data[n].end(),\n          // edgeData.begin());\n        }\n      } else {\n        if (edgeIndData[n] - edgeIndData[n - 1] > 0) {\n          std::copy(edges_id[n].begin(), edges_id[n].end(),\n                    
edgeDst.begin() + edgeIndData[n - 1]);\n          //   std::copy(edges_data[n].begin(), edges_data[n].end(),\n          //   edgeData.begin() + edgeIndData[n-1]);\n        }\n      }\n    });\n\n    galois::on_each([&](unsigned tid, unsigned total) {\n      std::vector<unsigned>\n          dummy_scale_factor; // dummy passed in to function call\n\n      auto r = divideByNode(0, 1, tid, total).first;\n\n      // galois::gPrint(\"[\", tid, \"] : Ranges : \", *r.first, \", \", *r.second,\n      // \"\\n\");\n      this->setLocalRange(*r.first, *r.second);\n    });\n  }\n  void constructFrom(\n      uint32_t numNodes, uint64_t numEdges, std::vector<uint64_t>& prefix_sum,\n      galois::gstl::Vector<galois::PODResizeableArray<uint32_t>>&\n          edges_id) { //, std::vector<std::vector<EdgeTy>>& edges_data) {\n    allocateFrom(numNodes, numEdges);\n    constructNodes();\n\n    galois::do_all(galois::iterate((uint32_t)0, numNodes),\n                   [&](uint32_t n) { edgeIndData[n] = prefix_sum[n]; });\n\n    galois::do_all(galois::iterate((uint32_t)0, numNodes), [&](uint32_t n) {\n      if (n == 0) {\n        if (edgeIndData[n] > 0) {\n          std::copy(edges_id[n].begin(), edges_id[n].end(), edgeDst.begin());\n          // std::copy(edges_data[n].begin(), edges_data[n].end(),\n          // edgeData.begin());\n        }\n      } else {\n        if (edgeIndData[n] - edgeIndData[n - 1] > 0) {\n          std::copy(edges_id[n].begin(), edges_id[n].end(),\n                    edgeDst.begin() + edgeIndData[n - 1]);\n          //   std::copy(edges_data[n].begin(), edges_data[n].end(),\n          //   edgeData.begin() + edgeIndData[n-1]);\n        }\n      }\n    });\n\n    galois::on_each([&](unsigned tid, unsigned total) {\n      std::vector<unsigned>\n          dummy_scale_factor; // dummy passed in to function call\n\n      auto r = divideByNode(0, 1, tid, total).first;\n\n      // galois::gPrint(\"[\", tid, \"] : Ranges : \", *r.first, \", \", *r.second,\n      // 
\"\\n\");\n      this->setLocalRange(*r.first, *r.second);\n    });\n  }\n  // uint32_t edgeSize(uint32_t n) {\n  //   return edgeDst[n].size();\n  //}\n};\n} // namespace graphs\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/LC_InOut_Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_LC_INOUT_GRAPH_H\n#define GALOIS_GRAPHS_LC_INOUT_GRAPH_H\n\n#include <boost/iterator/iterator_facade.hpp>\n#include <boost/fusion/include/vector.hpp>\n#include <boost/fusion/include/at_c.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/Galois.h\"\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Modify a LC_Graph to have in and out edges. 
In edges are stored by value, so\n * modifying them does not modify the corresponding out edge.\n */\ntemplate <typename GraphTy>\nclass LC_InOut_Graph : public GraphTy::template with_id<true>::type {\npublic:\n  template <typename _node_data>\n  struct with_node_data {\n    typedef LC_InOut_Graph<\n        typename GraphTy::template with_node_data<_node_data>::type>\n        type;\n  };\n\n  template <typename _edge_data>\n  struct with_edge_data {\n    typedef LC_InOut_Graph<\n        typename GraphTy::template with_edge_data<_edge_data>::type>\n        type;\n  };\n\nprivate:\n  template <typename G>\n  friend void readGraphDispatch(G&, read_lc_inout_graph_tag, const std::string&,\n                                const std::string&);\n\n  typedef typename GraphTy ::template with_id<true>::type Super;\n  typedef\n      typename GraphTy ::template with_id<true>::type ::template with_node_data<\n          void>::type ::template with_no_lockable<true>::type InGraph;\n  InGraph inGraph;\n  bool asymmetric;\n\n  typename InGraph::GraphNode inGraphNode(typename Super::GraphNode n) {\n    return inGraph.getNode(idFromNode(n));\n  }\n\n  void createAsymmetric() { asymmetric = true; }\n\npublic:\n  typedef Super out_graph_type;\n  typedef InGraph in_graph_type;\n  typedef typename Super::GraphNode GraphNode;\n  typedef typename Super::file_edge_data_type file_edge_data_type;\n  typedef typename Super::edge_data_type edge_data_type;\n  typedef typename Super::node_data_type node_data_type;\n  typedef typename Super::edge_data_reference edge_data_reference;\n  typedef typename Super::node_data_reference node_data_reference;\n  typedef typename Super::edge_iterator edge_iterator;\n  typedef typename Super::iterator iterator;\n  typedef typename Super::const_iterator const_iterator;\n  typedef typename Super::local_iterator local_iterator;\n  typedef typename Super::const_local_iterator const_local_iterator;\n  typedef read_lc_inout_graph_tag read_tag;\n\n  // Union of 
edge_iterator and InGraph::edge_iterator\n  class in_edge_iterator\n      : public boost::iterator_facade<in_edge_iterator, void*,\n                                      std::random_access_iterator_tag, void*> {\n    friend class boost::iterator_core_access;\n    friend class LC_InOut_Graph;\n    typedef edge_iterator Iterator0;\n    typedef typename InGraph::edge_iterator Iterator1;\n    typedef boost::fusion::vector<Iterator0, Iterator1> Iterators;\n\n    Iterators its;\n    LC_InOut_Graph* self;\n    int type;\n\n    void increment() {\n      if (type == 0)\n        ++boost::fusion::at_c<0>(its);\n      else\n        ++boost::fusion::at_c<1>(its);\n    }\n\n    void advance(unsigned n) {\n      if (type == 0)\n        boost::fusion::at_c<0>(its) += n;\n      else\n        boost::fusion::at_c<1>(its) += n;\n    }\n\n    bool equal(const in_edge_iterator& o) const {\n      if (type != o.type)\n        return false;\n      if (type == 0) {\n        return boost::fusion::at_c<0>(its) == boost::fusion::at_c<0>(o.its);\n      } else {\n        return boost::fusion::at_c<1>(its) == boost::fusion::at_c<1>(o.its);\n      }\n    }\n\n    typename in_edge_iterator::difference_type\n    distance_to(const in_edge_iterator& lhs) const {\n      if (type == 0)\n        return boost::fusion::at_c<0>(lhs.its) - boost::fusion::at_c<0>(its);\n      else\n        return boost::fusion::at_c<1>(lhs.its) - boost::fusion::at_c<1>(its);\n    }\n\n    void* dereference() const { return 0; }\n\n  public:\n    in_edge_iterator() : type(0) {}\n    in_edge_iterator(Iterator0 it) : type(0) {\n      boost::fusion::at_c<0>(its) = it;\n    }\n    in_edge_iterator(Iterator1 it, int) : type(1) {\n      boost::fusion::at_c<1>(its) = it;\n    }\n  };\n\n  LC_InOut_Graph() : asymmetric(false) {}\n\n  edge_data_reference\n  getInEdgeData(in_edge_iterator ni,\n                MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::UNPROTECTED) {\n    // galois::runtime::checkWrite(mflag, false);\n    if (ni.type 
== 0) {\n      return this->getEdgeData(boost::fusion::at_c<0>(ni.its));\n    } else {\n      return inGraph.getEdgeData(boost::fusion::at_c<1>(ni.its));\n    }\n  }\n\n  GraphNode getInEdgeDst(in_edge_iterator ni) {\n    if (ni.type == 0) {\n      return this->getEdgeDst(boost::fusion::at_c<0>(ni.its));\n    } else {\n      return nodeFromId(\n          inGraph.getId(inGraph.getEdgeDst(boost::fusion::at_c<1>(ni.its))));\n    }\n  }\n\n  in_edge_iterator in_edge_begin(GraphNode N,\n                                 MethodFlag mflag = MethodFlag::WRITE) {\n    this->acquireNode(N, mflag);\n    if (!asymmetric) {\n      if (galois::runtime::shouldLock(mflag)) {\n        for (edge_iterator ii = this->raw_begin(N), ei = this->raw_end(N);\n             ii != ei; ++ii) {\n          this->acquireNode(this->getEdgeDst(ii), mflag);\n        }\n      }\n      return in_edge_iterator(this->raw_begin(N));\n    } else {\n      if (galois::runtime::shouldLock(mflag)) {\n        for (typename InGraph::edge_iterator\n                 ii = inGraph.raw_begin(inGraphNode(N)),\n                 ei = inGraph.raw_end(inGraphNode(N));\n             ii != ei; ++ii) {\n          this->acquireNode(nodeFromId(inGraph.getId(inGraph.getEdgeDst(ii))),\n                            mflag);\n        }\n      }\n      return in_edge_iterator(inGraph.raw_begin(inGraphNode(N)), 0);\n    }\n  }\n\n  in_edge_iterator in_edge_end(GraphNode N,\n                               MethodFlag mflag = MethodFlag::WRITE) {\n    this->acquireNode(N, mflag);\n    if (!asymmetric) {\n      return in_edge_iterator(this->raw_end(N));\n    } else {\n      return in_edge_iterator(inGraph.raw_end(inGraphNode(N)), 0);\n    }\n  }\n\n  internal::InEdgesIterator<LC_InOut_Graph>\n  in_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::InEdgesIterator<LC_InOut_Graph>(*this, N, mflag);\n  }\n\n  /**\n   * Sorts incoming edges of a node. 
Comparison function is over\n   * Graph::edge_data_type.\n   */\n  template <typename CompTy>\n  void sortInEdgesByEdgeData(\n      GraphNode N,\n      const CompTy& comp = std::less<typename GraphTy::edge_data_type>(),\n      MethodFlag mflag   = MethodFlag::WRITE) {\n    this->acquireNode(N, mflag);\n    if (!asymmetric) {\n      std::sort(this->edge_sort_begin(N), this->edge_sort_end(N),\n                internal::EdgeSortCompWrapper<\n                    EdgeSortValue<GraphNode, typename GraphTy::edge_data_type>,\n                    CompTy>(comp));\n    } else {\n      std::sort(inGraph.edge_sort_begin(inGraphNode(N)),\n                inGraph.edge_sort_end(inGraphNode(N)),\n                internal::EdgeSortCompWrapper<\n                    EdgeSortValue<GraphNode, typename GraphTy::edge_data_type>,\n                    CompTy>(comp));\n    }\n  }\n\n  /**\n   * Sorts incoming edges of a node. Comparison function is over\n   * <code>EdgeSortValue<GraphTy::edge_data_type></code>.\n   */\n  template <typename CompTy>\n  void sortInEdges(GraphNode N, const CompTy& comp,\n                   MethodFlag mflag = MethodFlag::WRITE) {\n    this->acquireNode(N, mflag);\n    if (!asymmetric) {\n      std::sort(this->edge_sort_begin(N), this->edge_sort_end(N), comp);\n    } else {\n      std::sort(inGraph.edge_sort_begin(inGraphNode(N)),\n                inGraph.edge_sort_end(inGraphNode(N)), comp);\n    }\n  }\n\n  /**\n   * Sorts incoming edges of a node. 
Comparison is by getInEdgeDst(e).\n   */\n  void sortInEdgesByDst(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    this->acquireNode(N, mflag);\n    if (!asymmetric) {\n      typedef EdgeSortValue<GraphNode, edge_data_type> EdgeSortVal;\n      std::sort(this->edge_sort_begin(N), this->edge_sort_end(N),\n                [=](const EdgeSortVal& e1, const EdgeSortVal& e2) {\n                  return e1.dst < e2.dst;\n                });\n    } else {\n      typedef EdgeSortValue<typename InGraph::GraphNode,\n                            typename InGraph::edge_data_type>\n          InEdgeSortVal;\n      std::sort(inGraph.edge_sort_begin(inGraphNode(N)),\n                inGraph.edge_sort_end(inGraphNode(N)),\n                [=](const InEdgeSortVal& e1, const InEdgeSortVal& e2) {\n                  return e1.dst < e2.dst;\n                });\n    }\n  }\n\n  /**\n   * Sorts incoming edges of all nodes. Comparison is by getInEdgeDst(e).\n   */\n  void sortAllInEdgesByDst(MethodFlag mflag = MethodFlag::WRITE) {\n    galois::do_all(\n        galois::iterate(*this),\n        [=](GraphNode N) { this->sortInEdgesByDst(N, mflag); },\n        galois::steal());\n  }\n\n  size_t idFromNode(GraphNode N) { return this->getId(N); }\n\n  GraphNode nodeFromId(size_t N) { return this->getNode(N); }\n};\n\n} // namespace graphs\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/LC_InlineEdge_Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_LC_INLINEEDGE_GRAPH_H\n#define GALOIS_GRAPHS_LC_INLINEEDGE_GRAPH_H\n\n#include <type_traits>\n\n#include \"galois/config.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/graphs/Details.h\"\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Local computation graph (i.e., graph structure does not change). 
The data\n * representation is a modification of {@link LC_CSR_Graph} where the edge data\n * is stored inline with the adjacency information.\n *\n * The position of template parameters may change between Galois releases; the\n * most robust way to specify them is through the with_XXX nested templates.\n */\ntemplate <typename NodeTy, typename EdgeTy, bool HasNoLockable = false,\n          bool UseNumaAlloc = false, bool HasOutOfLineLockable = false,\n          bool HasCompressedNodePtr = false, typename FileEdgeTy = EdgeTy>\nclass LC_InlineEdge_Graph\n    : private boost::noncopyable,\n      private internal::LocalIteratorFeature<UseNumaAlloc>,\n      private internal::OutOfLineLockableFeature<HasOutOfLineLockable &&\n                                                 !HasNoLockable> {\n  template <typename Graph>\n  friend class LC_InOut_Graph;\n\npublic:\n  template <bool _has_id>\n  struct with_id {\n    typedef LC_InlineEdge_Graph type;\n  };\n\n  template <typename _node_data>\n  struct with_node_data {\n    typedef LC_InlineEdge_Graph<_node_data, EdgeTy, HasNoLockable, UseNumaAlloc,\n                                HasOutOfLineLockable, HasCompressedNodePtr,\n                                FileEdgeTy>\n        type;\n  };\n\n  template <typename _edge_data>\n  struct with_edge_data {\n    typedef LC_InlineEdge_Graph<NodeTy, _edge_data, HasNoLockable, UseNumaAlloc,\n                                HasOutOfLineLockable, HasCompressedNodePtr,\n                                FileEdgeTy>\n        type;\n  };\n\n  template <typename _file_edge_data>\n  struct with_file_edge_data {\n    typedef LC_InlineEdge_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                                HasOutOfLineLockable, HasCompressedNodePtr,\n                                _file_edge_data>\n        type;\n  };\n\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    typedef LC_InlineEdge_Graph<NodeTy, EdgeTy, _has_no_lockable, UseNumaAlloc,\n               
                 HasOutOfLineLockable, HasCompressedNodePtr,\n                                FileEdgeTy>\n        type;\n  };\n\n  template <bool _use_numa_alloc>\n  struct with_numa_alloc {\n    typedef LC_InlineEdge_Graph<NodeTy, EdgeTy, HasNoLockable, _use_numa_alloc,\n                                HasOutOfLineLockable, HasCompressedNodePtr,\n                                FileEdgeTy>\n        type;\n  };\n\n  template <bool _has_out_of_line_lockable>\n  struct with_out_of_line_lockable {\n    typedef LC_InlineEdge_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                                _has_out_of_line_lockable, HasCompressedNodePtr,\n                                FileEdgeTy>\n        type;\n  };\n\n  /**\n   * Compress representation of graph at the expense of one level of indirection\n   * on accessing neighbors of a node\n   */\n  template <bool _has_compressed_node_ptr>\n  struct with_compressed_node_ptr {\n    typedef LC_InlineEdge_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                                HasOutOfLineLockable, _has_compressed_node_ptr,\n                                FileEdgeTy>\n        type;\n  };\n\n  typedef read_default_graph_tag read_tag;\n\nprotected:\n  class NodeInfo;\n  typedef internal::EdgeInfoBase<\n      typename std::conditional<HasCompressedNodePtr, uint32_t,\n                                NodeInfo*>::type,\n      EdgeTy>\n      EdgeInfo;\n  typedef LargeArray<EdgeInfo> EdgeData;\n  typedef LargeArray<NodeInfo> NodeData;\n  typedef internal::NodeInfoBaseTypes<NodeTy,\n                                      !HasNoLockable && !HasOutOfLineLockable>\n      NodeInfoTypes;\n\n  class NodeInfo\n      : public internal::NodeInfoBase<NodeTy,\n                                      !HasNoLockable && !HasOutOfLineLockable> {\n    EdgeInfo* m_edgeBegin;\n    EdgeInfo* m_edgeEnd;\n\n  public:\n    EdgeInfo*& edgeBegin() { return m_edgeBegin; }\n    EdgeInfo*& edgeEnd() { return m_edgeEnd; }\n  };\n\npublic:\n 
 typedef NodeInfo* GraphNode;\n  typedef EdgeTy edge_data_type;\n  typedef FileEdgeTy file_edge_data_type;\n  typedef NodeTy node_data_type;\n  typedef typename EdgeInfo::reference edge_data_reference;\n  typedef typename NodeInfoTypes::reference node_data_reference;\n  typedef EdgeInfo* edge_iterator;\n  typedef galois::NoDerefIterator<NodeInfo*> iterator;\n  typedef galois::NoDerefIterator<const NodeInfo*> const_iterator;\n  typedef iterator local_iterator;\n  typedef const_iterator const_local_iterator;\n\nprotected:\n  NodeData nodeData;\n  EdgeData edgeData;\n  uint64_t numNodes;\n  uint64_t numEdges;\n\n  template <bool C_b = HasCompressedNodePtr>\n  NodeInfo* getDst(edge_iterator ii,\n                   typename std::enable_if<C_b>::type* = 0) const {\n    return const_cast<NodeInfo*>(&nodeData[ii->dst]);\n  }\n\n  template <bool C_b = HasCompressedNodePtr>\n  NodeInfo* getDst(edge_iterator ii,\n                   typename std::enable_if<!C_b>::type* = 0) const {\n    return ii->dst;\n  }\n\n  template <typename Container, typename Index, bool C_b = HasCompressedNodePtr>\n  void setEdgeDst(Container&, edge_iterator edge, Index idx,\n                  typename std::enable_if<C_b>::type* = 0) {\n    edge->dst = idx;\n  }\n\n  template <typename Container, typename Index, bool C_b = HasCompressedNodePtr>\n  void setEdgeDst(Container& c, edge_iterator edge, Index idx,\n                  typename std::enable_if<!C_b>::type* = 0) {\n    edge->dst = &c[idx];\n  }\n\n  template <bool _A1 = HasNoLockable, bool _A2 = HasOutOfLineLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<!_A1 && !_A2>::type* = 0) {\n    galois::runtime::acquire(N, mflag);\n  }\n\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    this->outOfLineAcquire(getId(N), mflag);\n  }\n\n  template <bool 
_A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode, MethodFlag,\n                   typename std::enable_if<_A2>::type* = 0) {}\n\n  edge_iterator raw_begin(GraphNode N) {\n    return nodeData[getId(N)].edgeBegin();\n  }\n\n  edge_iterator raw_end(GraphNode N) { return nodeData[getId(N)].edgeEnd(); }\n\n  template <bool _A1 = EdgeInfo::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph& graph,\n                          typename FileGraph::edge_iterator nn, EdgeInfo* edge,\n                          typename std::enable_if<!_A1 || _A2>::type* = 0) {\n    typedef LargeArray<FileEdgeTy> FED;\n    if (EdgeInfo::has_value)\n      edge->construct(graph.getEdgeData<typename FED::value_type>(nn));\n  }\n\n  template <bool _A1 = EdgeInfo::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator,\n                          EdgeInfo* edge,\n                          typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    edge->construct();\n  }\n\n  size_t getId(GraphNode N) { return std::distance(this->nodeData.data(), N); }\n\n  GraphNode getNode(size_t n) { return &nodeData[n]; }\n\npublic:\n  ~LC_InlineEdge_Graph() {\n    if (!EdgeInfo::has_value)\n      return;\n    if (numNodes == 0)\n      return;\n\n    for (edge_iterator ii = nodeData[0].edgeBegin(),\n                       ei = nodeData[numNodes - 1].edgeEnd();\n         ii != ei; ++ii) {\n      ii->destroy();\n    }\n  }\n\n  node_data_reference getData(GraphNode N,\n                              MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, false);\n    acquireNode(N, mflag);\n    return N->getData();\n  }\n\n  edge_data_reference\n  getEdgeData(edge_iterator ni,\n              MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::UNPROTECTED) const {\n    // galois::runtime::checkWrite(mflag, false);\n    return 
ni->get();\n  }\n\n  GraphNode getEdgeDst(edge_iterator ni) const { return getDst(ni); }\n\n  size_t size() const { return numNodes; }\n  size_t sizeEdges() const { return numEdges; }\n\n  const_iterator begin() const { return const_iterator(nodeData.begin()); }\n  const_iterator end() const { return const_iterator(nodeData.end()); }\n  iterator begin() { return iterator(nodeData.data()); }\n  iterator end() { return iterator(nodeData.end()); }\n\n  local_iterator local_begin() {\n    return local_iterator(&nodeData[this->localBegin(numNodes)]);\n  }\n  local_iterator local_end() {\n    return local_iterator(&nodeData[this->localEnd(numNodes)]);\n  }\n  const_local_iterator local_begin() const {\n    return const_local_iterator(&nodeData[this->localBegin(numNodes)]);\n  }\n  const_local_iterator local_end() const {\n    return const_local_iterator(&nodeData[this->localEnd(numNodes)]);\n  }\n\n  edge_iterator edge_begin(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    if (galois::runtime::shouldLock(mflag)) {\n      for (edge_iterator ii = N->edgeBegin(), ee = N->edgeEnd(); ii != ee;\n           ++ii) {\n        acquireNode(getDst(ii), mflag);\n      }\n    }\n    return N->edgeBegin();\n  }\n\n  edge_iterator edge_end(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    return N->edgeEnd();\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::make_no_deref_range(edge_begin(N, mflag),\n                                         edge_end(N, mflag));\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return edges(N, mflag);\n  }\n\n#if 0\n  /**\n   * Sorts outgoing edges of a node. 
Comparison function is over EdgeTy.\n   */\n  template<typename CompTy>\n  void sortEdgesByEdgeData(GraphNode N, const CompTy& comp = std::less<EdgeTy>(), MethodFlag mflag = MethodFlag::WRITE) {\n    galois::runtime::acquire(N, mflag);\n    std::sort(edge_sort_begin(N), edge_sort_end(N), EdgeSortCompWrapper<EdgeSortValue<GraphNode,EdgeTy>,CompTy>(comp));\n  }\n\n  /**\n   * Sorts outgoing edges of a node. Comparison function is over <code>EdgeSortValue<EdgeTy></code>.\n   */\n  template<typename CompTy>\n  void sortEdges(GraphNode N, const CompTy& comp, MethodFlag mflag = MethodFlag::WRITE) {\n    galois::runtime::acquire(N, mflag);\n    std::sort(edge_sort_begin(N), edge_sort_end(N), comp);\n  }\n#endif\n\n  void allocateFrom(FileGraph& graph) {\n    numNodes = graph.size();\n    numEdges = graph.sizeEdges();\n\n    if (UseNumaAlloc) {\n      nodeData.allocateBlocked(numNodes);\n      edgeData.allocateBlocked(numEdges);\n      this->outOfLineAllocateBlocked(numNodes);\n    } else {\n      nodeData.allocateInterleaved(numNodes);\n      edgeData.allocateInterleaved(numEdges);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n  }\n\n  void constructFrom(FileGraph& graph, unsigned tid, unsigned total) {\n    auto r =\n        graph\n            .divideByNode(NodeData::size_of::value +\n                              LC_InlineEdge_Graph::size_of_out_of_line::value,\n                          EdgeData::size_of::value, tid, total)\n            .first;\n\n    EdgeInfo* curEdge = edgeData.data() + *graph.edge_begin(*r.first);\n\n    this->setLocalRange(*r.first, *r.second);\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      nodeData.constructAt(*ii);\n      this->outOfLineConstructAt(*ii);\n      nodeData[*ii].edgeBegin() = curEdge;\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        constructEdgeValue(graph, nn, 
curEdge);\n        setEdgeDst(nodeData, curEdge, graph.getEdgeDst(nn));\n        ++curEdge;\n      }\n      nodeData[*ii].edgeEnd() = curEdge;\n    }\n  }\n};\n\n} // namespace graphs\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/LC_Linear_Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_LC_LINEAR_GRAPH_H\n#define GALOIS_GRAPHS_LC_LINEAR_GRAPH_H\n\n#include <type_traits>\n\n#include <boost/mpl/if.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/graphs/Details.h\"\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Local computation graph (i.e., graph structure does not change). 
The data\n * representation is a modification of {@link LC_CSR_Graph} where the edge data\n * and node data is stored inline with the adjacency information.\n *\n * The position of template parameters may change between Galois releases; the\n * most robust way to specify them is through the with_XXX nested templates.\n */\ntemplate <typename NodeTy, typename EdgeTy, bool HasNoLockable = false,\n          bool UseNumaAlloc = false, bool HasOutOfLineLockable = false,\n          bool HasId = false, typename FileEdgeTy = EdgeTy>\nclass LC_Linear_Graph\n    : private boost::noncopyable,\n      private internal::LocalIteratorFeature<UseNumaAlloc>,\n      private internal::OutOfLineLockableFeature<HasOutOfLineLockable &&\n                                                 !HasNoLockable> {\n  template <typename Graph>\n  friend class LC_InOut_Graph;\n\npublic:\n  template <bool _has_id>\n  struct with_id {\n    typedef LC_Linear_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                            HasOutOfLineLockable, _has_id, FileEdgeTy>\n        type;\n  };\n\n  template <typename _node_data>\n  struct with_node_data {\n    typedef LC_Linear_Graph<_node_data, EdgeTy, HasNoLockable, UseNumaAlloc,\n                            HasOutOfLineLockable, HasId, FileEdgeTy>\n        type;\n  };\n\n  template <typename _edge_data>\n  struct with_edge_data {\n    typedef LC_Linear_Graph<NodeTy, _edge_data, HasNoLockable, UseNumaAlloc,\n                            HasOutOfLineLockable, HasId, FileEdgeTy>\n        type;\n  };\n\n  template <typename _file_edge_data>\n  struct with_file_edge_data {\n    typedef LC_Linear_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                            HasOutOfLineLockable, HasId, _file_edge_data>\n        type;\n  };\n\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    typedef LC_Linear_Graph<NodeTy, EdgeTy, _has_no_lockable, UseNumaAlloc,\n                            HasOutOfLineLockable, HasId, 
FileEdgeTy>\n        type;\n  };\n\n  template <bool _use_numa_alloc>\n  struct with_numa_alloc {\n    typedef LC_Linear_Graph<NodeTy, EdgeTy, HasNoLockable, _use_numa_alloc,\n                            HasOutOfLineLockable, HasId, FileEdgeTy>\n        type;\n  };\n\n  template <bool _has_out_of_line_lockable>\n  struct with_out_of_line_lockable {\n    typedef LC_Linear_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                            _has_out_of_line_lockable,\n                            _has_out_of_line_lockable || HasId, FileEdgeTy>\n        type;\n  };\n\n  typedef read_with_aux_graph_tag read_tag;\n\nprotected:\n  class NodeInfo;\n  typedef internal::EdgeInfoBase<NodeInfo*, EdgeTy> EdgeInfo;\n  typedef LargeArray<NodeInfo*> Nodes;\n  typedef internal::NodeInfoBaseTypes<NodeTy,\n                                      !HasNoLockable && !HasOutOfLineLockable>\n      NodeInfoTypes;\n\n  class NodeInfo\n      : public internal::NodeInfoBase<NodeTy,\n                                      !HasNoLockable && !HasOutOfLineLockable>,\n        public internal::IntrusiveId<\n            typename boost::mpl::if_c<HasId, uint32_t, void>::type> {\n    friend class LC_Linear_Graph;\n    int numEdges;\n\n    EdgeInfo* edgeBegin() {\n      NodeInfo* n = this;\n      ++n; // start of edges\n      return reinterpret_cast<EdgeInfo*>(n);\n    }\n\n    EdgeInfo* edgeEnd() {\n      EdgeInfo* ei = edgeBegin();\n      ei += numEdges;\n      return ei;\n    }\n\n    NodeInfo* next() {\n      NodeInfo* ni = this;\n      EdgeInfo* ei = edgeEnd();\n      while (reinterpret_cast<char*>(ni) < reinterpret_cast<char*>(ei))\n        ++ni;\n      return ni;\n    }\n  };\n\npublic:\n  typedef NodeInfo* GraphNode;\n  typedef EdgeTy edge_data_type;\n  typedef FileEdgeTy file_edge_data_type;\n  typedef NodeTy node_data_type;\n  typedef typename NodeInfoTypes::reference node_data_reference;\n  typedef typename EdgeInfo::reference edge_data_reference;\n  typedef EdgeInfo* 
edge_iterator;\n  typedef NodeInfo** iterator;\n  typedef NodeInfo* const* const_iterator;\n  typedef iterator local_iterator;\n  typedef const_iterator const_local_iterator;\n  typedef int ReadGraphAuxData;\n\nprotected:\n  LargeArray<char> data;\n  uint64_t numNodes;\n  uint64_t numEdges;\n  Nodes nodes;\n\n  template <bool _A1 = HasNoLockable, bool _A2 = HasOutOfLineLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<!_A1 && !_A2>::type* = 0) {\n    galois::runtime::acquire(N, mflag);\n  }\n\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    this->outOfLineAcquire(getId(N), mflag);\n  }\n\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode, MethodFlag,\n                   typename std::enable_if<_A2>::type* = 0) {}\n\n  edge_iterator raw_begin(GraphNode N) { return N->edgeBegin(); }\n\n  edge_iterator raw_end(GraphNode N) { return N->edgeEnd(); }\n\n  template <bool _A1 = EdgeInfo::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph& graph,\n                          typename FileGraph::edge_iterator nn, EdgeInfo* edge,\n                          typename std::enable_if<!_A1 || _A2>::type* = 0) {\n    typedef LargeArray<FileEdgeTy> FED;\n    if (EdgeInfo::has_value)\n      edge->construct(graph.getEdgeData<typename FED::value_type>(nn));\n  }\n\n  template <bool _A1 = EdgeInfo::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator,\n                          EdgeInfo* edge,\n                          typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    edge->construct();\n  }\n\n  template <bool _Enable = HasId>\n  size_t getId(GraphNode N, typename 
std::enable_if<_Enable>::type* = 0) {\n    return N->getId();\n  }\n\n  template <bool _Enable = HasId>\n  GraphNode getNode(size_t n, typename std::enable_if<_Enable>::type* = 0) {\n    return nodes[n];\n  }\n\npublic:\n  ~LC_Linear_Graph() {\n    for (typename Nodes::iterator ii = nodes.begin(), ei = nodes.end();\n         ii != ei; ++ii) {\n      NodeInfo* n         = *ii;\n      EdgeInfo* edgeBegin = n->edgeBegin();\n      EdgeInfo* edgeEnd   = n->edgeEnd();\n\n      if (EdgeInfo::has_value) {\n        while (edgeBegin != edgeEnd) {\n          edgeBegin->destroy();\n          ++edgeBegin;\n        }\n      }\n      n->~NodeInfo();\n    }\n  }\n\n  node_data_reference getData(GraphNode N,\n                              MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, false);\n    acquireNode(N, mflag);\n    return N->getData();\n  }\n\n  edge_data_reference\n  getEdgeData(edge_iterator ni,\n              MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::UNPROTECTED) const {\n    // galois::runtime::checkWrite(mflag, false);\n    return ni->get();\n  }\n\n  GraphNode getEdgeDst(edge_iterator ni) const { return ni->dst; }\n\n  size_t size() const { return numNodes; }\n  size_t sizeEdges() const { return numEdges; }\n  iterator begin() { return &nodes[0]; }\n  iterator end() { return &nodes[numNodes]; }\n  const_iterator begin() const { return &nodes[0]; }\n  const_iterator end() const { return &nodes[numNodes]; }\n\n  local_iterator local_begin() { return &nodes[this->localBegin(numNodes)]; }\n  local_iterator local_end() { return &nodes[this->localEnd(numNodes)]; }\n  const_local_iterator local_begin() const {\n    return &nodes[this->localBegin(numNodes)];\n  }\n  const_local_iterator local_end() const {\n    return &nodes[this->localEnd(numNodes)];\n  }\n\n  edge_iterator edge_begin(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    if (galois::runtime::shouldLock(mflag)) {\n      for 
(edge_iterator ii = N->edgeBegin(), ee = N->edgeEnd(); ii != ee;\n           ++ii) {\n        acquireNode(ii->dst, mflag);\n      }\n    }\n    return N->edgeBegin();\n  }\n\n  edge_iterator edge_end(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    return N->edgeEnd();\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::make_no_deref_range(edge_begin(N, mflag),\n                                         edge_end(N, mflag));\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return edges(N, mflag);\n  }\n\n  /**\n   * Sorts outgoing edges of a node. Comparison function is over EdgeTy.\n   */\n  template <typename CompTy>\n  void sortEdgesByEdgeData(GraphNode N,\n                           const CompTy& comp = std::less<EdgeTy>(),\n                           MethodFlag mflag   = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    std::sort(N->edgeBegin(), N->edgeEnd(),\n              internal::EdgeSortCompWrapper<EdgeInfo, CompTy>(comp));\n  }\n\n  /**\n   * Sorts outgoing edges of a node. 
Comparison function is over\n   * <code>EdgeSortValue<EdgeTy></code>.\n   */\n  template <typename CompTy>\n  void sortEdges(GraphNode N, const CompTy& comp,\n                 MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    std::sort(N->edgeBegin(), N->edgeEnd(), comp);\n  }\n\n  void allocateFrom(FileGraph& graph, const ReadGraphAuxData&) {\n    numNodes = graph.size();\n    numEdges = graph.sizeEdges();\n    if (UseNumaAlloc) {\n      data.allocateLocal(sizeof(NodeInfo) * numNodes * 2 +\n                         sizeof(EdgeInfo) * numEdges);\n      nodes.allocateLocal(numNodes);\n      this->outOfLineAllocateLocal(numNodes);\n    } else {\n      data.allocateInterleaved(sizeof(NodeInfo) * numNodes * 2 +\n                               sizeof(EdgeInfo) * numEdges);\n      nodes.allocateInterleaved(numNodes);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n  }\n\n  void constructNodesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                          const ReadGraphAuxData&) {\n    auto r = graph\n                 .divideByNode(Nodes::size_of::value + 2 * sizeof(NodeInfo) +\n                                   LC_Linear_Graph::size_of_out_of_line::value,\n                               sizeof(EdgeInfo), tid, total)\n                 .first;\n\n    this->setLocalRange(*r.first, *r.second);\n    NodeInfo* curNode = reinterpret_cast<NodeInfo*>(data.data());\n\n    size_t id    = *r.first;\n    size_t edges = *graph.edge_begin(*r.first);\n    size_t bytes = edges * sizeof(EdgeInfo) + 2 * (id + 1) * sizeof(NodeInfo);\n    curNode += bytes / sizeof(NodeInfo);\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei;\n         ++ii, ++id) {\n      nodes.constructAt(*ii);\n      new (curNode) NodeInfo();\n      // curNode->construct();\n      curNode->setId(id);\n      curNode->numEdges =\n          std::distance(graph.edge_begin(*ii), graph.edge_end(*ii));\n      nodes[*ii] = curNode;\n      curNode    = 
curNode->next();\n    }\n  }\n\n  void constructEdgesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                          const ReadGraphAuxData&) {\n    auto r = graph\n                 .divideByNode(Nodes::size_of::value + 2 * sizeof(NodeInfo) +\n                                   LC_Linear_Graph::size_of_out_of_line::value,\n                               sizeof(EdgeInfo), tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      EdgeInfo* edge = nodes[*ii]->edgeBegin();\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        constructEdgeValue(graph, nn, edge);\n        edge->dst = nodes[graph.getEdgeDst(nn)];\n        ++edge;\n      }\n    }\n  }\n};\n\n} // namespace graphs\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/LC_Morph_Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file LC_Morph_Graph.h\n *\n * Contains the LC_Morph_Graph class.\n */\n\n#ifndef GALOIS_GRAPHS_LC_MORPH_GRAPH_H\n#define GALOIS_GRAPHS_LC_MORPH_GRAPH_H\n\n#include <type_traits>\n\n#include <boost/mpl/if.hpp>\n\n#include \"galois/Bag.h\"\n#include \"galois/config.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/graphs/Details.h\"\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Local computation graph that allows addition of nodes (but not removals)\n * if the maximum degree of a node is known at the time it is added.\n */\ntemplate <typename NodeTy, typename EdgeTy, bool HasNoLockable = false,\n          bool UseNumaAlloc = false, bool HasOutOfLineLockable = false,\n          bool HasId = false, typename FileEdgeTy = EdgeTy>\nclass LC_Morph_Graph\n    : 
private boost::noncopyable,\n      private internal::OutOfLineLockableFeature<HasOutOfLineLockable &&\n                                                 !HasNoLockable> {\n  //! Friend of LC_InOut_Graph (makes a graph have both in and out edges)\n  template <typename Graph>\n  friend class LC_InOut_Graph;\n\npublic:\n  /**\n   * Struct that allows activation of the HasId template parameter\n   * Example: using Graph = LC_Morph_Graph::with_id<true> defines\n   * LC_Morph_Graph with HasId = true\n   */\n  template <bool _has_id>\n  struct with_id {\n    using type = LC_Morph_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                                HasOutOfLineLockable, _has_id, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the type of node data through the\n   * template parameter. See with_id doxygen for example.\n   */\n  template <typename _node_data>\n  struct with_node_data {\n    using type = LC_Morph_Graph<_node_data, EdgeTy, HasNoLockable, UseNumaAlloc,\n                                HasOutOfLineLockable, HasId, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the type of edge data through the\n   * template parameter. See with_id doxygen for example.\n   */\n  template <typename _edge_data>\n  struct with_edge_data {\n    using type = LC_Morph_Graph<NodeTy, _edge_data, HasNoLockable, UseNumaAlloc,\n                                HasOutOfLineLockable, HasId, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the type of file edge data through the\n   * template parameter. 
See with_id doxygen for example.\n   */\n  template <typename _file_edge_data>\n  struct with_file_edge_data {\n    using type = LC_Morph_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                                HasOutOfLineLockable, HasId, _file_edge_data>;\n  };\n\n  /**\n   * Struct used to define the HasNoLockable template parameter.\n   * See with_id doxygen for example.\n   */\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    using type = LC_Morph_Graph<NodeTy, EdgeTy, _has_no_lockable, UseNumaAlloc,\n                                HasOutOfLineLockable, HasId, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the UseNumaAlloc template parameter.\n   * See with_id doxygen for example.\n   */\n  template <bool _use_numa_alloc>\n  struct with_numa_alloc {\n    using type = LC_Morph_Graph<NodeTy, EdgeTy, HasNoLockable, _use_numa_alloc,\n                                HasOutOfLineLockable, HasId, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the HasOutOfLineLockable template parameter.\n   * See with_id doxygen for example.\n   */\n  template <bool _has_out_of_line_lockable>\n  struct with_out_of_line_lockable {\n    using type = LC_Morph_Graph<NodeTy, EdgeTy, HasNoLockable, UseNumaAlloc,\n                                _has_out_of_line_lockable,\n                                _has_out_of_line_lockable || HasId, FileEdgeTy>;\n  };\n\n  //! type that tells graph reader how to read a file for this graph\n  using read_tag = read_with_aux_graph_tag;\n\nprotected:\n  // Forward declaration of class (defined below)\n  class NodeInfo;\n\n  //! EdgeInfo keeps destination of edges\n  using EdgeInfo = internal::EdgeInfoBase<NodeInfo*, EdgeTy>;\n  //! Nodes are stored in an insert bag\n  using Nodes = galois::InsertBag<NodeInfo>;\n  //! Type of nodes\n  using NodeInfoTypes =\n      internal::NodeInfoBaseTypes<NodeTy,\n                                  !HasNoLockable && !HasOutOfLineLockable>;\n\n  //! 
Linked list structure holding together blocks of memory that stores\n  //! edges.\n  struct EdgeHolder {\n    //! Beginning of memory for this block.\n    EdgeInfo* begin;\n    //! End of memory for this block.\n    EdgeInfo* end;\n    //! Pointer to another block of memory for edges (if it exists).\n    EdgeHolder* next;\n  };\n\n  /**\n   * Class that stores node info (e.g. where its edges begin and end, its data,\n   * etc.).\n   */\n  class NodeInfo\n      : public internal::NodeInfoBase<NodeTy,\n                                      !HasNoLockable && !HasOutOfLineLockable> {\n    using Super =\n        internal::NodeInfoBase<NodeTy, !HasNoLockable && !HasOutOfLineLockable>;\n    friend class LC_Morph_Graph;\n\n    EdgeInfo* edgeBegin;\n    EdgeInfo* edgeEnd;\n#ifndef NDEBUG\n    EdgeInfo* trueEdgeEnd;\n#endif\n\n  public:\n    //! Calls NodeInfoBase constructor\n    template <typename... Args>\n    NodeInfo(Args&&... args) : Super(std::forward<Args>(args)...) {}\n  }; // end NodeInfo\n\n  //! Functor that returns pointers to NodeInfo objects given references\n  struct makeGraphNode {\n    //! Returns a pointer to the NodeInfo reference passed into this functor\n    NodeInfo* operator()(NodeInfo& data) const { return &data; }\n  };\n\n  /**\n   * Functor: contains an operator to compare the destination of an edge with\n   * a particular node.\n   */\n  struct dst_equals {\n    //! Destination to compare with\n    NodeInfo* dst;\n    //! Constructor: takes a node to compare edge destinations with\n    dst_equals(NodeInfo* d) : dst(d) {}\n    //! Given an edge, check if the edge destination matches the node that\n    //! this functor was constructed with\n    bool operator()(const EdgeInfo& edge) { return edge.dst == dst; }\n  };\n\npublic:\n  //! A graph node is a NodeInfo object.\n  using GraphNode = NodeInfo*;\n  //! Type of edge data in file\n  using file_edge_data_type = FileEdgeTy;\n  //! Type of edge data\n  using edge_data_type = EdgeTy;\n  //! 
Type of node data\n  using node_data_type = NodeTy;\n  //! Reference type to node data\n  using node_data_reference = typename NodeInfoTypes::reference;\n  //! Reference type to edge data\n  using edge_data_reference = typename EdgeInfo::reference;\n  //! Iterator over EdgeInfo objects (edges)\n  using edge_iterator = EdgeInfo*;\n  //! Iterator over nodes\n  using iterator =\n      boost::transform_iterator<makeGraphNode, typename Nodes::iterator>;\n  //! Constant iterator over nodes\n  using const_iterator =\n      boost::transform_iterator<makeGraphNode, typename Nodes::const_iterator>;\n  //! Local iterator is just an iterator\n  using local_iterator = iterator;\n  //! Const local iterator is just a const_iterator\n  using const_local_iterator = const_iterator;\n  //! @todo doxygen this\n  using ReadGraphAuxData = LargeArray<GraphNode>;\n\nprotected:\n  //! Nodes in this graph\n  Nodes nodes;\n  //! Memory for edges in this graph (memory held in EdgeHolders)\n  galois::substrate::PerThreadStorage<EdgeHolder*> edgesL;\n\n  /**\n   * Acquire a node for the scope in which the function is called.\n   *\n   * @param N node to acquire\n   * @param mflag Method flag specifying type of acquire (e.g. read, write,\n   * etc.)\n   */\n  template <bool _A1 = HasNoLockable, bool _A2 = HasOutOfLineLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<!_A1 && !_A2>::type* = 0) {\n    galois::runtime::acquire(N, mflag);\n  }\n\n  /**\n   * Acquire a node for the scope in which the function is called. The\n   * lock is out of line (not local to the node).\n   *\n   * @param N node to acquire\n   * @param mflag Method flag specifying type of acquire (e.g. 
read, write,\n   * etc.)\n   */\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    this->outOfLineAcquire(getId(N), mflag);\n  }\n\n  /**\n   * Given a FileGraph and an edge in it, add it to the LCMorphGraph.\n   * Can handle edge weights.\n   */\n  template <bool _A1 = EdgeInfo::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph& graph,\n                          typename FileGraph::edge_iterator nn, GraphNode src,\n                          GraphNode dst,\n                          typename std::enable_if<!_A1 || _A2>::type* = 0) {\n    if (EdgeInfo::has_value) {\n      // type of edge data in file graph\n      using FEDV = typename LargeArray<FileEdgeTy>::value_type;\n      // add an edge with edge data\n      addMultiEdge(src, dst, galois::MethodFlag::UNPROTECTED,\n                   graph.getEdgeData<FEDV>(nn));\n    } else {\n      // add an edge without edge data\n      addMultiEdge(src, dst, galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  /**\n   * Given a FileGraph and an edge in it, add it to the LCMorphGraph.\n   * Does not handle edge weights.\n   */\n  template <bool _A1 = EdgeInfo::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator,\n                          GraphNode src, GraphNode dst,\n                          typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    addMultiEdge(src, dst, galois::MethodFlag::UNPROTECTED);\n  }\n\n  /**\n   * No-op acquire node when HasNoLockable is true.\n   */\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode, MethodFlag,\n                   typename std::enable_if<_A2>::type* = 0) {}\n\n  /**\n   * Get the ID of a graph node if they're enabled in the class.\n   */\n  
template <bool _Enable = HasId>\n  size_t getId(GraphNode N, typename std::enable_if<_Enable>::type* = 0) {\n    return N->getId();\n  }\n\npublic:\n  /**\n   * Destructor. If edges have some value, destroy all of it (i.e. free up\n   * memory).\n   */\n  ~LC_Morph_Graph() {\n    for (typename Nodes::iterator ii = nodes.begin(), ei = nodes.end();\n         ii != ei; ++ii) {\n      NodeInfo& n         = *ii;\n      EdgeInfo* edgeBegin = n.edgeBegin;\n      EdgeInfo* edgeEnd   = n.edgeEnd;\n\n      if (EdgeInfo::has_value) {\n        while (edgeBegin != edgeEnd) {\n          edgeBegin->destroy();\n          ++edgeBegin;\n        }\n      }\n    }\n  }\n\n  /**\n   * Get the data of a node N.\n   */\n  node_data_reference getData(const GraphNode& N,\n                              MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, false);\n    acquireNode(N, mflag);\n    return N->getData();\n  }\n\n  /**\n   * Get edge data of an edge given an iterator to the edge.\n   */\n  edge_data_reference getEdgeData(edge_iterator ni,\n                                  MethodFlag mflag = MethodFlag::UNPROTECTED) {\n    // galois::runtime::checkWrite(mflag, false);\n    acquireNode(ni->dst, mflag);\n    return ni->get();\n  }\n\n  /**\n   * Get the destination of an edge given an iterator to the edge.\n   */\n  GraphNode getEdgeDst(edge_iterator ni) {\n    // galois::runtime::checkWrite(mflag, false);\n    // acquireNode(ni->dst, mflag);\n    return GraphNode(ni->dst);\n  }\n\n  /**\n   * Returns an iterator to all the nodes in the graph. Not thread-safe.\n   */\n  iterator begin() {\n    return boost::make_transform_iterator(nodes.begin(), makeGraphNode());\n  }\n\n  //! Returns the end of the node iterator. Not thread-safe.\n  iterator end() {\n    return boost::make_transform_iterator(nodes.end(), makeGraphNode());\n  }\n\n  //! 
Return an iterator to the beginning of the local nodes of the graph.\n  local_iterator local_begin() {\n    return boost::make_transform_iterator(nodes.local_begin(), makeGraphNode());\n  }\n\n  //! Return an iterator to the end of the local nodes of the graph.\n  local_iterator local_end() {\n    return boost::make_transform_iterator(nodes.local_end(), makeGraphNode());\n  }\n\n  /**\n   * Return an iterator to the first edge of a particular node.\n   */\n  edge_iterator edge_begin(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    // Locks all destinations before returning edge iterator.\n    if (galois::runtime::shouldLock(mflag)) {\n      for (edge_iterator ii = N->edgeBegin, ee = N->edgeEnd; ii != ee; ++ii) {\n        acquireNode(ii->dst, mflag);\n      }\n    }\n    return N->edgeBegin;\n  }\n\n  /**\n   * Return an iterator to the end of edges of a particular node.\n   */\n  edge_iterator edge_end(GraphNode N,\n                         MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::WRITE) {\n    return N->edgeEnd;\n  }\n\n  /**\n   * Return a range for edges of a node for use by C++ for_each loops.\n   */\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::make_no_deref_range(edge_begin(N, mflag),\n                                         edge_end(N, mflag));\n  }\n\n  /**\n   * Returns an object with begin() and end() methods to iterate over the\n   * outgoing edges of N.\n   */\n  internal::EdgesIterator<LC_Morph_Graph>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::EdgesIterator<LC_Morph_Graph>(*this, N, mflag);\n  }\n\n  /**\n   * Creates a new node with a cap on the number of edges.\n   *\n   * @param nedges Number of edges reserved for this node.\n   * @param args Arguments required to construct a new node\n   * @returns Newly created node\n   */\n  template <typename... 
Args>\n  GraphNode createNode(int nedges, Args&&... args) {\n    NodeInfo* N = &nodes.emplace(std::forward<Args>(args)...);\n    acquireNode(N, MethodFlag::WRITE);\n    EdgeHolder*& local_edges = *edgesL.getLocal();\n\n    // Allocate space for a new EdgeHolder object if necessary\n    if (!local_edges ||\n        std::distance(local_edges->begin, local_edges->end) < nedges) {\n      EdgeHolder* old = local_edges;\n      // FIXME: this seems to leak\n      size_t size       = runtime::pagePoolSize();\n      void* block       = runtime::pagePoolAlloc();\n      local_edges       = reinterpret_cast<EdgeHolder*>(block);\n      local_edges->next = old;\n\n      size -= sizeof(EdgeHolder);\n      block = reinterpret_cast<char*>(block) + sizeof(EdgeHolder);\n\n      if (!std::align(std::alignment_of_v<EdgeInfo>, sizeof(EdgeInfo), block,\n                      size)) {\n        GALOIS_DIE(\"not enough space for EdgeInfo\");\n      }\n\n      local_edges->begin = reinterpret_cast<EdgeInfo*>(block);\n      local_edges->end   = local_edges->begin;\n      local_edges->end += size / sizeof(EdgeInfo);\n      if (std::distance(local_edges->begin, local_edges->end) < nedges) {\n        GALOIS_DIE(\"not enough space for EdgeInfo\");\n      }\n    }\n\n    // Set the memory aside for the new node in the edge holder object\n    N->edgeBegin = N->edgeEnd = local_edges->begin;\n    local_edges->begin += nedges;\n#ifndef NDEBUG\n    N->trueEdgeEnd = local_edges->begin;\n#endif\n    return GraphNode(N);\n  }\n\n  /**\n   * Adds an edge if it doesn't already exist.\n   *\n   * @param src Source to add edge to\n   * @param dst Destination to add edge to\n   * @param mflag Method flag specifying type of acquire (e.g. read, write)\n   * @param args Arguments needed to construct an edge\n   */\n  template <typename... Args>\n  edge_iterator addEdge(GraphNode src, GraphNode dst, galois::MethodFlag mflag,\n                        Args&&... 
args) {\n    // galois::runtime::checkWrite(mflag, true);\n    acquireNode(src, mflag);\n    auto it = std::find_if(src->edgeBegin, src->edgeEnd, dst_equals(dst));\n    if (it == src->edgeEnd) {\n      it->dst = dst;\n      it->construct(std::forward<Args>(args)...);\n      src->edgeEnd++;\n      assert(src->edgeEnd <= src->trueEdgeEnd);\n    }\n    return it;\n  }\n\n  /**\n   * Construct a new edge for a node. Can add duplicate edges.\n   *\n   * @param src Source node to add edge to\n   * @param dst Destination node of new edge\n   * @param mflag Method flag specifying type of acquire (e.g. read, write)\n   * @param args Other arguments that need to be passed in to construct\n   * a new edge\n   * @returns Iterator to newly added edge\n   */\n  template <typename... Args>\n  edge_iterator addMultiEdge(GraphNode src, GraphNode dst,\n                             galois::MethodFlag mflag, Args&&... args) {\n    acquireNode(src, mflag);\n    auto it = src->edgeEnd;\n    it->dst = dst;\n    it->construct(std::forward<Args>(args)...);\n    src->edgeEnd++;\n    assert(src->edgeEnd <= src->trueEdgeEnd);\n    return it;\n  }\n\n  /**\n   * Remove an edge from the graph.\n   *\n   * Invalidates edge iterator.\n   */\n  void removeEdge(GraphNode src, edge_iterator dst,\n                  galois::MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, true);\n    acquireNode(src, mflag);\n    src->edgeEnd--;\n    assert(src->edgeBegin <= src->edgeEnd);\n    std::swap(*dst, *src->edgeEnd);\n    src->edgeEnd->destroy();\n  }\n\n  /**\n   * Finds an edge between 2 nodes and returns the iterator to it if it exists.\n   */\n  edge_iterator findEdge(GraphNode src, GraphNode dst,\n                         galois::MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, true); // TODO: double check 'true'\n    // here\n    acquireNode(src, mflag);\n    return std::find_if(src->edgeBegin, src->edgeEnd, dst_equals(dst));\n  }\n\n  
/**\n   * Allocate memory for nodes given a file graph with a particular number of\n   * nodes. This graph will allocate out of line space for that number of\n   * nodes as well.\n   *\n   * @param graph FileGraph with a number of nodes to allocate\n   * @param aux Data structure in which to allocate space for nodes.\n   */\n  void allocateFrom(FileGraph& graph, ReadGraphAuxData& aux) {\n    size_t numNodes = graph.size();\n\n    if (UseNumaAlloc) {\n      aux.allocateLocal(numNodes);\n      this->outOfLineAllocateLocal(numNodes);\n    } else {\n      aux.allocateInterleaved(numNodes);\n      this->outOfLineAllocateInterleaved(numNodes);\n    }\n  }\n\n  /**\n   * Constructs the LCMorphGraph nodes given a FileGraph to construct it from.\n   * Meant to be called by multiple threads.\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in,out] aux Allocated memory to store pointers to the created nodes\n   */\n  void constructNodesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                          ReadGraphAuxData& aux) {\n    // get the portion of the graph that this thread is responsible for\n    // creating\n    auto r = graph\n                 .divideByNode(sizeof(NodeInfo) +\n                                   LC_Morph_Graph::size_of_out_of_line::value,\n                               sizeof(EdgeInfo), tid, total)\n                 .first;\n\n    // create nodes of portion we are responsible for only\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      aux[*ii] =\n          createNode(std::distance(graph.edge_begin(*ii), graph.edge_end(*ii)));\n    }\n  }\n\n  /**\n   * Constructs the LCMorphGraph edges given a FileGraph to construct it from\n   * and pointers to already created nodes. 
Meant to be called by multiple\n   * threads.\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains pointers to already created nodes to\n   * create edges for.\n   */\n  void constructEdgesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                          const ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(NodeInfo) +\n                                   LC_Morph_Graph::size_of_out_of_line::value,\n                               sizeof(EdgeInfo), tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        constructEdgeValue(graph, nn, aux[*ii], aux[graph.getEdgeDst(nn)]);\n      }\n    }\n  }\n};\n\n} // namespace graphs\n} // namespace galois\n\n#endif /* LC_MORPH_GRAPH_H_ */\n"
  },
  {
    "path": "libgalois/include/galois/graphs/MorphGraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file MorphGraph.h\n *\n * Contains MorphGraph and associated helpers.\n */\n\n#ifndef GALOIS_GRAPHS_MORPHGRAPH_H\n#define GALOIS_GRAPHS_MORPHGRAPH_H\n\n#include <algorithm>\n#include <map>\n#include <set>\n#include <type_traits>\n#include <vector>\n\n#include <boost/container/small_vector.hpp>\n#include <boost/functional.hpp>\n#include <boost/iterator/transform_iterator.hpp>\n#include <boost/iterator/filter_iterator.hpp>\n#include <boost/container/small_vector.hpp>\n\n#include \"galois/Bag.h\"\n#include \"galois/config.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/gstl.h\"\n\n#ifdef AUX_MAP\n#include \"galois/PerThreadContainer.h\"\n#else\n#include \"galois/substrate/CacheLineStorage.h\"\n#include 
\"galois/substrate/SimpleLock.h\"\n#endif\n\nnamespace galois {\nnamespace graphs {\n\nnamespace internal {\n/**\n * Wrapper class to have a valid type on void edges\n */\ntemplate <typename NTy, typename ETy, bool DirectedButNotInOut>\nstruct UEdgeInfoBase;\n\ntemplate <typename NTy, typename ETy>\nstruct UEdgeInfoBase<NTy, ETy, true> {\n  typedef ETy& reference;\n\n  NTy* N;\n  ETy Ea;\n\n  inline NTy* first() {\n    assert(N);\n    return N;\n  }\n  inline const NTy* first() const {\n    assert(N);\n    return N;\n  }\n  inline ETy* second() { return &Ea; }\n  inline const ETy* second() const { return &Ea; }\n\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, ETy*, bool, Args&&... args)\n      : N(n), Ea(std::forward<Args>(args)...) {}\n\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, ETy& v, bool, Args&&...) : N(n) {\n    Ea = v;\n  }\n\n  static size_t sizeOfSecond() { return sizeof(ETy); }\n  bool isInEdge() const { return false; }\n};\n\ntemplate <typename NTy, typename ETy>\nstruct UEdgeInfoBase<NTy, ETy, false> {\n  typedef ETy& reference;\n\n  NTy* N;\n  ETy* Ea;\n\n  inline NTy* first() {\n    assert(N);\n    return (NTy*)((uintptr_t)N & ~1);\n  }\n  inline const NTy* first() const {\n    assert(N);\n    return (NTy*)((uintptr_t)N & ~1);\n  }\n  inline ETy* second() { return Ea; }\n  inline const ETy* second() const { return Ea; }\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, ETy* v, bool f, Args&&...)\n      : N((NTy*)((uintptr_t)n | f)), Ea(v) {}\n  static size_t sizeOfSecond() { return sizeof(ETy); }\n  bool isInEdge() const { return (uintptr_t)N & 1; }\n};\n\ntemplate <typename NTy>\nstruct UEdgeInfoBase<NTy, void, true> {\n  typedef char& reference;\n\n  NTy* N;\n  inline NTy* first() { return N; }\n  inline const NTy* first() const { return N; }\n  inline char* second() const { return static_cast<char*>(NULL); }\n  inline char* addr() const { return second(); }\n  template <typename... 
Args>\n  UEdgeInfoBase(NTy* n, void*, bool, Args&&...) : N(n) {}\n  static size_t sizeOfSecond() { return 0; }\n  bool isInEdge() const { return false; }\n};\n\ntemplate <typename NTy>\nstruct UEdgeInfoBase<NTy, void, false> {\n  typedef char& reference;\n\n  NTy* N;\n  inline NTy* first() { return (NTy*)((uintptr_t)N & ~1); }\n  inline const NTy* first() const { return (NTy*)((uintptr_t)N & ~1); }\n  inline char* second() const { return static_cast<char*>(NULL); }\n  inline char* addr() const { return second(); }\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, void*, bool f, Args&&...)\n      : N((NTy*)((uintptr_t)n | f)) {}\n  static size_t sizeOfSecond() { return 0; }\n  bool isInEdge() const { return (uintptr_t)N & 1; }\n};\n\n/*\n * Only graphs w/ in-out/symmetric edges and non-void edge data,\n * i.e. ETy != void and DirectedNotInOut = false,\n * need to allocate memory for edge data\n */\ntemplate <typename ETy, bool DirectedNotInOut>\nstruct EdgeFactory {\n  galois::InsertBag<ETy> mem;\n  template <typename... Args>\n  ETy* mkEdge(Args&&... args) {\n    return &mem.emplace(std::forward<Args>(args)...);\n  }\n  void delEdge(ETy*) {}\n  bool mustDel() const { return false; }\n};\n\ntemplate <typename ETy>\nstruct EdgeFactory<ETy, true> {\n  template <typename... Args>\n  ETy* mkEdge(Args&&...) {\n    return nullptr;\n  }\n  void delEdge(ETy*) {}\n  bool mustDel() const { return false; }\n};\n\ntemplate <>\nstruct EdgeFactory<void, false> {\n  template <typename... Args>\n  void* mkEdge(Args&&...) {\n    return static_cast<void*>(NULL);\n  }\n  void delEdge(void*) {}\n  bool mustDel() const { return false; }\n};\n\n} // namespace internal\n\n/**\n * A graph that can have new nodes and edges added to it.\n *\n * An example of use:\n *\n * \\code\n * struct Node {\n *   ... 
// Definition of node data\n * };\n *\n * typedef galois::graphs::MorphGraph<Node,int,true> Graph;\n *\n * // Create graph\n * Graph g;\n * Node n1, n2;\n * Graph::GraphNode a, b;\n * a = g.createNode(n1);\n * g.addNode(a);\n * b = g.createNode(n2);\n * g.addNode(b);\n * g.getEdgeData(g.addEdge(a, b)) = 5;\n *\n * // Traverse graph\n * for (Graph::iterator ii = g.begin(), ei = g.end(); ii != ei; ++ii) {\n *   Graph::GraphNode src = *ii;\n *   for (Graph::edge_iterator jj = g.edge_begin(src), ej = g.edge_end(src);\n *        jj != ej;\n *        ++jj) {\n *     Graph::GraphNode dst = g.getEdgeDst(jj);\n *     int edgeData = g.getEdgeData(jj);\n *     assert(edgeData == 5);\n *   }\n * }\n * \\endcode\n *\n * And in C++11:\n *\n * \\code\n * // Traverse graph\n * for (Graph::GraphNode src : g) {\n *   for (Graph::edge_iterator edge : g.out_edges(src)) {\n *     Graph::GraphNode dst = g.getEdgeDst(edge);\n *     int edgeData = g.getEdgeData(edge);\n *     assert(edgeData == 5);\n *   }\n * }\n * \\endcode\n *\n * @tparam NodeTy Type of node data\n * @tparam EdgeTy Type of edge data\n * @tparam Directional true if graph is directed\n * @tparam InOut true if directed graph tracks in-edges\n * @tparam HasNoLockable if true, use no abstract locks in the graph\n * @tparam SortedNeighbors Keep neighbors sorted (for faster findEdge)\n * @tparam FileEdgeTy type of edges on file to be read from\n */\ntemplate <typename NodeTy, typename EdgeTy, bool Directional,\n          bool InOut = false, bool HasNoLockable = false,\n          bool SortedNeighbors = false, typename FileEdgeTy = EdgeTy>\nclass MorphGraph : private boost::noncopyable {\npublic:\n  /**\n   * Struct used to define the HasNoLockable template parameter as a type\n   * in the struct.\n   */\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    //! 
Type with Lockable parameter set according to struct template arg\n    using type = MorphGraph<NodeTy, EdgeTy, Directional, InOut,\n                            _has_no_lockable, SortedNeighbors, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the type of node data in the graph.\n   */\n  template <typename _node_data>\n  struct with_node_data {\n    //! Type with node data parameter set according to struct template arg\n    using type = MorphGraph<_node_data, EdgeTy, Directional, InOut,\n                            HasNoLockable, SortedNeighbors, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the type of edge data in the graph.\n   */\n  template <typename _edge_data>\n  struct with_edge_data {\n    //! Type with edge data parameter set according to struct template arg\n    using type = MorphGraph<NodeTy, _edge_data, Directional, InOut,\n                            HasNoLockable, SortedNeighbors, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the type of file edge data in the graph.\n   */\n  template <typename _file_edge_data>\n  struct with_file_edge_data {\n    //! Type with file edge data parameter set according to struct template arg\n    using type = MorphGraph<NodeTy, EdgeTy, Directional, InOut, HasNoLockable,\n                            SortedNeighbors, _file_edge_data>;\n  };\n\n  /**\n   * Struct used to define directionality of the graph.\n   */\n  template <bool _directional>\n  struct with_directional {\n    //! Type with directional parameter set according to struct template arg\n    using type = MorphGraph<NodeTy, EdgeTy, _directional, InOut, HasNoLockable,\n                            SortedNeighbors, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define if neighbors are sorted or not in the graph.\n   */\n  template <bool _sorted_neighbors>\n  struct with_sorted_neighbors {\n    //! 
Type with sort neighbor parameter set according to struct template arg\n    using type = MorphGraph<NodeTy, EdgeTy, Directional, InOut, HasNoLockable,\n                            _sorted_neighbors, FileEdgeTy>;\n  };\n\n  //! Tag that defines to graph reader how to read a graph into this class\n  using read_tag = read_with_aux_first_graph_tag;\n\nprivate: ///////////////////////////////////////////////////////////////////////\n  template <typename T>\n  struct first_eq_and_valid {\n    T N2;\n    first_eq_and_valid(T& n) : N2(n) {}\n    template <typename T2>\n    bool operator()(const T2& ii) const {\n      return ii.first() == N2 && ii.first() && ii.first()->active;\n    }\n  };\n\n  struct first_not_valid {\n    template <typename T2>\n    bool operator()(const T2& ii) const {\n      return !ii.first() || !ii.first()->active;\n    }\n  };\n\n  template <typename T>\n  struct first_lt {\n    template <typename T2>\n    bool operator()(const T& N2, const T2& ii) const {\n      assert(ii.first() && \"UNEXPECTED: invalid item in edgelist\");\n      return N2 < ii.first();\n    }\n    template <typename T2>\n    bool operator()(const T2& ii, const T& N2) const {\n      assert(ii.first() && \"UNEXPECTED: invalid item in edgelist\");\n      return ii.first() < N2;\n    }\n  };\n\n  // forward declaration for graph node type\n  class gNode;\n  struct gNodeTypes\n      : public internal::NodeInfoBaseTypes<NodeTy, !HasNoLockable> {\n    //! The storage type for an edge\n    using EdgeInfo =\n        internal::UEdgeInfoBase<gNode, EdgeTy, Directional & !InOut>;\n\n    //! The storage type for edges\n    // typedef galois::gstl::Vector<EdgeInfo> EdgesTy;\n    using EdgesTy = boost::container::small_vector<\n        EdgeInfo, 3, galois::runtime::Pow_2_BlockAllocator<EdgeInfo>>;\n\n    using iterator = typename EdgesTy::iterator;\n  };\n\n  class gNode : public internal::NodeInfoBase<NodeTy, !HasNoLockable>,\n                public gNodeTypes {\n    //! 
friend of MorphGraph since MorphGraph contains gNodes\n    friend class MorphGraph;\n    //! Storage type for node\n    using NodeInfo = internal::NodeInfoBase<NodeTy, !HasNoLockable>;\n    //! iterator over edges (taken from gNodeTypes)\n    using iterator = typename gNode::iterator;\n    //! Storage type of a single edge (taken from gNodeTypes)\n    using EdgeInfo = typename gNode::EdgeInfo;\n\n    //! edges on this node\n    typename gNodeTypes::EdgesTy edges;\n\n    //! Tracks if this node is considered as \"in\" the graph\n    bool active;\n\n    //! Return iterator to first edge\n    iterator begin() { return edges.begin(); }\n    //! Return iterator to end of edges\n    iterator end() { return edges.end(); }\n\n    //! Remove the provided edge from this node\n    //! @param ii iterator to edge to remove\n    void erase(iterator ii) {\n      if (SortedNeighbors) {\n        // For sorted case remove the element, moving following\n        // elements back to fill the space.\n        edges.erase(ii);\n      } else {\n        // We don't need to preserve the order, so move the last edge\n        // into this place and then remove last edge.\n        *ii = edges.back();\n        edges.pop_back();\n      }\n    }\n\n    /**\n     * Erase an edge with a provided destination.\n     */\n    void erase(gNode* N, bool inEdge = false) {\n      iterator ii = find(N, inEdge);\n      if (ii != end())\n        edges.erase(ii);\n    }\n\n    /**\n     * Find an edge with a particular destination node.\n     */\n    iterator find(gNode* N, bool inEdge = false) {\n      iterator ii, ei = edges.end();\n      // find starting point to start search\n      if (SortedNeighbors) {\n        assert(std::is_sorted(edges.begin(), edges.end(),\n                              [=](const EdgeInfo& e1, const EdgeInfo& e2) {\n                                return e1.first() < e2.first();\n                              }));\n        ii =\n            std::lower_bound(edges.begin(), edges.end(), 
N, first_lt<gNode*>());\n      } else {\n        ii = edges.begin();\n      }\n\n      first_eq_and_valid<gNode*> checker(N);\n      ii = std::find_if(ii, ei, checker);\n      while (ii != ei && ii->isInEdge() != inEdge) {\n        ++ii;\n        ii = std::find_if(ii, ei, checker);\n      };\n      return ii;\n    }\n\n    /**\n     * Make space for more edges stored by this node\n     */\n    void resizeEdges(size_t size) {\n      edges.resize(size, EdgeInfo(new gNode(), 0));\n    }\n\n    /**\n     * Add a new edge to this node\n     */\n    template <typename... Args>\n    iterator createEdge(gNode* N, EdgeTy* v, bool inEdge, Args&&... args) {\n      iterator ii;\n      if (SortedNeighbors) {\n        // If neighbors are sorted, find appropriate insertion point.\n        // Insert before first neighbor that is too far.\n        ii =\n            std::upper_bound(edges.begin(), edges.end(), N, first_lt<gNode*>());\n      } else {\n        ii = edges.end();\n      }\n\n      return edges.insert(ii,\n                          EdgeInfo(N, v, inEdge, std::forward<Args>(args)...));\n    }\n\n    /**\n     * Add an edge to this node; if space exists to add it in, then reuse that\n     * space.\n     */\n    template <typename... Args>\n    iterator createEdgeWithReuse(gNode* N, EdgeTy* v, bool inEdge,\n                                 Args&&... 
args) {\n      // First check for holes\n      iterator ii, ei;\n      if (SortedNeighbors) {\n        // If neighbors are sorted, find acceptable range for insertion.\n        ii =\n            std::lower_bound(edges.begin(), edges.end(), N, first_lt<gNode*>());\n        ei = std::upper_bound(ii, edges.end(), N, first_lt<gNode*>());\n      } else {\n        // If not sorted, we can insert anywhere in the list.\n        ii = edges.begin();\n        ei = edges.end();\n      }\n      ii = std::find_if(ii, ei, first_not_valid());\n      if (ii != ei) {\n        // FIXME: We could move elements around (short distances).\n        *ii = EdgeInfo(N, v, inEdge, std::forward<Args>(args)...);\n        return ii;\n      }\n      return edges.insert(ei,\n                          EdgeInfo(N, v, inEdge, std::forward<Args>(args)...));\n    }\n\n    template <bool _A1 = HasNoLockable>\n    void acquire(MethodFlag mflag, typename std::enable_if<!_A1>::type* = 0) {\n      galois::runtime::acquire(this, mflag);\n    }\n\n    template <bool _A1 = HasNoLockable>\n    void acquire(MethodFlag, typename std::enable_if<_A1>::type* = 0) {}\n\n  public:\n    template <typename... Args>\n    gNode(Args&&... args)\n        : NodeInfo(std::forward<Args>(args)...), active(false) {}\n  };\n\n  // The graph manages the lifetimes of the data in the nodes and edges\n  //! Container for nodes\n  using NodeListTy = galois::InsertBag<gNode>;\n  //! 
nodes in this graph\n  NodeListTy nodes;\n\n  internal::EdgeFactory<EdgeTy, Directional && !InOut> edgesF;\n\n  // Helpers for iterator classes\n  struct is_node {\n    bool operator()(const gNode& g) const { return g.active; }\n  };\n  struct is_edge {\n    bool operator()(typename gNodeTypes::EdgeInfo& e) const {\n      return e.first()->active;\n    }\n  };\n  struct is_in_edge {\n    bool operator()(typename gNodeTypes::EdgeInfo& e) const {\n      return e.first()->active && e.isInEdge();\n    }\n  };\n  struct is_out_edge {\n    bool operator()(typename gNodeTypes::EdgeInfo& e) const {\n      return e.first()->active && !e.isInEdge();\n    }\n  };\n  struct makeGraphNode {\n    gNode* operator()(gNode& data) const { return &data; }\n  };\n\npublic: ////////////////////////////////////////////////////////////////////////\n  //! Graph node handle\n  using GraphNode = gNode*;\n  //! Edge data type\n  using edge_data_type = EdgeTy;\n  //! Edge data type of file we are loading this graph from\n  using file_edge_data_type = FileEdgeTy;\n  //! Node data type\n  using node_data_type = NodeTy;\n  //! (Out or Undirected) Edge iterator\n  using edge_iterator =\n      typename boost::filter_iterator<is_out_edge,\n                                      typename gNodeTypes::iterator>;\n  //! In Edge iterator\n  using in_edge_iterator =\n      typename boost::filter_iterator<is_in_edge,\n                                      typename gNodeTypes::iterator>;\n\n  //! Reference to edge data\n  using edge_data_reference = typename gNodeTypes::EdgeInfo::reference;\n  //! Reference to node data\n  using node_data_reference = typename gNodeTypes::reference;\n  //! Node iterator\n  using iterator = boost::transform_iterator<\n      makeGraphNode,\n      boost::filter_iterator<is_node, typename NodeListTy::iterator>>;\n\n#ifdef AUX_MAP\n  //! Auxiliary data for nodes that stores in neighbors in per thread storage\n  //! 
accessed through a map\n  struct ReadGraphAuxData {\n    LargeArray<GraphNode> nodes;\n    //! stores in neighbors\n    galois::PerThreadMap<FileGraph::GraphNode,\n                         galois::gstl::Vector<std::pair<GraphNode, EdgeTy*>>>\n        inNghs;\n  };\n#else\n  //! Wrapper around a graph node that provides a lock for it as well as\n  //! in-neighbor tracking\n  struct AuxNode {\n    //! lock for wrapped graph node\n    galois::substrate::SimpleLock lock;\n    //! single graph node wrapped by this struct\n    GraphNode n;\n    //! stores in neighbors\n    galois::gstl::Vector<std::pair<GraphNode, EdgeTy*>> inNghs;\n  };\n  //! Padded version of AuxNode\n  using AuxNodePadded = typename galois::substrate::CacheLineStorage<AuxNode>;\n\n  //! True if a node is both directional and not storing both in and out\n  //! edges\n  constexpr static const bool DirectedNotInOut = (Directional && !InOut);\n  //! Large array that contains auxiliary data for each node (AuxNodes)\n  using ReadGraphAuxData =\n      typename std::conditional<DirectedNotInOut, LargeArray<GraphNode>,\n                                LargeArray<AuxNodePadded>>::type;\n#endif\n\nprivate: ///////////////////////////////////////////////////////////////////////\n  template <typename... Args>\n  edge_iterator createEdgeWithReuse(GraphNode src, GraphNode dst,\n                                    galois::MethodFlag mflag, Args&&... 
args) {\n    assert(src);\n    assert(dst);\n    // galois::runtime::checkWrite(mflag, true);\n    src->acquire(mflag);\n    typename gNode::iterator ii = src->find(dst);\n    // add edge only if it doesn't already exist\n    if (ii == src->end()) {\n      if (Directional && !InOut) {\n        ii = src->createEdgeWithReuse(dst, 0, false,\n                                      std::forward<Args>(args)...);\n      } else {\n        dst->acquire(mflag);\n        EdgeTy* e = edgesF.mkEdge(std::forward<Args>(args)...);\n        ii        = dst->createEdgeWithReuse(src, e, Directional ? true : false,\n                                      std::forward<Args>(args)...);\n        ii        = src->createEdgeWithReuse(dst, e, false,\n                                      std::forward<Args>(args)...);\n      }\n    }\n    return boost::make_filter_iterator(is_out_edge(), ii, src->end());\n  }\n\n  template <typename... Args>\n  edge_iterator createEdge(GraphNode src, GraphNode dst,\n                           galois::MethodFlag mflag, Args&&... args) {\n    assert(src);\n    assert(dst);\n    // galois::runtime::checkWrite(mflag, true);\n    src->acquire(mflag);\n    typename gNode::iterator ii = src->end();\n    // no duplicate check here (ii is always src->end()): always adds the edge\n    if (ii == src->end()) {\n      if (Directional && !InOut) {\n        ii = src->createEdge(dst, 0, false, std::forward<Args>(args)...);\n      } else {\n        dst->acquire(mflag);\n        EdgeTy* e = edgesF.mkEdge(std::forward<Args>(args)...);\n        ii        = dst->createEdge(src, e, Directional ? true : false,\n                             std::forward<Args>(args)...);\n        ii        = src->createEdge(dst, e, false, std::forward<Args>(args)...);\n      }\n    }\n    return boost::make_filter_iterator(is_out_edge(), ii, src->end());\n  }\n\n  /**\n   * Creates an outgoing edge at src for the edge from src to dst.\n   * Only called by constructOutEdgeValue.\n   */\n  template <typename... 
Args>\n  EdgeTy* createOutEdge(GraphNode src, GraphNode dst, galois::MethodFlag mflag,\n                        Args&&... args) {\n    assert(src);\n    assert(dst);\n\n    src->acquire(mflag);\n    typename gNode::iterator ii = src->end();\n    if (ii == src->end()) {\n      dst->acquire(mflag);\n      EdgeTy* e = edgesF.mkEdge(std::forward<Args>(args)...);\n      ii        = src->createEdge(dst, e, false, std::forward<Args>(args)...);\n      return e;\n    }\n    return nullptr;\n  }\n\n  /**\n   * Creates an incoming edge at dst for the edge from src to dst.\n   * Only called by constructInEdgeValue.\n   * Reuse data from the corresponding outgoing edge.\n   */\n  template <typename... Args>\n  void createInEdge(GraphNode src, GraphNode dst, EdgeTy* e,\n                    galois::MethodFlag mflag, Args&&... args) {\n    assert(src);\n    assert(dst);\n\n    dst->acquire(mflag);\n    typename gNode::iterator ii = dst->end();\n    if (ii == dst->end()) {\n      src->acquire(mflag);\n      ii = dst->createEdge(src, e, Directional ? 
true : false,\n                           std::forward<Args>(args)...);\n    }\n  }\n\n  template <bool _A1 = LargeArray<EdgeTy>::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  EdgeTy*\n  constructOutEdgeValue(FileGraph& graph, typename FileGraph::edge_iterator nn,\n                        GraphNode src, GraphNode dst,\n                        typename std::enable_if<!_A1 || _A2>::type* = 0) {\n    typedef typename LargeArray<FileEdgeTy>::value_type FEDV;\n    typedef LargeArray<EdgeTy> ED;\n    if (ED::has_value) {\n      return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED,\n                           graph.getEdgeData<FEDV>(nn));\n    } else {\n      return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  template <bool _A1 = LargeArray<EdgeTy>::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  EdgeTy*\n  constructOutEdgeValue(FileGraph&, typename FileGraph::edge_iterator,\n                        GraphNode src, GraphNode dst,\n                        typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED);\n  }\n\n  // will reuse edge data from outgoing edges\n  void constructInEdgeValue(FileGraph&, EdgeTy* e, GraphNode src,\n                            GraphNode dst) {\n    createInEdge(src, dst, e, galois::MethodFlag::UNPROTECTED);\n  }\n\npublic\n    : /////////////////////////////////////////////////////////////////////////\n  /**\n   * Creates a new node holding the indicated data. Usually you should call\n   * {@link addNode()} afterwards.\n   *\n   * @param[in] args constructor arguments for node data\n   * @returns newly created graph node\n   */\n  template <typename... Args>\n  GraphNode createNode(Args&&... 
args) {\n    gNode* N  = &(nodes.emplace(std::forward<Args>(args)...));\n    N->active = false;\n    return GraphNode(N);\n  }\n\n  /**\n   * Adds a node to the graph.\n   */\n  void addNode(const GraphNode& n,\n               galois::MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, true);\n    n->acquire(mflag);\n    n->active = true;\n  }\n\n  //! Gets the node data for a node.\n  node_data_reference\n  getData(const GraphNode& n,\n          galois::MethodFlag mflag = MethodFlag::WRITE) const {\n    assert(n);\n    // galois::runtime::checkWrite(mflag, false);\n    n->acquire(mflag);\n    return n->getData();\n  }\n\n  //! Checks if a node is in the graph\n  //! @returns true if a node has is in the graph\n  bool containsNode(const GraphNode& n,\n                    galois::MethodFlag mflag = MethodFlag::WRITE) const {\n    assert(n);\n    n->acquire(mflag);\n    return n->active;\n  }\n\n  /**\n   * Removes a node from the graph along with all its outgoing/incoming edges\n   * for undirected graphs or outgoing edges for directed graphs.\n   *\n   * @todo handle edge memory\n   */\n  void removeNode(GraphNode n, galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(n);\n    // galois::runtime::checkWrite(mflag, true);\n    n->acquire(mflag);\n    gNode* N = n;\n    if (N->active) {\n      N->active = false;\n      N->edges.clear();\n    }\n  }\n\n  /**\n   * Resize the edges of the node. For best performance, should be done\n   * serially.\n   */\n  void resizeEdges(GraphNode src, size_t size,\n                   galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    // galois::runtime::checkWrite(mflag, false);\n    src->acquire(mflag);\n    src->resizeEdges(size);\n  }\n\n  /**\n   * Adds an edge to graph, replacing existing value if edge already exists.\n   *\n   * Ignore the edge data, let the caller use the returned iterator to set the\n   * value if desired.  
This frees us from dealing with the void edge data\n   * problem in this API\n   */\n  edge_iterator addEdge(GraphNode src, GraphNode dst,\n                        galois::MethodFlag mflag = MethodFlag::WRITE) {\n    return createEdgeWithReuse(src, dst, mflag);\n  }\n\n  //! Adds and initializes an edge to graph but does not check for duplicate\n  //! edges\n  template <typename... Args>\n  edge_iterator addMultiEdge(GraphNode src, GraphNode dst,\n                             galois::MethodFlag mflag, Args&&... args) {\n    return createEdge(src, dst, mflag, std::forward<Args>(args)...);\n  }\n\n  //! Removes an edge from the graph\n  void removeEdge(GraphNode src, edge_iterator dst,\n                  galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    // galois::runtime::checkWrite(mflag, true);\n    src->acquire(mflag);\n    if (Directional && !InOut) {\n      src->erase(dst.base());\n    } else {\n      dst->first()->acquire(mflag);\n      // EdgeTy* e = dst->second();\n      dst->first()->erase(\n          src, Directional ? true : false); // erase incoming/symmetric edge\n      src->erase(dst.base());\n    }\n  }\n\n  //! Finds if an edge between src and dst exists\n  edge_iterator findEdge(GraphNode src, GraphNode dst,\n                         galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    assert(dst);\n    src->acquire(mflag);\n    typename gNodeTypes::iterator ii = src->find(dst), ei = src->end();\n    is_out_edge edge_predicate;\n    if (ii != ei && edge_predicate(*ii)) {\n      // After finding edge, lock dst and verify still active\n      dst->acquire(mflag);\n      if (!edge_predicate(*ii))\n        // I think we need this too, else we'll return some random iterator.\n        ii = ei;\n    } else {\n      ii = ei;\n    }\n    return boost::make_filter_iterator(edge_predicate, ii, ei);\n  }\n\n  //! Find/return edge between src/dst if it exists; assumes that edges\n  //! 
are sorted by destination\n  edge_iterator\n  findEdgeSortedByDst(GraphNode src, GraphNode dst,\n                      galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    assert(dst);\n    src->acquire(mflag);\n    assert(std::is_sorted(src->begin(), src->end(),\n                          [=](const typename gNode::EdgeInfo& e1,\n                              const typename gNode::EdgeInfo& e2) {\n                            return e1.first() < e2.first();\n                          }));\n\n    auto ei = src->end();\n\n    // jump directly to edges with destination we are looking for\n    auto ii =\n        std::lower_bound(src->begin(), src->end(), dst, first_lt<gNode*>());\n\n    first_eq_and_valid<gNode*> checker(dst);\n    ii = std::find_if(ii, ei, checker); // bug if ei set to upper_bound\n    // ignore in edges\n    while (ii != ei && ii->isInEdge()) {\n      ++ii;\n      ii = std::find_if(ii, ei, checker);\n    };\n\n    // make sure destination node is active else return end iterator\n    is_out_edge edge_predicate;\n    if (ii != ei) {\n      dst->acquire(mflag);\n      if (!edge_predicate(*ii)) {\n        ii = ei;\n      }\n    }\n    return boost::make_filter_iterator(edge_predicate, ii, ei);\n  }\n\n  //! Find a particular in-edge: note this function activates for the undirected\n  //! graph case, so it just calls the regular out-edge finding function\n  template <bool _Undirected = !Directional>\n  edge_iterator findInEdge(GraphNode src, GraphNode dst,\n                           galois::MethodFlag mflag = MethodFlag::WRITE,\n                           typename std::enable_if<_Undirected>::type* = 0) {\n    // incoming neighbors are the same as outgoing neighbors in undirected\n    // graphs\n    return findEdge(src, dst, mflag);\n  }\n\n  //! Find if an incoming edge between src and dst exists for directed in-out\n  //! 
graphs\n  template <bool _DirectedInOut = (Directional && InOut)>\n  in_edge_iterator\n  findInEdge(GraphNode src, GraphNode dst,\n             galois::MethodFlag mflag                       = MethodFlag::WRITE,\n             typename std::enable_if<_DirectedInOut>::type* = 0) {\n    assert(src);\n    assert(dst);\n    src->acquire(mflag);\n    typename gNodeTypes::iterator ii = src->find(dst, true), ei = src->end();\n    is_in_edge edge_predicate;\n    if (ii != ei && edge_predicate(*ii)) {\n      // After finding edges, lock dst and verify still active\n      dst->acquire(mflag);\n      if (!edge_predicate(*ii))\n        // need this to avoid returning a random iterator\n        ii = ei;\n    } else\n      ii = ei;\n    return boost::make_filter_iterator(edge_predicate, ii, ei);\n  }\n\n  /**\n   * Returns the edge data associated with the edge. It is an error to\n   * get the edge data for a non-existent edge.  It is an error to get\n   * edge data for inactive edges. By default, the mflag is\n   * galois::MethodFlag::UNPROTECTED because edge_begin() dominates this call\n   * and should perform the appropriate locking.\n   */\n  edge_data_reference\n  getEdgeData(edge_iterator ii,\n              galois::MethodFlag mflag = MethodFlag::UNPROTECTED) const {\n    assert(ii->first()->active);\n    // galois::runtime::checkWrite(mflag, false);\n    ii->first()->acquire(mflag);\n    return *ii->second();\n  }\n\n  /**\n   * Get edge data for an in-edge\n   */\n  edge_data_reference\n  getEdgeData(in_edge_iterator ii,\n              galois::MethodFlag mflag = MethodFlag::UNPROTECTED) const {\n    assert(ii->first()->active);\n    // galois::runtime::checkWrite(mflag, false);\n    ii->first()->acquire(mflag);\n    return *ii->second();\n  }\n\n  //! Returns the destination of an edge\n  GraphNode getEdgeDst(edge_iterator ii) {\n    assert(ii->first()->active);\n    return GraphNode(ii->first());\n  }\n\n  //! 
Returns the destination of an in-edge\n  GraphNode getEdgeDst(in_edge_iterator ii) {\n    assert(ii->first()->active);\n    return GraphNode(ii->first());\n  }\n\n  //! Sorts edge of a node by destination.\n  void sortEdgesByDst(GraphNode N,\n                      galois::MethodFlag mflag = MethodFlag::WRITE) {\n    acquire(N, mflag);\n    typedef typename gNode::EdgeInfo EdgeInfo;\n    std::sort(N->begin(), N->end(),\n              [=](const EdgeInfo& e1, const EdgeInfo& e2) {\n                return e1.first() < e2.first();\n              });\n  }\n\n  //! Sort all edges by destination\n  void sortAllEdgesByDst(MethodFlag mflag = MethodFlag::WRITE) {\n    galois::do_all(\n        galois::iterate(*this),\n        [=](GraphNode N) { this->sortEdgesByDst(N, mflag); }, galois::steal());\n  }\n\n  // General Things\n\n  //! Returns an iterator to the neighbors of a node\n  edge_iterator edge_begin(GraphNode N,\n                           galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(N);\n    N->acquire(mflag);\n\n    if (galois::runtime::shouldLock(mflag)) {\n      for (typename gNode::iterator ii = N->begin(), ee = N->end(); ii != ee;\n           ++ii) {\n        if (ii->first()->active && !ii->isInEdge())\n          ii->first()->acquire(mflag);\n      }\n    }\n    return boost::make_filter_iterator(is_out_edge(), N->begin(), N->end());\n  }\n\n  //! 
Returns an iterator to the in-neighbors of a node\n  template <bool _Undirected = !Directional>\n  in_edge_iterator\n  in_edge_begin(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE,\n                typename std::enable_if<!_Undirected>::type* = 0) {\n    assert(N);\n    N->acquire(mflag);\n\n    if (galois::runtime::shouldLock(mflag)) {\n      for (typename gNode::iterator ii = N->begin(), ee = N->end(); ii != ee;\n           ++ii) {\n        if (ii->first()->active && ii->isInEdge())\n          ii->first()->acquire(mflag);\n      }\n    }\n    return boost::make_filter_iterator(is_in_edge(), N->begin(), N->end());\n  }\n\n  //! Returns an iterator to the in-neighbors of a node; undirected case\n  //! in which it's the same as a regular neighbor\n  template <bool _Undirected = !Directional>\n  edge_iterator in_edge_begin(GraphNode N,\n                              galois::MethodFlag mflag = MethodFlag::WRITE,\n                              typename std::enable_if<_Undirected>::type* = 0) {\n    return edge_begin(N, mflag);\n  }\n\n  //! Returns the end of the neighbor edge iterator\n  edge_iterator\n  edge_end(GraphNode N,\n           galois::MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::WRITE) {\n    assert(N);\n    // Acquiring lock is not necessary: no valid use for an end pointer should\n    // ever require it\n    // N->acquire(mflag);\n    return boost::make_filter_iterator(is_out_edge(), N->end(), N->end());\n  }\n\n  //! Returns the end of an in-neighbor edge iterator\n  template <bool _Undirected = !Directional>\n  in_edge_iterator\n  in_edge_end(GraphNode N,\n              galois::MethodFlag GALOIS_UNUSED(mflag)      = MethodFlag::WRITE,\n              typename std::enable_if<!_Undirected>::type* = 0) {\n    assert(N);\n    // Acquiring lock is not necessary: no valid use for an end pointer should\n    // ever require it\n    // N->acquire(mflag);\n    return boost::make_filter_iterator(is_in_edge(), N->end(), N->end());\n  }\n\n  //! 
Returns the end of an in-neighbor edge iterator, undirected case\n  template <bool _Undirected = !Directional>\n  edge_iterator in_edge_end(GraphNode N,\n                            galois::MethodFlag mflag = MethodFlag::WRITE,\n                            typename std::enable_if<_Undirected>::type* = 0) {\n    return edge_end(N, mflag);\n  }\n\n  //! Return a range of edges that can be iterated over by C++ for-each\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  edges(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::make_no_deref_range(edge_begin(N, mflag),\n                                         edge_end(N, mflag));\n  }\n\n  //! Return a range of in-edges that can be iterated over by C++ for-each\n  template <bool _Undirected = !Directional>\n  runtime::iterable<NoDerefIterator<in_edge_iterator>>\n  in_edges(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE,\n           typename std::enable_if<!_Undirected>::type* = 0) {\n    return internal::make_no_deref_range(in_edge_begin(N, mflag),\n                                         in_edge_end(N, mflag));\n  }\n\n  //! Return a range of in-edges that can be iterated over by C++ for-each\n  //! Undirected case, equivalent to out-edge iteration\n  template <bool _Undirected = !Directional>\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  in_edges(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE,\n           typename std::enable_if<_Undirected>::type* = 0) {\n    return edges(N, mflag);\n  }\n\n  /**\n   * An object with begin() and end() methods to iterate over the outgoing\n   * edges of N.\n   */\n  internal::EdgesIterator<MorphGraph>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::EdgesIterator<MorphGraph>(*this, N, mflag);\n  }\n\n  /**\n   * Returns an iterator to all the nodes in the graph. 
Not thread-safe.\n   */\n  iterator begin() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.begin(), nodes.end()),\n        makeGraphNode());\n  }\n\n  //! Returns the end of the node iterator. Not thread-safe.\n  iterator end() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.end(), nodes.end()),\n        makeGraphNode());\n  }\n\n  //! local iterator over nodes\n  using local_iterator = iterator;\n\n  //! Return the beginning of local range of nodes\n  local_iterator local_begin() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.local_begin(),\n                                    nodes.local_end()),\n        makeGraphNode());\n  }\n\n  //! Return the end of local range of nodes\n  local_iterator local_end() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.local_end(),\n                                    nodes.local_end()),\n        makeGraphNode());\n  }\n\n  /**\n   * Returns the number of nodes in the graph. Not thread-safe.\n   */\n  unsigned int size() { return std::distance(begin(), end()); }\n\n  //! 
Returns the size of edge data.\n  size_t sizeOfEdgeData() const { return gNode::EdgeInfo::sizeOfSecond(); }\n\n#ifdef AUX_MAP\n  /**\n   * Allocate memory for nodes given a file graph with a particular number of\n   * nodes.\n   *\n   * @param graph FileGraph with a number of nodes to allocate\n   * @param aux Data structure in which to allocate space for nodes.\n   */\n  void allocateFrom(FileGraph& graph, ReadGraphAuxData& aux) {\n    size_t numNodes = graph.size();\n    aux.nodes.allocateInterleaved(numNodes);\n  }\n\n  /**\n   * Constructs the MorphGraph nodes given a FileGraph to construct it from.\n   * Meant to be called by multiple threads.\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in,out] aux Allocated memory to store newly created nodes\n   */\n  void constructNodesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                          ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      aux.nodes[*ii] = createNode();\n      addNode(aux.nodes[*ii], galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph edges given a FileGraph to construct it from and\n   * already created nodes.\n   * Meant to be called by multiple threads.\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  void constructOutEdgesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                             ReadGraphAuxData& aux) {\n 
   auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    auto& map = aux.inNghs.get();\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        auto dstID = graph.getEdgeDst(nn);\n        auto src = aux.nodes[*ii], dst = aux.nodes[dstID];\n        auto e = constructOutEdgeValue(graph, nn, src, dst);\n        if (!Directional || InOut) {\n          map[dstID].push_back({src, e});\n        }\n      }\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph in-edges given a FileGraph to construct it from\n   * and already created nodes. Meant to be called by multiple threads.\n   * DirectedNotInOut = false version\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  void constructInEdgesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                            const ReadGraphAuxData& aux) {\n    // only do it if not directioal or an inout graph\n    if (!Directional || InOut) {\n      auto r = graph\n                   .divideByNode(sizeof(gNode),\n                                 sizeof(typename gNode::EdgeInfo), tid, total)\n                   .first;\n\n      for (size_t i = 0; i < aux.inNghs.numRows(); ++i) {\n        const auto& map = aux.inNghs.get(i);\n        auto ii         = map.lower_bound(*(r.first));  // inclusive begin\n        auto ei         = map.lower_bound(*(r.second)); // exclusive end\n        for (; ii != ei; ++ii) {\n          auto dst = aux.nodes[ii->first];\n          for (const auto& ie : ii->second) {\n            
constructInEdgeValue(graph, ie.second, ie.first, dst);\n          }\n        }\n      }\n    }\n  }\n#else\n  /**\n   * Allocate memory for nodes given a file graph with a particular number of\n   * nodes.\n   *\n   * @param graph FileGraph with a number of nodes to allocate\n   * @param aux Data structure in which to allocate space for nodes.\n   */\n  void allocateFrom(FileGraph& graph, ReadGraphAuxData& aux) {\n    size_t numNodes = graph.size();\n    aux.allocateInterleaved(numNodes);\n\n    if (!DirectedNotInOut) {\n      galois::do_all(galois::iterate(size_t{0}, aux.size()),\n                     [&](size_t index) { aux.constructAt(index); });\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph nodes given a FileGraph to construct it from.\n   * Meant to be called by multiple threads.\n   * Version for DirectedNotInOut = false.\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in,out] aux Allocated memory to store newly created nodes\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<!V> constructNodesFrom(FileGraph& graph, unsigned tid,\n                                          unsigned total,\n                                          ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      auto& auxNode = aux[*ii].get();\n      auxNode.n     = createNode();\n      addNode(auxNode.n, galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph nodes given a FileGraph to construct it from.\n   * Meant to be called by multiple threads.\n   * Version for DirectedNotInOut = true.\n   *\n   * @param[in] graph FileGraph to construct a 
morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in,out] aux Allocated memory to store newly created nodes\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<V> constructNodesFrom(FileGraph& graph, unsigned tid,\n                                         unsigned total,\n                                         ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      aux[*ii] = createNode();\n      addNode(aux[*ii], galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph edges given a FileGraph to construct it from and\n   * already created nodes.\n   * Meant to be called by multiple threads.\n   * DirectedNotInOut = false version\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<!V> constructOutEdgesFrom(FileGraph& graph, unsigned tid,\n                                             unsigned total,\n                                             ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        auto src     = aux[*ii].get().n;\n     
   auto& dstAux = aux[graph.getEdgeDst(nn)].get();\n        auto e       = constructOutEdgeValue(graph, nn, src, dstAux.n);\n        dstAux.lock.lock();\n        dstAux.inNghs.push_back({src, e});\n        dstAux.lock.unlock();\n      }\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph edges given a FileGraph to construct it from and\n   * already created nodes.\n   * Meant to be called by multiple threads.\n   * DirectedNotInOut = true version\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<V> constructOutEdgesFrom(FileGraph& graph, unsigned tid,\n                                            unsigned total,\n                                            const ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        constructOutEdgeValue(graph, nn, aux[*ii], aux[graph.getEdgeDst(nn)]);\n      }\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph in-edges given a FileGraph to construct it from\n   * and already created nodes. 
Meant to be called by multiple threads.\n   * DirectedNotInOut = false version\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<!V> constructInEdgesFrom(FileGraph& graph, unsigned tid,\n                                            unsigned total,\n                                            ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      auto& auxNode = aux[*ii].get();\n      for (auto ie : auxNode.inNghs) {\n        constructInEdgeValue(graph, ie.second, ie.first, auxNode.n);\n      }\n    }\n  }\n\n  //! If a directed graph and no in-edges exist (i.e. DirectedNotInOut = true),\n  //! then construct in edges should do nothing.\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<V> constructInEdgesFrom(FileGraph&, unsigned, unsigned,\n                                           ReadGraphAuxData&) {}\n#endif\n};\n\n} // namespace graphs\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/MorphHyperGraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file MorphGraph.h\n *\n * Contains MorphGraph and associated helpers.\n */\n\n#ifndef GALOIS_GRAPHS_MORPHHYPERGRAPH_H\n#define GALOIS_GRAPHS_MORPHHYPERGRAPH_H\n\n#include <algorithm>\n#include <map>\n#include <set>\n#include <type_traits>\n#include <vector>\n\n#include <boost/container/small_vector.hpp>\n#include <boost/functional.hpp>\n#include <boost/iterator/transform_iterator.hpp>\n#include <boost/iterator/filter_iterator.hpp>\n\n#include \"galois/Bag.h\"\n#include \"galois/config.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/gstl.h\"\n\n#ifdef AUX_MAP\n#include \"galois/PerThreadContainer.h\"\n#else\n#include \"galois/substrate/CacheLineStorage.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#endif\n\nnamespace galois 
{\nnamespace graphs {\n\nnamespace internal {\n/**\n * Wrapper class to have a valid type on void edges\n */\ntemplate <typename NTy, typename ETy, bool DirectedButNotInOut>\nstruct UEdgeInfoBase;\n\ntemplate <typename NTy, typename ETy>\nstruct UEdgeInfoBase<NTy, ETy, true> {\n  typedef ETy& reference;\n\n  NTy* N;\n  ETy Ea;\n\n  inline NTy* first() {\n    assert(N);\n    return N;\n  }\n  inline NTy* first() const {\n    assert(N);\n    return N;\n  }\n  inline ETy* second() { return &Ea; }\n  inline const ETy* second() const { return &Ea; }\n\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, ETy*, bool, Args&&... args)\n      : N(n), Ea(std::forward<Args>(args)...) {}\n\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, ETy& v, bool, Args&&...) : N(n) {\n    Ea = v;\n  }\n\n  static size_t sizeOfSecond() { return sizeof(ETy); }\n  bool isInEdge() const { return false; }\n};\n\ntemplate <typename NTy, typename ETy>\nstruct UEdgeInfoBase<NTy, ETy, false> {\n  typedef ETy& reference;\n\n  NTy* N;\n  ETy* Ea;\n\n  inline NTy* first() {\n    assert(N);\n    return (NTy*)((uintptr_t)N & ~1);\n  }\n  inline NTy* first() const {\n    assert(N);\n    return (NTy*)((uintptr_t)N & ~1);\n  }\n  inline ETy* second() { return Ea; }\n  inline const ETy* second() const { return Ea; }\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, ETy* v, bool f, Args&&...)\n      : N((NTy*)((uintptr_t)n | f)), Ea(v) {}\n  static size_t sizeOfSecond() { return sizeof(ETy); }\n  bool isInEdge() const { return (uintptr_t)N & 1; }\n};\n\ntemplate <typename NTy>\nstruct UEdgeInfoBase<NTy, void, true> {\n  typedef char& reference;\n\n  NTy* N;\n  inline NTy* first() { return N; }\n  inline NTy* first() const { return N; }\n  inline char* second() const { return static_cast<char*>(NULL); }\n  inline char* addr() const { return second(); }\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, void*, bool, Args&&...) 
: N(n) {}\n  static size_t sizeOfSecond() { return 0; }\n  bool isInEdge() const { return false; }\n};\n\ntemplate <typename NTy>\nstruct UEdgeInfoBase<NTy, void, false> {\n  typedef char& reference;\n\n  NTy* N;\n  inline NTy* first() { return (NTy*)((uintptr_t)N & ~1); }\n  inline NTy* first() const { return (NTy*)((uintptr_t)N & ~1); }\n  inline char* second() const { return static_cast<char*>(NULL); }\n  inline char* addr() const { return second(); }\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, void*, bool f, Args&&...)\n      : N((NTy*)((uintptr_t)n | f)) {}\n  static size_t sizeOfSecond() { return 0; }\n  bool isInEdge() const { return (uintptr_t)N & 1; }\n};\n\n/*\n * Only graphs w/ in-out/symmetric edges and non-void edge data,\n * i.e. ETy != void and DirectedNotInOut = false,\n * need to allocate memory for edge data\n */\ntemplate <typename ETy, bool DirectedNotInOut>\nstruct EdgeFactory {\n  galois::InsertBag<ETy> mem;\n  template <typename... Args>\n  ETy* mkEdge(Args&&... args) {\n    return &mem.emplace(std::forward<Args>(args)...);\n  }\n  void delEdge(ETy*) {}\n  bool mustDel() const { return false; }\n};\n\ntemplate <typename ETy>\nstruct EdgeFactory<ETy, true> {\n  template <typename... Args>\n  ETy* mkEdge(Args&&...) {\n    return nullptr;\n  }\n  void delEdge(ETy*) {}\n  bool mustDel() const { return false; }\n};\n\ntemplate <>\nstruct EdgeFactory<void, false> {\n  template <typename... Args>\n  void* mkEdge(Args&&...) {\n    return static_cast<void*>(NULL);\n  }\n  void delEdge(void*) {}\n  bool mustDel() const { return false; }\n};\n\n} // namespace internal\n\n/**\n * A graph that can have new nodes and edges added to it.\n *\n * An example of use:\n *\n * \\code\n * struct Node {\n *   ... 
// Definition of node data\n * };\n *\n * typedef galois::graphs::MorphGraph<Node,int,true> Graph;\n *\n * // Create graph\n * Graph g;\n * Node n1, n2;\n * Graph::GraphNode a, b;\n * a = g.createNode(n1);\n * g.addNode(a);\n * b = g.createNode(n2);\n * g.addNode(b);\n * g.getEdgeData(g.addEdge(a, b)) = 5;\n *\n * // Traverse graph\n * for (Graph::iterator ii = g.begin(), ei = g.end(); ii != ei; ++ii) {\n *   Graph::GraphNode src = *ii;\n *   for (Graph::edge_iterator jj = g.edge_begin(src), ej = g.edge_end(src);\n *        jj != ej;\n *        ++jj) {\n *     Graph::GraphNode dst = g.getEdgeDst(jj);\n *     int edgeData = g.getEdgeData(jj);\n *     assert(edgeData == 5);\n *   }\n * }\n * \\endcode\n *\n * And in C++11:\n *\n * \\code\n * // Traverse graph\n * for (Graph::GraphNode src : g) {\n *   for (Graph::edge_iterator edge : g.out_edges(src)) {\n *     Graph::GraphNode dst = g.getEdgeDst(edge);\n *     int edgeData = g.getEdgeData(edge);\n *     assert(edgeData == 5);\n *   }\n * }\n * \\endcode\n *\n * @tparam NodeTy Type of node data\n * @tparam EdgeTy Type of edge data\n * @tparam Directional true if graph is directed\n * @tparam InOut true if directed graph tracks in-edges\n * @tparam HasNoLockable if true, use no abstract locks in the graph\n * @tparam SortedNeighbors Keep neighbors sorted (for faster findEdge)\n * @tparam FileEdgeTy type of edges on file to be read from\n */\ntemplate <typename NodeTy, typename EdgeTy, bool Directional,\n          bool InOut = false, bool HasNoLockable = false,\n          bool SortedNeighbors = false, typename FileEdgeTy = EdgeTy>\nclass MorphHyperGraph : private boost::noncopyable {\npublic:\n  /**\n   * Struct used to define the HasNoLockable template parameter as a type\n   * in the struct.\n   */\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    //! 
Type with Lockable parameter set according to struct template arg\n    using type = MorphHyperGraph<NodeTy, EdgeTy, Directional, InOut,\n                                 _has_no_lockable, SortedNeighbors, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the type of node data in the graph.\n   */\n  template <typename _node_data>\n  struct with_node_data {\n    //! Type with node data parameter set according to struct template arg\n    using type = MorphHyperGraph<_node_data, EdgeTy, Directional, InOut,\n                                 HasNoLockable, SortedNeighbors, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the type of edge data in the graph.\n   */\n  template <typename _edge_data>\n  struct with_edge_data {\n    //! Type with edge data parameter set according to struct template arg\n    using type = MorphHyperGraph<NodeTy, _edge_data, Directional, InOut,\n                                 HasNoLockable, SortedNeighbors, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define the type of file edge data in the graph.\n   */\n  template <typename _file_edge_data>\n  struct with_file_edge_data {\n    //! Type with file edge data parameter set according to struct template arg\n    using type =\n        MorphHyperGraph<NodeTy, EdgeTy, Directional, InOut, HasNoLockable,\n                        SortedNeighbors, _file_edge_data>;\n  };\n\n  /**\n   * Struct used to define directionality of the graph.\n   */\n  template <bool _directional>\n  struct with_directional {\n    //! Type with directional parameter set according to struct template arg\n    using type = MorphHyperGraph<NodeTy, EdgeTy, _directional, InOut,\n                                 HasNoLockable, SortedNeighbors, FileEdgeTy>;\n  };\n\n  /**\n   * Struct used to define if neighbors are sorted or not in the graph.\n   */\n  template <bool _sorted_neighbors>\n  struct with_sorted_neighbors {\n    //! 
Type with sort neighbor parameter set according to struct template arg\n    using type = MorphHyperGraph<NodeTy, EdgeTy, Directional, InOut,\n                                 HasNoLockable, _sorted_neighbors, FileEdgeTy>;\n  };\n\n  //! Tag that defines to graph reader how to read a graph into this class\n  using read_tag = read_with_aux_first_graph_tag;\n\nprivate: ///////////////////////////////////////////////////////////////////////\n  template <typename T>\n  struct first_eq_and_valid {\n    T N2;\n    first_eq_and_valid(T& n) : N2(n) {}\n    template <typename T2>\n    bool operator()(const T2& ii) const {\n      return ii.first() == N2 && ii.first() && ii.first()->active;\n    }\n  };\n\n  struct first_not_valid {\n    template <typename T2>\n    bool operator()(const T2& ii) const {\n      return !ii.first() || !ii.first()->active;\n    }\n  };\n\n  template <typename T>\n  struct first_lt {\n    template <typename T2>\n    bool operator()(const T& N2, const T2& ii) const {\n      assert(ii.first() && \"UNEXPECTED: invalid item in edgelist\");\n      return N2 < ii.first();\n    }\n    template <typename T2>\n    bool operator()(const T2& ii, const T& N2) const {\n      assert(ii.first() && \"UNEXPECTED: invalid item in edgelist\");\n      return ii.first() < N2;\n    }\n  };\n\n  // forward declaration for graph node type\n  class gNode;\n  struct gNodeTypes\n      : public internal::NodeInfoBaseTypes<NodeTy, !HasNoLockable> {\n    //! The storage type for an edge\n    using EdgeInfo =\n        internal::UEdgeInfoBase<gNode, EdgeTy, Directional & !InOut>;\n\n    //! The storage type for edges\n    // typedef galois::gstl::Vector<EdgeInfo> EdgesTy;\n    using EdgesTy = boost::container::small_vector<\n        EdgeInfo, 3, galois::runtime::Pow_2_BlockAllocator<EdgeInfo>>;\n\n    using iterator = typename EdgesTy::iterator;\n  };\n\n  class gNode : public internal::NodeInfoBase<NodeTy, !HasNoLockable>,\n                public gNodeTypes {\n    //! 
friend of MorphHyperGraph since MorphHyperGraph contains gNodes\n    friend class MorphHyperGraph;\n    //! Storage type for node\n    using NodeInfo = internal::NodeInfoBase<NodeTy, !HasNoLockable>;\n    //! iterator over edges (taken from gNodeTypes)\n    using iterator = typename gNode::iterator;\n    //! Storage type of a single edge (taken from gNodeTypes)\n    using EdgeInfo = typename gNode::EdgeInfo;\n\n    //! edges on this node\n    typename gNodeTypes::EdgesTy edges;\n\n    //! Tracks if this node is considered as \"in\" the graph\n    bool active;\n\n    //! Return iterator to first edge\n    iterator begin() { return edges.begin(); }\n    //! Return iterator to end of edges\n    iterator end() { return edges.end(); }\n\n    //! Remove the provided edge from this node\n    //! @param ii iterator to edge to remove\n    void erase(iterator ii) {\n      if (SortedNeighbors) {\n        // For sorted case remove the element, moving following\n        // elements back to fill the space.\n        edges.erase(ii);\n      } else {\n        // We don't need to preserve the order, so move the last edge\n        // into this place and then remove last edge.\n        *ii = edges.back();\n        edges.pop_back();\n      }\n    }\n\n    /**\n     * Erase an edge with a provided destination.\n     */\n    void erase(gNode* N, bool inEdge = false) {\n      iterator ii = find(N, inEdge);\n      if (ii != end())\n        edges.erase(ii);\n    }\n\n    /**\n     * Find an edge with a particular destination node.\n     */\n    iterator find(gNode* N, bool inEdge = false) {\n      iterator ii, ei = edges.end();\n      // find starting point to start search\n      if (SortedNeighbors) {\n        assert(std::is_sorted(edges.begin(), edges.end(),\n                              [=](const EdgeInfo& e1, const EdgeInfo& e2) {\n                                return e1.first() < e2.first();\n                              }));\n        ii =\n            
std::lower_bound(edges.begin(), edges.end(), N, first_lt<gNode*>());\n      } else {\n        ii = edges.begin();\n      }\n\n      first_eq_and_valid<gNode*> checker(N);\n      ii = std::find_if(ii, ei, checker);\n      while (ii != ei && ii->isInEdge() != inEdge) {\n        ++ii;\n        ii = std::find_if(ii, ei, checker);\n      };\n      return ii;\n    }\n\n    /**\n     * Make space for more edges stored by this node\n     */\n    void resizeEdges(size_t size) {\n      edges.resize(size, EdgeInfo(new gNode(), 0));\n    }\n\n    /**\n     * Add a new edge to this node\n     */\n    template <typename... Args>\n    iterator createEdge(gNode* N, EdgeTy* v, bool inEdge, Args&&... args) {\n      iterator ii;\n      if (SortedNeighbors) {\n        // If neighbors are sorted, find appropriate insertion point.\n        // Insert before first neighbor that is too far.\n        ii =\n            std::upper_bound(edges.begin(), edges.end(), N, first_lt<gNode*>());\n      } else {\n        ii = edges.end();\n      }\n\n      return edges.insert(ii,\n                          EdgeInfo(N, v, inEdge, std::forward<Args>(args)...));\n    }\n\n    /**\n     * Add an edge to this node; if space exists to add it in, then reuse that\n     * space.\n     */\n    template <typename... Args>\n    iterator createEdgeWithReuse(gNode* N, EdgeTy* v, bool inEdge,\n                                 Args&&... 
args) {\n      // First check for holes\n      iterator ii, ei;\n      if (SortedNeighbors) {\n        // If neighbors are sorted, find acceptable range for insertion.\n        ii =\n            std::lower_bound(edges.begin(), edges.end(), N, first_lt<gNode*>());\n        ei = std::upper_bound(ii, edges.end(), N, first_lt<gNode*>());\n      } else {\n        // If not sorted, we can insert anywhere in the list.\n        ii = edges.begin();\n        ei = edges.end();\n      }\n      ii = std::find_if(ii, ei, first_not_valid());\n      if (ii != ei) {\n        // FIXME: We could move elements around (short distances).\n        *ii = EdgeInfo(N, v, inEdge, std::forward<Args>(args)...);\n        return ii;\n      }\n      return edges.insert(ei,\n                          EdgeInfo(N, v, inEdge, std::forward<Args>(args)...));\n    }\n\n    template <bool _A1 = HasNoLockable>\n    void acquire(MethodFlag mflag, typename std::enable_if<!_A1>::type* = 0) {\n      galois::runtime::acquire(this, mflag);\n    }\n\n    template <bool _A1 = HasNoLockable>\n    void acquire(MethodFlag, typename std::enable_if<_A1>::type* = 0) {}\n\n  public:\n    int gain;\n    template <typename... Args>\n    gNode(Args&&... args)\n        : NodeInfo(std::forward<Args>(args)...), active(false) {}\n  };\n\n  // The graph manages the lifetimes of the data in the nodes and edges\n  //! Container for nodes\n  using NodeListTy = galois::InsertBag<gNode>;\n  using Bnodes     = galois::InsertBag<gNode*>;\n  //! 
nodes in this graph\n  NodeListTy nodes;\n  Bnodes cells;\n  Bnodes nets;\n\n  internal::EdgeFactory<EdgeTy, Directional && !InOut> edgesF;\n\n  // Helpers for iterator classes\n  struct is_node {\n    bool operator()(const gNode& g) const { return g.active; }\n  };\n  struct is_edge {\n    bool operator()(typename gNodeTypes::EdgeInfo& e) const {\n      return e.first()->active;\n    }\n  };\n  struct is_in_edge {\n    bool operator()(typename gNodeTypes::EdgeInfo& e) const {\n      return e.first()->active && e.isInEdge();\n    }\n  };\n  struct is_out_edge {\n    bool operator()(typename gNodeTypes::EdgeInfo& e) const {\n      return e.first()->active && !e.isInEdge();\n    }\n  };\n  struct makeGraphNode {\n    gNode* operator()(gNode& data) const { return &data; }\n  };\n\npublic: ////////////////////////////////////////////////////////////////////////\n  //! Graph node handle\n  using GraphNode = gNode*;\n  //! Edge data type\n  using edge_data_type = EdgeTy;\n  //! Edge data type of file we are loading this graph from\n  using file_edge_data_type = FileEdgeTy;\n  //! Node data type\n  using node_data_type = NodeTy;\n  //! (Out or Undirected) Edge iterator\n  using edge_iterator =\n      typename boost::filter_iterator<is_out_edge,\n                                      typename gNodeTypes::iterator>;\n  //! In Edge iterator\n  using in_edge_iterator =\n      typename boost::filter_iterator<is_in_edge,\n                                      typename gNodeTypes::iterator>;\n\n  //! Reference to edge data\n  using edge_data_reference = typename gNodeTypes::EdgeInfo::reference;\n  //! Reference to node data\n  using node_data_reference = typename gNodeTypes::reference;\n  //! 
Node iterator\n  using iterator = boost::transform_iterator<\n      makeGraphNode,\n      boost::filter_iterator<is_node, typename NodeListTy::iterator>>;\n\n  gstl::Vector<GraphNode> locked_cells;\n  int max_cell_area;\n  int total_area;\n  std::list<int> freeCells;\n\n  gstl::Vector<int> maxgain;\n  gstl::Vector<int> balance;\n#ifdef AUX_MAP\n  //! Auxiliary data for nodes that stores in neighbors in per thread storage\n  //! accessed through a map\n  struct ReadGraphAuxData {\n    LargeArray<GraphNode> nodes;\n    //! stores in neighbors\n    galois::PerThreadMap<FileGraph::GraphNode,\n                         galois::gstl::Vector<std::pair<GraphNode, EdgeTy*>>>\n        inNghs;\n  };\n#else\n  //! Wrapper around a graph node that provides a lock for it as well as\n  //! in-neighbor tracking\n  struct AuxNode {\n    //! lock for wrapped graph node\n    galois::substrate::SimpleLock lock;\n    //! single graph node wrapped by this struct\n    GraphNode n;\n    //! stores in neighbors\n    galois::gstl::Vector<std::pair<GraphNode, EdgeTy*>> inNghs;\n  };\n  //! Padded version of AuxNode\n  using AuxNodePadded = typename galois::substrate::CacheLineStorage<AuxNode>;\n\n  //! True if a node is both directional and not storing both in and out\n  //! edges\n  constexpr static const bool DirectedNotInOut = (Directional && !InOut);\n  //! Large array that contains auxiliary data for each node (AuxNodes)\n  using ReadGraphAuxData =\n      typename std::conditional<DirectedNotInOut, LargeArray<GraphNode>,\n                                LargeArray<AuxNodePadded>>::type;\n#endif\n\nprivate: ///////////////////////////////////////////////////////////////////////\n  template <typename... Args>\n  edge_iterator createEdgeWithReuse(GraphNode src, GraphNode dst,\n                                    galois::MethodFlag mflag, Args&&... 
args) {\n    assert(src);\n    assert(dst);\n    // galois::runtime::checkWrite(mflag, true);\n    src->acquire(mflag);\n    typename gNode::iterator ii = src->find(dst);\n    // add edge only if it doesn't already exist\n    if (ii == src->end()) {\n      if (Directional && !InOut) {\n        ii = src->createEdgeWithReuse(dst, 0, false,\n                                      std::forward<Args>(args)...);\n      } else {\n        dst->acquire(mflag);\n        EdgeTy* e = edgesF.mkEdge(std::forward<Args>(args)...);\n        ii        = dst->createEdgeWithReuse(src, e, Directional ? true : false,\n                                      std::forward<Args>(args)...);\n        ii        = src->createEdgeWithReuse(dst, e, false,\n                                      std::forward<Args>(args)...);\n      }\n    }\n    return boost::make_filter_iterator(is_out_edge(), ii, src->end());\n  }\n\n  template <typename... Args>\n  edge_iterator createEdge(GraphNode src, GraphNode dst,\n                           galois::MethodFlag mflag, Args&&... args) {\n    assert(src);\n    assert(dst);\n    // galois::runtime::checkWrite(mflag, true);\n    src->acquire(mflag);\n    typename gNode::iterator ii = src->end();\n    // add edge only if it doesn't already exist\n    if (ii == src->end()) {\n      if (Directional && !InOut) {\n        ii = src->createEdge(dst, 0, false, std::forward<Args>(args)...);\n      } else {\n        dst->acquire(mflag);\n        EdgeTy* e = edgesF.mkEdge(std::forward<Args>(args)...);\n        ii        = dst->createEdge(src, e, Directional ? true : false,\n                             std::forward<Args>(args)...);\n        ii        = src->createEdge(dst, e, false, std::forward<Args>(args)...);\n      }\n    }\n    return boost::make_filter_iterator(is_out_edge(), ii, src->end());\n  }\n\n  /**\n   * Creates an outgoing edge at src for the edge from src to dst.\n   * Only called by constructOutEdgeValue.\n   */\n  template <typename... 
Args>\n  EdgeTy* createOutEdge(GraphNode src, GraphNode dst, galois::MethodFlag mflag,\n                        Args&&... args) {\n    assert(src);\n    assert(dst);\n\n    src->acquire(mflag);\n    typename gNode::iterator ii = src->end();\n    if (ii == src->end()) {\n      dst->acquire(mflag);\n      EdgeTy* e = edgesF.mkEdge(std::forward<Args>(args)...);\n      ii        = src->createEdge(dst, e, false, std::forward<Args>(args)...);\n      return e;\n    }\n    return nullptr;\n  }\n\n  /**\n   * Creates an incoming edge at dst for the edge from src to dst.\n   * Only called by constructInEdgeValue.\n   * Reuse data from the corresponding outgoing edge.\n   */\n  template <typename... Args>\n  void createInEdge(GraphNode src, GraphNode dst, EdgeTy* e,\n                    galois::MethodFlag mflag, Args&&... args) {\n    assert(src);\n    assert(dst);\n\n    dst->acquire(mflag);\n    typename gNode::iterator ii = dst->end();\n    if (ii == dst->end()) {\n      src->acquire(mflag);\n      ii = dst->createEdge(src, e, Directional ? 
true : false,\n                           std::forward<Args>(args)...);\n    }\n  }\n\n  template <bool _A1 = LargeArray<EdgeTy>::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  EdgeTy*\n  constructOutEdgeValue(FileGraph& graph, typename FileGraph::edge_iterator nn,\n                        GraphNode src, GraphNode dst,\n                        typename std::enable_if<!_A1 || _A2>::type* = 0) {\n    typedef typename LargeArray<FileEdgeTy>::value_type FEDV;\n    typedef LargeArray<EdgeTy> ED;\n    if (ED::has_value) {\n      return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED,\n                           graph.getEdgeData<FEDV>(nn));\n    } else {\n      return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  template <bool _A1 = LargeArray<EdgeTy>::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  EdgeTy*\n  constructOutEdgeValue(FileGraph&, typename FileGraph::edge_iterator,\n                        GraphNode src, GraphNode dst,\n                        typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED);\n  }\n\n  // will reuse edge data from outgoing edges\n  void constructInEdgeValue(FileGraph&, EdgeTy* e, GraphNode src,\n                            GraphNode dst) {\n    createInEdge(src, dst, e, galois::MethodFlag::UNPROTECTED);\n  }\n\npublic\n    : /////////////////////////////////////////////////////////////////////////\n  /**\n   * Creates a new node holding the indicated data. Usually you should call\n   * {@link addNode()} afterwards.\n   *\n   * @param[in] args constructor arguments for node data\n   * @returns newly created graph node\n   */\n  template <typename... Args>\n  GraphNode createNode(Args&&... 
args) {\n    gNode* N  = &(nodes.emplace(std::forward<Args>(args)...));\n    N->active = false;\n    return GraphNode(N);\n  }\n\n  /**\n   * Adds a node to the graph.\n   */\n  void addNode(const GraphNode& n,\n               galois::MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, true);\n    n->acquire(mflag);\n    n->active = true;\n  }\n\n  //! Gets the node data for a node.\n  node_data_reference\n  getData(const GraphNode& n,\n          galois::MethodFlag mflag = MethodFlag::WRITE) const {\n    assert(n);\n    // galois::runtime::checkWrite(mflag, false);\n    n->acquire(mflag);\n    return n->getData();\n  }\n\n  //! Checks if a node is in the graph\n  //! @returns true if the node is in the graph\n  bool containsNode(const GraphNode& n,\n                    galois::MethodFlag mflag = MethodFlag::WRITE) const {\n    assert(n);\n    n->acquire(mflag);\n    return n->active;\n  }\n\n  /**\n   * Removes a node from the graph along with all its outgoing/incoming edges\n   * for undirected graphs or outgoing edges for directed graphs.\n   *\n   * @todo handle edge memory\n   */\n  void removeNode(GraphNode n, galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(n);\n    // galois::runtime::checkWrite(mflag, true);\n    n->acquire(mflag);\n    gNode* N = n;\n    if (N->active) {\n      N->active = false;\n      N->edges.clear();\n    }\n  }\n\n  /**\n   * Resize the edges of the node. For best performance, should be done\n   * serially.\n   */\n  void resizeEdges(GraphNode src, size_t size,\n                   galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    // galois::runtime::checkWrite(mflag, false);\n    src->acquire(mflag);\n    src->resizeEdges(size);\n  }\n\n  /**\n   * Adds an edge to graph, replacing existing value if edge already exists.\n   *\n   * Ignore the edge data, let the caller use the returned iterator to set the\n   * value if desired.  
This frees us from dealing with the void edge data\n   * problem in this API\n   */\n  edge_iterator addEdge(GraphNode src, GraphNode dst,\n                        galois::MethodFlag mflag = MethodFlag::WRITE) {\n    return createEdgeWithReuse(src, dst, mflag);\n  }\n\n  //! Adds and initializes an edge to graph but does not check for duplicate\n  //! edges\n  template <typename... Args>\n  edge_iterator addMultiEdge(GraphNode src, GraphNode dst,\n                             galois::MethodFlag mflag, Args&&... args) {\n    return createEdge(src, dst, mflag, std::forward<Args>(args)...);\n  }\n\n  //! Removes an edge from the graph\n  void removeEdge(GraphNode src, edge_iterator dst,\n                  galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    // galois::runtime::checkWrite(mflag, true);\n    src->acquire(mflag);\n    if (Directional && !InOut) {\n      src->erase(dst.base());\n    } else {\n      dst->first()->acquire(mflag);\n      // EdgeTy* e = dst->second();\n      dst->first()->erase(\n          src, Directional ? true : false); // erase incoming/symmetric edge\n      src->erase(dst.base());\n    }\n  }\n\n  //! Finds if an edge between src and dst exists\n  edge_iterator findEdge(GraphNode src, GraphNode dst,\n                         galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    assert(dst);\n    src->acquire(mflag);\n    typename gNodeTypes::iterator ii = src->find(dst), ei = src->end();\n    is_out_edge edge_predicate;\n    if (ii != ei && edge_predicate(*ii)) {\n      // After finding edge, lock dst and verify still active\n      dst->acquire(mflag);\n      if (!edge_predicate(*ii))\n        // I think we need this too, else we'll return some random iterator.\n        ii = ei;\n    } else {\n      ii = ei;\n    }\n    return boost::make_filter_iterator(edge_predicate, ii, ei);\n  }\n\n  //! Find/return edge between src/dst if it exists; assumes that edges\n  //! 
are sorted by destination\n  edge_iterator\n  findEdgeSortedByDst(GraphNode src, GraphNode dst,\n                      galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    assert(dst);\n    src->acquire(mflag);\n    assert(std::is_sorted(src->begin(), src->end(),\n                          [=](const typename gNode::EdgeInfo& e1,\n                              const typename gNode::EdgeInfo& e2) {\n                            return e1.first() < e2.first();\n                          }));\n\n    auto ei = src->end();\n\n    // jump directly to edges with destination we are looking for\n    auto ii =\n        std::lower_bound(src->begin(), src->end(), dst, first_lt<gNode*>());\n\n    first_eq_and_valid<gNode*> checker(dst);\n    ii = std::find_if(ii, ei, checker); // bug if ei set to upper_bound\n    // ignore in edges\n    while (ii != ei && ii->isInEdge()) {\n      ++ii;\n      ii = std::find_if(ii, ei, checker);\n    };\n\n    // make sure destination node is active else return end iterator\n    is_out_edge edge_predicate;\n    if (ii != ei) {\n      dst->acquire(mflag);\n      if (!edge_predicate(*ii)) {\n        ii = ei;\n      }\n    }\n    return boost::make_filter_iterator(edge_predicate, ii, ei);\n  }\n\n  //! Find a particular in-edge: note this function activates for the undirected\n  //! graph case, so it just calls the regular out-edge finding function\n  template <bool _Undirected = !Directional>\n  edge_iterator findInEdge(GraphNode src, GraphNode dst,\n                           galois::MethodFlag mflag = MethodFlag::WRITE,\n                           typename std::enable_if<_Undirected>::type* = 0) {\n    // incoming neighbors are the same as outgoing neighbors in undirected\n    // graphs\n    return findEdge(src, dst, mflag);\n  }\n\n  //! Find if an incoming edge between src and dst exists for directed in-out\n  //! 
graphs\n  template <bool _DirectedInOut = (Directional && InOut)>\n  in_edge_iterator\n  findInEdge(GraphNode src, GraphNode dst,\n             galois::MethodFlag mflag                       = MethodFlag::WRITE,\n             typename std::enable_if<_DirectedInOut>::type* = 0) {\n    assert(src);\n    assert(dst);\n    src->acquire(mflag);\n    typename gNodeTypes::iterator ii = src->find(dst, true), ei = src->end();\n    is_in_edge edge_predicate;\n    if (ii != ei && edge_predicate(*ii)) {\n      // After finding edges, lock dst and verify still active\n      dst->acquire(mflag);\n      if (!edge_predicate(*ii))\n        // need this to avoid returning a random iterator\n        ii = ei;\n    } else\n      ii = ei;\n    return boost::make_filter_iterator(edge_predicate, ii, ei);\n  }\n\n  /**\n   * Returns the edge data associated with the edge. It is an error to\n   * get the edge data for a non-existent edge.  It is an error to get\n   * edge data for inactive edges. By default, the mflag is\n   * galois::MethodFlag::UNPROTECTED because edge_begin() dominates this call\n   * and should perform the appropriate locking.\n   */\n  edge_data_reference\n  getEdgeData(edge_iterator ii,\n              galois::MethodFlag mflag = MethodFlag::UNPROTECTED) const {\n    assert(ii->first()->active);\n    // galois::runtime::checkWrite(mflag, false);\n    ii->first()->acquire(mflag);\n    return *ii->second();\n  }\n\n  /**\n   * Get edge data for an in-edge\n   */\n  edge_data_reference\n  getEdgeData(in_edge_iterator ii,\n              galois::MethodFlag mflag = MethodFlag::UNPROTECTED) const {\n    assert(ii->first()->active);\n    // galois::runtime::checkWrite(mflag, false);\n    ii->first()->acquire(mflag);\n    return *ii->second();\n  }\n\n  //! Returns the destination of an edge\n  GraphNode getEdgeDst(edge_iterator ii) {\n    assert(ii->first()->active);\n    return GraphNode(ii->first());\n  }\n\n  //! 
Returns the destination of an in-edge\n  GraphNode getEdgeDst(in_edge_iterator ii) {\n    assert(ii->first()->active);\n    return GraphNode(ii->first());\n  }\n\n  //! Sorts edge of a node by destination.\n  void sortEdgesByDst(GraphNode N,\n                      galois::MethodFlag mflag = MethodFlag::WRITE) {\n    acquire(N, mflag);\n    typedef typename gNode::EdgeInfo EdgeInfo;\n    std::sort(N->begin(), N->end(),\n              [=](const EdgeInfo& e1, const EdgeInfo& e2) {\n                return e1.first() < e2.first();\n              });\n  }\n\n  //! Sort all edges by destination\n  void sortAllEdgesByDst(MethodFlag mflag = MethodFlag::WRITE) {\n    galois::do_all(\n        galois::iterate(*this),\n        [=](GraphNode N) { this->sortEdgesByDst(N, mflag); }, galois::steal());\n  }\n\n  // General Things\n  void sortEdgesByDeg(GraphNode N,\n                      galois::MethodFlag mflag = MethodFlag::WRITE) {\n    acquire(N, mflag);\n    typedef typename gNode::EdgeInfo EdgeInfo;\n    std::sort(N->begin(), N->end(),\n              [=](const EdgeInfo& e1, const EdgeInfo& e2) {\n                return getallneighbor(e1.first()).size() <\n                       getallneighbor(e2.first()).size();\n              });\n  }\n\n  // Sort cells in a net by their degree\n  void sortCellDegree(MethodFlag mflag = MethodFlag::WRITE) {\n    galois::do_all(galois::iterate(this->getNets()),\n                   [=](GraphNode N) { this->sortEdgesByDeg(N, mflag); });\n  }\n\n  //! 
Returns an iterator to the neighbors of a node\n  edge_iterator edge_begin(GraphNode N,\n                           galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(N);\n    N->acquire(mflag);\n\n    if (galois::runtime::shouldLock(mflag)) {\n      for (typename gNode::iterator ii = N->begin(), ee = N->end(); ii != ee;\n           ++ii) {\n        if (ii->first()->active && !ii->isInEdge())\n          ii->first()->acquire(mflag);\n      }\n    }\n    return boost::make_filter_iterator(is_out_edge(), N->begin(), N->end());\n  }\n\n  //! Returns an iterator to the in-neighbors of a node\n  template <bool _Undirected = !Directional>\n  in_edge_iterator\n  in_edge_begin(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE,\n                typename std::enable_if<!_Undirected>::type* = 0) {\n    assert(N);\n    N->acquire(mflag);\n\n    if (galois::runtime::shouldLock(mflag)) {\n      for (typename gNode::iterator ii = N->begin(), ee = N->end(); ii != ee;\n           ++ii) {\n        if (ii->first()->active && ii->isInEdge())\n          ii->first()->acquire(mflag);\n      }\n    }\n    return boost::make_filter_iterator(is_in_edge(), N->begin(), N->end());\n  }\n\n  //! Returns an iterator to the in-neighbors of a node; undirected case\n  //! in which it's the same as a regular neighbor\n  template <bool _Undirected = !Directional>\n  edge_iterator in_edge_begin(GraphNode N,\n                              galois::MethodFlag mflag = MethodFlag::WRITE,\n                              typename std::enable_if<_Undirected>::type* = 0) {\n    return edge_begin(N, mflag);\n  }\n\n  //! 
Returns the end of the neighbor edge iterator\n  edge_iterator\n  edge_end(GraphNode N,\n           galois::MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::WRITE) {\n    assert(N);\n    // Acquiring lock is not necessary: no valid use for an end pointer should\n    // ever require it\n    // N->acquire(mflag);\n    return boost::make_filter_iterator(is_out_edge(), N->end(), N->end());\n  }\n\n  //! Returns the end of an in-neighbor edge iterator\n  template <bool _Undirected = !Directional>\n  in_edge_iterator\n  in_edge_end(GraphNode N,\n              galois::MethodFlag GALOIS_UNUSED(mflag)      = MethodFlag::WRITE,\n              typename std::enable_if<!_Undirected>::type* = 0) {\n    assert(N);\n    // Acquiring lock is not necessary: no valid use for an end pointer should\n    // ever require it\n    // N->acquire(mflag);\n    return boost::make_filter_iterator(is_in_edge(), N->end(), N->end());\n  }\n\n  //! Returns the end of an in-neighbor edge iterator, undirected case\n  template <bool _Undirected = !Directional>\n  edge_iterator in_edge_end(GraphNode N,\n                            galois::MethodFlag mflag = MethodFlag::WRITE,\n                            typename std::enable_if<_Undirected>::type* = 0) {\n    return edge_end(N, mflag);\n  }\n\n  //! Return a range of edges that can be iterated over by C++ for-each\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  edges(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::make_no_deref_range(edge_begin(N, mflag),\n                                         edge_end(N, mflag));\n  }\n\n  //! 
Return a range of in-edges that can be iterated over by C++ for-each\n  template <bool _Undirected = !Directional>\n  runtime::iterable<NoDerefIterator<in_edge_iterator>>\n  in_edges(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE,\n           typename std::enable_if<!_Undirected>::type* = 0) {\n    return internal::make_no_deref_range(in_edge_begin(N, mflag),\n                                         in_edge_end(N, mflag));\n  }\n\n  //! Return a range of in-edges that can be iterated over by C++ for-each\n  //! Undirected case, equivalent to out-edge iteration\n  template <bool _Undirected = !Directional>\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  in_edges(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE,\n           typename std::enable_if<_Undirected>::type* = 0) {\n    return edges(N, mflag);\n  }\n\n  /**\n   * An object with begin() and end() methods to iterate over the outgoing\n   * edges of N.\n   */\n  internal::EdgesIterator<MorphHyperGraph>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::EdgesIterator<MorphHyperGraph>(*this, N, mflag);\n  }\n\n  /**\n   * Returns an iterator to all the nodes in the graph. Not thread-safe.\n   */\n  iterator begin() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.begin(), nodes.end()),\n        makeGraphNode());\n  }\n\n  //! Returns the end of the node iterator. Not thread-safe.\n  iterator end() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.end(), nodes.end()),\n        makeGraphNode());\n  }\n\n  //! local iterator over nodes\n  using local_iterator = iterator;\n\n  //! 
Return the beginning of local range of nodes\n  local_iterator local_begin() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.local_begin(),\n                                    nodes.local_end()),\n        makeGraphNode());\n  }\n\n  //! Return the end of local range of nodes\n  local_iterator local_end() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.local_end(),\n                                    nodes.local_end()),\n        makeGraphNode());\n  }\n\n  /**\n   * Returns the number of nodes in the graph. Not thread-safe.\n   */\n  unsigned int size() { return std::distance(begin(), end()); }\n\n  //! Returns the size of edge data.\n  size_t sizeOfEdgeData() const { return gNode::EdgeInfo::sizeOfSecond(); }\n\n#ifdef AUX_MAP\n  /**\n   * Allocate memory for nodes given a file graph with a particular number of\n   * nodes.\n   *\n   * @param graph FileGraph with a number of nodes to allocate\n   * @param aux Data structure in which to allocate space for nodes.\n   */\n  void allocateFrom(FileGraph& graph, ReadGraphAuxData& aux) {\n    size_t numNodes = graph.size();\n    aux.nodes.allocateInterleaved(numNodes);\n  }\n\n  /**\n   * Constructs the MorphGraph nodes given a FileGraph to construct it from.\n   * Meant to be called by multiple threads.\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in,out] aux Allocated memory to store newly created nodes\n   */\n  void constructNodesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                          ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    for (FileGraph::iterator ii = r.first, ei = 
r.second; ii != ei; ++ii) {\n      aux.nodes[*ii] = createNode();\n      addNode(aux.nodes[*ii], galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph edges given a FileGraph to construct it from and\n   * already created nodes.\n   * Meant to be called by multiple threads.\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  void constructOutEdgesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                             ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    auto& map = aux.inNghs.get();\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        auto dstID = graph.getEdgeDst(nn);\n        auto src = aux.nodes[*ii], dst = aux.nodes[dstID];\n        auto e = constructOutEdgeValue(graph, nn, src, dst);\n        if (!Directional || InOut) {\n          map[dstID].push_back({src, e});\n        }\n      }\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph in-edges given a FileGraph to construct it from\n   * and already created nodes. 
Meant to be called by multiple threads.\n   * DirectedNotInOut = false version\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  void constructInEdgesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                            const ReadGraphAuxData& aux) {\n    // only do it if not directioal or an inout graph\n    if (!Directional || InOut) {\n      auto r = graph\n                   .divideByNode(sizeof(gNode),\n                                 sizeof(typename gNode::EdgeInfo), tid, total)\n                   .first;\n\n      for (size_t i = 0; i < aux.inNghs.numRows(); ++i) {\n        const auto& map = aux.inNghs.get(i);\n        auto ii         = map.lower_bound(*(r.first));  // inclusive begin\n        auto ei         = map.lower_bound(*(r.second)); // exclusive end\n        for (; ii != ei; ++ii) {\n          auto dst = aux.nodes[ii->first];\n          for (const auto& ie : ii->second) {\n            constructInEdgeValue(graph, ie.second, ie.first, dst);\n          }\n        }\n      }\n    }\n  }\n#else\n  /**\n   * Allocate memory for nodes given a file graph with a particular number of\n   * nodes.\n   *\n   * @param graph FileGraph with a number of nodes to allocate\n   * @param aux Data structure in which to allocate space for nodes.\n   */\n  void allocateFrom(FileGraph& graph, ReadGraphAuxData& aux) {\n    size_t numNodes = graph.size();\n    aux.allocateInterleaved(numNodes);\n\n    if (!DirectedNotInOut) {\n      galois::do_all(galois::iterate(0ul, aux.size()),\n                     [&](size_t index) { aux.constructAt(index); });\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph nodes given a FileGraph to construct it from.\n   * Meant to be called by multiple threads.\n   * Version for DirectedNotInOut = false.\n   
*\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in,out] aux Allocated memory to store newly created nodes\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<!V> constructNodesFrom(FileGraph& graph, unsigned tid,\n                                          unsigned total,\n                                          ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      auto& auxNode = aux[*ii].get();\n      auxNode.n     = createNode();\n      addNode(auxNode.n, galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph nodes given a FileGraph to construct it from.\n   * Meant to be called by multiple threads.\n   * Version for DirectedNotInOut = true.\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in,out] aux Allocated memory to store newly created nodes\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<V> constructNodesFrom(FileGraph& graph, unsigned tid,\n                                         unsigned total,\n                                         ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      aux[*ii] = createNode();\n      addNode(aux[*ii], galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  /**\n   * Constructs the 
MorphGraph edges given a FileGraph to construct it from and\n   * already created nodes.\n   * Meant to be called by multiple threads.\n   * DirectedNotInOut = false version\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<!V> constructOutEdgesFrom(FileGraph& graph, unsigned tid,\n                                             unsigned total,\n                                             ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        auto src     = aux[*ii].get().n;\n        auto& dstAux = aux[graph.getEdgeDst(nn)].get();\n        auto e       = constructOutEdgeValue(graph, nn, src, dstAux.n);\n        dstAux.lock.lock();\n        dstAux.inNghs.push_back({src, e});\n        dstAux.lock.unlock();\n      }\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph edges given a FileGraph to construct it from and\n   * already created nodes.\n   * Meant to be called by multiple threads.\n   * DirectedNotInOut = true version\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<V> constructOutEdgesFrom(FileGraph& graph, unsigned tid,\n   
                                         unsigned total,\n                                            const ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        constructOutEdgeValue(graph, nn, aux[*ii], aux[graph.getEdgeDst(nn)]);\n      }\n    }\n  }\n\n  /**\n   * Constructs the MorphGraph in-edges given a FileGraph to construct it from\n   * and already created nodes. Meant to be called by multiple threads.\n   * DirectedNotInOut = false version\n   *\n   * @param[in] graph FileGraph to construct a morph graph from\n   * @param[in] tid Thread id of thread calling this function\n   * @param[in] total Total number of threads in current execution\n   * @param[in] aux Contains created nodes to create edges for\n   */\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<!V> constructInEdgesFrom(FileGraph& graph, unsigned tid,\n                                            unsigned total,\n                                            ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      auto& auxNode = aux[*ii].get();\n      for (auto ie : auxNode.inNghs) {\n        constructInEdgeValue(graph, ie.second, ie.first, auxNode.n);\n      }\n    }\n  }\n\n  //! If a directed graph and no in-edges exist (i.e. DirectedNotInOut = true),\n  //! 
then construct in edges should do nothing.\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<V> constructInEdgesFrom(FileGraph&, unsigned, unsigned,\n                                           ReadGraphAuxData&) {}\n#endif\n  gstl::Vector<GraphNode> getneighbor(GraphNode N, GraphNode H) {\n\n    gstl::Vector<GraphNode> neighbors;\n    //    if (findEdge(N, H)) {\n    for (auto it : edges(H)) {\n      auto n = getEdgeDst(it);\n      if (n != N)\n        neighbors.push_back(n);\n    }\n    //  }\n    return neighbors;\n  }\n  // get all the neighbors\n  gstl::Vector<GraphNode> getallneighbor(GraphNode N) {\n\n    gstl::Vector<GraphNode> neighbors;\n    for (auto it : edges(N)) {\n      auto hedge = getEdgeDst(it);\n      for (auto h : edges(hedge)) {\n        auto hneighbor = getEdgeDst(h);\n        if (hneighbor != N)\n          neighbors.push_back(hneighbor);\n      }\n    }\n\n    return neighbors;\n  }\n  // get all the nets on a cell\n  gstl::Vector<GraphNode> getNets(GraphNode N) {\n    gstl::Vector<GraphNode> n;\n    for (auto net : edges(N)) {\n      auto nnet = getEdgeDst(net);\n      n.push_back(nnet);\n    }\n    return n;\n  }\n\n  // get all the cells on a net\n  gstl::Vector<GraphNode> getCells(GraphNode N) {\n    gstl::Vector<GraphNode> cells;\n    for (auto net : edges(N)) {\n      auto hedge = getEdgeDst(net);\n      cells.push_back(hedge);\n    }\n    return cells;\n  }\n\n  void addHyperedge(GraphNode n) { nets.push_back(n); }\n  void addCell(GraphNode n) { cells.push_back(n); }\n  Bnodes& cellList() { return cells; }\n\n  Bnodes& getNets() { return nets; }\n\n  GraphNode getneighbornet(GraphNode N, GraphNode C) {\n    for (auto n : edges(N)) {\n      GraphNode n1 = getEdgeDst(n);\n      for (auto c : edges(C)) {\n        GraphNode n2 = getEdgeDst(c);\n        if (n2 == n1)\n          return n1;\n      }\n    }\n  }\n  //\n  std::vector<GraphNode> getallneighbornets(GraphNode N) {\n    std::vector<GraphNode> nets;\n    for (auto n : 
edges(N)) {\n      GraphNode n1 = getEdgeDst(n);\n      nets.push_back(n1);\n    }\n    return nets;\n  }\n};\n\n} // namespace graphs\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/Morph_SepInOut_Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPH_MORPH_SEPINOUT_GRAPH_H\n#define GALOIS_GRAPH_MORPH_SEPINOUT_GRAPH_H\n\n#include <algorithm>\n#include <map>\n#include <set>\n#include <type_traits>\n#include <vector>\n\n#include <boost/container/small_vector.hpp>\n#include <boost/functional.hpp>\n#include <boost/iterator/filter_iterator.hpp>\n#include <boost/iterator/transform_iterator.hpp>\n\n#include \"galois/Bag.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/gstl.h\"\n\n#ifdef AUX_MAP\n#include \"galois/PerThreadContainer.h\"\n#else\n#include \"galois/substrate/CacheLineStorage.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#endif\n\nnamespace galois {\n//! 
Parallel graph data structures.\nnamespace graphs {\n\nnamespace internal {\n/**\n * Wrapper class to have a valid type on void edges\n */\ntemplate <typename NTy, typename ETy, bool DirectedButNotInOut>\nstruct UEdgeInfoBase;\n\ntemplate <typename NTy, typename ETy>\nstruct UEdgeInfoBase<NTy, ETy, true> {\n  typedef ETy& reference;\n\n  NTy* N;\n  ETy Ea;\n\n  inline NTy* first() {\n    assert(N);\n    return N;\n  }\n  inline NTy const* first() const {\n    assert(N);\n    return N;\n  }\n  inline ETy* second() { return &Ea; }\n  inline const ETy* second() const { return &Ea; }\n\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, ETy*, bool, Args&&... args)\n      : N(n), Ea(std::forward<Args>(args)...) {}\n\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, ETy& v, bool, Args&&...) : N(n) {\n    Ea = v;\n  }\n\n  static size_t sizeOfSecond() { return sizeof(ETy); }\n  bool isInEdge() const { return false; }\n};\n\ntemplate <typename NTy, typename ETy>\nstruct UEdgeInfoBase<NTy, ETy, false> {\n  typedef ETy& reference;\n\n  NTy* N;\n  ETy* Ea;\n\n  inline NTy* first() {\n    assert(N);\n    return (NTy*)((uintptr_t)N & ~1);\n  }\n  inline NTy const* first() const {\n    assert(N);\n    return (NTy*)((uintptr_t)N & ~1);\n  }\n  inline ETy* second() { return Ea; }\n  inline const ETy* second() const { return Ea; }\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, ETy* v, bool f, Args&&...)\n      : N((NTy*)((uintptr_t)n | f)), Ea(v) {}\n  static size_t sizeOfSecond() { return sizeof(ETy); }\n  bool isInEdge() const { return (uintptr_t)N & 1; }\n};\n\ntemplate <typename NTy>\nstruct UEdgeInfoBase<NTy, void, true> {\n  typedef char& reference;\n\n  NTy* N;\n  inline NTy* first() { return N; }\n  inline NTy const* first() const { return N; }\n  inline char* second() const { return static_cast<char*>(NULL); }\n  inline char* addr() const { return second(); }\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, void*, bool, Args&&...) 
: N(n) {}\n  static size_t sizeOfSecond() { return 0; }\n  bool isInEdge() const { return false; }\n};\n\ntemplate <typename NTy>\nstruct UEdgeInfoBase<NTy, void, false> {\n  typedef char& reference;\n\n  NTy* N;\n  inline NTy* first() { return (NTy*)((uintptr_t)N & ~1); }\n  inline NTy const* first() const { return (NTy*)((uintptr_t)N & ~1); }\n  inline char* second() const { return static_cast<char*>(NULL); }\n  inline char* addr() const { return second(); }\n  template <typename... Args>\n  UEdgeInfoBase(NTy* n, void*, bool f, Args&&...)\n      : N((NTy*)((uintptr_t)n | f)) {}\n  static size_t sizeOfSecond() { return 0; }\n  bool isInEdge() const { return (uintptr_t)N & 1; }\n};\n\n/*\n * Only graphs w/ in-out/symmetric edges and non-void edge data,\n * i.e. ETy != void and DirectedNotInOut = false,\n * need to allocate memory for edge data\n */\ntemplate <typename ETy, bool DirectedNotInOut>\nstruct EdgeFactory {\n  galois::InsertBag<ETy> mem;\n  template <typename... Args>\n  ETy* mkEdge(Args&&... args) {\n    return &mem.emplace(std::forward<Args>(args)...);\n  }\n  void delEdge(ETy*) {}\n  bool mustDel() const { return false; }\n};\n\ntemplate <typename ETy>\nstruct EdgeFactory<ETy, true> {\n  template <typename... Args>\n  ETy* mkEdge(Args&&...) {\n    return nullptr;\n  }\n  void delEdge(ETy*) {}\n  bool mustDel() const { return false; }\n};\n\ntemplate <>\nstruct EdgeFactory<void, false> {\n  template <typename... Args>\n  void* mkEdge(Args&&...) {\n    return static_cast<void*>(NULL);\n  }\n  void delEdge(void*) {}\n  bool mustDel() const { return false; }\n};\n\n} // namespace internal\n\n/**\n * A Graph.\n *\n * An example of use:\n *\n * \\code\n * struct Node {\n *   ... 
// Definition of node data\n * };\n *\n * typedef galois::graphs::Morph_SepInOut_Graph<Node,int,true> Graph;\n *\n * // Create graph\n * Graph g;\n * Node n1, n2;\n * Graph::GraphNode a, b;\n * a = g.createNode(n1);\n * g.addNode(a);\n * b = g.createNode(n2);\n * g.addNode(b);\n * g.getEdgeData(g.addEdge(a, b)) = 5;\n *\n * // Traverse graph\n * for (Graph::iterator ii = g.begin(), ei = g.end(); ii != ei; ++ii) {\n *   Graph::GraphNode src = *ii;\n *   for (Graph::edge_iterator jj = g.edge_begin(src), ej = g.edge_end(src);\n * ++jj) { Graph::GraphNode dst = graph.getEdgeDst(jj); int edgeData =\n * g.getEdgeData(jj); assert(edgeData == 5);\n *   }\n * }\n * \\endcode\n *\n * And in C++11:\n *\n * \\code\n * // Traverse graph\n * for (Graph::GraphNode src : g) {\n *   for (Graph::edge_iterator edge : g.out_edges(src)) {\n *     Graph::GraphNode dst = g.getEdgeDst(edge);\n *     int edgeData = g.getEdgeData(edge);\n *     assert(edgeData == 5);\n *   }\n * }\n * \\endcode\n *\n * @tparam NodeTy Type of node data\n * @tparam EdgeTy Type of edge data\n * @tparam Directional true if graph is directed\n * @tparam InOut true if directed graph tracks in-edges\n * @tparam SortedNeighbors Keep neighbors sorted (for faster findEdge)\n */\ntemplate <typename NodeTy, typename EdgeTy, bool Directional,\n          bool InOut = false, bool HasNoLockable = false,\n          bool SortedNeighbors = false, typename FileEdgeTy = EdgeTy>\nclass Morph_SepInOut_Graph : private boost::noncopyable {\npublic:\n  //! 
If true, do not use abstract locks in graph\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    typedef Morph_SepInOut_Graph<NodeTy, EdgeTy, Directional, InOut,\n                                 _has_no_lockable, SortedNeighbors, FileEdgeTy>\n        type;\n  };\n\n  template <typename _node_data>\n  struct with_node_data {\n    typedef Morph_SepInOut_Graph<_node_data, EdgeTy, Directional, InOut,\n                                 HasNoLockable, SortedNeighbors, FileEdgeTy>\n        type;\n  };\n\n  template <typename _edge_data>\n  struct with_edge_data {\n    typedef Morph_SepInOut_Graph<NodeTy, _edge_data, Directional, InOut,\n                                 HasNoLockable, SortedNeighbors, FileEdgeTy>\n        type;\n  };\n\n  template <typename _file_edge_data>\n  struct with_file_edge_data {\n    typedef Morph_SepInOut_Graph<NodeTy, EdgeTy, Directional, InOut,\n                                 HasNoLockable, SortedNeighbors,\n                                 _file_edge_data>\n        type;\n  };\n\n  template <bool _directional>\n  struct with_directional {\n    typedef Morph_SepInOut_Graph<NodeTy, EdgeTy, _directional, InOut,\n                                 HasNoLockable, SortedNeighbors, FileEdgeTy>\n        type;\n  };\n\n  template <bool _sorted_neighbors>\n  struct with_sorted_neighbors {\n    typedef Morph_SepInOut_Graph<NodeTy, EdgeTy, Directional, InOut,\n                                 HasNoLockable, _sorted_neighbors, FileEdgeTy>\n        type;\n  };\n\n  typedef read_with_aux_first_graph_tag read_tag;\n\nprivate:\n  template <typename T>\n  struct first_eq_and_valid {\n    T N2;\n    first_eq_and_valid(T& n) : N2(n) {}\n    template <typename T2>\n    bool operator()(const T2& ii) const {\n      return ii.first() == N2 && ii.first() && ii.first()->active;\n    }\n  };\n\n  struct first_not_valid {\n    template <typename T2>\n    bool operator()(const T2& ii) const {\n      return !ii.first() || !ii.first()->active;\n    }\n  
};\n\n  template <typename T>\n  struct first_lt {\n    template <typename T2>\n    bool operator()(const T& N2, const T2& ii) const {\n      assert(ii.first() && \"UNEXPECTED: invalid item in edgelist\");\n      return N2 < ii.first();\n    }\n    template <typename T2>\n    bool operator()(const T2& ii, const T& N2) const {\n      assert(ii.first() && \"UNEXPECTED: invalid item in edgelist\");\n      return ii.first() < N2;\n    }\n  };\n\n  class gNode;\n  struct gNodeTypes\n      : public internal::NodeInfoBaseTypes<NodeTy, !HasNoLockable> {\n    //! The storage type for an edge\n    typedef internal::UEdgeInfoBase<gNode, EdgeTy, Directional & !InOut>\n        EdgeInfo;\n\n    //! The storage type for edges\n    // typedef llvm::SmallVector<EdgeInfo, 3> EdgesTy;\n    // typedef galois::gstl::Vector<EdgeInfo> EdgesTy;\n    typedef boost::container::small_vector<\n        EdgeInfo, 3, galois::runtime::Pow_2_BlockAllocator<EdgeInfo>>\n        EdgesTy;\n\n    typedef typename EdgesTy::iterator iterator;\n  };\n\n  class gNode : public internal::NodeInfoBase<NodeTy, !HasNoLockable>,\n                public gNodeTypes {\n    friend class Morph_SepInOut_Graph;\n    typedef internal::NodeInfoBase<NodeTy, !HasNoLockable> NodeInfo;\n    typename gNodeTypes::EdgesTy edges;\n    typename gNodeTypes::EdgesTy in_edges;\n    typedef typename gNode::iterator iterator;\n    typedef typename gNode::EdgeInfo EdgeInfo;\n\n    bool active;\n\n    iterator begin() { return edges.begin(); }\n    iterator end() { return edges.end(); }\n\n    iterator in_edge_begin() { return in_edges.begin(); }\n    iterator in_edge_end() { return in_edges.end(); }\n\n    void erase(iterator ii, bool inEdge = false) {\n      auto& edgelist = (inEdge) ? 
in_edges : edges;\n      if (SortedNeighbors) {\n        // For sorted case remove the element, moving following\n        // elements back to fill the space.\n        edgelist.erase(ii);\n      } else {\n        // We don't need to preserve the order, so move the last edge\n        // into this place and then remove last edge.\n        *ii = edgelist.back();\n        edgelist.pop_back();\n      }\n    }\n\n    void erase(gNode* N, bool inEdge = false) {\n      iterator ii = find(N, inEdge);\n      erase(ii, inEdge);\n    }\n\n    iterator find(gNode* N, bool inEdge = false) {\n      auto& edgelist = (inEdge) ? in_edges : edges;\n      iterator ii, ei = edgelist.end();\n      if (SortedNeighbors) {\n        assert(std::is_sorted(edgelist.begin(), edgelist.end(),\n                              [=](const EdgeInfo& e1, const EdgeInfo& e2) {\n                                return e1.first() < e2.first();\n                              }));\n        ii = std::lower_bound(edgelist.begin(), edgelist.end(), N,\n                              first_lt<gNode*>());\n      } else {\n        ii = edgelist.begin();\n      }\n\n      first_eq_and_valid<gNode*> checker(N);\n      ii = std::find_if(ii, ei, checker);\n      while (ii != ei && ii->isInEdge() != inEdge) {\n        ++ii;\n        ii = std::find_if(ii, ei, checker);\n      };\n      return ii;\n    }\n\n    void resizeEdges(size_t size, bool inEdge = false) {\n      auto& edgelist = (inEdge) ? in_edges : edges;\n      edgelist.resize(size, EdgeInfo(new gNode(), 0));\n    }\n\n    template <typename... Args>\n    iterator createEdge(gNode* N, EdgeTy* v, bool inEdge, Args&&... args) {\n      iterator ii;\n      auto& edgelist = (inEdge) ? 
in_edges : edges;\n      if (SortedNeighbors) {\n        // If neighbors are sorted, find appropriate insertion point.\n        // Insert before first neighbor that is too far.\n        ii = std::upper_bound(edgelist.begin(), edgelist.end(), N,\n                              first_lt<gNode*>());\n      } else\n        ii = edgelist.end();\n      return edgelist.insert(\n          ii, EdgeInfo(N, v, inEdge, std::forward<Args>(args)...));\n    }\n\n    template <typename... Args>\n    iterator createEdgeWithReuse(gNode* N, EdgeTy* v, bool inEdge,\n                                 Args&&... args) {\n      auto& edgelist = (inEdge) ? in_edges : edges;\n      // Morph check for holes\n      iterator ii, ei;\n      if (SortedNeighbors) {\n        // If neighbors are sorted, find acceptable range for insertion.\n        ii = std::lower_bound(edgelist.begin(), edgelist.end(), N,\n                              first_lt<gNode*>());\n        ei = std::upper_bound(ii, edgelist.end(), N, first_lt<gNode*>());\n      } else {\n        // If not sorted, we can insert anywhere in the list.\n        ii = edgelist.begin();\n        ei = edgelist.end();\n      }\n      ii = std::find_if(ii, ei, first_not_valid());\n      if (ii != ei) {\n        // FIXME: We could move elements around (short distances).\n        *ii = EdgeInfo(N, v, inEdge, std::forward<Args>(args)...);\n        return ii;\n      }\n      return edgelist.insert(\n          ei, EdgeInfo(N, v, inEdge, std::forward<Args>(args)...));\n    }\n\n    template <bool _A1 = HasNoLockable>\n    void acquire(MethodFlag mflag, typename std::enable_if<!_A1>::type* = 0) {\n      galois::runtime::acquire(this, mflag);\n    }\n\n    template <bool _A1 = HasNoLockable>\n    void acquire(MethodFlag, typename std::enable_if<_A1>::type* = 0) {}\n\n  public:\n    template <typename... Args>\n    gNode(Args&&... 
args)\n        : NodeInfo(std::forward<Args>(args)...), active(false) {}\n  };\n\n  // The graph manages the lifetimes of the data in the nodes and edges\n  typedef galois::InsertBag<gNode> NodeListTy;\n  NodeListTy nodes;\n\n  internal::EdgeFactory<EdgeTy, Directional && !InOut> edgesF;\n\n  // Helpers for iterator classes\n  struct is_node {\n    bool operator()(const gNode& g) const { return g.active; }\n  };\n  struct is_edge {\n    bool operator()(typename gNodeTypes::EdgeInfo& e) const {\n      return e.first()->active;\n    }\n  };\n  struct is_in_edge {\n    bool operator()(typename gNodeTypes::EdgeInfo& e) const {\n      return e.first()->active && e.isInEdge();\n    }\n  };\n  struct is_out_edge {\n    bool operator()(typename gNodeTypes::EdgeInfo& e) const {\n      return e.first()->active && !e.isInEdge();\n    }\n  };\n  struct makeGraphNode {\n    gNode* operator()(gNode& data) const { return &data; }\n  };\n\npublic:\n  //! Graph node handle\n  typedef gNode* GraphNode;\n  //! Edge data type\n  typedef EdgeTy edge_data_type;\n  //! Edge data type of file we are loading this graph from\n  typedef FileEdgeTy file_edge_data_type;\n  //! Node data type\n  typedef NodeTy node_data_type;\n  //! (Out or Undirected) Edge iterator\n  typedef typename boost::filter_iterator<is_out_edge,\n                                          typename gNodeTypes::iterator>\n      edge_iterator;\n  //! In Edge iterator\n  typedef\n      typename boost::filter_iterator<is_in_edge, typename gNodeTypes::iterator>\n          in_edge_iterator;\n  //! Reference to edge data\n  typedef typename gNodeTypes::EdgeInfo::reference edge_data_reference;\n  //! Reference to node data\n  typedef typename gNodeTypes::reference node_data_reference;\n  //! 
Node iterator\n  typedef boost::transform_iterator<\n      makeGraphNode,\n      boost::filter_iterator<is_node, typename NodeListTy::iterator>>\n      iterator;\n#ifdef AUX_MAP\n  struct ReadGraphAuxData {\n    LargeArray<GraphNode> nodes;\n    galois::PerThreadMap<FileGraph::GraphNode,\n                         galois::gstl::Vector<std::pair<GraphNode, EdgeTy*>>>\n        inNghs;\n  };\n#else\n  struct AuxNode {\n    galois::substrate::SimpleLock lock;\n    GraphNode n;\n    galois::gstl::Vector<std::pair<GraphNode, EdgeTy*>> inNghs;\n  };\n  using AuxNodePadded = typename galois::substrate::CacheLineStorage<AuxNode>;\n\n  constexpr static const bool DirectedNotInOut = (Directional && !InOut);\n  using ReadGraphAuxData =\n      typename std::conditional<DirectedNotInOut, LargeArray<GraphNode>,\n                                LargeArray<AuxNodePadded>>::type;\n#endif\n\nprivate:\n  template <typename... Args>\n  edge_iterator createEdgeWithReuse(GraphNode src, GraphNode dst,\n                                    galois::MethodFlag mflag, Args&&... args) {\n    assert(src);\n    assert(dst);\n    // galois::runtime::checkWrite(mflag, true);\n    src->acquire(mflag);\n    typename gNode::iterator ii = src->find(dst);\n    if (ii == src->end()) {\n      if (Directional && !InOut) {\n        ii = src->createEdgeWithReuse(dst, 0, false,\n                                      std::forward<Args>(args)...);\n      } else {\n        dst->acquire(mflag);\n        EdgeTy* e = edgesF.mkEdge(std::forward<Args>(args)...);\n        ii        = dst->createEdgeWithReuse(src, e, Directional ? true : false,\n                                      std::forward<Args>(args)...);\n        ii        = src->createEdgeWithReuse(dst, e, false,\n                                      std::forward<Args>(args)...);\n      }\n    }\n    return boost::make_filter_iterator(is_out_edge(), ii, src->end());\n  }\n\n  template <typename... 
Args>\n  edge_iterator createEdge(GraphNode src, GraphNode dst,\n                           galois::MethodFlag mflag, Args&&... args) {\n    assert(src);\n    assert(dst);\n    // galois::runtime::checkWrite(mflag, true);\n    src->acquire(mflag);\n    typename gNode::iterator ii = src->end();\n    if (ii == src->end()) {\n      if (Directional && !InOut) {\n        ii = src->createEdge(dst, 0, false, std::forward<Args>(args)...);\n      } else {\n        dst->acquire(mflag);\n        EdgeTy* e = edgesF.mkEdge(std::forward<Args>(args)...);\n        ii        = dst->createEdge(src, e, Directional ? true : false,\n                             std::forward<Args>(args)...);\n        ii        = src->createEdge(dst, e, false, std::forward<Args>(args)...);\n      }\n    }\n    return boost::make_filter_iterator(is_out_edge(), ii, src->end());\n  }\n\n  /**\n   * Creates an outgoing edge at src for the edge from src to dst.\n   * Only called by constructOutEdgeValue.\n   */\n  template <typename... Args>\n  EdgeTy* createOutEdge(GraphNode src, GraphNode dst, galois::MethodFlag mflag,\n                        Args&&... args) {\n    assert(src);\n    assert(dst);\n\n    src->acquire(mflag);\n    typename gNode::iterator ii = src->end();\n    if (ii == src->end()) {\n      dst->acquire(mflag);\n      EdgeTy* e = edgesF.mkEdge(std::forward<Args>(args)...);\n      ii        = src->createEdge(dst, e, false, std::forward<Args>(args)...);\n      return e;\n    }\n    return nullptr;\n  }\n\n  /**\n   * Creates an incoming edge at dst for the edge from src to dst.\n   * Only called by constructInEdgeValue.\n   * Reuse data from the corresponding outgoing edge.\n   */\n  template <typename... Args>\n  void createInEdge(GraphNode src, GraphNode dst, EdgeTy* e,\n                    galois::MethodFlag mflag, Args&&... 
args) {\n    assert(src);\n    assert(dst);\n\n    dst->acquire(mflag);\n    typename gNode::iterator ii = dst->end();\n    if (ii == dst->end()) {\n      src->acquire(mflag);\n      ii = dst->createEdge(src, e, Directional ? true : false,\n                           std::forward<Args>(args)...);\n    }\n  }\n\n  template <bool _A1 = LargeArray<EdgeTy>::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  EdgeTy*\n  constructOutEdgeValue(FileGraph& graph, typename FileGraph::edge_iterator nn,\n                        GraphNode src, GraphNode dst,\n                        typename std::enable_if<!_A1 || _A2>::type* = 0) {\n    typedef typename LargeArray<FileEdgeTy>::value_type FEDV;\n    typedef LargeArray<EdgeTy> ED;\n    if (ED::has_value) {\n      return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED,\n                           graph.getEdgeData<FEDV>(nn));\n    } else {\n      return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  template <bool _A1 = LargeArray<EdgeTy>::has_value,\n            bool _A2 = LargeArray<FileEdgeTy>::has_value>\n  EdgeTy*\n  constructOutEdgeValue(FileGraph&, typename FileGraph::edge_iterator,\n                        GraphNode src, GraphNode dst,\n                        typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED);\n  }\n\n  // will reuse edge data from outgoing edges\n  void constructInEdgeValue(FileGraph&, EdgeTy* e, GraphNode src,\n                            GraphNode dst) {\n    createInEdge(src, dst, e, galois::MethodFlag::UNPROTECTED);\n  }\n\npublic:\n  /**\n   * Creates a new node holding the indicated data. Usually you should call\n   * {@link addNode()} afterwards.\n   */\n  template <typename... Args>\n  GraphNode createNode(Args&&... 
args) {\n    gNode* N  = &(nodes.emplace(std::forward<Args>(args)...));\n    N->active = false;\n    return GraphNode(N);\n  }\n\n  /**\n   * Adds a node to the graph.\n   */\n  void addNode(const GraphNode& n,\n               galois::MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, true);\n    n->acquire(mflag);\n    n->active = true;\n  }\n\n  //! Gets the node data for a node.\n  node_data_reference\n  getData(const GraphNode& n,\n          galois::MethodFlag mflag = MethodFlag::WRITE) const {\n    assert(n);\n    // galois::runtime::checkWrite(mflag, false);\n    n->acquire(mflag);\n    return n->getData();\n  }\n\n  //! Checks if a node is in the graph\n  bool containsNode(const GraphNode& n,\n                    galois::MethodFlag mflag = MethodFlag::WRITE) const {\n    assert(n);\n    n->acquire(mflag);\n    return n->active;\n  }\n\n  /**\n   * Removes a node from the graph along with all its outgoing/incoming edges\n   * for undirected graphs or outgoing edges for directed graphs.\n   */\n  // FIXME: handle edge memory\n  void removeNode(GraphNode n, galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(n);\n    // galois::runtime::checkWrite(mflag, true);\n    n->acquire(mflag);\n    gNode* N = n;\n    if (N->active) {\n      N->active = false;\n      N->edges.clear();\n      N->in_edges.clear();\n    }\n  }\n\n  /**\n   * Resize the edges of the node. For best performance, should be done\n   * serially.\n   */\n  void resizeEdges(GraphNode src, size_t size,\n                   galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    // galois::runtime::checkWrite(mflag, false);\n    src->acquire(mflag);\n    src->resizeEdges(size);\n    src->resizeEdges(size, true); // for incoming edges\n  }\n\n  /**\n   * Adds an edge to graph, replacing existing value if edge already exists.\n   *\n   * Ignore the edge data, let the caller use the returned iterator to set the\n   * value if desired.  
This frees us from dealing with the void edge data\n   * problem in this API\n   */\n  edge_iterator addEdge(GraphNode src, GraphNode dst,\n                        galois::MethodFlag mflag = MethodFlag::WRITE) {\n    return createEdgeWithReuse(src, dst, mflag);\n  }\n\n  //! Adds and initializes an edge to graph but does not check for duplicate\n  //! edges\n  template <typename... Args>\n  edge_iterator addMultiEdge(GraphNode src, GraphNode dst,\n                             galois::MethodFlag mflag, Args&&... args) {\n    return createEdge(src, dst, mflag, std::forward<Args>(args)...);\n  }\n\n  //! Removes an edge from the graph\n  void removeEdge(GraphNode src, edge_iterator dst,\n                  galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    // galois::runtime::checkWrite(mflag, true);\n    src->acquire(mflag);\n    if (Directional && !InOut) {\n      src->erase(dst.base());\n    } else {\n      dst->first()->acquire(mflag);\n      // EdgeTy* e = dst->second();\n      dst->first()->erase(\n          src, Directional ? true : false); // erase incoming/symmetric edge\n      src->erase(dst.base());\n    }\n  }\n\n  template <bool _DirectedInOut = (Directional && InOut)>\n  void removeInEdge(GraphNode dst, in_edge_iterator src,\n                    galois::MethodFlag mflag = MethodFlag::WRITE,\n                    typename std::enable_if<_DirectedInOut>::type* = 0) {\n    assert(dst);\n\n    dst->acquire(mflag);\n    src->first()->acquire(mflag);\n    // EdgeTy* e = src->second();\n    src->first()->erase(dst); // erase the outgoing edge\n    dst->erase(src.base(), true);\n  }\n\n  //! 
Finds if an edge between src and dst exists\n  edge_iterator findEdge(GraphNode src, GraphNode dst,\n                         galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    assert(dst);\n    src->acquire(mflag);\n    typename gNodeTypes::iterator ii = src->find(dst), ei = src->end();\n    is_out_edge edge_predicate;\n    if (ii != ei && edge_predicate(*ii)) {\n      // After finding edge, lock dst and verify still active\n      dst->acquire(mflag);\n      if (!edge_predicate(*ii))\n        // I think we need this too, else we'll return some random iterator.\n        ii = ei;\n    } else\n      ii = ei;\n    return boost::make_filter_iterator(edge_predicate, ii, ei);\n  }\n\n  edge_iterator\n  findEdgeSortedByDst(GraphNode src, GraphNode dst,\n                      galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(src);\n    assert(dst);\n    src->acquire(mflag);\n    assert(std::is_sorted(src->begin(), src->end(),\n                          [=](const typename gNode::EdgeInfo& e1,\n                              const typename gNode::EdgeInfo& e2) {\n                            return e1.first() < e2.first();\n                          }));\n\n    auto ei = src->end();\n    auto ii =\n        std::lower_bound(src->begin(), src->end(), dst, first_lt<gNode*>());\n\n    first_eq_and_valid<gNode*> checker(dst);\n    ii = std::find_if(ii, ei, checker); // bug if ei set to upper_bound\n    while (ii != ei && ii->isInEdge()) {\n      ++ii;\n      ii = std::find_if(ii, ei, checker);\n    };\n\n    is_out_edge edge_predicate;\n    if (ii != ei) {\n      dst->acquire(mflag);\n      if (!edge_predicate(*ii)) {\n        ii = ei;\n      }\n    }\n    return boost::make_filter_iterator(edge_predicate, ii, ei);\n  }\n\n  template <bool _Undirected = !Directional>\n  edge_iterator findInEdge(GraphNode src, GraphNode dst,\n                           galois::MethodFlag mflag = MethodFlag::WRITE,\n                           typename 
std::enable_if<_Undirected>::type* = 0) {\n    // incoming neighbors are the same as outgoing neighbors in undirected\n    // graphs\n    return findEdge(src, dst, mflag);\n  }\n\n  // Find if an incoming edge between src and dst exists for directed in-out\n  // graphs\n  template <bool _DirectedInOut = (Directional && InOut)>\n  in_edge_iterator\n  findInEdge(GraphNode src, GraphNode dst,\n             galois::MethodFlag mflag                       = MethodFlag::WRITE,\n             typename std::enable_if<_DirectedInOut>::type* = 0) {\n    assert(src);\n    assert(dst);\n    dst->acquire(mflag);\n    typename gNodeTypes::iterator ii = dst->find(src, true),\n                                  ei = dst->in_edge_end();\n    is_in_edge edge_predicate;\n    if (ii != ei && edge_predicate(*ii)) {\n      // After finding edges, lock dst and verify still active\n      src->acquire(mflag);\n      if (!edge_predicate(*ii))\n        // need this to avoid returning a random iterator\n        ii = ei;\n    } else\n      ii = ei;\n    return boost::make_filter_iterator(edge_predicate, ii, ei);\n  }\n\n  /**\n   * Returns the edge data associated with the edge. It is an error to\n   * get the edge data for a non-existent edge.  It is an error to get\n   * edge data for inactive edges. 
By default, the mflag is\n   * galois::MethodFlag::UNPROTECTED because edge_begin() dominates this call\n   * and should perform the appropriate locking.\n   */\n  edge_data_reference\n  getEdgeData(edge_iterator ii,\n              galois::MethodFlag mflag = MethodFlag::UNPROTECTED) const {\n    assert(ii->first()->active);\n    // galois::runtime::checkWrite(mflag, false);\n    ii->first()->acquire(mflag);\n    return *ii->second();\n  }\n\n  edge_data_reference\n  getEdgeData(in_edge_iterator ii,\n              galois::MethodFlag mflag = MethodFlag::UNPROTECTED) const {\n    assert(ii->first()->active);\n    // galois::runtime::checkWrite(mflag, false);\n    ii->first()->acquire(mflag);\n    return *ii->second();\n  }\n\n  //! Returns the destination of an edge\n  GraphNode getEdgeDst(edge_iterator ii) {\n    assert(ii->first()->active);\n    return GraphNode(ii->first());\n  }\n\n  GraphNode getEdgeDst(in_edge_iterator ii) {\n    assert(ii->first()->active);\n    return GraphNode(ii->first());\n  }\n\n  void sortEdgesByDst(GraphNode N,\n                      galois::MethodFlag mflag = MethodFlag::WRITE) {\n    acquire(N, mflag);\n    typedef typename gNode::EdgeInfo EdgeInfo;\n    auto eDstCompare = [=](const EdgeInfo& e1, const EdgeInfo& e2) {\n      return e1.first() < e2.first();\n    };\n    std::sort(N->begin(), N->end(), eDstCompare);\n    std::sort(N->in_edge_begin(), N->in_edge_end(), eDstCompare);\n  }\n\n  void sortAllEdgesByDst(MethodFlag mflag = MethodFlag::WRITE) {\n    galois::do_all(\n        galois::iterate(*this),\n        [=](GraphNode N) { this->sortEdgesByDst(N, mflag); }, galois::steal());\n  }\n\n  //// General Things ////\n\n  //! 
Returns an iterator to the neighbors of a node\n  edge_iterator edge_begin(GraphNode N,\n                           galois::MethodFlag mflag = MethodFlag::WRITE) {\n    assert(N);\n    N->acquire(mflag);\n\n    if (galois::runtime::shouldLock(mflag)) {\n      for (typename gNode::iterator ii = N->begin(), ee = N->end(); ii != ee;\n           ++ii) {\n        if (ii->first()->active && !ii->isInEdge())\n          ii->first()->acquire(mflag);\n      }\n    }\n    return boost::make_filter_iterator(is_out_edge(), N->begin(), N->end());\n  }\n\n  template <bool _Undirected = !Directional>\n  in_edge_iterator\n  in_edge_begin(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE,\n                typename std::enable_if<!_Undirected>::type* = 0) {\n    assert(N);\n    N->acquire(mflag);\n\n    if (galois::runtime::shouldLock(mflag)) {\n      for (typename gNode::iterator ii = N->in_edge_begin(),\n                                    ee = N->in_edge_end();\n           ii != ee; ++ii) {\n        if (ii->first()->active && ii->isInEdge())\n          ii->first()->acquire(mflag);\n      }\n    }\n    return boost::make_filter_iterator(is_in_edge(), N->in_edge_begin(),\n                                       N->in_edge_end());\n  }\n\n  template <bool _Undirected = !Directional>\n  edge_iterator in_edge_begin(GraphNode N,\n                              galois::MethodFlag mflag = MethodFlag::WRITE,\n                              typename std::enable_if<_Undirected>::type* = 0) {\n    return edge_begin(N, mflag);\n  }\n\n  //! 
Returns the end of the neighbor iterator\n  edge_iterator\n  edge_end(GraphNode N,\n           galois::MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::WRITE) {\n    assert(N);\n    // Acquiring lock is not necessary: no valid use for an end pointer should\n    // ever require it\n    // N->acquire(mflag);\n    return boost::make_filter_iterator(is_out_edge(), N->end(), N->end());\n  }\n\n  template <bool _Undirected = !Directional>\n  in_edge_iterator\n  in_edge_end(GraphNode N,\n              galois::MethodFlag GALOIS_UNUSED(mflag)      = MethodFlag::WRITE,\n              typename std::enable_if<!_Undirected>::type* = 0) {\n    assert(N);\n    // Acquiring lock is not necessary: no valid use for an end pointer should\n    // ever require it\n    // N->acquire(mflag);\n    return boost::make_filter_iterator(is_in_edge(), N->in_edge_end(),\n                                       N->in_edge_end());\n  }\n\n  template <bool _Undirected = !Directional>\n  edge_iterator in_edge_end(GraphNode N,\n                            galois::MethodFlag mflag = MethodFlag::WRITE,\n                            typename std::enable_if<_Undirected>::type* = 0) {\n    return edge_end(N, mflag);\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  edges(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::make_no_deref_range(edge_begin(N, mflag),\n                                         edge_end(N, mflag));\n  }\n\n  template <bool _Undirected = !Directional>\n  runtime::iterable<NoDerefIterator<in_edge_iterator>>\n  in_edges(GraphNode N, galois::MethodFlag mflag = MethodFlag::WRITE,\n           typename std::enable_if<!_Undirected>::type* = 0) {\n    return internal::make_no_deref_range(in_edge_begin(N, mflag),\n                                         in_edge_end(N, mflag));\n  }\n\n  template <bool _Undirected = !Directional>\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  in_edges(GraphNode N, galois::MethodFlag mflag = 
MethodFlag::WRITE,\n           typename std::enable_if<_Undirected>::type* = 0) {\n    return edges(N, mflag);\n  }\n\n  /**\n   * An object with begin() and end() methods to iterate over the outgoing\n   * edges of N.\n   */\n  internal::EdgesIterator<Morph_SepInOut_Graph>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::EdgesIterator<Morph_SepInOut_Graph>(*this, N, mflag);\n  }\n\n  /**\n   * Returns an iterator to all the nodes in the graph. Not thread-safe.\n   */\n  iterator begin() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.begin(), nodes.end()),\n        makeGraphNode());\n  }\n\n  //! Returns the end of the node iterator. Not thread-safe.\n  iterator end() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.end(), nodes.end()),\n        makeGraphNode());\n  }\n\n  typedef iterator local_iterator;\n\n  local_iterator local_begin() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.local_begin(),\n                                    nodes.local_end()),\n        makeGraphNode());\n  }\n\n  local_iterator local_end() {\n    return boost::make_transform_iterator(\n        boost::make_filter_iterator(is_node(), nodes.local_end(),\n                                    nodes.local_end()),\n        makeGraphNode());\n  }\n\n  /**\n   * Returns the number of nodes in the graph. Not thread-safe.\n   */\n  unsigned int size() { return std::distance(begin(), end()); }\n\n  //! 
Returns the size of edge data.\n  size_t sizeOfEdgeData() const { return gNode::EdgeInfo::sizeOfSecond(); }\n\n#ifdef AUX_MAP\n  void allocateFrom(FileGraph& graph, ReadGraphAuxData& aux) {\n    size_t numNodes = graph.size();\n    aux.nodes.allocateInterleaved(numNodes);\n  }\n\n  void constructNodesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                          ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      aux.nodes[*ii] = createNode();\n      addNode(aux.nodes[*ii], galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  void constructOutEdgesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                             ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    auto& map = aux.inNghs.get();\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        auto dstID = graph.getEdgeDst(nn);\n        auto src = aux.nodes[*ii], dst = aux.nodes[dstID];\n        auto e = constructOutEdgeValue(graph, nn, src, dst);\n        if (!Directional || InOut) {\n          map[dstID].push_back({src, e});\n        }\n      }\n    }\n  }\n\n  void constructInEdgesFrom(FileGraph& graph, unsigned tid, unsigned total,\n                            const ReadGraphAuxData& aux) {\n    if (!Directional || InOut) {\n      auto r = graph\n                   .divideByNode(sizeof(gNode),\n                                 sizeof(typename gNode::EdgeInfo), tid, total)\n                   .first;\n\n      for (size_t 
i = 0; i < aux.inNghs.numRows(); ++i) {\n        const auto& map = aux.inNghs.get(i);\n        auto ii         = map.lower_bound(*(r.first));  // inclusive begin\n        auto ei         = map.lower_bound(*(r.second)); // exclusive end\n        for (; ii != ei; ++ii) {\n          auto dst = aux.nodes[ii->first];\n          for (const auto& ie : ii->second) {\n            constructInEdgeValue(graph, ie.second, ie.first, dst);\n          }\n        }\n      }\n    }\n  }\n#else\n  void allocateFrom(FileGraph& graph, ReadGraphAuxData& aux) {\n    size_t numNodes = graph.size();\n    aux.allocateInterleaved(numNodes);\n\n    if (!DirectedNotInOut) {\n      galois::do_all(galois::iterate(0ul, aux.size()),\n                     [&](size_t index) { aux.constructAt(index); });\n    }\n  }\n\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<!V> constructNodesFrom(FileGraph& graph, unsigned tid,\n                                          unsigned total,\n                                          ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      auto& auxNode = aux[*ii].get();\n      auxNode.n     = createNode();\n      addNode(auxNode.n, galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<V> constructNodesFrom(FileGraph& graph, unsigned tid,\n                                         unsigned total,\n                                         ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      aux[*ii] = createNode();\n      addNode(aux[*ii], 
galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<!V> constructOutEdgesFrom(FileGraph& graph, unsigned tid,\n                                             unsigned total,\n                                             ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        auto src     = aux[*ii].get().n;\n        auto& dstAux = aux[graph.getEdgeDst(nn)].get();\n        auto e       = constructOutEdgeValue(graph, nn, src, dstAux.n);\n        dstAux.lock.lock();\n        dstAux.inNghs.push_back({src, e});\n        dstAux.lock.unlock();\n      }\n    }\n  }\n\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<V> constructOutEdgesFrom(FileGraph& graph, unsigned tid,\n                                            unsigned total,\n                                            const ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      for (FileGraph::edge_iterator nn = graph.edge_begin(*ii),\n                                    en = graph.edge_end(*ii);\n           nn != en; ++nn) {\n        constructOutEdgeValue(graph, nn, aux[*ii], aux[graph.getEdgeDst(nn)]);\n      }\n    }\n  }\n\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<!V> constructInEdgesFrom(FileGraph& graph, unsigned tid,\n                                            unsigned total,\n                                            
ReadGraphAuxData& aux) {\n    auto r = graph\n                 .divideByNode(sizeof(gNode), sizeof(typename gNode::EdgeInfo),\n                               tid, total)\n                 .first;\n\n    for (FileGraph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n      auto& auxNode = aux[*ii].get();\n      for (auto ie : auxNode.inNghs) {\n        constructInEdgeValue(graph, ie.second, ie.first, auxNode.n);\n      }\n    }\n  }\n\n  template <bool V = DirectedNotInOut>\n  std::enable_if_t<V> constructInEdgesFrom(FileGraph&, unsigned, unsigned,\n                                           ReadGraphAuxData&) {}\n#endif\n};\n\n} // namespace graphs\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/OCGraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_OCGRAPH_H\n#define GALOIS_GRAPHS_OCGRAPH_H\n\n#include <string>\n#include <type_traits>\n\n#include <boost/iterator/counting_iterator.hpp>\n#include <boost/utility.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/substrate/PageAlloc.h\"\n#include \"galois/LazyObject.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/optional.h\"\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Binds the segment parameter of an out-of-core graph so that it can be used in\n * place of a non out-of-core graph.\n */\ntemplate <typename Graph>\nclass BindSegmentGraph : private boost::noncopyable {\n  typedef typename Graph::segment_type segment_type;\n\n  Graph& graph;\n  segment_type segment;\n\npublic:\n  explicit BindSegmentGraph(Graph& g) : graph(g) {}\n  
BindSegmentGraph(Graph& g, segment_type s) : graph(g), segment(s) {}\n\n  void setSegment(const segment_type& s) { segment = s; }\n\n  typedef typename Graph::GraphNode GraphNode;\n  typedef typename Graph::edge_data_type edge_data_type;\n  typedef typename Graph::node_data_type node_data_type;\n  typedef typename Graph::edge_data_reference edge_data_reference;\n  typedef typename Graph::node_data_reference node_data_reference;\n  typedef typename Graph::edge_iterator edge_iterator;\n  typedef typename Graph::in_edge_iterator in_edge_iterator;\n  typedef typename Graph::iterator iterator;\n  typedef typename Graph::const_iterator const_iterator;\n  typedef typename Graph::local_iterator local_iterator;\n  typedef typename Graph::const_local_iterator const_local_iterator;\n\n  node_data_reference getData(GraphNode N,\n                              MethodFlag mflag = MethodFlag::WRITE) {\n    return graph.getData(N, mflag);\n  }\n\n  edge_data_reference getEdgeData(edge_iterator ni,\n                                  MethodFlag mflag = MethodFlag::UNPROTECTED) {\n    return graph.getEdgeData(segment, ni, mflag);\n  }\n\n  GraphNode getEdgeDst(edge_iterator ni) {\n    return graph.getEdgeDst(segment, ni);\n  }\n\n  size_t size() const { return graph.size(); }\n  size_t sizeEdges() const { return graph.sizeEdges(); }\n\n  iterator begin() const { return graph.begin(); }\n  iterator end() const { return graph.end(); }\n\n  local_iterator local_begin() const { return graph.local_begin(); }\n  local_iterator local_end() const { return graph.local_end(); }\n\n  edge_iterator edge_begin(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return graph.edge_begin(segment, N, mflag);\n  }\n\n  edge_iterator edge_end(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return graph.edge_end(segment, N, mflag);\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return 
internal::make_no_deref_range(edge_begin(N, mflag),\n                                         edge_end(N, mflag));\n  }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>>\n  out_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return edges(N, mflag);\n  }\n\n  edge_data_reference\n  getInEdgeData(edge_iterator ni, MethodFlag mflag = MethodFlag::UNPROTECTED) {\n    return graph.getInEdgeData(segment, ni, mflag);\n  }\n\n  GraphNode getInEdgeDst(in_edge_iterator ni) {\n    return graph.getInEdgeDst(segment, ni);\n  }\n\n  in_edge_iterator in_edge_begin(GraphNode N,\n                                 MethodFlag mflag = MethodFlag::WRITE) {\n    return graph.in_edge_begin(segment, N, mflag);\n  }\n\n  in_edge_iterator in_edge_end(GraphNode N,\n                               MethodFlag mflag = MethodFlag::WRITE) {\n    return graph.in_edge_end(segment, N, mflag);\n  }\n\n  internal::InEdgesIterator<BindSegmentGraph>\n  in_edges(GraphNode N, MethodFlag mflag = MethodFlag::WRITE) {\n    return internal::InEdgesIterator<BindSegmentGraph>(*this, N, mflag);\n  }\n\n  size_t idFromNode(GraphNode N) { return graph.idFromNode(N); }\n\n  GraphNode nodeFromId(size_t N) { return graph.nodeFromId(N); }\n};\n\n//! 
Like {@link FileGraph} but allows partial loading of the graph.\nclass OCFileGraph : private boost::noncopyable {\npublic:\n  typedef uint32_t GraphNode;\n  typedef boost::counting_iterator<uint32_t> iterator;\n  typedef boost::counting_iterator<uint64_t> edge_iterator;\n  typedef uint64_t* edge_offset_iterator;\n\n  template <typename EdgeTy>\n  struct EdgeReference {\n    typedef typename LazyObject<EdgeTy>::reference type;\n  };\n\nprivate:\n  struct PageSizeConf;\n\n  class Block {\n    friend class OCFileGraph;\n    void* m_mapping;\n    size_t m_length;\n    char* m_data;\n    size_t m_begin;\n    size_t m_sizeof_data;\n\n    void unload();\n    void load(int fd, offset_t offset, size_t begin, size_t len,\n              size_t sizeof_data);\n\n  public:\n    Block() : m_mapping(0) {}\n\n    char* get(size_t index) const {\n      char* p = m_data + (m_sizeof_data * (index - m_begin));\n      assert(p < reinterpret_cast<char*>(m_mapping) + m_length);\n      assert(m_mapping <= p);\n      return p;\n    }\n  };\n\n  struct Segment {\n    Block outs;\n    Block edgeData;\n    bool loaded;\n\n    Segment() : loaded(false) {}\n\n    void unload() {\n      outs.unload();\n      edgeData.unload();\n      loaded = false;\n    }\n  };\n\n  void* masterMapping;\n  int masterFD;\n  size_t masterLength;\n  uint64_t numEdges;\n  uint64_t numNodes;\n  uint64_t* outIdx;\n\npublic:\n  typedef Segment segment_type;\n\n  OCFileGraph()\n      : masterMapping(0), masterFD(-1), numEdges(0), numNodes(0), outIdx(0) {}\n  ~OCFileGraph();\n\n  iterator begin() const { return iterator(0); }\n  iterator end() const { return iterator(numNodes); }\n  size_t size() const { return numNodes; }\n  size_t sizeEdges() const { return numEdges; }\n  edge_iterator edge_begin(GraphNode n) const {\n    return edge_iterator(n == 0 ? 
0 : outIdx[n - 1]);\n  }\n  edge_iterator edge_end(GraphNode n) const { return edge_iterator(outIdx[n]); }\n  edge_offset_iterator edge_offset_begin() const { return outIdx; }\n  edge_offset_iterator edge_offset_end() const { return outIdx + numNodes; }\n\n  template <typename EdgeTy>\n  typename EdgeReference<EdgeTy>::type getEdgeData(\n      const segment_type& s, edge_iterator it,\n      typename std::enable_if<!std::is_same<void, EdgeTy>::value>::type* = 0) {\n    EdgeTy* p = reinterpret_cast<EdgeTy*>(s.edgeData.get(*it));\n    return *p;\n  }\n\n  template <typename EdgeTy>\n  typename EdgeReference<EdgeTy>::type getEdgeData(\n      const segment_type&, edge_iterator,\n      typename std::enable_if<std::is_same<void, EdgeTy>::value>::type* = 0) {\n    return 0;\n  }\n\n  GraphNode getEdgeDst(const segment_type& s, edge_iterator it) {\n    uint32_t* p = reinterpret_cast<uint32_t*>(s.outs.get(*it));\n    return *p;\n  }\n\n  void unload(segment_type& s) {\n    if (!s.loaded)\n      return;\n\n    s.outs.unload();\n    s.edgeData.unload();\n    s.loaded = false;\n  }\n\n  void load(segment_type& s, edge_iterator begin, edge_iterator end,\n            size_t sizeof_data);\n\n  void fromFile(const std::string& fname);\n};\n\nstruct read_oc_immutable_edge_graph_tag {};\n\ntemplate <typename NodeTy, typename EdgeTy, bool HasNoLockable = false,\n          // bool UseNumaAlloc=false, // XXX: implement this\n          bool HasOutOfLineLockable = false>\nclass OCImmutableEdgeGraph\n    : private internal::LocalIteratorFeature<false>,\n      private internal::OutOfLineLockableFeature<HasOutOfLineLockable &&\n                                                 !HasNoLockable> {\npublic:\n  template <bool _has_id>\n  struct with_id {\n    typedef OCImmutableEdgeGraph type;\n  };\n\n  template <typename _node_data>\n  struct with_node_data {\n    typedef OCImmutableEdgeGraph<_node_data, EdgeTy, HasNoLockable,\n                                 HasOutOfLineLockable>\n        
type;\n  };\n\n  template <typename _edge_data>\n  struct with_edge_data {\n    typedef OCImmutableEdgeGraph<NodeTy, _edge_data, HasNoLockable,\n                                 HasOutOfLineLockable>\n        type;\n  };\n\n  template <bool _has_no_lockable>\n  struct with_no_lockable {\n    typedef OCImmutableEdgeGraph<NodeTy, EdgeTy, _has_no_lockable,\n                                 HasOutOfLineLockable>\n        type;\n  };\n\n  template <bool _use_numa_alloc>\n  struct with_numa_alloc {\n    typedef OCImmutableEdgeGraph type;\n  };\n\n  template <bool _has_out_of_line_lockable>\n  struct with_out_of_line_lockable {\n    typedef OCImmutableEdgeGraph<NodeTy, EdgeTy, HasNoLockable,\n                                 _has_out_of_line_lockable>\n        type;\n  };\n\n  typedef read_oc_immutable_edge_graph_tag read_tag;\n\nprivate:\n  typedef internal::NodeInfoBase<NodeTy,\n                                 !HasNoLockable && !HasOutOfLineLockable>\n      NodeInfo;\n  typedef LargeArray<NodeInfo> NodeData;\n\n  NodeData nodeData;\n  OCFileGraph outGraph;\n  OCFileGraph inGraphStorage;\n  OCFileGraph* inGraph;\n\n  uint64_t numNodes;\n  uint64_t numEdges;\n\npublic:\n  typedef int tt_is_segmented;\n\n  typedef typename OCFileGraph::GraphNode GraphNode;\n  typedef EdgeTy edge_data_type;\n  typedef edge_data_type file_edge_data_type;\n  typedef NodeTy node_data_type;\n  typedef typename OCFileGraph::template EdgeReference<EdgeTy>::type\n      edge_data_reference;\n  typedef typename NodeInfo::reference node_data_reference;\n  typedef typename OCFileGraph::edge_iterator edge_iterator;\n  typedef edge_iterator in_edge_iterator;\n  typedef typename OCFileGraph::iterator iterator;\n  typedef iterator const_iterator;\n  typedef boost::counting_iterator<GraphNode> local_iterator;\n  typedef local_iterator const_local_iterator;\n\n  class segment_type {\n    template <typename, typename, bool, bool>\n    friend class OCImmutableEdgeGraph;\n    OCFileGraph::segment_type out;\n  
  OCFileGraph::segment_type in;\n    iterator nodeBegin;\n    iterator nodeEnd;\n\n  public:\n    //! Returns true if segment has been loaded into memory\n    bool loaded() const { return out.loaded; }\n    //! Returns true if segment represents a non-empty range\n    explicit operator bool() { return nodeBegin != nodeEnd; }\n    size_t size() const { return std::distance(nodeBegin, nodeEnd); }\n    bool containsNode(size_t n) const { // XXX: hack\n      return *nodeBegin <= n && n < *nodeEnd;\n    }\n  };\n\nprivate:\n  galois::optional<segment_type> memorySegment;\n\n  segment_type computeSegment(size_t startNode, size_t numEdges) {\n    typedef typename OCFileGraph::edge_offset_iterator edge_offset_iterator;\n\n    segment_type ret;\n\n    edge_offset_iterator outStart = outGraph.edge_offset_begin();\n    edge_offset_iterator outEnd   = outGraph.edge_offset_end();\n    std::advance(outStart, startNode);\n    if (outStart == outEnd) {\n      ret.nodeBegin = ret.nodeEnd = iterator(0);\n      return ret;\n    }\n    edge_offset_iterator outNext =\n        std::lower_bound(outStart + 1, outEnd, *outStart + numEdges);\n    ptrdiff_t outNodes = std::distance(outStart, outNext);\n\n    edge_offset_iterator inStart = inGraph->edge_offset_begin();\n    edge_offset_iterator inEnd   = inGraph->edge_offset_end();\n    std::advance(inStart, startNode);\n    edge_offset_iterator inNext =\n        std::lower_bound(inStart + 1, inEnd, *inStart + numEdges);\n    ptrdiff_t inNodes = std::distance(inStart, inNext);\n\n    ptrdiff_t nodes = std::min(outNodes, inNodes);\n\n    ret.nodeBegin = iterator(startNode);\n    ret.nodeEnd   = iterator(startNode + nodes);\n    return ret;\n  }\n\n  void load(segment_type& seg, size_t sizeof_data) {\n    outGraph.load(seg.out, outGraph.edge_begin(*seg.nodeBegin),\n                  outGraph.edge_end(seg.nodeEnd[-1]), sizeof_data);\n    if (inGraph != &outGraph)\n      inGraph->load(seg.in, inGraph->edge_begin(*seg.nodeBegin),\n                 
   inGraph->edge_end(seg.nodeEnd[-1]), sizeof_data);\n    else\n      seg.in = seg.out;\n  }\n\n  template <bool _A1 = HasNoLockable, bool _A2 = HasOutOfLineLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<!_A1 && !_A2>::type* = 0) {\n    galois::runtime::acquire(&nodeData[N], mflag);\n  }\n\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode N, MethodFlag mflag,\n                   typename std::enable_if<_A1 && !_A2>::type* = 0) {\n    this->outOfLineAcquire(idFromNode(N), mflag);\n  }\n\n  template <bool _A1 = HasOutOfLineLockable, bool _A2 = HasNoLockable>\n  void acquireNode(GraphNode, MethodFlag,\n                   typename std::enable_if<_A2>::type* = 0) {}\n\npublic:\n  ~OCImmutableEdgeGraph() {\n    if (memorySegment) {\n      outGraph.unload(memorySegment->out);\n      if (inGraph != &outGraph)\n        inGraph->unload(memorySegment->in);\n    }\n  }\n\n  void keepInMemory() {\n    memorySegment = galois::optional<segment_type>(computeSegment(0, numEdges));\n    load(*memorySegment, LazyObject<EdgeTy>::size_of::value);\n  }\n\n  /**\n   * Returns a segment starting from the beginning of the graph with either\n   * (1) some number of nodes with all their edges but no more than numEdges\n   * else (2) one node and all its edges.\n   */\n  segment_type nextSegment(size_t edges) {\n    if (memorySegment)\n      return *memorySegment;\n    else\n      return computeSegment(0, edges);\n  }\n\n  /**\n   * Returns the next segment after cur.\n   */\n  segment_type nextSegment(const segment_type& cur, size_t edges) {\n    return computeSegment(*cur.nodeEnd, edges);\n  }\n\n  void load(segment_type& seg) {\n    if (memorySegment)\n      return;\n\n    load(seg, LazyObject<EdgeTy>::size_of::value);\n  }\n\n  void unload(segment_type& seg) {\n    if (memorySegment)\n      return;\n\n    outGraph.unload(seg.out);\n    if (inGraph != &outGraph)\n      
inGraph->unload(seg.in);\n  }\n\n  iterator begin(const segment_type& cur) { return cur.nodeBegin; }\n  iterator end(const segment_type& cur) { return cur.nodeEnd; }\n\n  node_data_reference getData(GraphNode N,\n                              MethodFlag mflag = MethodFlag::WRITE) {\n    // galois::runtime::checkWrite(mflag, false);\n    NodeInfo& NI = nodeData[N];\n    acquireNode(N, mflag);\n    return NI.getData();\n  }\n\n  edge_data_reference\n  getEdgeData(const segment_type& segment, edge_iterator ni,\n              MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::UNPROTECTED) {\n    // galois::runtime::checkWrite(mflag, false);\n    return outGraph.getEdgeData<EdgeTy>(segment.out, ni);\n  }\n\n  GraphNode getEdgeDst(const segment_type& segment, edge_iterator ni) {\n    return outGraph.getEdgeDst(segment.out, ni);\n  }\n\n  size_t size() const { return numNodes; }\n  size_t sizeEdges() const { return numEdges; }\n\n  iterator begin() const { return outGraph.begin(); }\n  iterator end() const { return outGraph.end(); }\n\n  const_local_iterator local_begin() const {\n    return const_local_iterator(this->localBegin(numNodes));\n  }\n  const_local_iterator local_end() const {\n    return const_local_iterator(this->localEnd(numNodes));\n  }\n  local_iterator local_begin() {\n    return local_iterator(this->localBegin(numNodes));\n  }\n  local_iterator local_end() {\n    return local_iterator(this->localEnd(numNodes));\n  }\n\n  edge_iterator edge_begin(const segment_type& segment, GraphNode N,\n                           MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    if (galois::runtime::shouldLock(mflag)) {\n      for (edge_iterator ii = outGraph.edge_begin(N), ee = outGraph.edge_end(N);\n           ii != ee; ++ii) {\n        acquireNode(outGraph.getEdgeDst(segment.out, *ii), mflag);\n      }\n    }\n    return outGraph.edge_begin(N);\n  }\n\n  edge_iterator edge_end(const segment_type&, GraphNode N,\n                         MethodFlag 
mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    return outGraph.edge_end(N);\n  }\n\n  edge_data_reference\n  getInEdgeData(const segment_type& segment, edge_iterator ni,\n                MethodFlag GALOIS_UNUSED(mflag) = MethodFlag::UNPROTECTED) {\n    // galois::runtime::checkWrite(mflag, false);\n    return inGraph->getEdgeData<EdgeTy>(segment.in, ni);\n  }\n\n  GraphNode getInEdgeDst(const segment_type& segment, in_edge_iterator ni) {\n    return inGraph->getEdgeDst(segment.in, ni);\n  }\n\n  in_edge_iterator in_edge_begin(const segment_type& segment, GraphNode N,\n                                 MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    if (galois::runtime::shouldLock(mflag)) {\n      for (in_edge_iterator ii = inGraph->edge_begin(N),\n                            ee = inGraph->edge_end(N);\n           ii != ee; ++ii) {\n        acquireNode(inGraph->getEdgeDst(segment.in, ii), mflag);\n      }\n    }\n    return inGraph->edge_begin(N);\n  }\n\n  in_edge_iterator in_edge_end(const segment_type&, GraphNode N,\n                               MethodFlag mflag = MethodFlag::WRITE) {\n    acquireNode(N, mflag);\n    return inGraph->edge_end(N);\n  }\n\n  size_t idFromNode(GraphNode N) { return N; }\n\n  GraphNode nodeFromId(size_t N) { return N; }\n\n  //! 
Assumes that the graph is symmetric\n  void createFrom(const std::string& fname) {\n    outGraph.fromFile(fname);\n    numNodes = outGraph.size();\n    numEdges = outGraph.sizeEdges();\n    nodeData.create(numNodes);\n    inGraph = &outGraph;\n    this->outOfLineAllocateInterleaved(numNodes);\n    for (size_t i = 0; i < numNodes; ++i)\n      this->outOfLineConstructAt(i);\n  }\n\n  void createFrom(const std::string& fname, const std::string& transpose) {\n    outGraph.fromFile(fname);\n    inGraphStorage.fromFile(transpose);\n    numNodes = outGraph.size();\n    if (numNodes != inGraphStorage.size())\n      GALOIS_DIE(\n          \"graph does not have the same number of nodes as its transpose\");\n    numEdges = outGraph.sizeEdges();\n    nodeData.create(numNodes);\n    inGraph = &inGraphStorage;\n    this->outOfLineAllocateInterleaved(numNodes);\n    for (size_t i = 0; i < numNodes; ++i)\n      this->outOfLineConstructAt(i);\n  }\n};\n\ntemplate <typename GraphTy, typename... Args>\nvoid readGraphDispatch(GraphTy& graph, read_oc_immutable_edge_graph_tag,\n                       Args&&... args) {\n  graph.createFrom(std::forward<Args>(args)...);\n}\n\n} // namespace graphs\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/OfflineGraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef _GALOIS_DIST_OFFLINE_GRAPH_\n#define _GALOIS_DIST_OFFLINE_GRAPH_\n\n#include <cstdint>\n#include <fstream>\n#include <iostream>\n#include <mutex>\n#include <numeric>\n\n#include <fcntl.h>\n#include <sys/mman.h>\n\n#include <boost/iterator/counting_iterator.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/graphs/GraphHelpers.h\"\n#include \"galois/substrate/SimpleLock.h\"\n\nnamespace galois {\nnamespace graphs {\n\n// File format V1:\n// version (1) {uint64_t LE}\n// EdgeType size {uint64_t LE}\n// numNodes {uint64_t LE}\n// numEdges {uint64_t LE}\n// outindexs[numNodes] {uint64_t LE} (outindex[nodeid] is index of first edge\n// for nodeid + 1 (end interator.  
node 0 has an implicit start iterator of 0.\n// outedges[numEdges] {uint32_t LE}\n// potential padding (32bit max) to Re-Align to 64bits\n// EdgeType[numEdges] {EdgeType size}\n\n// File format V2:\n// version (2) {uint64_t LE}\n// EdgeType size {uint64_t LE}\n// numNodes {uint64_t LE}\n// numEdges {uint64_t LE}\n// outindexs[numNodes] {uint64_t LE} (outindex[nodeid] is index of first edge\n// for nodeid + 1 (end interator.  node 0 has an implicit start iterator of 0.\n// outedges[numEdges] {uint64_t LE}\n// EdgeType[numEdges] {EdgeType size}\n\nclass OfflineGraph {\n  std::ifstream fileEdgeDst, fileIndex, fileEdgeData;\n  std::streamoff locEdgeDst, locIndex, locEdgeData;\n\n  uint64_t numNodes;\n  uint64_t numEdges;\n  uint64_t sizeEdgeData;\n  size_t length;\n  bool v2;\n  uint64_t numSeeksEdgeDst, numSeeksIndex, numSeeksEdgeData;\n  uint64_t numBytesReadEdgeDst, numBytesReadIndex, numBytesReadEdgeData;\n\n  galois::substrate::SimpleLock lock;\n\n  uint64_t outIndexs(uint64_t node) {\n    std::lock_guard<decltype(lock)> lg(lock);\n    std::streamoff pos = (4 + node) * sizeof(uint64_t);\n\n    // move to correct position in file\n    if (locEdgeDst != pos) {\n      numSeeksEdgeDst++;\n      fileEdgeDst.seekg(pos, fileEdgeDst.beg);\n      locEdgeDst = pos;\n    }\n\n    // read the value\n    uint64_t retval;\n    try {\n      fileEdgeDst.read(reinterpret_cast<char*>(&retval), sizeof(uint64_t));\n    } catch (const std::ifstream::failure& e) {\n      std::cerr << \"Exception while reading edge destinations:\" << e.what()\n                << \"\\n\";\n      std::cerr << \"IO error flags: EOF \" << fileEdgeDst.eof() << \" FAIL \"\n                << fileEdgeDst.fail() << \" BAD \" << fileEdgeDst.bad() << \"\\n\";\n    }\n\n    // metadata update\n    auto numBytesRead = fileEdgeDst.gcount();\n    assert(numBytesRead == sizeof(uint64_t));\n    locEdgeDst += numBytesRead;\n    numBytesReadEdgeDst += numBytesRead;\n\n    return retval;\n  }\n\n  uint64_t 
outEdges(uint64_t edge) {\n    std::lock_guard<decltype(lock)> lg(lock);\n    std::streamoff pos = (4 + numNodes) * sizeof(uint64_t) +\n                         edge * (v2 ? sizeof(uint64_t) : sizeof(uint32_t));\n\n    // move to correct position\n    if (locIndex != pos) {\n      numSeeksIndex++;\n      fileIndex.seekg(pos, fileEdgeDst.beg);\n      locIndex = pos;\n    }\n\n    // v2 reads 64 bits, v1 reads 32 bits\n    if (v2) {\n      uint64_t retval;\n      try {\n        fileIndex.read(reinterpret_cast<char*>(&retval), sizeof(uint64_t));\n      } catch (const std::ifstream::failure& e) {\n        std::cerr << \"Exception while reading index:\" << e.what() << \"\\n\";\n        std::cerr << \"IO error flags: EOF \" << fileIndex.eof() << \" FAIL \"\n                  << fileIndex.fail() << \" BAD \" << fileIndex.bad() << \"\\n\";\n      }\n\n      auto numBytesRead = fileIndex.gcount();\n      assert(numBytesRead == sizeof(uint64_t));\n      locIndex += numBytesRead;\n      numBytesReadIndex += numBytesRead;\n      return retval;\n    } else {\n      uint32_t retval;\n      try {\n        fileIndex.read(reinterpret_cast<char*>(&retval), sizeof(uint32_t));\n      } catch (const std::ifstream::failure& e) {\n        std::cerr << \"Exception while reading index:\" << e.what() << \"\\n\";\n        std::cerr << \"IO error flags: EOF \" << fileIndex.eof() << \" FAIL \"\n                  << fileIndex.fail() << \" BAD \" << fileIndex.bad() << \"\\n\";\n      }\n\n      auto numBytesRead = fileIndex.gcount();\n      assert(numBytesRead == sizeof(uint32_t));\n      locIndex += numBytesRead;\n      numBytesReadIndex += numBytesRead;\n      return retval;\n    }\n  }\n\n  template <typename T>\n  T edgeData(uint64_t edge) {\n    assert(sizeof(T) <= sizeEdgeData);\n    std::lock_guard<decltype(lock)> lg(lock);\n    std::streamoff pos = (4 + numNodes) * sizeof(uint64_t) +\n                         numEdges * (v2 ? 
sizeof(uint64_t) : sizeof(uint32_t));\n\n    // align + move to correct position\n    pos = (pos + 7) & ~7;\n    pos += edge * sizeEdgeData;\n\n    if (locEdgeData != pos) {\n      numSeeksEdgeData++;\n      fileEdgeData.seekg(pos, fileEdgeDst.beg);\n      locEdgeData = pos;\n    }\n\n    T retval;\n    try {\n      fileEdgeData.read(reinterpret_cast<char*>(&retval), sizeof(T));\n    } catch (const std::ifstream::failure& e) {\n      std::cerr << \"Exception while reading edge data:\" << e.what() << \"\\n\";\n      std::cerr << \"IO error flags: EOF \" << fileEdgeData.eof() << \" FAIL \"\n                << fileEdgeData.fail() << \" BAD \" << fileEdgeData.bad() << \"\\n\";\n    }\n\n    auto numBytesRead = fileEdgeData.gcount();\n    assert(numBytesRead == sizeof(T));\n    locEdgeData += numBytesRead;\n    numBytesReadEdgeData += numBytesRead;\n    /*fprintf(stderr, \"READ:: %ld[\", edge);\n    for(int i=0; i<sizeof(T); ++i){\n       fprintf(stderr, \"%c\", reinterpret_cast<char*>(&retval)[i]);\n    }\n    fprintf(stderr, \"]\");*/\n    return retval;\n  }\n\npublic:\n  typedef boost::counting_iterator<uint64_t> iterator;\n  typedef boost::counting_iterator<uint64_t> edge_iterator;\n  typedef uint64_t GraphNode;\n\n  OfflineGraph(const std::string& name)\n      : fileEdgeDst(name, std::ios_base::binary),\n        fileIndex(name, std::ios_base::binary),\n        fileEdgeData(name, std::ios_base::binary), locEdgeDst(0), locIndex(0),\n        locEdgeData(0), numSeeksEdgeDst(0), numSeeksIndex(0),\n        numSeeksEdgeData(0), numBytesReadEdgeDst(0), numBytesReadIndex(0),\n        numBytesReadEdgeData(0) {\n    if (!fileEdgeDst.is_open() || !fileEdgeDst.good())\n      throw \"Bad filename\";\n    if (!fileIndex.is_open() || !fileIndex.good())\n      throw \"Bad filename\";\n    if (!fileEdgeData.is_open() || !fileEdgeData.good())\n      throw \"Bad filename\";\n\n    fileEdgeDst.exceptions(std::ifstream::eofbit | std::ifstream::failbit |\n                           
std::ifstream::badbit);\n    fileIndex.exceptions(std::ifstream::eofbit | std::ifstream::failbit |\n                         std::ifstream::badbit);\n    fileEdgeData.exceptions(std::ifstream::eofbit | std::ifstream::failbit |\n                            std::ifstream::badbit);\n\n    uint64_t ver = 0;\n\n    try {\n      fileEdgeDst.read(reinterpret_cast<char*>(&ver), sizeof(uint64_t));\n      fileEdgeDst.read(reinterpret_cast<char*>(&sizeEdgeData),\n                       sizeof(uint64_t));\n      fileEdgeDst.read(reinterpret_cast<char*>(&numNodes), sizeof(uint64_t));\n      fileEdgeDst.read(reinterpret_cast<char*>(&numEdges), sizeof(uint64_t));\n    } catch (const std::ifstream::failure& e) {\n      std::cerr << \"Exception while reading graph header:\" << e.what() << \"\\n\";\n      std::cerr << \"IO error flags: EOF \" << fileEdgeDst.eof() << \" FAIL \"\n                << fileEdgeDst.fail() << \" BAD \" << fileEdgeDst.bad() << \"\\n\";\n    }\n\n    if (ver == 0 || ver > 2)\n      throw \"Bad Version\";\n\n    v2 = ver == 2;\n\n    if (!fileEdgeDst)\n      throw \"Out of data\";\n\n    // File length\n    fileEdgeDst.seekg(0, fileEdgeDst.end);\n    length = fileEdgeDst.tellg();\n    if (length < sizeof(uint64_t) * (4 + numNodes) +\n                     (v2 ? 
sizeof(uint64_t) : sizeof(uint32_t)) * numEdges)\n      throw \"File too small\";\n\n    fileEdgeDst.seekg(0, std::ios_base::beg);\n    fileEdgeData.seekg(0, std::ios_base::beg);\n    fileIndex.seekg(0, std::ios_base::beg);\n  }\n\n  uint64_t num_seeks() {\n    // std::cout << \"Seeks :: \" << numSeeksEdgeDst << \" , \" << numSeeksEdgeData\n    //          << \" , \" << numSeeksIndex << \" \\n\";\n    return numSeeksEdgeDst + numSeeksEdgeData + numSeeksIndex;\n  }\n\n  uint64_t num_bytes_read() {\n    // std::cout << \"Bytes read :: \" << numBytesReadEdgeDst << \" , \" <<\n    // numBytesReadEdgeData << \" , \" << numBytesReadIndex << \" \\n\";\n    return numBytesReadEdgeDst + numBytesReadEdgeData + numBytesReadIndex;\n  }\n\n  void reset_seek_counters() {\n    numSeeksEdgeDst = numSeeksEdgeData = numSeeksIndex = 0;\n    numBytesReadEdgeDst = numBytesReadEdgeData = numBytesReadIndex = 0;\n  }\n\n  OfflineGraph(OfflineGraph&&) = default;\n\n  size_t size() const { return numNodes; }\n  size_t sizeEdges() const { return numEdges; }\n  size_t edgeSize() const { return sizeEdgeData; }\n\n  iterator begin() { return iterator(0); }\n  iterator end() { return iterator(numNodes); }\n\n  edge_iterator edge_begin(GraphNode N) {\n    if (N == 0)\n      return edge_iterator(0);\n    else\n      return edge_iterator(outIndexs(N - 1));\n  }\n\n  edge_iterator edge_end(GraphNode N) { return edge_iterator(outIndexs(N)); }\n\n  GraphNode getEdgeDst(edge_iterator ni) { return outEdges(*ni); }\n\n  runtime::iterable<NoDerefIterator<edge_iterator>> edges(GraphNode N) {\n    return internal::make_no_deref_range(edge_begin(N), edge_end(N));\n  }\n\n  template <typename T>\n  T getEdgeData(edge_iterator ni) {\n    return edgeData<T>(*ni);\n  }\n\n  /**\n   * Accesses the prefix sum on disk.\n   *\n   * @param n Index into edge prefix sum\n   * @returns The value located at index n in the edge prefix sum array\n   */\n  uint64_t operator[](uint64_t n) { return outIndexs(n); }\n\n  // 
typedefs used by divide by node below\n  typedef std::pair<iterator, iterator> NodeRange;\n  typedef std::pair<edge_iterator, edge_iterator> EdgeRange;\n  typedef std::pair<NodeRange, EdgeRange> GraphRange;\n\n  /**\n   * Returns 2 ranges (one for nodes, one for edges) for a particular division.\n   * The ranges specify the nodes/edges that a division is responsible for. The\n   * function attempts to split them evenly among threads given some kind of\n   * weighting\n   *\n   * @param nodeWeight weight to give to a node in division\n   * @param edgeWeight weight to give to an edge in division\n   * @param id Division number you want the ranges for\n   * @param total Total number of divisions\n   * @param scaleFactor Vector specifying if certain divisions should get more\n   * than other divisions\n   */\n  auto divideByNode(size_t nodeWeight, size_t edgeWeight, size_t id,\n                    size_t total,\n                    std::vector<unsigned> scaleFactor = std::vector<unsigned>())\n      -> GraphRange {\n    return galois::graphs::divideNodesBinarySearch<OfflineGraph>(\n        numNodes, numEdges, nodeWeight, edgeWeight, id, total, *this,\n        scaleFactor);\n  }\n};\n\nclass OfflineGraphWriter {\n  std::fstream file;\n  uint64_t numNodes, numEdges;\n  bool smallData;\n  uint64_t ver;\n  std::vector<uint64_t> bufferDst;\n\n  std::deque<uint64_t> edgeOffsets;\n\n  std::streamoff offsetOfDst(uint64_t edge) {\n    return sizeof(uint64_t) * (4 + numNodes + edge);\n  }\n  std::streamoff offsetOfData(uint64_t edge) {\n    return sizeof(uint64_t) * (4 + numNodes + numEdges) +\n           (smallData ? 
sizeof(float) : sizeof(double)) * edge;\n  }\n\n  void setEdge32(uint64_t src, uint64_t offset, uint64_t dst, uint32_t val) {\n    if (src)\n      offset += edgeOffsets[src - 1];\n    file.seekg(offsetOfDst(offset), std::ios_base::beg);\n    file.write(reinterpret_cast<char*>(&dst), sizeof(uint64_t));\n    file.seekg(offsetOfData(offset), std::ios_base::beg);\n    file.write(reinterpret_cast<char*>(&val), sizeof(uint32_t));\n  }\n\n  void setEdge64(uint64_t src, uint64_t offset, uint64_t dst, uint64_t val) {\n    if (src)\n      offset += edgeOffsets[src - 1];\n    file.seekg(offsetOfDst(offset), std::ios_base::beg);\n    file.write(reinterpret_cast<char*>(&dst), sizeof(uint64_t));\n    file.seekg(offsetOfData(offset), std::ios_base::beg);\n    file.write(reinterpret_cast<char*>(&val), sizeof(uint64_t));\n  }\n\n  void setEdge_sorted(uint64_t dst) {\n    if (ver == 1) {\n      uint32_t dst32 = dst;\n      file.write(reinterpret_cast<char*>(&dst32), sizeof(uint32_t));\n    } else {\n      file.write(reinterpret_cast<char*>(&dst), sizeof(uint64_t));\n    }\n  }\n\n  void setEdge_sortedBuffer() {\n    if (ver == 1) {\n      std::vector<uint32_t> tmp(bufferDst.begin(), bufferDst.end());\n      file.write(reinterpret_cast<char*>(&tmp[0]),\n                 (sizeof(uint32_t) * tmp.size()));\n    }\n    file.write(reinterpret_cast<char*>(&bufferDst[0]),\n               (sizeof(uint64_t) * bufferDst.size()));\n  }\n\n  // void setEdge64_sorted(uint64_t dst) {\n  // file.write(reinterpret_cast<char*>(&dst), sizeof(uint32_t));\n  //}\n\npublic:\n  OfflineGraphWriter(const std::string& name, bool use32 = false,\n                     uint64_t _numNodes = 0, uint64_t _numEdges = 0)\n      : file(name, std::ios_base::in | std::ios_base::out |\n                       std::ios_base::binary | std::ios_base::trunc),\n        numNodes(_numNodes), numEdges(_numEdges), smallData(use32), ver(1) {\n    if (!file.is_open() || !file.good())\n      throw \"Bad filename\";\n    uint64_t 
etSize = smallData ? sizeof(float) : sizeof(double);\n    file.write(reinterpret_cast<char*>(&ver), sizeof(uint64_t));\n    file.write(reinterpret_cast<char*>(&etSize), sizeof(uint64_t));\n    file.write(reinterpret_cast<char*>(&numNodes), sizeof(uint64_t));\n    file.write(reinterpret_cast<char*>(&numEdges), sizeof(uint64_t));\n    file.seekg(0, std::ios_base::beg);\n  }\n\n  ~OfflineGraphWriter() {}\n\n  // sets the number of nodes and edges.  points to an container of edge counts\n  void setCounts(std::deque<uint64_t> edgeCounts) {\n    edgeOffsets = std::move(edgeCounts);\n    numNodes    = edgeOffsets.size();\n    numEdges    = std::accumulate(edgeOffsets.begin(), edgeOffsets.end(), 0);\n    std::cout << \" NUM EDGES  : \" << numEdges << \"\\n\";\n    std::partial_sum(edgeOffsets.begin(), edgeOffsets.end(),\n                     edgeOffsets.begin());\n    // Nodes are greater than 2^32 so need ver = 2.\n    if (numNodes >= 4294967296) {\n      ver = 2;\n    } else {\n      ver = 1;\n    }\n    std::cout << \" USING VERSION : \" << ver << \"\\n\";\n    uint64_t etSize = 0; // smallData ? 
sizeof(float) : sizeof(double);\n    file.seekg(0, std::ios_base::beg);\n    file.write(reinterpret_cast<char*>(&ver), sizeof(uint64_t));\n    file.write(reinterpret_cast<char*>(&etSize), sizeof(uint64_t));\n    // file.seekg(sizeof(uint64_t)*2, std::ios_base::beg);\n    file.write(reinterpret_cast<char*>(&numNodes), sizeof(uint64_t));\n    file.write(reinterpret_cast<char*>(&numEdges), sizeof(uint64_t));\n    for (auto i : edgeOffsets)\n      file.write(reinterpret_cast<char*>(&i), sizeof(uint64_t));\n    file.seekg(0, std::ios_base::beg);\n  }\n\n  void setEdge(uint64_t src, uint64_t offset, uint64_t dst, uint64_t val) {\n    if (smallData)\n      setEdge32(src, offset, dst, val);\n    else\n      setEdge64(src, offset, dst, val);\n  }\n\n  void setEdgeSorted(uint64_t dst) { setEdge_sorted(dst); }\n\n  void seekEdgesDstStart() { file.seekg(offsetOfDst(0), std::ios_base::beg); }\n};\n\n} // namespace graphs\n} // namespace galois\n\n#endif //_GALOIS_DIST_OFFLINE_GRAPH_\n"
  },
  {
    "path": "libgalois/include/galois/graphs/ReadGraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_READGRAPH_H\n#define GALOIS_GRAPHS_READGRAPH_H\n\n#include \"galois/config.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/Details.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/Timer.h\"\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Allocates and constructs a graph from a file. Tries to balance\n * memory evenly across system. Cannot be called during parallel\n * execution.\n */\ntemplate <typename GraphTy, typename... Args>\nvoid readGraph(GraphTy& graph, Args&&... 
args) {\n  typename GraphTy::read_tag tag;\n  readGraphDispatch(graph, tag, std::forward<Args>(args)...);\n}\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_default_graph_tag tag,\n                       const std::string& filename,\n                       const bool readUnweighted = false) {\n  FileGraph f;\n  if (readUnweighted) {\n    //! If user specifies that the input graph is unweighted,\n    //! the file graph also should be aware of this.\n    //! Note that the application still could use the edge data array.\n    f.fromFileInterleaved<void>(filename);\n  } else {\n    f.fromFileInterleaved<typename GraphTy::file_edge_data_type>(filename);\n  }\n  readGraphDispatch(graph, tag, f, readUnweighted);\n}\n\ntemplate <typename GraphTy>\nstruct ReadGraphConstructFrom {\n  GraphTy& graph;\n  FileGraph& f;\n  bool readUnweighted = false;\n  ReadGraphConstructFrom(GraphTy& g, FileGraph& _f) : graph(g), f(_f) {}\n  ReadGraphConstructFrom(GraphTy& g, FileGraph& _f, bool _readUnweighted)\n      : graph(g), f(_f), readUnweighted(_readUnweighted) {}\n  void operator()(unsigned tid, unsigned total) {\n    graph.constructFrom(f, tid, total, readUnweighted);\n  }\n};\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_default_graph_tag, FileGraph& f,\n                       const bool readUnweighted = false) {\n  graph.allocateFrom(f);\n\n  ReadGraphConstructFrom<GraphTy> reader(graph, f, readUnweighted);\n  galois::on_each(reader);\n}\n\ntemplate <typename GraphTy, typename Aux>\nstruct ReadGraphConstructNodesFrom {\n  GraphTy& graph;\n  FileGraph& f;\n  Aux& aux;\n  ReadGraphConstructNodesFrom(GraphTy& g, FileGraph& _f, Aux& a)\n      : graph(g), f(_f), aux(a) {}\n  void operator()(unsigned tid, unsigned total) {\n    graph.constructNodesFrom(f, tid, total, aux);\n  }\n};\n\ntemplate <typename GraphTy, typename Aux>\nstruct ReadGraphConstructEdgesFrom {\n  GraphTy& graph;\n  FileGraph& f;\n  Aux& aux;\n  
ReadGraphConstructEdgesFrom(GraphTy& g, FileGraph& _f, Aux& a)\n      : graph(g), f(_f), aux(a) {}\n  void operator()(unsigned tid, unsigned total) {\n    graph.constructEdgesFrom(f, tid, total, aux);\n  }\n};\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_with_aux_graph_tag tag,\n                       const std::string& filename) {\n  FileGraph f;\n  f.fromFileInterleaved<typename GraphTy::file_edge_data_type>(filename);\n  readGraphDispatch(graph, tag, f);\n}\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_with_aux_graph_tag, FileGraph& f) {\n  typedef typename GraphTy::ReadGraphAuxData Aux;\n\n  Aux aux;\n  graph.allocateFrom(f, aux);\n\n  ReadGraphConstructNodesFrom<GraphTy, Aux> nodeReader(graph, f, aux);\n  galois::on_each(nodeReader);\n\n  ReadGraphConstructEdgesFrom<GraphTy, Aux> edgeReader(graph, f, aux);\n  galois::on_each(edgeReader);\n}\n\ntemplate <typename GraphTy, typename Aux>\nstruct ReadGraphConstructOutEdgesFrom {\n  GraphTy& graph;\n  FileGraph& f;\n  Aux& aux;\n  ReadGraphConstructOutEdgesFrom(GraphTy& g, FileGraph& _f, Aux& a)\n      : graph(g), f(_f), aux(a) {}\n  void operator()(unsigned tid, unsigned total) {\n    graph.constructOutEdgesFrom(f, tid, total, aux);\n  }\n};\n\ntemplate <typename GraphTy, typename Aux>\nstruct ReadGraphConstructInEdgesFrom {\n  GraphTy& graph;\n  FileGraph& f;\n  Aux& aux;\n  ReadGraphConstructInEdgesFrom(GraphTy& g, FileGraph& _f, Aux& a)\n      : graph(g), f(_f), aux(a) {}\n  void operator()(unsigned tid, unsigned total) {\n    graph.constructInEdgesFrom(f, tid, total, aux);\n  }\n};\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_with_aux_first_graph_tag,\n                       FileGraph& f) {\n  typedef typename GraphTy::ReadGraphAuxData Aux;\n  constexpr static const bool profile = false;\n\n  galois::CondStatTimer<profile> TAlloc(\"AllocateAux\");\n  TAlloc.start();\n  Aux* auxPtr = new Aux;\n  graph.allocateFrom(f, 
*auxPtr);\n  TAlloc.stop();\n\n  galois::CondStatTimer<profile> TNode(\"ConstructNode\");\n  TNode.start();\n  ReadGraphConstructNodesFrom<GraphTy, Aux> nodeReader(graph, f, *auxPtr);\n  galois::on_each(nodeReader);\n  TNode.stop();\n\n  galois::CondStatTimer<profile> TOutEdge(\"ConstructOutEdge\");\n  TOutEdge.start();\n  ReadGraphConstructOutEdgesFrom<GraphTy, Aux> outEdgeReader(graph, f, *auxPtr);\n  galois::on_each(outEdgeReader);\n  TOutEdge.stop();\n\n  galois::CondStatTimer<profile> TInEdge(\"ConstructInEdge\");\n  TInEdge.start();\n  ReadGraphConstructInEdgesFrom<GraphTy, Aux> inEdgeReader(graph, f, *auxPtr);\n  galois::on_each(inEdgeReader);\n  TInEdge.stop();\n\n  galois::CondStatTimer<profile> TDestruct(\"DestructAux\");\n  TDestruct.start();\n  delete auxPtr;\n  TDestruct.stop();\n}\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_with_aux_first_graph_tag tag,\n                       const std::string& filename) {\n  FileGraph f;\n  f.fromFileInterleaved<typename GraphTy::file_edge_data_type>(filename);\n  readGraphDispatch(graph, tag, f);\n}\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_lc_inout_graph_tag,\n                       const std::string& f1, const std::string& f2) {\n  graph.createAsymmetric();\n\n  typename GraphTy::out_graph_type::read_tag tag1;\n  readGraphDispatch(graph, tag1, f1);\n\n  typename GraphTy::in_graph_type::read_tag tag2;\n  readGraphDispatch(graph.inGraph, tag2, f2);\n}\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_lc_inout_graph_tag, FileGraph& f1,\n                       FileGraph& f2) {\n  graph.createAsymmetric();\n\n  typename GraphTy::out_graph_type::read_tag tag1;\n  readGraphDispatch(graph, tag1, f1);\n\n  typename GraphTy::in_graph_type::read_tag tag2;\n  readGraphDispatch(graph.inGraph, tag2, f2);\n}\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_lc_inout_graph_tag, FileGraph& f1) {\n  typename 
GraphTy::out_graph_type::read_tag tag1;\n  readGraphDispatch(graph, tag1, f1);\n}\n\ntemplate <typename GraphTy>\nvoid readGraphDispatch(GraphTy& graph, read_lc_inout_graph_tag,\n                       const std::string& f1) {\n  typename GraphTy::out_graph_type::read_tag tag1;\n  readGraphDispatch(graph, tag1, f1);\n}\n\n} // namespace graphs\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/SpatialTree.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_SPATIALTREE_H\n#define GALOIS_GRAPHS_SPATIALTREE_H\n\n#include \"galois/config.h\"\n\nnamespace galois {\nnamespace graphs {\n\n//! Stores sets of objects at specific spatial coordinates in a quad tree.\n//! 
Lookup returns an approximation of the closest item\ntemplate <typename T>\nclass SpatialTree2d {\n  struct Box2d {\n    double xmin;\n    double ymin;\n    double xmax;\n    double ymax;\n\n    double xmid() const { return (xmin + xmax) / 2.0; }\n    double ymid() const { return (ymin + ymax) / 2.0; }\n\n    void decimate(int quad, double midx, double midy) {\n      if (quad & 1)\n        xmin = midx;\n      else\n        xmax = midx;\n      if (quad & 2)\n        ymin = midy;\n      else\n        ymax = midy;\n    }\n  };\n  struct Node {\n    // existing item\n    T val;\n    double x, y;\n\n    // center\n    double midx, midy;\n\n    Node* children[4];\n    // needs c++11: Node(const T& v) :val(v), children({0,0,0,0}) {}\n    Node(const T& v, double _x, double _y) : val(v), x(_x), y(_y) {\n      children[0] = children[1] = children[2] = children[3] = 0;\n    }\n\n    void setCenter(double cx, double cy) {\n      midx = cx;\n      midy = cy;\n    }\n\n    int getQuad(double _x, double _y) {\n      int retval = 0;\n      if (_x > midx)\n        retval += 1;\n      if (_y > midy)\n        retval += 2;\n      return retval;\n    }\n  };\n\n  galois::runtime::FixedSizeAllocator<Node> nodeAlloc;\n\n  Node* root;\n  Box2d bounds;\n\n  // true if x,y is closer to testx, testy than oldx, oldy\n  bool closer(double x, double y, double testx, double testy, double oldx,\n              double oldy) const {\n    double doldx  = x - oldx;\n    double doldy  = y - oldy;\n    double dtestx = x - testx;\n    double dtesty = y - testy;\n    doldx *= doldx;\n    doldy *= doldy;\n    dtestx *= dtestx;\n    dtesty *= dtesty;\n    return (dtestx + dtesty) < (doldx + doldy);\n  }\n\n  /*\n  T* recfind(Node* n, T* best, double bestx, double besty, double x, double y,\n  Box2d b) { if (!n) return best; if (!best) { // || closer(x, y, n->x, n->y,\n  bestx, besty)) { best = &n->val; bestx = n->x; besty = n->y;\n    }\n    int quad = b.getQuad(x,y);\n    b.decimate(quad);\n    return 
recfind(n->children[quad], best, bestx, besty, x, y, b);\n  }\n  */\n\n  T* recfind(Node* n, double x, double y) {\n    Node* best = 0;\n    while (n) {\n      if (!best || closer(x, y, n->x, n->y, best->x, best->y))\n        best = n;\n      //      best = &n->val;\n      int quad = n->getQuad(x, y);\n      n        = n->children[quad];\n    }\n    return &best->val;\n  }\n\n  void recinsert(Node** pos, Box2d b, Node* node) {\n    if (!*pos) {\n      // only do an atomic if it looks empty\n      node->setCenter(b.xmid(), b.ymid());\n      if (__sync_bool_compare_and_swap(pos, 0, node))\n        return; // worked!\n    }\n    // We should recurse\n    int quad = (*pos)->getQuad(node->x, node->y);\n    b.decimate(quad, (*pos)->midx, (*pos)->midy);\n    recinsert(&(*pos)->children[quad], b, node);\n  }\n\n  Node* mkNode(const T& v, double x, double y) {\n    Node* n = nodeAlloc.allocate(1);\n    nodeAlloc.construct(n, Node(v, x, y));\n    return n;\n    // return new Node(v,x,y);\n  }\n\n  void delNode(Node* n) {\n    nodeAlloc.destroy(n);\n    nodeAlloc.deallocate(n, 1);\n    // delete n;\n  }\n\n  void freeTree(Node* n) {\n    if (!n)\n      return;\n    for (int x = 0; x < 4; ++x)\n      freeTree(n->children[x]);\n    delNode(n);\n  }\n\npublic:\n  SpatialTree2d(double xmin = 0.0, double ymin = 0.0, double xmax = 0.0,\n                double ymax = 0.0)\n      : root(0) {\n    init(xmin, ymin, xmax, ymax);\n  }\n\n  ~SpatialTree2d() {\n    freeTree(root);\n    root = 0;\n  }\n\n  void init(double xmin, double ymin, double xmax, double ymax) {\n    bounds.xmin = xmin;\n    bounds.ymin = ymin;\n    bounds.xmax = xmax;\n    bounds.ymax = ymax;\n  }\n\n  //! Returns null if tree is empty\n  T* find(double x, double y) {\n    assert(root);\n    return recfind(root, x, y);\n  }\n\n  //! Insert an element. Will always insert and never roll back and thus must\n  //! 
be used after failsafe point.\n  void insert(double x, double y, const T& v) {\n    recinsert(&root, bounds, mkNode(v, x, y));\n  }\n};\n\n} // namespace graphs\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/graphs/TypeTraits.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GRAPHS_TYPETRAITS_H\n#define GALOIS_GRAPHS_TYPETRAITS_H\n\n#include <boost/mpl/has_xxx.hpp>\n\n#include \"galois/config.h\"\n\nnamespace galois {\nnamespace graphs {\n\nBOOST_MPL_HAS_XXX_TRAIT_DEF(tt_is_segmented)\ntemplate <typename T>\nstruct is_segmented : public has_tt_is_segmented<T> {};\n\n} // namespace graphs\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/gslist.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GSLIST_H\n#define GALOIS_GSLIST_H\n\n#include <type_traits>\n\n#include <boost/iterator/iterator_facade.hpp>\n#include <boost/mpl/if.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/FixedSizeRing.h\"\n#include \"galois/TwoLevelIteratorA.h\"\n\nnamespace galois {\n\ntemplate <typename T, int ChunkSize, bool Concurrent>\nclass gslist_base {\npublic:\n  //! Tag for methods that depend on user to deallocate memory, although gslist\n  //! 
will destroy elements\n  struct promise_to_dealloc {};\n\nprivate:\n  typedef typename boost::mpl::if_c<Concurrent,\n                                    ConcurrentFixedSizeBag<T, ChunkSize>,\n                                    FixedSizeBag<T, ChunkSize>>::type Ring;\n\n  struct Block : public Ring {\n    Block* next;\n    Block() : next() {}\n  };\n\n  template <typename U>\n  class outer_iterator\n      : public boost::iterator_facade<outer_iterator<U>, U,\n                                      boost::forward_traversal_tag> {\n    friend class boost::iterator_core_access;\n    U* cur;\n\n    void increment() { cur = cur->next; }\n\n    template <typename OtherTy>\n    bool equal(const outer_iterator<OtherTy>& o) const {\n      return cur == o.cur;\n    }\n\n    U& dereference() const { return *cur; }\n\n  public:\n    outer_iterator(U* c = 0) : cur(c) {}\n\n    template <typename OtherTy>\n    outer_iterator(const outer_iterator<OtherTy>& o) : cur(o.cur) {}\n  };\n\n  typedef\n      typename boost::mpl::if_c<Concurrent, std::atomic<Block*>, Block*>::type\n          First;\n\n  First first;\n\n  template <typename HeapTy>\n  Block* alloc_block(HeapTy& heap) {\n    return new (heap.allocate(sizeof(Block))) Block();\n  }\n\n  template <typename HeapTy>\n  void free_block(HeapTy& heap, Block* b) {\n    b->~Block();\n    heap.deallocate(b);\n  }\n\n  void free_block(promise_to_dealloc, Block* b) { b->~Block(); }\n\n  template <typename HeapTy, bool C = Concurrent>\n  auto extend_first(HeapTy& heap) -> typename std::enable_if<C>::type {\n    Block* b = alloc_block(heap);\n    while (true) {\n      Block* f = first.load(std::memory_order_relaxed);\n      b->next  = f;\n      if (first.compare_exchange_weak(f, b))\n        return;\n    }\n  }\n\n  template <typename HeapTy, bool C = Concurrent>\n  auto extend_first(HeapTy& heap) -> typename std::enable_if<!C>::type {\n    Block* b = alloc_block(heap);\n    b->next  = first;\n    first    = b;\n  }\n\n  Block* get_first() 
{\n    Block* b = first;\n    return b;\n  }\n\n  const Block* get_first() const {\n    Block* b = first;\n    return b;\n  }\n\n  template <typename U, bool C = Concurrent>\n  auto shrink_first(Block* old_first, U&& arg) ->\n      typename std::enable_if<C>::type {\n    if (first.compare_exchange_strong(old_first, old_first->next)) {\n      // old_first->clear();\n      free_block(std::forward<U>(arg), old_first);\n    }\n  }\n\n  template <typename U, bool C = Concurrent>\n  auto shrink_first(Block* old_first, U&& arg) ->\n      typename std::enable_if<!C>::type {\n    if (first != old_first)\n      return;\n    first = old_first->next;\n    // old_first->clear();\n    free_block(std::forward<U>(arg), old_first);\n  }\n\n  template <typename U>\n  void _clear(U&& arg) {\n    Block* b = get_first();\n    while (b) {\n      shrink_first(b, std::forward<U>(arg));\n      b = get_first();\n    }\n  }\n\n  template <typename U>\n  bool _pop_front(U&& arg) {\n    while (true) {\n      Block* b = get_first();\n      if (!b)\n        return false;\n      if (b->pop_front())\n        return true;\n\n      shrink_first(b, std::forward<U>(arg));\n    }\n  }\n\npublic:\n  //! 
External allocator must be able to allocate this type\n  typedef Block block_type;\n  typedef T value_type;\n  typedef galois::TwoLevelIteratorA<outer_iterator<Block>,\n                                    typename Block::iterator,\n                                    std::forward_iterator_tag, GetBegin, GetEnd>\n      iterator;\n  typedef galois::TwoLevelIteratorA<outer_iterator<const Block>,\n                                    typename Block::const_iterator,\n                                    std::forward_iterator_tag, GetBegin, GetEnd>\n      const_iterator;\n\n  gslist_base() : first(0) {}\n\n  gslist_base(const gslist_base&) = delete;\n  gslist_base& operator=(const gslist_base&) = delete;\n\n  gslist_base(gslist_base&& other) : first(0) { *this = std::move(other); }\n\n  gslist_base& operator=(gslist_base&& o) {\n    Block* m_first = first;\n    Block* o_first = o.first;\n    first          = o_first;\n    o.first        = m_first;\n    return *this;\n  }\n\n  ~gslist_base() {\n    _clear(promise_to_dealloc());\n    // assert(empty() && \"Memory leak if gslist is not empty before\n    // destruction\");\n  }\n\n  iterator begin() {\n    return galois::make_two_level_iterator(outer_iterator<Block>(get_first()),\n                                           outer_iterator<Block>(nullptr))\n        .first;\n  }\n\n  iterator end() {\n    return galois::make_two_level_iterator(outer_iterator<Block>(get_first()),\n                                           outer_iterator<Block>(nullptr))\n        .second;\n  }\n\n  const_iterator begin() const {\n    return galois::make_two_level_iterator(\n               outer_iterator<const Block>(get_first()),\n               outer_iterator<const Block>(nullptr))\n        .first;\n  }\n\n  const_iterator end() const {\n    return galois::make_two_level_iterator(\n               outer_iterator<const Block>(get_first()),\n               outer_iterator<const Block>(nullptr))\n        .second;\n  }\n\n  bool empty() const {\n    
return first == NULL || (get_first()->empty() && get_first()->next == NULL);\n  }\n\n  value_type& front() { return get_first()->front(); }\n\n  const value_type& front() const { return get_first()->front(); }\n\n  template <typename HeapTy, typename... Args, bool C = Concurrent>\n  auto emplace_front(HeapTy& heap, Args&&... args) ->\n      typename std::enable_if<!C>::type {\n    if (!first || first->full())\n      extend_first(heap);\n    first->emplace_front(std::forward<Args>(args)...);\n  }\n\n  template <typename HeapTy, bool C = Concurrent>\n  auto push_front(HeapTy& heap, const value_type& v) ->\n      typename std::enable_if<C>::type {\n    while (true) {\n      Block* b = get_first();\n      if (b && b->push_front(v))\n        return;\n      extend_first(heap);\n    }\n  }\n\n  template <typename HeapTy, typename ValueTy, bool C = Concurrent>\n  auto push_front(HeapTy& heap, ValueTy&& v) ->\n      typename std::enable_if<!C>::type {\n    emplace_front(heap, std::forward<ValueTy>(v));\n  }\n\n  //! Returns true if something was popped\n  template <typename HeapTy>\n  bool pop_front(HeapTy& heap) {\n    return _pop_front(heap);\n  }\n\n  //! Returns true if something was popped\n  bool pop_front(promise_to_dealloc) {\n    return _pop_front(promise_to_dealloc());\n  }\n\n  template <typename HeapTy>\n  void clear(HeapTy& heap) {\n    _clear(heap);\n  }\n\n  void clear(promise_to_dealloc) { _clear(promise_to_dealloc()); }\n};\n\n/**\n * Singly linked list. To conserve space, allocator is maintained external to\n * the list.\n */\ntemplate <typename T, unsigned chunksize = 16>\nusing gslist = gslist_base<T, chunksize, false>;\n\n/**\n * Concurrent linked list. To conserve space, allocator is maintained external\n * to the list. Iteration order is unspecified.\n */\ntemplate <typename T, unsigned chunksize = 16>\nusing concurrent_gslist = gslist_base<T, chunksize, true>;\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/gstl.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_GSTL_H\n#define GALOIS_GSTL_H\n\n#include <algorithm>\n#include <iterator>\n#include <utility>\n#include <cassert>\n#include <vector>\n#include <set>\n#include <deque>\n#include <map>\n#include <list>\n#include <string>\n#include <sstream>\n\n#include \"galois/config.h\"\n#include \"galois/PriorityQueue.h\"\n\nnamespace galois {\n\nnamespace gstl {\n\n//! [define Pow_2_VarSizeAlloc]\ntemplate <typename T>\nusing Pow2Alloc = typename runtime::Pow_2_BlockAllocator<T>;\n//! [define Pow_2_VarSizeAlloc]\n\ntemplate <typename T>\nusing FixedSizeAlloc = typename runtime::FixedSizeAllocator<T>;\n\n//! [STL vector using Pow_2_VarSizeAlloc]\ntemplate <typename T>\nusing Vector = std::vector<T, Pow2Alloc<T>>;\n//! 
[STL vector using Pow_2_VarSizeAlloc]\n\ntemplate <typename T>\nusing Deque = std::deque<T, Pow2Alloc<T>>;\n\ntemplate <typename T>\nusing List = std::list<T, FixedSizeAlloc<T>>;\n\ntemplate <typename T, typename C = std::less<T>>\nusing Set = std::set<T, C, FixedSizeAlloc<T>>;\n\ntemplate <typename K, typename V, typename C = std::less<K>>\nusing Map = std::map<K, V, C, FixedSizeAlloc<std::pair<const K, V>>>;\n\ntemplate <typename K, typename V, typename Hash = std::hash<K>,\n          typename KeyEqual = std::equal_to<K>>\nusing UnorderedMap = std::unordered_map<K, V, Hash, KeyEqual,\n                                        FixedSizeAlloc<std::pair<const K, V>>>;\n\ntemplate <typename T, typename C = std::less<T>>\nusing PQ = MinHeap<T, C, Vector<T>>;\n\nusing Str = std::basic_string<char, std::char_traits<char>, Pow2Alloc<char>>;\n\ntemplate <typename T>\nstruct StrMaker {\n  Str operator()(const T& x) const {\n    std::basic_ostringstream<char, std::char_traits<char>, Pow2Alloc<char>> os;\n    os << x;\n    return Str(os.str());\n  }\n};\n\ntemplate <>\nstruct StrMaker<std::string> {\n  Str operator()(const std::string& x) const { return Str(x.begin(), x.end()); }\n};\n\ntemplate <>\nstruct StrMaker<Str> {\n  const Str& operator()(const Str& x) const { return x; }\n};\n\ntemplate <>\nstruct StrMaker<const char*> {\n  Str operator()(const char* x) const { return Str(x); }\n};\n\ntemplate <typename T>\nStr makeStr(const T& x) {\n  return StrMaker<T>()(x);\n}\n} // end namespace gstl\n\ntemplate <typename I>\nclass IterRange {\n  I m_beg;\n  I m_end;\n\npublic:\n  IterRange(const I& b, const I& e) : m_beg(b), m_end(e) {}\n  const I& begin(void) const { return m_beg; }\n  const I& end(void) const { return m_end; }\n};\n\ntemplate <typename I>\nauto makeIterRange(const I& beg, const I& end) {\n  return IterRange<I>(beg, end);\n}\n\ntemplate <typename C>\nauto makeIterRange(C&& cont) {\n  using I = decltype(std::forward<C>(cont).begin());\n  return 
IterRange<I>(std::forward<C>(cont).begin(),\n                      std::forward<C>(cont).end());\n}\n\nnamespace internal {\n\ntemplate <typename T, typename C>\nstruct SerCont {\n  C m_q;\n\n  explicit SerCont(const C& q = C()) : m_q(q) {}\n\n  void push(const T& x) { m_q.push_back(x); }\n\n  template <typename I>\n  void push(const I& beg, const I& end) {\n    for (I i = beg; i != end; ++i) {\n      push(*i);\n    }\n  }\n\n  template <typename... Args>\n  void emplace(Args&&... args) {\n    m_q.emplace_back(std::forward<Args>(args)...);\n  }\n\n  bool empty(void) const { return m_q.empty(); }\n\n  void clear(void) { m_q.clear(); }\n\n  using value_type     = typename C::value_type;\n  using iterator       = typename C::iterator;\n  using const_iterator = typename C::const_iterator;\n\n  iterator begin(void) { return m_q.begin(); }\n  iterator end(void) { return m_q.end(); }\n\n  const_iterator begin(void) const { return m_q.begin(); }\n  const_iterator end(void) const { return m_q.end(); }\n\n  const_iterator cbegin(void) const { return m_q.cbegin(); }\n  const_iterator cend(void) const { return m_q.cend(); }\n};\n} // namespace internal\n\ntemplate <typename T, typename C = std::deque<T>>\nclass SerFIFO : public internal::SerCont<T, C> {\n\n  using Base = internal::SerCont<T, C>;\n\npublic:\n  explicit SerFIFO(const C& q = C()) : Base(q) {}\n\n  T pop(void) {\n    T ret = Base::m_q.front();\n    Base::m_q.pop_front();\n    return ret;\n  }\n};\n\ntemplate <typename T, typename C = std::vector<T>>\nclass SerStack : public internal::SerCont<T, C> {\n\n  using Base = internal::SerCont<T, C>;\n\npublic:\n  explicit SerStack(const C& q = C()) : Base(q) {}\n\n  T pop(void) {\n    T ret = Base::m_q.back();\n    Base::m_q.pop_back();\n    return ret;\n  }\n};\n\ntemplate <typename IterTy, class Distance>\nIterTy safe_advance_dispatch(IterTy b, IterTy e, Distance n,\n                             std::random_access_iterator_tag) {\n  if (std::distance(b, e) >= n)\n    
return b + n;\n  else\n    return e;\n}\n\ntemplate <typename IterTy, class Distance>\nIterTy safe_advance_dispatch(IterTy b, IterTy e, Distance n,\n                             std::input_iterator_tag) {\n  while (b != e && n--)\n    ++b;\n  return b;\n}\n\n/**\n * Like std::advance but returns end if end is closer than the advance amount.\n */\ntemplate <typename IterTy, class Distance>\nIterTy safe_advance(IterTy b, IterTy e, Distance n) {\n  typename std::iterator_traits<IterTy>::iterator_category category;\n  return safe_advance_dispatch(b, e, n, category);\n}\n\n/**\n * Finds the midpoint of a range.  The first half is always bigger than\n * the second half if the range has an odd length.\n */\ntemplate <typename IterTy>\nIterTy split_range(IterTy b, IterTy e) {\n  std::advance(b, (std::distance(b, e) + 1) / 2);\n  return b;\n}\n\n/**\n * Returns a continuous block from the range based on the number of\n * divisions and the id of the block requested\n */\ntemplate <\n    typename IterTy,\n    typename std::enable_if<!std::is_integral<IterTy>::value>::type* = nullptr>\nstd::pair<IterTy, IterTy> block_range(IterTy b, IterTy e, unsigned id,\n                                      unsigned num) {\n  size_t dist   = std::distance(b, e);\n  size_t numper = std::max((dist + num - 1) / num, (size_t)1); // round up\n  size_t A      = std::min(numper * id, dist);\n  size_t B      = std::min(numper * (id + 1), dist);\n  std::advance(b, A);\n\n  if (dist != B) {\n    e = b;\n    std::advance(e, B - A);\n  }\n\n  return std::make_pair(b, e);\n}\n\ntemplate <typename IntTy, typename std::enable_if<\n                              std::is_integral<IntTy>::value>::type* = nullptr>\nstd::pair<IntTy, IntTy> block_range(IntTy b, IntTy e, unsigned id,\n                                    unsigned num) {\n  IntTy dist   = e - b;\n  IntTy numper = std::max((dist + num - 1) / num, (IntTy)1); // round up\n  IntTy A      = std::min(numper * id, dist);\n  IntTy B      = 
std::min(numper * (id + 1), dist);\n  b += A;\n  if (dist != B) {\n    e = b;\n    e += (B - A);\n  }\n  return std::make_pair(b, e);\n}\n\nnamespace internal {\ntemplate <typename I>\nusing Val_ty = typename std::iterator_traits<I>::value_type;\n} // namespace internal\n\n//! Destroy a range\ntemplate <typename I>\nstd::enable_if_t<!std::is_scalar<internal::Val_ty<I>>::value>\nuninitialized_destroy(I first, I last) {\n\n  using T = internal::Val_ty<I>;\n  for (; first != last; ++first)\n    (&*first)->~T();\n}\n\ntemplate <class I>\nstd::enable_if_t<std::is_scalar<internal::Val_ty<I>>::value>\nuninitialized_destroy(I, I) {}\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/optional.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_OPTIONAL_H\n#define GALOIS_OPTIONAL_H\n\n#include <cassert>\n\n#include \"galois/config.h\"\n#include \"galois/LazyObject.h\"\n\nnamespace galois {\n\n/**\n * Galois version of <code>boost::optional</code>.\n */\ntemplate <typename T>\nclass optional {\n  LazyObject<T> data_;\n  bool initialized_;\n\n  void construct(const T& val) {\n    data_.construct(val);\n    initialized_ = true;\n  }\n\n  void assign_impl(const T& val) { get_impl() = val; }\n\n  void destroy() {\n    if (initialized_) {\n      data_.destroy();\n      initialized_ = false;\n    }\n  }\n\n  T& get_impl() { return data_.get(); }\n  const T& get_impl() const { return data_.get(); }\n\npublic:\n  typedef bool (optional::*unspecified_bool_type)() const;\n\n  optional() : initialized_(false) {}\n\n  optional(const T& val) : 
initialized_(false) { construct(val); }\n\n  optional(const optional& rhs) : initialized_(false) {\n    if (rhs.is_initialized())\n      construct(rhs.get_impl());\n  }\n\n  template <typename U>\n  explicit optional(const optional<U>& rhs) : initialized_(false) {\n    assign(rhs);\n  }\n\n  ~optional() { destroy(); }\n\n  void assign(const optional& rhs) {\n    if (is_initialized()) {\n      if (rhs.is_initialized())\n        assign_impl(rhs.get_impl());\n      else\n        destroy();\n    } else {\n      if (rhs.is_initialized())\n        construct(rhs.get_impl());\n    }\n  }\n\n  template <typename U>\n  void assign(const optional<U>& rhs) {\n    if (is_initialized()) {\n      if (rhs.is_initialized())\n        assign_impl(rhs.get_impl());\n      else\n        destroy();\n    } else {\n      if (rhs.is_initialized())\n        construct(rhs.get_impl());\n    }\n  }\n\n  void assign(const T& val) {\n    if (is_initialized())\n      assign_impl(val);\n    else\n      construct(val);\n  }\n\n  bool is_initialized() const { return initialized_; }\n\n  optional& operator=(const optional& rhs) {\n    assign(rhs);\n    return *this;\n  }\n\n  template <typename U>\n  optional& operator=(const optional<U>& rhs) {\n    assign(rhs);\n    return *this;\n  }\n\n  optional& operator=(const T& val) {\n    assign(val);\n    return *this;\n  }\n\n  T& get() {\n    assert(initialized_);\n    return get_impl();\n  }\n  const T& get() const {\n    assert(initialized_);\n    return get_impl();\n  }\n  T& operator*() { return get(); }\n  const T& operator*() const { return get(); }\n  T* operator->() {\n    assert(initialized_);\n    return &get_impl();\n  }\n  const T* operator->() const {\n    assert(initialized_);\n    return &get_impl();\n  }\n\n  operator unspecified_bool_type() const {\n    return initialized_ ? &optional::is_initialized : 0;\n  }\n};\n\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Context.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_CONTEXT_H\n#define GALOIS_RUNTIME_CONTEXT_H\n\n#include <cassert>\n#include <cstdlib>\n\n#include <boost/utility.hpp>\n\n#include \"galois/config.h\"\n\n#ifdef GALOIS_USE_LONGJMP_ABORT\n#include <csetjmp>\n#endif\n\n#include \"galois/gIO.h\"\n#include \"galois/MethodFlags.h\"\n#include \"galois/substrate/PtrLock.h\"\n\nnamespace galois {\nnamespace runtime {\n\nenum ConflictFlag {\n  CONFLICT         = -1,\n  NO_CONFLICT      = 0,\n  REACHED_FAILSAFE = 1,\n  BREAK            = 2\n};\n\nextern thread_local std::jmp_buf execFrame;\n\nclass Lockable;\n\n[[noreturn]] inline void signalConflict(Lockable* = nullptr) {\n#if defined(GALOIS_USE_LONGJMP_ABORT)\n  std::longjmp(execFrame, CONFLICT);\n  std::abort(); // shouldn't reach here after longjmp\n#elif defined(GALOIS_USE_EXCEPTION_ABORT)\n  throw 
CONFLICT;\n#endif\n}\n\n#ifdef GALOIS_USE_EXP\nbool owns(Lockable* lockable, MethodFlag m);\n#endif\n\n[[noreturn]] inline void signalFailSafe(void) {\n#if defined(GALOIS_USE_LONGJMP_ABORT)\n  std::longjmp(galois::runtime::execFrame, galois::runtime::REACHED_FAILSAFE);\n  std::abort(); // shouldn't reach here after longjmp\n#elif defined(GALOIS_USE_EXCEPTION_ABORT)\n  throw REACHED_FAILSAFE;\n#endif\n}\n\n//! used to release lock over exception path\nstatic inline void clearConflictLock() {}\n\nclass LockManagerBase;\n\n/**\n * All objects that may be locked (nodes primarily) must inherit from\n * Lockable.\n */\nclass Lockable {\n  substrate::PtrLock<LockManagerBase> owner;\n  //! Use an intrusive list to track neighborhood of a context without\n  //! allocation overhead. Works for cases where a Lockable needs to be only in\n  //! one context's neighborhood list\n  Lockable* next;\n  friend class LockManagerBase;\n  friend class SimpleRuntimeContext;\n\npublic:\n  Lockable() : next(0) {}\n};\n\nclass LockManagerBase : private boost::noncopyable {\nprotected:\n  enum AcquireStatus { FAIL, NEW_OWNER, ALREADY_OWNER };\n\n  AcquireStatus tryAcquire(Lockable* lockable);\n\n  inline bool stealByCAS(Lockable* lockable, LockManagerBase* other) {\n    assert(lockable != nullptr);\n    return lockable->owner.stealing_CAS(other, this);\n  }\n\n  inline bool CASowner(Lockable* lockable, LockManagerBase* other) {\n    assert(lockable != nullptr);\n    return lockable->owner.CAS(other, this);\n  }\n\n  inline void setOwner(Lockable* lockable) {\n    assert(lockable != nullptr);\n    assert(!lockable->owner.getValue());\n    lockable->owner.setValue(this);\n  }\n\n  inline void release(Lockable* lockable) {\n    assert(lockable != nullptr);\n    assert(getOwner(lockable) == this);\n    lockable->owner.unlock_and_clear();\n  }\n\n  inline static bool tryLock(Lockable* lockable) {\n    assert(lockable != nullptr);\n    return lockable->owner.try_lock();\n  }\n\n  inline static 
LockManagerBase* getOwner(Lockable* lockable) {\n    assert(lockable != nullptr);\n    return lockable->owner.getValue();\n  }\n};\n\nclass SimpleRuntimeContext : public LockManagerBase {\n  //! The locks we hold\n  Lockable* locks;\n  bool customAcquire;\n\nprotected:\n  friend void doAcquire(Lockable*, galois::MethodFlag);\n\n  static SimpleRuntimeContext* getOwner(Lockable* lockable) {\n    LockManagerBase* owner = LockManagerBase::getOwner(lockable);\n    return static_cast<SimpleRuntimeContext*>(owner);\n  }\n\n  virtual void subAcquire(Lockable* lockable, galois::MethodFlag m);\n\n  void addToNhood(Lockable* lockable) {\n    assert(!lockable->next);\n    lockable->next = locks;\n    locks          = lockable;\n  }\n\n  void acquire(Lockable* lockable, galois::MethodFlag m) {\n    AcquireStatus i;\n    if (customAcquire) {\n      subAcquire(lockable, m);\n    } else if ((i = tryAcquire(lockable)) != AcquireStatus::FAIL) {\n      if (i == AcquireStatus::NEW_OWNER) {\n        addToNhood(lockable);\n      }\n    } else {\n      signalConflict(lockable);\n    }\n  }\n\n  void release(Lockable* lockable);\n\npublic:\n  SimpleRuntimeContext(bool child = false) : locks(0), customAcquire(child) {}\n  virtual ~SimpleRuntimeContext() {}\n\n  void startIteration() { assert(!locks); }\n\n  unsigned cancelIteration();\n  unsigned commitIteration();\n};\n\n//! get the current conflict detection class, may be null if not in parallel\n//! region\nSimpleRuntimeContext* getThreadContext();\n\n//! used by the parallel code to set up conflict detection per thread\nvoid setThreadContext(SimpleRuntimeContext* n);\n\n//! 
Helper function to decide if the conflict detection lock should be taken\ninline bool shouldLock(const galois::MethodFlag g) {\n  // Mask out additional \"optional\" flags\n  switch (g & galois::MethodFlag::INTERNAL_MASK) {\n  case MethodFlag::UNPROTECTED:\n  case MethodFlag::PREVIOUS:\n    return false;\n\n  case MethodFlag::READ:\n  case MethodFlag::WRITE:\n    return true;\n\n  default:\n    // XXX(ddn): Adding error checking code here either upsets the inlining\n    // heuristics or icache behavior. Avoid complex code if possible.\n    // GALOIS_DIE(\"shouldn't get here\");\n    assert(false);\n  }\n  return false;\n}\n\n//! actual locking function.  Will always lock.\ninline void doAcquire(Lockable* lockable, galois::MethodFlag m) {\n  SimpleRuntimeContext* ctx = getThreadContext();\n  if (ctx)\n    ctx->acquire(lockable, m);\n}\n\n//! Master function which handles conflict detection\n//! used to acquire a lockable thing\ninline void acquire(Lockable* lockable, galois::MethodFlag m) {\n  if (shouldLock(m))\n    doAcquire(lockable, m);\n}\n\nstruct AlwaysLockObj {\n  void operator()(Lockable* lockable) const {\n    doAcquire(lockable, galois::MethodFlag::WRITE);\n  }\n};\n\nstruct CheckedLockObj {\n  galois::MethodFlag m;\n  CheckedLockObj(galois::MethodFlag _m) : m(_m) {}\n  void operator()(Lockable* lockable) const { acquire(lockable, m); }\n};\n\n} // namespace runtime\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Executor_Deterministic.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_EXECUTOR_DETERMINISTIC_H\n#define GALOIS_RUNTIME_EXECUTOR_DETERMINISTIC_H\n\n#include <deque>\n#include <queue>\n#include <type_traits>\n\n#include <boost/iterator/counting_iterator.hpp>\n#include <boost/iterator/iterator_facade.hpp>\n#include <boost/iterator/transform_iterator.hpp>\n\n#include \"galois/Bag.h\"\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/gslist.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"galois/runtime/Executor_ForEach.h\"\n#include \"galois/runtime/LoopStatistics.h\"\n#include \"galois/runtime/Mem.h\"\n#include \"galois/runtime/Range.h\"\n#include \"galois/runtime/Statistics.h\"\n#include \"galois/runtime/Substrate.h\"\n#include \"galois/runtime/UserContextAccess.h\"\n#include \"galois/substrate/Termination.h\"\n#include 
\"galois/substrate/ThreadPool.h\"\n#include \"galois/Threads.h\"\n#include \"galois/TwoLevelIteratorA.h\"\n#include \"galois/UnionFind.h\"\n#include \"galois/worklists/WorkList.h\"\n\n// TODO deterministic hash\n// TODO deterministic hash: only give ids to window\n// TODO detect and fail if using releasable objects\n// TODO fixed neighborhood: cyclic scheduling\n// TODO fixed neighborhood: reduce list contention\n// TODO fixed neighborhood: profile, reuse graph\n// TODO fixed neighborhood: still ~2X slower than implicit version on bfs\nnamespace galois {\nnamespace runtime {\n//! Implementation of deterministic execution\nnamespace internal {\n\nextern thread_local SizedHeapFactory::SizedHeap* dagListHeap;\n\ntemplate <typename T, bool UseLocalState>\nclass DItemBase {\npublic:\n  T val;\n  unsigned long id;\n\n  DItemBase(const T& _val, unsigned long _id) : val(_val), id(_id) {}\n  void* getLocalState() const { return nullptr; }\n  void setLocalState(void*) {}\n};\n\ntemplate <typename T>\nclass DItemBase<T, true> {\npublic:\n  T val;\n\nprivate:\n  void* localState;\n\npublic:\n  unsigned long id;\n\n  DItemBase(const T& _val, unsigned long _id)\n      : val(_val), localState(nullptr), id(_id) {}\n  void* getLocalState() const { return localState; }\n  void setLocalState(void* ptr) { localState = ptr; }\n};\n\ntemplate <typename OptionsTy>\nusing DItem =\n    DItemBase<typename OptionsTy::value_type, OptionsTy::useLocalState>;\n\nclass FirstPassBase : public SimpleRuntimeContext {\nprotected:\n  bool firstPassFlag;\n\npublic:\n  explicit FirstPassBase(bool f = true)\n      : SimpleRuntimeContext(true), firstPassFlag(f) {}\n\n  bool isFirstPass(void) const { return firstPassFlag; }\n\n  void setFirstPass(void) { firstPassFlag = true; }\n\n  void resetFirstPass(void) { firstPassFlag = false; }\n\n  virtual void alwaysAcquire(Lockable*, galois::MethodFlag) = 0;\n\n  virtual void subAcquire(Lockable* lockable, galois::MethodFlag f) {\n    if (isFirstPass()) {\n      
alwaysAcquire(lockable, f);\n    }\n  }\n};\n\ntemplate <typename OptionsTy, bool HasFixedNeighborhood, bool HasIntentToRead>\nclass DeterministicContextBase : public FirstPassBase {\npublic:\n  typedef DItem<OptionsTy> Item;\n  Item item;\n\nprivate:\n  bool notReady;\n\npublic:\n  DeterministicContextBase(const Item& _item)\n      : FirstPassBase(true), item(_item), notReady(false) {}\n\n  void clear() {}\n\n  bool isReady() { return !notReady; }\n\n  virtual void alwaysAcquire(Lockable* lockable, galois::MethodFlag) {\n\n    if (this->tryLock(lockable))\n      this->addToNhood(lockable);\n\n    DeterministicContextBase* other;\n    do {\n      other = static_cast<DeterministicContextBase*>(this->getOwner(lockable));\n      if (other == this)\n        return;\n      if (other) {\n        bool conflict = other->item.id < this->item.id;\n        if (conflict) {\n          // A lock that I want but can't get\n          notReady = true;\n          return;\n        }\n      }\n    } while (!this->stealByCAS(lockable, other));\n\n    // Disable loser\n    if (other) {\n      // Only need atomic write\n      other->notReady = true;\n    }\n  }\n\n  static void initialize() {}\n};\n\nclass HasIntentToReadContext : public FirstPassBase {\npublic:\n  unsigned long id;\n  bool notReady;\n  bool isWriter;\n\n  HasIntentToReadContext(unsigned long id, bool w)\n      : FirstPassBase(true), id(id), notReady(false), isWriter(w) {}\n\n  bool isReady() { return !notReady; }\n};\n\nclass ReaderContext : public galois::UnionFindNode<ReaderContext>,\n                      public HasIntentToReadContext {\n  template <typename, bool, bool>\n  friend class DeterministicContextBase;\n\npublic:\n  ReaderContext(unsigned long id)\n      : galois::UnionFindNode<ReaderContext>(const_cast<ReaderContext*>(this)),\n        HasIntentToReadContext(id, false) {}\n\n  void build() {\n    if (this->isReady())\n      return;\n    ReaderContext* r = this->find();\n    if (r->isReady())\n      
r->notReady = true;\n  }\n\n  bool propagate() { return this->find()->isReady(); }\n\n  virtual void alwaysAcquire(Lockable*, galois::MethodFlag) {\n    GALOIS_DIE(\"unreachable\");\n  }\n};\n\ntemplate <typename OptionsTy>\nclass DeterministicContextBase<OptionsTy, false, true>\n    : public HasIntentToReadContext {\npublic:\n  typedef DItem<OptionsTy> Item;\n  Item item;\n\nprivate:\n  ReaderContext readerCtx;\n\n  void acquireRead(Lockable* lockable) {\n    HasIntentToReadContext* other;\n    do {\n      other = static_cast<HasIntentToReadContext*>(this->getOwner(lockable));\n      if (other == this || other == &readerCtx)\n        return;\n      if (other) {\n        bool conflict = other->id < this->id;\n        if (conflict) {\n          if (other->isWriter)\n            readerCtx.notReady = true;\n          else\n            readerCtx.merge(static_cast<ReaderContext*>(other));\n          return;\n        }\n      }\n    } while (!readerCtx.stealByCAS(lockable, other));\n\n    // Disable loser\n    if (other) {\n      if (other->isWriter) {\n        // Only need atomic write\n        other->notReady = true;\n      } else {\n        static_cast<ReaderContext*>(other)->merge(&readerCtx);\n      }\n    }\n  }\n\n  void acquireWrite(Lockable* lockable) {\n    HasIntentToReadContext* other;\n    do {\n      other = static_cast<HasIntentToReadContext*>(this->getOwner(lockable));\n      if (other == this || other == &readerCtx)\n        return;\n      if (other) {\n        bool conflict = other->id < this->id;\n        if (conflict) {\n          // A lock that I want but can't get\n          this->notReady = true;\n          return;\n        }\n      }\n    } while (!this->stealByCAS(lockable, other));\n\n    // Disable loser\n    if (other) {\n      // Only need atomic write\n      other->notReady = true;\n    }\n  }\n\npublic:\n  DeterministicContextBase(const Item& i)\n      : HasIntentToReadContext(i.id, true), item(i), readerCtx(i.id) {}\n\n  void clear() 
{}\n\n  void build() { readerCtx.build(); }\n\n  void propagate() {\n    if (this->isReady() && !readerCtx.propagate())\n      this->notReady = true;\n  }\n\n  virtual void alwaysAcquire(Lockable* lockable, galois::MethodFlag m) {\n    assert(m == MethodFlag::READ || m == MethodFlag::WRITE);\n\n    if (this->tryLock(lockable))\n      this->addToNhood(lockable);\n\n    if (m == MethodFlag::READ) {\n      acquireRead(lockable);\n    } else {\n      assert(m == MethodFlag::WRITE);\n      acquireWrite(lockable);\n    }\n  }\n\n  static void initialize() {}\n};\n\ntemplate <typename OptionsTy>\nclass DeterministicContextBase<OptionsTy, true, false> : public FirstPassBase {\npublic:\n  typedef DItem<OptionsTy> Item;\n  typedef galois::concurrent_gslist<DeterministicContextBase*, 8> ContextList;\n  Item item;\n  ContextList edges;\n  ContextList succs;\n  std::atomic<int> preds;\n\n  struct ContextPtrLessThan {\n    bool operator()(const DeterministicContextBase* a,\n                    const DeterministicContextBase* b) const {\n      // XXX non-deterministic behavior when we have multiple items with the\n      // same id\n      if (a->item.id == b->item.id)\n        return a < b;\n      return a->item.id < b->item.id;\n    }\n  };\n\npublic:\n  DeterministicContextBase(const Item& _item)\n      : FirstPassBase(true), item(_item), preds(0) {}\n\n  void clear() {\n    assert(preds == 0);\n    this->commitIteration();\n    // TODO replace with bulk heap\n    edges.clear(*dagListHeap);\n    succs.clear(*dagListHeap);\n  }\n\n  void addEdge(DeterministicContextBase* o) {\n    succs.push_front(*dagListHeap, o);\n    o->preds += 1;\n  }\n\n  bool isReady() { return false; }\n\n  virtual void alwaysAcquire(Lockable* lockable, galois::MethodFlag) {\n\n    // First to lock becomes representative\n    DeterministicContextBase* owner =\n        static_cast<DeterministicContextBase*>(this->getOwner(lockable));\n    while (!owner) {\n      if (this->tryLock(lockable)) {\n        
this->setOwner(lockable);\n        this->addToNhood(lockable);\n      }\n\n      owner = static_cast<DeterministicContextBase*>(this->getOwner(lockable));\n    }\n\n    if (std::find(edges.begin(), edges.end(), owner) != edges.end())\n      return;\n    edges.push_front(*dagListHeap, owner);\n  }\n\n  static void initialize() {\n    if (!dagListHeap)\n      dagListHeap = SizedHeapFactory::getHeapForSize(\n          sizeof(typename ContextList::block_type));\n  }\n};\n\ntemplate <typename OptionsTy>\nclass DeterministicContextBase<OptionsTy, true, true> {\n  // TODO implement me\n};\n\ntemplate <typename OptionsTy>\nusing DeterministicContext =\n    DeterministicContextBase<OptionsTy, OptionsTy::hasFixedNeighborhood,\n                             OptionsTy::hasIntentToRead>;\n\ntemplate <typename T>\nstruct DNewItem {\n  T val;\n  unsigned long parent;\n  unsigned count;\n\n  DNewItem(const T& _val, unsigned long _parent, unsigned _count)\n      : val(_val), parent(_parent), count(_count) {}\n\n  bool operator<(const DNewItem<T>& o) const {\n    if (parent < o.parent)\n      return true;\n    else if (parent == o.parent)\n      return count < o.count;\n    else\n      return false;\n  }\n\n  bool operator==(const DNewItem<T>& o) const {\n    return parent == o.parent && count == o.count;\n  }\n\n  bool operator!=(const DNewItem<T>& o) const { return !(*this == o); }\n\n  struct GetValue {\n    const T& operator()(const DNewItem<T>& x) const { return x.val; }\n  };\n};\n\ntemplate <typename InputIteratorTy>\nvoid safe_advance(InputIteratorTy& it, size_t d, size_t& cur, size_t dist) {\n  if (d + cur >= dist) {\n    d = dist - cur;\n  }\n  std::advance(it, d);\n  cur += d;\n}\n\n//! Wrapper around worklists::ChunkFIFO to allow peek() and empty() and still\n//! 
have FIFO order\ntemplate <int ChunkSize, typename T>\nstruct FIFO {\n  worklists::ChunkFIFO<ChunkSize, T, false> m_data;\n  worklists::ChunkLIFO<16, T, false> m_buffer;\n  size_t m_size;\n\n  FIFO() : m_size(0) {}\n\n  ~FIFO() {\n    galois::optional<T> p;\n    while ((p = m_buffer.pop()))\n      ;\n    while ((p = m_data.pop()))\n      ;\n  }\n\n  galois::optional<T> pop() {\n    galois::optional<T> p;\n    if ((p = m_buffer.pop()) || (p = m_data.pop())) {\n      --m_size;\n    }\n    return p;\n  }\n\n  galois::optional<T> peek() {\n    galois::optional<T> p;\n    if ((p = m_buffer.pop())) {\n      m_buffer.push(*p);\n    } else if ((p = m_data.pop())) {\n      m_buffer.push(*p);\n    }\n    return p;\n  }\n\n  void push(const T& val) {\n    m_data.push(val);\n    ++m_size;\n  }\n\n  size_t size() const { return m_size; }\n\n  bool empty() const { return m_size == 0; }\n};\n\ntemplate <typename T, typename FunctionTy, typename ArgsTy>\nstruct OptionsCommon {\n  typedef T value_type;\n  typedef FunctionTy function2_type;\n  typedef ArgsTy args_type;\n\n  constexpr static bool needStats = galois::internal::NeedStats<ArgsTy>::value;\n  constexpr static bool needsPush = !has_trait<no_pushes_tag, ArgsTy>();\n  constexpr static bool needsAborts =\n      !has_trait<disable_conflict_detection_tag, ArgsTy>();\n  constexpr static bool needsPia   = has_trait<per_iter_alloc_tag, ArgsTy>();\n  constexpr static bool needsBreak = has_trait<parallel_break_tag, ArgsTy>();\n\n  constexpr static bool hasBreak = has_trait<det_parallel_break_tag, ArgsTy>();\n  constexpr static bool hasId    = has_trait<det_id_tag, ArgsTy>();\n\n  constexpr static bool useLocalState = has_trait<local_state_tag, ArgsTy>();\n  constexpr static bool hasFixedNeighborhood =\n      has_trait<fixed_neighborhood_tag, ArgsTy>();\n  constexpr static bool hasIntentToRead =\n      has_trait<intent_to_read_tag, ArgsTy>();\n\n  static const int ChunkSize             = 32;\n  static const unsigned InitialNumRounds 
= 100;\n  static const size_t MinDelta           = ChunkSize * 40;\n\n  static_assert(\n      !hasFixedNeighborhood || (hasFixedNeighborhood && hasId),\n      \"Please provide id function when operator has fixed neighborhood\");\n\n  function2_type fn2;\n  args_type args;\n\n  OptionsCommon(function2_type f, ArgsTy a) : fn2(f), args(a) {}\n};\n\ntemplate <typename T, typename FunctionTy, typename ArgsTy, bool Enable>\nstruct OptionsBase : public OptionsCommon<T, FunctionTy, ArgsTy> {\n  typedef OptionsCommon<T, FunctionTy, ArgsTy> SuperTy;\n  typedef FunctionTy function1_type;\n\n  function1_type fn1;\n\n  OptionsBase(function1_type f, ArgsTy a) : SuperTy(f, a), fn1(f) {}\n};\n\ntemplate <typename T, typename FunctionTy, typename ArgsTy>\nstruct OptionsBase<T, FunctionTy, ArgsTy, true>\n    : public OptionsCommon<T, FunctionTy, ArgsTy> {\n  typedef OptionsCommon<T, FunctionTy, ArgsTy> SuperTy;\n  typedef typename get_trait_type<neighborhood_visitor_tag, ArgsTy>::type::type\n      function1_type;\n\n  function1_type fn1;\n\n  OptionsBase(FunctionTy f, ArgsTy a)\n      : SuperTy(f, a), fn1(get_trait_value<neighborhood_visitor_tag>(a).value) {\n  }\n};\n\ntemplate <typename T, typename FunctionTy, typename ArgsTy>\nusing Options = OptionsBase<T, FunctionTy, ArgsTy,\n                            has_trait<neighborhood_visitor_tag, ArgsTy>()>;\n\ntemplate <typename OptionsTy, bool Enable>\nclass DAGManagerBase {\n  typedef DeterministicContext<OptionsTy> Context;\n\npublic:\n  void destroyDAGManager() {}\n  void pushDAGTask(Context*) {}\n  bool buildDAG() { return false; }\n  template <typename Executor, typename ExecutorTLD>\n  bool executeDAG(Executor&, ExecutorTLD&) {\n    return false;\n  }\n};\n\ntemplate <typename OptionsTy>\nclass DAGManagerBase<OptionsTy, true> {\n  typedef DeterministicContext<OptionsTy> Context;\n  typedef worklists::PerSocketChunkFIFO<OptionsTy::ChunkSize * 2, Context*> WL1;\n  typedef worklists::PerThreadChunkLIFO<OptionsTy::ChunkSize * 2, 
Context*> WL2;\n  typedef worklists::PerSocketChunkFIFO<32, Context*> WL3;\n\n  struct ThreadLocalData : private boost::noncopyable {\n    typedef std::vector<Context*,\n                        typename PerIterAllocTy::rebind<Context*>::other>\n        SortBuf;\n    IterAllocBaseTy heap;\n    PerIterAllocTy alloc;\n    SortBuf sortBuf;\n    ThreadLocalData() : alloc(&heap), sortBuf(alloc) {}\n  };\n\n  substrate::PerThreadStorage<ThreadLocalData> data;\n  WL1 taskList;\n  WL2 taskList2;\n  WL3 sourceList;\n  substrate::TerminationDetection& term;\n  substrate::Barrier& barrier;\n\npublic:\n  DAGManagerBase()\n      : term(substrate::getSystemTermination(activeThreads)),\n        barrier(getBarrier(activeThreads)) {}\n\n  void destroyDAGManager() { data.getLocal()->heap.clear(); }\n\n  void pushDAGTask(Context* ctx) { taskList.push(ctx); }\n\n  bool buildDAG() {\n    ThreadLocalData& tld = *data.getLocal();\n    galois::optional<Context*> p;\n    while ((p = taskList.pop())) {\n      Context* ctx = *p;\n      tld.sortBuf.clear();\n      std::copy(ctx->edges.begin(), ctx->edges.end(),\n                std::back_inserter(tld.sortBuf));\n      std::sort(tld.sortBuf.begin(), tld.sortBuf.end(),\n                typename Context::ContextPtrLessThan());\n\n      if (!tld.sortBuf.empty()) {\n        Context* last = tld.sortBuf.front();\n        for (auto ii = tld.sortBuf.begin() + 1, ei = tld.sortBuf.end();\n             ii != ei; ++ii) {\n          Context* cur = *ii;\n          if (last != cur && cur != ctx)\n            last->addEdge(cur);\n          last = cur;\n        }\n      }\n\n      taskList2.push(ctx);\n    }\n    return true;\n  }\n\n  template <typename Executor, typename ExecutorTLD>\n  bool executeDAG(Executor& e, ExecutorTLD& etld) {\n    auto& local = e.getLocalWindowManager();\n    galois::optional<Context*> p;\n    Context* ctx;\n\n    // Go through all tasks to find intial sources and\n    while ((p = taskList2.pop())) {\n      ctx = *p;\n      if 
(ctx->preds.load(std::memory_order_relaxed) == 0)\n        sourceList.push(ctx);\n    }\n\n    term.initializeThread();\n\n    barrier.wait();\n\n    size_t oldCommitted = 0;\n    size_t committed    = 0;\n    do {\n      galois::optional<Context*> p;\n      while ((p = sourceList.pop())) {\n        ctx = *p;\n        assert(ctx->preds == 0);\n        bool commit;\n        commit = e.executeTask(etld, ctx);\n        local.incrementCommitted();\n        assert(commit);\n        committed += 1;\n        e.deallocLocalState(etld.facing);\n\n        if (OptionsTy::needsPia && !OptionsTy::useLocalState)\n          etld.facing.resetAlloc();\n\n        etld.facing.resetPushBuffer();\n\n        // enqueue successors\n        for (auto& succ : ctx->succs) {\n          int v = --succ->preds;\n          assert(v >= 0);\n          if (v == 0)\n            sourceList.push(succ);\n        }\n      }\n\n      term.localTermination(oldCommitted != committed);\n      oldCommitted = committed;\n      substrate::asmPause();\n    } while (!term.globalTermination());\n\n    if (OptionsTy::needsPia && OptionsTy::useLocalState)\n      etld.facing.resetAlloc();\n\n    setThreadContext(0);\n\n    return true;\n  }\n};\n\ntemplate <typename OptionsTy>\nusing DAGManager = DAGManagerBase<OptionsTy, OptionsTy::hasFixedNeighborhood>;\n\ntemplate <typename OptionsTy, bool Enable>\nstruct StateManagerBase {\n  typedef typename OptionsTy::value_type value_type;\n  typedef typename OptionsTy::function2_type function_type;\n  void allocLocalState(UserContextAccess<value_type>&, function_type&) {}\n  void deallocLocalState(UserContextAccess<value_type>&) {}\n  void saveLocalState(UserContextAccess<value_type>&, DItem<OptionsTy>&) {}\n  void restoreLocalState(UserContextAccess<value_type>&,\n                         const DItem<OptionsTy>&) {}\n  void reuseItem(DItem<OptionsTy>&) {}\n\n  template <typename LWL, typename GWL>\n  typename GWL::value_type* emplaceContext(LWL&, GWL& gwl,\n                 
                          const DItem<OptionsTy>& item) const {\n    return gwl.emplace(item);\n  }\n\n  template <typename LWL, typename GWL>\n  typename GWL::value_type* peekContext(LWL&, GWL& gwl) const {\n    return gwl.peek();\n  }\n\n  template <typename LWL, typename GWL>\n  void popContext(LWL&, GWL& gwl) const {\n    gwl.pop_peeked();\n  }\n};\n\ntemplate <typename OptionsTy>\nstruct StateManagerBase<OptionsTy, true> {\n  typedef typename OptionsTy::value_type value_type;\n  typedef typename OptionsTy::function2_type function_type;\n  typedef typename get_trait_type<\n      local_state_tag, typename OptionsTy::args_type>::type::type LocalState;\n\n  void allocLocalState(UserContextAccess<value_type>& c, function_type&) {\n    void* p = c.data().getPerIterAlloc().allocate(sizeof(LocalState));\n    // new (p) LocalState(self, c.data().getPerIterAlloc());\n    c.setLocalState(p);\n  }\n\n  void deallocLocalState(UserContextAccess<value_type>& c) {\n    LocalState* p = c.data().template getLocalState<LocalState>();\n    if (p)\n      p->~LocalState();\n  }\n\n  void saveLocalState(UserContextAccess<value_type>& c,\n                      DItem<OptionsTy>& item) {\n    item.setLocalState(c.data().template getLocalState<LocalState>());\n  }\n\n  void restoreLocalState(UserContextAccess<value_type>& c,\n                         const DItem<OptionsTy>& item) {\n    c.setLocalState(item.getLocalState());\n  }\n\n  template <typename LWL, typename GWL>\n  typename LWL::value_type* emplaceContext(LWL& lwl, GWL&,\n                                           const DItem<OptionsTy>& item) const {\n    return lwl.emplace(item);\n  }\n\n  template <typename LWL, typename GWL>\n  typename LWL::value_type* peekContext(LWL& lwl, GWL&) const {\n    return lwl.peek();\n  }\n\n  template <typename LWL, typename GWL>\n  void popContext(LWL& lwl, GWL&) const {\n    lwl.pop_peeked();\n  }\n\n  void reuseItem(DItem<OptionsTy>& item) { item.setLocalState(nullptr); }\n};\n\ntemplate 
<typename OptionsTy>\nusing StateManager = StateManagerBase<OptionsTy, OptionsTy::useLocalState>;\n\ntemplate <typename OptionsTy, bool Enable>\nclass BreakManagerBase {\npublic:\n  bool checkBreak() { return false; }\n  BreakManagerBase(const OptionsTy&) {}\n};\n\ntemplate <typename OptionsTy>\nclass BreakManagerBase<OptionsTy, true> {\n  typedef typename get_trait_type<det_parallel_break_tag,\n                                  typename OptionsTy::args_type>::type::type\n      BreakFn;\n  BreakFn breakFn;\n  substrate::Barrier& barrier;\n  substrate::CacheLineStorage<volatile long> done;\n\npublic:\n  BreakManagerBase(const OptionsTy& o)\n      : breakFn(get_trait_value<det_parallel_break_tag>(o.args).value),\n        barrier(getBarrier(activeThreads)) {}\n\n  bool checkBreak() {\n    if (substrate::ThreadPool::getTID() == 0)\n      done.get() = breakFn();\n    barrier.wait();\n    return done.get();\n  }\n};\n\ntemplate <typename OptionsTy>\nusing BreakManager = BreakManagerBase<OptionsTy, OptionsTy::hasBreak>;\n\ntemplate <typename OptionsTy, bool Enable>\nclass IntentToReadManagerBase {\n  typedef DeterministicContext<OptionsTy> Context;\n\npublic:\n  void pushIntentToReadTask(Context*) {}\n  bool buildIntentToRead() { return false; }\n};\n\ntemplate <typename OptionsTy>\nclass IntentToReadManagerBase<OptionsTy, true> {\n  typedef DeterministicContext<OptionsTy> Context;\n  typedef galois::gdeque<Context*> WL;\n  substrate::PerThreadStorage<WL> pending;\n  substrate::Barrier& barrier;\n\npublic:\n  IntentToReadManagerBase() : barrier(getBarrier(activeThreads)) {}\n\n  void pushIntentToReadTask(Context* ctx) {\n    pending.getLocal()->push_back(ctx);\n  }\n\n  // NB(ddn): Need to gather information from dependees before commitLoop\n  // otherwise some contexts will be deallocated before we have time to check\n  bool buildIntentToRead() {\n    for (Context* ctx : *pending.getLocal())\n      ctx->build();\n    barrier.wait();\n    for (Context* ctx : 
*pending.getLocal())\n      ctx->propagate();\n    pending.getLocal()->clear();\n    return true;\n  }\n};\n\ntemplate <typename OptionsTy>\nusing IntentToReadManager =\n    IntentToReadManagerBase<OptionsTy, OptionsTy::hasIntentToRead>;\n\ntemplate <typename OptionsTy, bool Enable>\nclass WindowManagerBase {\npublic:\n  class ThreadLocalData {\n    template <typename, bool>\n    friend class WindowManagerBase;\n    size_t window;\n    size_t delta;\n    size_t committed;\n    size_t iterations;\n\n  public:\n    size_t nextWindow(bool first = false) {\n      if (first)\n        window = delta;\n      else\n        window += delta;\n      committed = iterations = 0;\n      return window;\n    }\n\n    void incrementIterations() { ++iterations; }\n    void incrementCommitted() { ++committed; }\n  };\n\nprivate:\n  substrate::PerThreadStorage<ThreadLocalData> data;\n  unsigned numActive;\n\npublic:\n  WindowManagerBase() { numActive = getActiveThreads(); }\n\n  ThreadLocalData& getLocalWindowManager() { return *data.getLocal(); }\n\n  size_t nextWindow(size_t dist, size_t atleast, size_t base = 0) {\n    if (false) {\n      // This, which tries to continue delta with new work, seems to result in\n      // more conflicts (although less total rounds) and more time\n      ThreadLocalData& local = *data.getLocal();\n      return local.nextWindow(true);\n    } else {\n      return initialWindow(dist, atleast, base);\n    }\n  }\n\n  size_t initialWindow(size_t dist, size_t atleast, size_t base = 0) {\n    ThreadLocalData& local = *data.getLocal();\n    size_t w     = std::max(dist / OptionsTy::InitialNumRounds, atleast) + base;\n    local.window = local.delta = w;\n    return w;\n  }\n\n  void calculateWindow(bool inner) {\n    ThreadLocalData& local = *data.getLocal();\n\n    // Accumulate all threads' info\n    size_t allcommitted  = 0;\n    size_t alliterations = 0;\n    for (unsigned i = 0; i < numActive; ++i) {\n      ThreadLocalData& r = *data.getRemote(i);\n      
allcommitted += r.committed;\n      alliterations += r.iterations;\n    }\n\n    float commitRatio =\n        alliterations > 0 ? allcommitted / (float)alliterations : 0.0;\n    const float target = 0.95;\n\n    if (commitRatio >= target)\n      local.delta += local.delta;\n    else if (allcommitted == 0) {\n      assert((alliterations == 0) && \"someone should have committed\");\n      local.delta += local.delta;\n    } else\n      local.delta = commitRatio / target * local.delta;\n\n    if (!inner) {\n      if (local.delta < OptionsTy::MinDelta)\n        local.delta = OptionsTy::MinDelta;\n    } else if (local.delta < OptionsTy::MinDelta) {\n      // Try to get some new work instead of increasing window\n      local.delta = 0;\n    }\n\n    // Useful debugging info\n    if (false) {\n      if (substrate::ThreadPool::getTID() == 0) {\n        char buf[1024];\n        snprintf(buf, 1024, \"%d %.3f (%zu/%zu) window: %zu delta: %zu\\n\", inner,\n                 commitRatio, allcommitted, alliterations, local.window,\n                 local.delta);\n        gPrint(buf);\n      }\n    }\n  }\n};\n\ntemplate <typename OptionsTy>\nclass WindowManagerBase<OptionsTy, true> {\npublic:\n  class ThreadLocalData {\n  public:\n    size_t nextWindow() { return std::numeric_limits<size_t>::max(); }\n\n    void incrementIterations() {}\n    void incrementCommitted() {}\n  };\n\nprivate:\n  ThreadLocalData data;\n\npublic:\n  ThreadLocalData& getLocalWindowManager() { return data; }\n\n  size_t nextWindow(size_t, size_t, size_t = 0) { return data.nextWindow(); }\n\n  size_t initialWindow(size_t, size_t, size_t = 0) {\n    return std::numeric_limits<size_t>::max();\n  }\n\n  void calculateWindow(bool) {}\n};\n\ntemplate <typename OptionsTy>\nusing WindowManager =\n    WindowManagerBase<OptionsTy, OptionsTy::hasFixedNeighborhood>;\n\ntemplate <typename OptionsTy, bool Enable>\nstruct IdManagerBase {\n  typedef typename OptionsTy::value_type value_type;\n  IdManagerBase(const 
OptionsTy&) {}\n  uintptr_t id(const value_type&) { return 0; }\n};\n\ntemplate <typename OptionsTy>\nclass IdManagerBase<OptionsTy, true> {\n  typedef typename OptionsTy::value_type value_type;\n  typedef\n      typename get_trait_type<det_id_tag,\n                              typename OptionsTy::args_type>::type::type IdFn;\n  IdFn idFn;\n\npublic:\n  IdManagerBase(const OptionsTy& o)\n      : idFn(get_trait_value<det_id_tag>(o.args).value) {}\n  uintptr_t id(const value_type& x) { return idFn(x); }\n};\n\ntemplate <typename OptionsTy>\nusing IdManager = IdManagerBase<OptionsTy, OptionsTy::hasId>;\n\ntemplate <typename OptionsTy>\nclass NewWorkManager : public IdManager<OptionsTy> {\n  typedef typename OptionsTy::value_type value_type;\n  typedef DItem<OptionsTy> Item;\n  typedef DNewItem<value_type> NewItem;\n  typedef std::vector<NewItem, typename PerIterAllocTy::rebind<NewItem>::other>\n      NewItemsTy;\n  typedef typename NewItemsTy::iterator NewItemsIterator;\n  typedef FIFO<1024, Item> ReserveTy;\n  typedef worklists::PerSocketChunkFIFO<OptionsTy::ChunkSize, NewItem> NewWork;\n\n  struct GetNewItem {\n    NewWorkManager* self;\n    GetNewItem(NewWorkManager* s = 0) : self(s) {}\n    NewItemsTy& operator()(int i) const {\n      return self->data.getRemote(i)->newItems;\n    }\n  };\n\n  typedef boost::transform_iterator<GetNewItem, boost::counting_iterator<int>>\n      MergeOuterIt;\n  typedef std::vector<NewItem, typename PerIterAllocTy::rebind<NewItem>::other>\n      MergeBuf;\n  typedef std::vector<value_type,\n                      typename PerIterAllocTy::rebind<value_type>::other>\n      DistributeBuf;\n\n  struct ThreadLocalData {\n    IterAllocBaseTy heap;\n    PerIterAllocTy alloc;\n    NewItemsTy newItems;\n    ReserveTy reserve;\n    size_t minId;\n    size_t maxId;\n    size_t size;\n\n    ThreadLocalData() : alloc(&heap), newItems(alloc) {}\n  };\n\n  IterAllocBaseTy heap;\n  PerIterAllocTy alloc;\n  
substrate::PerThreadStorage<ThreadLocalData> data;\n  NewWork new_;\n  MergeBuf mergeBuf;\n  DistributeBuf distributeBuf;\n  substrate::Barrier& barrier;\n  unsigned numActive;\n\n  bool merge(int begin, int end) {\n    if (begin == end)\n      return false;\n    else if (begin + 1 == end)\n      return !data.getRemote(begin)->newItems.empty();\n\n    bool retval = false;\n    int mid     = (end - begin) / 2 + begin;\n    retval |= merge(begin, mid);\n    retval |= merge(mid, end);\n\n    GetNewItem fn(this);\n\n    MergeOuterIt bbegin(boost::make_counting_iterator(begin), fn);\n    MergeOuterIt mmid(boost::make_counting_iterator(mid), fn);\n    MergeOuterIt eend(boost::make_counting_iterator(end), fn);\n    auto aa = make_two_level_iterator<std::forward_iterator_tag, MergeOuterIt,\n                                      typename NewItemsTy::iterator, GetBegin,\n                                      GetEnd>(bbegin, mmid);\n    auto bb = make_two_level_iterator<std::forward_iterator_tag, MergeOuterIt,\n                                      typename NewItemsTy::iterator, GetBegin,\n                                      GetEnd>(mmid, eend);\n    auto cc = make_two_level_iterator<std::forward_iterator_tag, MergeOuterIt,\n                                      typename NewItemsTy::iterator, GetBegin,\n                                      GetEnd>(bbegin, eend);\n\n    while (aa.first != aa.second && bb.first != bb.second) {\n      if (*aa.first < *bb.first)\n        mergeBuf.push_back(*aa.first++);\n      else\n        mergeBuf.push_back(*bb.first++);\n    }\n\n    for (; aa.first != aa.second; ++aa.first)\n      mergeBuf.push_back(*aa.first);\n\n    for (; bb.first != bb.second; ++bb.first)\n      mergeBuf.push_back(*bb.first);\n\n    for (NewItemsIterator ii = mergeBuf.begin(), ei = mergeBuf.end(); ii != ei;\n         ++ii)\n      *cc.first++ = *ii;\n\n    mergeBuf.clear();\n\n    assert(cc.first == cc.second);\n\n    return retval;\n  }\n\n  /**\n   * Slightly 
complicated reindexing to separate out continuous elements in\n   * InputIterator. <pre> Example:\n   *\n   * blocksize: 2\n   * pos:  0 1 2 3 4 5\n   * item: A B C D E F\n   * new:  A D B E C F\n   * </pre>\n   */\n  template <typename InputIteratorTy>\n  void redistribute(InputIteratorTy ii, InputIteratorTy ei, size_t dist,\n                    size_t window, unsigned tid) {\n    // ThreadLocalData& local = *data.getLocal();\n    size_t blockSize = window;\n    size_t numBlocks = dist / blockSize;\n\n    size_t cur = 0;\n    safe_advance(ii, tid, cur, dist);\n    while (ii != ei) {\n      unsigned long id;\n      if (cur < blockSize * numBlocks)\n        id = (cur % numBlocks) * blockSize + (cur / numBlocks);\n      else\n        id = cur;\n      distributeBuf[id] = *ii;\n      safe_advance(ii, numActive, cur, dist);\n    }\n  }\n\n  template <typename InputIteratorTy, typename WL>\n  void copyMine(InputIteratorTy ii, InputIteratorTy ei, size_t dist, WL* wl,\n                size_t window, unsigned tid) {\n    ThreadLocalData& local = *data.getLocal();\n    size_t cur             = 0;\n    size_t k               = 0;\n    safe_advance(ii, tid, cur, dist);\n    while (ii != ei) {\n      unsigned long id = k * numActive + tid;\n      if (id < window)\n        wl->push(Item(*ii, id));\n      else\n        break;\n      ++k;\n      safe_advance(ii, numActive, cur, dist);\n    }\n\n    while (ii != ei) {\n      unsigned long id = k * numActive + tid;\n      local.reserve.push(Item(*ii, id));\n      ++k;\n      safe_advance(ii, numActive, cur, dist);\n    }\n  }\n\n  template <typename InputIteratorTy, typename WL>\n  void copyAllWithIds(InputIteratorTy ii, InputIteratorTy ei, WL* wl,\n                      size_t window) {\n    ThreadLocalData& local = *data.getLocal();\n    for (; ii != ei; ++ii) {\n      unsigned long id = ii->parent;\n      if (id < window)\n        wl->push(Item(ii->val, id));\n      else\n        break;\n    }\n\n    for (; ii != ei; ++ii) {\n    
  unsigned long id = ii->parent;\n      local.reserve.push(Item(ii->val, id));\n    }\n  }\n\n  template <typename InputIteratorTy, typename WL>\n  void copyMineAfterRedistribute(InputIteratorTy ii, InputIteratorTy ei,\n                                 size_t dist, WL* wl, size_t window,\n                                 unsigned tid) {\n    if (tid == 0) {\n      distributeBuf.resize(dist);\n    }\n    barrier.wait();\n    redistribute(ii, ei, dist, window, tid);\n    barrier.wait();\n    copyMine(distributeBuf.begin(), distributeBuf.end(), dist, wl, window, tid);\n  }\n\n  template <typename WL>\n  void parallelSort(WindowManager<OptionsTy>& wm, WL* wl, unsigned tid) {\n    ThreadLocalData& local = *data.getLocal();\n\n    local.newItems.clear();\n    galois::optional<NewItem> p;\n    while ((p = this->new_.pop())) {\n      local.newItems.push_back(*p);\n    }\n\n    NewItemsIterator ii = local.newItems.begin();\n    NewItemsIterator ei = local.newItems.end();\n    std::sort(ii, ei);\n    initialLimits(ii, ei);\n    local.size = local.newItems.size();\n\n    barrier.wait();\n\n    if (tid == 0) {\n      receiveLimits(local);\n      broadcastLimits(local);\n      if (!OptionsTy::hasId) {\n        mergeBuf.reserve(local.size);\n        merge(0, numActive);\n      }\n    }\n\n    barrier.wait();\n\n    if (OptionsTy::hasId) {\n      size_t window = wm.nextWindow(local.maxId - local.minId,\n                                    OptionsTy::MinDelta, local.minId);\n      copyAllWithIds(ii, ei, wl, window);\n    } else {\n      GetNewItem fn(this);\n      MergeOuterIt bbegin(boost::make_counting_iterator(0), fn);\n      MergeOuterIt eend(boost::make_counting_iterator((int)numActive), fn);\n      auto ii = make_two_level_iterator<std::forward_iterator_tag, MergeOuterIt,\n                                        typename NewItemsTy::iterator, GetBegin,\n                                        GetEnd>(bbegin, eend);\n\n      size_t window = wm.nextWindow(local.size, 
OptionsTy::MinDelta);\n      copyMineAfterRedistribute(boost::make_transform_iterator(\n                                    ii.first, typename NewItem::GetValue()),\n                                boost::make_transform_iterator(\n                                    ii.second, typename NewItem::GetValue()),\n                                local.size, wl, window, tid);\n    }\n  }\n\n  void broadcastLimits(ThreadLocalData& local) {\n    for (unsigned i = 1; i < numActive; ++i) {\n      ThreadLocalData& other = *data.getRemote(i);\n      other.minId            = local.minId;\n      other.maxId            = local.maxId;\n      other.size             = local.size;\n    }\n  }\n\n  void receiveLimits(ThreadLocalData& local) {\n    for (unsigned i = 1; i < numActive; ++i) {\n      ThreadLocalData& other = *data.getRemote(i);\n      local.minId            = std::min(other.minId, local.minId);\n      local.maxId            = std::max(other.maxId, local.maxId);\n      local.size += other.size;\n    }\n  }\n\n  //! Update min and max from sorted iterator\n  template <typename BiIteratorTy>\n  void initialLimits(BiIteratorTy ii, BiIteratorTy ei) {\n    ThreadLocalData& local = *data.getLocal();\n\n    local.minId = std::numeric_limits<size_t>::max();\n    local.maxId = std::numeric_limits<size_t>::min();\n    local.size  = std::distance(ii, ei);\n\n    if (ii != ei) {\n      if (ii + 1 == ei) {\n        local.minId = local.maxId = ii->parent;\n      } else {\n        local.minId = ii->parent;\n        local.maxId = (ei - 1)->parent;\n      }\n    }\n  }\n\n  template <typename InputIteratorTy>\n  void sortInitialWorkDispatch(InputIteratorTy, InputIteratorTy, ...) 
{}\n\n  template <typename InputIteratorTy, bool HasId = OptionsTy::hasId,\n            bool HasFixed = OptionsTy::hasFixedNeighborhood>\n  auto sortInitialWorkDispatch(InputIteratorTy ii, InputIteratorTy ei, int) ->\n      typename std::enable_if<HasId && !HasFixed, void>::type {\n    ThreadLocalData& local = *data.getLocal();\n    size_t dist            = std::distance(ii, ei);\n\n    mergeBuf.reserve(dist);\n    for (; ii != ei; ++ii)\n      mergeBuf.emplace_back(*ii, this->id(*ii), 1);\n\n    ParallelSTL::sort(mergeBuf.begin(), mergeBuf.end());\n\n    initialLimits(mergeBuf.begin(), mergeBuf.end());\n    broadcastLimits(local);\n  }\n\npublic:\n  NewWorkManager(const OptionsTy& o)\n      : IdManager<OptionsTy>(o), alloc(&heap), mergeBuf(alloc),\n        distributeBuf(alloc), barrier(getBarrier(activeThreads)) {\n    numActive = getActiveThreads();\n  }\n\n  bool emptyReserve() { return data.getLocal()->reserve.empty(); }\n\n  template <typename WL>\n  void pushNextWindow(WL* wl, size_t window) {\n    ThreadLocalData& local = *data.getLocal();\n    galois::optional<Item> p;\n    while ((p = local.reserve.peek())) {\n      if (p->id >= window)\n        break;\n      wl->push(*p);\n      local.reserve.pop();\n    }\n  }\n\n  void clearNewWork() { data.getLocal()->heap.clear(); }\n\n  template <typename InputIteratorTy>\n  void sortInitialWork(InputIteratorTy ii, InputIteratorTy ei) {\n    return sortInitialWorkDispatch(ii, ei, 0);\n  }\n\n  template <typename InputIteratorTy, typename WL>\n  void addInitialWork(WindowManager<OptionsTy>& wm, InputIteratorTy b,\n                      InputIteratorTy e, WL* wl) {\n    size_t dist = std::distance(b, e);\n    if (OptionsTy::hasId) {\n      ThreadLocalData& local = *data.getLocal();\n      size_t window = wm.initialWindow(dist, OptionsTy::MinDelta, local.minId);\n      if (OptionsTy::hasFixedNeighborhood) {\n        copyMine(b, e, dist, wl, window, substrate::ThreadPool::getTID());\n      } else {\n        
copyMine(boost::make_transform_iterator(mergeBuf.begin(),\n                                                typename NewItem::GetValue()),\n                 boost::make_transform_iterator(mergeBuf.end(),\n                                                typename NewItem::GetValue()),\n                 mergeBuf.size(), wl, window, substrate::ThreadPool::getTID());\n      }\n    } else {\n      size_t window = wm.initialWindow(dist, OptionsTy::MinDelta);\n      copyMineAfterRedistribute(b, e, dist, wl, window,\n                                substrate::ThreadPool::getTID());\n    }\n  }\n\n  template <bool HasId = OptionsTy::hasId>\n  auto pushNew(const value_type& val, unsigned long, unsigned) ->\n      typename std::enable_if<HasId, void>::type {\n    new_.push(NewItem(val, this->id(val), 1));\n  }\n\n  template <bool HasId = OptionsTy::hasId>\n  auto pushNew(const value_type& val, unsigned long parent, unsigned count) ->\n      typename std::enable_if<!HasId, void>::type {\n    new_.push(NewItem(val, parent, count));\n  }\n\n  template <typename WL>\n  void distributeNewWork(WindowManager<OptionsTy>& wm, WL* wl) {\n    parallelSort(wm, wl, substrate::ThreadPool::getTID());\n  }\n};\n\ntemplate <typename OptionsTy>\nclass Executor : public BreakManager<OptionsTy>,\n                 public StateManager<OptionsTy>,\n                 public NewWorkManager<OptionsTy>,\n                 public WindowManager<OptionsTy>,\n                 public DAGManager<OptionsTy>,\n                 public IntentToReadManager<OptionsTy> {\n  typedef typename OptionsTy::value_type value_type;\n  typedef DItem<OptionsTy> Item;\n  typedef DeterministicContext<OptionsTy> Context;\n\n  typedef worklists::PerSocketChunkFIFO<OptionsTy::ChunkSize, Item> WL;\n  typedef worklists::PerSocketChunkFIFO<OptionsTy::ChunkSize, Context>\n      PendingWork;\n  typedef worklists::ChunkFIFO<OptionsTy::ChunkSize, Context, false>\n      LocalPendingWork;\n\n  // Truly thread-local\n  using LoopStat = 
LoopStatistics<OptionsTy::needStats>;\n  struct ThreadLocalData : public LoopStat {\n\n    typename OptionsTy::function1_type fn1;\n    typename OptionsTy::function2_type fn2;\n    LocalPendingWork localPending;\n    UserContextAccess<value_type> facing;\n\n    WL* wlcur;\n    WL* wlnext;\n    size_t rounds;\n    size_t outerRounds;\n    bool hasNewWork;\n    ThreadLocalData(const OptionsTy& o, const char* loopname)\n        : LoopStat(loopname), fn1(o.fn1), fn2(o.fn2), rounds(0),\n          outerRounds(0) {}\n  };\n\n  OptionsTy options;\n  substrate::Barrier& barrier;\n  WL worklists[2];\n  PendingWork pending;\n  const char* loopname;\n  substrate::CacheLineStorage<volatile long> innerDone;\n  substrate::CacheLineStorage<volatile long> outerDone;\n  substrate::CacheLineStorage<volatile long> hasNewWork;\n\n  int runFunction(ThreadLocalData& tld, Context* ctx);\n\n  bool pendingLoop(ThreadLocalData& tld);\n  bool commitLoop(ThreadLocalData& tld);\n  void go();\n\n  void drainPending(ThreadLocalData& tld) {\n    Context* ctx;\n    while ((ctx = this->peekContext(tld.localPending, pending))) {\n      ctx->clear();\n      this->popContext(tld.localPending, pending);\n    }\n  }\n\npublic:\n  Executor(const OptionsTy& o)\n      : BreakManager<OptionsTy>(o), NewWorkManager<OptionsTy>(o), options(o),\n        barrier(getBarrier(activeThreads)),\n        loopname(galois::internal::getLoopName(o.args)) {\n    static_assert(!OptionsTy::needsBreak || OptionsTy::hasBreak,\n                  \"need to use break function to break loop\");\n  }\n\n  bool executeTask(ThreadLocalData& tld, Context* ctx);\n\n  template <typename RangeTy>\n  void initThread(const RangeTy& range) {\n    Context::initialize();\n    this->addInitialWork(*this, range.begin(), range.end(), &worklists[1]);\n  }\n\n  template <typename RangeTy>\n  void init(const RangeTy& range) {\n    this->sortInitialWork(range.begin(), range.end());\n  }\n\n  void operator()() { go(); }\n};\n\ntemplate <typename 
OptionsTy>\nvoid Executor<OptionsTy>::go() {\n  ThreadLocalData tld(options, loopname);\n  auto& local = this->getLocalWindowManager();\n  tld.wlcur   = &worklists[0];\n  tld.wlnext  = &worklists[1];\n\n  tld.hasNewWork = false;\n\n  while (true) {\n    ++tld.outerRounds;\n\n    while (true) {\n      ++tld.rounds;\n\n      std::swap(tld.wlcur, tld.wlnext);\n      bool nextPending = pendingLoop(tld);\n      innerDone.get()  = true;\n\n      barrier.wait();\n\n      if (this->buildDAG())\n        barrier.wait();\n\n      if (this->buildIntentToRead())\n        barrier.wait();\n\n      bool nextCommit = false;\n      outerDone.get() = true;\n\n      if (this->executeDAG(*this, tld)) {\n        if (OptionsTy::needsBreak)\n          barrier.wait();\n        drainPending(tld);\n        break;\n      }\n\n      nextCommit = commitLoop(tld);\n\n      if (nextPending || nextCommit)\n        innerDone.get() = false;\n\n      barrier.wait();\n\n      if (innerDone.get())\n        break;\n\n      this->calculateWindow(true);\n\n      barrier.wait();\n\n      this->pushNextWindow(tld.wlnext, local.nextWindow());\n    }\n\n    if (!this->emptyReserve())\n      outerDone.get() = false;\n\n    if (tld.hasNewWork)\n      hasNewWork.get() = true;\n\n    if (this->checkBreak())\n      break;\n\n    barrier.wait();\n\n    if (outerDone.get()) {\n      if (!OptionsTy::needsPush)\n        break;\n      if (!hasNewWork.get()) // (1)\n        break;\n      this->distributeNewWork(*this, tld.wlnext);\n      tld.hasNewWork = false;\n      // NB: assumes that distributeNewWork has a barrier otherwise checking at\n      // (1) is erroneous\n      hasNewWork.get() = false;\n    } else {\n      this->calculateWindow(false);\n\n      this->pushNextWindow(tld.wlnext, local.nextWindow());\n    }\n  }\n\n  this->destroyDAGManager();\n  this->clearNewWork();\n\n  if (OptionsTy::needStats) {\n    if (substrate::ThreadPool::getTID() == 0) {\n      reportStat_Single(loopname, \"RoundsExecuted\", 
tld.rounds);\n      reportStat_Single(loopname, \"OuterRoundsExecuted\", tld.outerRounds);\n    }\n  }\n}\n\ntemplate <typename OptionsTy>\nint Executor<OptionsTy>::runFunction(ThreadLocalData& tld, Context* ctx) {\n  int result = 0;\n#ifdef GALOIS_USE_LONGJMP_ABORT\n  if ((result = setjmp(execFrame)) == 0) {\n#elif defined(GALOIS_USE_EXCEPTION_ABORT)\n  try {\n#endif\n    tld.fn1(ctx->item.val, tld.facing.data());\n#ifdef GALOIS_USE_LONGJMP_ABORT\n  } else {\n    clearConflictLock();\n  }\n#elif defined(GALOIS_USE_EXCEPTION_ABORT)\n  } catch (const ConflictFlag& flag) {\n    clearConflictLock();\n    result = flag;\n  }\n#endif\n  return result;\n}\n\ntemplate <typename OptionsTy>\nbool Executor<OptionsTy>::pendingLoop(ThreadLocalData& tld) {\n  auto& local = this->getLocalWindowManager();\n  bool retval = false;\n  galois::optional<Item> p;\n  while ((p = tld.wlcur->pop())) {\n    // Use a new context for each item because there is a race when reusing\n    // between aborted iterations.\n    Context* ctx = this->emplaceContext(tld.localPending, pending, *p);\n    this->pushDAGTask(ctx);\n    local.incrementIterations();\n    bool commit = true;\n\n    ctx->startIteration();\n    ctx->setFirstPass();\n    tld.inc_iterations();\n    tld.facing.setFirstPass();\n    setThreadContext(ctx);\n\n    this->allocLocalState(tld.facing, tld.fn2);\n    int result = runFunction(tld, ctx);\n    // FIXME:    clearReleasable();\n    tld.facing.resetFirstPass();\n    ctx->resetFirstPass();\n    switch (result) {\n    case 0:\n    case REACHED_FAILSAFE:\n      break;\n    case CONFLICT:\n      commit = false;\n      break;\n    default:\n      abort();\n      break;\n    }\n\n    // TODO only needed if fn1 needs pia\n    if (OptionsTy::needsPia && !OptionsTy::useLocalState)\n      tld.facing.resetAlloc();\n\n    if (commit || OptionsTy::hasFixedNeighborhood) {\n      this->saveLocalState(tld.facing, ctx->item);\n    } else {\n      retval = true;\n    }\n  }\n\n  return 
retval;\n}\n\ntemplate <typename OptionsTy>\nbool Executor<OptionsTy>::executeTask(ThreadLocalData& tld, Context* ctx) {\n  setThreadContext(ctx);\n  this->restoreLocalState(tld.facing, ctx->item);\n  tld.facing.resetFirstPass();\n  ctx->resetFirstPass();\n  int result = 0;\n#ifdef GALOIS_USE_LONGJMP_ABORT\n  if ((result = setjmp(execFrame)) == 0) {\n#elif defined(GALOIS_USE_EXCEPTION_ABORT)\n  try {\n#endif\n    tld.fn2(ctx->item.val, tld.facing.data());\n#ifdef GALOIS_USE_LONGJMP_ABORT\n  } else {\n    clearConflictLock();\n  }\n#elif defined(GALOIS_USE_EXCEPTION_ABORT)\n  } catch (const ConflictFlag& flag) {\n    clearConflictLock();\n    result = flag;\n  }\n#endif\n  // FIXME: clearReleasable();\n  switch (result) {\n  case 0:\n    break;\n  case CONFLICT:\n    return false;\n    break;\n  default:\n    GALOIS_DIE(\"unknown conflict flag\");\n    break;\n  }\n\n  if (OptionsTy::needsPush) {\n    unsigned long parent = ctx->item.id;\n    //    typedef typename UserContextAccess<value_type>::PushBufferTy::iterator\n    //    iterator;\n    unsigned count = 0;\n    for (auto& item : tld.facing.getPushBuffer()) {\n      this->pushNew(item, parent, ++count);\n      if (count == 0) {\n        GALOIS_DIE(\"counter overflow\");\n      }\n    }\n    if (count)\n      tld.hasNewWork = true;\n  }\n  assert(OptionsTy::needsPush || tld.facing.getPushBuffer().begin() ==\n                                     tld.facing.getPushBuffer().end());\n\n  return true;\n}\n\ntemplate <typename OptionsTy>\nbool Executor<OptionsTy>::commitLoop(ThreadLocalData& tld) {\n  bool retval = false;\n  auto& local = this->getLocalWindowManager();\n\n  Context* ctx;\n  while ((ctx = this->peekContext(tld.localPending, pending))) {\n    bool commit = false;\n    if (ctx->isReady())\n      commit = executeTask(tld, ctx);\n\n    if (commit) {\n      ctx->commitIteration();\n      local.incrementCommitted();\n    } else {\n      this->reuseItem(ctx->item);\n      tld.wlnext->push(ctx->item);\n      
tld.inc_conflicts();\n      retval = true;\n      ctx->cancelIteration();\n    }\n\n    this->deallocLocalState(tld.facing);\n\n    if (OptionsTy::needsPia && !OptionsTy::useLocalState)\n      tld.facing.resetAlloc();\n\n    tld.facing.resetPushBuffer();\n    ctx->clear();\n    this->popContext(tld.localPending, pending);\n  }\n\n  if (OptionsTy::needsPia && OptionsTy::useLocalState)\n    tld.facing.resetAlloc();\n\n  setThreadContext(0);\n\n  return retval;\n}\n\n} // namespace internal\n} // namespace runtime\n\nnamespace worklists {\n\n/**\n * Deterministic execution. Operator should be cautious.\n */\ntemplate <typename T = int>\nstruct Deterministic {\n  template <bool _concurrent>\n  using rethread = Deterministic<T>;\n\n  template <typename _T>\n  using retype = Deterministic<_T>;\n\n  typedef T value_type;\n};\n\n} // namespace worklists\n\nnamespace runtime {\n\ntemplate <class T, class FunctionTy, class ArgsTy>\nstruct ForEachExecutor<worklists::Deterministic<T>, FunctionTy, ArgsTy>\n    : public internal::Executor<internal::Options<T, FunctionTy, ArgsTy>> {\n  typedef internal::Options<T, FunctionTy, ArgsTy> OptionsTy;\n  typedef internal::Executor<OptionsTy> SuperTy;\n  ForEachExecutor(FunctionTy f, const ArgsTy& args)\n      : SuperTy(OptionsTy(f, args)) {}\n};\n\n} // namespace runtime\n\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Executor_DoAll.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_EXECUTOR_DOALL_H\n#define GALOIS_RUNTIME_EXECUTOR_DOALL_H\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/runtime/Executor_OnEach.h\"\n#include \"galois/runtime/OperatorReferenceTypes.h\"\n#include \"galois/runtime/Statistics.h\"\n#include \"galois/substrate/Barrier.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n#include \"galois/substrate/PaddedLock.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/Termination.h\"\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/Timer.h\"\n\nnamespace galois::runtime {\n\nnamespace internal {\n\ntemplate <typename R, typename F, typename ArgsTuple>\nclass DoAllStealingExec {\n\n  typedef typename R::local_iterator Iter;\n  typedef typename 
std::iterator_traits<Iter>::difference_type Diff_ty;\n\n  enum StealAmt { HALF, FULL };\n\n  constexpr static const bool NEED_STATS =\n      galois::internal::NeedStats<ArgsTuple>::value;\n  constexpr static const bool MORE_STATS =\n      NEED_STATS && has_trait<more_stats_tag, ArgsTuple>();\n  constexpr static const bool USE_TERM = false;\n\n  struct ThreadContext {\n\n    alignas(substrate::GALOIS_CACHE_LINE_SIZE) substrate::SimpleLock work_mutex;\n    unsigned id;\n\n    Iter shared_beg;\n    Iter shared_end;\n    Diff_ty m_size;\n    size_t num_iter;\n\n    // Stats\n\n    ThreadContext()\n        : work_mutex(), id(substrate::getThreadPool().getMaxThreads()),\n          shared_beg(), shared_end(), m_size(0), num_iter(0) {\n      // TODO: fix this initialization problem,\n      // see initThread\n    }\n\n    ThreadContext(unsigned id, Iter beg, Iter end)\n        : work_mutex(), id(id), shared_beg(beg), shared_end(end),\n          m_size(std::distance(beg, end)), num_iter(0) {}\n\n    bool doWork(F func, const unsigned chunk_size) {\n      Iter beg(shared_beg);\n      Iter end(shared_end);\n\n      bool didwork = false;\n\n      while (getWork(beg, end, chunk_size)) {\n\n        didwork = true;\n\n        for (; beg != end; ++beg) {\n          if (NEED_STATS) {\n            ++num_iter;\n          }\n          func(*beg);\n        }\n      }\n\n      return didwork;\n    }\n\n    bool hasWorkWeak() const { return (m_size > 0); }\n\n    bool hasWork() const {\n      bool ret = false;\n\n      work_mutex.lock();\n      {\n        ret = hasWorkWeak();\n\n        if (m_size > 0) {\n          assert(shared_beg != shared_end);\n        }\n      }\n      work_mutex.unlock();\n\n      return ret;\n    }\n\n  private:\n    bool getWork(Iter& priv_beg, Iter& priv_end, const unsigned chunk_size) {\n      bool succ = false;\n\n      work_mutex.lock();\n      {\n        if (hasWorkWeak()) {\n          succ = true;\n\n          Iter nbeg = shared_beg;\n          if (m_size 
<= chunk_size) {\n            nbeg   = shared_end;\n            m_size = 0;\n\n          } else {\n            std::advance(nbeg, chunk_size);\n            m_size -= chunk_size;\n            assert(m_size > 0);\n          }\n\n          priv_beg   = shared_beg;\n          priv_end   = nbeg;\n          shared_beg = nbeg;\n        }\n      }\n      work_mutex.unlock();\n\n      return succ;\n    }\n\n    void steal_from_end_impl(Iter& steal_beg, Iter& steal_end, const Diff_ty sz,\n                             std::forward_iterator_tag) {\n\n      // steal from front for forward_iterator_tag\n      steal_beg = shared_beg;\n      std::advance(shared_beg, sz);\n      steal_end = shared_beg;\n    }\n\n    void steal_from_end_impl(Iter& steal_beg, Iter& steal_end, const Diff_ty sz,\n                             std::bidirectional_iterator_tag) {\n\n      steal_end = shared_end;\n      std::advance(shared_end, -sz);\n      steal_beg = shared_end;\n    }\n\n    void steal_from_end(Iter& steal_beg, Iter& steal_end, const Diff_ty sz) {\n      assert(sz > 0);\n      steal_from_end_impl(\n          steal_beg, steal_end, sz,\n          typename std::iterator_traits<Iter>::iterator_category());\n    }\n\n    void steal_from_beg(Iter& steal_beg, Iter& steal_end, const Diff_ty sz) {\n      assert(sz > 0);\n      steal_beg = shared_beg;\n      std::advance(shared_beg, sz);\n      steal_end = shared_beg;\n    }\n\n  public:\n    bool stealWork(Iter& steal_beg, Iter& steal_end, Diff_ty& steal_size,\n                   StealAmt amount, size_t chunk_size) {\n      bool succ = false;\n\n      if (work_mutex.try_lock()) {\n\n        if (hasWorkWeak()) {\n          succ = true;\n\n          if (amount == HALF && m_size > (Diff_ty)chunk_size) {\n            steal_size = m_size / 2;\n          } else {\n            steal_size = m_size;\n          }\n\n          if (m_size <= steal_size) {\n            steal_beg = shared_beg;\n            steal_end = shared_end;\n\n            shared_beg = 
shared_end;\n\n            steal_size = m_size;\n            m_size     = 0;\n\n          } else {\n\n            // steal_from_end (steal_beg, steal_end, steal_size);\n            steal_from_beg(steal_beg, steal_end, steal_size);\n            m_size -= steal_size;\n          }\n        }\n\n        work_mutex.unlock();\n      }\n\n      return succ;\n    }\n\n    void assignWork(const Iter& beg, const Iter& end, const Diff_ty sz) {\n      work_mutex.lock();\n      {\n        assert(!hasWorkWeak());\n        assert(beg != end);\n        assert(std::distance(beg, end) == sz);\n\n        shared_beg = beg;\n        shared_end = end;\n        m_size     = sz;\n      }\n      work_mutex.unlock();\n    }\n  };\n\nprivate:\n  GALOIS_ATTRIBUTE_NOINLINE bool\n  transferWork(ThreadContext& rich, ThreadContext& poor, StealAmt amount) {\n\n    assert(rich.id != poor.id);\n    assert(rich.id < galois::getActiveThreads());\n    assert(poor.id < galois::getActiveThreads());\n\n    Iter steal_beg;\n    Iter steal_end;\n\n    // stealWork should initialize to a more appropriate value\n    Diff_ty steal_size = 0;\n\n    bool succ =\n        rich.stealWork(steal_beg, steal_end, steal_size, amount, chunk_size);\n\n    if (succ) {\n      assert(steal_beg != steal_end);\n      assert(std::distance(steal_beg, steal_end) == steal_size);\n\n      poor.assignWork(steal_beg, steal_end, steal_size);\n    }\n\n    return succ;\n  }\n\n  GALOIS_ATTRIBUTE_NOINLINE bool stealWithinSocket(ThreadContext& poor) {\n\n    bool sawWork   = false;\n    bool stoleWork = false;\n\n    auto& tp = substrate::getThreadPool();\n\n    const unsigned maxT     = galois::getActiveThreads();\n    const unsigned my_pack  = substrate::ThreadPool::getSocket();\n    const unsigned per_pack = tp.getMaxThreads() / tp.getMaxSockets();\n\n    const unsigned pack_beg = my_pack * per_pack;\n    const unsigned pack_end = (my_pack + 1) * per_pack;\n\n    for (unsigned i = 1; i < pack_end; ++i) {\n\n      // go around the 
socket in circle starting from the next thread\n      unsigned t = (poor.id + i) % per_pack + pack_beg;\n      assert((t >= pack_beg) && (t < pack_end));\n\n      if (t < maxT) {\n        if (workers.getRemote(t)->hasWorkWeak()) {\n          sawWork = true;\n\n          stoleWork = transferWork(*workers.getRemote(t), poor, HALF);\n\n          if (stoleWork) {\n            break;\n          }\n        }\n      }\n    }\n\n    return sawWork || stoleWork;\n  }\n\n  GALOIS_ATTRIBUTE_NOINLINE bool stealOutsideSocket(ThreadContext& poor,\n                                                    const StealAmt& amt) {\n    bool sawWork   = false;\n    bool stoleWork = false;\n\n    auto& tp       = substrate::getThreadPool();\n    unsigned myPkg = substrate::ThreadPool::getSocket();\n    // unsigned maxT = LL::getMaxThreads ();\n    unsigned maxT = galois::getActiveThreads();\n\n    for (unsigned i = 0; i < maxT; ++i) {\n      ThreadContext& rich = *(workers.getRemote((poor.id + i) % maxT));\n\n      if (tp.getSocket(rich.id) != myPkg) {\n        if (rich.hasWorkWeak()) {\n          sawWork = true;\n\n          stoleWork = transferWork(rich, poor, amt);\n          // stoleWork = transferWork (rich, poor, HALF);\n\n          if (stoleWork) {\n            break;\n          }\n        }\n      }\n    }\n\n    return sawWork || stoleWork;\n  }\n\n  GALOIS_ATTRIBUTE_NOINLINE bool trySteal(ThreadContext& poor) {\n    bool ret = false;\n\n    ret = stealWithinSocket(poor);\n\n    if (ret) {\n      return true;\n    }\n\n    substrate::asmPause();\n\n    if (substrate::getThreadPool().isLeader(poor.id)) {\n      ret = stealOutsideSocket(poor, HALF);\n\n      if (ret) {\n        return true;\n      }\n      substrate::asmPause();\n    }\n\n    ret = stealOutsideSocket(poor, HALF);\n    if (ret) {\n      return true;\n    }\n    substrate::asmPause();\n\n    return ret;\n  }\n\nprivate:\n  R range;\n  F func;\n  const char* loopname;\n  Diff_ty chunk_size;\n  
substrate::PerThreadStorage<ThreadContext> workers;\n\n  substrate::TerminationDetection& term;\n\n  // for stats\n  PerThreadTimer<MORE_STATS> totalTime;\n  PerThreadTimer<MORE_STATS> initTime;\n  PerThreadTimer<MORE_STATS> execTime;\n  PerThreadTimer<MORE_STATS> stealTime;\n  PerThreadTimer<MORE_STATS> termTime;\n\npublic:\n  DoAllStealingExec(const R& _range, F _func, const ArgsTuple& argsTuple)\n      : range(_range), func(_func),\n        loopname(galois::internal::getLoopName(argsTuple)),\n        chunk_size(get_trait_value<chunk_size_tag>(argsTuple).value),\n        term(substrate::getSystemTermination(activeThreads)),\n        totalTime(loopname, \"Total\"), initTime(loopname, \"Init\"),\n        execTime(loopname, \"Execute\"), stealTime(loopname, \"Steal\"),\n        termTime(loopname, \"Term\") {\n    assert(chunk_size > 0);\n  }\n\n  // parallel call\n  void initThread(void) {\n    initTime.start();\n\n    term.initializeThread();\n\n    unsigned id = substrate::ThreadPool::getTID();\n\n    *workers.getLocal(id) =\n        ThreadContext(id, range.local_begin(), range.local_end());\n\n    initTime.stop();\n  }\n\n  ~DoAllStealingExec() {\n// executed serially\n#ifndef NDEBUG\n    for (unsigned i = 0; i < workers.size(); ++i) {\n      auto& ctx = *(workers.getRemote(i));\n      assert(!ctx.hasWork() && \"Unprocessed work left\");\n    }\n#endif\n\n    // printStats ();\n  }\n\n  void operator()(void) {\n\n    ThreadContext& ctx = *workers.getLocal();\n    totalTime.start();\n\n    while (true) {\n      bool workHappened = false;\n\n      execTime.start();\n\n      if (ctx.doWork(func, chunk_size)) {\n        workHappened = true;\n      }\n\n      execTime.stop();\n\n      assert(!ctx.hasWork());\n\n      stealTime.start();\n      bool stole = trySteal(ctx);\n      stealTime.stop();\n\n      if (stole) {\n        continue;\n\n      } else {\n\n        assert(!ctx.hasWork());\n        if (USE_TERM) {\n          termTime.start();\n          
term.localTermination(workHappened);\n\n          bool quit = term.globalTermination();\n          termTime.stop();\n\n          if (quit) {\n            break;\n          }\n        } else {\n          break;\n        }\n      }\n    }\n\n    totalTime.stop();\n    assert(!ctx.hasWork());\n\n    if (NEED_STATS) {\n      galois::runtime::reportStat_Tsum(loopname, \"Iterations\", ctx.num_iter);\n    }\n  }\n};\n\ntemplate <bool _STEAL>\nstruct ChooseDoAllImpl {\n\n  template <typename R, typename F, typename ArgsT>\n  static void call(const R& range, F&& func, const ArgsT& argsTuple) {\n\n    internal::DoAllStealingExec<\n        R, OperatorReferenceType<decltype(std::forward<F>(func))>, ArgsT>\n        exec(range, std::forward<F>(func), argsTuple);\n\n    substrate::Barrier& barrier = getBarrier(activeThreads);\n\n    substrate::getThreadPool().run(\n        activeThreads, [&exec](void) { exec.initThread(); }, std::ref(barrier),\n        std::ref(exec));\n  }\n};\n\ntemplate <>\nstruct ChooseDoAllImpl<false> {\n\n  template <typename R, typename F, typename ArgsT>\n  static void call(const R& range, F func, const ArgsT& argsTuple) {\n\n    runtime::on_each_gen(\n        [&](const unsigned int, const unsigned int) {\n          static constexpr bool NEED_STATS =\n              galois::internal::NeedStats<ArgsT>::value;\n          static constexpr bool MORE_STATS =\n              NEED_STATS && has_trait<more_stats_tag, ArgsT>();\n\n          const char* const loopname = galois::internal::getLoopName(argsTuple);\n\n          PerThreadTimer<MORE_STATS> totalTime(loopname, \"Total\");\n          PerThreadTimer<MORE_STATS> initTime(loopname, \"Init\");\n          PerThreadTimer<MORE_STATS> execTime(loopname, \"Work\");\n\n          totalTime.start();\n          initTime.start();\n\n          auto begin     = range.local_begin();\n          const auto end = range.local_end();\n\n          initTime.stop();\n\n          execTime.start();\n\n          size_t iter = 0;\n\n     
     while (begin != end) {\n            func(*begin++);\n            if (NEED_STATS) {\n              ++iter;\n            }\n          }\n          execTime.stop();\n\n          totalTime.stop();\n\n          if (NEED_STATS) {\n            galois::runtime::reportStat_Tsum(loopname, \"Iterations\", iter);\n          }\n        },\n        std::make_tuple());\n  }\n};\n\n} // end namespace internal\n\ntemplate <typename R, typename F, typename ArgsTuple>\nvoid do_all_gen(const R& range, F&& func, const ArgsTuple& argsTuple) {\n\n  static_assert(!has_trait<char*, ArgsTuple>(), \"old loopname\");\n  static_assert(!has_trait<char const*, ArgsTuple>(), \"old loopname\");\n  static_assert(!has_trait<bool, ArgsTuple>(), \"old steal\");\n\n  auto argsT = std::tuple_cat(\n      argsTuple,\n      get_default_trait_values(argsTuple, std::make_tuple(chunk_size_tag{}),\n                               std::make_tuple(chunk_size<>{})));\n\n  using ArgsT = decltype(argsT);\n\n  constexpr bool TIME_IT = has_trait<loopname_tag, ArgsT>();\n  CondStatTimer<TIME_IT> timer(galois::internal::getLoopName(argsT));\n\n  timer.start();\n\n  constexpr bool STEAL = has_trait<steal_tag, ArgsT>();\n\n  OperatorReferenceType<decltype(std::forward<F>(func))> func_ref = func;\n  internal::ChooseDoAllImpl<STEAL>::call(range, func_ref, argsT);\n\n  timer.stop();\n}\n\n} // namespace galois::runtime\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Executor_ForEach.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_EXECUTOR_FOREACH_H\n#define GALOIS_RUNTIME_EXECUTOR_FOREACH_H\n\n#include <algorithm>\n#include <functional>\n#include <memory>\n#include <utility>\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/Mem.h\"\n#include \"galois/runtime/Context.h\"\n#include \"galois/runtime/LoopStatistics.h\"\n#include \"galois/runtime/OperatorReferenceTypes.h\"\n#include \"galois/runtime/Range.h\"\n#include \"galois/runtime/Statistics.h\"\n#include \"galois/runtime/Substrate.h\"\n#include \"galois/runtime/ThreadTimer.h\"\n#include \"galois/runtime/UserContextAccess.h\"\n#include \"galois/substrate/Termination.h\"\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/Threads.h\"\n#include \"galois/Timer.h\"\n#include \"galois/Traits.h\"\n#include 
\"galois/worklists/Chunk.h\"\n#include \"galois/worklists/Simple.h\"\n\nnamespace galois {\n//! Internal Galois functionality - Use at your own risk.\nnamespace runtime {\n\ntemplate <typename value_type>\nclass AbortHandler {\n  struct Item {\n    value_type val;\n    int retries;\n  };\n\n  typedef worklists::GFIFO<Item> AbortedList;\n  substrate::PerThreadStorage<AbortedList> queues;\n  bool useBasicPolicy;\n\n  /**\n   * Policy: serialize via tree over sockets.\n   */\n  void basicPolicy(const Item& item) {\n    auto& tp        = substrate::getThreadPool();\n    unsigned socket = tp.getSocket();\n    queues.getRemote(tp.getLeaderForSocket(socket / 2))->push(item);\n  }\n\n  /**\n   * Policy: retry work 2X locally, then serialize via tree on socket (trying\n   * twice at each level), then serialize via tree over sockets.\n   */\n  void doublePolicy(const Item& item) {\n    int retries = item.retries - 1;\n    if ((retries & 1) == 1) {\n      queues.getLocal()->push(item);\n      return;\n    }\n\n    unsigned tid    = substrate::ThreadPool::getTID();\n    auto& tp        = substrate::getThreadPool();\n    unsigned socket = substrate::ThreadPool::getSocket();\n    unsigned leader = substrate::ThreadPool::getLeader();\n    if (tid != leader) {\n      unsigned next = leader + (tid - leader) / 2;\n      queues.getRemote(next)->push(item);\n    } else {\n      queues.getRemote(tp.getLeaderForSocket(socket / 2))->push(item);\n    }\n  }\n\n  /**\n   * Policy: retry work 2X locally, then serialize via tree on socket but\n   * try at most 3 levels, then serialize via tree over sockets.\n   */\n  void boundedPolicy(const Item& item) {\n    int retries = item.retries - 1;\n    if (retries < 2) {\n      queues.getLocal()->push(item);\n      return;\n    }\n\n    unsigned tid    = substrate::ThreadPool::getTID();\n    auto& tp        = substrate::getThreadPool();\n    unsigned socket = substrate::ThreadPool::getSocket();\n    unsigned leader = 
tp.getLeaderForSocket(socket);\n    if (retries < 5 && tid != leader) {\n      unsigned next = leader + (tid - leader) / 2;\n      queues.getRemote(next)->push(item);\n    } else {\n      queues.getRemote(tp.getLeaderForSocket(socket / 2))->push(item);\n    }\n  }\n\n  /**\n   * Retry locally only.\n   */\n  void eagerPolicy(const Item& item) { queues.getLocal()->push(item); }\n\npublic:\n  AbortHandler() {\n    // XXX(ddn): Implement smarter adaptive policy\n    useBasicPolicy = substrate::getThreadPool().getMaxSockets() > 2;\n  }\n\n  value_type& value(Item& item) const { return item.val; }\n  value_type& value(value_type& val) const { return val; }\n\n  void push(const value_type& val) {\n    Item item = {val, 1};\n    queues.getLocal()->push(item);\n  }\n\n  void push(const Item& item) {\n    Item newitem = {item.val, item.retries + 1};\n    if (useBasicPolicy)\n      basicPolicy(newitem);\n    else\n      doublePolicy(newitem);\n  }\n\n  AbortedList* getQueue() { return queues.getLocal(); }\n};\n\n// TODO(ddn): Implement wrapper to allow calling without UserContext\n// TODO(ddn): Check for operators that implement both with and without context\ntemplate <class WorkListTy, class FunctionTy, typename ArgsTy>\nclass ForEachExecutor {\npublic:\n  static constexpr bool needStats = galois::internal::NeedStats<ArgsTy>::value;\n  static constexpr bool needsPush = !has_trait<no_pushes_tag, ArgsTy>();\n  static constexpr bool needsAborts =\n      !has_trait<disable_conflict_detection_tag, ArgsTy>();\n  static constexpr bool needsPia   = has_trait<per_iter_alloc_tag, ArgsTy>();\n  static constexpr bool needsBreak = has_trait<parallel_break_tag, ArgsTy>();\n  static constexpr bool MORE_STATS =\n      needStats && has_trait<more_stats_tag, ArgsTy>();\n\nprotected:\n  typedef typename WorkListTy::value_type value_type;\n\n  struct ThreadLocalBasics {\n    UserContextAccess<value_type> facing;\n    FunctionTy function;\n    SimpleRuntimeContext ctx;\n\n    explicit 
ThreadLocalBasics(FunctionTy fn) : facing(), function(fn), ctx() {}\n  };\n\n  using LoopStat = LoopStatistics<needStats>;\n\n  struct ThreadLocalData : public ThreadLocalBasics, public LoopStat {\n\n    ThreadLocalData(FunctionTy fn, const char* ln)\n        : ThreadLocalBasics(fn), LoopStat(ln) {}\n  };\n\n  // RunQueueState factors out state within runQueue iterations to protect it\n  // from being overwritten when using longjmp/setjmp.\n  template <typename WL>\n  struct RunQueueState {\n    unsigned int num = 0;\n    galois::optional<typename WL::value_type> item;\n  };\n\n  // NB: Place dynamically growing wl after fixed-size PerThreadStorage\n  // members to give higher likelihood of reclaiming PerThreadStorage\n\n  AbortHandler<value_type> aborted;\n  substrate::TerminationDetection& term;\n  substrate::Barrier& barrier;\n\n  WorkListTy wl;\n  FunctionTy origFunction;\n  const char* loopname;\n  bool broke;\n\n  PerThreadTimer<MORE_STATS> initTime;\n  PerThreadTimer<MORE_STATS> execTime;\n\n  inline void commitIteration(ThreadLocalData& tld) {\n    if (needsPush) {\n      // auto ii = tld.facing.getPushBuffer().begin();\n      // auto ee = tld.facing.getPushBuffer().end();\n      auto& pb = tld.facing.getPushBuffer();\n      auto n   = pb.size();\n      if (n) {\n        tld.inc_pushes(n);\n        wl.push(pb.begin(), pb.end());\n        pb.clear();\n      }\n    }\n    if (needsPia)\n      tld.facing.resetAlloc();\n    if (needsAborts)\n      tld.ctx.commitIteration();\n    //++tld.stat_commits;\n  }\n\n  template <typename Item>\n  GALOIS_ATTRIBUTE_NOINLINE void abortIteration(const Item& item,\n                                                ThreadLocalData& tld) {\n    assert(needsAborts);\n    tld.ctx.cancelIteration();\n    tld.inc_conflicts();\n    aborted.push(item);\n    // clear push buffer\n    if (needsPush)\n      tld.facing.resetPushBuffer();\n    // reset allocator\n    if (needsPia)\n      tld.facing.resetAlloc();\n  }\n\n  inline void 
doProcess(value_type& val, ThreadLocalData& tld) {\n    if (needsAborts)\n      tld.ctx.startIteration();\n\n    tld.inc_iterations();\n    tld.function(val, tld.facing.data());\n    commitIteration(tld);\n  }\n\n  bool runQueueSimple(ThreadLocalData& tld) {\n    galois::optional<value_type> p;\n    bool didWork = false;\n    while ((p = wl.pop())) {\n      didWork = true;\n      doProcess(*p, tld);\n    }\n    return didWork;\n  }\n\n  template <unsigned int limit, typename WL>\n  void runQueueDispatch(ThreadLocalData& tld, WL& lwl, RunQueueState<WL>& s) {\n#ifdef GALOIS_USE_LONGJMP_ABORT\n    if (setjmp(execFrame) == 0) {\n      while ((!limit || s.num < limit) && (s.item = lwl.pop())) {\n        ++s.num;\n        doProcess(aborted.value(*s.item), tld);\n      }\n    } else {\n      clearConflictLock();\n      abortIteration(*s.item, tld);\n    }\n#elif defined(GALOIS_USE_EXCEPTION_ABORT)\n    try {\n      while ((!limit || s.num < limit) && (s.item = lwl.pop())) {\n        ++s.num;\n        doProcess(aborted.value(*s.item), tld);\n      }\n    } catch (ConflictFlag const& flag) {\n      clearConflictLock();\n      abortIteration(*s.item, tld);\n    }\n#endif\n  }\n\n  template <unsigned int limit, typename WL>\n  bool runQueue(ThreadLocalData& tld, WL& lwl) {\n    RunQueueState<WL> s;\n    runQueueDispatch<limit>(tld, lwl, s);\n    return s.num > 0;\n  }\n\n  GALOIS_ATTRIBUTE_NOINLINE\n  bool handleAborts(ThreadLocalData& tld) {\n    return runQueue<0>(tld, *aborted.getQueue());\n  }\n\n  void fastPushBack(typename UserContextAccess<value_type>::PushBufferTy& x) {\n    wl.push(x.begin(), x.end());\n    x.clear();\n  }\n\n  bool checkEmpty(WorkListTy&, ThreadLocalData&, ...) 
{ return true; }\n\n  template <typename WL>\n  auto checkEmpty(WL& wl, ThreadLocalData&, int)\n      -> decltype(wl.empty(), bool()) {\n    return wl.empty();\n  }\n\n  template <bool couldAbort, bool isLeader>\n  void go() {\n\n    execTime.start();\n\n    // Thread-local data goes on the local stack to be NUMA friendly\n    ThreadLocalData tld(origFunction, loopname);\n    if (needsBreak)\n      tld.facing.setBreakFlag(&broke);\n    if (couldAbort)\n      setThreadContext(&tld.ctx);\n    if (needsPush && !couldAbort)\n      tld.facing.setFastPushBack(std::bind(&ForEachExecutor::fastPushBack, this,\n                                           std::placeholders::_1));\n\n    while (true) {\n      do {\n        bool didWork = false;\n\n        // Run some iterations\n        if (couldAbort || needsBreak) {\n          constexpr int __NUM = (needsBreak || isLeader) ? 64 : 0;\n          bool b              = runQueue<__NUM>(tld, wl);\n          didWork             = b || didWork;\n          // Check for abort\n          if (couldAbort) {\n            b       = handleAborts(tld);\n            didWork = b || didWork;\n          }\n        } else { // No try/catch\n          bool b  = runQueueSimple(tld);\n          didWork = b || didWork;\n        }\n\n        // Update node color and prop token\n        term.localTermination(didWork);\n        substrate::asmPause(); // Let token propagate\n      } while (!term.globalTermination() && (!needsBreak || !broke));\n\n      if (checkEmpty(wl, tld, 0)) {\n        execTime.stop();\n        break;\n      }\n\n      if (needsBreak && broke) {\n        execTime.stop();\n        break;\n      }\n\n      term.initializeThread();\n      barrier.wait();\n    }\n\n    if (couldAbort)\n      setThreadContext(0);\n  }\n\n  struct T1 {};\n  struct T2 {};\n\n  template <typename... WArgsTy>\n  ForEachExecutor(T2, FunctionTy f, const ArgsTy& args, WArgsTy... 
wargs)\n      : term(substrate::getSystemTermination(activeThreads)),\n        barrier(getBarrier(activeThreads)), wl(std::forward<WArgsTy>(wargs)...),\n        origFunction(f), loopname(galois::internal::getLoopName(args)),\n        broke(false), initTime(loopname, \"Init\"),\n        execTime(loopname, \"Execute\") {}\n\n  template <typename WArgsTy, size_t... Is>\n  ForEachExecutor(T1, FunctionTy f, const ArgsTy& args, const WArgsTy& wlargs,\n                  std::index_sequence<Is...>)\n      : ForEachExecutor(T2{}, f, args, std::get<Is>(wlargs)...) {}\n\n  template <typename WArgsTy>\n  ForEachExecutor(T1, FunctionTy f, const ArgsTy& args, const WArgsTy&,\n                  std::index_sequence<>)\n      : ForEachExecutor(T2{}, f, args) {}\n\npublic:\n  ForEachExecutor(FunctionTy f, const ArgsTy& args)\n      : ForEachExecutor(T1{}, f, args, get_trait_value<wl_tag>(args).args,\n                        std::make_index_sequence<std::tuple_size<decltype(\n                            get_trait_value<wl_tag>(args).args)>::value>{}) {}\n\n  template <typename RangeTy>\n  void init(const RangeTy&) {}\n\n  template <typename RangeTy>\n  void initThread(const RangeTy& range) {\n\n    initTime.start();\n\n    wl.push_initial(range);\n    term.initializeThread();\n\n    initTime.stop();\n  }\n\n  void operator()() {\n    bool isLeader   = substrate::ThreadPool::isLeader();\n    bool couldAbort = needsAborts && activeThreads > 1;\n    if (couldAbort && isLeader)\n      go<true, true>();\n    else if (couldAbort && !isLeader)\n      go<true, false>();\n    else if (!couldAbort && isLeader)\n      go<false, true>();\n    else\n      go<false, false>();\n  }\n};\n\ntemplate <typename WLTy>\nconstexpr auto has_with_iterator(int) -> decltype(\n    std::declval<typename WLTy::template with_iterator<int*>::type>(), bool()) {\n  return true;\n}\n\ntemplate <typename>\nconstexpr auto has_with_iterator(...) 
-> bool {\n  return false;\n}\n\ntemplate <typename WLTy, typename IterTy, typename Enable = void>\nstruct reiterator {\n  typedef WLTy type;\n};\n\ntemplate <typename WLTy, typename IterTy>\nstruct reiterator<WLTy, IterTy,\n                  typename std::enable_if<has_with_iterator<WLTy>(0)>::type> {\n  typedef typename WLTy::template with_iterator<IterTy>::type type;\n};\n\n// TODO(ddn): Think about folding in range into args too\ntemplate <typename RangeTy, typename FunctionTy, typename ArgsTy>\nvoid for_each_impl(const RangeTy& range, FunctionTy&& fn, const ArgsTy& args) {\n  typedef typename std::iterator_traits<typename RangeTy::iterator>::value_type\n      value_type;\n  typedef typename get_trait_type<wl_tag, ArgsTy>::type::type BaseWorkListTy;\n  typedef typename reiterator<BaseWorkListTy, typename RangeTy::iterator>::\n      type ::template retype<value_type>\n          WorkListTy;\n  using FuncRefType =\n      OperatorReferenceType<decltype(std::forward<FunctionTy>(fn))>;\n  typedef ForEachExecutor<WorkListTy, FuncRefType, ArgsTy> WorkTy;\n\n  auto& barrier      = getBarrier(activeThreads);\n  FuncRefType fn_ref = fn;\n  WorkTy W(fn_ref, args);\n  W.init(range);\n  substrate::getThreadPool().run(\n      activeThreads, [&W, &range]() { W.initThread(range); }, std::ref(barrier),\n      std::ref(W));\n}\n\n// TODO: Need to decide whether user should provide num_run tag or\n// num_run can be provided by loop instance which is guaranteed to be unique\n\n//! 
Normalize arguments to for_each\ntemplate <typename RangeTy, typename FunctionTy, typename TupleTy>\nvoid for_each_gen(const RangeTy& r, FunctionTy&& fn, const TupleTy& tpl) {\n  static_assert(!has_trait<char*, TupleTy>(), \"old loopname\");\n  static_assert(!has_trait<char const*, TupleTy>(), \"old loopname\");\n  static_assert(!has_trait<bool, TupleTy>(), \"old steal\");\n\n  auto ftpl = std::tuple_cat(tpl, typename function_traits<FunctionTy>::type{});\n\n  auto xtpl = std::tuple_cat(\n      ftpl, get_default_trait_values(tpl, std::make_tuple(wl_tag{}),\n                                     std::make_tuple(wl<defaultWL>())));\n\n  constexpr bool TIME_IT = has_trait<loopname_tag, decltype(xtpl)>();\n  CondStatTimer<TIME_IT> timer(galois::internal::getLoopName(xtpl));\n\n  timer.start();\n\n  runtime::for_each_impl(r, std::forward<FunctionTy>(fn), xtpl);\n\n  timer.stop();\n}\n\n} // end namespace runtime\n} // end namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Executor_OnEach.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_EXECUTOR_ONEACH_H\n#define GALOIS_RUNTIME_EXECUTOR_ONEACH_H\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/runtime/OperatorReferenceTypes.h\"\n#include \"galois/runtime/Statistics.h\"\n#include \"galois/runtime/ThreadTimer.h\"\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/Threads.h\"\n#include \"galois/Timer.h\"\n#include \"galois/Traits.h\"\n\nnamespace galois {\nnamespace runtime {\n\nnamespace internal {\n\ntemplate <typename FunctionTy, typename ArgsTy>\ninline void on_each_impl(FunctionTy&& fn, const ArgsTy& argsTuple) {\n\n  static_assert(!has_trait<char*, ArgsTy>(), \"old loopname\");\n  static_assert(!has_trait<char const*, ArgsTy>(), \"old loopname\");\n\n  static constexpr bool NEEDS_STATS = has_trait<loopname_tag, ArgsTy>();\n  static 
constexpr bool MORE_STATS =\n      NEEDS_STATS && has_trait<more_stats_tag, ArgsTy>();\n\n  const char* const loopname = galois::internal::getLoopName(argsTuple);\n\n  CondStatTimer<NEEDS_STATS> timer(loopname);\n\n  PerThreadTimer<MORE_STATS> execTime(loopname, \"Execute\");\n\n  const auto numT = getActiveThreads();\n\n  OperatorReferenceType<decltype(std::forward<FunctionTy>(fn))> fn_ref = fn;\n\n  auto runFun = [&] {\n    execTime.start();\n\n    fn_ref(substrate::ThreadPool::getTID(), numT);\n\n    execTime.stop();\n  };\n\n  timer.start();\n  substrate::getThreadPool().run(numT, runFun);\n  timer.stop();\n}\n\n} // namespace internal\n\ntemplate <typename FunctionTy, typename TupleTy>\ninline void on_each_gen(FunctionTy&& fn, const TupleTy& tpl) {\n  internal::on_each_impl(std::forward<FunctionTy>(fn), tpl);\n}\n\n} // end namespace runtime\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Executor_Ordered.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_EXECUTOR_ORDERED_H\n#define GALOIS_RUNTIME_EXECUTOR_ORDERED_H\n\n#include \"galois/config.h\"\n\nnamespace galois {\nnamespace runtime {\n\n// TODO(ddn): Pull in and integrate in executors from exp\n\ntemplate <typename Iter, typename Cmp, typename NhFunc, typename OpFunc>\nvoid for_each_ordered_impl(Iter GALOIS_UNUSED(beg), Iter GALOIS_UNUSED(end),\n                           const Cmp& GALOIS_UNUSED(cmp),\n                           const NhFunc& GALOIS_UNUSED(nhFunc),\n                           const OpFunc& GALOIS_UNUSED(opFunc),\n                           const char* GALOIS_UNUSED(loopname)) {\n  GALOIS_DIE(\"not yet implemented\");\n}\n\ntemplate <typename Iter, typename Cmp, typename NhFunc, typename OpFunc,\n          typename StableTest>\nvoid for_each_ordered_impl(Iter 
GALOIS_UNUSED(beg), Iter GALOIS_UNUSED(end),\n                           const Cmp& GALOIS_UNUSED(cmp),\n                           const NhFunc& GALOIS_UNUSED(nhFunc),\n                           const OpFunc& GALOIS_UNUSED(opFunc),\n                           const StableTest& GALOIS_UNUSED(stabilityTest),\n                           const char* GALOIS_UNUSED(loopname)) {\n  GALOIS_DIE(\"not yet implemented\");\n}\n\n} // end namespace runtime\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Executor_ParaMeter.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_EXECUTOR_PARAMETER_H\n#define GALOIS_RUNTIME_EXECUTOR_PARAMETER_H\n\n#include <algorithm>\n#include <cstdio>\n#include <cstdlib>\n#include <ctime>\n#include <deque>\n#include <random>\n#include <vector>\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/Mem.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/runtime/Context.h\"\n#include \"galois/runtime/Executor_ForEach.h\"\n#include \"galois/runtime/Executor_DoAll.h\"\n#include \"galois/runtime/Executor_OnEach.h\"\n#include \"galois/PerThreadContainer.h\"\n#include \"galois/Traits.h\"\n#include \"galois/worklists/Simple.h\"\n\nnamespace galois {\nnamespace runtime {\n\nnamespace ParaMeter {\n\nstruct StepStatsBase {\n  static inline void printHeader(FILE* out) {\n    fprintf(out,\n            \"LOOPNAME, STEP, 
PARALLELISM, WORKLIST_SIZE, NEIGHBORHOOD_SIZE\\n\");\n  }\n\n  static inline void dump(FILE* out, const char* loopname, size_t step,\n                          size_t parallelism, size_t wlSize, size_t nhSize) {\n    assert(out && \"StepStatsBase::dump() file handle is null\");\n    fprintf(out, \"%s, %zu, %zu, %zu, %zu\\n\", loopname, step, parallelism,\n            wlSize, nhSize);\n  }\n};\n\nstruct OrderedStepStats : public StepStatsBase {\n  using Base = StepStatsBase;\n\n  const size_t step;\n  GAccumulator<size_t> parallelism;\n  const size_t wlSize;\n\n  explicit OrderedStepStats(size_t _step, size_t _wlsz)\n      : Base(), step(_step), parallelism(), wlSize(_wlsz) {}\n\n  explicit OrderedStepStats(size_t _step, size_t par, size_t _wlsz)\n      : Base(), step(_step), parallelism(), wlSize(_wlsz) {\n    parallelism += par;\n  }\n\n  void dump(FILE* out, const char* loopname) {\n    Base::dump(out, loopname, step, parallelism.reduce(), wlSize, 0ul);\n  }\n};\n\nstruct UnorderedStepStats : public StepStatsBase {\n  using Base = StepStatsBase;\n\n  size_t step;\n  GAccumulator<size_t> parallelism;\n  GAccumulator<size_t> wlSize;\n  GAccumulator<size_t> nhSize;\n\n  UnorderedStepStats(void) : Base(), step(0) {}\n\n  void nextStep(void) {\n    ++step;\n    parallelism.reset();\n    wlSize.reset();\n    nhSize.reset();\n  }\n\n  void dump(FILE* out, const char* loopname) {\n    Base::dump(out, loopname, step, parallelism.reduce(), wlSize.reduce(),\n               nhSize.reduce());\n  }\n};\n\n// Single ParaMeter stats file per run of an app\n// which includes all instances of for_each loops\n// run with ParaMeter Executor\nFILE* getStatsFile(void);\nvoid closeStatsFile(void);\n\ntemplate <typename T>\nclass FIFO_WL {\n\nprotected:\n  using PTcont = galois::PerThreadVector<T>;\n\n  PTcont* curr;\n  PTcont* next;\n\npublic:\n  FIFO_WL(void) : curr(new PTcont()), next(new PTcont()) {}\n\n  ~FIFO_WL(void) {\n    delete curr;\n    curr = nullptr;\n    delete next;\n    
next = nullptr;\n  }\n\n  auto iterateCurr(void) { return galois::runtime::makeLocalRange(*curr); }\n\n  void pushNext(const T& item) { next->get().push_back(item); }\n\n  void nextStep(void) {\n    std::swap(curr, next);\n    next->clear_all_parallel();\n  }\n\n  bool empty(void) const { return next->empty_all(); }\n};\n\ntemplate <typename T>\nclass RAND_WL : public FIFO_WL<T> {\n  using Base = FIFO_WL<T>;\n\npublic:\n  auto iterateCurr(void) {\n    galois::runtime::on_each_gen(\n        [&](int, int) {\n          auto& lwl = Base::curr->get();\n\n          std::random_device r;\n          std::mt19937 rng(r());\n          std::shuffle(lwl.begin(), lwl.end(), rng);\n        },\n        std::make_tuple());\n\n    return galois::runtime::makeLocalRange(*Base::curr);\n  }\n};\n\ntemplate <typename T>\nclass LIFO_WL : public FIFO_WL<T> {\n  using Base = FIFO_WL<T>;\n\npublic:\n  auto iterateCurr(void) {\n\n    // TODO: use reverse iterator instead of std::reverse\n    galois::runtime::on_each_gen(\n        [&](int, int) {\n          auto& lwl = Base::curr->get();\n          std::reverse(lwl.begin(), lwl.end());\n        },\n        std::make_tuple());\n\n    return galois::runtime::makeLocalRange(*Base::curr);\n  }\n};\n\nenum class SchedType { FIFO, RAND, LIFO };\n\ntemplate <typename T, SchedType SCHED>\nstruct ChooseWL {};\n\ntemplate <typename T>\nstruct ChooseWL<T, SchedType::FIFO> {\n  using type = FIFO_WL<T>;\n};\n\ntemplate <typename T>\nstruct ChooseWL<T, SchedType::LIFO> {\n  using type = LIFO_WL<T>;\n};\n\ntemplate <typename T>\nstruct ChooseWL<T, SchedType::RAND> {\n  using type = RAND_WL<T>;\n};\n\ntemplate <class T, class FunctionTy, class ArgsTy>\nclass ParaMeterExecutor {\n\n  using value_type = T;\n  using GenericWL  = typename get_trait_type<wl_tag, ArgsTy>::type::type;\n  using WorkListTy = typename GenericWL::template retype<T>;\n  using dbg        = galois::debug<1>;\n\n  constexpr static bool needsStats = !has_trait<no_stats_tag, ArgsTy>();\n  
constexpr static bool needsPush  = !has_trait<no_pushes_tag, ArgsTy>();\n  constexpr static bool needsAborts =\n      !has_trait<disable_conflict_detection_tag, ArgsTy>();\n  constexpr static bool needsPia   = has_trait<per_iter_alloc_tag, ArgsTy>();\n  constexpr static bool needsBreak = has_trait<parallel_break_tag, ArgsTy>();\n\n  struct IterationContext {\n    T item;\n    bool doabort;\n    galois::runtime::UserContextAccess<value_type> facing;\n    SimpleRuntimeContext ctx;\n\n    explicit IterationContext(const T& v) : item(v), doabort(false) {}\n\n    void reset() {\n      doabort = false;\n      if (needsPia)\n        facing.resetAlloc();\n\n      if (needsPush)\n        facing.getPushBuffer().clear();\n    }\n  };\n\n  using PWL = typename ChooseWL<IterationContext*, WorkListTy::SCHEDULE>::type;\n\nprivate:\n  PWL m_wl;\n  FunctionTy m_func;\n  const char* loopname;\n  FILE* m_statsFile;\n  FixedSizeAllocator<IterationContext> m_iterAlloc;\n  galois::GReduceLogicalOr m_broken;\n\n  IterationContext* newIteration(const T& item) {\n    IterationContext* it = m_iterAlloc.allocate(1);\n    assert(it && \"IterationContext allocation failed\");\n\n    m_iterAlloc.construct(it, item);\n\n    it->reset();\n    return it;\n  }\n\n  unsigned abortIteration(IterationContext* it) {\n    assert(it && \"nullptr arg\");\n    assert(it->doabort &&\n           \"aborting an iteration without setting its doabort flag\");\n\n    unsigned numLocks = it->ctx.cancelIteration();\n    it->reset();\n\n    m_wl.pushNext(it);\n    return numLocks;\n  }\n\n  unsigned commitIteration(IterationContext* it) {\n    assert(it && \"nullptr arg\");\n\n    if (needsPush) {\n      for (const auto& item : it->facing.getPushBuffer()) {\n        IterationContext* child = newIteration(item);\n        m_wl.pushNext(child);\n      }\n    }\n\n    unsigned numLocks = it->ctx.commitIteration();\n    it->reset();\n\n    m_iterAlloc.destroy(it);\n    m_iterAlloc.deallocate(it, 1);\n\n    return 
numLocks;\n  }\n\nprivate:\n  void runSimpleStep(UnorderedStepStats& stats) {\n    galois::runtime::do_all_gen(\n        m_wl.iterateCurr(),\n        [&, this](IterationContext* it) {\n          stats.wlSize += 1;\n\n          setThreadContext(&(it->ctx));\n\n          m_func(it->item, it->facing.data());\n          stats.parallelism += 1;\n          unsigned nh = commitIteration(it);\n          stats.nhSize += nh;\n\n          setThreadContext(nullptr);\n        },\n        std::make_tuple(galois::steal(), galois::loopname(\"ParaM-Simple\")));\n  }\n\n  void runCautiousStep(UnorderedStepStats& stats){galois::runtime::do_all_gen(\n      m_wl.iterateCurr(),\n      [&, this](IterationContext* it) {\n        stats.wlSize += 1;\n\n        setThreadContext(&(it->ctx));\n        bool broke = false;\n\n        if (needsBreak) {\n          it->facing.setBreakFlag(&broke);\n        }\n#ifdef GALOIS_USE_LONGJMP_ABORT\n        int flag = 0;\n        if ((flag = setjmp(execFrame)) == 0) {\n          m_func(it->item, it->facing.data());\n\n        } else {\n#elif GALOIS_USE_EXCEPTION_ABORT\n        try {\n          m_func(it->item, it->facing.data());\n\n        } catch (const ConflictFlag& flag) {\n#endif\n          clearConflictLock();\n          switch (flag) {\n          case galois::runtime::CONFLICT:\n            it->doabort = true;\n            break;\n          default:\n            std::abort();\n          }\n        }\n\n        if (needsBreak && broke) {\n          m_broken.update(true);\n        }\n\n        setThreadContext(nullptr);\n      },\n      std::make_tuple(galois::steal(), galois::loopname(\"ParaM-Expand-NH\")));\n\n  galois::runtime::do_all_gen(\n      m_wl.iterateCurr(),\n      [&, this](IterationContext* it) {\n        if (it->doabort) {\n          abortIteration(it);\n\n        } else {\n          stats.parallelism += 1;\n          unsigned nh = commitIteration(it);\n          stats.nhSize += nh;\n        }\n      },\n      
std::make_tuple(galois::steal(), galois::loopname(\"ParaM-Commit\")));\n}\n\ntemplate <typename R>\nvoid execute(const R& range) {\n\n  galois::runtime::on_each_gen(\n      [&, this](const unsigned, const unsigned) {\n        auto p = range.local_pair();\n\n        for (auto i = p.first; i != p.second; ++i) {\n          IterationContext* it = newIteration(*i);\n          m_wl.pushNext(it);\n        }\n      },\n      std::make_tuple());\n\n  UnorderedStepStats stats;\n\n  while (!m_wl.empty()) {\n\n    m_wl.nextStep();\n\n    if (needsAborts) {\n      runCautiousStep(stats);\n\n    } else {\n      runSimpleStep(stats);\n    }\n\n    // dbg::print(\"Step: \", stats.step, \", Parallelism: \",\n    // stats.parallelism.reduce());\n    assert(stats.parallelism.reduce() && \"ERROR: No Progress\");\n\n    stats.dump(m_statsFile, loopname);\n    stats.nextStep();\n\n    if (needsBreak && m_broken.reduce()) {\n      break;\n    }\n\n  } // end while\n\n  closeStatsFile();\n}\n\npublic:\nParaMeterExecutor(const FunctionTy& f, const ArgsTy& args)\n    : m_func(f), loopname(galois::internal::getLoopName(args)),\n      m_statsFile(getStatsFile()) {}\n\n// called serially once\ntemplate <typename RangeTy>\nvoid init(const RangeTy& range) {\n  execute(range);\n}\n\n// called once on each thread followed by a barrier\ntemplate <typename RangeTy>\nvoid initThread(const RangeTy&) const {}\n\nvoid operator()(void) {}\n\n}; // namespace ParaMeter\n\n} // namespace runtime\n} // namespace galois\n\nnamespace worklists {\n\ntemplate <class T = int, runtime::ParaMeter::SchedType SCHED =\n                             runtime::ParaMeter::SchedType::FIFO>\nclass ParaMeter {\npublic:\n  template <bool _concurrent>\n  using rethread = ParaMeter<T, SCHED>;\n\n  template <typename _T>\n  using retype = ParaMeter<_T, SCHED>;\n\n  using value_type = T;\n\n  constexpr static const runtime::ParaMeter::SchedType SCHEDULE = SCHED;\n\n  using fifo   = ParaMeter<T, 
runtime::ParaMeter::SchedType::FIFO>;\n  using random = ParaMeter<T, runtime::ParaMeter::SchedType::RAND>;\n  using lifo   = ParaMeter<T, runtime::ParaMeter::SchedType::LIFO>;\n};\n\n} // namespace worklists\n\nnamespace runtime {\n\n// hookup into galois::for_each. Invoke galois::for_each with\n// wl<galois::worklists::ParaMeter<> >\ntemplate <class T, class FunctionTy, class ArgsTy>\nstruct ForEachExecutor<galois::worklists::ParaMeter<T>, FunctionTy, ArgsTy>\n    : public ParaMeter::ParaMeterExecutor<T, FunctionTy, ArgsTy> {\n  using SuperTy = ParaMeter::ParaMeterExecutor<T, FunctionTy, ArgsTy>;\n  ForEachExecutor(const FunctionTy& f, const ArgsTy& args) : SuperTy(f, args) {}\n};\n\n//! invoke ParaMeter tool to execute a for_each style loop\ntemplate <typename R, typename F, typename ArgsTuple>\nvoid for_each_ParaMeter(const R& range, const F& func,\n                        const ArgsTuple& argsTuple) {\n\n  using T = typename R::values_type;\n\n  auto tpl = galois::get_default_trait_values(\n      argsTuple, std::make_tuple(wl_tag{}),\n      std::make_tuple(wl<galois::worklists::ParaMeter<>>()));\n\n  using Tpl_ty = decltype(tpl);\n\n  using Exec = runtime::ParaMeter::ParaMeterExecutor<T, F, Tpl_ty>;\n  Exec exec(func, tpl);\n\n  exec.execute(range);\n}\n\n} // end namespace runtime\n} // end namespace galois\n#endif\n\n/*\n * requirements:\n * - support random and fifo schedules, maybe lifo\n * - write stats to a single file.\n * - support multi-threaded execution\n *\n * interface:\n * - file set by environment variable\n * - ParaMeter invoked by choosing wl type, e.g. ParaMeter<>::random, or\n * ParaMeter<>::fifo\n */\n"
  },
  {
    "path": "libgalois/include/galois/runtime/ExtraTraits.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file ExtraTraits.h\n *\n * Defines particular traits used by the distributed runtime.\n */\n\n#ifndef GALOIS_RUNTIME_EXTRA_TRAITS_H\n#define GALOIS_RUNTIME_EXTRA_TRAITS_H\n\n#include <type_traits>\n#include <boost/mpl/has_xxx.hpp>\n\n// depending on compiler version, trivially copyable defintion changes\n#if __GNUC__ < 5\n//! Defines what it means to be trivially copyable\n#define __is_trivially_copyable(type) __has_trivial_copy(type)\n#else\n//! Defines what it means to be trivially copyable\n#define __is_trivially_copyable(type) std::is_trivially_copyable<type>::value\n#endif\n\nnamespace galois {\nnamespace runtime {\n\nBOOST_MPL_HAS_XXX_TRAIT_DEF(tt_has_serialize)\n//! 
Indicates if T has the serialize trait\ntemplate <typename T>\nstruct has_serialize : public has_tt_has_serialize<T> {};\n\nBOOST_MPL_HAS_XXX_TRAIT_DEF(tt_is_copyable)\n//! Indicates if T is trivially copyable\ntemplate <typename T>\nstruct is_copyable : public has_tt_is_copyable<T> {};\n\n//! Indicates if T is serializable\ntemplate <typename T>\nstruct is_serializable {\n  //! true if T is serializable\n  static const bool value = has_serialize<T>::value || is_copyable<T>::value ||\n                            __is_trivially_copyable(T);\n};\n\n//! Indicates if T is memory copyable\ntemplate <typename T>\nstruct is_memory_copyable {\n  //! true if T is memory copyable\n  static const bool value = is_copyable<T>::value || __is_trivially_copyable(T);\n};\n\n} // namespace runtime\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Iterable.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_ITERABLE_H\n#define GALOIS_RUNTIME_ITERABLE_H\n\n#include \"galois/config.h\"\n\nnamespace galois {\nnamespace runtime {\n\n// iterable and make_iterable specific\n// From:\n// https://github.com/CppCon/CppCon2014/tree/master/Presentations/C%2B%2B11%20in%20the%20Wild%20-%20Techniques%20from%20a%20Real%20Codebase\n// Author: Arthur O'Dwyer\n// License: The C++ code in this directory is placed in the public domain and\n// may be reused or modified for any purpose, commercial or non-commercial.\n\ntemplate <class It>\nclass iterable {\n  It m_first, m_last;\n\npublic:\n  iterable() = default;\n  iterable(It first, It last) : m_first(first), m_last(last) {}\n  It begin() const { return m_first; }\n  It end() const { return m_last; }\n};\n\ntemplate <class It>\nstatic inline iterable<It> 
make_iterable(It a, It b) {\n  return iterable<It>(a, b);\n}\n\n} // end namespace runtime\n} // end namespace galois\n\n#endif // GALOIS_RUNTIME_ITERABLE_H\n"
  },
  {
    "path": "libgalois/include/galois/runtime/LoopStatistics.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_LOOPSTATISTICS_H\n#define GALOIS_RUNTIME_LOOPSTATISTICS_H\n\n#include \"galois/config.h\"\n#include \"galois/runtime/Statistics.h\"\n\nnamespace galois {\nnamespace runtime {\n\n// Usually instantiated per thread\ntemplate <bool Enabled>\nclass LoopStatistics {\n\nprotected:\n  size_t m_iterations;\n  size_t m_pushes;\n  size_t m_conflicts;\n  const char* loopname;\n\npublic:\n  explicit LoopStatistics(const char* ln)\n      : m_iterations(0), m_pushes(0), m_conflicts(0), loopname(ln) {}\n\n  ~LoopStatistics() {\n    reportStat_Tsum(loopname, \"Iterations\", m_iterations);\n    reportStat_Tsum(loopname, \"Commits\", (m_iterations - m_conflicts));\n    reportStat_Tsum(loopname, \"Pushes\", m_pushes);\n    reportStat_Tsum(loopname, \"Conflicts\", m_conflicts);\n  }\n\n  size_t iterations(void) 
const { return m_iterations; }\n  size_t pushes(void) const { return m_pushes; }\n  size_t conflicts(void) const { return m_conflicts; }\n\n  inline void inc_pushes(size_t v = 1) { m_pushes += v; }\n\n  inline void inc_iterations() { ++m_iterations; }\n\n  inline void inc_conflicts() { ++m_conflicts; }\n};\n\ntemplate <>\nclass LoopStatistics<false> {\npublic:\n  explicit LoopStatistics(const char*) {}\n\n  size_t iterations(void) const { return 0; }\n  size_t pushes(void) const { return 0; }\n  size_t conflicts(void) const { return 0; }\n\n  inline void inc_iterations() const {}\n  inline void inc_pushes(size_t = 0) const {}\n  inline void inc_conflicts() const {}\n};\n\n} // namespace runtime\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Mem.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_MEM_H\n#define GALOIS_RUNTIME_MEM_H\n\n#include <cstddef>\n#include <cstdio>\n#include <cstdlib>\n#include <cstring>\n#include <list>\n#include <map>\n#include <memory>\n\n#include <boost/utility.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/runtime/PagePool.h\"\n#include \"galois/substrate/CacheLineStorage.h\"\n#include \"galois/substrate/NumaMem.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/PtrLock.h\"\n#include \"galois/substrate/SimpleLock.h\"\n\nnamespace galois {\nnamespace runtime {\n\nextern unsigned activeThreads;\n\n//! Memory management functionality.\n\nvoid preAlloc_impl(unsigned num);\n\n// const size_t hugePageSize = 2*1024*1024;\n\n//! 
Preallocate numpages large pages for each thread\nvoid pagePreAlloc(int numpages);\n//! Forces the given block to be paged into physical memory\nvoid pageIn(void* buf, size_t len, size_t stride);\n//! Forces the given readonly block to be paged into physical memory\nvoid pageInReadOnly(void* buf, size_t len, size_t stride);\n\n//! Returns total small pages allocated by OS on a NUMA node\nint numNumaAllocForNode(unsigned nodeid);\n\n//! Print lines from /proc/pid/numa_maps that contain at least n (non-huge)\n//! pages\nvoid printInterleavedStats(int minPages = 16 * 1024);\n\n//! [Example Third Party Allocator]\nclass MallocHeap {\npublic:\n  //! Supported allocation size in bytes. If 0, heap supports variable sized\n  //! allocations\n  enum { AllocSize = 0 };\n\n  void* allocate(size_t size) { return malloc(size); }\n\n  void deallocate(void* ptr) { free(ptr); }\n};\n//! [Example Third Party Allocator]\n\n//! Per-thread heaps using Galois thread aware construct\ntemplate <class SourceHeap>\nclass ThreadPrivateHeap {\n  substrate::PerThreadStorage<SourceHeap> heaps;\n\npublic:\n  enum { AllocSize = SourceHeap::AllocSize };\n\n  ThreadPrivateHeap() {}\n  ~ThreadPrivateHeap() { clear(); }\n\n  template <typename... Args>\n  inline void* allocate(size_t size, Args&&... args) {\n    return heaps.getLocal()->allocate(size, std::forward<Args>(args)...);\n  }\n\n  inline void deallocate(void* ptr) { heaps.getLocal()->deallocate(ptr); }\n\n  void clear() {\n    for (unsigned int i = 0; i < heaps.size(); i++)\n      heaps.getRemote(i)->clear();\n  }\n};\n\n//! 
Apply a lock to a heap\ntemplate <class SourceHeap>\nclass LockedHeap : public SourceHeap {\n  substrate::SimpleLock lock;\n\npublic:\n  enum { AllocSize = SourceHeap::AllocSize };\n\n  inline void* allocate(size_t size) {\n    lock.lock();\n    void* retval = SourceHeap::allocate(size);\n    lock.unlock();\n    return retval;\n  }\n\n  inline void deallocate(void* ptr) {\n    lock.lock();\n    SourceHeap::deallocate(ptr);\n    lock.unlock();\n  }\n};\n\ntemplate <typename SourceHeap>\nclass ZeroOut : public SourceHeap {\npublic:\n  enum { AllocSize = SourceHeap::AllocSize };\n\n  inline void* allocate(size_t size) {\n    void* retval = SourceHeap::allocate(size);\n    memset(retval, 0, size);\n    return retval;\n  }\n\n  inline void deallocate(void* ptr) { SourceHeap::deallocate(ptr); }\n};\n\n//! Add a header to objects\ntemplate <typename Header, typename SourceHeap>\nclass AddHeader : public SourceHeap {\n  enum {\n    offset = (sizeof(Header) + (sizeof(double) - 1)) & ~(sizeof(double) - 1)\n  };\n\npublic:\n  inline void* allocate(size_t size) {\n    // First increase the size of the header to be aligned to a double\n    void* ptr = SourceHeap::allocate(size + offset);\n    // Now return the offset pointer\n    return (char*)ptr + offset;\n  }\n\n  inline void deallocate(void* ptr) { SourceHeap::deallocate(getHeader(ptr)); }\n\n  inline static Header* getHeader(void* ptr) {\n    return (Header*)((char*)ptr - offset);\n  }\n};\n\n//! 
Allow looking up parent heap pointers\ntemplate <class SourceHeap>\nclass OwnerTaggedHeap : public AddHeader<void*, SourceHeap> {\n  typedef AddHeader<OwnerTaggedHeap*, SourceHeap> Src;\n\npublic:\n  inline void* allocate(size_t size) {\n    void* retval              = Src::allocate(size);\n    *(Src::getHeader(retval)) = this;\n    return retval;\n  }\n\n  inline void deallocate(void* ptr) {\n    assert(*(Src::getHeader(ptr)) == this);\n    Src::deallocate(ptr);\n  }\n\n  inline static OwnerTaggedHeap* owner(void* ptr) {\n    return *(OwnerTaggedHeap**)Src::getHeader(ptr);\n  }\n};\n\n//! Maintain a freelist\ntemplate <class SourceHeap>\nclass FreeListHeap : public SourceHeap {\n  struct FreeNode {\n    FreeNode* next;\n  };\n  FreeNode* head;\n\n  using dbg = galois::debug<0>;\n\npublic:\n  enum { AllocSize = SourceHeap::AllocSize };\n\n  void clear() {\n    while (head) {\n      FreeNode* N = head;\n      head        = N->next;\n      SourceHeap::deallocate(N);\n    }\n  }\n\n  FreeListHeap() : head(0) {}\n  ~FreeListHeap() { clear(); }\n\n  inline void* allocate(size_t size) {\n    if (head) {\n      void* ptr = head;\n      head      = head->next;\n      dbg::print(this, \" picking from free list, ptr = \", ptr);\n      return ptr;\n    } else {\n      void* ptr = SourceHeap::allocate(size);\n      dbg::print(this, \" allocating from SourceHeap, ptr = \", ptr);\n      return ptr;\n    }\n  }\n\n  inline void deallocate(void* ptr) {\n    if (!ptr)\n      return;\n    assert((uintptr_t)ptr > 0x100);\n    FreeNode* NH = (FreeNode*)ptr;\n    NH->next     = head;\n    head         = NH;\n    dbg::print(this, \" adding block to list, head = \", head);\n  }\n};\n\n//! 
Maintain a freelist using a lock which doesn't cover SourceHeap\ntemplate <class SourceHeap>\nclass SelfLockFreeListHeap : public SourceHeap {\n  struct FreeNode {\n    FreeNode* next;\n  };\n  FreeNode* head;\n\npublic:\n  enum { AllocSize = SourceHeap::AllocSize };\n\n  void clear() {\n    FreeNode* h = 0;\n    do {\n      h = head;\n    } while (!__sync_bool_compare_and_swap(&head, h, 0));\n    while (h) {\n      FreeNode* N = h;\n      h           = N->next;\n      SourceHeap::deallocate(N);\n    }\n  }\n\n  SelfLockFreeListHeap() : head(0) {}\n  ~SelfLockFreeListHeap() { clear(); }\n\n  inline void* allocate(size_t size) {\n    static substrate::SimpleLock lock;\n\n    lock.lock();\n    FreeNode* OH = 0;\n    FreeNode* NH = 0;\n    do {\n      OH = head;\n      if (!OH) {\n        lock.unlock();\n        return SourceHeap::allocate(size);\n      }\n      NH = OH->next; // The lock protects this line\n    } while (!__sync_bool_compare_and_swap(&head, OH, NH));\n    lock.unlock();\n    assert(OH);\n    return (void*)OH;\n  }\n\n  inline void deallocate(void* ptr) {\n    if (!ptr)\n      return;\n    FreeNode* OH;\n    FreeNode* NH;\n    do {\n      OH       = head;\n      NH       = (FreeNode*)ptr;\n      NH->next = OH;\n    } while (!__sync_bool_compare_and_swap(&head, OH, NH));\n  }\n};\n\ntemplate <unsigned ElemSize, typename SourceHeap>\nclass BlockHeap : public SourceHeap {\n  struct TyEq {\n    double data[((ElemSize + sizeof(double) - 1) & ~(sizeof(double) - 1)) /\n                sizeof(double)];\n  };\n\n  struct Block_basic {\n    union {\n      Block_basic* next;\n      double dummy;\n    };\n    TyEq data[1];\n  };\n\n  enum {\n    BytesLeft  = (SourceHeap::AllocSize - sizeof(Block_basic)),\n    BytesLeftR = BytesLeft & ~(sizeof(double) - 1),\n    FitLeft    = BytesLeftR / sizeof(TyEq[1]),\n    TotalFit   = FitLeft + 1\n  };\n\n  struct Block {\n    union {\n      Block* next;\n      double dummy;\n    };\n    TyEq data[TotalFit];\n  };\n\n  Block* 
head;\n  int headIndex;\n\n  void refill() {\n    void* P   = SourceHeap::allocate(SourceHeap::AllocSize);\n    Block* BP = (Block*)P;\n    BP->next  = head;\n    head      = BP;\n    headIndex = 0;\n  }\n\npublic:\n  enum { AllocSize = ElemSize };\n\n  void clear() {\n    while (head) {\n      Block* B = head;\n      head     = B->next;\n      SourceHeap::deallocate(B);\n    }\n  }\n\n  BlockHeap() : SourceHeap(), head(0), headIndex(0) {\n    static_assert(sizeof(Block) <= SourceHeap::AllocSize, \"\");\n  }\n\n  ~BlockHeap() { clear(); }\n\n  inline void* allocate(size_t GALOIS_USED_ONLY_IN_DEBUG(size)) {\n    assert(size == ElemSize);\n    if (!head || headIndex == TotalFit)\n      refill();\n    return &head->data[headIndex++];\n  }\n\n  inline void deallocate(void*) {}\n};\n\n//! This implements a bump pointer though chunks of memory\ntemplate <typename SourceHeap>\nclass BumpHeap : public SourceHeap {\n  struct Block {\n    union {\n      Block* next;\n      double dummy; // for alignment\n    };\n  };\n\n  Block* head;\n  int offset;\n\n  void refill() {\n    void* P   = SourceHeap::allocate(SourceHeap::AllocSize);\n    Block* BP = (Block*)P;\n    BP->next  = head;\n    head      = BP;\n    offset    = sizeof(Block);\n  }\n\npublic:\n  enum { AllocSize = 0 };\n\n  BumpHeap() : SourceHeap(), head(0), offset(0) {}\n\n  ~BumpHeap() { clear(); }\n\n  void clear() {\n    while (head) {\n      Block* B = head;\n      head     = B->next;\n      SourceHeap::deallocate(B);\n    }\n  }\n\n  inline void* allocate(size_t size) {\n    // Increase to alignment\n    size_t alignedSize = (size + sizeof(double) - 1) & ~(sizeof(double) - 1);\n    // Check current block\n    if (!head || offset + alignedSize > SourceHeap::AllocSize) {\n      refill();\n    }\n    if (offset + alignedSize > SourceHeap::AllocSize) {\n      std::abort(); // TODO: remove\n      throw std::bad_alloc();\n    }\n    char* retval = (char*)head;\n    retval += offset;\n    offset += alignedSize;\n    
return retval;\n  }\n\n  /**\n   * Allocates size bytes but may fail. If so, allocated < size and\n   * allocated is the number of bytes allocated in the returned buffer.\n   */\n  inline void* allocate(size_t size, size_t& allocated) {\n    // Increase to alignment\n    size_t alignedSize = (size + sizeof(double) - 1) & ~(sizeof(double) - 1);\n    if (alignedSize > SourceHeap::AllocSize) {\n      alignedSize = SourceHeap::AllocSize;\n    }\n    // Check current block\n    if (!head || offset + alignedSize > SourceHeap::AllocSize) {\n      size_t remaining = SourceHeap::AllocSize - offset;\n      assert((remaining & (sizeof(double) - 1)) ==\n             0); // should still be aligned\n      if (!remaining) {\n        refill();\n      } else {\n        alignedSize = remaining;\n      }\n    }\n    char* retval = (char*)head;\n    retval += offset;\n    offset += alignedSize;\n    allocated = (alignedSize > size) ? size : alignedSize;\n    return retval;\n  }\n\n  inline void deallocate(void*) {}\n};\n\n/**\n * This implements a bump pointer through chunks of memory that falls back\n * to malloc if the source heap cannot accommodate an allocation.\n */\ntemplate <typename SourceHeap>\nclass BumpWithMallocHeap : public SourceHeap {\n  struct Block {\n    union {\n      Block* next;\n      double dummy; // for alignment\n    };\n  };\n\n  Block* head;\n  Block* fallbackHead;\n  int offset;\n\n  //! 
Given block of memory P, update head pointer and offset metadata\n  void refill(void* P, Block*& h, int* o) {\n    Block* BP = (Block*)P;\n    BP->next  = h;\n    h         = BP;\n    if (o)\n      *o = sizeof(Block);\n  }\n\npublic:\n  enum { AllocSize = 0 };\n\n  BumpWithMallocHeap() : SourceHeap(), head(0), fallbackHead(0), offset(0) {}\n\n  ~BumpWithMallocHeap() { clear(); }\n\n  void clear() {\n    while (head) {\n      Block* B = head;\n      head     = B->next;\n      SourceHeap::deallocate(B);\n    }\n    while (fallbackHead) {\n      Block* B     = fallbackHead;\n      fallbackHead = B->next;\n      free(B);\n    }\n  }\n\n  inline void* allocate(size_t size) {\n    // Increase to alignment\n    size_t alignedSize = (size + sizeof(double) - 1) & ~(sizeof(double) - 1);\n    if (sizeof(Block) + alignedSize > SourceHeap::AllocSize) {\n      void* p = malloc(alignedSize + sizeof(Block));\n      refill(p, fallbackHead, NULL);\n      return (char*)p + sizeof(Block);\n    }\n    // Check current block\n    if (!head || offset + alignedSize > SourceHeap::AllocSize)\n      refill(SourceHeap::allocate(SourceHeap::AllocSize), head, &offset);\n    char* retval = (char*)head;\n    retval += offset;\n    offset += alignedSize;\n    return retval;\n  }\n\n  inline void deallocate(void*) {}\n};\n\n//! This is the base source of memory for all allocators.\n//! 
It maintains a freelist of chunks acquired from the system\nclass SystemHeap {\npublic:\n  // FIXME: actually check!\n  enum { AllocSize = 2 * 1024 * 1024 };\n\n  SystemHeap();\n  ~SystemHeap();\n\n  inline void* allocate(size_t) { return pagePoolAlloc(); }\n\n  inline void deallocate(void* ptr) { pagePoolFree(ptr); }\n};\n\ntemplate <typename Derived>\nclass StaticSingleInstance : private boost::noncopyable {\n\n  // static std::unique_ptr<Derived> instance;\n  static substrate::PtrLock<Derived> ptr;\n\npublic:\n  static Derived* getInstance(void) {\n    Derived* f = ptr.getValue();\n    if (f) {\n      // assert (f == instance.get ());\n      return f;\n    }\n\n    ptr.lock();\n    f = ptr.getValue();\n    if (f) {\n      ptr.unlock();\n      // assert (f == instance.get ());\n    } else {\n      // instance = std::unique_ptr<Derived> (new Derived());\n      // f = instance.get ();\n      f = new Derived;\n      ptr.unlock_and_set(f);\n    }\n    return f;\n  }\n};\n\n// template <typename Derived>\n// std::unique_ptr<Derived> StaticSingleInstance<Derived>::instance =\n// std::unique_ptr<Derived>();\n\ntemplate <typename Derived>\nsubstrate::PtrLock<Derived>\n    StaticSingleInstance<Derived>::ptr = substrate::PtrLock<Derived>();\n\nclass PageHeap : public StaticSingleInstance<PageHeap> {\n\n  using Base = StaticSingleInstance<PageHeap>;\n\n  /* template <typename _U> */ friend class StaticSingleInstance<PageHeap>;\n\n  using InnerHeap = ThreadPrivateHeap<FreeListHeap<SystemHeap>>;\n  // using InnerHeap = SystemHeap;\n\n  InnerHeap innerHeap;\n\n  using dbg = galois::debug<0>;\n\n  PageHeap() : innerHeap() { dbg::print(\"New instance of PageHeap: \", this); }\n\npublic:\n  enum { AllocSize = InnerHeap::AllocSize };\n\n  inline void* allocate(size_t size) {\n    assert(size <= AllocSize);\n    void* ptr = innerHeap.allocate(size);\n    dbg::print(this, \" PageHeap allocate, ptr = \", ptr);\n    return ptr;\n  }\n\n  inline void deallocate(void* ptr) {\n    
assert(ptr);\n    dbg::print(this, \" PageHeap  deallocate ptr = \", ptr);\n    innerHeap.deallocate(ptr);\n  }\n};\n\n#ifdef GALOIS_FORCE_STANDALONE\nclass SizedHeapFactory : private boost::noncopyable {\npublic:\n  typedef MallocHeap SizedHeap;\n\n  static SizedHeap* getHeapForSize(const size_t) { return &alloc; }\n\nprivate:\n  static SizedHeap alloc;\n};\n#else\nclass SizedHeapFactory : public StaticSingleInstance<SizedHeapFactory> {\n  using Base = StaticSingleInstance<SizedHeapFactory>;\n  /* template <typename> */ friend class StaticSingleInstance<SizedHeapFactory>;\n\npublic:\n  //! [FixedSizeAllocator example]\n  typedef ThreadPrivateHeap<FreeListHeap<BumpHeap<SystemHeap>>> SizedHeap;\n  //! [FixedSizeAllocator example]\n\n  static SizedHeap* getHeapForSize(const size_t);\n\nprivate:\n  typedef std::map<size_t, SizedHeap*> HeapMap;\n  static thread_local HeapMap* localHeaps;\n  HeapMap heaps;\n  std::list<HeapMap*> allLocalHeaps;\n  substrate::SimpleLock lock;\n\n  SizedHeapFactory();\n\n  SizedHeap* getHeap(size_t);\n\npublic:\n  ~SizedHeapFactory();\n};\n#endif\n\n/**\n * Scalable variable-size allocations.\n *\n * Slight misnomer as this doesn't support allocations greater than a page.\n * Users should call allocate multiple times to split\n * large allocations over multiple pages.\n */\nstruct VariableSizeHeap : public ThreadPrivateHeap<BumpHeap<SystemHeap>> {\n  enum { AllocSize = 0 };\n};\n\n//! 
Main scalable allocator in Galois\nclass FixedSizeHeap {\n  SizedHeapFactory::SizedHeap* heap;\n\npublic:\n  FixedSizeHeap(size_t size) {\n    heap = SizedHeapFactory::getHeapForSize(size);\n    if (!heap && size != 0) {\n      fprintf(stderr, \"ERROR: Cannot init a fixed sized heap from \"\n                      \"SizedHeapFactory\\n\");\n      throw std::bad_alloc();\n    }\n  }\n\n  inline void* allocate(size_t size) {\n    void* alloc = heap->allocate(size);\n    if (alloc == nullptr && size != 0) {\n      fprintf(stderr, \"ERROR: Fixed sized heap allocate called failed\\n\");\n      throw std::bad_alloc();\n    }\n    return alloc;\n  }\n\n  inline void deallocate(void* ptr) { heap->deallocate(ptr); }\n\n  inline bool operator!=(const FixedSizeHeap& rhs) const {\n    return heap != rhs.heap;\n  }\n\n  inline bool operator==(const FixedSizeHeap& rhs) const {\n    return heap == rhs.heap;\n  }\n};\n\nclass SerialNumaHeap {\n  enum {\n    offset = (sizeof(substrate::LAptr) + (sizeof(double) - 1)) &\n             ~(sizeof(double) - 1)\n  };\n\npublic:\n  enum { AllocSize = 0 };\n\n  void* allocate(size_t size) {\n    auto ptr = substrate::largeMallocInterleaved(size + offset, activeThreads);\n    substrate::LAptr* header =\n        new ((char*)ptr.get()) substrate::LAptr{std::move(ptr)};\n    return (char*)(header->get()) + offset;\n  }\n\n  void deallocate(void* ptr) {\n    char* realPtr = ((char*)ptr - offset);\n    substrate::LAptr dptr{std::move(*(substrate::LAptr*)realPtr)};\n  }\n};\n\n////////////////////////////////////////////////////////////////////////////////\n// Now adapt to standard std allocators\n////////////////////////////////////////////////////////////////////////////////\n\n//! 
A fixed size block allocator\ntemplate <typename Ty>\nclass FixedSizeAllocator;\n\ntemplate <>\nclass FixedSizeAllocator<void> {\npublic:\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef void* pointer;\n  typedef const void* const_pointer;\n  typedef void value_type;\n\n  template <typename Other>\n  struct rebind {\n    typedef FixedSizeAllocator<Other> other;\n  };\n};\n\ntemplate <typename Ty>\nclass FixedSizeAllocator {\n  inline void destruct(char*) const {}\n  inline void destruct(wchar_t*) const {}\n  template <typename T>\n  inline void destruct(T* t) const {\n    t->~T();\n  }\n\n  FixedSizeHeap heap;\n\npublic:\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef Ty* pointer;\n  typedef const Ty* const_pointer;\n  typedef Ty& reference;\n  typedef const Ty& const_reference;\n  typedef Ty value_type;\n\n  template <class Other>\n  struct rebind {\n    typedef FixedSizeAllocator<Other> other;\n  };\n\n  FixedSizeAllocator() noexcept : heap(sizeof(Ty)) {}\n\n  template <class U>\n  FixedSizeAllocator(const FixedSizeAllocator<U>&) noexcept\n      : heap(sizeof(Ty)) {}\n\n  inline pointer address(reference val) const { return &val; }\n  inline const_pointer address(const_reference val) const { return &val; }\n\n  pointer allocate(size_type size) {\n    if (size > max_size())\n      throw std::bad_alloc();\n    return static_cast<pointer>(heap.allocate(sizeof(Ty)));\n  }\n\n  void deallocate(pointer ptr, size_type GALOIS_USED_ONLY_IN_DEBUG(len)) {\n    assert(len == 1);\n    heap.deallocate(ptr);\n  }\n\n  template <class U, class... Args>\n  inline void construct(U* p, Args&&... 
args) const {\n    ::new ((void*)p) U(std::forward<Args>(args)...);\n  }\n\n  inline void destroy(pointer ptr) const { destruct(ptr); }\n\n  size_type max_size() const noexcept { return 1; }\n\n  template <typename T1>\n  inline bool operator!=(const FixedSizeAllocator<T1>& rhs) const {\n    return heap != rhs.heap;\n  }\n\n  template <typename T1>\n  inline bool operator==(const FixedSizeAllocator<T1>& rhs) const {\n    return heap == rhs.heap;\n  }\n};\n\nclass Pow_2_BlockHeap : public StaticSingleInstance<Pow_2_BlockHeap> {\n\nprivate:\n  using Base = StaticSingleInstance<Pow_2_BlockHeap>;\n  /* template <typename> */ friend class StaticSingleInstance<Pow_2_BlockHeap>;\n\n  static const bool USE_MALLOC_AS_BACKUP = true;\n\n  static const size_t LOG2_MIN_SIZE = 3;  // 2^3 == 8 bytes\n  static const size_t LOG2_MAX_SIZE = 16; // 64k\n\n  typedef FixedSizeHeap Heap_ty;\n\n  std::vector<Heap_ty> heapTable;\n\n  static inline size_t pow2(unsigned i) { return (1U << i); }\n\n  static unsigned nextLog2(const size_t allocSize) {\n\n    unsigned i = LOG2_MIN_SIZE;\n\n    while (pow2(i) < allocSize) {\n      ++i;\n    }\n\n    // if (pow2 (i) > pow2 (LOG2_MAX_SIZE)) {\n    // std::fprintf (stderr, \"ERROR: block bigger than huge page size\n    // requested\\n\"); throw std::bad_alloc();\n    // }\n\n    return i;\n  }\n\n  void populateTable(void) {\n    assert(heapTable.empty());\n\n    heapTable.clear();\n    for (unsigned i = 0; i <= LOG2_MAX_SIZE; ++i) {\n      heapTable.push_back(Heap_ty(pow2(i)));\n    }\n  }\n\n  Pow_2_BlockHeap() noexcept; // NOLINT(modernize-use-equals-delete)\n\npublic:\n  void* allocateBlock(const size_t allocSize) {\n\n    if (allocSize > pow2(LOG2_MAX_SIZE)) {\n      if (USE_MALLOC_AS_BACKUP) {\n        return malloc(allocSize);\n      } else {\n        fprintf(stderr, \"ERROR: block bigger than huge page size requested\\n\");\n        throw std::bad_alloc();\n      }\n    } else {\n\n      unsigned i = nextLog2(allocSize);\n      assert(i < 
heapTable.size());\n      return heapTable[i].allocate(pow2(i));\n    }\n  }\n\n  void deallocateBlock(void* ptr, const size_t allocSize) {\n    if (allocSize > pow2(LOG2_MAX_SIZE)) {\n      if (USE_MALLOC_AS_BACKUP) {\n        free(ptr);\n      } else {\n        fprintf(stderr, \"ERROR: block bigger than huge page size requested\\n\");\n        throw std::bad_alloc();\n      }\n    } else {\n      unsigned i = nextLog2(allocSize);\n      assert(i < heapTable.size());\n      heapTable[i].deallocate(ptr);\n    }\n  }\n};\n\ntemplate <typename Ty>\nclass Pow_2_BlockAllocator {\n\n  template <typename T>\n  static inline void destruct(T* t) {\n    if (!std::is_scalar<T>::value) {\n      t->~T();\n    }\n  }\n\npublic:\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef Ty* pointer;\n  typedef const Ty* const_pointer;\n  typedef Ty& reference;\n  typedef const Ty& const_reference;\n  typedef Ty value_type;\n\n  template <class Other>\n  struct rebind {\n    typedef Pow_2_BlockAllocator<Other> other;\n  };\n\n  Pow_2_BlockHeap* heap;\n\n  Pow_2_BlockAllocator() noexcept : heap(Pow_2_BlockHeap::getInstance()) {}\n\n  // template <typename U>\n  // friend class Pow_2_BlockAllocator<U>;\n\n  template <typename U>\n  Pow_2_BlockAllocator(const Pow_2_BlockAllocator<U>& that) noexcept\n      : heap(that.heap) {}\n\n  inline pointer address(reference val) const { return &val; }\n\n  inline const_pointer address(const_reference val) const { return &val; }\n\n  pointer allocate(size_type size) {\n    return static_cast<pointer>(heap->allocateBlock(size * sizeof(Ty)));\n  }\n\n  void deallocate(pointer ptr, size_type len) {\n    heap->deallocateBlock(ptr, len * sizeof(Ty));\n  }\n\n  template <class U, class... Args>\n  inline void construct(U* p, Args&&... 
args) const {\n    ::new ((void*)p) U(std::forward<Args>(args)...);\n  }\n\n  inline void destroy(pointer ptr) const { destruct(ptr); }\n\n  size_type max_size() const noexcept { return size_type(-1); }\n\n  template <typename T1>\n  bool operator!=(const Pow_2_BlockAllocator<T1>& rhs) const {\n    return heap != rhs.heap;\n  }\n\n  template <typename T1>\n  bool operator==(const Pow_2_BlockAllocator<T1>& rhs) const {\n    return heap == rhs.heap;\n  }\n};\n\ntemplate <>\nclass Pow_2_BlockAllocator<void> {\npublic:\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef void* pointer;\n  typedef const void* const_pointer;\n  typedef void value_type;\n\n  template <typename Other>\n  struct rebind {\n    typedef Pow_2_BlockAllocator<Other> other;\n  };\n};\n\n//! Keep a reference to an external allocator\ntemplate <typename Ty, typename HeapTy>\nclass ExternalHeapAllocator;\n\ntemplate <typename HeapTy>\nclass ExternalHeapAllocator<void, HeapTy> {\npublic:\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef void* pointer;\n  typedef const void* const_pointer;\n  typedef void value_type;\n\n  template <typename Other>\n  struct rebind {\n    typedef ExternalHeapAllocator<Other, HeapTy> other;\n  };\n};\n\ntemplate <typename Ty, typename HeapTy>\nclass ExternalHeapAllocator {\n  inline void destruct(char*) const {}\n  inline void destruct(wchar_t*) const {}\n  template <typename T>\n  inline void destruct(T* t) const {\n    t->~T();\n  }\n\npublic:\n  HeapTy* heap; // Should be private except that makes copy hard\n\n  typedef size_t size_type;\n  typedef ptrdiff_t difference_type;\n  typedef Ty* pointer;\n  typedef const Ty* const_pointer;\n  typedef Ty& reference;\n  typedef const Ty& const_reference;\n  typedef Ty value_type;\n\n  template <class Other>\n  struct rebind {\n    typedef ExternalHeapAllocator<Other, HeapTy> other;\n  };\n\n  explicit ExternalHeapAllocator(HeapTy* a) noexcept : heap(a) {}\n\n  template 
<class T1>\n  ExternalHeapAllocator(const ExternalHeapAllocator<T1, HeapTy>& rhs) noexcept {\n    heap = rhs.heap;\n  }\n\n  inline pointer address(reference val) const { return &val; }\n\n  inline const_pointer address(const_reference val) const { return &val; }\n\n  pointer allocate(size_type size) {\n    if (size > max_size())\n      throw std::bad_alloc();\n    return static_cast<pointer>(heap->allocate(size * sizeof(Ty)));\n  }\n\n  void deallocate(pointer ptr, size_type) { heap->deallocate(ptr); }\n\n  inline void construct(pointer ptr, const_reference val) const {\n    new (ptr) Ty(val);\n  }\n\n  template <class U, class... Args>\n  inline void construct(U* p, Args&&... args) const {\n    ::new ((void*)p) U(std::forward<Args>(args)...);\n  }\n\n  void destroy(pointer ptr) const { destruct(ptr); }\n\n  size_type max_size() const noexcept {\n    return (HeapTy::AllocSize == 0) ? size_t(-1) / sizeof(Ty)\n                                    : HeapTy::AllocSize / sizeof(Ty);\n  }\n\n  template <typename T1, typename A1>\n  bool operator!=(const ExternalHeapAllocator<T1, A1>& rhs) const {\n    return heap != rhs.heap;\n  }\n\n  template <typename T1, typename A1>\n  bool operator==(const ExternalHeapAllocator<T1, A1>& rhs) const {\n    return heap == rhs.heap;\n  }\n};\n\ntemplate <typename T>\nclass SerialNumaAllocator : public ExternalHeapAllocator<T, SerialNumaHeap> {\n  using Super = ExternalHeapAllocator<T, SerialNumaHeap>;\n  SerialNumaHeap heap;\n\npublic:\n  template <class Other>\n  struct rebind {\n    typedef SerialNumaAllocator<Other> other;\n  };\n\n  SerialNumaAllocator() : Super(&heap) {}\n};\n\n} // end namespace runtime\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/OperatorReferenceTypes.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_OPERATOR_REFERENCE_TYPES_H\n#define GALOIS_RUNTIME_OPERATOR_REFERENCE_TYPES_H\n\n#include \"galois/config.h\"\n\nnamespace galois {\nnamespace runtime {\n\nnamespace internal {\n\n// Helper template for getting the appropriate type of\n// reference to hold within each executor based off of the\n// type of reference that was passed to it.\n\n// Don't accept operators by value.\ntemplate <typename FuncTy>\nstruct OperatorReferenceType_impl;\n\n// Const references are propagated.\n// If a user supplies a const reference the operator() on the\n// given object must be callable with *this passed as const as well.\ntemplate <typename FuncNoRef>\nstruct OperatorReferenceType_impl<FuncNoRef const&> {\n  using type = FuncNoRef const&;\n};\n\n// Non-const references continue to be non-const.\ntemplate 
<typename FuncNoRef>\nstruct OperatorReferenceType_impl<FuncNoRef&> {\n  using type = FuncNoRef&;\n};\n\n// Inside each executor store a reference to a received rvalue reference\n// and then use that to pass to the various threads. This must be done in\n// a way that keeps the rvalue reference alive throughout the duration of\n// the parallel loop (as long as the resulting lvalue reference is used\n// anywhere).\ntemplate <typename FuncNoRef>\nstruct OperatorReferenceType_impl<FuncNoRef&&> {\n  using type = FuncNoRef&;\n};\n\n} // namespace internal\n\ntemplate <typename T>\nusing OperatorReferenceType =\n    typename internal::OperatorReferenceType_impl<T>::type;\n\n} // namespace runtime\n} // namespace galois\n\n#endif // ifndef(GALOIS_RUNTIME_OPERATOR_REFERENCE_TYPES_H)\n"
  },
  {
    "path": "libgalois/include/galois/runtime/PagePool.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_PAGEPOOL_H\n#define GALOIS_RUNTIME_PAGEPOOL_H\n\n#include <cstddef>\n#include <deque>\n#include <mutex>\n#include <numeric>\n#include <unordered_map>\n#include <vector>\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/substrate/CacheLineStorage.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/substrate/PageAlloc.h\"\n#include \"galois/substrate/PtrLock.h\"\n#include \"galois/substrate/ThreadPool.h\"\n\nnamespace galois {\nnamespace runtime {\n\n//! Low level page pool (individual pages, use largeMalloc for large blocks)\n\nvoid* pagePoolAlloc();\nvoid pagePoolFree(void*);\nvoid pagePoolPreAlloc(unsigned);\n\n// Size of returned pages\nsize_t pagePoolSize();\n\n//! 
Returns total large pages allocated by Galois memory management subsystem\nint numPagePoolAllocTotal();\n//! Returns total large pages allocated for thread by Galois memory management\n//! subsystem\nint numPagePoolAllocForThread(unsigned tid);\n\nnamespace internal {\n\nstruct FreeNode {\n  FreeNode* next;\n};\n\ntypedef galois::substrate::PtrLock<FreeNode> HeadPtr;\ntypedef galois::substrate::CacheLineStorage<HeadPtr> HeadPtrStorage;\n\n// Tracks pages allocated\ntemplate <typename _UNUSED = void>\nclass PageAllocState {\n  std::deque<std::atomic<int>> counts;\n  std::vector<HeadPtrStorage> pool;\n  std::unordered_map<void*, int> ownerMap;\n  galois::substrate::SimpleLock mapLock;\n\n  void* allocFromOS() {\n    void* ptr = galois::substrate::allocPages(1, true);\n    assert(ptr);\n    auto tid = galois::substrate::ThreadPool::getTID();\n    counts[tid] += 1;\n    std::lock_guard<galois::substrate::SimpleLock> lg(mapLock);\n    ownerMap[ptr] = tid;\n    return ptr;\n  }\n\npublic:\n  PageAllocState() {\n    auto num = galois::substrate::getThreadPool().getMaxThreads();\n    counts.resize(num);\n    pool.resize(num);\n  }\n\n  int count(int tid) const { return counts[tid]; }\n\n  int countAll() const {\n    return std::accumulate(counts.begin(), counts.end(), 0);\n  }\n\n  void* pageAlloc() {\n    auto tid    = galois::substrate::ThreadPool::getTID();\n    HeadPtr& hp = pool[tid].data;\n    if (hp.getValue()) {\n      hp.lock();\n      FreeNode* h = hp.getValue();\n      if (h) {\n        hp.unlock_and_set(h->next);\n        return h;\n      }\n      hp.unlock();\n    }\n    return allocFromOS();\n  }\n\n  void pageFree(void* ptr) {\n    assert(ptr);\n    mapLock.lock();\n    assert(ownerMap.count(ptr));\n    int i = ownerMap[ptr];\n    mapLock.unlock();\n    HeadPtr& hp = pool[i].data;\n    hp.lock();\n    FreeNode* nh = reinterpret_cast<FreeNode*>(ptr);\n    nh->next     = hp.getValue();\n    hp.unlock_and_set(nh);\n  }\n\n  void pagePreAlloc() { 
pageFree(allocFromOS()); }\n};\n\n//! Initialize PagePool, used by runtime::init();\nvoid setPagePoolState(PageAllocState<>* pa);\n\n} // end namespace internal\n\n} // end namespace runtime\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Profile.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_PROFILE_H\n#define GALOIS_RUNTIME_PROFILE_H\n\n#include <cstdlib>\n\n#ifdef GALOIS_ENABLE_VTUNE\n#include \"ittnotify.h\"\n#endif\n\n#ifdef GALOIS_ENABLE_PAPI\nextern \"C\" {\n#include <papi.h>\n#include <papiStdEventDefs.h>\n}\n#endif\n\n#include \"galois/config.h\"\n#include \"galois/Galois.h\"\n#include \"galois/gIO.h\"\n#include \"galois/Timer.h\"\n\nnamespace galois::runtime {\n\n#ifdef GALOIS_ENABLE_VTUNE\n\ntemplate <typename F>\nvoid profileVtune(const F& func, const char* region) {\n\n  region = region ? 
region : \"(NULL)\";\n\n  GALOIS_ASSERT(\n      galois::substrate::ThreadPool::getTID() == 0,\n      \"profileVtune can only be called from master thread (thread 0)\");\n\n  __itt_resume();\n\n  timeThis(func, region);\n\n  __itt_pause();\n}\n\n#else\n\ntemplate <typename F>\nvoid profileVtune(const F& func, const char* region) {\n\n  region = region ? region : \"(NULL)\";\n  galois::gWarn(\"Vtune not enabled or found\");\n\n  timeThis(func, region);\n}\n\n#endif\n\n#ifdef GALOIS_ENABLE_PAPI\n\nnamespace internal {\n\nunsigned long papiGetTID(void);\n\ntemplate <typename __T = void>\nvoid papiInit() {\n\n  /* Initialize the PAPI library */\n  int retval = PAPI_library_init(PAPI_VER_CURRENT);\n\n  if (retval != PAPI_VER_CURRENT && retval > 0) {\n    GALOIS_DIE(\"PAPI library version mismatch: \", retval,\n               \" != \", PAPI_VER_CURRENT);\n  }\n\n  if (retval < 0) {\n    GALOIS_DIE(\"initialization error!\");\n  }\n\n  if ((retval = PAPI_thread_init(&papiGetTID)) != PAPI_OK) {\n    GALOIS_DIE(\"PAPI thread init failed\");\n  }\n}\n\ntemplate <typename V1, typename V2>\nvoid decodePapiEvents(const V1& eventNames, V2& papiEvents) {\n  for (size_t i = 0; i < eventNames.size(); ++i) {\n    char buf[256];\n    std::strcpy(buf, eventNames[i].c_str());\n    if (PAPI_event_name_to_code(buf, &papiEvents[i]) != PAPI_OK) {\n      GALOIS_DIE(\"failed to recognize eventName = \", eventNames[i],\n                 \", event code: \", papiEvents[i]);\n    }\n  }\n}\n\ntemplate <typename V1, typename V2, typename V3>\nvoid papiStart(V1& eventSets, V2& papiResults, V3& papiEvents) {\n  galois::on_each([&](const unsigned tid, const unsigned numT) {\n    if (PAPI_register_thread() != PAPI_OK) {\n      GALOIS_DIE(\"failed to register thread with PAPI\");\n    }\n\n    int& eventSet = *eventSets.getLocal();\n\n    eventSet = PAPI_NULL;\n    papiResults.getLocal()->resize(papiEvents.size());\n\n    if (PAPI_create_eventset(&eventSet) != PAPI_OK) {\n      GALOIS_DIE(\"failed to 
init event set\");\n    }\n    if (PAPI_add_events(eventSet, papiEvents.data(), papiEvents.size()) !=\n        PAPI_OK) {\n      GALOIS_DIE(\"failed to add events\");\n    }\n\n    if (PAPI_start(eventSet) != PAPI_OK) {\n      GALOIS_DIE(\"failed to start PAPI\");\n    }\n  });\n}\n\ntemplate <typename V1, typename V2, typename V3>\nvoid papiStop(V1& eventSets, V2& papiResults, V3& eventNames,\n              const char* region) {\n  galois::on_each([&](const unsigned tid, const unsigned numT) {\n    int& eventSet = *eventSets.getLocal();\n\n    if (PAPI_stop(eventSet, papiResults.getLocal()->data()) != PAPI_OK) {\n      GALOIS_DIE(\"PAPI_stop failed\");\n    }\n\n    if (PAPI_cleanup_eventset(eventSet) != PAPI_OK) {\n      GALOIS_DIE(\"PAPI_cleanup_eventset failed\");\n    }\n\n    if (PAPI_destroy_eventset(&eventSet) != PAPI_OK) {\n      GALOIS_DIE(\"PAPI_destroy_eventset failed\");\n    }\n\n    assert(eventNames.size() == papiResults.getLocal()->size() &&\n           \"Both vectors should be of equal length\");\n    for (size_t i = 0; i < eventNames.size(); ++i) {\n      galois::runtime::reportStat_Tsum(region, eventNames[i],\n                                       (*papiResults.getLocal())[i]);\n    }\n\n    if (PAPI_unregister_thread() != PAPI_OK) {\n      GALOIS_DIE(\"failed to un-register thread with PAPI\");\n    }\n  });\n}\n\ntemplate <typename C>\nvoid splitCSVstr(const std::string& inputStr, C& output,\n                 const char delim = ',') {\n  std::stringstream ss(inputStr);\n\n  for (std::string item; std::getline(ss, item, delim);) {\n    output.push_back(item);\n  }\n}\n\n} // end namespace internal\n\ntemplate <typename F>\nvoid profilePapi(const F& func, const char* region) {\n\n  const char* const PAPI_VAR_NAME = \"GALOIS_PAPI_EVENTS\";\n  region                          = region ? 
region : \"(NULL)\";\n\n  std::string eventNamesCSV;\n\n  if (!galois::substrate::EnvCheck(PAPI_VAR_NAME, eventNamesCSV) ||\n      eventNamesCSV.empty()) {\n    galois::gWarn(\n        \"No Events specified. Set environment variable GALOIS_PAPI_EVENTS\");\n    galois::timeThis(func, region);\n    return;\n  }\n\n  internal::papiInit();\n\n  std::vector<std::string> eventNames;\n\n  internal::splitCSVstr(eventNamesCSV, eventNames);\n\n  std::vector<int> papiEvents(eventNames.size());\n\n  internal::decodePapiEvents(eventNames, papiEvents);\n\n  galois::substrate::PerThreadStorage<int> eventSets;\n  galois::substrate::PerThreadStorage<std::vector<long_long>> papiResults;\n\n  internal::papiStart(eventSets, papiResults, papiEvents);\n\n  galois::timeThis(func, region);\n\n  internal::papiStop(eventSets, papiResults, eventNames, region);\n}\n\n#else\n\ntemplate <typename F>\nvoid profilePapi(const F& func, const char* region) {\n\n  region = region ? region : \"(NULL)\";\n  galois::gWarn(\"PAPI not enabled or found\");\n\n  timeThis(func, region);\n}\n\n#endif\n\n} // namespace galois::runtime\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Range.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_RANGE_H\n#define GALOIS_RUNTIME_RANGE_H\n\n#include <iterator>\n\n#include <boost/iterator/counting_iterator.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/gstl.h\"\n#include \"galois/substrate/ThreadPool.h\"\n\nnamespace galois {\nnamespace runtime {\n\nextern unsigned int activeThreads;\n\n// TODO(ddn): update to have better forward iterator behavor for blocked/local\n// iteration\n\ntemplate <typename T>\nclass LocalRange {\n  T* container;\n\npublic:\n  typedef T container_type;\n  typedef typename T::iterator iterator;\n  typedef typename T::local_iterator local_iterator;\n  typedef iterator block_iterator;\n  typedef typename std::iterator_traits<iterator>::value_type value_type;\n\n  LocalRange(T& c) : container(&c) {}\n\n  iterator begin() const { return container->begin(); }\n  
iterator end() const { return container->end(); }\n\n  // TODO fix constness of local containers\n  /* const */ T& get_container() const { return *container; }\n\n  std::pair<block_iterator, block_iterator> block_pair() const {\n    return galois::block_range(begin(), end(), substrate::ThreadPool::getTID(),\n                               activeThreads);\n  }\n\n  std::pair<local_iterator, local_iterator> local_pair() const {\n    return std::make_pair(container->local_begin(), container->local_end());\n  }\n\n  local_iterator local_begin() const { return container->local_begin(); }\n  local_iterator local_end() const { return container->local_end(); }\n\n  block_iterator block_begin() const { return block_pair().first; }\n  block_iterator block_end() const { return block_pair().second; }\n};\n\ntemplate <typename T>\ninline LocalRange<T> makeLocalRange(T& obj) {\n  return LocalRange<T>(obj);\n}\n\ntemplate <typename IterTy>\nclass StandardRange {\n  IterTy ii, ei;\n\npublic:\n  typedef IterTy iterator;\n  typedef iterator local_iterator;\n  typedef iterator block_iterator;\n\n  typedef typename std::iterator_traits<IterTy>::value_type value_type;\n\n  StandardRange(IterTy b, IterTy e) : ii(b), ei(e) {}\n\n  iterator begin() const { return ii; }\n  iterator end() const { return ei; }\n\n  std::pair<block_iterator, block_iterator> block_pair() const {\n    return galois::block_range(ii, ei, substrate::ThreadPool::getTID(),\n                               activeThreads);\n  }\n\n  std::pair<local_iterator, local_iterator> local_pair() const {\n    return block_pair();\n  }\n\n  local_iterator local_begin() const { return block_begin(); }\n  local_iterator local_end() const { return block_end(); }\n\n  block_iterator block_begin() const { return block_pair().first; }\n  block_iterator block_end() const { return block_pair().second; }\n};\n\ntemplate <typename IterTy>\ninline StandardRange<IterTy> makeStandardRange(IterTy begin, IterTy end) {\n  return 
StandardRange<IterTy>(begin, end);\n}\n\n/**\n * SpecificRange is a range type where a threads range is specified by\n * an an int array that tells you where each thread should begin its\n * iteration\n */\ntemplate <typename IterTy>\nclass SpecificRange {\n  IterTy global_begin, global_end;\n  const uint32_t* thread_beginnings;\n\npublic:\n  typedef IterTy iterator;\n  typedef iterator local_iterator;\n  typedef iterator block_iterator;\n\n  typedef typename std::iterator_traits<IterTy>::value_type value_type;\n\n  SpecificRange(IterTy b, IterTy e, const uint32_t* thread_ranges)\n      : global_begin(b), global_end(e), thread_beginnings(thread_ranges) {}\n\n  iterator begin() const { return global_begin; }\n  iterator end() const { return global_end; }\n\n  /* Using the thread_beginnings array which tells you which node each thread\n   * should begin at, we can get the local block range for a particular\n   * thread. If the local range falls outside of global range, do nothing.\n   *\n   * @returns A pair of iterators that specifies the beginning and end\n   * of the range for this particular thread.\n   */\n  std::pair<block_iterator, block_iterator> block_pair() const {\n    uint32_t my_thread_id  = substrate::ThreadPool::getTID();\n    uint32_t total_threads = runtime::activeThreads;\n\n    iterator local_begin = thread_beginnings[my_thread_id];\n    iterator local_end   = thread_beginnings[my_thread_id + 1];\n\n    assert(local_begin <= local_end);\n\n    if (thread_beginnings[total_threads] == *global_end && *global_begin == 0) {\n      return std::make_pair(local_begin, local_end);\n    } else {\n      // This path assumes that we were passed in thread_beginnings for the\n      // range 0 to last node, but the passed in range to execute is NOT the\n      // entire 0 to thread end range; therefore, work under the assumption that\n      // only some threads will execute things only if they \"own\" nodes in the\n      // range\n      iterator left  = 
local_begin;\n      iterator right = local_end;\n\n      // local = what this thread CAN do\n      // global = what this thread NEEDS to do\n\n      // cutoff left and right if global begin/end require less than what we\n      // need\n      if (local_begin < global_begin) {\n        left = global_begin;\n      }\n      if (local_end > global_end) {\n        right = global_end;\n      }\n      // make sure range is sensible after changing left and right\n      if (left >= right || right <= left) {\n        left = right = global_end;\n      }\n\n      // Explanations/reasoning of possible cases\n      // [ ] = local ranges\n      // o = need to be included; global ranges = leftmost and rightmost circle\n      // x = not included\n      // ooooo[ooooooooxxxx]xxxxxx handled (left the same, right moved)\n      // xxxxx[xxxxxooooooo]oooooo handled (left moved, right the same)\n      // xxxxx[xxoooooooxxx]xxxxxx handled (both left/right moved)\n      // xxxxx[xxxxxxxxxxxx]oooooo handled (left will be >= right, set l = r)\n      // oooox[xxxxxxxxxxxx]xxxxxx handled (right will be <= left, set l = r)\n      // xxxxx[oooooooooooo]xxxxxx handled (left, right the same = local range)\n\n      return std::make_pair(left, right);\n    }\n  }\n\n  std::pair<local_iterator, local_iterator> local_pair() const {\n    return block_pair();\n  }\n\n  local_iterator local_begin() const { return block_begin(); }\n  local_iterator local_end() const { return block_end(); }\n\n  block_iterator block_begin() const { return block_pair().first; }\n  block_iterator block_end() const { return block_pair().second; }\n};\n\n/**\n * Creates a SpecificRange object.\n *\n * @tparam IterTy The iterator type used by the range object\n * @param begin The global beginning of the range\n * @param end The global end of the range\n * @param thread_ranges An array of iterators that specifies where each\n * thread's range begins\n * @returns A SpecificRange object\n */\ntemplate <typename IterTy>\ninline 
SpecificRange<IterTy> makeSpecificRange(IterTy begin, IterTy end,\n                                               const uint32_t* thread_ranges) {\n  return SpecificRange<IterTy>(begin, end, thread_ranges);\n}\n\n} // end namespace runtime\n\nnamespace internal {\n\n// supported variants\n// range(beg, end)\n// range(C& cont)\n// range(const T& x); // single item or drop this in favor of initializer list\n// range(std::initializer_list<T>)\ntemplate <typename I, bool IS_INTEGER = false>\nclass IteratorRangeMaker {\n  I m_beg;\n  I m_end;\n\npublic:\n  IteratorRangeMaker(const I& beg, const I& end) : m_beg(beg), m_end(end) {}\n\n  template <typename Arg>\n  auto operator()(const Arg&) const {\n    return runtime::makeStandardRange(m_beg, m_end);\n  }\n};\n\ntemplate <typename I>\nclass IteratorRangeMaker<I, true> {\n  I m_beg;\n  I m_end;\n\npublic:\n  IteratorRangeMaker(const I& beg, const I& end) : m_beg(beg), m_end(end) {}\n\n  template <typename Arg>\n  auto operator()(const Arg&) const {\n    return runtime::makeStandardRange(boost::counting_iterator<I>(m_beg),\n                                      boost::counting_iterator<I>(m_end));\n  }\n};\n\ntemplate <typename T>\nclass InitListRangeMaker {\n  std::initializer_list<T> m_list;\n\npublic:\n  explicit InitListRangeMaker(const std::initializer_list<T>& l) : m_list(l) {}\n\n  template <typename Arg>\n  auto operator()(const Arg&) const {\n    return runtime::makeStandardRange(m_list.begin(), m_list.end());\n  }\n};\n\ntemplate <typename C, bool HAS_LOCAL_RANGE = true>\nclass ContainerRangeMaker {\n  C& m_cont;\n\npublic:\n  explicit ContainerRangeMaker(C& cont) : m_cont(cont) {}\n\n  template <typename Arg>\n  auto operator()(const Arg&) const {\n    return runtime::makeLocalRange(m_cont);\n  }\n};\n\ntemplate <typename C>\nclass ContainerRangeMaker<C, false> {\n\n  C& m_cont;\n\npublic:\n  explicit ContainerRangeMaker(C& cont) : m_cont(cont) {}\n\n  template <typename Arg>\n  auto operator()(const Arg&) const 
{\n    return runtime::makeStandardRange(m_cont.begin(), m_cont.end());\n  }\n};\n\ntemplate <typename C>\nclass HasLocalIter {\n\n  template <typename T>\n  using CallExprType = typename std::remove_reference<decltype(\n      std::declval<T>().local_begin())>::type;\n\n  template <typename T>\n  static std::true_type go(typename std::add_pointer<CallExprType<T>>::type);\n\n  template <typename T>\n  static std::false_type go(...);\n\npublic:\n  constexpr static const bool value =\n      std::is_same<decltype(go<C>(nullptr)), std::true_type>::value;\n};\n\n} // end namespace internal\n\ntemplate <typename C>\nauto iterate(C& cont) {\n  return internal::ContainerRangeMaker<C, internal::HasLocalIter<C>::value>(\n      cont);\n}\n\ntemplate <typename T>\nauto iterate(std::initializer_list<T> initList) {\n  return internal::InitListRangeMaker<T>(initList);\n}\n\ntemplate <typename I>\nauto iterate(const I& beg, const I& end) {\n  return internal::IteratorRangeMaker<I, std::is_integral<I>::value>(beg, end);\n}\n\n} // end namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/SharedMem.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_SHAREDMEM_H\n#define GALOIS_RUNTIME_SHAREDMEM_H\n\n#include <string>\n\n#include \"galois/config.h\"\n#include \"galois/runtime/PagePool.h\"\n#include \"galois/runtime/Statistics.h\"\n#include \"galois/substrate/SharedMem.h\"\n\nnamespace galois::runtime {\n\ntemplate <typename SM>\nclass SharedMem : public galois::substrate::SharedMem {\n  internal::PageAllocState<> m_pa;\n  SM m_sm;\n\npublic:\n  explicit SharedMem() : m_pa(), m_sm() {\n    internal::setPagePoolState(&m_pa);\n    internal::setSysStatManager(&m_sm);\n  }\n\n  ~SharedMem() {\n    m_sm.print();\n    internal::setSysStatManager(nullptr);\n    internal::setPagePoolState(nullptr);\n  }\n\n  SharedMem(const SharedMem&) = delete;\n  SharedMem& operator=(const SharedMem&) = delete;\n\n  SharedMem(SharedMem&&) = delete;\n  SharedMem& 
operator=(SharedMem&&) = delete;\n};\n\n} // namespace galois::runtime\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Statistics.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_STAT_MANAGER_H\n#define GALOIS_STAT_MANAGER_H\n\n#include <limits>\n#include <map>\n#include <string>\n#include <type_traits>\n\n#include <sys/resource.h>\n#include <sys/time.h>\n\n#include <boost/uuid/uuid.hpp>            // uuid class\n#include <boost/uuid/uuid_generators.hpp> // generators\n#include <boost/uuid/uuid_io.hpp>         // streaming operators etc.\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/gstl.h\"\n#include \"galois/Threads.h\"\n#include \"galois/substrate/EnvCheck.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/ThreadRWlock.h\"\n#include \"galois/Threads.h\"\n\n/**\n * TODO:\n * Print intra host stats with per-thread details and inter-host stats with\n per-host details\n * print to 2 files if supporting R format\n * 
dist implements an addToStat with host ID and manages inter-host stats and\n their combining\n\n */\n\nnamespace galois {\nnamespace runtime {\n\nboost::uuids::uuid getRandUUID();\n\ntemplate <typename T>\nclass RunningMin {\n  T m_min;\n\npublic:\n  RunningMin(void) : m_min(std::numeric_limits<T>::max()) {}\n\n  void add(const T& val) { m_min = std::min(m_min, val); }\n\n  const T& min(void) const { return m_min; }\n};\n\ntemplate <typename T>\nclass RunningMax {\n  T m_max;\n\npublic:\n  RunningMax(void) : m_max(std::numeric_limits<T>::min()) {}\n\n  void add(const T& val) { m_max = std::max(m_max, val); }\n\n  const T& max(void) const { return m_max; }\n};\n\ntemplate <typename T>\nclass RunningSum {\n  T m_sum;\n  size_t m_count;\n\npublic:\n  RunningSum(void) : m_sum(), m_count(0) {}\n\n  void add(const T& val) {\n    m_sum += val;\n    ++m_count;\n  }\n\n  const T& sum(void) const { return m_sum; }\n\n  const size_t& count(void) const { return m_count; }\n\n  T avg() const { return m_sum / m_count; }\n};\n\ntemplate <typename T>\nclass RunningVec {\n\n  using Vec = gstl::Vector<T>;\n\n  Vec m_vec;\n\npublic:\n  void add(const T& val) { m_vec.push_back(val); }\n\n  const Vec& values(void) const { return m_vec; }\n};\n\ntemplate <typename T>\nclass NamedStat {\n\n  using Str = galois::gstl::Str;\n\n  Str m_name;\n\npublic:\n  void setName(const Str& name) { m_name = name; }\n\n  void setName(Str&& name) { m_name = std::move(name); }\n\n  const Str& name(void) const { return m_name; }\n\n  void add(const T&) const {}\n};\n\ntemplate <typename T, typename... Bases>\nclass AggregStat : public Bases... 
{\n\npublic:\n  using with_min = AggregStat<T, RunningMin<T>, Bases...>;\n\n  using with_max = AggregStat<T, RunningMax<T>, Bases...>;\n\n  using with_sum = AggregStat<T, RunningSum<T>, Bases...>;\n\n  using with_mem = AggregStat<T, RunningVec<T>, Bases...>;\n\n  using with_name = AggregStat<T, NamedStat<T>, Bases...>;\n\n  void add(const T& val) { (..., Bases::add(val)); }\n};\n\nnamespace {\nstatic constexpr const char* StatTotalNames[] = {\"SINGLE\", \"TMIN\", \"TMAX\",\n                                                 \"TSUM\", \"TAVG\"};\n}\n\nstruct StatTotal {\n\n  enum Type { SINGLE = 0, TMIN, TMAX, TSUM, TAVG };\n\n  static const char* str(const Type& t) { return StatTotalNames[t]; }\n};\n\nnamespace internal {\n\ntemplate <typename Stat_tp>\nstruct BasicStatMap {\n\n  using Stat    = Stat_tp;\n  using Str     = galois::gstl::Str;\n  using StrSet  = galois::gstl::Set<Str>;\n  using StatMap = galois::gstl::Map<std::tuple<const Str*, const Str*>, Stat>;\n  using const_iterator = typename StatMap::const_iterator;\n\nprotected:\n  StrSet symbols;\n  StatMap statMap;\n\n  const Str* getOrInsertSymbol(const Str& s) {\n    auto p = symbols.insert(s);\n    return &*(p.first);\n  }\n\n  const Str* getSymbol(const Str& s) const {\n    auto i = symbols.find(s);\n\n    if (i == symbols.cend()) {\n      return nullptr;\n    } else {\n      return &(*i);\n    }\n  }\n\npublic:\n  template <typename... Args>\n  Stat& getOrInsertStat(const Str& region, const Str& category,\n                        Args&&... 
args) {\n\n    const Str* ln  = getOrInsertSymbol(region);\n    const Str* cat = getOrInsertSymbol(category);\n\n    auto tpl = std::make_tuple(ln, cat);\n\n    auto p = statMap.emplace(tpl, Stat(std::forward<Args>(args)...));\n\n    return p.first->second;\n  }\n\n  const_iterator findStat(const Str& region, const Str& category) const {\n\n    const Str* ln  = getSymbol(region);\n    const Str* cat = getSymbol(category);\n    auto tpl       = std::make_tuple(ln, cat);\n\n    auto i = statMap.find(tpl);\n\n    return i;\n  }\n\n  const Stat& getStat(const Str& region, const Str& category) const {\n\n    auto i = findStat(region, category);\n    assert(i != statMap.end());\n    return i->second;\n  }\n\n  template <typename T, typename... Args>\n  void addToStat(const Str& region, const Str& category, const T& val,\n                 Args&&... statArgs) {\n    Stat& s =\n        getOrInsertStat(region, category, std::forward<Args>(statArgs)...);\n    s.add(val);\n  }\n\n  const_iterator cbegin(void) const { return statMap.cbegin(); }\n  const_iterator cend(void) const { return statMap.cend(); }\n\n  const Str& region(const const_iterator& i) const {\n    return *(std::get<0>(i->first));\n  }\n\n  const Str& category(const const_iterator& i) const {\n    return *(std::get<1>(i->first));\n  }\n\n  const Stat& stat(const const_iterator& i) const { return i->second; }\n};\n\ntemplate <typename T>\nusing VecStat_with_MinMaxSum =\n    typename AggregStat<T>::with_mem::with_min::with_max::with_sum;\n\ntemplate <typename T>\nstruct VecStat : public VecStat_with_MinMaxSum<T> {\n\n  using Base = VecStat_with_MinMaxSum<T>;\n\n  StatTotal::Type m_totalTy;\n\n  explicit VecStat(const StatTotal::Type& type) : Base(), m_totalTy(type) {}\n\n  const StatTotal::Type& totalTy(void) const { return m_totalTy; }\n\n  T total(void) const {\n\n    switch (m_totalTy) {\n\n    case StatTotal::SINGLE:\n      assert(Base::values().size() > 0);\n      return Base::values()[0];\n\n    case 
StatTotal::TMIN:\n      return Base::min();\n\n    case StatTotal::TMAX:\n      return Base::max();\n\n    case StatTotal::TSUM:\n      return Base::sum();\n\n    case StatTotal::TAVG:\n      return Base::avg();\n\n    default:\n      GALOIS_DIE(\"unreachable\");\n    }\n  }\n};\n\ntemplate <>\nstruct VecStat<gstl::Str> : public AggregStat<gstl::Str>::with_mem {\n\n  using Base = AggregStat<gstl::Str>::with_mem;\n\n  StatTotal::Type m_totalTy;\n\n  explicit VecStat(const StatTotal::Type& type) : Base(), m_totalTy(type) {}\n\n  const StatTotal::Type& totalTy(void) const { return m_totalTy; }\n\n  const gstl::Str& total(void) const {\n\n    switch (m_totalTy) {\n\n    case StatTotal::SINGLE:\n      assert(Base::values().size() > 0);\n      return Base::values()[0];\n\n    default:\n      GALOIS_DIE(\"unreachable\");\n    }\n  }\n};\n\ntemplate <typename T>\nusing VecStatManager = BasicStatMap<VecStat<T>>;\n\ntemplate <typename T>\nstruct ScalarStat {\n  T m_val;\n  StatTotal::Type m_totalTy;\n\n  explicit ScalarStat(const StatTotal::Type& type) : m_val(), m_totalTy(type) {}\n\n  void add(const T& v) { m_val += v; }\n\n  operator const T&(void) const { return m_val; }\n\n  const StatTotal::Type& totalTy(void) const { return m_totalTy; }\n};\n\ntemplate <typename T>\nusing ScalarStatManager = BasicStatMap<ScalarStat<T>>;\n\n} // end namespace internal\n\nclass StatManager {\n\npublic:\n  using Str = galois::gstl::Str;\n\n  static constexpr const char* const SEP           = \", \";\n  static constexpr const char* const TSTAT_SEP     = \"; \";\n  static constexpr const char* const TSTAT_NAME    = \"ThreadValues\";\n  static constexpr const char* const TSTAT_ENV_VAR = \"PRINT_PER_THREAD_STATS\";\n\n  static bool printingThreadVals(void);\n\n  template <typename T>\n  static constexpr const char* statKind(void) {\n    return std::is_same<T, Str>::value ? 
\"PARAM\" : \"STAT\";\n  }\n\nprivate:\n  template <typename T>\n  struct StatManagerImpl {\n\n    using MergedStats    = internal::VecStatManager<T>;\n    using const_iterator = typename MergedStats::const_iterator;\n    using Stat           = typename MergedStats::Stat;\n\n    substrate::PerThreadStorage<internal::ScalarStatManager<T>>\n        perThreadManagers;\n    MergedStats result;\n    bool merged = false;\n\n    void addToStat(const Str& region, const Str& category, const T& val,\n                   const StatTotal::Type& type) {\n      perThreadManagers.getLocal()->addToStat(region, category, val, type);\n    }\n\n    void mergeStats(void) {\n\n      if (merged) {\n        return;\n      }\n\n      for (unsigned t = 0; t < perThreadManagers.size(); ++t) {\n\n        const auto* manager = perThreadManagers.getRemote(t);\n\n        for (auto i = manager->cbegin(), end_i = manager->cend(); i != end_i;\n             ++i) {\n          result.addToStat(manager->region(i), manager->category(i),\n                           T(manager->stat(i)), manager->stat(i).totalTy());\n        }\n      }\n\n      merged = true;\n    }\n\n    const_iterator cbegin(void) const { return result.cbegin(); }\n    const_iterator cend(void) const { return result.cend(); }\n\n    const Str& region(const const_iterator& i) const {\n      return result.region(i);\n    }\n\n    const Str& category(const const_iterator& i) const {\n      return result.category(i);\n    }\n\n    const Stat& stat(const const_iterator& i) const { return result.stat(i); }\n\n    template <typename S, typename V>\n    void readStat(const const_iterator& i, S& region, S& category, T& total,\n                  StatTotal::Type& type, V& thrdVals) const {\n      region   = this->region(i);\n      category = this->category(i);\n\n      total = this->stat(i).total();\n      type  = this->stat(i).totalTy();\n\n      thrdVals.clear();\n      thrdVals = this->stat(i).values();\n    }\n\n    void print(std::ostream& 
out) const {\n\n      for (auto i = cbegin(), end_i = cend(); i != end_i; ++i) {\n        out << statKind<T>() << SEP << this->region(i) << SEP\n            << this->category(i) << SEP;\n\n        const auto& s = this->stat(i);\n        out << StatTotal::str(s.totalTy()) << SEP << s.total();\n\n        out << \"\\n\";\n\n        if (StatManager::printingThreadVals()) {\n\n          out << statKind<T>() << SEP << this->region(i) << SEP\n              << this->category(i) << SEP;\n          out << TSTAT_NAME << SEP;\n\n          const char* sep = \"\";\n          for (const auto& v : s.values()) {\n            out << sep << v;\n            sep = TSTAT_SEP;\n          }\n\n          out << \"\\n\";\n        }\n      }\n    }\n  };\n\n  using IntStats     = StatManagerImpl<int64_t>;\n  using FPstats      = StatManagerImpl<double>;\n  using StrStats     = StatManagerImpl<Str>;\n  using int_iterator = typename IntStats::const_iterator;\n  using fp_iterator  = typename FPstats::const_iterator;\n  using str_iterator = typename StrStats::const_iterator;\n\n  std::string m_outfile;\n  IntStats intStats;\n  FPstats fpStats;\n  StrStats strStats;\n\nprotected:\n  void mergeStats(void) {\n    intStats.mergeStats();\n    fpStats.mergeStats();\n    strStats.mergeStats();\n  }\n\n  int_iterator intBegin(void) const;\n  int_iterator intEnd(void) const;\n\n  fp_iterator fpBegin(void) const;\n  fp_iterator fpEnd(void) const;\n\n  str_iterator paramBegin(void) const;\n  str_iterator paramEnd(void) const;\n\n  template <typename S, typename V>\n  void readIntStat(const int_iterator& i, S& region, S& category,\n                   int64_t& total, StatTotal::Type& type, V& vec) const {\n\n    intStats.readStat(i, region, category, total, type, vec);\n  }\n\n  template <typename S, typename V>\n  void readFPstat(const fp_iterator& i, S& region, S& category, double& total,\n                  StatTotal::Type& type, V& vec) const {\n\n    fpStats.readStat(i, region, category, total, type, 
vec);\n  }\n\n  template <typename S, typename V>\n  void readParam(const str_iterator& i, S& region, S& category, Str& total,\n                 StatTotal::Type& type, V& vec) const {\n\n    strStats.readStat(i, region, category, total, type, vec);\n  }\n\n  virtual void printStats(std::ostream& out);\n\n  void printHeader(std::ostream& out) const;\n\npublic:\n  explicit StatManager(const std::string& outfile = \"\");\n\n  virtual ~StatManager();\n\n  void setStatFile(const std::string& outfile);\n\n  template <typename S1, typename S2, typename T,\n            typename = std::enable_if_t<std::is_integral<T>::value ||\n                                        std::is_floating_point<T>::value>>\n  void addToStat(const S1& region, const S2& category, const T& val,\n                 const StatTotal::Type& type) {\n\n    if (std::is_floating_point<T>::value) {\n      fpStats.addToStat(gstl::makeStr(region), gstl::makeStr(category),\n                        double(val), type);\n\n    } else {\n      intStats.addToStat(gstl::makeStr(region), gstl::makeStr(category),\n                         int64_t(val), type);\n    }\n  }\n\n  template <typename S1, typename S2, typename V>\n  void addToParam(const S1& region, const S2& category, const V& val) {\n    strStats.addToStat(gstl::makeStr(region), gstl::makeStr(category),\n                       gstl::makeStr(val), StatTotal::SINGLE);\n  }\n\n  void print(void);\n};\n\nnamespace internal {\n\nvoid setSysStatManager(StatManager* sm);\nStatManager* sysStatManager(void);\n\n} // namespace internal\n\ntemplate <typename S1, typename S2, typename T>\ninline void reportStat(const S1& region, const S2& category, const T& value,\n                       const StatTotal::Type& type) {\n  internal::sysStatManager()->addToStat(region, category, value, type);\n}\n\ntemplate <typename S1, typename S2, typename T>\ninline void reportStat_Single(const S1& region, const S2& category,\n                              const T& value) {\n  
reportStat(region, category, value, StatTotal::SINGLE);\n}\n\ntemplate <typename S1, typename S2, typename T>\ninline void reportStat_Tmin(const S1& region, const S2& category,\n                            const T& value) {\n  reportStat(region, category, value, StatTotal::TMIN);\n}\n\ntemplate <typename S1, typename S2, typename T>\ninline void reportStat_Tmax(const S1& region, const S2& category,\n                            const T& value) {\n  reportStat(region, category, value, StatTotal::TMAX);\n}\n\ntemplate <typename S1, typename S2, typename T>\ninline void reportStat_Tsum(const S1& region, const S2& category,\n                            const T& value) {\n  reportStat(region, category, value, StatTotal::TSUM);\n}\n\ntemplate <typename S1, typename S2, typename T>\ninline void reportStat_Tavg(const S1& region, const S2& category,\n                            const T& value) {\n  reportStat(region, category, value, StatTotal::TAVG);\n}\n\ntemplate <bool Report = false, typename S1, typename S2, typename T>\ninline void reportStatCond(const S1& region, const S2& category, const T& value,\n                           const StatTotal::Type& type) {\n  if (Report)\n    internal::sysStatManager()->addToStat(region, category, value, type);\n}\n\ntemplate <bool Report = false, typename S1, typename S2, typename T>\ninline void reportStatCond_Single(const S1& region, const S2& category,\n                                  const T& value) {\n  if (Report)\n    reportStat(region, category, value, StatTotal::SINGLE);\n}\n\ntemplate <bool Report = false, typename S1, typename S2, typename T>\ninline void reportStatCond_Tmin(const S1& region, const S2& category,\n                                const T& value) {\n  if (Report)\n    reportStat(region, category, value, StatTotal::TMIN);\n}\n\ntemplate <bool Report = false, typename S1, typename S2, typename T>\ninline void reportStatCond_Tmax(const S1& region, const S2& category,\n                                const T& 
value) {\n  if (Report)\n    reportStat(region, category, value, StatTotal::TMAX);\n}\n\ntemplate <bool Report = false, typename S1, typename S2, typename T>\ninline void reportStatCond_Tsum(const S1& region, const S2& category,\n                                const T& value) {\n  if (Report)\n    reportStat(region, category, value, StatTotal::TSUM);\n}\n\ntemplate <bool Report = false, typename S1, typename S2, typename T>\ninline void reportStatCond_Tavg(const S1& region, const S2& category,\n                                const T& value) {\n  if (Report)\n    reportStat(region, category, value, StatTotal::TAVG);\n}\n\ntemplate <typename S1, typename S2, typename V>\nvoid reportParam(const S1& region, const S2& category, const V& value) {\n  internal::sysStatManager()->addToParam(region, category, value);\n}\n\nvoid setStatFile(const std::string& f);\n\n//! Reports maximum resident set size and page faults stats using\n//! rusage\n//! @param id Identifier to prefix stat with in statistics output\nvoid reportRUsage(const std::string& id);\n\n// TODO: switch to gstl::Str in here\n//! Reports Galois system memory stats for all threads\nvoid reportPageAlloc(const char* category);\n//! Reports NUMA memory stats for all NUMA nodes\nvoid reportNumaAlloc(const char* category);\n\n} // end namespace runtime\n} // end namespace galois\n\n#endif // GALOIS_STAT_MANAGER_H\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Substrate.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_SUBSTRATE_H\n#define GALOIS_RUNTIME_SUBSTRATE_H\n\n#include \"galois/substrate/Barrier.h\"\n\nnamespace galois {\nnamespace runtime {\n\n/**\n * Have a pre-instantiated barrier available for use.\n * This is initialized to the current activeThreads. This barrier\n * is designed to be fast and should be used in the common\n * case.\n *\n * However, there is a race if the number of active threads\n * is modified after using this barrier: some threads may still\n * be in the barrier while the main thread reinitializes this\n * barrier to the new number of active threads. If that may\n * happen, use {@link createSimpleBarrier()} instead.\n */\nsubstrate::Barrier& getBarrier(unsigned activeThreads);\n\n} // end namespace runtime\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/ThreadTimer.h",
    "content": "#ifndef GALOIS_RUNTIME_THREADTIMER_H\n#define GALOIS_RUNTIME_THREADTIMER_H\n\n#include <ctime>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n\nnamespace galois::runtime {\n\nclass ThreadTimer {\n  timespec start_;\n  timespec stop_;\n  uint64_t nsec_{0};\n\npublic:\n  ThreadTimer() = default;\n\n  void start() { clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start_); }\n\n  void stop() {\n    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &stop_);\n    nsec_ += (stop_.tv_nsec - start_.tv_nsec);\n    nsec_ += ((stop_.tv_sec - start_.tv_sec) * 1000000000);\n  }\n\n  uint64_t get_nsec() const { return nsec_; }\n\n  uint64_t get_sec() const { return (nsec_ / 1000000000); }\n\n  uint64_t get_msec() const { return (nsec_ / 1000000); }\n};\n\nclass ThreadTimers {\nprotected:\n  substrate::PerThreadStorage<ThreadTimer> timers_;\n\n  void reportTimes(const char* category, const char* region);\n};\n\ntemplate <bool enabled>\nclass PerThreadTimer : private ThreadTimers {\n  const char* const region_;\n  const char* const category_;\n\n  void reportTimes() { reportTimes(category_, region_); }\n\npublic:\n  PerThreadTimer(const char* const region, const char* const category)\n      : region_(region), category_(category) {}\n\n  PerThreadTimer(const PerThreadTimer&) = delete;\n  PerThreadTimer(PerThreadTimer&&)      = delete;\n  PerThreadTimer& operator=(const PerThreadTimer&) = delete;\n  PerThreadTimer& operator=(PerThreadTimer&&) = delete;\n\n  ~PerThreadTimer() { reportTimes(); }\n\n  void start() { timers_.getLocal()->start(); }\n\n  void stop() { timers_.getLocal()->stop(); }\n};\n\ntemplate <>\nclass PerThreadTimer<false> {\n\npublic:\n  PerThreadTimer(const char* const, const char* const) {}\n\n  PerThreadTimer(const PerThreadTimer&) = delete;\n  PerThreadTimer(PerThreadTimer&&)      = delete;\n  PerThreadTimer& operator=(const PerThreadTimer&) = delete;\n  PerThreadTimer& operator=(PerThreadTimer&&) = delete;\n\n  ~PerThreadTimer() 
= default;\n\n  void start() const {}\n\n  void stop() const {}\n};\n\n} // end namespace galois::runtime\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/TiledExecutor.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef _GALOIS_RUNTIME_TILEDEXECUTOR_H_\n#define _GALOIS_RUNTIME_TILEDEXECUTOR_H_\n\n#include \"galois/config.h\"\n#include \"galois/Galois.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/NoDerefIterator.h\"\n\nnamespace galois {\nnamespace runtime {\n\ntemplate <typename Graph, bool UseExp = false>\nclass Fixed2DGraphTiledExecutor {\n  static constexpr int numDims = 2; // code is specialized to 2\n\n  using SpinLock      = galois::substrate::PaddedLock<true>;\n  using GNode         = typename Graph::GraphNode;\n  using iterator      = typename Graph::iterator;\n  using edge_iterator = typename Graph::edge_iterator;\n  using Point         = std::array<size_t, numDims>;\n\n  template <typename T>\n  struct SimpleAtomic {\n    std::atomic<T> value;\n    SimpleAtomic() : value(0) {}\n    SimpleAtomic(const 
SimpleAtomic& o) : value(o.value.load()) {}\n    T relaxedLoad() { return value.load(std::memory_order_relaxed); }\n    void relaxedAdd(T delta) {\n      value.store(relaxedLoad() + delta, std::memory_order_relaxed);\n    }\n  };\n\n  /**\n   * Tasks are 2D ranges [startX, endX) x [startY, endYInclusive]\n   */\n  struct Task {\n    iterator startX;\n    iterator endX;\n    GNode startY;\n    GNode endYInclusive;\n    Point coord;\n    SimpleAtomic<unsigned> updates;\n  };\n\n  /**\n   * Functor: given a graph on initialization, passing it an edge iterator\n   * will return the destination of that edge in the graph.\n   */\n  struct GetDst {\n    Graph* g;\n    GetDst() {}\n    GetDst(Graph* _g) : g(_g) {}\n\n    GNode operator()(edge_iterator ii) const { return g->getEdgeDst(ii); }\n  };\n\n  using no_deref_iterator = galois::NoDerefIterator<edge_iterator>;\n  using edge_dst_iterator =\n      boost::transform_iterator<GetDst, no_deref_iterator>;\n\n  Graph& g;\n  int cutoff;                          // XXX: UseExp\n  galois::substrate::Barrier& barrier; // XXX: UseExp\n  // std::array<galois::LargeArray<SpinLock>, numDims> locks;\n  // galois::LargeArray<Task> tasks;\n  std::array<std::vector<SpinLock>, numDims> locks;\n  std::vector<Task> tasks;\n  size_t numTasks;\n  unsigned maxUpdates;\n  bool useLocks;\n  galois::GAccumulator<unsigned> failedProbes;\n\n  /**\n   * Advance point p in the specified dimension by delta and account for\n   * overflow as well.\n   *\n   * @param p Point to advance\n   * @param dim Dimension to advance\n   * @param delta Amount to advance by\n   */\n  void nextPoint(Point& p, int dim, int delta) {\n    assert(dim < numDims);\n    p[dim] += delta;\n    // account for overflow\n    while (p[dim] >= locks[dim].size()) {\n      p[dim] -= locks[dim].size();\n    }\n  }\n\n  /**\n   * Get task associated with a point in the 2D block grid.\n   *\n   * For point x, y, get the task indexed into the X direction x times\n   * and indexed into 
the Y direction y times.\n   *\n   * @param point with coordinates to a task\n   * @returns pointer to task associated with the passed in point\n   */\n  Task* getTask(const Point& p) {\n    Task* t = &tasks[p[0] + p[1] * locks[0].size()];\n\n    assert(t < &tasks[numTasks]);\n    assert(t >= &tasks[0]);\n\n    return t;\n  }\n\n  /**\n   * Finds a block starting from the provided point that hasn't reached\n   * the maximum number of updates and returns a pointer to it.\n   * Uses a lock on each block it probes, and a returned block is **returned\n   * with a lock**.\n   *\n   * @param start Point specifying block to start probe from\n   * @param dim Specifies whether or not to continue probe in x (0) or y (1)\n   * direction\n   * @param n Number of blocks to probe before failing\n   * @returns pointer to block that hasn't reached a maximum number of updates\n   * or nullptr on probe failure. The block is **returned with the lock**.\n   */\n  Task* probeBlockWithLock(Point& start, int dim, size_t n) {\n    Point p = start;\n\n    for (size_t i = 0; i < n; ++i) {\n      Task* t = getTask(p);\n\n      assert(p[0] == t->coord[0]);\n      assert(p[1] == t->coord[1]);\n      assert(t->coord[0] < locks[0].size());\n      assert(t->coord[1] < locks[1].size());\n\n      if (t->updates.relaxedLoad() < maxUpdates) {\n        if (std::try_lock(locks[0][t->coord[0]], locks[1][t->coord[1]]) < 0) {\n          if (t->updates.relaxedLoad() < maxUpdates) {\n            t->updates.relaxedAdd(1);\n            start = p;\n            return t;\n          }\n\n          // TODO add to worklist\n          for (int i = 0; i < numDims; ++i) {\n            locks[i][t->coord[i]].unlock();\n          }\n        }\n      }\n\n      nextPoint(p, dim, 1);\n    }\n\n    failedProbes += 1;\n    return nullptr;\n  }\n\n  /**\n   * Finds a block starting from the provided point that hasn't reached\n   * the maximum number of updates and returns a pointer to it.\n   *\n   * @param start Point 
specifying block to start probe from\n   * @param dim Specifies whether or not to continue probe in x (0) or y (1)\n   * direction\n   * @param n Number of blocks to probe before failing\n   * @returns pointer to block that hasn't reached a maximum number of updates\n   * or nullptr on probe failure\n   */\n  Task* probeBlockWithoutLock(Point& start, int dim, size_t n) {\n    Point p = start;\n\n    for (size_t i = 0; i < n; ++i) {\n      Task* t = getTask(p);\n\n      assert(p[0] == t->coord[0]);\n      assert(p[1] == t->coord[1]);\n      assert(t->coord[0] < locks[0].size());\n      assert(t->coord[1] < locks[1].size());\n\n      if (t->updates.relaxedLoad() < maxUpdates) {\n        if (t->updates.value.fetch_add(1) < maxUpdates) {\n          // hasn't reached maxed updates at point of fetch\n          start = p;\n          return t;\n        }\n      }\n      nextPoint(p, dim, 1);\n    }\n\n    failedProbes += 1;\n    return nullptr;\n  }\n\n  /**\n   * Finds a block starting from the provided point that hasn't reached\n   * the maximum number of updates (and isn't locked if using locks) and returns\n   * a pointer to it. If a task is found, start is updated to the\n   * corresponding coordinate.\n   *\n   * Wrapper for locked and not locked versions.\n   * Note that if locks are used, the block return will HAVE THE LOCK for that\n   * block.\n   *\n   * @param start Point specifying block to start probe from\n   * @param dim Specifies whether or not to continue probe in x (0) or y (1)\n   * direction\n   * @param n Number of blocks to probe before failing\n   * @returns pointer to block that hasn't reached a maximum number of updates\n   * or nullptr on probe failure. 
If locks are used, the caller will\n   * have the lock for the block as well.\n   */\n  Task* probeBlock(Point& start, int dim, size_t n) {\n    assert(dim < 2);\n\n    if (useLocks) {\n      return probeBlockWithLock(start, dim, n);\n    } else {\n      return probeBlockWithoutLock(start, dim, n);\n    }\n  }\n\n  // TODO (Loc) this function needs an overhaul; right now it's too hacky and\n  // imprecise\n  /**\n   * From the provided start point, find a block that is updateable and return\n   * it. Search starts by going up-down left-right from start, but if that\n   * fails, begin advancing along the diagonal and searching up-down left-right\n   * until the entire grid is traversed without a found block.\n   *\n   * Updateable = hasn't reached max updates on inspection + isn't locked (if\n   * using locks)\n   *\n   * @param start block to start search from\n   * @param inclusive If true, the initial search will include the provided\n   * start point as a potential block to look at; otherwise it is COMPLETELY\n   * omitted from search (unless you have a non-square grid in which\n   * case it might become \"extra work\"; see TODO below)\n   **/\n  Task* nextBlock(Point& start, bool inclusive) {\n    Task* t;\n\n    // repeats twice just to make sure there are actually no unused blocks\n    // TODO this method of termination detection is hacky and imprecise,\n    // find a better way\n    for (int times = 0; times < 2; ++times) {\n      Point limit{{locks[0].size(), locks[1].size()}};\n\n      int inclusiveDelta = (inclusive && times == 0) ? 0 : 1;\n\n      // First iteration (i.e. 
inclusive = true) is INCLUSIVE of start\n      // Otherwise, check the next blocks in the x and y direction for the\n      // next block\n      for (int i = 0; i < numDims; ++i) {\n        Point p = start;\n        nextPoint(p, i, inclusiveDelta);\n\n        if ((t = probeBlock(p, i, limit[i] - inclusiveDelta))) {\n          start = p;\n          return t;\n        }\n      }\n\n      // if the above for loop failed, it means all blocks in both directions\n      // (left->right, up->down) from current block from point are locked\n      // and/or all blocks have reached max updates\n      Point p = start;\n      // solution to above issue in comment = advance using diagonal and check\n      // from there\n      for (int i = 0; i < numDims; ++i) {\n        nextPoint(p, i, 1);\n      }\n\n      // below will end up looping through entire grid looking for a block\n      // to work on; in some cases a block will be looped over more than once\n      // (see below TODO)\n      // TODO probably unoptimal: if any limit has hit 0, is it the case that\n      // the entire grid has been looked at already? This comment writer thinks\n      // the answer is yes in which case the below is doing extra work\n      while (std::any_of(limit.begin(), limit.end(),\n                         [](size_t x) { return x > 0; })) {\n        for (int i = 0; i < numDims; ++i) {\n          if (limit[i] > 1 && (t = probeBlock(p, i, limit[i] - 1))) {\n            start = p;\n            return t;\n          }\n        }\n\n        for (int i = 0; i < numDims; ++i) {\n          if (limit[i] > 0) {\n            limit[i] -= 1;\n            nextPoint(p, i, 1);\n          }\n        }\n      }\n    }\n\n    return nullptr;\n  }\n\n  /**\n   * Apply the provided function to the task/block.\n   *\n   * Dense update, i.e. 
update everything in the block even if no edge exists.\n   *\n   * @tparam UseDense must be true\n   * @tparam Function function type\n   *\n   * @param fn Function to apply to 2 nodes\n   * @param task Task that contains block information\n   */\n  template <bool UseDense, typename Function>\n  void executeBlock(Function& fn, Task& task,\n                    typename std::enable_if<UseDense>::type* = 0) {\n    GetDst getDst{&g};\n\n    for (auto ii = task.startX; ii != task.endX; ++ii) {\n      for (auto jj = g.begin() + task.startY,\n                ej = g.begin() + task.endYInclusive + 1;\n           jj != ej; ++jj) {\n        fn(*ii, *jj);\n      }\n    }\n  }\n\n  /**\n   * Apply the provided function to the task/block.\n   *\n   * Sparse update, i.e. update nodes only if edge exists.\n   *\n   * @tparam UseDense must be false\n   * @tparam Function function type\n   *\n   * @param fn Function to apply to 2 nodes + an edge\n   * @param task Task that contains block information\n   */\n  template <bool UseDense, typename Function>\n  void executeBlock(Function& fn, Task& task,\n                    typename std::enable_if<!UseDense>::type* = 0) {\n    GetDst getDst{&g};\n\n    for (auto ii = task.startX; ii != task.endX; ++ii) {\n      no_deref_iterator nbegin(\n          g.edge_begin(*ii, galois::MethodFlag::UNPROTECTED));\n      no_deref_iterator nend(g.edge_end(*ii, galois::MethodFlag::UNPROTECTED));\n\n      // iterates over the edges, but edge_dst_iterator xforms it to the dest\n      // node itself\n      edge_dst_iterator dbegin(nbegin, getDst);\n      edge_dst_iterator dend(nend, getDst);\n\n      // TODO check if we want to use experimental\n      // if (UseExp &&\n      //    cutoff < 0 &&\n      //    std::distance(g.edge_begin(*ii, galois::MethodFlag::UNPROTECTED),\n      //      g.edge_end(*ii, galois::MethodFlag::UNPROTECTED)) >= -cutoff) {\n      //  continue;\n      //} else if (UseExp &&\n      //           cutoff > 0 &&\n      //           
std::distance(g.edge_begin(*ii,\n      //                           galois::MethodFlag::UNPROTECTED),\n      //                         g.edge_end(*ii,\n      //                           galois::MethodFlag::UNPROTECTED)) < cutoff) {\n      //  continue;\n      //}\n\n      for (auto jj = std::lower_bound(dbegin, dend, task.startY); jj != dend;) {\n        // if (UseExp) {\n        //  constexpr int numTimes = 1;\n        //  constexpr int width = 1;\n        //  bool done = false;\n        //  for (int times = 0; times < numTimes; ++times) {\n        //    for (int i = 0; i < width; ++i) {\n        //      edge_iterator edge = *(jj+i).base();\n        //      if (*(jj + i) > task.endYInclusive) {\n        //        done = true;\n        //        break;\n        //      }\n\n        //      fn(*ii, *(jj+i), edge);\n        //    }\n        //  }\n        //  if (done)\n        //    break;\n        //  for (int i = 0; jj != dend && i < width; ++jj, ++i)\n        //    ;\n        //  if (jj == dend)\n        //    break;\n        //} else {\n        edge_iterator edge = *jj.base();\n        if (*jj > task.endYInclusive)\n          break;\n\n        fn(*ii, *jj, edge);\n        ++jj;\n        //}\n      }\n    }\n  }\n\n  /**\n   * Bulk Synchronous Diagonals: Static work assignment\n   *\n   * From the start point assigned to each thread, loop across the grid\n   * diagonally, moving a step in the direction of the longer of the X or Y\n   * direction every round and working along the diagonal there.\n   *\n   * @tparam UseDense dense update (all nodes in block update with all other\n   * nodes) or sparse update (update only if edge exists)\n   * @tparam Type of function specifying how to do update between nodes\n   *\n   * @param fn Function used to update nodes\n   * @param tid Thread id\n   * @param total Total number of threads\n   */\n  template <bool UseDense, typename Function>\n  void executeLoopExp(Function fn, unsigned tid, unsigned total) {\n    Point 
numBlocks{locks[0].size(), locks[1].size()};\n    Point block;\n    Point start;\n\n    // TODO this assigns each thread a block along the diagonal, which is\n    // probably NOT what you want in this executor since each block will go\n    // along the diagonal; fix this\n    for (int i = 0; i < numDims; ++i) {\n      block[i] = (numBlocks[i] + total - 1) / total; // blocks per thread\n      start[i] = std::min(block[i] * tid, numBlocks[i] - 1); // block to start\n    }\n\n    // Move diagonal along dim each round\n    // if more y than x, then dim is 1 (i.e. y), else 0\n    int dim  = numBlocks[0] < numBlocks[1] ? 1 : 0;\n    int odim = (dim + 1) % 2;\n    // num blocks in dim dimension\n    size_t maxRounds = numBlocks[dim];\n\n    for (size_t rounds = 0; rounds < maxRounds; ++rounds) {\n      Point p{start[0], start[1]};\n      nextPoint(p, dim, rounds);\n\n      size_t ntries =\n          std::min(block[odim] * (tid + 1), numBlocks[odim]) - start[odim];\n      for (size_t tries = 0; tries < ntries; ++tries) {\n        Task* t = probeBlock(p, 0, 1); // probe block I am currently on\n        if (t) {\n          executeBlock<UseDense>(fn, *t);\n\n          if (useLocks) {\n            for (int i = 0; i < numDims; ++i)\n              locks[i][t->coord[i]].unlock();\n          }\n        }\n\n        for (int i = 0; i < numDims; ++i)\n          nextPoint(p, i, 1);\n      }\n\n      barrier.wait();\n    }\n  }\n\n  // TODO examine this\n  // bulk synchronous diagonals: dynamic assignment within diagonals\n  template <bool UseDense, typename Function>\n  void executeLoopExp2(Function fn, unsigned tid, unsigned total) {\n    Point numBlocks{{locks[0].size(), locks[1].size()}};\n    Point block;\n    Point start;\n    for (int i = 0; i < numDims; ++i) {\n      block[i] = (numBlocks[i] + total - 1) / total;\n      start[i] = std::min(block[i] * tid, numBlocks[i] - 1);\n    }\n\n    // Move diagonal along dim each round\n    int dim          = numBlocks[0] < numBlocks[1] 
? 1 : 0;\n    int odim         = (dim + 1) % 2;\n    size_t maxRounds = numBlocks[dim];\n\n    for (size_t round = 0; round < maxRounds; ++round) {\n      Point base{{start[0], start[1]}};\n      nextPoint(base, dim, round);\n      for (size_t tries = 0; tries < numBlocks[odim]; ++tries) {\n        size_t index = tries + base[odim];\n        if (index >= numBlocks[odim])\n          index -= numBlocks[odim];\n        Point p{};\n        nextPoint(p, dim, round);\n        nextPoint(p, odim, index);\n        nextPoint(p, dim, index);\n\n        Task* t = probeBlock(p, 0, 1);\n        if (!t)\n          continue;\n        executeBlock<UseDense>(fn, *t);\n\n        if (useLocks) {\n          for (int i = 0; i < numDims; ++i)\n            locks[i][t->coord[i]].unlock();\n        }\n      }\n\n      barrier.wait();\n    }\n  }\n\n  // TODO this function is imprecise by virtue of nextBlock being a bad\n  // function\n  /**\n   * Execute a function over the grid. Dynamic work: a thread can potentially\n   * get any block.\n   *\n   * @tparam UseDense dense update (all nodes in block update with all other\n   * nodes) or sparse update (update only if edge exists)\n   * @tparam Type of function specifying how to do update between nodes\n   *\n   * @param fn Function used to update 2 nodes\n   * @param tid Thread id\n   * @param total Total number of threads\n   */\n  template <bool UseDense, typename Function>\n  void executeLoopOrig(Function fn, unsigned tid, unsigned total) {\n    Point numBlocks{{locks[0].size(), locks[1].size()}};\n    Point block;\n    Point start;\n\n    // find out each thread's starting point; essentially what it is doing\n    // is assinging each thread to a block on the diagonal to begin with\n    for (int i = 0; i < numDims; ++i) {\n      block[i] = (numBlocks[i] + total - 1) / total; // blocks per thread\n      start[i] = std::min(block[i] * tid, numBlocks[i] - 1); // block to start\n    }\n\n    unsigned coresPerSocket =\n        
galois::substrate::getThreadPool().getMaxCores() /\n        galois::substrate::getThreadPool().getMaxSockets();\n\n    // if using locks, readjust start Y location of this thread to location of\n    // the thread's socket\n    if (useLocks) {\n      start = {{start[0],\n                std::min(block[1] *\n                             galois::substrate::getThreadPool().getSocket(tid) *\n                             coresPerSocket,\n                         numBlocks[1] - 1)}};\n    }\n\n    Point p = start;\n\n    for (int i = 0;; ++i) {\n      Task* t = nextBlock(p, i == 0);\n      // TODO: Replace with sparse worklist, etc.\n      if (!t)\n        break;\n\n      executeBlock<UseDense>(fn, *t);\n\n      // unlock the task block if using locks (next block returns the task with\n      // the block locked)\n      if (useLocks) {\n        for (int i = 0; i < numDims; ++i) {\n          locks[i][t->coord[i]].unlock();\n        }\n      }\n    }\n  }\n\n  /**\n   * Wrapper for calling a loop executor function.\n   * @tparam UseDense dense update (all nodes in block update with all other\n   * nodes) or sparse update (update only if edge exists)\n   * @tparam Type of function specifying how to do update between nodes\n   *\n   * @param fn Function used to update 2 nodes\n   * @param tid Thread id\n   * @param total Total number of threads\n   */\n  template <bool UseDense, typename Function>\n  void executeLoop(Function fn, unsigned tid, unsigned total) {\n    // if (false && UseExp)\n    //  executeLoopExp2<UseDense>(fn, tid, total);\n    // else\n    executeLoopOrig<UseDense>(fn, tid, total);\n  }\n\n  /**\n   * Given the range of elements in the X dimension and the range of elements\n   * in the Y dimension with their respective sizes, divide the grid of\n   * work into blocks and save the blocks to this structure.\n   *\n   * @param firstX first element in X dimension\n   * @param lastX last element (non inclusive) in X dimension\n   * @param firstY first element in 
Y dimension\n   * @param lastY last element (non inclusive) in Y dimension\n   * @param sizeX desired size of blocks in X dimension\n   * @param sizeY desired size of blocks in Y dimension\n   */\n  void initializeTasks(iterator firstX, iterator lastX, iterator firstY,\n                       iterator lastY, size_t sizeX, size_t sizeY) {\n    const size_t numXBlocks =\n        (std::distance(firstX, lastX) + sizeX - 1) / sizeX;\n    const size_t numYBlocks =\n        (std::distance(firstY, lastY) + sizeY - 1) / sizeY;\n    const size_t numBlocks = numXBlocks * numYBlocks;\n\n    // locks[0].create(numXBlocks);\n    // locks[1].create(numYBlocks);\n    // tasks.create(numBlocks);\n    locks[0].resize(numXBlocks);\n    locks[1].resize(numYBlocks);\n    tasks.resize(numBlocks);\n\n    // TODO parallelize this?\n    // assign each block the X and Y that it is responsible for\n    for (size_t i = 0; i < numBlocks; ++i) {\n      Task& task = tasks[i];\n      task.coord = {{i % numXBlocks, i / numXBlocks}};\n      std::tie(task.startX, task.endX) =\n          galois::block_range(firstX, lastX, task.coord[0], numXBlocks);\n      iterator s;\n      iterator e;\n      std::tie(s, e) =\n          galois::block_range(firstY, lastY, task.coord[1], numYBlocks);\n      // XXX: Works for CSR graphs\n      task.startY        = *s;\n      task.endYInclusive = *e - 1;\n    }\n  }\n\n  /**\n   * Process assigned to each thread. 
Each thread calls execute loop which will\n   * run the provided function over the grid.\n   *\n   * @tparam UseDense dense update (all nodes in block update with all other\n   * nodes) or sparse update (update only if edge exists)\n   * @tparam Function function type\n   */\n  template <bool UseDense, typename Function>\n  struct Process {\n    Fixed2DGraphTiledExecutor* self;\n    Function fn;\n\n    void operator()(unsigned tid, unsigned total) {\n      self->executeLoop<UseDense>(fn, tid, total);\n    }\n  };\n\npublic:\n  Fixed2DGraphTiledExecutor(Graph& g, int cutoff = 0)\n      : g(g), cutoff(cutoff),\n        barrier(galois::runtime::getBarrier(galois::getActiveThreads())) {}\n\n  /**\n   * Report the number of probe block failures to statistics.\n   */\n  ~Fixed2DGraphTiledExecutor() {\n    galois::runtime::reportStat_Single(\"TiledExecutor\", \"ProbeFailures\",\n                                       failedProbes.reduce());\n  }\n\n  /**\n   * Execute a function on a provided X set of nodes and Y set of nodes\n   * for a certain number of iterations. 
Only update nodes x and y if\n   * an edge exists between them (sparse).\n   *\n   * @tparam Function function type\n   *\n   * @param firstX first element in X dimension\n   * @param lastX last element (non inclusive) in X dimension\n   * @param firstY first element in Y dimension\n   * @param lastY last element (non inclusive) in Y dimension\n   * @param sizeX desired size of blocks in X dimension\n   * @param sizeY desired size of blocks in Y dimension\n   * @param fn Function used to update nodes\n   * @param _useLocks true if locks are desired when updating blocks\n   * @param numIterations Max number of iterations to run each block in the\n   * tiled executor for\n   */\n  template <typename Function>\n  void execute(iterator firstX, iterator lastX, iterator firstY, iterator lastY,\n               size_t sizeX, size_t sizeY, Function fn, bool _useLocks,\n               unsigned numIterations = 1) {\n    initializeTasks(firstX, lastX, firstY, lastY, sizeX, sizeY);\n    numTasks   = tasks.size();\n    maxUpdates = numIterations;\n    useLocks   = _useLocks;\n\n    Process<false, Function> p{this, fn};\n\n    galois::on_each(p);\n\n    // TODO remove after worklist fix\n    if (std::any_of(tasks.begin(), tasks.end(),\n                    [this](Task& t) { return t.updates.value < maxUpdates; })) {\n      galois::gWarn(\"Missing tasks\");\n    }\n  }\n\n  /**\n   * Execute a function on a provided X set of nodes and Y set of nodes\n   * for a certain number of iterations. 
Updates nodes x and y regardless\n   * of whether or not an edge exists between them (dense).\n   *\n   * @tparam Function function type\n   *\n   * @param firstX first element in X dimension\n   * @param lastX last element (non inclusive) in X dimension\n   * @param firstY first element in Y dimension\n   * @param lastY last element (non inclusive) in Y dimension\n   * @param sizeX desired size of blocks in X dimension\n   * @param sizeY desired size of blocks in Y dimension\n   * @param fn Function used to update nodes\n   * @param _useLocks true if locks are desired when updating blocks\n   * @param numIterations Max number of iterations to run each block in the\n   * tiled executor for\n   */\n  template <typename Function>\n  void executeDense(iterator firstX, iterator lastX, iterator firstY,\n                    iterator lastY, size_t sizeX, size_t sizeY, Function fn,\n                    bool _useLocks, int numIterations = 1) {\n    initializeTasks(firstX, lastX, firstY, lastY, sizeX, sizeY);\n    numTasks   = tasks.size();\n    maxUpdates = numIterations;\n    useLocks   = _useLocks;\n    Process<true, Function> p{this, fn};\n    galois::on_each(p);\n\n    // TODO remove after worklist fix\n    if (std::any_of(tasks.begin(), tasks.end(),\n                    [this](Task& t) { return t.updates.value < maxUpdates; })) {\n      galois::gWarn(\"Missing tasks\");\n    }\n  }\n};\n\n} // namespace runtime\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/Tracer.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file Tracer.h\n *\n * Includes functions for tracing output and printing data.\n */\n#ifndef GALOIS_RUNTIME_TRACER_H\n#define GALOIS_RUNTIME_TRACER_H\n\n#include <functional>\n#include <sstream>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/EnvCheck.h\"\n#include \"galois/PODResizeableArray.h\"\n\nnamespace galois {\nnamespace runtime {\n\nnamespace internal {\n\n/**\n * Base case for traceImpl; ends the line with a new line.\n */\nstatic inline void traceImpl(std::ostringstream& os) { os << \"\\n\"; }\n\n/**\n * Prints out a value to the output stream.\n */\ntemplate <typename T, typename... Args>\nstatic inline void traceImpl(std::ostringstream& os, T&&, Args&&... 
args) {\n  // os << value << \" \";\n  traceImpl(os, std::forward<Args>(args)...);\n}\n\n/**\n * Format string to os.\n */\nstatic inline void traceFormatImpl(std::ostringstream& os, const char* format) {\n  os << format;\n}\n\n/**\n * Format string to os as well as something else to print.\n */\ntemplate <typename T, typename... Args>\nstatic inline void traceFormatImpl(std::ostringstream& os, const char* format,\n                                   T&& value, Args&&... args) {\n  for (; *format != '\\0'; format++) {\n    if (*format == '%') {\n      os << value;\n      traceFormatImpl(os, format + 1, std::forward<Args>(args)...);\n      return;\n    }\n    os << *format;\n  }\n}\n\n/**\n * Class to print a vector.\n */\ntemplate <typename T>\nclass vecPrinter {\n  const galois::PODResizeableArray<T>& v;\n\npublic:\n  vecPrinter(const galois::PODResizeableArray<T>& _v) : v(_v) {}\n  void print(std::ostream& os) const {\n    os << \"< \" << v.size() << \" : \";\n    for (auto& i : v)\n      os << \" \" << (int)i;\n    os << \">\";\n  }\n};\n\n/**\n * Operator to print a vector given a vecPrinter object\n */\ntemplate <typename T>\nstd::ostream& operator<<(std::ostream& os, const vecPrinter<T>& vp) {\n  vp.print(os);\n  return os;\n}\n\n/**\n * Prints trace data (which has time data included).\n */\nvoid printTrace(std::ostringstream&);\n\n/**\n * Prints out string stream.\n */\nvoid print_output_impl(std::ostringstream&);\n\nextern bool doTrace;\nextern bool initTrace;\n\n} // namespace internal\n\n/**\n * Given a vector, returns a vector printer object that is able\n * to print the vector out onto an output stream.\n */\ntemplate <typename T>\ninternal::vecPrinter<T> printVec(const galois::PODResizeableArray<T>& v) {\n  return internal::vecPrinter<T>(v);\n};\n\n/**\n * Prints a trace log of the provided arguments if debug mode is on.\n */\n#ifdef NDEBUG\ntemplate <typename... Args>\nstatic inline void trace(Args&&...) {}\n#else\ntemplate <typename... 
Args>\nstatic inline void trace(Args&&... args) {\n  if (!internal::initTrace) {\n    internal::doTrace   = substrate::EnvCheck(\"GALOIS_DEBUG_TRACE\");\n    internal::initTrace = true;\n  }\n  if (internal::doTrace) {\n    std::ostringstream os;\n    internal::traceImpl(os, std::forward<Args>(args)...);\n    internal::printTrace(os);\n  }\n}\n#endif\n\n/**\n * Prints data to an output stream.\n *\n * @param format Format string\n * @param args data to print\n */\ntemplate <typename... Args>\nstatic inline void printOutput(const char* format, Args&&... args) {\n  std::ostringstream os;\n  internal::traceFormatImpl(os, format, std::forward<Args>(args)...);\n  internal::print_output_impl(os);\n}\n} // namespace runtime\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/runtime/UserContextAccess.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_RUNTIME_USERCONTEXTACCESS_H\n#define GALOIS_RUNTIME_USERCONTEXTACCESS_H\n\n#include \"galois/config.h\"\n#include \"galois/UserContext.h\"\n\nnamespace galois {\nnamespace runtime {\n\n//! 
Backdoor to allow runtime methods to access private data in UserContext\ntemplate <typename T>\nclass UserContextAccess : public galois::UserContext<T> {\npublic:\n  typedef galois::UserContext<T> SuperTy;\n  typedef typename SuperTy::PushBufferTy PushBufferTy;\n  typedef typename SuperTy::FastPushBack FastPushBack;\n\n  void resetAlloc() { SuperTy::__resetAlloc(); }\n  PushBufferTy& getPushBuffer() { return SuperTy::__getPushBuffer(); }\n  void resetPushBuffer() { SuperTy::__resetPushBuffer(); }\n  SuperTy& data() { return *static_cast<SuperTy*>(this); }\n  void setLocalState(void* p) { SuperTy::__setLocalState(p); }\n  void setFastPushBack(FastPushBack f) { SuperTy::__setFastPushBack(f); }\n  void setBreakFlag(bool* b) {\n    SuperTy::didBreak = b;\n  } // NOLINT(readability-non-const-parameter)\n\n  void setFirstPass(void) { SuperTy::__setFirstPass(); }\n  void resetFirstPass(void) { SuperTy::__resetFirstPass(); }\n};\n\n} // namespace runtime\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/Barrier.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_BARRIER_H\n#define GALOIS_SUBSTRATE_BARRIER_H\n\n#include <functional>\n#include <memory>\n\n#include \"galois/config.h\"\n#include \"galois/gIO.h\"\n#include \"galois/substrate/ThreadPool.h\"\n\nnamespace galois {\nnamespace substrate {\n\nclass Barrier {\npublic:\n  virtual ~Barrier();\n\n  // not safe if any thread is in wait\n  virtual void reinit(unsigned val) = 0;\n\n  // Wait at this barrier\n  virtual void wait() = 0;\n\n  // wait at this barrier\n  void operator()(void) { wait(); }\n\n  // barrier type.\n  virtual const char* name() const = 0;\n};\n\n/**\n * Return a reference to system barrier\n */\nBarrier& getBarrier(unsigned activeThreads);\n\n/**\n * Create specific types of barriers.  For benchmarking only.  
Use\n * getBarrier() for all production code\n */\nstd::unique_ptr<Barrier> createPthreadBarrier(unsigned);\nstd::unique_ptr<Barrier> createMCSBarrier(unsigned);\nstd::unique_ptr<Barrier> createTopoBarrier(unsigned);\nstd::unique_ptr<Barrier> createCountingBarrier(unsigned);\nstd::unique_ptr<Barrier> createDisseminationBarrier(unsigned);\n\n/**\n * Creates a new simple barrier. This barrier is not designed to be fast but\n * does guarantee that all threads have left the barrier before returning\n * control. Useful when the number of active threads is modified to avoid a\n * race in {@link getBarrier()}.  Client is responsible for deallocating\n * returned barrier.\n */\nstd::unique_ptr<Barrier> createSimpleBarrier(unsigned int);\n\nnamespace internal {\n\ntemplate <typename _UNUSED = void>\nstruct BarrierInstance {\n  unsigned m_num_threads;\n  std::unique_ptr<Barrier> m_barrier;\n\n  BarrierInstance(void) {\n    m_num_threads = getThreadPool().getMaxThreads();\n    m_barrier     = createTopoBarrier(m_num_threads);\n  }\n\n  Barrier& get(unsigned numT) {\n    GALOIS_ASSERT(numT > 0,\n                  \"substrate::getBarrier() number of threads must be > 0\");\n\n    numT = std::min(numT, getThreadPool().getMaxUsableThreads());\n    numT = std::max(numT, 1u);\n\n    if (numT != m_num_threads) {\n      m_num_threads = numT;\n      m_barrier->reinit(numT);\n    }\n\n    return *m_barrier;\n  }\n};\n\nvoid setBarrierInstance(BarrierInstance<>* bi);\n\n} // end namespace internal\n\n} // end namespace substrate\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/CacheLineStorage.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_CACHELINESTORAGE_H\n#define GALOIS_SUBSTRATE_CACHELINESTORAGE_H\n\n#include <utility>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n\nnamespace galois::substrate {\n\n// Store an item with padding\ntemplate <typename T>\nstruct CacheLineStorage {\n  alignas(GALOIS_CACHE_LINE_SIZE) T data;\n\n  char buffer[GALOIS_CACHE_LINE_SIZE - (sizeof(T) % GALOIS_CACHE_LINE_SIZE)];\n  // static_assert(sizeof(T) < GALOIS_CACHE_LINE_SIZE, \"Too large a type\");\n\n  CacheLineStorage() : data() {}\n  CacheLineStorage(const T& v) : data(v) {}\n\n  template <typename A>\n  explicit CacheLineStorage(A&& v) : data(std::forward<A>(v)) {}\n\n  explicit operator T() { return data; }\n\n  T& get() { return data; }\n  template <typename V>\n  CacheLineStorage& operator=(const V& v) 
{\n    data = v;\n    return *this;\n  }\n};\n\n} // namespace galois::substrate\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/CompilerSpecific.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_COMPILERSPECIFIC_H\n#define GALOIS_SUBSTRATE_COMPILERSPECIFIC_H\n\n#include \"galois/config.h\"\n\nnamespace galois::substrate {\n\ninline static void asmPause() {\n#if defined(__i386__) || defined(__amd64__)\n  //  __builtin_ia32_pause();\n  asm volatile(\"pause\");\n#endif\n}\n\ninline static void compilerBarrier() { asm volatile(\"\" ::: \"memory\"); }\n\n// xeons have 64 byte cache lines, but will prefetch 2 at a time\nconstexpr int GALOIS_CACHE_LINE_SIZE = 128;\n\n#if defined(__INTEL_COMPILER)\n#define GALOIS_ATTRIBUTE_NOINLINE __attribute__((noinline))\n\n#elif defined(__GNUC__)\n#define GALOIS_ATTRIBUTE_NOINLINE __attribute__((noinline))\n\n#else\n#define GALOIS_ATTRIBUTE_NOINLINE\n#endif\n\n} // namespace galois::substrate\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/EnvCheck.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_ENVCHECK_H\n#define GALOIS_SUBSTRATE_ENVCHECK_H\n\n#include <cassert>\n#include <string>\n\n#include \"galois/config.h\"\n\nnamespace galois {\nnamespace substrate {\n\nnamespace internal {\n\ntemplate <typename T>\nstruct ConvByType {};\n\ntemplate <>\nstruct ConvByType<int> {\n  static void go(const char* varVal, int& ret) {\n    assert(varVal);\n    ret = std::atoi(varVal);\n  }\n};\n\ntemplate <>\nstruct ConvByType<double> {\n  static void go(const char* varVal, double& ret) {\n    assert(varVal);\n    ret = std::atof(varVal);\n  }\n};\n\ntemplate <>\nstruct ConvByType<std::string> {\n  static void go(const char* varVal, std::string& ret) {\n    assert(varVal);\n    ret = varVal;\n  }\n};\n\ntemplate <typename T>\nbool genericGetEnv(const char* varName, T& ret) {\n\n  char* varVal = 
getenv(varName);\n  if (varVal) {\n    ConvByType<T>::go(varVal, ret);\n    return true;\n  } else {\n    return false;\n  }\n}\n\n} // end namespace internal\n\n//! Return true if the environment variable is set\nbool EnvCheck(const char* varName);\nbool EnvCheck(const std::string& varName);\n\n/**\n * Return true if the environment variable is set, and extract its value into\n * 'retVal' parameter\n * @param varName: name of the variable\n * @param retVal: lvalue to store the value of environment variable\n * @return true if environment variable set, false otherwise\n */\ntemplate <typename T>\nbool EnvCheck(const char* varName, T& retVal) {\n  return internal::genericGetEnv(varName, retVal);\n}\n\ntemplate <typename T>\nbool EnvCheck(const std::string& varName, T& retVal) {\n  return EnvCheck(varName.c_str(), retVal);\n}\n\n} // end namespace substrate\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/HWTopo.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_HWTOPO_H\n#define GALOIS_SUBSTRATE_HWTOPO_H\n\n#include <string>\n#include <vector>\n\n#include \"galois/config.h\"\n\nnamespace galois::substrate {\n\nstruct ThreadTopoInfo {\n  unsigned tid;                 // this thread (galois id)\n  unsigned socketLeader;        // first thread id in tid's socket\n  unsigned socket;              // socket (L3 normally) of thread\n  unsigned numaNode;            // memory bank.  
may be different than socket.\n  unsigned cumulativeMaxSocket; // max socket id seen from [0, tid]\n  unsigned osContext;           // OS ID to use for thread binding\n  unsigned osNumaNode;          // OS ID for numa node\n};\n\nstruct MachineTopoInfo {\n  unsigned maxThreads;\n  unsigned maxCores;\n  unsigned maxSockets;\n  unsigned maxNumaNodes;\n};\n\nstruct HWTopoInfo {\n  MachineTopoInfo machineTopoInfo;\n  std::vector<ThreadTopoInfo> threadTopoInfo;\n};\n\n/**\n * getHWTopo determines the machine topology from the process information\n * exposed in /proc and /dev filesystems.\n */\nHWTopoInfo getHWTopo();\n\n/**\n * parseCPUList parses cpuset information in \"List format\" as described in\n * cpuset(7) and available under /proc/self/status\n */\nstd::vector<int> parseCPUList(const std::string& in);\n\n/**\n * bindThreadSelf binds a thread to an osContext as returned by getHWTopo.\n */\nbool bindThreadSelf(unsigned osContext);\n\n} // namespace galois::substrate\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/NumaMem.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_NUMAMEM\n#define GALOIS_SUBSTRATE_NUMAMEM\n\n#include <cstddef>\n#include <memory>\n#include <vector>\n\n#include \"galois/config.h\"\n\nnamespace galois {\nnamespace substrate {\n\nnamespace internal {\nstruct largeFreer {\n  size_t bytes;\n  void operator()(void* ptr) const;\n};\n} // namespace internal\n\ntypedef std::unique_ptr<void, internal::largeFreer> LAptr;\n\nLAptr largeMallocLocal(size_t bytes);    // fault in locally\nLAptr largeMallocFloating(size_t bytes); // leave numa mapping undefined\n// fault in interleaved mapping\nLAptr largeMallocInterleaved(size_t bytes, unsigned numThreads);\n// fault in block interleaved mapping\nLAptr largeMallocBlocked(size_t bytes, unsigned numThreads);\n\n// fault in specified regions for each thread (threadRanges)\ntemplate <typename 
RangeArrayTy>\nLAptr largeMallocSpecified(size_t bytes, uint32_t numThreads,\n                           RangeArrayTy& threadRanges, size_t elementSize);\n\n} // namespace substrate\n} // namespace galois\n\n#endif // GALOIS_SUBSTRATE_NUMAMEM\n"
  },
  {
    "path": "libgalois/include/galois/substrate/PaddedLock.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_PADDEDLOCK_H\n#define GALOIS_SUBSTRATE_PADDEDLOCK_H\n\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/substrate/CacheLineStorage.h\"\n\nnamespace galois {\nnamespace substrate {\n\n/// PaddedLock is a spinlock.  
If the `concurrent` template parameter is\n/// false, the lock is a no-op.\ntemplate <bool concurrent>\nclass PaddedLock;\n\ntemplate <>\nclass PaddedLock<true> {\n  mutable CacheLineStorage<SimpleLock> Lock;\n\npublic:\n  void lock() const { Lock.get().lock(); }\n  bool try_lock() const { return Lock.get().try_lock(); }\n  void unlock() const { Lock.get().unlock(); }\n};\n\ntemplate <>\nclass PaddedLock<false> {\npublic:\n  void lock() const {}\n  bool try_lock() const { return true; }\n  void unlock() const {}\n};\n\n} // end namespace substrate\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/PageAlloc.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_PAGEALLOC_H\n#define GALOIS_SUBSTRATE_PAGEALLOC_H\n\n#include <cstddef>\n\n#include \"galois/config.h\"\n\n#ifdef __linux__\n#include <linux/mman.h>\n#endif\n#include <sys/mman.h>\n\n#include <utility>\n#ifdef HAVE_MMAP64\nnamespace galois {\ntemplate <typename... Args>\nvoid* mmap(void* addr, Args... args) { // 0 -> nullptr\n  return ::mmap64(addr, std::forward<Args>(args)...);\n}\n} // namespace galois\n//! offset type for mmap\ntypedef off64_t offset_t;\n#else\nnamespace galois {\ntemplate <typename... Args>\nvoid* mmap(void* addr, Args... args) { // 0 -> nullptr\n  return ::mmap(addr, std::forward<Args>(args)...);\n}\n} // namespace galois\n//! 
offset type for mmap\ntypedef off_t offset_t;\n#endif\n\n// mmap flags\n#if defined(MAP_ANONYMOUS)\nstatic const int _MAP_ANON = MAP_ANONYMOUS;\n#elif defined(MAP_ANON)\nstatic const int _MAP_ANON = MAP_ANON;\n#else\nstatic_assert(0, \"No Anonymous mapping\");\n#endif\n\nnamespace galois {\nnamespace substrate {\n\n// size of pages\nsize_t allocSize();\n\n// allocate contiguous pages, optionally faulting them in\nvoid* allocPages(unsigned num, bool preFault);\n\n// free page range\nvoid freePages(void* ptr, unsigned num);\n\n} // namespace substrate\n} // namespace galois\n\n#endif // GALOIS_SUBSTRATE_PAGEALLOC_H\n"
  },
  {
    "path": "libgalois/include/galois/substrate/PerThreadStorage.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_PERTHREADSTORAGE_H\n#define GALOIS_SUBSTRATE_PERTHREADSTORAGE_H\n\n#include <cassert>\n#include <cstddef>\n#include <utility>\n#include <vector>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/HWTopo.h\"\n#include \"galois/substrate/PaddedLock.h\"\n#include \"galois/substrate/ThreadPool.h\"\n\nnamespace galois {\nnamespace substrate {\n\nclass PerBackend {\n  typedef substrate::SimpleLock Lock;\n\n  std::atomic<unsigned int> nextLoc{0};\n  std::atomic<char*>* heads{nullptr};\n  Lock freeOffsetsLock;\n  std::vector<std::vector<unsigned>> freeOffsets;\n  /**\n   * Guards access to non-POD objects that can be accessed after PerBackend\n   * is destroyed. 
Access can occur through destroying PerThread/PerSocket\n   * objects with static storage duration, which have a reference to a\n   * PerBackend object, which may have been destroyed before the PerThread\n   * object itself.\n   */\n  bool invalid{false};\n\n  void initCommon(unsigned maxT);\n  static unsigned nextLog2(unsigned size);\n\npublic:\n  PerBackend();\n\n  PerBackend(const PerBackend&) = delete;\n  PerBackend& operator=(const PerBackend&) = delete;\n\n  ~PerBackend() {\n    // Intentionally leak heads so that other PerThread operations are\n    // still valid after we are gone\n    invalid = true;\n  }\n\n  char* initPerThread(unsigned maxT);\n  char* initPerSocket(unsigned maxT);\n\n  unsigned allocOffset(const unsigned size);\n  void deallocOffset(const unsigned offset, const unsigned size);\n  void* getRemote(unsigned thread, unsigned offset);\n  void* getLocal(unsigned offset, char* base) { return &base[offset]; }\n  // faster when (1) you already know the id and (2) shared access to heads is\n  // not too expensive; otherwise use getLocal(unsigned,char*)\n  void* getLocal(unsigned offset, unsigned id) { return &heads[id][offset]; }\n};\n\nextern thread_local char* ptsBase;\nPerBackend& getPTSBackend();\n\nextern thread_local char* pssBase;\nPerBackend& getPPSBackend();\n\nvoid initPTS(unsigned maxT);\n\ntemplate <typename T>\nclass PerThreadStorage {\nprotected:\n  PerBackend* b;\n  unsigned offset;\n\n  void destruct() {\n    if (offset == ~0U)\n      return;\n\n    for (unsigned n = 0; n < getThreadPool().getMaxThreads(); ++n)\n      reinterpret_cast<T*>(b->getRemote(n, offset))->~T();\n    b->deallocOffset(offset, sizeof(T));\n    offset = ~0U;\n  }\n\npublic:\n  // construct on each thread\n  template <typename... Args>\n  PerThreadStorage(Args&&... 
args) : b(&getPTSBackend()) {\n    // in case we make one of these before initializing the thread pool\n    // This will call initPTS for each thread if it hasn't already\n    auto& tp = getThreadPool();\n\n    offset = b->allocOffset(sizeof(T));\n    for (unsigned n = 0; n < tp.getMaxThreads(); ++n)\n      new (b->getRemote(n, offset)) T(std::forward<Args>(args)...);\n  }\n\n  PerThreadStorage(PerThreadStorage&& rhs) : b(rhs.b), offset(rhs.offset) {\n    rhs.offset = ~0;\n  }\n\n  ~PerThreadStorage() { destruct(); }\n\n  PerThreadStorage& operator=(PerThreadStorage&& rhs) {\n    std::swap(offset, rhs.offset);\n    std::swap(b, rhs.b);\n    return *this;\n  }\n\n  T* getLocal() {\n    void* ditem = b->getLocal(offset, ptsBase);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  const T* getLocal() const {\n    void* ditem = b->getLocal(offset, ptsBase);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  //! Like getLocal() but optimized for when you already know the thread id\n  T* getLocal(unsigned int thread) {\n    void* ditem = b->getLocal(offset, thread);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  const T* getLocal(unsigned int thread) const {\n    void* ditem = b->getLocal(offset, thread);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  T* getRemote(unsigned int thread) {\n    void* ditem = b->getRemote(thread, offset);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  const T* getRemote(unsigned int thread) const {\n    void* ditem = b->getRemote(thread, offset);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  unsigned size() const { return getThreadPool().getMaxThreads(); }\n};\n\ntemplate <typename T>\nclass PerSocketStorage {\nprotected:\n  unsigned offset;\n  PerBackend& b;\n\n  void destruct() {\n    auto& tp = getThreadPool();\n    for (unsigned n = 0; n < tp.getMaxSockets(); ++n)\n      reinterpret_cast<T*>(b.getRemote(tp.getLeaderForSocket(n), offset))->~T();\n    b.deallocOffset(offset, sizeof(T));\n  }\n\npublic:\n  template 
<typename... Args>\n  PerSocketStorage(Args&&... args) : b(getPPSBackend()) {\n    // in case we make one of these before initializing the thread pool\n    // This will call initPTS for each thread if it hasn't already\n    getThreadPool();\n\n    offset   = b.allocOffset(sizeof(T));\n    auto& tp = getThreadPool();\n    for (unsigned n = 0; n < tp.getMaxSockets(); ++n)\n      new (b.getRemote(tp.getLeaderForSocket(n), offset))\n          T(std::forward<Args>(args)...);\n  }\n\n  PerSocketStorage(PerSocketStorage&& o)\n      : offset(std::move(o.offset)), b(getPPSBackend()) {}\n  PerSocketStorage& operator=(PerSocketStorage&& o) {\n    destruct();\n    offset = std::move(o.offset);\n    return *this;\n  }\n\n  PerSocketStorage(const PerSocketStorage&) = delete;\n  PerSocketStorage& operator=(const PerSocketStorage&) = delete;\n\n  ~PerSocketStorage() { destruct(); }\n\n  T* getLocal() {\n    void* ditem = b.getLocal(offset, pssBase);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  const T* getLocal() const {\n    void* ditem = b.getLocal(offset, pssBase);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  //! 
Like getLocal() but optimized for when you already know the thread id\n  T* getLocal(unsigned int thread) {\n    void* ditem = b.getLocal(offset, thread);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  const T* getLocal(unsigned int thread) const {\n    void* ditem = b.getLocal(offset, thread);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  T* getRemote(unsigned int thread) {\n    void* ditem = b.getRemote(thread, offset);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  const T* getRemote(unsigned int thread) const {\n    void* ditem = b.getRemote(thread, offset);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  T* getRemoteByPkg(unsigned int pkg) {\n    void* ditem = b.getRemote(getThreadPool().getLeaderForSocket(pkg), offset);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  const T* getRemoteByPkg(unsigned int pkg) const {\n    void* ditem = b.getRemote(getThreadPool().getLeaderForSocket(pkg), offset);\n    return reinterpret_cast<T*>(ditem);\n  }\n\n  unsigned size() const { return getThreadPool().getMaxThreads(); }\n};\n\n} // namespace substrate\n} // end namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/PtrLock.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_PTRLOCK_H\n#define GALOIS_SUBSTRATE_PTRLOCK_H\n\n#include <cstdint>\n#include <cassert>\n#include <atomic>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n\nnamespace galois {\nnamespace substrate {\n\nnamespace internal {\nvoid ptr_slow_lock(std::atomic<uintptr_t>& l);\n}\n\n/// PtrLock is a spinlock and a pointer.  
This wraps a pointer and\n/// uses the low order bit for the lock flag. Copying a lock is\n/// unsynchronized (relaxed ordering)\n\ntemplate <typename T>\nclass PtrLock {\n  std::atomic<uintptr_t> _lock;\n\n  //  static_assert(alignof(T) > 1, \"Bad data type alignment for PtrLock\");\n\npublic:\n  constexpr PtrLock() : _lock(0) {}\n  // relaxed order for copy\n  PtrLock(const PtrLock& p) : _lock(p._lock.load(std::memory_order_relaxed)) {}\n\n  PtrLock& operator=(const PtrLock& p) {\n    if (&p == this)\n      return *this;\n    // relaxed order for initialization\n    _lock.store(p._lock.load(std::memory_order_relaxed),\n                std::memory_order_relaxed);\n    return *this;\n  }\n\n  inline void lock() {\n    uintptr_t oldval = _lock.load(std::memory_order_relaxed);\n    if (oldval & 1)\n      goto slow_path;\n    if (!_lock.compare_exchange_weak(oldval, oldval | 1,\n                                     std::memory_order_acq_rel,\n                                     std::memory_order_relaxed))\n      goto slow_path;\n    assert(is_locked());\n    return;\n\n  slow_path:\n    internal::ptr_slow_lock(_lock);\n  }\n\n  inline void unlock() {\n    assert(is_locked());\n    _lock.store(_lock.load(std::memory_order_relaxed) & ~(uintptr_t)1,\n                std::memory_order_release);\n  }\n\n  inline void unlock_and_clear() {\n    assert(is_locked());\n    _lock.store(0, std::memory_order_release);\n  }\n\n  inline void unlock_and_set(T* val) {\n    assert(is_locked());\n    assert(!((uintptr_t)val & 1));\n    _lock.store((uintptr_t)val, std::memory_order_release);\n  }\n\n  inline T* getValue() const {\n    return (T*)(_lock.load(std::memory_order_relaxed) & ~(uintptr_t)1);\n  }\n\n  inline void setValue(T* val) {\n    uintptr_t nval = (uintptr_t)val;\n    nval |= (_lock & 1);\n    // relaxed OK since this doesn't clear lock\n    _lock.store(nval, std::memory_order_relaxed);\n  }\n\n  inline bool try_lock() {\n    uintptr_t oldval = 
_lock.load(std::memory_order_relaxed);\n    if ((oldval & 1) != 0)\n      return false;\n    oldval = _lock.fetch_or(1, std::memory_order_acq_rel);\n    return !(oldval & 1);\n  }\n\n  inline bool is_locked() const {\n    return _lock.load(std::memory_order_acquire) & 1;\n  }\n\n  //! CAS only works on unlocked values\n  //! the lock bit will prevent a successful cas\n  inline bool CAS(T* oldval, T* newval) {\n    assert(!((uintptr_t)oldval & 1) && !((uintptr_t)newval & 1));\n    uintptr_t old = (uintptr_t)oldval;\n    return _lock.compare_exchange_strong(old, (uintptr_t)newval);\n  }\n\n  //! CAS that works on locked values; this can be very dangerous\n  //! when used incorrectly\n  inline bool stealing_CAS(T* oldval, T* newval) {\n    uintptr_t old = 1 | (uintptr_t)oldval;\n    return _lock.compare_exchange_strong(old, 1 | (uintptr_t)newval);\n  }\n};\n\ntemplate <typename T>\nclass DummyPtrLock {\n  T* _lock;\n\npublic:\n  DummyPtrLock() : _lock() {}\n\n  inline void lock() {}\n  inline void unlock() {}\n  inline void unlock_and_clear() { _lock = 0; }\n  inline void unlock_and_set(T* val) { _lock = val; }\n  inline T* getValue() const { return _lock; }\n  inline void setValue(T* val) { _lock = val; }\n  inline bool try_lock() const { return true; }\n  inline bool is_locked() const { return false; }\n  inline bool CAS(T* oldval, T* newval) {\n    if (_lock == oldval) {\n      _lock = newval;\n      return true;\n    }\n    return false;\n  }\n  inline bool stealing_CAS(T* oldval, T* newval) { return CAS(oldval, newval); }\n};\n\n} // end namespace substrate\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/SharedMem.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_SHAREDMEM_H\n#define GALOIS_SUBSTRATE_SHAREDMEM_H\n\n#include <memory>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/substrate/Barrier.h\"\n#include \"galois/substrate/Termination.h\"\n\nnamespace galois::substrate {\n\nclass SharedMem {\n\n  // Order is critical here\n  ThreadPool m_tpool;\n\n  std::unique_ptr<internal::LocalTerminationDetection<>> m_termPtr;\n  std::unique_ptr<internal::BarrierInstance<>> m_biPtr;\n\npublic:\n  /**\n   * Initializes the Substrate library components\n   */\n  SharedMem();\n\n  /**\n   * Destroys the Substrate library components\n   */\n  ~SharedMem();\n\n  SharedMem(const SharedMem&) = delete;\n  SharedMem& operator=(const SharedMem&) = delete;\n\n  SharedMem(SharedMem&&) = delete;\n  SharedMem& 
operator=(SharedMem&&) = delete;\n};\n\n} // namespace galois::substrate\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/SimpleLock.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_SIMPLELOCK_H\n#define GALOIS_SUBSTRATE_SIMPLELOCK_H\n\n#include <atomic>\n#include <cassert>\n#include <mutex>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n\nnamespace galois {\nnamespace substrate {\n\n/// SimpleLock is a spinlock.\n/// Copying a lock is unsynchronized (relaxed ordering)\n\nclass SimpleLock {\n  mutable std::atomic<int> _lock;\n  void slow_lock() const;\n\npublic:\n  constexpr SimpleLock() : _lock(0) {}\n  // relaxed order for copy\n  SimpleLock(const SimpleLock& p)\n      : _lock(p._lock.load(std::memory_order_relaxed)) {}\n\n  SimpleLock& operator=(const SimpleLock& p) {\n    if (&p == this)\n      return *this;\n    // relaxed order for initialization\n    _lock.store(p._lock.load(std::memory_order_relaxed),\n                
std::memory_order_relaxed);\n    return *this;\n  }\n\n  inline void lock() const {\n    int oldval = 0;\n    if (_lock.load(std::memory_order_relaxed))\n      goto slow_path;\n    if (!_lock.compare_exchange_weak(oldval, 1, std::memory_order_acq_rel,\n                                     std::memory_order_relaxed))\n      goto slow_path;\n    assert(is_locked());\n    return;\n  slow_path:\n    slow_lock();\n  }\n\n  inline void unlock() const {\n    assert(is_locked());\n    // HMMMM\n    _lock.store(0, std::memory_order_release);\n    //_lock = 0;\n  }\n\n  inline bool try_lock() const {\n    int oldval = 0;\n    if (_lock.load(std::memory_order_relaxed))\n      return false;\n    if (!_lock.compare_exchange_weak(oldval, 1, std::memory_order_acq_rel))\n      return false;\n    assert(is_locked());\n    return true;\n  }\n\n  inline bool is_locked() const {\n    return _lock.load(std::memory_order_acquire) & 1;\n  }\n};\n\n//! Dummy Lock implements the lock interface without a lock for serial code\n\nclass DummyLock {\npublic:\n  inline void lock() const {}\n  inline void unlock() const {}\n  inline bool try_lock() const { return true; }\n  inline bool is_locked() const { return false; }\n};\n\ntemplate <bool Enabled>\nusing CondLock =\n    typename std::conditional<Enabled, SimpleLock, DummyLock>::type;\n\nusing lock_guard_galois = std::lock_guard<SimpleLock>;\n\n#define MAKE_LOCK_GUARD(__x)                                                   \\\n  galois::substrate::lock_guard_galois locker##___COUNTER__(__x)\n\n} // end namespace substrate\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/StaticInstance.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_STATICINSTANCE_H\n#define GALOIS_SUBSTRATE_STATICINSTANCE_H\n\n#include \"galois/config.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n\nnamespace galois {\nnamespace substrate {\n\n// This should be much simpler in c++03 mode, but be general for now\n// This exists because ptrlock is not a pod, but this is.\ntemplate <typename T>\nstruct StaticInstance {\n  volatile T* V;\n  volatile int _lock;\n\n  inline void lock() {\n    int oldval;\n    do {\n      while (_lock != 0) {\n        substrate::asmPause();\n      }\n      oldval = __sync_fetch_and_or(&_lock, 1);\n    } while (oldval & 1);\n  }\n\n  inline void unlock() {\n    compilerBarrier();\n    _lock = 0;\n  }\n\n  T* get() {\n    volatile T* val = V;\n    if (val)\n      return (T*)val;\n    lock();\n    val = V;\n    if (!val)\n 
     V = val = new T();\n    unlock();\n    return (T*)val;\n  }\n};\n\n} // end namespace substrate\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/Termination.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_TERMINATION_H\n#define GALOIS_SUBSTRATE_TERMINATION_H\n\n#include <atomic>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/CacheLineStorage.h\"\n\nnamespace galois {\nnamespace substrate {\n\nclass TerminationDetection;\n/*\n * returns an object.  
The object will be reused, but reinitialized to\n * activeThreads\n */\nTerminationDetection& getSystemTermination(unsigned activeThreads);\n\nclass TerminationDetection {\n\n  friend TerminationDetection& getSystemTermination(unsigned);\n\nprotected:\n  CacheLineStorage<std::atomic<int>> globalTerm;\n\n  /**\n   * for internal use by child classes\n   */\n  virtual void init(unsigned activeThreads) = 0;\n\npublic:\n  virtual ~TerminationDetection(void);\n  /**\n   * Initializes the per-thread state.  All threads must call this\n   * before any call localTermination.\n   */\n  virtual void initializeThread() = 0;\n\n  /**\n   * Process termination locally.  May be called as often as needed.  The\n   * argument workHappened signals that since last time it was called, some\n   * progress was made that should prevent termination. All threads must call\n   * initializeThread() before any thread calls this function.  This function\n   * should not be on the fast path (this is why it takes a flag, to allow the\n   * caller to buffer up work status changes).\n   */\n  virtual void localTermination(bool workHappened) = 0;\n\n  /**\n   * Returns whether global termination is detected.\n   */\n  bool globalTermination() const { return globalTerm.data; }\n};\n\nnamespace internal {\n// Dijkstra style 2-pass ring termination detection\ntemplate <typename _UNUSED = void>\nclass LocalTerminationDetection : public TerminationDetection {\n\n  struct TokenHolder {\n    friend class TerminationDetection;\n    std::atomic<long> tokenIsBlack;\n    std::atomic<long> hasToken;\n    long processIsBlack;\n    bool lastWasWhite; // only used by the master\n  };\n\n  galois::substrate::PerThreadStorage<TokenHolder> data;\n\n  unsigned activeThreads;\n\n  // send token onwards\n  void propToken(bool isBlack) {\n    unsigned id     = ThreadPool::getTID();\n    TokenHolder& th = *data.getRemote((id + 1) % activeThreads);\n    th.tokenIsBlack = isBlack;\n    th.hasToken     = true;\n  }\n\n  
void propGlobalTerm() { globalTerm = true; }\n\n  bool isSysMaster() const { return ThreadPool::getTID() == 0; }\n\nprotected:\n  virtual void init(unsigned aThreads) { activeThreads = aThreads; }\n\npublic:\n  LocalTerminationDetection() {}\n\n  virtual void initializeThread() {\n    TokenHolder& th   = *data.getLocal();\n    th.tokenIsBlack   = false;\n    th.processIsBlack = true;\n    th.lastWasWhite   = true;\n    globalTerm        = false;\n    if (isSysMaster())\n      th.hasToken = true;\n    else\n      th.hasToken = false;\n  }\n\n  virtual void localTermination(bool workHappened) {\n    assert(!(workHappened && globalTerm.get()));\n    TokenHolder& th = *data.getLocal();\n    th.processIsBlack |= workHappened;\n    if (th.hasToken) {\n      if (isSysMaster()) {\n        bool failed     = th.tokenIsBlack || th.processIsBlack;\n        th.tokenIsBlack = th.processIsBlack = false;\n        if (th.lastWasWhite && !failed) {\n          // This was the second success\n          propGlobalTerm();\n          return;\n        }\n        th.lastWasWhite = !failed;\n      }\n      // Normal thread or recirc by master\n      assert(!globalTerm.get() &&\n             \"no token should be in progress after globalTerm\");\n      bool taint        = th.processIsBlack || th.tokenIsBlack;\n      th.processIsBlack = th.tokenIsBlack = false;\n      th.hasToken                         = false;\n      propToken(taint);\n    }\n  }\n};\n\n// Dijkstra style 2-pass tree termination detection\ntemplate <typename _UNUSED = void>\nclass TreeTerminationDetection : public TerminationDetection {\n  static const int num = 2;\n\n  struct TokenHolder {\n    friend class TerminationDetection;\n    // incoming from above\n    volatile long down_token;\n    // incoming from below\n    volatile long up_token[num];\n    // my state\n    long processIsBlack;\n    bool hasToken;\n    bool lastWasWhite; // only used by the master\n    int parent;\n    int parent_offset;\n    TokenHolder* 
child[num];\n  };\n\n  PerThreadStorage<TokenHolder> data;\n\n  unsigned activeThreads;\n\n  void processToken() {\n    TokenHolder& th = *data.getLocal();\n    // int myid = LL::getTID();\n    // have all up tokens?\n    bool haveAll = th.hasToken;\n    bool black   = th.processIsBlack;\n    for (int i = 0; i < num; ++i) {\n      if (th.child[i]) {\n        if (th.up_token[i] == -1)\n          haveAll = false;\n        else\n          black |= th.up_token[i];\n      }\n    }\n    // Have the tokens, propagate\n    if (haveAll) {\n      th.processIsBlack = false;\n      th.hasToken       = false;\n      if (isSysMaster()) {\n        if (th.lastWasWhite && !black) {\n          // This was the second success\n          propGlobalTerm();\n          return;\n        }\n        th.lastWasWhite = !black;\n        th.down_token   = true;\n      } else {\n        data.getRemote(th.parent)->up_token[th.parent_offset] = black;\n      }\n    }\n\n    // received a down token, propagate\n    if (th.down_token) {\n      th.down_token = false;\n      th.hasToken   = true;\n      for (int i = 0; i < num; ++i) {\n        th.up_token[i] = -1;\n        if (th.child[i])\n          th.child[i]->down_token = true;\n      }\n    }\n  }\n\n  void propGlobalTerm() { globalTerm = true; }\n\n  bool isSysMaster() const { return ThreadPool::getTID() == 0; }\n\nprotected:\n  virtual void init(unsigned aThreads) { activeThreads = aThreads; }\n\npublic:\n  TreeTerminationDetection() {}\n\n  virtual void initializeThread() {\n    TokenHolder& th = *data.getLocal();\n    th.down_token   = false;\n    for (int i = 0; i < num; ++i)\n      th.up_token[i] = false;\n    th.processIsBlack = true;\n    th.hasToken       = false;\n    th.lastWasWhite   = false;\n    globalTerm        = false;\n    auto tid          = ThreadPool::getTID();\n    th.parent         = (tid - 1) / num;\n    th.parent_offset  = (tid - 1) % num;\n    for (unsigned i = 0; i < num; ++i) {\n      unsigned cn = tid * num + i + 1;\n   
   if (cn < activeThreads)\n        th.child[i] = data.getRemote(cn);\n      else\n        th.child[i] = 0;\n    }\n    if (isSysMaster()) {\n      th.down_token = true;\n    }\n  }\n\n  virtual void localTermination(bool workHappened) {\n    assert(!(workHappened && globalTerm.get()));\n    TokenHolder& th = *data.getLocal();\n    th.processIsBlack |= workHappened;\n    processToken();\n  }\n};\n\nvoid setTermDetect(TerminationDetection* term);\n} // end namespace internal\n\n} // namespace substrate\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/ThreadPool.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_THREADPOOL_H\n#define GALOIS_SUBSTRATE_THREADPOOL_H\n\n#include <atomic>\n#include <cassert>\n#include <condition_variable>\n#include <cstdlib>\n#include <functional>\n#include <thread>\n#include <vector>\n\n#include \"galois/substrate/CacheLineStorage.h\"\n#include \"galois/substrate/HWTopo.h\"\n\nnamespace galois::substrate::internal {\n\ntemplate <typename tpl, int s, int r>\nstruct ExecuteTupleImpl {\n  static inline void execute(tpl& cmds) {\n    std::get<s>(cmds)();\n    ExecuteTupleImpl<tpl, s + 1, r - 1>::execute(cmds);\n  }\n};\n\ntemplate <typename tpl, int s>\nstruct ExecuteTupleImpl<tpl, s, 0> {\n  static inline void execute(tpl&) {}\n};\n\n} // namespace galois::substrate::internal\n\nnamespace galois::substrate {\n\nclass ThreadPool {\n  friend class SharedMem;\n\nprotected:\n  
struct shutdown_ty {}; //! type for shutting down thread\n  struct fastmode_ty {\n    bool mode;\n  }; //! type for setting fastmode\n  struct dedicated_ty {\n    std::function<void(void)> fn;\n  }; //! type to switch to dedicated mode\n\n  //! Per-thread mailboxes for notification\n  struct per_signal {\n    std::condition_variable cv;\n    std::mutex m;\n    unsigned wbegin, wend;\n    std::atomic<int> done;\n    std::atomic<int> fastRelease;\n    ThreadTopoInfo topo;\n\n    void wakeup(bool fastmode) {\n      if (fastmode) {\n        done        = 0;\n        fastRelease = 1;\n      } else {\n        std::lock_guard<std::mutex> lg(m);\n        done = 0;\n        cv.notify_one();\n        // start.release();\n      }\n    }\n\n    void wait(bool fastmode) {\n      if (fastmode) {\n        while (!fastRelease.load(std::memory_order_relaxed)) {\n          asmPause();\n        }\n        fastRelease = 0;\n      } else {\n        std::unique_lock<std::mutex> lg(m);\n        cv.wait(lg, [=] { return !done; });\n        // start.acquire();\n      }\n    }\n  };\n\n  thread_local static per_signal my_box;\n\n  MachineTopoInfo mi;\n  std::vector<per_signal*> signals;\n  std::vector<std::thread> threads;\n  unsigned reserved;\n  unsigned masterFastmode;\n  bool running;\n  std::function<void(void)> work;\n\n  //! destroy all threads\n  void destroyCommon();\n\n  //! Initialize a thread\n  void initThread(unsigned tid);\n\n  //! main thread loop\n  void threadLoop(unsigned tid);\n\n  //! spin up for run\n  void cascade(bool fastmode);\n\n  //! spin down after run\n  void decascade();\n\n  //! execute work on num threads\n  void runInternal(unsigned num);\n\n  ThreadPool();\n\npublic:\n  ~ThreadPool();\n\n  ThreadPool(const ThreadPool&) = delete;\n  ThreadPool& operator=(const ThreadPool&) = delete;\n\n  ThreadPool(ThreadPool&&) = delete;\n  ThreadPool& operator=(ThreadPool&&) = delete;\n\n  //! execute work on all threads\n  //! 
a simple wrapper for run\n  template <typename... Args>\n  void run(unsigned num, Args&&... args) {\n    struct ExecuteTuple {\n      //      using Ty = std::tuple<Args...>;\n      std::tuple<Args...> cmds;\n\n      void operator()() {\n        internal::ExecuteTupleImpl<\n            std::tuple<Args...>, 0,\n            std::tuple_size<std::tuple<Args...>>::value>::execute(this->cmds);\n      }\n      ExecuteTuple(Args&&... args) : cmds(std::forward<Args>(args)...) {}\n    };\n    // paying for an indirection in work allows small-object optimization in\n    // std::function to kick in and avoid a heap allocation\n    ExecuteTuple lwork(std::forward<Args>(args)...);\n    work = std::ref(lwork);\n    // work =\n    // std::function<void(void)>(ExecuteTuple(std::forward<Args>(args)...));\n    assert(num <= getMaxThreads());\n    runInternal(num);\n  }\n\n  //! run function in a dedicated thread until the threadpool exits\n  void runDedicated(std::function<void(void)>& f);\n\n  // experimental: busy wait for work\n  void burnPower(unsigned num);\n  // experimental: leave busy wait\n  void beKind();\n\n  bool isRunning() const { return running; }\n\n  //! return the number of non-reserved threads in the pool\n  unsigned getMaxUsableThreads() const { return mi.maxThreads - reserved; }\n  //! return the number of threads supported by the thread pool on the current\n  //! 
machine\n  unsigned getMaxThreads() const { return mi.maxThreads; }\n  unsigned getMaxCores() const { return mi.maxCores; }\n  unsigned getMaxSockets() const { return mi.maxSockets; }\n  unsigned getMaxNumaNodes() const { return mi.maxNumaNodes; }\n\n  unsigned getLeaderForSocket(unsigned pid) const {\n    for (unsigned i = 0; i < getMaxThreads(); ++i)\n      if (getSocket(i) == pid && isLeader(i))\n        return i;\n    abort();\n  }\n\n  bool isLeader(unsigned tid) const {\n    return signals[tid]->topo.socketLeader == tid;\n  }\n  unsigned getSocket(unsigned tid) const { return signals[tid]->topo.socket; }\n  unsigned getLeader(unsigned tid) const {\n    return signals[tid]->topo.socketLeader;\n  }\n  unsigned getCumulativeMaxSocket(unsigned tid) const {\n    return signals[tid]->topo.cumulativeMaxSocket;\n  }\n  unsigned getNumaNode(unsigned tid) const {\n    return signals[tid]->topo.numaNode;\n  }\n\n  static unsigned getTID() { return my_box.topo.tid; }\n  static bool isLeader() { return my_box.topo.tid == my_box.topo.socketLeader; }\n  static unsigned getLeader() { return my_box.topo.socketLeader; }\n  static unsigned getSocket() { return my_box.topo.socket; }\n  static unsigned getCumulativeMaxSocket() {\n    return my_box.topo.cumulativeMaxSocket;\n  }\n  static unsigned getNumaNode() { return my_box.topo.numaNode; }\n};\n\n/**\n * return a reference to system thread pool\n */\nThreadPool& getThreadPool(void);\n\n} // namespace galois::substrate\n\nnamespace galois::substrate::internal {\n\nvoid setThreadPool(ThreadPool* tp);\n\n} // namespace galois::substrate::internal\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/substrate/ThreadRWlock.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_SUBSTRATE_THREAD_RW_LOCK_H\n#define GALOIS_SUBSTRATE_THREAD_RW_LOCK_H\n\n#include \"galois/config.h\"\n#include \"galois/substrate/PaddedLock.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n\nnamespace galois {\nnamespace substrate {\n\nclass ThreadRWlock {\n\n  typedef substrate::PaddedLock<true> Lock_ty;\n  // typedef galois::runtime::LL::SimpleLock<true> Lock_ty;\n  typedef substrate::PerThreadStorage<Lock_ty> PerThreadLock;\n\n  PerThreadLock locks;\n\npublic:\n  void readLock() { locks.getLocal()->lock(); }\n\n  void readUnlock() { locks.getLocal()->unlock(); }\n\n  void writeLock() {\n    for (unsigned i = 0; i < locks.size(); ++i) {\n      locks.getRemote(i)->lock();\n    }\n  }\n\n  void writeUnlock() {\n    for (unsigned i = 0; i < locks.size(); ++i) {\n      
 locks.getRemote(i)->unlock();\n    }\n  }\n};\n\n//! readUpdateProtected is a generic function to perform reads or writes using a\n//! rwmutex \\param rwmutex is a read/write lock that implements\n//! readLock/readUnlock, writeLock/writeUnlock \\param readAndCheck is function\n//! object to execute when reading. It returns true only if read was successful.\n//! Should update state to store read result. Shouldn't use rwmutex internally\n//! \\param write is function object to perform the write. It should update state\n//! to store result after writing. Shouldn't use rwmutex internally\ntemplate <typename L, typename R, typename W>\nvoid readUpdateProtected(L& rwmutex, R& readAndCheck, W& write) {\n\n  rwmutex.readLock();\n\n  if (readAndCheck()) {\n\n    rwmutex.readUnlock();\n    return;\n\n  } else {\n\n    rwmutex.readUnlock();\n\n    rwmutex.writeLock();\n    {\n      // check again in case another thread made the write\n      if (!readAndCheck()) {\n        write();\n      }\n    }\n    rwmutex.writeUnlock();\n  }\n}\n\n} // end namespace substrate\n} // end namespace galois\n\n#endif // GALOIS_SUBSTRATE_THREAD_RW_LOCK_H\n"
  },
  {
    "path": "libgalois/include/galois/worklists/AdaptiveObim.h",
    "content": "/** Scalable priority worklist\n *\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_ADAPTIVEOBIM_H\n#define GALOIS_WORKLIST_ADAPTIVEOBIM_H\n\n#include <atomic>\n#include <cmath>\n#include <iostream>\n#include <limits>\n#include <type_traits>\n\n#include \"galois/config.h\"\n\n#include \"galois/FlatMap.h\"\n#include \"galois/Timer.h\"\n#include \"galois/substrate/PaddedLock.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/worklists/Chunk.h\"\n#include \"galois/worklists/WorkListHelpers.h\"\n\nnamespace galois {\nnamespace worklists {\n\nnamespace internal {\n\ntemplate <typename Index, bool UseDescending>\nstruct AdaptiveOrderedByIntegerMetricComparator {\n  typedef std::less<Index> compare_t;\n  Index identity;\n  Index earliest;\n\n  AdaptiveOrderedByIntegerMetricComparator()\n      : 
identity(std::numeric_limits<Index>::max()),\n        earliest(std::numeric_limits<Index>::min()) {}\n};\n\ntemplate <typename Index>\nstruct AdaptiveOrderedByIntegerMetricComparator<Index, true> {\n  typedef std::greater<Index> compare_t;\n  Index identity;\n  Index earliest;\n\n  AdaptiveOrderedByIntegerMetricComparator()\n      : identity(std::numeric_limits<Index>::min()),\n        earliest(std::numeric_limits<Index>::max()) {}\n};\n\n} // namespace internal\n\n/**\n * Approximate priority scheduling. Indexer is a default-constructible class\n * whose instances conform to <code>R r = indexer(item)</code> where R is some\n * type with a total order defined by <code>operator&lt;</code> and\n * <code>operator==</code> and item is an element from the Galois set\n * iterator.\n *\n * An example:\n * \\code\n * struct Item { int index; };\n *\n * struct Indexer {\n *   int operator()(Item i) const { return i.index; }\n * };\n *\n * typedef galois::worklists::AdaptiveOrderedByIntegerMetric<Indexer> WL;\n * galois::for_each<WL>(items.begin(), items.end(), Fn);\n * \\endcode\n *\n * @tparam Indexer        Indexer class\n * @tparam Container      Scheduler for each bucket\n * @tparam BlockPeriod    Check for higher priority work every 2^BlockPeriod\n *                        iterations\n * @tparam BSP            Use back-scan prevention\n * @tparam uniformBSP     Use uniform back-scan prevention\n * @tparam T              Work item type\n * @tparam Index          Indexer return type\n * @tparam UseMonotonic   Assume that an activity at priority p will not\n * schedule work at priority p or any priority p1 where p1 < p.\n * @tparam UseDescending  Use descending order instead\n * @tparam Concurrent     Whether or not to allow concurrent execution\n */\ntemplate <class Indexer      = DummyIndexer<int>,\n          typename Container = PerSocketChunkFIFO<>, int BlockPeriod = 0,\n          bool BSP = true, bool uniformBSP = false, int chunk_size = 64,\n          typename T = 
int, typename Index = int, bool EnableUmerge = false,\n          bool UseMonotonic = false, bool UseDescending = false,\n          bool Concurrent = true>\nstruct AdaptiveOrderedByIntegerMetric\n    : private boost::noncopyable,\n      public internal::AdaptiveOrderedByIntegerMetricComparator<Index,\n                                                                UseDescending> {\n  template <typename _T>\n  using retype = AdaptiveOrderedByIntegerMetric<\n      Indexer, typename Container::template retype<_T>, BlockPeriod, BSP,\n      uniformBSP, chunk_size, _T, typename std::result_of<Indexer(_T)>::type,\n      EnableUmerge, UseMonotonic, UseDescending, Concurrent>;\n\n  template <bool _b>\n  using rethread = AdaptiveOrderedByIntegerMetric<\n      Indexer, typename Container::template rethread<_b>, BlockPeriod, BSP,\n      uniformBSP, chunk_size, T, Index, EnableUmerge, UseMonotonic,\n      UseDescending, _b>;\n\n  template <unsigned _period>\n  struct with_block_period {\n    typedef AdaptiveOrderedByIntegerMetric<\n        Indexer, Container, _period, BSP, uniformBSP, chunk_size, T, Index,\n        EnableUmerge, UseMonotonic, UseDescending, Concurrent>\n        type;\n  };\n  template <typename _container>\n  struct with_container {\n    typedef AdaptiveOrderedByIntegerMetric<\n        Indexer, _container, BlockPeriod, BSP, uniformBSP, chunk_size, T, Index,\n        EnableUmerge, UseMonotonic, UseDescending, Concurrent>\n        type;\n  };\n\n  template <typename _indexer>\n  struct with_indexer {\n    AdaptiveOrderedByIntegerMetric<\n        _indexer, Container, BlockPeriod, BSP, uniformBSP, chunk_size, T, Index,\n        EnableUmerge, UseMonotonic, UseDescending, Concurrent>\n        type;\n  };\n\n  template <bool _bsp>\n  struct with_back_scan_prevention {\n    typedef AdaptiveOrderedByIntegerMetric<\n        Indexer, Container, BlockPeriod, _bsp, uniformBSP, chunk_size, T, Index,\n        EnableUmerge, UseMonotonic, UseDescending, Concurrent>\n        
type;\n  };\n\n  template <bool _enable_unmerge>\n  struct with_unmerge {\n    AdaptiveOrderedByIntegerMetric<\n        Indexer, Container, BlockPeriod, BSP, uniformBSP, chunk_size, T, Index,\n        _enable_unmerge, UseMonotonic, UseDescending, Concurrent>\n        type;\n  };\n\n  template <bool _use_monotonic>\n  struct with_monotonic {\n    AdaptiveOrderedByIntegerMetric<\n        Indexer, Container, BlockPeriod, BSP, uniformBSP, chunk_size, T, Index,\n        EnableUmerge, _use_monotonic, UseDescending, Concurrent>\n        type;\n  };\n\n  template <bool _use_descending>\n  struct with_descending {\n    AdaptiveOrderedByIntegerMetric<\n        Indexer, Container, BlockPeriod, BSP, uniformBSP, chunk_size, T, Index,\n        EnableUmerge, UseMonotonic, _use_descending, Concurrent>\n        type;\n  };\n\n  typedef T value_type;\n  typedef Index index_type;\n  typedef uint32_t delta_type;\n\nprivate:\n  typedef typename Container::template rethread<Concurrent> CTy;\n  typedef internal::AdaptiveOrderedByIntegerMetricComparator<Index,\n                                                             UseDescending>\n      Comparator;\n  static inline typename Comparator::compare_t compare;\n  delta_type delta;\n  unsigned int counter;\n  unsigned int maxIndex;\n  unsigned int lastSizeMasterLog;\n\n  // indexing mechanism\n  // smaller delta insertions are prioritized\n  struct deltaIndex {\n    Index k; // note: original index is stored here\n    delta_type d;\n    // taking the max of deltas and doing right shift eq. 
shifting priority with\n    // d-max(d1, d2)\n\n    deltaIndex() : k(0), d(0) {}\n    deltaIndex(Index k1, delta_type d1) : k(k1), d(d1) {}\n    bool operator==(const deltaIndex& a) const {\n      unsigned delt = std::max(d, a.d);\n      Index a1      = k >> delt;\n      Index a2      = a.k >> delt;\n      return (a1 == a2 && d == a.d);\n    }\n    bool operator<(const deltaIndex& a) const {\n      unsigned delt = std::max(d, a.d);\n      Index a1      = k >> delt;\n      Index a2      = a.k >> delt;\n      if (compare(a1, a2))\n        return true;\n      if (compare(a2, a1))\n        return false;\n      if (d < a.d)\n        return true;\n      return false;\n    }\n    bool operator>(const deltaIndex& a) const {\n      unsigned delt = std::max(d, a.d);\n      Index a1      = k >> delt;\n      Index a2      = a.k >> delt;\n      if (compare(a2, a1))\n        return true;\n      if (compare(a1, a2))\n        return false;\n      if (d > a.d)\n        return true;\n      return false;\n    }\n    bool operator<=(const deltaIndex& a) const {\n      unsigned delt = std::max(d, a.d);\n      Index a1      = k >> delt;\n      Index a2      = a.k >> delt;\n      if (compare(a1, a2))\n        return true;\n      if (compare(a2, a1))\n        return false;\n      if (d < a.d)\n        return true;\n      if (d == a.d)\n        return true;\n      return false;\n    }\n    bool operator>=(const deltaIndex& a) const {\n      unsigned delt = std::max(d, a.d);\n      Index a1      = k >> delt;\n      Index a2      = a.k >> delt;\n      if (compare(a2, a1))\n        return true;\n      if (compare(a1, a2))\n        return false;\n      if (d > a.d)\n        return true;\n      if (d == a.d)\n        return true;\n      return false;\n    }\n  };\n\n  typedef galois::flat_map<deltaIndex, CTy*> LMapTy;\n\n  struct ThreadData {\n    LMapTy local;\n    deltaIndex curIndex;\n    deltaIndex scanStart;\n    CTy* current;\n    unsigned int lastMasterVersion;\n    unsigned int 
numPops;\n\n    unsigned int popsLastFix;\n    unsigned int slowPopsLastPeriod;\n    unsigned int pushesLastPeriod;\n    unsigned int popsFromSameQ;\n    struct {\n      size_t pmodAllDeq;\n      unsigned int priosLastPeriod;\n      unsigned int numUmerges;\n      Index maxPrioDiffLastPeriod;\n    } stats;\n    Index minPrio;\n    Index maxPrio;\n    substrate::PaddedLock<Concurrent> lock;\n\n    void cleanup() {\n      popsLastFix        = 0;\n      slowPopsLastPeriod = 0;\n      pushesLastPeriod   = 0;\n\n      stats.priosLastPeriod       = 0;\n      stats.maxPrioDiffLastPeriod = 0;\n\n      minPrio = std::numeric_limits<Index>::max();\n      maxPrio = std::numeric_limits<Index>::min();\n    }\n\n    inline bool isSlowPopFreq(double threshold) {\n      // return ((double)slowPopsLastPeriod / (double)popsLastFix) > threshold;\n      return (double)slowPopsLastPeriod > ((double)popsLastFix * threshold);\n    }\n\n    ThreadData(Index initial)\n        : curIndex(initial, 0), scanStart(initial, 0), current(0),\n          lastMasterVersion(0), numPops(0), popsLastFix(0),\n          slowPopsLastPeriod(0), pushesLastPeriod(0),\n          popsFromSameQ(0), stats{0, 0, 0, 0},\n          minPrio(std::numeric_limits<Index>::max()),\n          maxPrio(std::numeric_limits<Index>::min()) {}\n  };\n\n  typedef std::deque<std::pair<deltaIndex, CTy*>> MasterLog;\n\n  // NB: Place dynamically growing masterLog after fixed-size PerThreadStorage\n  // members to give higher likelihood of reclaiming PerThreadStorage\n  substrate::PerThreadStorage<ThreadData> data;\n  substrate::PaddedLock<Concurrent> masterLock;\n  MasterLog masterLog;\n\n  galois::runtime::FixedSizeHeap heap;\n  std::atomic<unsigned int> masterVersion;\n  Indexer indexer;\n\n  bool updateLocal(ThreadData& p) {\n    if (p.lastMasterVersion != masterVersion.load(std::memory_order_relaxed)) {\n      for (;\n           p.lastMasterVersion < masterVersion.load(std::memory_order_relaxed);\n           
++p.lastMasterVersion) {\n        // XXX(ddn): Somehow the second block is better than\n        // the first for bipartite matching (GCC 4.7.2)\n#if 0\n        p.local.insert(masterLog[p.lastMasterVersion]);\n#else\n        std::pair<deltaIndex, CTy*> logEntry = masterLog[p.lastMasterVersion];\n        p.local[logEntry.first]              = logEntry.second;\n        assert(logEntry.second);\n#endif\n      }\n      return true;\n    }\n    return false;\n  }\n\n  GALOIS_ATTRIBUTE_NOINLINE\n  galois::optional<T> slowPop(ThreadData& p) {\n    // Failed, find minimum bin\n    p.slowPopsLastPeriod++;\n    unsigned myID = galois::substrate::ThreadPool::getTID();\n\n    // first give it some time\n    // then check the fdeq frequency\n    if (myID == 0 && p.popsLastFix > counter &&\n        p.isSlowPopFreq(1.0 / (double)(chunk_size))) {\n      unsigned long numPushesThisStep      = p.pushesLastPeriod;\n      unsigned long priosCreatedThisPeriod = p.stats.priosLastPeriod;\n      unsigned long allPmodDeqCounts       = p.stats.pmodAllDeq;\n      Index minOfMin                       = p.minPrio;\n      Index maxOfMax                       = p.maxPrio;\n      p.cleanup();\n      for (unsigned i = 1; i < runtime::activeThreads; ++i) {\n        while (!data.getRemote(i)->lock.try_lock())\n          ;\n\n        Index& otherMinPrio = data.getRemote(i)->minPrio;\n        minOfMin            = std::min(minOfMin, otherMinPrio, compare);\n        Index& otherMaxPrio = data.getRemote(i)->maxPrio;\n        maxOfMax            = std::max(otherMaxPrio, maxOfMax, compare);\n        numPushesThisStep += data.getRemote(i)->pushesLastPeriod;\n        priosCreatedThisPeriod += data.getRemote(i)->stats.priosLastPeriod;\n        allPmodDeqCounts += data.getRemote(i)->stats.pmodAllDeq;\n\n        data.getRemote(i)->cleanup();\n        data.getRemote(i)->lock.unlock();\n      }\n\n      if ((double)numPushesThisStep) {\n        Index prioRange = (maxOfMax >> delta) - (minOfMin >> delta);\n        
// Division is expensive\n        // double fillRatio = ((double)numPushesThisStep / (double)prioRange);\n        if (numPushesThisStep < (chunk_size >> 1) * prioRange) {\n          // Ditto\n          // double xx = ((double)(chunk_size) / fillRatio);\n          double xx = std::log2(chunk_size) - std::log2(numPushesThisStep) +\n                      std::log2(prioRange);\n          assert(xx);\n          delta += std::floor(xx);\n          counter <<= 1;\n        }\n      }\n    }\n    // serif added here\n    // make sure delta is bigger than 0 so that we can actually unmerge things\n    // give it some time and check the same queue pops\n    else if (EnableUmerge && delta > 0 && myID == 0 &&\n             p.popsLastFix > counter && p.popsFromSameQ > (chunk_size << 2)) {\n      if (((p.maxPrio >> delta) - (p.minPrio >> delta)) < 16 &&\n          ((double)p.pushesLastPeriod /\n           ((double)((p.maxPrio >> delta) - (p.minPrio >> delta)))) >\n              4 * chunk_size) { // this is a check to make sure we are also\n                                // pushing with the same frequency end of\n                                // execution\n        double diff = ((p.maxPrio >> delta) - (p.minPrio >> delta)) >= 1\n                          ? 
((p.maxPrio >> delta) - (p.minPrio >> delta))\n                          : 1;\n        double xx = 16 / diff;\n        if (delta > (unsigned int)(std::floor(std::log2(xx))))\n          delta -= (unsigned int)(std::floor(std::log2(xx)));\n        else\n          delta = 0;\n\n        p.cleanup();\n        for (unsigned i = 1; i < runtime::activeThreads; ++i) {\n          while (!data.getRemote(i)->lock.try_lock())\n            ;\n          data.getRemote(i)->cleanup();\n          data.getRemote(i)->lock.unlock();\n        }\n        p.stats.numUmerges++;\n      }\n      p.popsFromSameQ = 0;\n    }\n    // p.popsFromSameQ = 0;\n\n    bool localLeader = substrate::ThreadPool::isLeader();\n    deltaIndex msS(this->earliest, 0);\n\n    updateLocal(p);\n\n    if (BSP && !UseMonotonic) {\n      msS = p.scanStart;\n      if (localLeader || uniformBSP) {\n        for (unsigned i = 0; i < runtime::activeThreads; ++i) {\n          msS = std::min(msS, data.getRemote(i)->scanStart);\n        }\n      } else {\n        msS = std::min(\n            msS, data.getRemote(substrate::ThreadPool::getLeader())->scanStart);\n      }\n    }\n\n    for (auto ii = p.local.lower_bound(msS), ei = p.local.end(); ii != ei;\n         ++ii) {\n      galois::optional<T> item;\n      if ((item = ii->second->pop())) {\n        p.current   = ii->second;\n        p.curIndex  = ii->first;\n        p.scanStart = ii->first;\n        p.lock.unlock();\n        return item;\n      }\n    }\n\n    p.lock.unlock();\n    return galois::optional<value_type>();\n  }\n\n  GALOIS_ATTRIBUTE_NOINLINE\n  CTy* slowUpdateLocalOrCreate(ThreadData& p, deltaIndex i) {\n    // update local until we find it or we get the write lock\n    do {\n      updateLocal(p);\n      CTy* lC;\n      if ((lC = p.local[i]))\n        return lC;\n    } while (!masterLock.try_lock());\n    // we have the write lock, update again then create\n    updateLocal(p);\n    CTy*& C2 = p.local[i];\n    if (!C2) {\n      C2                  = new 
(heap.allocate(sizeof(CTy))) CTy();\n      p.lastMasterVersion = masterVersion.load(std::memory_order_relaxed) + 1;\n      masterLog.push_back(std::make_pair(i, C2));\n      masterVersion.fetch_add(1);\n      p.stats.priosLastPeriod++;\n    }\n    masterLock.unlock();\n    return C2;\n  }\n\n  inline CTy* updateLocalOrCreate(ThreadData& p, deltaIndex i) {\n    // Try local then try update then find again or else create and update the\n    // master log\n    CTy* lC;\n    if ((lC = p.local[i]))\n      return lC;\n    // slowpath\n    return slowUpdateLocalOrCreate(p, i);\n  }\n\npublic:\n  AdaptiveOrderedByIntegerMetric(const Indexer& x = Indexer())\n      : data(this->earliest), heap(sizeof(CTy)), masterVersion(0), indexer(x) {\n    delta   = 0;\n    counter = chunk_size;\n  }\n\n  ~AdaptiveOrderedByIntegerMetric() {\n    ThreadData& p = *data.getLocal();\n    updateLocal(p);\n    // Deallocate in LIFO order to give opportunity for simple garbage\n    // collection\n    // Print stats for priority counts here\n    for (auto ii = masterLog.rbegin(), ei = masterLog.rend(); ii != ei; ++ii) {\n      CTy* lC = ii->second;\n      lC->~CTy();\n      heap.deallocate(lC);\n    }\n  }\n\n  void push(const value_type& val) {\n    deltaIndex index;\n    ThreadData& p = *data.getLocal();\n    while (!p.lock.try_lock())\n      ;\n\n    p.pushesLastPeriod++;\n    index.k = indexer(val);\n    index.d = delta;\n    if (index.k > p.maxPrio) {\n      p.maxPrio = index.k;\n    }\n    if (index.k < p.minPrio) {\n      p.minPrio = index.k;\n    }\n\n    // Fast path\n    if (index == p.curIndex && p.current) {\n      p.current->push(val);\n      p.lock.unlock();\n      return;\n    }\n\n    // Slow path\n    CTy* C = updateLocalOrCreate(p, index);\n    if (BSP && index < p.scanStart)\n      p.scanStart = index;\n    // Opportunistically move to higher priority work\n    if (index < p.curIndex) {\n      // we moved to a higher prio\n      p.popsFromSameQ = 0;\n\n      p.curIndex = 
index;\n      p.current  = C;\n    }\n    C->push(val);\n\n    p.lock.unlock();\n  }\n\n  template <typename Iter>\n  size_t push(Iter b, Iter e) {\n    size_t npush;\n    for (npush = 0; b != e; npush++)\n      push(*b++);\n    return npush;\n  }\n\n  template <typename RangeTy>\n  size_t push_initial(const RangeTy& range) {\n    auto rp = range.local_pair();\n    return push(rp.first, rp.second);\n  }\n\n  galois::optional<value_type> pop() {\n    ThreadData& p = *data.getLocal();\n    while (!p.lock.try_lock())\n      ;\n    CTy* C = p.current;\n\n    p.popsLastFix++;\n    p.stats.pmodAllDeq++;\n\n    if (BlockPeriod && ((p.numPops++ & ((1ull << BlockPeriod) - 1)) == 0))\n      return slowPop(p);\n\n    galois::optional<value_type> item;\n    if (C && (item = C->pop())) {\n      p.popsFromSameQ++;\n\n      p.lock.unlock();\n      return item;\n    }\n\n    // Slow path\n    return slowPop(p);\n  }\n};\nGALOIS_WLCOMPILECHECK(AdaptiveOrderedByIntegerMetric)\n\n} // end namespace worklists\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/BulkSynchronous.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_BULKSYNCHRONOUS_H\n#define GALOIS_WORKLIST_BULKSYNCHRONOUS_H\n\n#include <atomic>\n\n#include \"galois/config.h\"\n#include \"galois/runtime/Substrate.h\"\n#include \"galois/worklists/Chunk.h\"\n#include \"galois/worklists/WLCompileCheck.h\"\n\nnamespace galois {\nnamespace worklists {\n\n/**\n * Bulk-synchronous scheduling. 
Work is processed in rounds, and all newly\n * created work is processed after all the current work in a round is\n * completed.\n */\ntemplate <class Container = PerSocketChunkFIFO<>, class T = int,\n          bool Concurrent = true>\nclass BulkSynchronous : private boost::noncopyable {\npublic:\n  template <bool _concurrent>\n  using rethread = BulkSynchronous<Container, T, _concurrent>;\n\n  template <typename _T>\n  using retype =\n      BulkSynchronous<typename Container::template retype<_T>, _T, Concurrent>;\n\n  template <typename _container>\n  using with_container = BulkSynchronous<_container, T, Concurrent>;\n\nprivate:\n  typedef typename Container::template rethread<Concurrent> CTy;\n\n  struct TLD {\n    unsigned round;\n    TLD() : round(0) {}\n  };\n\n  CTy wls[2];\n  substrate::PerThreadStorage<TLD> tlds;\n  substrate::Barrier& barrier;\n  substrate::CacheLineStorage<std::atomic<bool>> some;\n  std::atomic<bool> isEmpty;\n\npublic:\n  typedef T value_type;\n\n  BulkSynchronous()\n      : barrier(runtime::getBarrier(runtime::activeThreads)), some(false),\n        isEmpty(false) {}\n\n  void push(const value_type& val) {\n    wls[(tlds.getLocal()->round + 1) & 1].push(val);\n  }\n\n  template <typename ItTy>\n  void push(ItTy b, ItTy e) {\n    while (b != e)\n      push(*b++);\n  }\n\n  template <typename RangeTy>\n  void push_initial(const RangeTy& range) {\n    auto rp = range.local_pair();\n    push(rp.first, rp.second);\n    tlds.getLocal()->round = 1;\n    some.get()             = true;\n  }\n\n  galois::optional<value_type> pop() {\n    TLD& tld = *tlds.getLocal();\n    galois::optional<value_type> r;\n\n    while (true) {\n      if (isEmpty)\n        return r; // empty\n\n      r = wls[tld.round].pop();\n      if (r)\n        return r;\n\n      barrier.wait();\n      if (substrate::ThreadPool::getTID() == 0) {\n        if (!some.get())\n          isEmpty = true;\n        some.get() = false;\n      }\n      tld.round = (tld.round + 1) & 1;\n     
 barrier.wait();\n\n      r = wls[tld.round].pop();\n      if (r) {\n        some.get() = true;\n        return r;\n      }\n    }\n  }\n};\nGALOIS_WLCOMPILECHECK(BulkSynchronous)\n\n} // end namespace worklists\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/Chunk.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_CHUNK_H\n#define GALOIS_WORKLIST_CHUNK_H\n\n#include \"galois/config.h\"\n#include \"galois/FixedSizeRing.h\"\n#include \"galois/runtime/Mem.h\"\n#include \"galois/substrate/PaddedLock.h\"\n#include \"galois/worklists/WLCompileCheck.h\"\n#include \"galois/worklists/WorkListHelpers.h\"\n\nnamespace galois {\nnamespace runtime {\nextern unsigned activeThreads;\n}\nnamespace worklists {\n\nnamespace internal {\n// This overly complex specialization avoids a pointer indirection for\n// non-distributed WL when accessing PerLevel\ntemplate <bool, template <typename> class PS, typename TQ>\nstruct squeue {\n  PS<TQ> queues;\n  TQ& get(int i) { return *queues.getRemote(i); }\n  TQ& get() { return *queues.getLocal(); }\n  int myEffectiveID() { return substrate::ThreadPool::getTID(); }\n  int size() { 
return runtime::activeThreads; }\n};\n\ntemplate <template <typename> class PS, typename TQ>\nstruct squeue<false, PS, TQ> {\n  TQ queue;\n  TQ& get(int) { return queue; }\n  TQ& get() { return queue; }\n  int myEffectiveID() { return 0; }\n  int size() { return 0; }\n};\n\n//! Common functionality to all chunked worklists\ntemplate <typename T, template <typename, bool> class QT, bool Distributed,\n          bool IsStack, int ChunkSize, bool Concurrent>\nstruct ChunkMaster {\n  template <typename _T>\n  using retype =\n      ChunkMaster<_T, QT, Distributed, IsStack, ChunkSize, Concurrent>;\n\n  template <int _chunk_size>\n  using with_chunk_size =\n      ChunkMaster<T, QT, Distributed, IsStack, _chunk_size, Concurrent>;\n\n  template <bool _Concurrent>\n  using rethread =\n      ChunkMaster<T, QT, Distributed, IsStack, ChunkSize, _Concurrent>;\n\nprivate:\n  class Chunk : public FixedSizeRing<T, ChunkSize>,\n                public QT<Chunk, Concurrent>::ListNode {};\n\n  runtime::FixedSizeAllocator<Chunk> alloc;\n\n  struct p {\n    Chunk* cur;\n    Chunk* next;\n    p() : cur(0), next(0) {}\n  };\n\n  typedef QT<Chunk, Concurrent> LevelItem;\n\n  squeue<Concurrent, substrate::PerThreadStorage, p> data;\n  squeue<Distributed, substrate::PerSocketStorage, LevelItem> Q;\n\n  Chunk* mkChunk() {\n    Chunk* ptr = alloc.allocate(1);\n    alloc.construct(ptr);\n    return ptr;\n  }\n\n  void delChunk(Chunk* ptr) {\n    alloc.destroy(ptr);\n    alloc.deallocate(ptr, 1);\n  }\n\n  void pushChunk(Chunk* C) {\n    LevelItem& I = Q.get();\n    I.push(C);\n  }\n\n  Chunk* popChunkByID(unsigned int i) {\n    LevelItem& I = Q.get(i);\n    return I.pop();\n  }\n\n  Chunk* popChunk() {\n    int id   = Q.myEffectiveID();\n    Chunk* r = popChunkByID(id);\n    if (r)\n      return r;\n\n    for (int i = id + 1; i < (int)Q.size(); ++i) {\n      r = popChunkByID(i);\n      if (r)\n        return r;\n    }\n\n    for (int i = 0; i < id; ++i) {\n      r = popChunkByID(i);\n      if 
(r)\n        return r;\n    }\n\n    return 0;\n  }\n\n  template <typename... Args>\n  T* emplacei(p& n, Args&&... args) {\n    T* retval = 0;\n    if (n.next && (retval = n.next->emplace_back(std::forward<Args>(args)...)))\n      return retval;\n    if (n.next)\n      pushChunk(n.next);\n    n.next = mkChunk();\n    retval = n.next->emplace_back(std::forward<Args>(args)...);\n    assert(retval);\n    return retval;\n  }\n\npublic:\n  typedef T value_type;\n\n  ChunkMaster()                   = default;\n  ChunkMaster(const ChunkMaster&) = delete;\n  ChunkMaster& operator=(const ChunkMaster&) = delete;\n\n  void flush() {\n    p& n = data.get();\n    if (n.next)\n      pushChunk(n.next);\n    n.next = 0;\n  }\n\n  /**\n   * Construct an item on the worklist and return a pointer to its value.\n   *\n   * This pointer facilitates some internal runtime uses and is not designed\n   * to be used by general clients. The address is generally not safe to use\n   * in the presence of concurrent pops.\n   */\n  template <typename... Args>\n  value_type* emplace(Args&&... 
args) {\n    p& n = data.get();\n    return emplacei(n, std::forward<Args>(args)...);\n  }\n\n  /**\n   * Return pointer to next value to be returned by pop.\n   *\n   * For internal runtime use.\n   */\n  value_type* peek() {\n    p& n = data.get();\n    if (IsStack) {\n      if (n.next && !n.next->empty())\n        return &n.next->back();\n      if (n.next)\n        delChunk(n.next);\n      n.next = popChunk();\n      if (n.next && !n.next->empty())\n        return &n.next->back();\n      return NULL;\n    } else {\n      if (n.cur && !n.cur->empty())\n        return &n.cur->front();\n      if (n.cur)\n        delChunk(n.cur);\n      n.cur = popChunk();\n      if (!n.cur) {\n        n.cur  = n.next;\n        n.next = 0;\n      }\n      if (n.cur && !n.cur->empty())\n        return &n.cur->front();\n      return NULL;\n    }\n  }\n\n  /**\n   * Remove the value returned from peek() from the worklist.\n   *\n   * For internal runtime use.\n   */\n  void pop_peeked() {\n    p& n = data.get();\n    if (IsStack) {\n      n.next->pop_back();\n      return;\n    } else {\n      n.cur->pop_front();\n      return;\n    }\n  }\n\n  void push(const value_type& val) {\n    p& n = data.get();\n    emplacei(n, val);\n  }\n\n  template <typename Iter>\n  void push(Iter b, Iter e) {\n    p& n = data.get();\n    while (b != e)\n      emplacei(n, *b++);\n  }\n\n  template <typename RangeTy>\n  void push_initial(const RangeTy& range) {\n    auto rp = range.local_pair();\n    push(rp.first, rp.second);\n  }\n\n  galois::optional<value_type> pop() {\n    p& n = data.get();\n    galois::optional<value_type> retval;\n    if (IsStack) {\n      if (n.next && (retval = n.next->extract_back()))\n        return retval;\n      if (n.next)\n        delChunk(n.next);\n      n.next = popChunk();\n      if (n.next)\n        return n.next->extract_back();\n      return galois::optional<value_type>();\n    } else {\n      if (n.cur && (retval = n.cur->extract_front()))\n        return retval;\n    
  if (n.cur)\n        delChunk(n.cur);\n      n.cur = popChunk();\n      if (!n.cur) {\n        n.cur  = n.next;\n        n.next = 0;\n      }\n      if (n.cur)\n        return n.cur->extract_front();\n      return galois::optional<value_type>();\n    }\n  }\n};\n\n} // namespace internal\n\n/**\n * Chunk FIFO. A global FIFO of chunks of some fixed size.\n *\n * @tparam ChunkSize chunk size\n */\ntemplate <int ChunkSize = 64, typename T = int, bool Concurrent = true>\nusing ChunkFIFO = internal::ChunkMaster<T, ConExtLinkedQueue, false, false,\n                                        ChunkSize, Concurrent>;\nGALOIS_WLCOMPILECHECK(ChunkFIFO)\n\n/**\n * Chunk LIFO. A global LIFO of chunks of some fixed size.\n *\n * @tparam ChunkSize chunk size\n */\ntemplate <int ChunkSize = 64, typename T = int, bool Concurrent = true>\nusing ChunkLIFO = internal::ChunkMaster<T, ConExtLinkedStack, false, true,\n                                        ChunkSize, Concurrent>;\nGALOIS_WLCOMPILECHECK(ChunkLIFO)\n\n/**\n * Distributed chunked FIFO. A more scalable version of {@link ChunkFIFO}.\n *\n * @tparam ChunkSize chunk size\n */\ntemplate <int ChunkSize = 64, typename T = int, bool Concurrent = true>\nusing PerSocketChunkFIFO = internal::ChunkMaster<T, ConExtLinkedQueue, true,\n                                                 false, ChunkSize, Concurrent>;\nGALOIS_WLCOMPILECHECK(PerSocketChunkFIFO)\n\n/**\n * Distributed chunked LIFO. A more scalable version of {@link ChunkLIFO}.\n *\n * @tparam ChunkSize chunk size\n */\ntemplate <int ChunkSize = 64, typename T = int, bool Concurrent = true>\nusing PerSocketChunkLIFO = internal::ChunkMaster<T, ConExtLinkedStack, true,\n                                                 true, ChunkSize, Concurrent>;\nGALOIS_WLCOMPILECHECK(PerSocketChunkLIFO)\n\n/**\n * Distributed chunked bag. 
A scalable and resource-efficient policy when you\n * are agnostic to the particular scheduling order.\n *\n * @tparam ChunkSize chunk size\n */\ntemplate <int ChunkSize = 64, typename T = int, bool Concurrent = true>\nusing PerSocketChunkBag = internal::ChunkMaster<T, ConExtLinkedQueue, true,\n                                                true, ChunkSize, Concurrent>;\nGALOIS_WLCOMPILECHECK(PerSocketChunkBag)\n\n} // end namespace worklists\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/ExternalReference.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_EXTERNALREFERENCE_H\n#define GALOIS_WORKLIST_EXTERNALREFERENCE_H\n\n#include \"galois/config.h\"\n\nnamespace galois {\nnamespace worklists {\n\ntemplate <typename Container, bool IgnorePushInitial = false>\nclass ExternalReference {\n  Container& wl;\n\npublic:\n  //! change the type the worklist holds\n  template <typename _T>\n  using retype = ExternalReference<typename Container::template retype<_T>>;\n\n  //! T is the value type of the WL\n  typedef typename Container::value_type value_type;\n\n  ExternalReference(Container& _wl) : wl(_wl) {}\n\n  //! push a value onto the queue\n  void push(const value_type& val) { wl.push(val); }\n\n  //! push a range onto the queue\n  template <typename Iter>\n  void push(Iter b, Iter e) {\n    wl.push(b, e);\n  }\n\n  //! 
push initial range onto the queue\n  //! called with the same b and e on each thread\n  template <typename RangeTy>\n  void push_initial(const RangeTy& r) {\n    if (!IgnorePushInitial)\n      wl.push_initial(r);\n  }\n\n  //! pop a value from the queue.\n  galois::optional<value_type> pop() { return wl.pop(); }\n};\n\n} // namespace worklists\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/LocalQueue.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_LOCALQUEUE_H\n#define GALOIS_WORKLIST_LOCALQUEUE_H\n\n#include <type_traits>\n\n#include <boost/mpl/if.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/worklists/Simple.h\"\n\nnamespace galois {\nnamespace worklists {\n\ntemplate <typename T = int>\nstruct NoGlobalQueue {\n  template <bool _concurrent>\n  using rethread = NoGlobalQueue<T>;\n\n  template <typename _T>\n  using retype = NoGlobalQueue<_T>;\n};\n\ntemplate <typename Global = NoGlobalQueue<>, typename Local = GFIFO<int>,\n          typename T = int>\nstruct LocalQueue : private boost::noncopyable {\n  template <bool _concurrent>\n  using rethread = LocalQueue<Global, Local, T>;\n\n  template <typename _T>\n  using retype = LocalQueue<typename Global::template retype<_T>,\n                            typename Local::template 
retype<_T>, _T>;\n\n  template <typename _global>\n  using with_global = LocalQueue<_global, Local, T>;\n\n  template <typename _local>\n  using with_local = LocalQueue<Global, _local, T>;\n\nprivate:\n  typedef typename Local::template rethread<false> lWLTy;\n  substrate::PerThreadStorage<lWLTy> local;\n  Global global;\n\n  template <typename RangeTy,\n            bool Enable = std::is_same<Global, NoGlobalQueue<T>>::value>\n  void pushGlobal(const RangeTy& range,\n                  typename std::enable_if<Enable>::type* = 0) {\n    auto rp = range.local_pair();\n    local.getLocal()->push(rp.first, rp.second);\n  }\n\n  template <typename RangeTy,\n            bool Enable = std::is_same<Global, NoGlobalQueue<T>>::value>\n  void pushGlobal(const RangeTy& range,\n                  typename std::enable_if<!Enable>::type* = 0) {\n    global.push_initial(range);\n  }\n\n  template <bool Enable = std::is_same<Global, NoGlobalQueue<T>>::value>\n  galois::optional<T> popGlobal(typename std::enable_if<Enable>::type* = 0) {\n    return galois::optional<value_type>();\n  }\n\n  template <bool Enable = std::is_same<Global, NoGlobalQueue<T>>::value>\n  galois::optional<T> popGlobal(typename std::enable_if<!Enable>::type* = 0) {\n    return global.pop();\n  }\n\npublic:\n  typedef T value_type;\n\n  void push(const value_type& val) { local.getLocal()->push(val); }\n\n  template <typename Iter>\n  void push(Iter b, Iter e) {\n    local.getLocal()->push(b, e);\n  }\n\n  template <typename RangeTy>\n  void push_initial(const RangeTy& range) {\n    pushGlobal(range);\n  }\n\n  galois::optional<value_type> pop() {\n    galois::optional<value_type> ret = local.getLocal()->pop();\n    if (ret)\n      return ret;\n    return popGlobal();\n  }\n};\nGALOIS_WLCOMPILECHECK(LocalQueue)\n\n} // end namespace worklists\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/Obim.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_OBIM_H\n#define GALOIS_WORKLIST_OBIM_H\n\n#include <deque>\n#include <limits>\n#include <type_traits>\n\n#include \"galois/FlatMap.h\"\n#include \"galois/runtime/Substrate.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/Termination.h\"\n#include \"galois/worklists/Chunk.h\"\n#include \"galois/worklists/WorkListHelpers.h\"\n\nnamespace galois {\nnamespace worklists {\n\nnamespace internal {\n\ntemplate <typename T, typename Index, bool UseBarrier>\nclass OrderedByIntegerMetricData {\nprotected:\n  struct ThreadData {};\n  bool hasStored(ThreadData&, Index) { return false; }\n  galois::optional<T> popStored(ThreadData&, Index) { return {}; }\n};\n\ntemplate <typename T, typename Index>\nclass OrderedByIntegerMetricData<T, Index, true> {\nprotected:\n  struct 
ThreadData {\n    bool hasWork;\n    std::deque<std::pair<Index, T>> stored;\n  };\n\n  substrate::Barrier& barrier;\n\n  OrderedByIntegerMetricData()\n      : barrier(runtime::getBarrier(runtime::activeThreads)) {}\n\n  bool hasStored(ThreadData& p, Index idx) {\n    for (auto& e : p.stored) {\n      if (e.first == idx) {\n        std::swap(e, p.stored.front());\n        return true;\n      }\n    }\n    return false;\n  }\n\n  galois::optional<T> popStored(ThreadData& p, Index idx) {\n    galois::optional<T> item;\n    for (auto ii = p.stored.begin(), ei = p.stored.end(); ii != ei; ++ii) {\n      if (ii->first == idx) {\n        item = ii->second;\n        p.stored.erase(ii);\n        break;\n      }\n    }\n    return item;\n  }\n};\n\ntemplate <typename Index, bool UseDescending>\nstruct OrderedByIntegerMetricComparator {\n  std::less<Index> compare;\n  Index identity;\n  Index earliest;\n\n  template <typename C>\n  struct with_local_map {\n    typedef galois::flat_map<Index, C, std::less<Index>> type;\n  };\n  OrderedByIntegerMetricComparator()\n      : identity(std::numeric_limits<Index>::max()),\n        earliest(std::numeric_limits<Index>::min()) {}\n};\n\ntemplate <typename Index>\nstruct OrderedByIntegerMetricComparator<Index, true> {\n  std::greater<Index> compare;\n  Index identity;\n  Index earliest;\n\n  template <typename C>\n  struct with_local_map {\n    typedef galois::flat_map<Index, C, std::greater<Index>> type;\n  };\n  OrderedByIntegerMetricComparator()\n      : identity(std::numeric_limits<Index>::min()),\n        earliest(std::numeric_limits<Index>::max()) {}\n};\n\n} // namespace internal\n\n/**\n * Approximate priority scheduling. 
Indexer is a default-constructable class\n * whose instances conform to <code>R r = indexer(item)</code> where R is some\n * type with a total order defined by <code>operator&lt;</code> and\n * <code>operator==</code> and item is an element from the Galois set\n * iterator.\n *\n * An example:\n * \\code\n * struct Item { int index; };\n *\n * struct Indexer {\n *   int operator()(Item i) const { return i.index; }\n * };\n *\n * typedef galois::worklists::OrderedByIntegerMetric<Indexer> WL;\n * galois::for_each<WL>(galois::iterate(items), Fn);\n * \\endcode\n *\n * @tparam Indexer        Indexer class\n * @tparam Container      Scheduler for each bucket\n * @tparam BlockPeriod    Check for higher priority work every 2^BlockPeriod\n *                        iterations\n * @tparam BSP            Use back-scan prevention\n * @tparam UseBarrier     Eliminate priority inversions by placing a barrier\n * between priority levels\n * @tparam UseMonotonic   Assume that an activity at priority p will not\n * schedule work at priority p or any priority p1 where p1 < p.\n * @tparam UseDescending  Use descending order instead\n */\n// TODO could move to general comparator but there are issues with atomic reads\n// and initial values for arbitrary types\ntemplate <class Indexer      = DummyIndexer<int>,\n          typename Container = PerSocketChunkFIFO<>, unsigned BlockPeriod = 0,\n          bool BSP = true, typename T = int, typename Index = int,\n          bool UseBarrier = false, bool UseMonotonic = false,\n          bool UseDescending = false, bool Concurrent = true>\nstruct OrderedByIntegerMetric\n    : private boost::noncopyable,\n      public internal::OrderedByIntegerMetricData<T, Index, UseBarrier>,\n      public internal::OrderedByIntegerMetricComparator<Index, UseDescending> {\n  // static_assert(std::is_integral<Index>::value, \"only integral index types\n  // supported\");\n\n  template <typename _T>\n  using retype = OrderedByIntegerMetric<\n      Indexer, 
typename Container::template retype<_T>, BlockPeriod, BSP, _T,\n      typename std::result_of<Indexer(_T)>::type, UseBarrier, UseMonotonic,\n      UseDescending, Concurrent>;\n\n  template <bool _b>\n  using rethread =\n      OrderedByIntegerMetric<Indexer, Container, BlockPeriod, BSP, T, Index,\n                             UseBarrier, UseMonotonic, UseDescending, _b>;\n\n  template <unsigned _period>\n  struct with_block_period {\n    typedef OrderedByIntegerMetric<Indexer, Container, _period, BSP, T, Index,\n                                   UseBarrier, UseMonotonic, UseDescending,\n                                   Concurrent>\n        type;\n  };\n\n  template <typename _container>\n  struct with_container {\n    typedef OrderedByIntegerMetric<Indexer, _container, BlockPeriod, BSP, T,\n                                   Index, UseBarrier, UseMonotonic,\n                                   UseDescending, Concurrent>\n        type;\n  };\n\n  template <typename _indexer>\n  struct with_indexer {\n    typedef OrderedByIntegerMetric<_indexer, Container, BlockPeriod, BSP, T,\n                                   Index, UseBarrier, UseMonotonic,\n                                   UseDescending, Concurrent>\n        type;\n  };\n\n  template <bool _bsp>\n  struct with_back_scan_prevention {\n    typedef OrderedByIntegerMetric<Indexer, Container, BlockPeriod, _bsp, T,\n                                   Index, UseBarrier, UseMonotonic,\n                                   UseDescending, Concurrent>\n        type;\n  };\n\n  template <bool _use_barrier>\n  struct with_barrier {\n    typedef OrderedByIntegerMetric<Indexer, Container, BlockPeriod, BSP, T,\n                                   Index, _use_barrier, UseMonotonic,\n                                   UseDescending, Concurrent>\n        type;\n  };\n\n  template <bool _use_monotonic>\n  struct with_monotonic {\n    typedef OrderedByIntegerMetric<Indexer, Container, BlockPeriod, BSP, T,\n                           
        Index, UseBarrier, _use_monotonic,\n                                   UseDescending, Concurrent>\n        type;\n  };\n\n  template <bool _use_descending>\n  struct with_descending {\n    typedef OrderedByIntegerMetric<Indexer, Container, BlockPeriod, BSP, T,\n                                   Index, UseBarrier, UseMonotonic,\n                                   _use_descending, Concurrent>\n        type;\n  };\n\n  typedef T value_type;\n  typedef Index index_type;\n\nprivate:\n  typedef typename Container::template rethread<Concurrent> CTy;\n  typedef internal::OrderedByIntegerMetricComparator<Index, UseDescending>\n      Comparator;\n  typedef typename Comparator::template with_local_map<CTy*>::type LMapTy;\n\n  struct ThreadData\n      : public internal::OrderedByIntegerMetricData<T, Index,\n                                                    UseBarrier>::ThreadData {\n    LMapTy local;\n    Index curIndex;\n    Index scanStart;\n    CTy* current;\n    unsigned int lastMasterVersion;\n    unsigned int numPops;\n\n    ThreadData(Index initial)\n        : curIndex(initial), scanStart(initial), current(0),\n          lastMasterVersion(0), numPops(0) {}\n  };\n\n  typedef std::deque<std::pair<Index, CTy*>> MasterLog;\n\n  // NB: Place dynamically growing masterLog after fixed-size PerThreadStorage\n  // members to give higher likelihood of reclaiming PerThreadStorage\n  substrate::PerThreadStorage<ThreadData> data;\n  substrate::PaddedLock<Concurrent> masterLock;\n  MasterLog masterLog;\n\n  std::atomic<unsigned int> masterVersion;\n  Indexer indexer;\n\n  bool updateLocal(ThreadData& p) {\n    if (p.lastMasterVersion != masterVersion.load(std::memory_order_relaxed)) {\n      for (;\n           p.lastMasterVersion < masterVersion.load(std::memory_order_relaxed);\n           ++p.lastMasterVersion) {\n        // XXX(ddn): Somehow the second block is better than\n        // the first for bipartite matching (GCC 4.7.2)\n#if 0\n        
p.local.insert(masterLog[p.lastMasterVersion]);\n#else\n        std::pair<Index, CTy*> logEntry = masterLog[p.lastMasterVersion];\n        p.local[logEntry.first]         = logEntry.second;\n        assert(logEntry.second);\n#endif\n      }\n      return true;\n    }\n    return false;\n  }\n\n  GALOIS_ATTRIBUTE_NOINLINE\n  galois::optional<T> slowPop(ThreadData& p) {\n    bool localLeader = substrate::ThreadPool::isLeader();\n    Index msS        = this->earliest;\n\n    updateLocal(p);\n\n    if (BSP && !UseMonotonic) {\n      msS = p.scanStart;\n      if (localLeader) {\n        for (unsigned i = 0; i < runtime::activeThreads; ++i) {\n          Index o = data.getRemote(i)->scanStart;\n          if (this->compare(o, msS))\n            msS = o;\n        }\n      } else {\n        Index o = data.getRemote(substrate::ThreadPool::getLeader())->scanStart;\n        if (this->compare(o, msS))\n          msS = o;\n      }\n    }\n\n    for (auto ii = p.local.lower_bound(msS), ei = p.local.end(); ii != ei;\n         ++ii) {\n      galois::optional<T> item;\n      if ((item = ii->second->pop())) {\n        p.current   = ii->second;\n        p.curIndex  = ii->first;\n        p.scanStart = ii->first;\n        return item;\n      }\n    }\n\n    return galois::optional<value_type>();\n  }\n\n  GALOIS_ATTRIBUTE_NOINLINE\n  CTy* slowUpdateLocalOrCreate(ThreadData& p, Index i) {\n    // update local until we find it or we get the write lock\n    do {\n      updateLocal(p);\n      auto it = p.local.find(i);\n      if (it != p.local.end())\n        return it->second;\n    } while (!masterLock.try_lock());\n    // we have the write lock, update again then create\n    updateLocal(p);\n    auto it = p.local.find(i);\n    CTy* C2 = (it != p.local.end()) ? 
it->second : nullptr;\n    if (!C2) {\n      C2                  = new CTy();\n      p.local[i]          = C2;\n      p.lastMasterVersion = masterVersion.load(std::memory_order_relaxed) + 1;\n      masterLog.push_back(std::make_pair(i, C2));\n      masterVersion.fetch_add(1);\n    }\n    masterLock.unlock();\n    return C2;\n  }\n\n  inline CTy* updateLocalOrCreate(ThreadData& p, Index i) {\n    // Try local then try update then find again or else create and update the\n    // master log\n    auto it = p.local.find(i);\n    if (it != p.local.end())\n      return it->second;\n    // slowpath\n    return slowUpdateLocalOrCreate(p, i);\n  }\n\npublic:\n  OrderedByIntegerMetric(const Indexer& x = Indexer())\n      : data(this->earliest), masterVersion(0), indexer(x) {}\n\n  ~OrderedByIntegerMetric() {\n    // Deallocate in LIFO order to give opportunity for simple garbage\n    // collection\n    for (auto ii = masterLog.rbegin(), ei = masterLog.rend(); ii != ei; ++ii) {\n      delete ii->second;\n    }\n  }\n\n  void push(const value_type& val) {\n    Index index   = indexer(val);\n    ThreadData& p = *data.getLocal();\n\n    assert(!UseMonotonic || this->compare(p.curIndex, index));\n\n    // Fast path\n    if (index == p.curIndex && p.current) {\n      p.current->push(val);\n      return;\n    }\n\n    // Slow path\n    CTy* C = updateLocalOrCreate(p, index);\n    if (BSP && this->compare(index, p.scanStart))\n      p.scanStart = index;\n    // Opportunistically move to higher priority work\n    if (!UseBarrier && this->compare(index, p.curIndex)) {\n      p.curIndex = index;\n      p.current  = C;\n    }\n    C->push(val);\n  }\n\n  template <typename Iter>\n  void push(Iter b, Iter e) {\n    while (b != e)\n      push(*b++);\n  }\n\n  template <typename RangeTy>\n  void push_initial(const RangeTy& range) {\n    auto rp = range.local_pair();\n    push(rp.first, rp.second);\n  }\n\n  galois::optional<value_type> pop() {\n    // Find a successful pop\n    ThreadData& 
p = *data.getLocal();\n    CTy* C        = p.current;\n\n    if (this->hasStored(p, p.curIndex))\n      return this->popStored(p, p.curIndex);\n\n    if (!UseBarrier && BlockPeriod &&\n        ((p.numPops++ & ((1 << BlockPeriod) - 1)) == 0))\n      return slowPop(p);\n\n    galois::optional<value_type> item;\n    if (C && (item = C->pop()))\n      return item;\n\n    if (UseBarrier)\n      return item;\n\n    // Slow path\n    return slowPop(p);\n  }\n\n  template <bool Barrier = UseBarrier>\n  auto empty() -> typename std::enable_if<Barrier, bool>::type {\n    galois::optional<value_type> item;\n    ThreadData& p = *data.getLocal();\n\n    // try to pop from global worklist\n    item = slowPop(p);\n    if (item)\n      p.stored.push_back(std::make_pair(p.curIndex, *item));\n\n    // check if there are thread-local work items\n    if (!p.stored.empty()) {\n      Index storedIndex = this->identity;\n      for (auto& e : p.stored) {\n        if (this->compare(e.first, storedIndex)) {\n          storedIndex = e.first;\n        }\n      }\n      p.curIndex = storedIndex;\n      p.current  = p.local[storedIndex];\n    }\n    p.hasWork = !p.stored.empty();\n\n    this->barrier.wait();\n\n    // align with the earliest level from threads that have works\n    bool hasWork   = p.hasWork;\n    Index curIndex = (hasWork) ? p.curIndex : this->identity;\n    CTy* C         = (hasWork) ? 
p.current : nullptr;\n\n    for (unsigned i = 0; i < runtime::activeThreads; ++i) {\n      ThreadData& o = *data.getRemote(i);\n      if (o.hasWork && this->compare(o.curIndex, curIndex)) {\n        curIndex = o.curIndex;\n        C        = o.current;\n      }\n      hasWork |= o.hasWork;\n    }\n\n    this->barrier.wait();\n\n    p.current  = C;\n    p.curIndex = curIndex;\n\n    if (UseMonotonic) {\n      for (auto ii = p.local.begin(); ii != p.local.end();) {\n        bool toBreak = ii->second == C;\n        if (toBreak)\n          break;\n        ii = p.local.erase(ii);\n      }\n    }\n\n    return !hasWork;\n  }\n};\nGALOIS_WLCOMPILECHECK(OrderedByIntegerMetric)\n\n} // end namespace worklists\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/OrderedList.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_ORDEREDLIST_H\n#define GALOIS_WORKLIST_ORDEREDLIST_H\n\n#include \"galois/config.h\"\n#include \"galois/FlatMap.h\"\n\nnamespace galois {\nnamespace worklists {\n\ntemplate <class Compare = std::less<int>, typename T = int,\n          bool concurrent = true>\nclass OrderedList : private boost::noncopyable,\n                    private substrate::PaddedLock<concurrent> {\n  typedef galois::flat_map<T, std::deque<T>, Compare> Map;\n\n  Map map;\n\n  using substrate::PaddedLock<concurrent>::lock;\n  using substrate::PaddedLock<concurrent>::try_lock;\n  using substrate::PaddedLock<concurrent>::unlock;\n\npublic:\n  template <typename Tnew>\n  using retype = OrderedList<Compare, Tnew, concurrent>;\n\n  template <bool b>\n  using rethread = OrderedList<Compare, T, b>;\n\n  typedef T value_type;\n\n  
void push(value_type val) {\n    lock();\n    std::deque<T>& list = map[val];\n    list.push_back(val);\n    unlock();\n  }\n\n  template <typename Iter>\n  void push(Iter b, Iter e) {\n    lock();\n    while (b != e) {\n      std::deque<T>& list = map[*b];\n      list.push_back(*b);\n      ++b;\n    }\n    unlock();\n  }\n\n  template <typename RangeTy>\n  void push_initial(RangeTy range) {\n    if (substrate::ThreadPool::getTID() == 0)\n      push(range.begin(), range.end());\n  }\n\n  galois::optional<value_type> pop() {\n    lock();\n    if (map.empty()) {\n      unlock();\n      return galois::optional<value_type>();\n    }\n    auto ii             = map.begin();\n    std::deque<T>& list = ii->second;\n    galois::optional<value_type> v(list.front());\n    list.pop_front();\n    if (list.empty())\n      map.erase(ii);\n    unlock();\n    return v;\n  }\n};\nGALOIS_WLCOMPILECHECK(OrderedList)\n} // namespace worklists\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/OwnerComputes.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_OWNERCOMPUTES_H\n#define GALOIS_WORKLIST_OWNERCOMPUTES_H\n\n#include \"galois/config.h\"\n#include \"galois/worklists/WLCompileCheck.h\"\n\nnamespace galois {\nnamespace worklists {\n\ntemplate <typename OwnerFn   = DummyIndexer<int>,\n          typename Container = ChunkLIFO<>, typename T = int>\nstruct OwnerComputes : private boost::noncopyable {\n  template <typename _T>\n  using retype =\n      OwnerComputes<OwnerFn, typename Container::template retype<_T>, _T>;\n\n  template <bool b>\n  using rethread =\n      OwnerComputes<OwnerFn, typename Container::template rethread<b>, T>;\n\n  template <typename _container>\n  struct with_container {\n    typedef OwnerComputes<OwnerFn, _container, T> type;\n  };\n\n  template <typename _indexer>\n  struct with_indexer {\n    typedef 
OwnerComputes<_indexer, Container, T> type;\n  };\n\nprivate:\n  typedef typename Container::template retype<T> lWLTy;\n\n  typedef lWLTy cWL;\n  typedef lWLTy pWL;\n\n  OwnerFn Fn;\n  substrate::PerSocketStorage<cWL> items;\n  substrate::PerSocketStorage<pWL> pushBuffer;\n\npublic:\n  typedef T value_type;\n\n  void push(const value_type& val) {\n    unsigned int index  = Fn(val);\n    auto& tp            = substrate::getThreadPool();\n    unsigned int mindex = tp.getSocket(index);\n    // std::cerr << \"[\" << index << \",\" << index % active << \"]\\n\";\n    if (mindex == substrate::ThreadPool::getSocket())\n      items.getLocal()->push(val);\n    else\n      pushBuffer.getRemote(mindex)->push(val);\n  }\n\n  template <typename ItTy>\n  void push(ItTy b, ItTy e) {\n    while (b != e)\n      push(*b++);\n  }\n\n  template <typename RangeTy>\n  void push_initial(const RangeTy& range) {\n    auto rp = range.local_pair();\n    push(rp.first, rp.second);\n    for (unsigned int x = 0; x < pushBuffer.size(); ++x)\n      pushBuffer.getRemote(x)->flush();\n  }\n\n  galois::optional<value_type> pop() {\n    cWL& wl                             = *items.getLocal();\n    galois::optional<value_type> retval = wl.pop();\n    if (retval)\n      return retval;\n    pWL& p = *pushBuffer.getLocal();\n    while ((retval = p.pop()))\n      wl.push(*retval);\n    return wl.pop();\n  }\n};\nGALOIS_WLCOMPILECHECK(OwnerComputes)\n\n} // end namespace worklists\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/PerThreadChunk.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_PERTHREADCHUNK_H\n#define GALOIS_WORKLIST_PERTHREADCHUNK_H\n\n#include \"galois/FixedSizeRing.h\"\n#include \"galois/runtime/Mem.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n#include \"galois/substrate/PtrLock.h\"\n#include \"galois/Threads.h\"\n#include \"galois/worklists/WLCompileCheck.h\"\n\nnamespace galois {\nnamespace worklists {\n\nstruct ChunkHeader {\n  ChunkHeader* next;\n  ChunkHeader* prev;\n};\n\nclass PerThreadChunkQueue {\n  substrate::PtrLock<ChunkHeader> head;\n  ChunkHeader* tail;\n\n  void prepend(ChunkHeader* C) {\n    // Find tail of stolen stuff\n    ChunkHeader* t = C;\n    while (t->next) {\n      t = t->next;\n    }\n    head.lock();\n    t->next = head.getValue();\n    if (!t->next)\n      tail = t;\n    
head.unlock_and_set(C);\n  }\n\npublic:\n  PerThreadChunkQueue() : tail(0) {}\n\n  bool empty() const { return !tail; }\n\n  void push(ChunkHeader* obj) {\n    head.lock();\n    obj->next = 0;\n    if (tail) {\n      tail->next = obj;\n      tail       = obj;\n      head.unlock();\n    } else {\n      assert(!head.getValue());\n      tail = obj;\n      head.unlock_and_set(obj);\n    }\n  }\n\n  ChunkHeader* pop() {\n    // lock free Fast path empty case\n    if (empty())\n      return 0;\n\n    head.lock();\n    ChunkHeader* h = head.getValue();\n    if (!h) {\n      head.unlock();\n      return 0;\n    }\n    if (tail == h) {\n      tail = 0;\n      assert(!h->next);\n      head.unlock_and_clear();\n    } else {\n      head.unlock_and_set(h->next);\n      h->next = 0;\n    }\n    return h;\n  }\n\n  ChunkHeader* stealAllAndPop(PerThreadChunkQueue& victim) {\n    // Don't do work on empty victims (lockfree check)\n    if (victim.empty())\n      return 0;\n    // Steal everything\n    victim.head.lock();\n    ChunkHeader* C = victim.head.getValue();\n    if (C)\n      victim.tail = 0;\n    victim.head.unlock_and_clear();\n    if (!C)\n      return 0; // Didn't get anything\n    ChunkHeader* retval = C;\n    C                   = C->next;\n    retval->next        = 0;\n    if (!C)\n      return retval; // Only got one thing\n    prepend(C);\n    return retval;\n  }\n\n  ChunkHeader* stealHalfAndPop(PerThreadChunkQueue& victim) {\n    // Don't do work on empty victims (lockfree check)\n    if (victim.empty())\n      return 0;\n    // Steal half\n    victim.head.lock();\n    ChunkHeader* C     = victim.head.getValue();\n    ChunkHeader* ntail = C;\n    bool count         = false;\n    while (C) {\n      C = C->next;\n      if (count)\n        ntail = ntail->next;\n      count = !count;\n    }\n    if (ntail) {\n      C           = ntail->next;\n      ntail->next = 0;\n      victim.tail = ntail;\n    }\n    victim.head.unlock();\n    if (!C)\n      return 0; // Didn't 
get anything\n    ChunkHeader* retval = C;\n    C                   = C->next;\n    retval->next        = 0;\n    if (!C)\n      return retval; // Only got one thing\n    prepend(C);\n    return retval;\n  }\n};\n\nclass PerThreadChunkStack {\n  substrate::PtrLock<ChunkHeader> head;\n\n  void prepend(ChunkHeader* C) {\n    // Find tail of stolen stuff\n    ChunkHeader* tail = C;\n    while (tail->next) {\n      tail = tail->next;\n    }\n    head.lock();\n    tail->next = head.getValue();\n    head.unlock_and_set(C);\n  }\n\npublic:\n  bool empty() const { return !head.getValue(); }\n\n  void push(ChunkHeader* obj) {\n    ChunkHeader* oldhead = 0;\n    do {\n      oldhead   = head.getValue();\n      obj->next = oldhead;\n    } while (!head.CAS(oldhead, obj));\n  }\n\n  ChunkHeader* pop() {\n    // lock free Fast empty path\n    if (empty())\n      return 0;\n\n    // Disable CAS\n    head.lock();\n    ChunkHeader* retval = head.getValue();\n    ChunkHeader* setval = 0;\n    if (retval) {\n      setval       = retval->next;\n      retval->next = 0;\n    }\n    head.unlock_and_set(setval);\n    return retval;\n  }\n\n  ChunkHeader* stealAllAndPop(PerThreadChunkStack& victim) {\n    // Don't do work on empty victims (lockfree check)\n    if (victim.empty())\n      return 0;\n    // Steal everything\n    victim.head.lock();\n    ChunkHeader* C = victim.head.getValue();\n    victim.head.unlock_and_clear();\n    if (!C)\n      return 0; // Didn't get anything\n    ChunkHeader* retval = C;\n    C                   = C->next;\n    retval->next        = 0;\n    if (!C)\n      return retval; // Only got one thing\n    prepend(C);\n    return retval;\n  }\n\n  ChunkHeader* stealHalfAndPop(PerThreadChunkStack& victim) {\n    // Don't do work on empty victims (lockfree check)\n    if (victim.empty())\n      return 0;\n    // Steal half\n    victim.head.lock();\n    ChunkHeader* C     = victim.head.getValue();\n    ChunkHeader* ntail = C;\n    bool count         = false;\n    
while (C) {\n      C = C->next;\n      if (count)\n        ntail = ntail->next;\n      count = !count;\n    }\n    if (ntail) {\n      C           = ntail->next;\n      ntail->next = 0;\n    }\n    victim.head.unlock();\n    if (!C)\n      return 0; // Didn't get anything\n    ChunkHeader* retval = C;\n    C                   = C->next;\n    retval->next        = 0;\n    if (!C)\n      return retval; // Only got one thing\n    prepend(C);\n    return retval;\n  }\n};\n\ntemplate <typename InnerWL>\nclass StealingQueue : private boost::noncopyable {\n  substrate::PerThreadStorage<std::pair<InnerWL, unsigned>> local;\n\n  GALOIS_ATTRIBUTE_NOINLINE\n  ChunkHeader* doSteal() {\n    std::pair<InnerWL, unsigned>& me = *local.getLocal();\n    auto& tp                         = substrate::getThreadPool();\n    unsigned id                      = tp.getTID();\n    unsigned pkg                     = substrate::ThreadPool::getSocket();\n    unsigned num                     = galois::getActiveThreads();\n\n    // First steal from this socket\n    for (unsigned eid = id + 1; eid < num; ++eid) {\n      if (tp.getSocket(eid) == pkg) {\n        ChunkHeader* c = me.first.stealHalfAndPop(local.getRemote(eid)->first);\n        if (c)\n          return c;\n      }\n    }\n    for (unsigned eid = 0; eid < id; ++eid) {\n      if (tp.getSocket(eid) == pkg) {\n        ChunkHeader* c = me.first.stealHalfAndPop(local.getRemote(eid)->first);\n        if (c)\n          return c;\n      }\n    }\n\n    // Leaders can cross socket\n    if (substrate::ThreadPool::isLeader()) {\n      unsigned eid = (id + me.second) % num;\n      ++me.second;\n      if (id != eid && tp.isLeader(eid)) {\n        ChunkHeader* c = me.first.stealAllAndPop(local.getRemote(eid)->first);\n        if (c)\n          return c;\n      }\n    }\n    return 0;\n  }\n\npublic:\n  void push(ChunkHeader* c) { local.getLocal()->first.push(c); }\n\n  ChunkHeader* pop() {\n    if (ChunkHeader* c = local.getLocal()->first.pop())\n    
  return c;\n    return doSteal();\n  }\n};\n\ntemplate <bool IsLocallyLIFO, int ChunkSize, typename Container, typename T>\nstruct PerThreadChunkMaster : private boost::noncopyable {\n  template <typename _T>\n  using retype = PerThreadChunkMaster<IsLocallyLIFO, ChunkSize, Container, _T>;\n\n  template <bool _concurrent>\n  using rethread = PerThreadChunkMaster<IsLocallyLIFO, ChunkSize, Container, T>;\n\n  template <int _chunk_size>\n  using with_chunk_size =\n      PerThreadChunkMaster<IsLocallyLIFO, _chunk_size, Container, T>;\n\nprivate:\n  class Chunk : public ChunkHeader,\n                public galois::FixedSizeRing<T, ChunkSize> {};\n\n  runtime::FixedSizeAllocator<Chunk> alloc;\n  substrate::PerThreadStorage<std::pair<Chunk*, Chunk*>> data;\n  Container worklist;\n\n  Chunk* mkChunk() {\n    Chunk* ptr = alloc.allocate(1);\n    alloc.construct(ptr);\n    return ptr;\n  }\n\n  void delChunk(Chunk* ptr) {\n    alloc.destroy(ptr);\n    alloc.deallocate(ptr, 1);\n  }\n\n  void swapInPush(std::pair<Chunk*, Chunk*>& d) {\n    if (!IsLocallyLIFO)\n      std::swap(d.first, d.second);\n  }\n\n  Chunk*& getPushChunk(std::pair<Chunk*, Chunk*>& d) {\n    if (!IsLocallyLIFO)\n      return d.second;\n    else\n      return d.first;\n  }\n\n  Chunk*& getPopChunk(std::pair<Chunk*, Chunk*>& d) { return d.first; }\n\n  bool doPush(Chunk* c, const T& val) { return c->push_back(val); }\n\n  galois::optional<T> doPop(Chunk* c) {\n    if (!IsLocallyLIFO)\n      return c->extract_front();\n    else\n      return c->extract_back();\n  }\n\n  void push_internal(std::pair<Chunk*, Chunk*>&, Chunk*& n, const T& val) {\n    // Simple case, space in current chunk\n    if (n && doPush(n, val))\n      return;\n    // full chunk, push\n    if (n)\n      worklist.push(static_cast<ChunkHeader*>(n));\n    // get empty chunk;\n    n = mkChunk();\n    // There better be some room in the new chunk\n    doPush(n, val);\n  }\n\npublic:\n  typedef T value_type;\n\n  PerThreadChunkMaster() {}\n\n  
void push(value_type val) {\n    std::pair<Chunk*, Chunk*>& tld = *data.getLocal();\n    Chunk*& n                      = getPushChunk(tld);\n    push_internal(tld, n, val);\n  }\n\n  template <typename Iter>\n  void push(Iter b, Iter e) {\n    std::pair<Chunk*, Chunk*>& tld = *data.getLocal();\n    Chunk*& n                      = getPushChunk(tld);\n    while (b != e)\n      push_internal(tld, n, *b++);\n  }\n\n  template <typename RangeTy>\n  void push_initial(const RangeTy& range) {\n    auto rp = range.local_pair();\n    push(rp.first, rp.second);\n  }\n\n  galois::optional<value_type> pop() {\n    std::pair<Chunk*, Chunk*>& tld = *data.getLocal();\n    Chunk*& n                      = getPopChunk(tld);\n    galois::optional<value_type> retval;\n    // simple case, things in current chunk\n    if (n && (retval = doPop(n)))\n      return retval;\n    // empty chunk, trash it\n    if (n)\n      delChunk(n);\n    // get a new chunk\n    n = static_cast<Chunk*>(worklist.pop());\n    if (n && (retval = doPop(n)))\n      return retval;\n    // try stealing the push buffer if we can\n    swapInPush(tld);\n    if (n)\n      retval = doPop(n);\n    return retval;\n  }\n};\n\ntemplate <int ChunkSize = 64, typename T = int>\nusing PerThreadChunkLIFO =\n    PerThreadChunkMaster<true, ChunkSize, StealingQueue<PerThreadChunkStack>,\n                         T>;\nGALOIS_WLCOMPILECHECK(PerThreadChunkLIFO)\n\ntemplate <int ChunkSize = 64, typename T = int>\nusing PerThreadChunkFIFO =\n    PerThreadChunkMaster<false, ChunkSize, StealingQueue<PerThreadChunkQueue>,\n                         T>;\nGALOIS_WLCOMPILECHECK(PerThreadChunkFIFO)\n\n} // namespace worklists\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/Simple.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_FIFO_H\n#define GALOIS_WORKLIST_FIFO_H\n\n#include <deque>\n#include <mutex>\n\n#include \"galois/config.h\"\n#include \"galois/gdeque.h\"\n#include \"galois/substrate/PaddedLock.h\"\n#include \"galois/worklists/WLCompileCheck.h\"\n\nnamespace galois {\nnamespace worklists {\n\n//! 
Simple Container Wrapper worklist (not scalable).\ntemplate <typename T, typename container = std::deque<T>, bool popBack = true>\nclass Wrapper : private boost::noncopyable {\n  substrate::PaddedLock<true> lock;\n  container wl;\n\npublic:\n  template <typename _T>\n  using retype = Wrapper<_T>;\n\n  template <bool b>\n  using rethread = Wrapper;\n\n  typedef T value_type;\n\n  void push(const value_type& val) {\n    std::lock_guard<substrate::PaddedLock<true>> lg(lock);\n    wl.push_back(val);\n  }\n\n  template <typename Iter>\n  void push(Iter b, Iter e) {\n    std::lock_guard<substrate::PaddedLock<true>> lg(lock);\n    wl.insert(wl.end(), b, e);\n  }\n\n  template <typename RangeTy>\n  void push_initial(const RangeTy& range) {\n    if (substrate::ThreadPool::getTID() == 0)\n      push(range.begin(), range.end());\n  }\n\n  galois::optional<value_type> pop() {\n    galois::optional<value_type> retval;\n    std::lock_guard<substrate::PaddedLock<true>> lg(lock);\n    if (!wl.empty()) {\n      if (popBack) {\n        retval = wl.back();\n        wl.pop_back();\n      } else {\n        retval = wl.front();\n        wl.pop_front();\n      }\n    }\n    return retval;\n  }\n};\n\ntemplate <typename T = int>\nusing FIFO = Wrapper<T, std::deque<T>, false>;\n\ntemplate <typename T = int>\nusing GFIFO = Wrapper<T, galois::gdeque<T>, false>;\n\ntemplate <typename T = int>\nusing LIFO = Wrapper<T, std::deque<T>, true>;\n\ntemplate <typename T = int>\nusing GLIFO = Wrapper<T, galois::gdeque<T>, true>;\n\nGALOIS_WLCOMPILECHECK(FIFO)\nGALOIS_WLCOMPILECHECK(GFIFO)\nGALOIS_WLCOMPILECHECK(LIFO)\nGALOIS_WLCOMPILECHECK(GLIFO)\n\n} // end namespace worklists\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/StableIterator.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_STABLEITERATOR_H\n#define GALOIS_WORKLIST_STABLEITERATOR_H\n\n#include \"galois/config.h\"\n#include \"galois/gstl.h\"\n#include \"galois/worklists/Chunk.h\"\n\nnamespace galois {\nnamespace worklists {\n\n/**\n * Low-overhead worklist when initial range is not invalidated by the\n * operator.\n *\n * @tparam Steal     Try workstealing on initial ranges\n * @tparam Container Worklist to manage work enqueued by the operator\n * @tparam Iterator  (inferred by library)\n */\ntemplate <bool Steal = false, typename Container = PerSocketChunkFIFO<>,\n          typename Iterator = int*>\nstruct StableIterator {\n  typedef typename std::iterator_traits<Iterator>::value_type value_type;\n  typedef Iterator iterator;\n\n  //! 
change the type the worklist holds\n  template <typename _T>\n  using retype =\n      StableIterator<Steal, typename Container::template retype<_T>, Iterator>;\n\n  template <bool b>\n  using rethread =\n      StableIterator<Steal, typename Container::template rethread<b>, Iterator>;\n\n  template <typename _iterator>\n  struct with_iterator {\n    typedef StableIterator<Steal, Container, _iterator> type;\n  };\n\n  template <bool _steal>\n  struct with_steal {\n    typedef StableIterator<_steal, Container, Iterator> type;\n  };\n\n  template <typename _container>\n  struct with_container {\n    typedef StableIterator<Steal, _container, Iterator> type;\n  };\n\nprivate:\n  struct shared_state {\n    Iterator stealBegin;\n    Iterator stealEnd;\n    substrate::SimpleLock stealLock;\n    bool stealAvail;\n  };\n\n  struct state {\n    substrate::CacheLineStorage<shared_state> stealState;\n    Iterator localBegin;\n    Iterator localEnd;\n    unsigned int nextVictim;\n    unsigned int numStealFailures;\n\n    void populateSteal() {\n      if (Steal && localBegin != localEnd) {\n        shared_state& s = stealState.data;\n        s.stealLock.lock();\n        s.stealEnd   = localEnd;\n        s.stealBegin = localEnd = galois::split_range(localBegin, localEnd);\n        if (s.stealBegin != s.stealEnd)\n          s.stealAvail = true;\n        s.stealLock.unlock();\n      }\n    }\n  };\n\n  substrate::PerThreadStorage<state> TLDS;\n  Container inner;\n\n  bool doSteal(state& dst, state& src, bool wait) {\n    shared_state& s = src.stealState.data;\n    if (s.stealAvail) {\n      if (wait) {\n        s.stealLock.lock();\n      } else if (!s.stealLock.try_lock()) {\n        return false;\n      }\n      if (s.stealBegin != s.stealEnd) {\n        dst.localBegin = s.stealBegin;\n        s.stealBegin   = dst.localEnd =\n            galois::split_range(s.stealBegin, s.stealEnd);\n        s.stealAvail = s.stealBegin != s.stealEnd;\n      }\n      s.stealLock.unlock();\n    }\n   
 return dst.localBegin != dst.localEnd;\n  }\n\n  // pop already failed, try again with stealing\n  galois::optional<value_type> pop_steal(state& data) {\n    // always try stealing self\n    if (doSteal(data, data, true))\n      return *data.localBegin++;\n    // only try stealing one other\n    if (doSteal(data, *TLDS.getRemote(data.nextVictim), false)) {\n      // share the wealth\n      if (data.nextVictim != substrate::ThreadPool::getTID())\n        data.populateSteal();\n      return *data.localBegin++;\n    }\n    ++data.nextVictim;\n    ++data.numStealFailures;\n    data.nextVictim %= runtime::activeThreads;\n    return galois::optional<value_type>();\n  }\n\npublic:\n  //! push initial range onto the queue\n  //! called with the same b and e on each thread\n  template <typename RangeTy>\n  void push_initial(const RangeTy& r) {\n    state& data           = *TLDS.getLocal();\n    auto lp               = r.local_pair();\n    data.localBegin       = lp.first;\n    data.localEnd         = lp.second;\n    data.nextVictim       = substrate::ThreadPool::getTID();\n    data.numStealFailures = 0;\n    data.populateSteal();\n  }\n\n  //! pop a value from the queue.\n  galois::optional<value_type> pop() {\n    state& data = *TLDS.getLocal();\n    if (data.localBegin != data.localEnd)\n      return *data.localBegin++;\n\n    galois::optional<value_type> item;\n    if (Steal && 2 * data.numStealFailures > runtime::activeThreads)\n      if ((item = pop_steal(data)))\n        return item;\n    if ((item = inner.pop()))\n      return item;\n    if (Steal)\n      return pop_steal(data);\n    return item;\n  }\n\n  void push(const value_type& val) { inner.push(val); }\n\n  template <typename Iter>\n  void push(Iter b, Iter e) {\n    while (b != e)\n      push(*b++);\n  }\n};\nGALOIS_WLCOMPILECHECK(StableIterator)\n\n} // namespace worklists\n} // namespace galois\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/WLCompileCheck.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_WLCOMPILECHECK_H\n#define GALOIS_WORKLIST_WLCOMPILECHECK_H\n\n#include \"galois/config.h\"\n\n#ifndef GALOIS_WLCOMPILECHECK\n#define GALOIS_WLCOMPILECHECK(name) //\n#endif\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/WorkList.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_WORKLIST_H\n#define GALOIS_WORKLIST_WORKLIST_H\n\n#include \"galois/config.h\"\n#include \"galois/optional.h\"\n#include \"galois/worklists/AdaptiveObim.h\"\n#include \"galois/worklists/PerThreadChunk.h\"\n#include \"galois/worklists/BulkSynchronous.h\"\n#include \"galois/worklists/Chunk.h\"\n#include \"galois/worklists/Simple.h\"\n#include \"galois/worklists/LocalQueue.h\"\n#include \"galois/worklists/Obim.h\"\n#include \"galois/worklists/OrderedList.h\"\n#include \"galois/worklists/OwnerComputes.h\"\n#include \"galois/worklists/StableIterator.h\"\n\nnamespace galois {\n/**\n * Scheduling policies for Galois iterators. Unless you have very specific\n * scheduling requirement, {@link PerSocketChunkLIFO} or {@link\n * PerSocketChunkFIFO} is a reasonable scheduling policy. 
If you need\n * approximate priority scheduling, use {@link OrderedByIntegerMetric}. For\n * debugging, you may be interested in {@link FIFO} or {@link LIFO}, which try\n * to follow serial order exactly.\n *\n * The way to use a worklist is to pass it as a template parameter to\n * {@link for_each()}. For example,\n *\n * \\code\n * galois::for_each(galois::iterate(beg,end), fn,\n * galois::wl<galois::worklists::PerSocketChunkFIFO<32>>()); \\endcode\n */\nnamespace worklists {\nnamespace { // don't pollute the symbol table with the example\n\n// Worklists may not be copied.\n// All classes (should) conform to:\ntemplate <typename T>\nclass AbstractWorkList {\n  AbstractWorkList(const AbstractWorkList&) = delete;\n  const AbstractWorkList& operator=(const AbstractWorkList&) = delete;\n\npublic:\n  AbstractWorkList();\n\n  //! Optional parameterized Constructor\n  //! parameters can be whatever\n  AbstractWorkList(int, double, char*);\n\n  //! T is the value type of the WL\n  typedef T value_type;\n\n  //! Changes the type the worklist holds\n  template <typename _T>\n  using retype = AbstractWorkList<_T>;\n\n  //! Pushes a value onto the queue\n  void push(const value_type& val);\n\n  //! Pushes a range onto the queue\n  template <typename Iter>\n  void push(Iter b, Iter e);\n\n  /**\n   * Pushes initial range onto the queue. Called with the same b and e on each\n   * thread\n   */\n  template <typename RangeTy>\n  void push_initial(const RangeTy&);\n\n  //! Pops a value from the queue.\n  galois::optional<value_type> pop();\n\n  /**\n   * (optional) Returns true if the worklist is empty. Called infrequently\n   * by scheduler after pop has failed. Good way to split retrieving work\n   * into pop (fast path) and empty (slow path).\n   */\n  bool empty();\n};\n\n} // namespace\n} // end namespace worklists\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/include/galois/worklists/WorkListHelpers.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_WORKLIST_WORKLISTHELPERS_H\n#define GALOIS_WORKLIST_WORKLISTHELPERS_H\n\n#include <boost/iterator/iterator_facade.hpp>\n\n#include \"galois/config.h\"\n#include \"galois/substrate/PtrLock.h\"\n#include \"galois/worklists/WLCompileCheck.h\"\n\nnamespace galois {\nnamespace worklists {\n\ntemplate <typename T>\nclass ConExtListNode {\n  T* next;\n\npublic:\n  ConExtListNode() : next(0) {}\n  T*& getNext() { return next; }\n  T* const& getNext() const { return next; }\n};\n\ntemplate <typename T>\nclass ConExtIterator\n    : public boost::iterator_facade<ConExtIterator<T>, T,\n                                    boost::forward_traversal_tag> {\n  friend class boost::iterator_core_access;\n  T* at;\n\n  template <typename OtherTy>\n  bool equal(const ConExtIterator<OtherTy>& o) const {\n    return at == 
o.at;\n  }\n\n  T& dereference() const { return *at; }\n  void increment() { at = at->getNext(); }\n\npublic:\n  ConExtIterator() : at(0) {}\n\n  template <typename OtherTy>\n  ConExtIterator(const ConExtIterator<OtherTy>& o) : at(o.at) {}\n\n  explicit ConExtIterator(T* x) : at(x) {}\n};\n\ntemplate <typename T, bool concurrent>\nclass ConExtLinkedStack {\n  // fixme: deal with concurrent\n  substrate::PtrLock<T> head;\n\npublic:\n  typedef ConExtListNode<T> ListNode;\n\n  bool empty() const { return !head.getValue(); }\n\n  void push(T* C) {\n    T* oldhead(0);\n    do {\n      oldhead      = head.getValue();\n      C->getNext() = oldhead;\n    } while (!head.CAS(oldhead, C));\n  }\n\n  T* pop() {\n    // lock free Fast path (empty)\n    if (empty())\n      return 0;\n\n    // Disable CAS\n    head.lock();\n    T* C = head.getValue();\n    if (!C) {\n      head.unlock();\n      return 0;\n    }\n    head.unlock_and_set(C->getNext());\n    C->getNext() = 0;\n    return C;\n  }\n\n  //! 
iterators not safe with concurrent modifications\n  typedef T value_type;\n  typedef T& reference;\n  typedef ConExtIterator<T> iterator;\n  typedef ConExtIterator<const T> const_iterator;\n\n  iterator begin() { return iterator(head.getValue()); }\n  iterator end() { return iterator(); }\n\n  const_iterator begin() const { return const_iterator(head.getValue()); }\n  const_iterator end() const { return const_iterator(); }\n};\n\ntemplate <typename T, bool concurrent>\nclass ConExtLinkedQueue {\n  // Fixme: deal with concurrent\n  substrate::PtrLock<T> head;\n  T* tail;\n\npublic:\n  typedef ConExtListNode<T> ListNode;\n\n  ConExtLinkedQueue() : tail(0) {}\n\n  bool empty() const { return !tail; }\n\n  void push(T* C) {\n    head.lock();\n    // std::cerr << \"in(\" << C << \") \";\n    C->getNext() = 0;\n    if (tail) {\n      tail->getNext() = C;\n      tail            = C;\n      head.unlock();\n    } else {\n      assert(!head.getValue());\n      tail = C;\n      head.unlock_and_set(C);\n    }\n  }\n\n  T* pop() {\n    // lock free Fast path empty case\n    if (empty())\n      return 0;\n\n    head.lock();\n    T* C = head.getValue();\n    if (!C) {\n      head.unlock();\n      return 0;\n    }\n    if (tail == C) {\n      tail = 0;\n      assert(!C->getNext());\n      head.unlock_and_clear();\n    } else {\n      head.unlock_and_set(C->getNext());\n      C->getNext() = 0;\n    }\n    return C;\n  }\n\n  //! 
iterators not safe with concurrent modifications\n  typedef T value_type;\n  typedef T& reference;\n  typedef ConExtIterator<T> iterator;\n  typedef ConExtIterator<const T> const_iterator;\n\n  iterator begin() { return iterator(head.getValue()); }\n  iterator end() { return iterator(); }\n\n  const_iterator begin() const { return const_iterator(head.getValue()); }\n  const_iterator end() const { return const_iterator(); }\n};\n\ntemplate <typename T>\nstruct DummyIndexer {\n  unsigned operator()(const T&) { return 0; }\n};\n\n} // namespace worklists\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libgalois/src/Barrier.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/Barrier.h\"\n\n// anchor vtable\ngalois::substrate::Barrier::~Barrier() {}\n\n// galois::substrate::Barrier& galois::substrate::getSystemBarrier(unsigned\n// activeThreads) {\n//  return benchmarking::getTopoBarrier(activeThreads);\n//}\n\nstatic galois::substrate::internal::BarrierInstance<>* BI = nullptr;\n\nvoid galois::substrate::internal::setBarrierInstance(\n    internal::BarrierInstance<>* bi) {\n  GALOIS_ASSERT(!(bi && BI), \"Double initialization of BarrierInstance\");\n  BI = bi;\n}\n\ngalois::substrate::Barrier& galois::substrate::getBarrier(unsigned numT) {\n  GALOIS_ASSERT(BI, \"BarrierInstance not initialized\");\n  return BI->get(numT);\n}\n"
  },
  {
    "path": "libgalois/src/Barrier_Counting.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/substrate/Barrier.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n\nnamespace {\n\nclass CountingBarrier : public galois::substrate::Barrier {\n  std::atomic<unsigned> count;\n  std::atomic<bool> sense;\n  unsigned num;\n  std::vector<galois::substrate::CacheLineStorage<bool>> local_sense;\n\n  void _reinit(unsigned val) {\n    count = num = val;\n    sense       = false;\n    local_sense.resize(val);\n    for (unsigned i = 0; i < val; ++i)\n      local_sense.at(i).get() = false;\n  }\n\npublic:\n  CountingBarrier(unsigned int activeT) { _reinit(activeT); }\n\n  virtual ~CountingBarrier() {}\n\n  virtual void reinit(unsigned val) { _reinit(val); }\n\n  virtual void wait() {\n    bool& lsense =\n        
local_sense.at(galois::substrate::ThreadPool::getTID()).get();\n    lsense = !lsense;\n    if (--count == 0) {\n      count = num;\n      sense = lsense;\n    } else {\n      while (sense != lsense) {\n        galois::substrate::asmPause();\n      }\n    }\n  }\n\n  virtual const char* name() const { return \"CountingBarrier\"; }\n};\n\n} // namespace\n\nstd::unique_ptr<galois::substrate::Barrier>\ngalois::substrate::createCountingBarrier(unsigned activeThreads) {\n  return std::unique_ptr<Barrier>(new CountingBarrier(activeThreads));\n}\n"
  },
  {
    "path": "libgalois/src/Barrier_Dissemination.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/substrate/Barrier.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n\n#include <atomic>\n\nnamespace {\n\n#define FAST_LOG2(x)                                                           \\\n  (sizeof(unsigned long) * 8 - 1 - __builtin_clzl((unsigned long)(x)))\n#define FAST_LOG2_UP(x)                                                        \\\n  (((x) - (1 << FAST_LOG2(x))) ? 
FAST_LOG2(x) + 1 : FAST_LOG2(x))\n\nclass DisseminationBarrier : public galois::substrate::Barrier {\n\n  struct node {\n    std::atomic<int> flag[2];\n    node* partner;\n    node() : partner(nullptr) {}\n    node(const node& rhs) : partner(rhs.partner) {\n      flag[0] = rhs.flag[0].load();\n      flag[1] = rhs.flag[1].load();\n    }\n  };\n\n  struct LocalData {\n    int parity;\n    int sense;\n    node myflags[32];\n    // std::array<node, 32> myflags;\n  };\n\n  std::vector<galois::substrate::CacheLineStorage<LocalData>> nodes;\n  unsigned LogP;\n\n  void _reinit(unsigned P) {\n    LogP = FAST_LOG2_UP(P);\n    nodes.resize(P);\n    for (unsigned i = 0; i < P; ++i) {\n      LocalData& lhs = nodes.at(i).get();\n      lhs.parity     = 0;\n      lhs.sense      = 1;\n      for (unsigned j = 0; j < sizeof(lhs.myflags) / sizeof(*lhs.myflags); ++j)\n        lhs.myflags[j].flag[0] = lhs.myflags[j].flag[1] = 0;\n\n      int d = 1;\n      for (unsigned j = 0; j < LogP; ++j) {\n        LocalData& rhs         = nodes.at((i + d) % P).get();\n        lhs.myflags[j].partner = &rhs.myflags[j];\n        d *= 2;\n      }\n    }\n  }\n\npublic:\n  DisseminationBarrier(unsigned v) { _reinit(v); }\n\n  virtual void reinit(unsigned val) { _reinit(val); }\n\n  virtual void wait() {\n    auto& ld     = nodes.at(galois::substrate::ThreadPool::getTID()).get();\n    auto& sense  = ld.sense;\n    auto& parity = ld.parity;\n    for (unsigned r = 0; r < LogP; ++r) {\n      ld.myflags[r].partner->flag[parity] = sense;\n      while (ld.myflags[r].flag[parity] != sense) {\n        galois::substrate::asmPause();\n      }\n    }\n    if (parity == 1)\n      sense = 1 - ld.sense;\n    parity = 1 - parity;\n  }\n\n  virtual const char* name() const { return \"DisseminationBarrier\"; }\n};\n\n} // namespace\n\nstd::unique_ptr<galois::substrate::Barrier>\ngalois::substrate::createDisseminationBarrier(unsigned activeThreads) {\n  return std::unique_ptr<Barrier>(new 
DisseminationBarrier(activeThreads));\n}\n"
  },
  {
    "path": "libgalois/src/Barrier_MCS.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/substrate/Barrier.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n\n#include <atomic>\n\nnamespace {\n\nclass MCSBarrier : public galois::substrate::Barrier {\n  struct treenode {\n    // vpid is galois::runtime::LL::getTID()\n    std::atomic<bool>* parentpointer; // null for vpid == 0\n    std::atomic<bool>* childpointers[2];\n    bool havechild[4];\n\n    std::atomic<bool> childnotready[4];\n    std::atomic<bool> parentsense;\n    bool sense;\n    treenode() {}\n    treenode(const treenode& rhs)\n        : parentpointer(rhs.parentpointer), sense(rhs.sense) {\n      childpointers[0] = rhs.childpointers[0];\n      childpointers[1] = rhs.childpointers[1];\n      for (int i = 0; i < 4; ++i) {\n        havechild[i]     = rhs.havechild[i];\n        
childnotready[i] = rhs.childnotready[i].load();\n      }\n      parentsense = rhs.parentsense.load();\n    }\n  };\n\n  std::vector<galois::substrate::CacheLineStorage<treenode>> nodes;\n\n  void _reinit(unsigned P) {\n    nodes.resize(P);\n    for (unsigned i = 0; i < P; ++i) {\n      treenode& n   = nodes.at(i).get();\n      n.sense       = true;\n      n.parentsense = false;\n      for (int j = 0; j < 4; ++j)\n        n.childnotready[j] = n.havechild[j] = ((4 * i + j + 1) < P);\n      n.parentpointer =\n          (i == 0) ? 0\n                   : &nodes.at((i - 1) / 4).get().childnotready[(i - 1) % 4];\n      n.childpointers[0] =\n          ((2 * i + 1) >= P) ? 0 : &nodes.at(2 * i + 1).get().parentsense;\n      n.childpointers[1] =\n          ((2 * i + 2) >= P) ? 0 : &nodes.at(2 * i + 2).get().parentsense;\n    }\n  }\n\npublic:\n  MCSBarrier(unsigned v) { _reinit(v); }\n\n  virtual void reinit(unsigned val) { _reinit(val); }\n\n  virtual void wait() {\n    treenode& n = nodes.at(galois::substrate::ThreadPool::getTID()).get();\n    while (n.childnotready[0] || n.childnotready[1] || n.childnotready[2] ||\n           n.childnotready[3]) {\n      galois::substrate::asmPause();\n    }\n    for (int i = 0; i < 4; ++i)\n      n.childnotready[i] = n.havechild[i];\n    if (n.parentpointer) {\n      // FIXME: make sure the compiler doesn't do a RMW because of the as-if\n      // rule\n      *n.parentpointer = false;\n      while (n.parentsense != n.sense) {\n        galois::substrate::asmPause();\n      }\n    }\n    // signal children in wakeup tree\n    if (n.childpointers[0])\n      *n.childpointers[0] = n.sense;\n    if (n.childpointers[1])\n      *n.childpointers[1] = n.sense;\n    n.sense = !n.sense;\n  }\n\n  virtual const char* name() const { return \"MCSBarrier\"; }\n};\n\n} // namespace\n\nstd::unique_ptr<galois::substrate::Barrier>\ngalois::substrate::createMCSBarrier(unsigned activeThreads) {\n  return std::unique_ptr<Barrier>(new 
MCSBarrier(activeThreads));\n}\n"
  },
  {
    "path": "libgalois/src/Barrier_Pthread.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/Barrier.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n#include \"galois/gIO.h\"\n\n#if defined(GALOIS_HAVE_PTHREAD)\n\n#include <unistd.h>\n#include <pthread.h>\n\n#endif\n\n#if defined(GALOIS_HAVE_PTHREAD) && defined(_POSIX_BARRIERS) &&                \\\n    (_POSIX_BARRIERS > 0)\n\nnamespace {\n\nclass PthreadBarrier : public galois::substrate::Barrier {\n  pthread_barrier_t bar;\n\npublic:\n  PthreadBarrier() {\n    int err = 0;\n    if ((err = pthread_barrier_init(&bar, 0, ~0)))\n      GALOIS_DIE(\"pthread \", err);\n  }\n\n  PthreadBarrier(unsigned int v) {\n    int err = 0;\n    if ((err = pthread_barrier_init(&bar, 0, v)))\n      GALOIS_DIE(\"pthread \", err);\n  }\n\n  virtual ~PthreadBarrier() {\n    int err = 0;\n    if ((err = pthread_barrier_destroy(&bar)))\n      
GALOIS_DIE(\"pthread \", err);\n  }\n\n  virtual void reinit(unsigned val) {\n    int err = 0;\n    if ((err = pthread_barrier_destroy(&bar)))\n      GALOIS_DIE(\"pthread \", err);\n    if ((err = pthread_barrier_init(&bar, 0, val)))\n      GALOIS_DIE(\"pthread \", err);\n  }\n\n  virtual void wait() {\n    int rc = pthread_barrier_wait(&bar);\n    if (rc && rc != PTHREAD_BARRIER_SERIAL_THREAD)\n      GALOIS_DIE(\"pthread \", rc);\n  }\n\n  virtual const char* name() const { return \"PthreadBarrier\"; }\n};\n\n} // namespace\n\nstd::unique_ptr<galois::substrate::Barrier>\ngalois::substrate::createPthreadBarrier(unsigned activeThreads) {\n  return std::unique_ptr<Barrier>(new PthreadBarrier(activeThreads));\n}\n\n#else\n\nstd::unique_ptr<galois::substrate::Barrier>\ngalois::substrate::createPthreadBarrier(unsigned) {\n  return std::unique_ptr<Barrier>(nullptr);\n}\n\n#endif\n"
  },
  {
    "path": "libgalois/src/Barrier_Simple.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/Barrier.h\"\n#include \"galois/substrate/ThreadPool.h\"\n\n#include <mutex>\n#include <condition_variable>\n\nnamespace {\n\nclass OneWayBarrier : public galois::substrate::Barrier {\n  std::mutex lock;\n  std::condition_variable cond;\n  unsigned count;\n  unsigned total;\n\npublic:\n  OneWayBarrier(unsigned p) { reinit(p); }\n\n  virtual ~OneWayBarrier() {}\n\n  virtual void reinit(unsigned val) {\n    count = 0;\n    total = val;\n  }\n\n  virtual void wait() {\n    std::unique_lock<std::mutex> tmp(lock);\n    count += 1;\n    cond.wait(tmp, [this]() { return count >= total; });\n    cond.notify_all();\n  }\n\n  virtual const char* name() const { return \"OneWayBarrier\"; }\n};\n\nclass SimpleBarrier : public galois::substrate::Barrier {\n  OneWayBarrier barrier1;\n  OneWayBarrier 
barrier2;\n  unsigned total;\n\npublic:\n  SimpleBarrier(unsigned p) : barrier1(p), barrier2(p), total(p) {}\n\n  virtual ~SimpleBarrier() {}\n\n  virtual void reinit(unsigned val) {\n    total = val;\n    barrier1.reinit(val);\n    barrier2.reinit(val);\n  }\n\n  virtual void wait() {\n    barrier1.wait();\n    if (galois::substrate::ThreadPool::getTID() == 0)\n      barrier1.reinit(total);\n    barrier2.wait();\n    if (galois::substrate::ThreadPool::getTID() == 0)\n      barrier2.reinit(total);\n  }\n\n  virtual const char* name() const { return \"SimpleBarrier\"; }\n};\n\n} // end anonymous namespace\n\nstd::unique_ptr<galois::substrate::Barrier>\ngalois::substrate::createSimpleBarrier(unsigned int v) {\n  return std::unique_ptr<Barrier>(new SimpleBarrier(v));\n}\n"
  },
  {
    "path": "libgalois/src/Barrier_Topo.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/Barrier.h\"\n#include \"galois/substrate/CompilerSpecific.h\"\n\n#include <atomic>\n\nnamespace {\n\nclass TopoBarrier : public galois::substrate::Barrier {\n  struct treenode {\n    // vpid is galois::runtime::LL::getTID()\n\n    // socket binary tree\n    treenode* parentpointer; // null of vpid == 0\n    treenode* childpointers[2];\n\n    // waiting values:\n    unsigned havechild;\n    std::atomic<unsigned> childnotready;\n\n    // signal values\n    std::atomic<unsigned> parentsense;\n  };\n\n  galois::substrate::PerSocketStorage<treenode> nodes;\n  galois::substrate::PerThreadStorage<unsigned> sense;\n\n  void _reinit(unsigned P) {\n    auto& tp      = galois::substrate::getThreadPool();\n    unsigned pkgs = tp.getCumulativeMaxSocket(P 
- 1) + 1;\n    for (unsigned i = 0; i < pkgs; ++i) {\n      treenode& n     = *nodes.getRemoteByPkg(i);\n      n.childnotready = 0;\n      n.havechild     = 0;\n      for (int j = 0; j < 4; ++j) {\n        if ((4 * i + j + 1) < pkgs) {\n          ++n.childnotready;\n          ++n.havechild;\n        }\n      }\n      for (unsigned j = 0; j < P; ++j) {\n        if (tp.getSocket(j) == i && !tp.isLeader(j)) {\n          ++n.childnotready;\n          ++n.havechild;\n        }\n      }\n      n.parentpointer = (i == 0) ? 0 : nodes.getRemoteByPkg((i - 1) / 4);\n      n.childpointers[0] =\n          ((2 * i + 1) >= pkgs) ? 0 : nodes.getRemoteByPkg(2 * i + 1);\n      n.childpointers[1] =\n          ((2 * i + 2) >= pkgs) ? 0 : nodes.getRemoteByPkg(2 * i + 2);\n      n.parentsense = 0;\n    }\n    for (unsigned i = 0; i < P; ++i)\n      *sense.getRemote(i) = 1;\n  }\n\npublic:\n  TopoBarrier(unsigned v) { _reinit(v); }\n\n  // not safe if any thread is in wait\n  virtual void reinit(unsigned val) { _reinit(val); }\n\n  virtual void wait() {\n    unsigned id = galois::substrate::ThreadPool::getTID();\n    treenode& n = *nodes.getLocal();\n    unsigned& s = *sense.getLocal();\n    bool leader = galois::substrate::ThreadPool::isLeader();\n    // completion tree\n    if (leader) {\n      while (n.childnotready) {\n        galois::substrate::asmPause();\n      }\n      n.childnotready = n.havechild;\n      if (n.parentpointer) {\n        --n.parentpointer->childnotready;\n      }\n    } else {\n      --n.childnotready;\n    }\n\n    // wait for signal\n    if (id != 0) {\n      while (n.parentsense != s) {\n        galois::substrate::asmPause();\n      }\n    }\n\n    // signal children in wakeup tree\n    if (leader) {\n      if (n.childpointers[0])\n        n.childpointers[0]->parentsense = s;\n      if (n.childpointers[1])\n        n.childpointers[1]->parentsense = s;\n      if (id == 0)\n        n.parentsense = s;\n    }\n    ++s;\n  }\n\n  virtual const char* name() const { 
return \"TopoBarrier\"; }\n};\n\n} // namespace\n\nstd::unique_ptr<galois::substrate::Barrier>\ngalois::substrate::createTopoBarrier(unsigned activeThreads) {\n  return std::unique_ptr<Barrier>(new TopoBarrier(activeThreads));\n}\n"
  },
  {
    "path": "libgalois/src/Context.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/Context.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/substrate/CacheLineStorage.h\"\n\n#include <stdio.h>\n\n//! 
Global thread context for each active thread\nstatic thread_local galois::runtime::SimpleRuntimeContext* thread_ctx = 0;\n\nthread_local jmp_buf galois::runtime::execFrame;\n\nvoid galois::runtime::setThreadContext(\n    galois::runtime::SimpleRuntimeContext* ctx) {\n  thread_ctx = ctx;\n}\n\ngalois::runtime::SimpleRuntimeContext* galois::runtime::getThreadContext() {\n  return thread_ctx;\n}\n\n////////////////////////////////////////////////////////////////////////////////\n// LockManagerBase & SimpleRuntimeContext\n////////////////////////////////////////////////////////////////////////////////\n\ngalois::runtime::LockManagerBase::AcquireStatus\ngalois::runtime::LockManagerBase::tryAcquire(\n    galois::runtime::Lockable* lockable) {\n  assert(lockable);\n  // XXX(ddn): Hand inlining this code makes a difference on\n  // delaunaytriangulation (GCC 4.7.2)\n#if 0\n  if (tryLock(lockable)) {\n    assert(!getOwner(lockable));\n    setOwner(lockable);\n    return NEW_OWNER;\n#else\n  if (lockable->owner.try_lock()) {\n    lockable->owner.setValue(this);\n    return NEW_OWNER;\n#endif\n}\nelse if (getOwner(lockable) == this) {\n  return ALREADY_OWNER;\n}\nreturn FAIL;\n}\n\nvoid galois::runtime::SimpleRuntimeContext::release(\n    galois::runtime::Lockable* lockable) {\n  assert(lockable);\n  // The deterministic executor, for instance, steals locks from other\n  // iterations\n  assert(customAcquire || getOwner(lockable) == this);\n  assert(!lockable->next);\n  lockable->owner.unlock_and_clear();\n}\n\nunsigned galois::runtime::SimpleRuntimeContext::commitIteration() {\n  unsigned numLocks = 0;\n  while (locks) {\n    // ORDER MATTERS!\n    Lockable* lockable = locks;\n    locks              = lockable->next;\n    lockable->next     = 0;\n    substrate::compilerBarrier();\n    release(lockable);\n    ++numLocks;\n  }\n\n  return numLocks;\n}\n\nunsigned galois::runtime::SimpleRuntimeContext::cancelIteration() {\n  return commitIteration();\n}\n\nvoid 
galois::runtime::SimpleRuntimeContext::subAcquire(\n    galois::runtime::Lockable*, galois::MethodFlag) {\n  GALOIS_DIE(\"unreachable\");\n}\n"
  },
  {
    "path": "libgalois/src/Deterministic.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/Executor_Deterministic.h\"\n\nthread_local galois::runtime::SizedHeapFactory::SizedHeap*\n    galois::runtime::internal::dagListHeap;\n"
  },
  {
    "path": "libgalois/src/DynamicBitset.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file DynamicBitset.cpp\n *\n * The entire implementation of the DynamicBitSet class has been incorporated\n * into DynamicBitset.h.\n */\n"
  },
  {
    "path": "libgalois/src/EnvCheck.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/EnvCheck.h\"\n\n#include <cstdlib>\n\nbool galois::substrate::EnvCheck(const char* varName) {\n  if (std::getenv(varName))\n    return true;\n  return false;\n}\n\nbool galois::substrate::EnvCheck(const std::string& varName) {\n  return galois::substrate::EnvCheck(varName.c_str());\n}\n"
  },
  {
    "path": "libgalois/src/FileGraph.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file FileGraph.cpp\n *\n * Contains FileGraph.h implementations + other static helper functions\n * for FileGraph.\n */\n\n#include \"galois/gIO.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/substrate/PageAlloc.h\"\n\n#include <cassert>\n#include <fstream>\n\n#include <sys/stat.h>\n#include <sys/types.h>\n#include <fcntl.h>\n#include <unistd.h>\n\n/**\n * Performs an mmap of all provided arguments.\n */\nnamespace galois {\nnamespace graphs {\n// Graph file format:\n// version (1 or 2) {uint64_t LE}\n// EdgeType size {uint64_t LE}\n// numNodes {uint64_t LE}\n// numEdges {uint64_t LE}\n// outindexs[numNodes] {uint64_t LE} (outindex[nodeid] is index of first edge\n// for nodeid + 1 (end iterator).  
node 0 has an implicit start iterator of 0.\n// outedges[numEdges] {uint32_t LE or uint64_t LE for ver == 2}\n// potential padding (32bit max) to Re-Align to 64bits\n// EdgeType[numEdges] {EdgeType size}\n\nFileGraph::FileGraph()\n    : sizeofEdge(0), numNodes(0), numEdges(0), outIdx(0), outs(0), edgeData(0),\n      graphVersion(-1), nodeOffset(0), edgeOffset(0) {}\n\nFileGraph::FileGraph(const FileGraph& o) {\n  fromArrays(o.outIdx, o.numNodes, o.outs, o.numEdges, o.edgeData, o.sizeofEdge,\n             o.nodeOffset, o.edgeOffset, true, o.graphVersion);\n}\n\nFileGraph& FileGraph::operator=(const FileGraph& other) {\n  if (this != &other) {\n    FileGraph tmp(other);\n    *this = std::move(tmp);\n  }\n  return *this;\n}\n\nFileGraph::FileGraph(FileGraph&& other)\n    : sizeofEdge(0), numNodes(0), numEdges(0), outIdx(0), outs(0), edgeData(0),\n      graphVersion(-1), nodeOffset(0), edgeOffset(0) {\n  move_assign(std::move(other));\n}\n\nFileGraph& FileGraph::operator=(FileGraph&& other) {\n  move_assign(std::move(other));\n  return *this;\n}\n\nFileGraph::~FileGraph() {\n  for (auto& m : mappings)\n    munmap(m.ptr, m.len);\n  for (auto& fd : fds)\n    close(fd);\n}\n\nvoid FileGraph::move_assign(FileGraph&& o) {\n  std::swap(mappings, o.mappings);\n  std::swap(fds, o.fds);\n  std::swap(sizeofEdge, o.sizeofEdge);\n  std::swap(numNodes, o.numNodes);\n  std::swap(numEdges, o.numEdges);\n  std::swap(outIdx, o.outIdx);\n  std::swap(outs, o.outs);\n  std::swap(edgeData, o.edgeData);\n  std::swap(graphVersion, o.graphVersion);\n  std::swap(nodeOffset, o.nodeOffset);\n  std::swap(edgeOffset, o.edgeOffset);\n}\n\nvoid FileGraph::fromMem(void* m, uint64_t node_offset, uint64_t edge_offset,\n                        uint64_t lenlimit) {\n  uint64_t* fptr = (uint64_t*)m;\n  graphVersion   = convert_le64toh(*fptr++);\n\n  if (graphVersion != 1 && graphVersion != 2) {\n    GALOIS_DIE(\"unknown file version \", graphVersion);\n  }\n\n  sizeofEdge = convert_le64toh(*fptr++);\n  
numNodes   = convert_le64toh(*fptr++);\n  numEdges   = convert_le64toh(*fptr++);\n  nodeOffset = node_offset;\n  edgeOffset = edge_offset;\n  outIdx     = fptr;\n\n  // move over to outgoing edge data and save it\n  fptr += numNodes;\n  outs = (void*)fptr;\n\n  // skip memory differently depending on file version\n  if (graphVersion == 1) {\n    uint32_t* fptr32 = (uint32_t*)fptr;\n    fptr32 += numEdges + numEdges % 2;\n    if (!lenlimit || lenlimit > numEdges + ((char*)fptr32 - (char*)m))\n      edgeData = (char*)fptr32;\n    else\n      edgeData = 0;\n  } else {\n    uint64_t* fptr64 = (uint64_t*)fptr;\n    fptr64 += numEdges + numEdges % 2;\n\n    if (!lenlimit || lenlimit > numEdges + ((char*)fptr64 - (char*)m))\n      edgeData = (char*)fptr64;\n    else\n      edgeData = 0;\n  }\n}\n\n/**\n * Calculate the total size needed for all data.\n *\n * @param numNodes number of nodes to make space for\n * @param numEdges number of edges to make space for\n * @param sizeofEdgeData the size taken by 1 edge for its edge data\n * @param graphVersion the graph version of the file being loaded (determines\n * the size of edge ids)\n *\n * @returns Total size in bytes needed to store graph data\n */\nstatic size_t rawBlockSize(size_t numNodes, size_t numEdges,\n                           size_t sizeofEdgeData, int graphVersion) {\n  // header size: version, sizeof_edge_data, numNodes, numEdges, all uint64_t\n  size_t bytes = sizeof(uint64_t) * 4;\n\n  // node data\n  bytes += sizeof(uint64_t) * numNodes;\n\n  if (graphVersion == 1) {\n    bytes += sizeof(uint32_t) * numEdges;\n\n    if (numEdges % 2)\n      bytes += sizeof(uint32_t); // padding\n  } else if (graphVersion == 2) {\n    bytes += sizeof(uint64_t) * numEdges;\n    // no padding necessary in version 2 TODO verify this\n  } else {\n    GALOIS_DIE(\"unknown file version: \", graphVersion);\n  }\n\n  bytes += sizeofEdgeData * numEdges;\n  return bytes;\n}\n\nvoid* FileGraph::fromGraph(FileGraph& g, size_t 
sizeof_edge_data) {\n  return fromArrays(g.outIdx, g.numNodes, g.outs, g.numEdges, g.edgeData,\n                    sizeof_edge_data, g.nodeOffset, g.edgeOffset, true,\n                    g.graphVersion);\n}\n\nvoid* FileGraph::fromArrays(uint64_t* out_idx, uint64_t num_nodes, void* outs,\n                            uint64_t num_edges, char* edge_data,\n                            size_t sizeof_edge_data, uint64_t node_offset,\n                            uint64_t edge_offset, bool converted,\n                            int oGraphVersion) {\n  size_t bytes =\n      rawBlockSize(num_nodes, num_edges, sizeof_edge_data, oGraphVersion);\n\n  char* base = (char*)mmap(nullptr, bytes, PROT_READ | PROT_WRITE,\n                           _MAP_ANON | MAP_PRIVATE, -1, 0);\n  if (base == MAP_FAILED)\n    GALOIS_SYS_DIE(\"failed allocating graph\");\n\n  mappings.push_back({base, bytes});\n\n  uint64_t* fptr = (uint64_t*)base;\n  // set header info\n  if (oGraphVersion == 1) {\n    *fptr++ = convert_htole64(1);\n  } else if (oGraphVersion == 2) {\n    *fptr++ = convert_htole64(2);\n  } else {\n    GALOIS_DIE(\"unknown file version: \", oGraphVersion);\n  }\n  *fptr++ = convert_htole64(sizeof_edge_data);\n  *fptr++ = convert_htole64(num_nodes);\n  *fptr++ = convert_htole64(num_edges);\n\n  // copy node data\n  if (converted) {\n    memcpy(fptr, out_idx, sizeof(*out_idx) * num_nodes);\n    fptr += num_nodes;\n  } else {\n    for (size_t i = 0; i < num_nodes; ++i)\n      *fptr++ = convert_htole64(out_idx[i]);\n  }\n\n  // TODO verify\n  char* fptr0;\n\n  // copy edge destinations\n  if (oGraphVersion == 1) {\n    uint32_t* fptr32 = (uint32_t*)fptr;\n\n    if (converted) {\n      // memcpy(fptr32, outs, sizeof(*outs) * num_edges);\n      memcpy(fptr32, outs, sizeof(uint32_t) * num_edges);\n      fptr32 += num_edges;\n    } else {\n      for (size_t i = 0; i < num_edges; ++i)\n        *fptr32++ = convert_htole32(((uint32_t*)outs)[i]);\n    }\n\n    // padding\n    if (num_edges % 
2)\n      fptr32 += 1;\n\n    fptr0 = (char*)fptr32;\n  } else {\n    // should be version 2; otherwise would have died above\n    // note fptr is already typed as uint64_t*...\n    if (converted) {\n      memcpy(fptr, outs, sizeof(uint64_t) * num_edges);\n      fptr += num_edges;\n    } else {\n      for (size_t i = 0; i < num_edges; ++i)\n        *fptr++ = convert_htole64(((uint64_t*)outs)[i]);\n    }\n\n    // padding\n    if (num_edges % 2)\n      fptr += 1;\n\n    fptr0 = (char*)fptr;\n  }\n\n  // copy edge data if necessary\n  if (edge_data)\n    memcpy(fptr0, edge_data, sizeof_edge_data * num_edges);\n\n  // \"load\" filegraph from our constructed base pointer\n  fromMem(base, node_offset, edge_offset, 0);\n  // graph version should be set in from mem\n\n  assert(graphVersion == oGraphVersion);\n\n  return edgeData;\n}\n\nvoid FileGraph::fromFile(const std::string& filename) {\n  int fd = open(filename.c_str(), O_RDONLY);\n  if (fd == -1)\n    GALOIS_SYS_DIE(\"failed opening \", \"'\", filename, \"'\");\n  fds.push_back(fd);\n\n  struct stat buf;\n  if (fstat(fd, &buf) == -1)\n    GALOIS_SYS_DIE(\"failed reading \", \"'\", filename, \"'\");\n\n  // mmap file, then load from mem using fromMem function\n  int _MAP_BASE = MAP_PRIVATE;\n#ifdef MAP_POPULATE\n  _MAP_BASE |= MAP_POPULATE;\n#endif\n  void* base = mmap(nullptr, buf.st_size, PROT_READ, _MAP_BASE, fd, 0);\n  if (base == MAP_FAILED)\n    GALOIS_SYS_DIE(\"failed reading \", \"'\", filename, \"'\");\n  mappings.push_back({base, static_cast<size_t>(buf.st_size)});\n\n  fromMem(base, 0, 0, buf.st_size);\n}\n\n/**\n * Load graph data from a given offset\n *\n * @param fd File descriptor to load\n * @param offset Offset into file to load\n * @param length Amount of the file to load\n * @param mappings Mappings structure that tracks the things we have mmap'd\n * @returns Pointer to mmap'd location in memory\n */\ntemplate <typename Mappings>\nstatic void* loadFromOffset(int fd, offset_t offset, size_t 
length,\n                            Mappings& mappings) {\n  // mmap needs page-aligned offsets\n  offset_t aligned =\n      offset & ~static_cast<offset_t>(galois::substrate::allocSize() - 1);\n  offset_t alignment = offset - aligned;\n  length += alignment;\n  void* base = mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd, aligned);\n  if (base == MAP_FAILED)\n    GALOIS_SYS_DIE(\"failed allocating for fd \", fd);\n  mappings.push_back({base, length});\n  return static_cast<char*>(base) + alignment;\n}\n\n/**\n * Makes multiple threads page in specific portions of a buffer of memory.\n * Useful for NUMA-aware architectures.\n *\n * @param ptr buffer to page in\n * @param length amount of data to page in\n * @param hugePageSize size of a huge page (what is being paged in)\n * @param numThreads number of threads to use when paging in memory\n */\nstatic void pageInterleaved(void* ptr, uint64_t length, uint32_t hugePageSize,\n                            unsigned int numThreads) {\n  galois::substrate::getThreadPool().run(\n      numThreads, [ptr, length, hugePageSize, numThreads]() {\n        auto myID = galois::substrate::ThreadPool::getTID();\n\n        volatile char* cptr = reinterpret_cast<volatile char*>(ptr);\n\n        // round robin page distribution among threads (e.g. 
thread 0 gets\n        // a page, then thread 1, then thread n, then back to thread 0 and\n        // so on until the end of the region)\n        for (size_t x = hugePageSize * myID; x < length;\n             x += hugePageSize * numThreads)\n          // this should do an access\n          cptr[x];\n      });\n}\n\nvoid FileGraph::partFromFile(const std::string& filename, NodeRange nrange,\n                             EdgeRange erange, bool numaMap) {\n  int fd = open(filename.c_str(), O_RDONLY);\n  if (fd == -1)\n    GALOIS_SYS_DIE(\"failed opening \", \"'\", filename, \"'\");\n  fds.push_back(fd);\n\n  size_t headerSize = 4 * sizeof(uint64_t);\n  void* base        = mmap(nullptr, headerSize, PROT_READ, MAP_PRIVATE, fd, 0);\n  if (base == MAP_FAILED)\n    GALOIS_SYS_DIE(\"failed reading \", \"'\", filename, \"'\");\n  mappings.push_back({base, headerSize});\n\n  // Read metadata of whole graph\n  fromMem(base, *nrange.first, *erange.first, 0);\n\n  // at this point we should have access to graphVersion...\n\n  // Adjust metadata to correspond to part\n  uint64_t partNumNodes = std::distance(nrange.first, nrange.second);\n  uint64_t partNumEdges = std::distance(erange.first, erange.second);\n  size_t length         = partNumNodes * sizeof(uint64_t);\n  offset_t offset       = headerSize + nodeOffset * sizeof(uint64_t);\n  outIdx = static_cast<uint64_t*>(loadFromOffset(fd, offset, length, mappings));\n\n  // TODO verify correctness\n  if (graphVersion == 1) {\n    length = partNumEdges * sizeof(uint32_t);\n    offset = headerSize + numNodes * sizeof(uint64_t) +\n             edgeOffset * sizeof(uint32_t);\n    outs = loadFromOffset(fd, offset, length, mappings);\n  } else if (graphVersion == 2) {\n    length = partNumEdges * sizeof(uint64_t);\n    offset = headerSize + numNodes * sizeof(uint64_t) +\n             edgeOffset * sizeof(uint64_t);\n    outs = loadFromOffset(fd, offset, length, mappings);\n  } else {\n    GALOIS_DIE(\"unknown file version: \", 
graphVersion);\n  }\n\n  edgeData = 0;\n  if (sizeofEdge) {\n    length = partNumEdges * sizeofEdge;\n    offset = rawBlockSize(numNodes, numEdges, 0, graphVersion) +\n             sizeofEdge * edgeOffset;\n    edgeData = static_cast<char*>(loadFromOffset(fd, offset, length, mappings));\n  }\n\n  numNodes = partNumNodes;\n  numEdges = partNumEdges;\n\n  // do interleaved numa allocation with current number of threads\n  if (numaMap) {\n    unsigned int numThreads   = galois::runtime::activeThreads;\n    const size_t hugePageSize = 2 * 1024 * 1024; // 2MB\n\n    void* ptr;\n\n    // doesn't really matter if only 1 thread; i.e. do nothing in\n    // that case\n    if (numThreads != 1) {\n      // node pointer to edge dest array\n      ptr    = (void*)outIdx;\n      length = numNodes * sizeof(uint64_t);\n\n      pageInterleaved(ptr, length, hugePageSize, numThreads);\n\n      // edge dest array\n      ptr = (void*)outs;\n      if (graphVersion == 1) {\n        length = numEdges * sizeof(uint32_t);\n      } else {\n        // v2\n        length = numEdges * sizeof(uint64_t);\n      }\n\n      pageInterleaved(ptr, length, hugePageSize, numThreads);\n\n      // edge data (if it exists)\n      if (sizeofEdge) {\n        ptr    = (void*)edgeData;\n        length = numEdges * sizeofEdge;\n\n        pageInterleaved(ptr, length, hugePageSize, numThreads);\n      }\n    }\n  }\n}\n\nsize_t FileGraph::findIndex(size_t nodeSize, size_t edgeSize, size_t targetSize,\n                            size_t lb, size_t ub) {\n  while (lb < ub) {\n    size_t mid = lb + (ub - lb) / 2;\n    // edge begin assumes global id, so add nodeoffset to it as we work with\n    // local ids\n    size_t num_edges = *edge_begin(mid + nodeOffset);\n    size_t size      = (num_edges * edgeSize) + (mid * nodeSize);\n    if (size < targetSize)\n      lb = mid + 1;\n    else\n      ub = mid;\n  }\n  return lb;\n}\n\nauto FileGraph::divideByNode(size_t nodeSize, size_t edgeSize, size_t id,\n                    
         size_t total) -> GraphRange {\n  std::vector<unsigned> dummy_scale_factor; // dummy passed in to function call\n\n  return galois::graphs::divideNodesBinarySearch(\n      numNodes, numEdges, nodeSize, edgeSize, id, total, outIdx,\n      dummy_scale_factor, edgeOffset);\n}\n\nauto FileGraph::divideByEdge(size_t, size_t, size_t id, size_t total)\n    -> std::pair<NodeRange, EdgeRange> {\n  size_t size  = numEdges;\n  size_t block = (size + total - 1) / total;\n  size_t aa    = block * id;\n  size_t ea    = std::min(block * (id + 1), static_cast<size_t>(numEdges));\n\n  // note these use local node ids (numNodes is made local by partFromFile if\n  // it was called)\n  size_t bb = findIndex(0, 1, aa, 0, numNodes);\n  size_t eb = findIndex(0, 1, ea, bb, numNodes);\n\n  if (true) {\n    galois::gInfo(\"(\", id, \"/\", total, \") [\", bb, \" \", eb, \" \", eb - bb, \"], [\",\n                  aa, \" \", ea, \" \", ea - aa, \"]\");\n  }\n\n  return GraphRange(NodeRange(iterator(bb), iterator(eb)),\n                    EdgeRange(edge_iterator(aa), edge_iterator(ea)));\n}\n\nvoid FileGraph::toFile(const std::string& file) {\n  // FIXME handle files with multiple mappings\n  GALOIS_ASSERT(mappings.size() == 1);\n\n  ssize_t retval;\n  mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;\n  int fd      = open(file.c_str(), O_WRONLY | O_CREAT | O_TRUNC, mode);\n  mapping mm  = mappings.back();\n  mappings.pop_back();\n\n  size_t total = mm.len;\n  char* ptr    = (char*)mm.ptr;\n  while (total) {\n    retval = write(fd, ptr, total);\n    if (retval == -1) {\n      GALOIS_SYS_DIE(\"failed writing to \", \"'\", file, \"'\");\n    } else if (retval == 0) {\n      GALOIS_DIE(\"ran out of space writing to \", \"'\", file, \"'\");\n    }\n    total -= retval;\n    ptr += retval;\n  }\n  close(fd);\n}\n\nuint64_t FileGraph::getEdgeIdx(GraphNode src, GraphNode dst) {\n  // loop through all neighbors of src, looking for a match with dst\n  if (graphVersion == 1) {\n    for 
(auto ii = (uint32_t*)raw_neighbor_begin(src),\n              ei = (uint32_t*)raw_neighbor_end(src);\n         ii != ei; ++ii) {\n      if (convert_le32toh(*ii) == dst)\n        return std::distance((uint32_t*)outs, ii);\n    }\n\n    return ~static_cast<uint64_t>(0);\n  } else if (graphVersion == 2) {\n    for (auto ii = (uint64_t*)raw_neighbor_begin(src),\n              ei = (uint64_t*)raw_neighbor_end(src);\n         ii != ei; ++ii) {\n      if (convert_le64toh(*ii) == dst)\n        return std::distance((uint64_t*)outs, ii);\n    }\n\n    return ~static_cast<uint64_t>(0);\n  } else {\n    GALOIS_DIE(\"unknown file version: \", graphVersion);\n  }\n}\n\n/**\n * Touch pages of a buffer to page them in.\n *\n * @param buf Buffer to touch\n * @param len Length to touch\n * @param stride How much to stride when touching pages\n */\nstatic void pageInReadOnly(void* buf, size_t len, size_t stride) {\n  volatile char* ptr = reinterpret_cast<volatile char*>(buf);\n  for (size_t i = 0; i < len; i += stride)\n    ptr[i];\n}\n\nvoid FileGraph::pageInByNode(size_t id, size_t total, size_t sizeofEdgeData) {\n  size_t edgeSize = 0;\n\n  // different graph version have different edge sizes\n  if (graphVersion == 1) {\n    edgeSize = sizeof(uint32_t);\n  } else if (graphVersion == 2) {\n    edgeSize = sizeof(uint64_t);\n  } else {\n    GALOIS_DIE(\"unknown file version at pageInByNode\", graphVersion);\n  }\n\n  // determine which nodes this id is responsible for paging in\n  auto r = divideByNode(sizeof(uint64_t), sizeofEdgeData + edgeSize, id, total)\n               .first;\n\n  // get begin edge and end edge locations\n  // add node offset because edge_begin assumes a global id while divideByNode\n  // returns LOCAL ids (same below with edge_end)\n  size_t ebegin = *edge_begin(*r.first + nodeOffset);\n  size_t eend   = ebegin;\n\n  if (r.first != r.second)\n    eend = *edge_end(*r.second - 1 + nodeOffset);\n\n  // page in the outIdx array\n  pageInReadOnly(outIdx + 
*r.first,\n                 std::distance(r.first, r.second) * sizeof(*outIdx),\n                 runtime::pagePoolSize());\n\n  // page in outs array\n  if (graphVersion == 1) {\n    pageInReadOnly((uint32_t*)outs + ebegin, (eend - ebegin) * sizeof(uint32_t),\n                   runtime::pagePoolSize());\n  } else {\n    pageInReadOnly((uint64_t*)outs + ebegin, (eend - ebegin) * sizeof(uint64_t),\n                   runtime::pagePoolSize());\n  }\n\n  // page in edge data\n  pageInReadOnly(edgeData + ebegin * sizeofEdgeData,\n                 (eend - ebegin) * sizeofEdgeData, runtime::pagePoolSize());\n}\n\nvoid* FileGraph::raw_neighbor_begin(GraphNode N) {\n  if (graphVersion == 1) {\n    return &(((uint32_t*)outs)[*edge_begin(N)]);\n  } else if (graphVersion == 2) {\n    return &(((uint64_t*)outs)[*edge_begin(N)]);\n  } else {\n    GALOIS_DIE(\"unknown file version: \", graphVersion);\n  }\n\n  return nullptr;\n}\n\nvoid* FileGraph::raw_neighbor_end(GraphNode N) {\n  if (graphVersion == 1) {\n    return &(((uint32_t*)outs)[*edge_end(N)]);\n  } else if (graphVersion == 2) {\n    return &(((uint64_t*)outs)[*edge_end(N)]);\n  } else {\n    GALOIS_DIE(\"unknown file version: \", graphVersion);\n  }\n\n  return nullptr;\n}\n\nFileGraph::edge_iterator FileGraph::edge_begin(GraphNode N) {\n  size_t idx = 0;\n  if (N > nodeOffset) {\n    numBytesReadIndex += 8;\n    idx = std::min(convert_le64toh(outIdx[N - 1 - nodeOffset]),\n                   static_cast<uint64_t>(edgeOffset + numEdges)) -\n          edgeOffset;\n  } else if (N != nodeOffset) {\n    printf(\"WARNING: reading node out of bounds for this file graph\\n\");\n    // TODO die here?\n  }\n  return edge_iterator(idx);\n}\n\nFileGraph::edge_iterator FileGraph::edge_end(GraphNode N) {\n  size_t idx = 0;\n  if (N >= nodeOffset) {\n    numBytesReadIndex += 8;\n    idx = std::min(convert_le64toh(outIdx[N - nodeOffset]),\n                   static_cast<uint64_t>(edgeOffset + numEdges)) -\n          edgeOffset;\n  } 
else {\n    printf(\"WARNING: reading node out of bounds for this file graph\\n\");\n    // TODO die here?\n  }\n  return edge_iterator(idx);\n}\n\nFileGraph::GraphNode FileGraph::getEdgeDst(edge_iterator it) {\n  if (graphVersion == 1) {\n    numBytesReadEdgeDst += 4;\n    // can safely return 32 bit as 64 bit\n    return convert_le32toh(((uint32_t*)outs)[*it]);\n  } else if (graphVersion == 2) {\n    numBytesReadEdgeDst += 8;\n    return convert_le64toh(((uint64_t*)outs)[*it]);\n  } else {\n    GALOIS_DIE(\"unknown file version: \", graphVersion);\n  }\n\n  return -1;\n}\n\nFileGraph::node_id_iterator FileGraph::node_id_begin() const {\n  return boost::make_transform_iterator(&((uint32_t*)outs)[0], Convert32());\n}\n\nFileGraph::node_id_iterator FileGraph::node_id_end() const {\n  return boost::make_transform_iterator(&((uint32_t*)outs)[numEdges],\n                                        Convert32());\n}\n\nFileGraph::edge_id_iterator FileGraph::edge_id_begin() const {\n  return boost::make_transform_iterator(&outIdx[0], Convert64());\n}\n\nFileGraph::edge_id_iterator FileGraph::edge_id_end() const {\n  return boost::make_transform_iterator(&outIdx[numNodes], Convert64());\n}\n\nbool FileGraph::hasNeighbor(GraphNode N1, GraphNode N2) {\n  return getEdgeIdx(N1, N2) != ~static_cast<uint64_t>(0);\n}\n\nFileGraph::iterator FileGraph::begin() const { return iterator(nodeOffset); }\n\nFileGraph::iterator FileGraph::end() const {\n  return iterator(nodeOffset + numNodes);\n}\n\nvoid FileGraph::initNodeDegrees() {\n  if (!this->node_degrees.size()) {\n    // allocate memory\n    this->node_degrees.create(this->numNodes);\n    // loop over all nodes, calculate degrees\n    galois::do_all(\n        galois::iterate((uint64_t)0, this->numNodes),\n        [&](unsigned long n) {\n          // calculate and save degrees\n          if (n != 0) {\n            this->node_degrees.set(n, this->outIdx[n] - this->outIdx[n - 1]);\n          } else {\n            
this->node_degrees.set(n, this->outIdx[0]);\n          }\n        },\n        galois::loopname(\"FileGraphInitNodeDegrees\"), galois::no_stats());\n  }\n}\n\nuint64_t FileGraph::getDegree(uint32_t node_id) const {\n  // node_degrees array should be initialized\n  assert(this->node_degrees.size());\n  return this->node_degrees[node_id];\n}\n\nvoid FileGraphWriter::phase1() {\n  graphVersion = numNodes <= std::numeric_limits<uint32_t>::max() ? 1 : 2;\n\n  size_t bytes    = galois::graphs::rawBlockSize(numNodes, numEdges, sizeofEdge,\n                                              graphVersion);\n  char* mmap_base = reinterpret_cast<char*>(mmap(\n      nullptr, bytes, PROT_READ | PROT_WRITE, _MAP_ANON | MAP_PRIVATE, -1, 0));\n  if (mmap_base == MAP_FAILED)\n    GALOIS_SYS_DIE(\"failed allocating graph to write\");\n\n  mappings.push_back({mmap_base, bytes});\n\n  uint64_t* fptr = reinterpret_cast<uint64_t*>(mmap_base);\n  // set header info\n  *fptr++    = convert_htole64(graphVersion);\n  *fptr++    = convert_htole64(sizeofEdge);\n  *fptr++    = convert_htole64(numNodes);\n  *fptr++    = convert_htole64(numEdges);\n  nodeOffset = 0;\n  edgeOffset = 0;\n  outIdx     = fptr;\n\n  // move over to outgoing edge data and save it\n  fptr += numNodes;\n  outs = reinterpret_cast<void*>(fptr);\n\n  // skip memory differently depending on file version\n  edgeData = graphVersion == 1\n                 ? 
reinterpret_cast<char*>(reinterpret_cast<uint32_t*>(fptr) +\n                                           numEdges + numEdges % 2) // padding\n                 : reinterpret_cast<char*>(\n                       /*reinterpret_cast<uint64_t*>*/ (fptr) + numEdges);\n}\n\nvoid FileGraphWriter::phase2() {\n  if (numNodes == 0)\n    return;\n\n  // Turn counts into partial sums\n  uint64_t* prev = outIdx;\n  for (uint64_t *ii = outIdx + 1, *ei = outIdx + numNodes; ii != ei;\n       ++ii, ++prev) {\n    *ii += *prev;\n  }\n  assert(outIdx[numNodes - 1] == numEdges);\n\n  starts = std::make_unique<uint64_t[]>(numNodes);\n}\n\n} // namespace graphs\n} // namespace galois\n"
  },
  {
    "path": "libgalois/src/FileGraphParallel.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/substrate/HWTopo.h\"\n\n#include <mutex>\n#include <condition_variable>\n\nnamespace galois {\nnamespace graphs {\n\nvoid FileGraph::fromFileInterleaved(const std::string& filename,\n                                    size_t sizeofEdgeData) {\n  fromFile(filename);\n\n  std::mutex lock;\n  std::condition_variable cond;\n  auto& tp            = substrate::getThreadPool();\n  unsigned maxSockets = tp.getMaxSockets();\n  unsigned count      = maxSockets;\n\n  // Interleave across all NUMA nodes\n  tp.run(tp.getMaxThreads(), [&]() {\n    std::unique_lock<std::mutex> lk(lock);\n    if (substrate::ThreadPool::isLeader()) {\n      pageInByNode(substrate::ThreadPool::getSocket(), maxSockets,\n                   
sizeofEdgeData);\n      if (--count == 0)\n        cond.notify_all();\n    } else {\n      cond.wait(lk, [&]() { return count == 0; });\n    }\n  });\n}\n\n} // namespace graphs\n} // namespace galois\n"
  },
  {
    "path": "libgalois/src/GraphHelpers.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include <galois/graphs/GraphHelpers.h>\n\nnamespace galois {\nnamespace graphs {\nnamespace internal {\n\nuint32_t determine_block_division(uint32_t numDivisions,\n                                  std::vector<unsigned>& scaleFactor) {\n  uint32_t numBlocks = 0;\n\n  if (scaleFactor.empty()) {\n    // if scale factor isn't specified, everyone gets the same amount\n    numBlocks = numDivisions;\n\n    // scale factor holds a prefix sum of the scale factor\n    for (uint32_t i = 0; i < numDivisions; i++) {\n      scaleFactor.push_back(i + 1);\n    }\n  } else {\n    assert(scaleFactor.size() == numDivisions);\n    assert(numDivisions >= 1);\n\n    // get numDivisions number of blocks we need + save a prefix sum of the\n    // scale factor vector to scaleFactor\n    for (uint32_t i = 0; i < numDivisions; i++) {\n      numBlocks += scaleFactor[i];\n      scaleFactor[i] = numBlocks;\n    }\n  }\n\n  return numBlocks;\n}\n\n/**\n * Handles the trivial splits of [beginNode, endNode) among unitsToSplit units:\n * an empty range, a single unit, or more units than nodes. returnRanges is\n * indexed up to unitsToSplit, so it must already hold unitsToSplit + 1 entries.\n *\n * @returns true if a corner case applied and returnRanges was filled in, false\n * if the caller still has to compute the split\n */\nbool unitRangeCornerCaseHandle(uint32_t unitsToSplit, uint32_t beginNode,\n                               uint32_t endNode,\n                               std::vector<uint32_t>& returnRanges) {\n  uint32_t totalNodes = endNode - beginNode;\n\n  // check corner cases\n  // no nodes = assign nothing to all units\n  if (beginNode == endNode) {\n    returnRanges[0] = beginNode;\n\n    for (uint32_t i = 0; i < unitsToSplit; i++) {\n      returnRanges[i + 1] = beginNode;\n    }\n\n    return true;\n  }\n\n  // single unit case; 1 unit gets all\n  if (unitsToSplit == 1) {\n    returnRanges[0] = beginNode;\n    returnRanges[1] = endNode;\n    return true;\n    // more units than nodes\n  } else if (unitsToSplit > totalNodes) {\n    uint32_t current_node = beginNode;\n    returnRanges[0]       = current_node;\n    // 1 node for units until out of units\n    for (uint32_t i = 0; i < totalNodes; i++) {\n      returnRanges[i + 1] = ++current_node;\n    }\n    // remainder units get nothing: an empty range pinned at endNode (using\n    // totalNodes here would be wrong whenever beginNode != 0)\n    for (uint32_t i = totalNodes; i < unitsToSplit; i++) {\n      returnRanges[i + 1] = endNode;\n    }\n\n    return true;\n  }\n\n  return false;\n}\n\nvoid unitRangeSanity(\n    uint32_t GALOIS_USED_ONLY_IN_DEBUG(unitsToSplit),\n    uint32_t GALOIS_USED_ONLY_IN_DEBUG(beginNode),\n    uint32_t GALOIS_USED_ONLY_IN_DEBUG(endNode),\n    std::vector<uint32_t>& GALOIS_USED_ONLY_IN_DEBUG(returnRanges)) {\n#ifndef NDEBUG\n  // sanity checks\n  assert(returnRanges[0] == beginNode &&\n         \"return ranges begin not the begin node\");\n  assert(returnRanges[unitsToSplit] == endNode &&\n         \"return ranges end not end node\");\n\n  for (uint32_t i = 1; i < unitsToSplit; i++) {\n    assert(returnRanges[i] >= beginNode && returnRanges[i] <= endNode);\n    assert(returnRanges[i] >= returnRanges[i - 1]);\n  }\n#endif\n}\n\n} // namespace internal\n} // namespace graphs\n} // namespace galois\n"
  },
  {
    "path": "libgalois/src/HWTopo.cpp",
    "content": "#include \"galois/substrate/HWTopo.h\"\n\n#include <stdexcept>\n\nstd::vector<int> galois::substrate::parseCPUList(const std::string& line) {\n  std::vector<int> vals;\n\n  size_t current;\n  size_t next = -1;\n  try {\n    do {\n      current  = next + 1;\n      next     = line.find_first_of(',', current);\n      auto buf = line.substr(current, next - current);\n      if (!buf.empty()) {\n        size_t dash = buf.find_first_of('-', 0);\n        if (dash != std::string::npos) { // range\n          auto first  = buf.substr(0, dash);\n          auto second = buf.substr(dash + 1, std::string::npos);\n          unsigned b  = std::stoi(first.data());\n          unsigned e  = std::stoi(second.data());\n          while (b <= e) {\n            vals.push_back(b++);\n          }\n        } else { // singleton\n          vals.push_back(std::stoi(buf.data()));\n        }\n      }\n    } while (next != std::string::npos);\n  } catch (const std::invalid_argument&) {\n    return std::vector<int>{};\n  } catch (const std::out_of_range&) {\n    return std::vector<int>{};\n  }\n\n  return vals;\n}\n"
  },
  {
    "path": "libgalois/src/HWTopoDarwin.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/EnvCheck.h\"\n#include \"galois/substrate/HWTopo.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/gIO.h\"\n\n#include <mach/mach_interface.h>\n#include <mach/thread_policy.h>\n#include <sys/types.h>\n#include <sys/sysctl.h>\n#include <algorithm>\n#include <memory>\n#include <mutex>\n#include <pthread.h>\n\nusing namespace galois::substrate;\n\nnamespace {\n\nint getIntValue(const char* name) {\n  int value;\n  size_t len = sizeof(value);\n\n  if (sysctlbyname(name, &value, &len, nullptr, 0) == -1) {\n    GALOIS_SYS_DIE(\"could not get sysctl value for \", name, \": \",\n                   strerror(errno));\n  }\n\n  return value;\n}\n\nHWTopoInfo makeHWTopo() {\n  MachineTopoInfo mti;\n  mti.maxSockets   = getIntValue(\"hw.packages\");\n  mti.maxThreads   = getIntValue(\"hw.logicalcpu_max\");\n  mti.maxCores     = getIntValue(\"hw.physicalcpu_max\");\n  mti.maxNumaNodes = mti.maxSockets;\n\n  std::vector<ThreadTopoInfo> tti;\n  tti.reserve(mti.maxThreads);\n\n  // Darwin doesn't expose more fine-grained topology information,\n  // so assume a dense configuration:\n  // thread 0 +\n  //          |- core 0 +\n  // thread 1 +         |\n  //                    |- socket 0\n  // thread 2 +         |\n  //          |- core 1 +\n  // thread 3 +\n\n  // ceiling of maxThreads / maxSockets\n  const unsigned threadsPerSocket =\n      (mti.maxThreads + mti.maxSockets - 1) / mti.maxSockets;\n\n  // Describe dense configuration first; then, sort logical threads to the\n  // back.\n  for (unsigned i = 0; i < mti.maxThreads; ++i) {\n    unsigned socket = i / threadsPerSocket;\n    unsigned leader = socket * threadsPerSocket;\n    tti.push_back(ThreadTopoInfo{\n        .socketLeader = leader,\n        .socket       = socket,\n        .numaNode     = socket,\n        .osContext    = i,\n        .osNumaNode   = socket,\n    });\n  }\n\n  // ceiling of maxThreads / maxCores\n  const unsigned logicalPerPhysical =\n      (mti.maxThreads + mti.maxCores - 1) / mti.maxCores;\n\n  std::sort(tti.begin(), tti.end(),\n            [&](const ThreadTopoInfo& a, const ThreadTopoInfo& b) {\n              int smtA = a.osContext % logicalPerPhysical;\n              int smtB = b.osContext % logicalPerPhysical;\n              if (smtA == smtB) {\n                return a.osContext < b.osContext;\n              }\n              return smtA < smtB;\n            });\n\n  for (unsigned i = 0, m = 0; i < mti.maxThreads; ++i) {\n    m                          = std::max(m, tti[i].socket);\n    tti[i].tid                 = i;\n    tti[i].cumulativeMaxSocket = m;\n  }\n\n  return {\n      .machineTopoInfo = mti,\n      .threadTopoInfo  = tti,\n  };\n}\n\n} // namespace\n\n//! binds current thread to OS HW context \"proc\"\nbool galois::substrate::bindThreadSelf(unsigned osContext) {\n  pthread_t thread              = pthread_self();\n  thread_affinity_policy policy = {int(osContext)};\n  thread_t machThread           = pthread_mach_thread_np(thread);\n  if (thread_policy_set(machThread, THREAD_AFFINITY_POLICY,\n                        thread_policy_t(&policy),\n                        THREAD_AFFINITY_POLICY_COUNT)) {\n    galois::gWarn(\"Could not set CPU affinity to \", osContext, \" (\",\n                  strerror(errno), \")\");\n    return false;\n  }\n\n  return true;\n}\n\nHWTopoInfo galois::substrate::getHWTopo() {\n  static SimpleLock lock;\n  static std::unique_ptr<HWTopoInfo> data;\n\n  std::lock_guard<SimpleLock> guard(lock);\n  if (!data) {\n    data = std::make_unique<HWTopoInfo>(makeHWTopo());\n  }\n  return *data;\n}\n"
  },
  {
    "path": "libgalois/src/HWTopoLinux.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/HWTopo.h\"\n#include \"galois/substrate/EnvCheck.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/gIO.h\"\n\n#include <algorithm>\n#include <array>\n#include <cassert>\n#include <cerrno>\n#include <cstring>\n#include <fstream>\n#include <functional>\n#include <memory>\n#include <mutex>\n#include <set>\n\n#ifdef GALOIS_USE_NUMA\n#include <numa.h>\n#include <numaif.h>\n#endif\n\n#ifdef GALOIS_USE_SCHED_SETAFFINITY\n#include <sched.h>\n#endif\n\nnamespace {\n\nstruct cpuinfo {\n  // fields filled in from OS files\n  unsigned proc;\n  unsigned physid;\n  unsigned sib;\n  unsigned coreid;\n  unsigned cpucores;\n  unsigned numaNode; // from libnuma\n  bool valid;        // from cpuset\n  bool smt;          // computed\n};\n\nbool operator<(const cpuinfo& lhs, const cpuinfo& rhs) 
{\n  if (lhs.smt != rhs.smt)\n    return lhs.smt < rhs.smt;\n  if (lhs.physid != rhs.physid)\n    return lhs.physid < rhs.physid;\n  if (lhs.coreid != rhs.coreid)\n    return lhs.coreid < rhs.coreid;\n  return lhs.proc < rhs.proc;\n}\n\nunsigned getNumaNode(cpuinfo& c) {\n  static bool warnOnce = false;\n#ifdef GALOIS_USE_NUMA\n  static bool numaAvail = false;\n\n  if (!warnOnce) {\n    warnOnce  = true;\n    numaAvail = numa_available() >= 0;\n    numaAvail = numaAvail && numa_num_configured_nodes() > 0;\n    if (!numaAvail)\n      galois::gWarn(\"Numa support configured but not present at runtime.  \"\n                    \"Assuming numa topology matches socket topology.\");\n  }\n\n  if (!numaAvail)\n    return c.physid;\n  int i = numa_node_of_cpu(c.proc);\n  if (i < 0)\n    GALOIS_SYS_DIE(\"failed finding numa node for \", c.proc);\n  return i;\n#else\n  if (!warnOnce) {\n    warnOnce = true;\n    galois::gWarn(\"Numa Support Not configured (install libnuma-dev).  \"\n                  \"Assuming numa topology matches socket topology.\");\n  }\n  return c.physid;\n#endif\n}\n\n//! 
Parse /proc/cpuinfo\nstd::vector<cpuinfo> parseCPUInfo() {\n  std::vector<cpuinfo> vals;\n\n  const int len = 1024;\n  std::array<char, len> line;\n\n  std::ifstream procInfo(\"/proc/cpuinfo\");\n  if (!procInfo)\n    GALOIS_SYS_DIE(\"failed opening /proc/cpuinfo\");\n\n  int cur = -1;\n\n  while (true) {\n    procInfo.getline(line.data(), len);\n    if (!procInfo)\n      break;\n\n    int num;\n    if (sscanf(line.data(), \"processor : %d\", &num) == 1) {\n      assert(cur < num);\n      cur = num;\n      vals.resize(cur + 1);\n      vals.at(cur).proc = num;\n    } else if (sscanf(line.data(), \"physical id : %d\", &num) == 1) {\n      vals.at(cur).physid = num;\n    } else if (sscanf(line.data(), \"siblings : %d\", &num) == 1) {\n      vals.at(cur).sib = num;\n    } else if (sscanf(line.data(), \"core id : %d\", &num) == 1) {\n      vals.at(cur).coreid = num;\n    } else if (sscanf(line.data(), \"cpu cores : %d\", &num) == 1) {\n      vals.at(cur).cpucores = num;\n    }\n  }\n\n  for (auto& c : vals)\n    c.numaNode = getNumaNode(c);\n\n  return vals;\n}\n\nunsigned countSockets(const std::vector<cpuinfo>& info) {\n  std::set<unsigned> pkgs;\n  for (auto& c : info)\n    pkgs.insert(c.physid);\n  return pkgs.size();\n}\n\nunsigned countCores(const std::vector<cpuinfo>& info) {\n  std::set<std::pair<int, int>> cores;\n  for (auto& c : info)\n    cores.insert(std::make_pair(c.physid, c.coreid));\n  return cores.size();\n}\n\nunsigned countNumaNodes(const std::vector<cpuinfo>& info) {\n  std::set<unsigned> nodes;\n  for (auto& c : info)\n    nodes.insert(c.numaNode);\n  return nodes.size();\n}\n\nvoid markSMT(std::vector<cpuinfo>& info) {\n  for (unsigned int i = 1; i < info.size(); ++i)\n    if (info[i - 1].physid == info[i].physid &&\n        info[i - 1].coreid == info[i].coreid)\n      info[i].smt = true;\n    else\n      info[i].smt = false;\n}\n\nstd::vector<int> parseCPUSet() {\n  std::vector<int> vals;\n\n  std::ifstream data(\"/proc/self/status\");\n\n  if 
(!data) {\n    return vals;\n  }\n\n  std::string line;\n  std::string prefix(\"Cpus_allowed_list:\");\n  bool found = false;\n  while (true) {\n    std::getline(data, line);\n    if (!data) {\n      return vals;\n    }\n\n    if (line.compare(0, prefix.size(), prefix) == 0) {\n      found = true;\n      break;\n    }\n  }\n\n  if (!found) {\n    return vals;\n  }\n\n  line = line.substr(prefix.size());\n\n  return galois::substrate::parseCPUList(line);\n}\n\nvoid markValid(std::vector<cpuinfo>& info) {\n  auto v = parseCPUSet();\n  if (v.empty()) {\n    for (auto& c : info)\n      c.valid = true;\n  } else {\n    std::sort(v.begin(), v.end());\n    for (auto& c : info)\n      c.valid = std::binary_search(v.begin(), v.end(), c.proc);\n  }\n}\n\ngalois::substrate::HWTopoInfo makeHWTopo() {\n  galois::substrate::MachineTopoInfo retMTI;\n\n  auto info = parseCPUInfo();\n  std::sort(info.begin(), info.end());\n  markSMT(info);\n  markValid(info);\n\n  info.erase(std::partition(info.begin(), info.end(),\n                            [](const cpuinfo& c) { return c.valid; }),\n             info.end());\n\n  std::sort(info.begin(), info.end());\n  markSMT(info);\n  retMTI.maxSockets   = countSockets(info);\n  retMTI.maxThreads   = info.size();\n  retMTI.maxCores     = countCores(info);\n  retMTI.maxNumaNodes = countNumaNodes(info);\n\n  std::vector<galois::substrate::ThreadTopoInfo> retTTI;\n  retTTI.reserve(retMTI.maxThreads);\n  // compute renumberings\n  std::set<unsigned> sockets;\n  std::set<unsigned> numaNodes;\n  for (auto& i : info) {\n    sockets.insert(i.physid);\n    numaNodes.insert(i.numaNode);\n  }\n  unsigned mid = 0; // max socket id\n  for (unsigned i = 0; i < info.size(); ++i) {\n    unsigned pid = info[i].physid;\n    unsigned repid =\n        std::distance(sockets.begin(), sockets.find(info[i].physid));\n    mid             = std::max(mid, repid);\n    unsigned leader = std::distance(\n        info.begin(),\n        std::find_if(info.begin(), 
info.end(),\n                     [pid](const cpuinfo& c) { return c.physid == pid; }));\n    retTTI.push_back(galois::substrate::ThreadTopoInfo{\n        i, leader, repid,\n        (unsigned)std::distance(numaNodes.begin(),\n                                numaNodes.find(info[i].numaNode)),\n        mid, info[i].proc, info[i].numaNode});\n  }\n\n  return {\n      .machineTopoInfo = retMTI,\n      .threadTopoInfo  = retTTI,\n  };\n}\n\n} // namespace\n\ngalois::substrate::HWTopoInfo galois::substrate::getHWTopo() {\n  static SimpleLock lock;\n  static std::unique_ptr<HWTopoInfo> data;\n\n  std::lock_guard<SimpleLock> guard(lock);\n  if (!data) {\n    data = std::make_unique<HWTopoInfo>(makeHWTopo());\n  }\n  return *data;\n}\n\n//! binds current thread to OS HW context \"proc\"\nbool galois::substrate::bindThreadSelf(unsigned osContext) {\n#ifdef GALOIS_USE_SCHED_SETAFFINITY\n  cpu_set_t mask;\n  /* CPU_ZERO initializes all the bits in the mask to zero. */\n  CPU_ZERO(&mask);\n\n  /* CPU_SET sets only the bit corresponding to cpu. */\n  // void to cancel unused result warning\n  (void)CPU_SET(osContext, &mask);\n\n  /* sched_setaffinity returns 0 in success */\n  if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {\n    galois::gWarn(\"Could not set CPU affinity to \", osContext, \"(\",\n                  strerror(errno), \")\");\n    return false;\n  }\n  return true;\n#else\n  galois::gWarn(\n      \"Cannot set cpu affinity on this platform.  Performance will be bad.\");\n  return false;\n#endif\n}\n"
  },
  {
    "path": "libgalois/src/Mem.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/Mem.h\"\n\n#include <map>\n#include <mutex>\n\nusing namespace galois::runtime;\n\n// Anchor the class\nSystemHeap::SystemHeap() { assert(AllocSize == runtime::pagePoolSize()); }\n\nSystemHeap::~SystemHeap() {}\n\n#ifndef GALOIS_FORCE_STANDALONE\nthread_local SizedHeapFactory::HeapMap* SizedHeapFactory::localHeaps = 0;\n\nSizedHeapFactory::SizedHeap*\nSizedHeapFactory::getHeapForSize(const size_t size) {\n  if (size == 0)\n    return nullptr;\n  return Base::getInstance()->getHeap(size);\n}\n\nSizedHeapFactory::SizedHeap* SizedHeapFactory::getHeap(const size_t size) {\n  typedef SizedHeapFactory::HeapMap HeapMap;\n\n  if (!localHeaps) {\n    std::lock_guard<galois::substrate::SimpleLock> ll(lock);\n    localHeaps = new HeapMap;\n    allLocalHeaps.push_front(localHeaps);\n  }\n\n  auto& lentry 
= (*localHeaps)[size];\n  if (lentry)\n    return lentry;\n\n  {\n    std::lock_guard<galois::substrate::SimpleLock> ll(lock);\n    auto& gentry = heaps[size];\n    if (!gentry)\n      gentry = new SizedHeap();\n    lentry = gentry;\n    return lentry;\n  }\n}\n\nPow_2_BlockHeap::Pow_2_BlockHeap(void) noexcept : heapTable() {\n  populateTable();\n}\n\nSizedHeapFactory::SizedHeapFactory() : lock() {}\n\nSizedHeapFactory::~SizedHeapFactory() {\n  // TODO destructor ordering problem: there may be pointers to deleted\n  // SizedHeap when this Factory is destroyed before dependent\n  // FixedSizeHeaps.\n  for (auto entry : heaps)\n    delete entry.second;\n  for (auto mptr : allLocalHeaps)\n    delete mptr;\n}\n#endif\n"
  },
  {
    "path": "libgalois/src/NumaMem.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/NumaMem.h\"\n#include \"galois/substrate/PageAlloc.h\"\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/gIO.h\"\n\n#include <cassert>\n\nusing namespace galois::substrate;\n\n/* Access pages on each thread so each thread has some pages already loaded\n * (preferably ones it will use) */\nstatic void pageIn(void* _ptr, size_t len, size_t pageSize, unsigned numThreads,\n                   bool finegrained) {\n  char* ptr = static_cast<char*>(_ptr);\n\n  if (numThreads == 1) {\n    for (size_t x = 0; x < len; x += pageSize / 2)\n      ptr[x] = 0;\n  } else {\n    getThreadPool().run(numThreads, [ptr, len, pageSize, numThreads,\n                                     finegrained]() {\n      auto myID = ThreadPool::getTID();\n\n      if (finegrained) {\n        // round robin page 
distribution among threads (e.g. thread 0 gets\n        // a page, then thread 1, then thread n, then back to thread 0 and\n        // so on until the end of the region)\n        for (size_t x = pageSize * myID; x < len; x += pageSize * numThreads)\n          ptr[x] = 0;\n      } else {\n        // sectioned page distribution (e.g. thread 0 gets first chunk, thread\n        // 1 gets next chunk, ... last thread gets last chunk)\n        for (size_t x = myID * len / numThreads;\n             x < len && x < (myID + 1) * len / numThreads; x += pageSize)\n          ptr[x] = 0;\n      }\n    });\n  }\n}\n\n/**\n * Causes each thread to page in a specified region of the provided memory\n * based on some distribution of elements as specified by a provided array.\n *\n * @tparam RangeArrayTy Type of threadRanges array: should either be uint32_t*\n * or uint64_t*\n * @param _ptr Pointer to the memory to page in\n * @param len Length of the memory passed in\n * @param pageSize Size of a page\n * @param numThreads Number of threads to split work amongst\n * @param threadRanges Array that specifies distribution of elements among\n * threads\n * @param elementSize Size of an element that is to be distributed among\n * threads\n */\ntemplate <typename RangeArrayTy>\nstatic void pageInSpecified(void* _ptr, size_t len, size_t pageSize,\n                            unsigned numThreads, RangeArrayTy threadRanges,\n                            size_t elementSize) {\n  assert(numThreads > 0);\n  assert(elementSize > 0);\n\n  char* ptr = static_cast<char*>(_ptr);\n\n  if (numThreads > 1) {\n    getThreadPool().run(\n        numThreads, [ptr, pageSize, threadRanges, elementSize]() {\n          auto myID = ThreadPool::getTID();\n\n          uint64_t beginLocation = threadRanges[myID];\n          uint64_t endLocation   = threadRanges[myID + 1];\n\n          assert(beginLocation <= endLocation);\n\n          // printf(\"[%u] begin location %u and end location\n          // %u\\n\", myID,\n  
        //       beginLocation, endLocation);\n\n          // if equal, then no memory needed to allocate in\n          // first place\n          if (beginLocation != endLocation) {\n            size_t beginByte = beginLocation * elementSize;\n            // -1 since endLocation * elementSize will result in the first\n            // byte of the next element.\n            size_t endByte = endLocation ? (endLocation * elementSize) - 1 : 0;\n\n            assert(beginByte <= endByte);\n\n            // memset(ptr + beginByte, 0, (endByte - beginByte +\n            // 1));\n\n            uint32_t beginPage = beginByte / pageSize;\n            uint32_t endPage   = endByte / pageSize;\n\n            assert(beginPage <= endPage);\n\n            // printf(\"thread %u gets begin page %u and end page\n            // %u\\n\", myID,\n            //        beginPage, endPage);\n\n            // write a byte to every page this thread occupies\n            for (uint32_t i = beginPage; i <= endPage; i++) {\n              ptr[i * pageSize] = 0;\n            }\n          }\n        });\n  } else {\n    // 1 thread case\n    for (size_t x = 0; x < len; x += pageSize / 2)\n      ptr[x] = 0;\n  }\n}\n\nstatic void largeFree(void* ptr, size_t bytes) {\n  freePages(ptr, bytes / allocSize());\n}\n\nvoid galois::substrate::internal::largeFreer::operator()(void* ptr) const {\n  largeFree(ptr, bytes);\n}\n\n// round data to a multiple of mult\nstatic size_t roundup(size_t data, size_t mult) {\n  auto rem = data % mult;\n\n  if (!rem)\n    return data;\n  return data + (mult - rem);\n}\n\nLAptr galois::substrate::largeMallocInterleaved(size_t bytes,\n                                                unsigned numThreads) {\n  // round up to hugePageSize\n  bytes = roundup(bytes, allocSize());\n\n#ifdef GALOIS_USE_NUMA\n  // We don't use numa_alloc_interleaved_subset because we really want huge\n  // pages\n  // yes this is a comment in a ifdef, but if libnuma improves, this is where\n  // the 
alloc would go\n#endif\n  // Get a non-prefaulted allocation\n  void* data = allocPages(bytes / allocSize(), false);\n\n  // Then page in based on thread number\n  if (data)\n    // true = round robin paging\n    pageIn(data, bytes, allocSize(), numThreads, true);\n\n  return LAptr{data, internal::largeFreer{bytes}};\n}\n\nLAptr galois::substrate::largeMallocLocal(size_t bytes) {\n  // round up to hugePageSize\n  bytes = roundup(bytes, allocSize());\n  // Get a prefaulted allocation\n  return LAptr{allocPages(bytes / allocSize(), true),\n               internal::largeFreer{bytes}};\n}\n\nLAptr galois::substrate::largeMallocFloating(size_t bytes) {\n  // round up to hugePageSize\n  bytes = roundup(bytes, allocSize());\n  // Get a non-prefaulted allocation\n  return LAptr{allocPages(bytes / allocSize(), false),\n               internal::largeFreer{bytes}};\n}\n\nLAptr galois::substrate::largeMallocBlocked(size_t bytes, unsigned numThreads) {\n  // round up to hugePageSize\n  bytes = roundup(bytes, allocSize());\n  // Get a non-prefaulted allocation\n  void* data = allocPages(bytes / allocSize(), false);\n  if (data)\n    // false = blocked paging\n    pageIn(data, bytes, allocSize(), numThreads, false);\n  return LAptr{data, internal::largeFreer{bytes}};\n}\n\n/**\n * Allocates pages for some specified number of bytes, then does NUMA page\n * faulting based on a specified distribution of elements among threads.\n *\n * @tparam RangeArrayTy Type of threadRanges array: should either be uint32_t*\n * or uint64_t*\n * @param bytes Number of bytes to allocate\n * @param numThreads Number of threads to page in regions for\n * @param threadRanges Array specifying distribution of elements among threads\n * @param elementSize Size of a data element that will be stored in the\n * allocated memory\n * @returns The allocated memory along with a freer object\n */\ntemplate <typename RangeArrayTy>\nLAptr galois::substrate::largeMallocSpecified(size_t bytes, uint32_t numThreads,\n  
                                            RangeArrayTy& threadRanges,\n                                              size_t elementSize) {\n  // ceiling to nearest page\n  bytes = roundup(bytes, allocSize());\n\n  void* data = allocPages(bytes / allocSize(), false);\n\n  // NUMA aware page in based on element distribution specified in threadRanges\n  if (data)\n    pageInSpecified(data, bytes, allocSize(), numThreads, threadRanges,\n                    elementSize);\n\n  return LAptr{data, internal::largeFreer{bytes}};\n}\n// Explicit template instantiations: the template is defined in this file\n// rather than in the .h file, so each supported type is instantiated here\ntemplate LAptr galois::substrate::largeMallocSpecified<std::vector<uint32_t>>(\n    size_t bytes, uint32_t numThreads, std::vector<uint32_t>& threadRanges,\n    size_t elementSize);\ntemplate LAptr galois::substrate::largeMallocSpecified<std::vector<uint64_t>>(\n    size_t bytes, uint32_t numThreads, std::vector<uint64_t>& threadRanges,\n    size_t elementSize);\n"
  },
  {
    "path": "libgalois/src/OCFileGraph.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/graphs/OCGraph.h\"\n#include \"galois/runtime/Mem.h\"\n#include \"galois/gIO.h\"\n\n#include <cassert>\n\n#include <fcntl.h>\n#include <sys/stat.h>\n#include <sys/types.h>\n\nusing namespace galois::graphs;\n\n// File format V1:\n// version (1) {uint64_t LE}\n// EdgeType size {uint64_t LE}\n// numNodes {uint64_t LE}\n// numEdges {uint64_t LE}\n// outindexs[numNodes] {uint64_t LE} (outindex[nodeid] is index of first edge\n// for nodeid + 1 (end interator.  
node 0 has an implicit start iterator of 0.\n// outedges[numEdges] {uint32_t LE}\n// potential padding (32bit max) to Re-Align to 64bits\n// EdgeType[numEdges] {EdgeType size}\n\nOCFileGraph::~OCFileGraph() {\n  if (masterMapping)\n    munmap(masterMapping, masterLength);\n  if (masterFD != -1)\n    close(masterFD);\n}\n\nvoid OCFileGraph::Block::unload() {\n  if (!m_mapping)\n    return;\n\n  if (munmap(m_mapping, m_length) != 0) {\n    GALOIS_SYS_DIE(\"failed unallocating\");\n  }\n  m_mapping = 0;\n}\n\nvoid OCFileGraph::Block::load(int fd, offset_t offset, size_t begin, size_t len,\n                              size_t sizeof_data) {\n  assert(m_mapping == 0);\n\n  offset_t start = offset + begin * sizeof_data;\n  offset_t aligned =\n      start & ~static_cast<offset_t>(galois::runtime::pagePoolSize() - 1);\n\n  int _MAP_BASE = MAP_PRIVATE;\n#ifdef MAP_POPULATE\n  _MAP_BASE |= MAP_POPULATE;\n#endif\n  m_length =\n      len * sizeof_data +\n      galois::runtime::pagePoolSize(); // account for round off due to alignment\n  m_mapping = mmap(nullptr, m_length, PROT_READ, _MAP_BASE, fd, aligned);\n  if (m_mapping == MAP_FAILED) {\n    GALOIS_SYS_DIE(\"failed allocating \", fd);\n  }\n\n  m_data = reinterpret_cast<char*>(m_mapping);\n  assert(aligned <= start);\n  assert(start - aligned <=\n         static_cast<offset_t>(galois::runtime::pagePoolSize()));\n  m_data += start - aligned;\n  m_begin       = begin;\n  m_sizeof_data = sizeof_data;\n}\n\nvoid OCFileGraph::load(segment_type& s, edge_iterator begin, edge_iterator end,\n                       size_t sizeof_data) {\n  size_t bb  = *begin;\n  size_t len = *end - *begin;\n\n  offset_t outs = (4 + numNodes) * sizeof(uint64_t);\n  offset_t data = outs + (numEdges + (numEdges & 1)) * sizeof(uint32_t);\n\n  s.outs.load(masterFD, outs, bb, len, sizeof(uint32_t));\n  if (sizeof_data)\n    s.edgeData.load(masterFD, data, bb, len, sizeof_data);\n\n  s.loaded = true;\n}\n\nstatic void readHeader(int fd, uint64_t& 
numNodes, uint64_t& numEdges) {\n  void* m = mmap(0, 4 * sizeof(uint64_t), PROT_READ, MAP_PRIVATE, fd, 0);\n  if (m == MAP_FAILED) {\n    GALOIS_SYS_DIE(\"failed reading \", fd);\n  }\n\n  uint64_t* ptr = reinterpret_cast<uint64_t*>(m);\n  assert(ptr[0] == 1);\n  numNodes = ptr[2];\n  numEdges = ptr[3];\n\n  if (munmap(m, 4 * sizeof(uint64_t))) {\n    GALOIS_SYS_DIE(\"failed reading \", fd);\n  }\n}\n\nvoid OCFileGraph::fromFile(const std::string& filename) {\n  masterFD = open(filename.c_str(), O_RDONLY);\n  if (masterFD == -1) {\n    GALOIS_SYS_DIE(\"failed opening \", filename);\n  }\n\n  readHeader(masterFD, numNodes, numEdges);\n  masterLength  = 4 * sizeof(uint64_t) + numNodes * sizeof(uint64_t);\n  int _MAP_BASE = MAP_PRIVATE;\n#ifdef MAP_POPULATE\n  _MAP_BASE |= MAP_POPULATE;\n#endif\n  masterMapping = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0);\n  if (masterMapping == MAP_FAILED) {\n    GALOIS_SYS_DIE(\"failed reading \", filename);\n  }\n\n  outIdx = reinterpret_cast<uint64_t*>(masterMapping);\n  outIdx += 4;\n}\n"
  },
  {
    "path": "libgalois/src/PageAlloc.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/PageAlloc.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/gIO.h\"\n\n#include <mutex>\n\n// figure this out dynamically\nconst size_t hugePageSize = 2 * 1024 * 1024;\n// protect mmap, munmap since linux has issues\nstatic galois::substrate::SimpleLock allocLock;\n\nstatic void* trymmap(size_t size, int flag) {\n  std::lock_guard<galois::substrate::SimpleLock> lg(allocLock);\n  const int _PROT = PROT_READ | PROT_WRITE;\n  void* ptr       = galois::mmap(0, size, _PROT, flag, -1, 0);\n  if (ptr == MAP_FAILED)\n    ptr = nullptr;\n  return ptr;\n}\n\nstatic const int _MAP = _MAP_ANON | MAP_PRIVATE;\n#ifdef MAP_POPULATE\nstatic const int _MAP_POP   = MAP_POPULATE | _MAP;\nstatic const bool doHandMap = false;\n#else\nstatic const int _MAP_POP      = _MAP;\nstatic const bool 
doHandMap    = true;\n#endif\n#ifdef MAP_HUGETLB\nstatic const int _MAP_HUGE_POP = MAP_HUGETLB | _MAP_POP;\nstatic const int _MAP_HUGE     = MAP_HUGETLB | _MAP;\n#else\nstatic const int _MAP_HUGE_POP = _MAP_POP;\nstatic const int _MAP_HUGE     = _MAP;\n#endif\n\nsize_t galois::substrate::allocSize() { return hugePageSize; }\n\nvoid* galois::substrate::allocPages(unsigned num, bool preFault) {\n  if (num > 0) {\n    void* ptr =\n        trymmap(num * hugePageSize, preFault ? _MAP_HUGE_POP : _MAP_HUGE);\n    if (!ptr) {\n      gDebug(\"Huge page alloc failed, falling back\");\n      ptr = trymmap(num * hugePageSize, preFault ? _MAP_POP : _MAP);\n    }\n\n    if (!ptr)\n      GALOIS_SYS_DIE(\"Out of Memory\");\n\n    if (preFault && doHandMap)\n      for (size_t x = 0; x < num * hugePageSize; x += 4096)\n        static_cast<char*>(ptr)[x] = 0;\n\n    return ptr;\n  } else {\n    return nullptr;\n  }\n}\n\nvoid galois::substrate::freePages(void* ptr, unsigned num) {\n  std::lock_guard<SimpleLock> lg(allocLock);\n  if (munmap(ptr, num * hugePageSize) != 0)\n    GALOIS_SYS_DIE(\"Unmap failed\");\n}\n\n/*\n\nclass PageSizeConf {\n#ifdef MAP_HUGETLB\n  void checkHuge() {\n    std::ifstream f(\"/proc/meminfo\");\n\n    if (!f)\n      return;\n\n    char line[2048];\n    size_t hugePageSizeKb = 0;\n    while (f.getline(line, sizeof(line)/sizeof(*line))) {\n      if (strstr(line, \"Hugepagesize:\") != line)\n        continue;\n      std::stringstream ss(line + strlen(\"Hugepagesize:\"));\n      std::string kb;\n      ss >> hugePageSizeKb >> kb;\n      if (kb != \"kB\")\n        galois::substrate::gWarn(\"error parsing meminfo\");\n      break;\n    }\n    if (hugePageSizeKb * 1024 != galois::runtime::hugePageSize)\n      galois::substrate::gWarn(\"System HugePageSize does not match compiled\nHugePageSize\");\n  }\n#else\n  void checkHuge() { }\n#endif\n\npublic:\n  PageSizeConf() {\n#ifdef _POSIX_PAGESIZE\n    galois::runtime::pageSize = _POSIX_PAGESIZE;\n#else\n    
galois::runtime::pageSize = sysconf(_SC_PAGESIZE);\n#endif\n    checkHuge();\n  }\n};\n*/\n"
  },
  {
    "path": "libgalois/src/PagePool.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#define __is_trivial(type)                                                     \\\n  __has_trivial_constructor(type) && __has_trivial_copy(type)\n\n#include \"galois/runtime/PagePool.h\"\n\nusing namespace galois::runtime;\n\nstatic galois::runtime::internal::PageAllocState<>* PA;\n\nvoid galois::runtime::internal::setPagePoolState(PageAllocState<>* pa) {\n  GALOIS_ASSERT(!(PA && pa),\n                \"PagePool.cpp: Double Initialization of PageAllocState\");\n  PA = pa;\n}\n\nint galois::runtime::numPagePoolAllocTotal() { return PA->countAll(); }\n\nint galois::runtime::numPagePoolAllocForThread(unsigned tid) {\n  return PA->count(tid);\n}\n\nvoid* galois::runtime::pagePoolAlloc() { return PA->pageAlloc(); }\n\nvoid galois::runtime::pagePoolPreAlloc(unsigned num) {\n  while (num--)\n    
PA->pagePreAlloc();\n}\n\nvoid galois::runtime::pagePoolFree(void* ptr) { PA->pageFree(ptr); }\n\nsize_t galois::runtime::pagePoolSize() { return substrate::allocSize(); }\n"
  },
  {
    "path": "libgalois/src/ParaMeter.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/Executor_ParaMeter.h\"\n#include \"galois/gIO.h\"\n#include \"galois/substrate/EnvCheck.h\"\n\n#include <ctime>\n\nstruct StatsFileManager {\n\n  constexpr static const char* const PARAM_FILE_ENV_VAR =\n      \"GALOIS_PARAMETER_OUTFILE\";\n\n  bool init     = false;\n  bool isOpen   = false;\n  FILE* statsFH = nullptr;\n  // char statsFileName[FNAME_SIZE];\n  std::string statsFileName;\n\n  ~StatsFileManager(void) { close(); }\n\n  static void getTimeStampedName(std::string& statsFileName) {\n\n    constexpr unsigned FNAME_SIZE = 256;\n    char buf[FNAME_SIZE];\n\n    time_t rawtime;\n    struct tm* timeinfo;\n\n    time(&rawtime);\n    timeinfo = localtime(&rawtime);\n\n    strftime(buf, FNAME_SIZE, \"ParaMeter-Stats-%Y-%m-%d--%H-%M-%S.csv\",\n             timeinfo);\n    statsFileName = 
buf;\n  }\n\n  FILE* get(void) {\n    if (!init) {\n      init = true;\n\n      if (!galois::substrate::EnvCheck(PARAM_FILE_ENV_VAR, statsFileName)) {\n        // statsFileName = \"ParaMeter-Stats.csv\";\n        getTimeStampedName(statsFileName);\n      }\n\n      statsFH = fopen(statsFileName.c_str(), \"w\");\n      GALOIS_ASSERT(statsFH != nullptr, \"ParaMeter stats file error\");\n\n      galois::runtime::ParaMeter::StepStatsBase::printHeader(statsFH);\n\n      fclose(statsFH);\n    }\n\n    if (!isOpen) {\n      statsFH = fopen(statsFileName.c_str(), \"a\"); // open in append mode\n      GALOIS_ASSERT(statsFH != nullptr, \"ParaMeter stats file error\");\n\n      isOpen = true;\n    }\n\n    return statsFH;\n  }\n\n  void close(void) {\n    if (isOpen) {\n      fclose(statsFH);\n      isOpen  = false;\n      statsFH = nullptr;\n    }\n  }\n};\n\nstatic StatsFileManager& getStatsFileManager(void) {\n  static StatsFileManager s;\n  return s;\n}\n\nFILE* galois::runtime::ParaMeter::getStatsFile(void) {\n  return getStatsFileManager().get();\n}\n\nvoid galois::runtime::ParaMeter::closeStatsFile(void) {\n  getStatsFileManager().close();\n}\n"
  },
  {
    "path": "libgalois/src/PerThreadStorage.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/PageAlloc.h\"\n\n#include \"galois/gIO.h\"\n#include <atomic>\n#include <mutex>\n\nthread_local char* galois::substrate::ptsBase;\n\ngalois::substrate::PerBackend& galois::substrate::getPTSBackend() {\n  static galois::substrate::PerBackend b;\n  return b;\n}\n\nthread_local char* galois::substrate::pssBase;\n\ngalois::substrate::PerBackend& galois::substrate::getPPSBackend() {\n  static galois::substrate::PerBackend b;\n  return b;\n}\n\nconst size_t ptAllocSize = galois::substrate::allocSize();\ninline void* alloc() {\n  // alloc a single page, don't prefault\n  void* toReturn = galois::substrate::allocPages(1, true);\n  if (toReturn == nullptr) {\n    GALOIS_DIE(\"per-thread storage out of memory\");\n  }\n  return 
toReturn;\n}\n\nconstexpr unsigned MAX_SIZE = 30;\n// PerBackend storage is typically cache-aligned. Simplify bookkeeping at the\n// expense of fragmentation by restricting all allocations to be cache-aligned.\nconstexpr unsigned MIN_SIZE = 7;\n\nstatic_assert((1 << MIN_SIZE) == galois::substrate::GALOIS_CACHE_LINE_SIZE);\n\ngalois::substrate::PerBackend::PerBackend() { freeOffsets.resize(MAX_SIZE); }\n\nunsigned galois::substrate::PerBackend::nextLog2(unsigned size) {\n  unsigned i = MIN_SIZE;\n  while ((1U << i) < size) {\n    ++i;\n  }\n  if (i >= MAX_SIZE) {\n    abort();\n  }\n  return i;\n}\n\nunsigned galois::substrate::PerBackend::allocOffset(const unsigned sz) {\n  unsigned ll   = nextLog2(sz);\n  unsigned size = (1 << ll);\n\n  if (nextLoc.load(std::memory_order_relaxed) + size <= ptAllocSize) {\n    // simple path, where we allocate bump ptr style\n    unsigned offset = nextLoc.fetch_add(size);\n    if (offset + size <= ptAllocSize) {\n      return offset;\n    }\n  }\n\n  if (invalid) {\n    GALOIS_DIE(\"allocating after delete\");\n    return ptAllocSize;\n  }\n\n  // find a free offset\n  std::lock_guard<Lock> llock(freeOffsetsLock);\n\n  unsigned index = ll;\n  if (!freeOffsets[index].empty()) {\n    unsigned offset = freeOffsets[index].back();\n    freeOffsets[index].pop_back();\n    return offset;\n  }\n\n  // find a bigger size\n  for (; (index < MAX_SIZE) && (freeOffsets[index].empty()); ++index)\n    ;\n\n  if (index == MAX_SIZE) {\n    GALOIS_DIE(\"per-thread storage out of memory\");\n    return ptAllocSize;\n  }\n\n  // Found a bigger free offset. 
Use the first piece equal to required\n  // size and produce vending machine change for the rest.\n  assert(!freeOffsets[index].empty());\n  unsigned offset = freeOffsets[index].back();\n  freeOffsets[index].pop_back();\n\n  // remaining chunk\n  unsigned end   = offset + (1 << index);\n  unsigned start = offset + size;\n  for (unsigned i = index - 1; start < end; --i) {\n    freeOffsets[i].push_back(start);\n    start += (1 << i);\n  }\n\n  assert(offset != ptAllocSize);\n\n  return offset;\n}\n\nvoid galois::substrate::PerBackend::deallocOffset(const unsigned offset,\n                                                  const unsigned sz) {\n  unsigned ll       = nextLog2(sz);\n  unsigned size     = (1 << ll);\n  unsigned expected = offset + size;\n\n  if (nextLoc.compare_exchange_strong(expected, offset)) {\n    // allocation was at the end, so recovered some memory\n    return;\n  }\n\n  if (invalid) {\n    GALOIS_DIE(\"deallocing after delete\");\n    return;\n  }\n\n  // allocation not at the end\n  std::lock_guard<Lock> llock(freeOffsetsLock);\n  freeOffsets[ll].push_back(offset);\n}\n\nvoid* galois::substrate::PerBackend::getRemote(unsigned thread,\n                                               unsigned offset) {\n  char* rbase = heads[thread].load(std::memory_order_relaxed);\n  assert(rbase);\n  return &rbase[offset];\n}\n\nvoid galois::substrate::PerBackend::initCommon(unsigned maxT) {\n  if (!heads) {\n    assert(ThreadPool::getTID() == 0);\n    heads = new std::atomic<char*>[maxT] {};\n  }\n}\n\nchar* galois::substrate::PerBackend::initPerThread(unsigned maxT) {\n  initCommon(maxT);\n  char* b = heads[ThreadPool::getTID()] = (char*)alloc();\n  memset(b, 0, ptAllocSize);\n  return b;\n}\n\nchar* galois::substrate::PerBackend::initPerSocket(unsigned maxT) {\n  initCommon(maxT);\n  unsigned id     = ThreadPool::getTID();\n  unsigned leader = ThreadPool::getLeader();\n  if (id == leader) {\n    char* b = heads[id] = (char*)alloc();\n    memset(b, 0, 
ptAllocSize);\n    return b;\n  }\n  char* expected = nullptr;\n  // wait for leader to fix up socket\n  while (heads[leader].compare_exchange_weak(expected, nullptr)) {\n    substrate::asmPause();\n  }\n  heads[id] = heads[leader].load();\n  return heads[id];\n}\n\nvoid galois::substrate::initPTS(unsigned maxT) {\n  if (!ptsBase) {\n    // unguarded initialization as initPTS will run in the master thread\n    // before any other threads are generated\n    ptsBase = getPTSBackend().initPerThread(maxT);\n  }\n  if (!pssBase) {\n    pssBase = getPPSBackend().initPerSocket(maxT);\n  }\n}\n"
  },
  {
    "path": "libgalois/src/PreAlloc.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/Executor_OnEach.h\"\n#include \"galois/runtime/Mem.h\"\n#include \"galois/runtime/PagePool.h\"\n\nvoid galois::runtime::preAlloc_impl(unsigned num) {\n  unsigned pagesPerThread = (num + activeThreads - 1) / activeThreads;\n  substrate::getThreadPool().run(activeThreads,\n                                 [=]() { pagePoolPreAlloc(pagesPerThread); });\n}\n"
  },
  {
    "path": "libgalois/src/Profile.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/Profile.h\"\n\n#ifdef GALOIS_ENABLE_PAPI\nextern \"C\" {\n#include <papi.h>\n#include <papiStdEventDefs.h>\n}\n#include <iostream>\n\nunsigned long galois::runtime::internal::papiGetTID(void) {\n  return galois::substrate::ThreadPool::getTID();\n}\n#endif\n"
  },
  {
    "path": "libgalois/src/PtrLock.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/PtrLock.h\"\n\nvoid galois::substrate::internal::ptr_slow_lock(std::atomic<uintptr_t>& _l) {\n  uintptr_t oldval;\n  do {\n    while ((_l.load(std::memory_order_acquire) & 1) != 0) {\n      asmPause();\n    }\n    oldval = _l.fetch_or(1, std::memory_order_acq_rel);\n  } while (oldval & 1);\n  assert(_l);\n}\n"
  },
  {
    "path": "libgalois/src/SharedMem.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/SharedMem.h\"\n#include \"galois/substrate/Barrier.h\"\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/substrate/Termination.h\"\n\n#include <memory>\n\ngalois::substrate::SharedMem::SharedMem() {\n  internal::setThreadPool(&m_tpool);\n\n  // delayed initialization because both call getThreadPool in constructor\n  // which is valid only after setThreadPool() above\n  m_biPtr   = std::make_unique<internal::BarrierInstance<>>();\n  m_termPtr = std::make_unique<internal::LocalTerminationDetection<>>();\n\n  internal::setBarrierInstance(m_biPtr.get());\n  internal::setTermDetect(m_termPtr.get());\n}\n\ngalois::substrate::SharedMem::~SharedMem() {\n  internal::setTermDetect(nullptr);\n  internal::setBarrierInstance(nullptr);\n\n  // destructors can call getThreadPool(), hence must 
be destroyed before\n  // setThreadPool() below\n  m_termPtr.reset();\n  m_biPtr.reset();\n\n  internal::setThreadPool(nullptr);\n}\n"
  },
  {
    "path": "libgalois/src/SharedMemSys.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/SharedMemSys.h\"\n\ngalois::SharedMemSys::SharedMemSys() = default;\n\ngalois::SharedMemSys::~SharedMemSys() = default;\n"
  },
  {
    "path": "libgalois/src/SimpleLock.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/SimpleLock.h\"\n\nvoid galois::substrate::SimpleLock::slow_lock() const {\n  int oldval = 0;\n  do {\n    while (_lock.load(std::memory_order_acquire) != 0) {\n      asmPause();\n    }\n    oldval = 0;\n  } while (!_lock.compare_exchange_weak(oldval, 1, std::memory_order_acq_rel,\n                                        std::memory_order_relaxed));\n  assert(is_locked());\n}\n"
  },
  {
    "path": "libgalois/src/Statistics.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/Statistics.h\"\n#include \"galois/runtime/Executor_OnEach.h\"\n\n#include <iostream>\n#include <fstream>\n\nusing namespace galois::runtime;\n\nboost::uuids::uuid galois::runtime::getRandUUID(void) {\n  static boost::uuids::uuid UUID = boost::uuids::random_generator()();\n  return UUID;\n}\n\nusing galois::gstl::Str;\n\nStatManager::StatManager(const std::string& outfile) : m_outfile(outfile) {}\n\nStatManager::~StatManager(void) {}\n\nvoid StatManager::setStatFile(const std::string& outfile) {\n  m_outfile = outfile;\n}\n\nvoid galois::runtime::setStatFile(const std::string& f) {\n  internal::sysStatManager()->setStatFile(f);\n}\n\nvoid galois::runtime::reportRUsage(const std::string& id) {\n  // get rusage at this point in time\n  struct rusage usage_stats;\n  int rusage_result = 
getrusage(RUSAGE_SELF, &usage_stats);\n  if (rusage_result != 0) {\n    GALOIS_DIE(\"getrusage failed: \", rusage_result);\n  }\n\n  // report stats using ID to identify them\n  reportStat(\"rusage\", \"MaxResidentSetSize_\" + id, usage_stats.ru_maxrss,\n             StatTotal::SINGLE);\n  reportStat(\"rusage\", \"SoftPageFaults_\" + id, usage_stats.ru_minflt,\n             StatTotal::SINGLE);\n  reportStat(\"rusage\", \"HardPageFaults_\" + id, usage_stats.ru_majflt,\n             StatTotal::SINGLE);\n}\n\nbool StatManager::printingThreadVals(void) {\n  return galois::substrate::EnvCheck(StatManager::TSTAT_ENV_VAR);\n}\n\nvoid StatManager::print(void) {\n  if (m_outfile == \"\") {\n    printStats(std::cout);\n  } else {\n    std::ofstream outf(m_outfile.c_str());\n    if (outf.good()) {\n      printStats(outf);\n    } else {\n      gWarn(\"Could not open stats file for writing, file provided:\", m_outfile);\n      printStats(std::cerr);\n    }\n  }\n}\n\nvoid StatManager::printStats(std::ostream& out) {\n  mergeStats();\n  printHeader(out);\n  intStats.print(out);\n  fpStats.print(out);\n  strStats.print(out);\n}\n\nvoid StatManager::printHeader(std::ostream& out) const {\n\n  out << \"STAT_TYPE\" << SEP << \"REGION\" << SEP << \"CATEGORY\" << SEP;\n  out << \"TOTAL_TYPE\" << SEP << \"TOTAL\";\n  out << \"\\n\";\n}\n\nStatManager::int_iterator StatManager::intBegin(void) const {\n  return intStats.cbegin();\n}\nStatManager::int_iterator StatManager::intEnd(void) const {\n  return intStats.cend();\n}\n\nStatManager::fp_iterator StatManager::fpBegin(void) const {\n  return fpStats.cbegin();\n}\nStatManager::fp_iterator StatManager::fpEnd(void) const {\n  return fpStats.cend();\n}\n\nStatManager::str_iterator StatManager::paramBegin(void) const {\n  return strStats.cbegin();\n}\nStatManager::str_iterator StatManager::paramEnd(void) const {\n  return strStats.cend();\n}\n\nstatic galois::runtime::StatManager* SM;\n\nvoid galois::runtime::internal::setSysStatManager(\n  
  galois::runtime::StatManager* sm) {\n  GALOIS_ASSERT(!(SM && sm), \"StatManager.cpp: Double Initialization of SM\");\n  SM = sm;\n}\n\nStatManager* galois::runtime::internal::sysStatManager(void) { return SM; }\n\nvoid galois::runtime::reportPageAlloc(const char* category) {\n  galois::runtime::on_each_gen(\n      [category](const unsigned int tid, const unsigned int) {\n        reportStat_Tsum(\"PageAlloc\", category, numPagePoolAllocForThread(tid));\n      },\n      std::make_tuple());\n}\n\nvoid galois::runtime::reportNumaAlloc(const char*) {\n  galois::gWarn(\"reportNumaAlloc NOT IMPLEMENTED YET. TBD\");\n  int nodes = substrate::getThreadPool().getMaxNumaNodes();\n  for (int x = 0; x < nodes; ++x) {\n    // auto rStat = Stats.getRemote(x);\n    // std::lock_guard<substrate::SimpleLock> lg(rStat->first);\n    //      rStat->second.emplace_back(loop, category, numNumaAllocForNode(x));\n  }\n  //  SC->addNumaAllocToStat(std::string(\"(NULL)\"), std::string(category ?\n  //  category : \"(NULL)\"));\n}\n"
  },
  {
    "path": "libgalois/src/Substrate.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/Substrate.h\"\n#include \"galois/substrate/Barrier.h\"\n\ngalois::substrate::Barrier&\ngalois::runtime::getBarrier(unsigned activeThreads) {\n  return galois::substrate::getBarrier(activeThreads);\n}\n"
  },
  {
    "path": "libgalois/src/Support.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n"
  },
  {
    "path": "libgalois/src/Termination.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/gIO.h\"\n#include \"galois/substrate/Termination.h\"\n\n// vtable anchoring\ngalois::substrate::TerminationDetection::~TerminationDetection(void) {}\n\nstatic galois::substrate::TerminationDetection* TERM = nullptr;\n\nvoid galois::substrate::internal::setTermDetect(\n    galois::substrate::TerminationDetection* t) {\n  GALOIS_ASSERT(!(TERM && t), \"Double initialization of TerminationDetection\");\n  TERM = t;\n}\n\ngalois::substrate::TerminationDetection&\ngalois::substrate::getSystemTermination(unsigned activeThreads) {\n  TERM->init(activeThreads);\n  return *TERM;\n}\n"
  },
  {
    "path": "libgalois/src/ThreadPool.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/substrate/EnvCheck.h\"\n#include \"galois/substrate/HWTopo.h\"\n#include \"galois/gIO.h\"\n\n#include <algorithm>\n#include <iostream>\n\n// Forward declare this to avoid including PerThreadStorage.\n// We avoid this to stress that the thread Pool MUST NOT depend on PTS.\nnamespace galois::substrate {\n\nextern void initPTS(unsigned);\n\n}\n\nusing galois::substrate::ThreadPool;\n\nthread_local ThreadPool::per_signal ThreadPool::my_box;\n\nThreadPool::ThreadPool()\n    : mi(getHWTopo().machineTopoInfo), reserved(0), masterFastmode(false),\n      running(false) {\n  signals.resize(mi.maxThreads);\n  initThread(0);\n\n  for (unsigned i = 1; i < mi.maxThreads; ++i) {\n    std::thread t(&ThreadPool::threadLoop, this, i);\n    
threads.emplace_back(std::move(t));\n  }\n\n  // we don't want signals to have to contain atomics, since they are set once\n  while (std::any_of(signals.begin(), signals.end(),\n                     [](per_signal* p) { return !p || !p->done; })) {\n    std::atomic_thread_fence(std::memory_order_seq_cst);\n  }\n}\n\nThreadPool::~ThreadPool() {\n  destroyCommon();\n  for (auto& t : threads) {\n    t.join();\n  }\n}\n\nvoid ThreadPool::destroyCommon() {\n  beKind(); // reset fastmode\n  run(mi.maxThreads, []() { throw shutdown_ty(); });\n}\n\nvoid ThreadPool::burnPower(unsigned num) {\n  num = std::min(num, getMaxUsableThreads());\n\n  // changing number of threads?  just do a reset\n  if (masterFastmode && masterFastmode != num) {\n    beKind();\n  }\n  if (!masterFastmode) {\n    run(num, []() { throw fastmode_ty{true}; });\n    masterFastmode = num;\n  }\n}\n\nvoid ThreadPool::beKind() {\n  if (masterFastmode) {\n    run(masterFastmode, []() { throw fastmode_ty{false}; });\n    masterFastmode = 0;\n  }\n}\n\n// inefficient append\ntemplate <typename T>\nstatic void atomic_append(std::atomic<T*>& headptr, T* newnode) {\n  T* n = nullptr;\n  if (!headptr.compare_exchange_strong(n, newnode))\n    atomic_append(headptr.load()->next, newnode);\n}\n\n// find id\ntemplate <typename T>\nstatic unsigned findID(std::atomic<T*>& headptr, T* node, unsigned off) {\n  T* n = headptr.load();\n  assert(n);\n  if (n == node) {\n    return off;\n  }\n  return findID(n->next, node, off + 1);\n}\n\ntemplate <typename T>\nstatic T* getNth(std::atomic<T*>& headptr, unsigned off) {\n  T* n = headptr.load();\n  if (!off) {\n    return n;\n  }\n  return getNth(n->next, off - 1);\n}\n\nvoid ThreadPool::initThread(unsigned tid) {\n  signals[tid] = &my_box;\n  my_box.topo  = getHWTopo().threadTopoInfo[tid];\n  // Initialize\n  substrate::initPTS(mi.maxThreads);\n\n  if (!EnvCheck(\"GALOIS_DO_NOT_BIND_THREADS\")) {\n    if (my_box.topo.tid != 0 || !EnvCheck(\"GALOIS_DO_NOT_BIND_MAIN_THREAD\")) 
{\n      bindThreadSelf(my_box.topo.osContext);\n    }\n  }\n  my_box.done = 1;\n}\n\nvoid ThreadPool::threadLoop(unsigned tid) {\n  initThread(tid);\n  bool fastmode = false;\n  auto& me      = my_box;\n  do {\n    me.wait(fastmode);\n    cascade(fastmode);\n    try {\n      work();\n    } catch (const shutdown_ty&) {\n      return;\n    } catch (const fastmode_ty& fm) {\n      fastmode = fm.mode;\n    } catch (const dedicated_ty dt) {\n      me.done = 1;\n      dt.fn();\n      return;\n    } catch (const std::exception& exc) {\n      // catch anything thrown within try block that derives from std::exception\n      std::cerr << exc.what();\n      abort();\n    } catch (...) {\n      abort();\n    }\n    decascade();\n  } while (true);\n}\n\nvoid ThreadPool::decascade() {\n  auto& me = my_box;\n  // nothing to wake up\n  if (me.wbegin != me.wend) {\n    auto midpoint = me.wbegin + (1 + me.wend - me.wbegin) / 2;\n    auto& c1done  = signals[me.wbegin]->done;\n    while (!c1done) {\n      asmPause();\n    }\n    if (midpoint < me.wend) {\n      auto& c2done = signals[midpoint]->done;\n      while (!c2done) {\n        asmPause();\n      }\n    }\n  }\n  me.done = 1;\n}\n\nvoid ThreadPool::cascade(bool fastmode) {\n  auto& me = my_box;\n  assert(me.wbegin <= me.wend);\n\n  // nothing to wake up\n  if (me.wbegin == me.wend) {\n    return;\n  }\n\n  auto midpoint = me.wbegin + (1 + me.wend - me.wbegin) / 2;\n\n  auto child1    = signals[me.wbegin];\n  child1->wbegin = me.wbegin + 1;\n  child1->wend   = midpoint;\n  child1->wakeup(fastmode);\n\n  if (midpoint < me.wend) {\n    auto child2    = signals[midpoint];\n    child2->wbegin = midpoint + 1;\n    child2->wend   = me.wend;\n    child2->wakeup(fastmode);\n  }\n}\n\nvoid ThreadPool::runInternal(unsigned num) {\n  // sanitize num\n  // seq write to starting should make work safe\n  GALOIS_ASSERT(!running, \"Recursive thread pool execution not supported\");\n  running = true;\n  num     = std::min(std::max(1U, num), 
getMaxUsableThreads());\n  // my_box is tid 0\n  auto& me  = my_box;\n  me.wbegin = 1;\n  me.wend   = num;\n\n  assert(!masterFastmode || masterFastmode == num);\n  // launch threads\n  cascade(masterFastmode);\n  // Do master thread work\n  try {\n    work();\n  } catch (const shutdown_ty&) {\n    return;\n  } catch (const fastmode_ty& fm) {\n  }\n  // wait for children\n  decascade();\n  // Clean up\n  work    = nullptr;\n  running = false;\n}\n\nvoid ThreadPool::runDedicated(std::function<void(void)>& f) {\n  // TODO(ddn): update galois::runtime::activeThreads to reflect the dedicated\n  // thread but we don't want to depend on galois::runtime symbols and too many\n  // clients access galois::runtime::activeThreads directly.\n  GALOIS_ASSERT(!running,\n                \"Can't start dedicated thread during parallel section\");\n  ++reserved;\n\n  GALOIS_ASSERT(reserved < mi.maxThreads, \"Too many dedicated threads\");\n  work          = [&f]() { throw dedicated_ty{f}; };\n  auto child    = signals[mi.maxThreads - reserved];\n  child->wbegin = 0;\n  child->wend   = 0;\n  child->done   = 0;\n  child->wakeup(masterFastmode);\n  while (!child->done) {\n    asmPause();\n  }\n  work = nullptr;\n}\n\nstatic galois::substrate::ThreadPool* TPOOL = nullptr;\n\nvoid galois::substrate::internal::setThreadPool(ThreadPool* tp) {\n  GALOIS_ASSERT(!(TPOOL && tp), \"Double initialization of ThreadPool\");\n  TPOOL = tp;\n}\n\ngalois::substrate::ThreadPool& galois::substrate::getThreadPool() {\n  GALOIS_ASSERT(TPOOL, \"ThreadPool not initialized\");\n  return *TPOOL;\n}\n"
  },
  {
    "path": "libgalois/src/ThreadTimer.cpp",
    "content": "#include \"galois/runtime/ThreadTimer.h\"\n#include \"galois/runtime/Executor_OnEach.h\"\n#include \"galois/runtime/Statistics.h\"\n\n#include <ctime>\n#include <limits>\n\nvoid galois::runtime::ThreadTimers::reportTimes(const char* category,\n                                                const char* region) {\n\n  uint64_t minTime = std::numeric_limits<uint64_t>::max();\n\n  for (unsigned i = 0; i < timers_.size(); ++i) {\n    auto ns = timers_.getRemote(i)->get_nsec();\n    minTime = std::min(minTime, ns);\n  }\n\n  std::string timeCat = category + std::string(\"PerThreadTimes\");\n  std::string lagCat  = category + std::string(\"PerThreadLag\");\n\n  on_each_gen(\n      [&](auto, auto) {\n        auto ns  = timers_.getLocal()->get_nsec();\n        auto lag = ns - minTime;\n        assert(lag > 0 && \"negative time lag from min is impossible\");\n\n        reportStat_Tmax(region, timeCat.c_str(), ns / 1000000);\n        reportStat_Tmax(region, lagCat.c_str(), lag / 1000000);\n      },\n      std::make_tuple());\n}\n"
  },
  {
    "path": "libgalois/src/Threads.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/ThreadPool.h\"\n#include \"galois/Threads.h\"\n\n#include <algorithm>\nnamespace galois {\nnamespace runtime {\nunsigned int activeThreads = 1;\n}\n} // namespace galois\n\nunsigned int galois::setActiveThreads(unsigned int num) noexcept {\n  num = std::min(num, galois::substrate::getThreadPool().getMaxUsableThreads());\n  num = std::max(num, 1U);\n  galois::runtime::activeThreads = num;\n  return num;\n}\n\nunsigned int galois::getActiveThreads() noexcept {\n  return galois::runtime::activeThreads;\n}\n"
  },
  {
    "path": "libgalois/src/Timer.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Timer.h\"\n#include \"galois/runtime/Statistics.h\"\n\nusing namespace galois;\n\nvoid Timer::start() { startT = clockTy::now(); }\n\nvoid Timer::stop() { stopT = clockTy::now(); }\n\nuint64_t Timer::get() const {\n  return std::chrono::duration_cast<std::chrono::milliseconds>(stopT - startT)\n      .count();\n}\n\nuint64_t Timer::get_usec() const {\n  return std::chrono::duration_cast<std::chrono::microseconds>(stopT - startT)\n      .count();\n}\n\nTimeAccumulator::TimeAccumulator() : ltimer(), acc(0) {}\n\nvoid TimeAccumulator::start() { ltimer.start(); }\n\nvoid TimeAccumulator::stop() {\n  ltimer.stop();\n  acc += ltimer.get_usec();\n}\n\nuint64_t TimeAccumulator::get() const { return acc / 1000; }\nuint64_t TimeAccumulator::get_usec() const { return acc; }\n\nTimeAccumulator& 
TimeAccumulator::operator+=(const TimeAccumulator& rhs) {\n  acc += rhs.acc;\n  return *this;\n}\n\nTimeAccumulator& TimeAccumulator::operator+=(const Timer& rhs) {\n  acc += rhs.get_usec();\n  return *this;\n}\n\nStatTimer::StatTimer(const char* const name, const char* const region) {\n  const char* n = name ? name : \"Time\";\n  const char* r = region ? region : \"(NULL)\";\n\n  name_   = gstl::makeStr(n);\n  region_ = gstl::makeStr(r);\n\n  valid_ = false;\n}\n\nStatTimer::~StatTimer() {\n  if (valid_) {\n    stop();\n  }\n\n  // only report non-zero stat\n  if (TimeAccumulator::get()) {\n    galois::runtime::reportStat_Tmax(region_, name_, TimeAccumulator::get());\n  }\n}\n\nvoid StatTimer::start() {\n  TimeAccumulator::start();\n  valid_ = true;\n}\n\nvoid StatTimer::stop() {\n  valid_ = false;\n  TimeAccumulator::stop();\n}\n\nuint64_t StatTimer::get_usec() const { return TimeAccumulator::get_usec(); }\n"
  },
  {
    "path": "libgalois/src/Tracer.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file Tracer.cpp\n *\n * Implementations/variables for Tracer.h\n */\n\n#include \"galois/runtime/Tracer.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/substrate/EnvCheck.h\"\n\n#include <fstream>\n#include <cassert>\n#include <iostream>\n#include <chrono>\n#include <mutex>\n\n#include <sys/types.h>\n#include <unistd.h>\n\nusing namespace galois::substrate;\n\nstatic bool doCerr     = false;\nstatic bool doCerrInit = false;\n\nnamespace galois {\nnamespace runtime {\nuint32_t getHostID() __attribute__((weak));\n} // end namespace runtime\n} // end namespace galois\n\n/**\n * Returns 0\n */\nuint32_t galois::runtime::getHostID() { return 0; }\n\nstatic std::ostream& openIfNot() {\n  if (!doCerrInit) {\n    doCerr     = EnvCheck(\"GALOIS_DEBUG_TRACE_STDERR\");\n    doCerrInit = true;\n  }\n  
if (doCerr)\n    return std::cerr;\n  static std::ofstream output;\n  if (!output.is_open()) {\n    pid_t id       = getpid();\n    char name[100] = \"\";\n    gethostname(name, sizeof(name));\n    char fname[120];\n    snprintf(fname, sizeof(fname), \"%s.%d.log\", name, id);\n    output.open(fname, std::ios_base::app);\n  }\n  assert(output.is_open());\n  return output;\n}\n\nvoid galois::runtime::internal::printTrace(std::ostringstream& os) {\n  using namespace std::chrono;\n  static SimpleLock lock;\n  std::lock_guard<SimpleLock> lg(lock);\n  auto& out = openIfNot();\n  auto dtn  = system_clock::now().time_since_epoch();\n  out << \"<\" << dtn.count() << \",\" << getHostID() << \"> \";\n  out << os.str();\n  out.flush();\n  static int iSleep   = 0;\n  static bool doSleep = EnvCheck(\"GALOIS_DEBUG_TRACE_PAUSE\", iSleep);\n  if (doSleep)\n    usleep(iSleep ? iSleep : 10);\n}\n\nstatic std::ofstream& openIfNot_output() {\n  static std::ofstream output_file;\n  if (!output_file.is_open()) {\n    char name[100] = \"\";\n    gethostname(name, sizeof(name));\n    char fname[120];\n    snprintf(fname, sizeof(fname), \"output_%s_%d.log\", name,\n             galois::runtime::getHostID());\n    output_file.open(fname, std::ios_base::app);\n  }\n  assert(output_file.is_open());\n  return output_file;\n}\n\nvoid galois::runtime::internal::print_output_impl(std::ostringstream& os) {\n  using namespace galois::runtime;\n  static SimpleLock lock2;\n  std::lock_guard<SimpleLock> lg(lock2);\n  auto& out = openIfNot_output();\n  out << os.str();\n  out.flush();\n}\n\n//! Specifies whether or not tracing is enabled\nbool galois::runtime::internal::doTrace = false;\n//! Specify if program has checked environment to see if doTrace should be on\n//! or off\nbool galois::runtime::internal::initTrace = false;\n"
  },
  {
    "path": "libgalois/src/Version.cpp.in",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Version.h\"\n\n#define QUOTE(name) #name\n#define STR(macro) QUOTE(macro)\n\nstd::string galois::getVersion() { return STR(@GALOIS_VERSION@); }\n\nstd::string galois::getRevision() { return \"unknown\"; }\n\nint galois::getVersionMajor() { return @GALOIS_VERSION_MAJOR@; }\n\nint galois::getVersionMinor() { return @GALOIS_VERSION_MINOR@; }\n\nint galois::getVersionPatch() { return @GALOIS_VERSION_PATCH@; }\n\nint galois::getCopyrightYear() { return @GALOIS_COPYRIGHT_YEAR@; }\n"
  },
  {
    "path": "libgalois/src/gIO.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/gIO.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/substrate/EnvCheck.h\"\n#include \"galois/substrate/ThreadPool.h\"\n\n#include <cstdlib>\n#include <cstdio>\n#include <ctime>\n#include <cstring>\n#include <cstdarg>\n#include <cerrno>\n#include <unistd.h>\n#include <iostream>\n#include <fstream>\n#include <iomanip>\n#include <mutex>\n\nstatic void printString(bool error, bool newline, const std::string& prefix,\n                        const std::string& s) {\n  static galois::substrate::SimpleLock IOLock;\n  std::lock_guard<decltype(IOLock)> lock(IOLock);\n  std::ostream& o = error ? 
std::cerr : std::cout;\n  if (prefix.length())\n    o << prefix << \": \";\n  o << s;\n  if (newline)\n    o << \"\\n\";\n}\n\nvoid galois::gDebugStr(const std::string& s) {\n  static bool skip = galois::substrate::EnvCheck(\"GALOIS_DEBUG_SKIP\");\n  if (skip)\n    return;\n  static const unsigned TIME_STR_SIZE = 32;\n  char time_str[TIME_STR_SIZE];\n  time_t rawtime;\n  struct tm* timeinfo;\n\n  time(&rawtime);\n  timeinfo = localtime(&rawtime);\n\n  strftime(time_str, TIME_STR_SIZE, \"[%H:%M:%S]\", timeinfo);\n\n  std::ostringstream os;\n  os << \"[\" << time_str << \" \" << std::setw(3)\n     << galois::substrate::ThreadPool::getTID() << \"] \" << s;\n\n  if (galois::substrate::EnvCheck(\"GALOIS_DEBUG_TO_FILE\")) {\n    static galois::substrate::SimpleLock dIOLock;\n    std::lock_guard<decltype(dIOLock)> lock(dIOLock);\n    static std::ofstream debugOut;\n    if (!debugOut.is_open()) {\n      char fname[] = \"gdebugXXXXXX\";\n      int fd       = mkstemp(fname);\n      close(fd);\n      debugOut.open(fname);\n      gInfo(\"Debug output going to \", fname);\n    }\n    debugOut << os.str() << \"\\n\";\n    debugOut.flush();\n  } else {\n    printString(true, true, \"DEBUG\", os.str());\n  }\n}\n\nvoid galois::gPrintStr(const std::string& s) {\n  printString(false, false, \"\", s);\n}\n\nvoid galois::gInfoStr(const std::string& s) {\n  printString(false, true, \"INFO\", s);\n}\n\nvoid galois::gWarnStr(const std::string& s) {\n  printString(false, true, \"WARNING\", s);\n}\n\nvoid galois::gErrorStr(const std::string& s) {\n  printString(true, true, \"ERROR\", s);\n}\n\nvoid galois::gFlush() { fflush(stdout); }\n"
  },
  {
    "path": "libgalois/test/CMakeLists.txt",
    "content": "function(add_test_unit name)\n  set(options)\n  set(multi_value_args REQUIRES COMMAND_PREFIX)\n  cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n\n  foreach(required ${X_REQUIRES})\n    if(${${required}} MATCHES \"TRUE\")\n    else()\n      message(STATUS \"NOT compiling ${name} (missing: ${required})\")\n      return()\n    endif()\n  endforeach()\n\n  set(test_name unit-${name})\n\n  add_executable(${test_name} ${name}.cpp)\n  target_link_libraries(${test_name} galois_shmem lonestar)\n\n  set(commandline ${X_COMMAND_PREFIX})\n  list(APPEND commandline \"$<TARGET_FILE:${test_name}>\")\n  list(APPEND commandline ${X_UNPARSED_ARGUMENTS})\n\n  add_test(NAME ${test_name} COMMAND ${commandline})\n\n  # Allow parallel tests\n  set_tests_properties(${test_name}\n    PROPERTIES\n      ENVIRONMENT GALOIS_DO_NOT_BIND_THREADS=1\n      LABELS quick\n    )\nendfunction()\n\nadd_test_unit(acquire)\nadd_test_unit(bandwidth)\nadd_test_unit(barriers 1024 2)\nadd_test_unit(empty-member-lcgraph)\nadd_test_unit(flatmap)\nadd_test_unit(floatingPointErrors)\nadd_test_unit(foreach)\nadd_test_unit(forward-declare-graph)\nadd_test_unit(gcollections)\nadd_test_unit(graph)\nadd_test_unit(graph-compile)\nadd_test_unit(gslist)\nadd_test_unit(hwtopo)\nadd_test_unit(lc-adaptor)\nadd_test_unit(lock)\nadd_test_unit(loop-overhead REQUIRES OPENMP_FOUND)\nadd_test_unit(mem)\nadd_test_unit(morphgraph)\nadd_test_unit(move)\nadd_test_unit(oneach)\nadd_test_unit(papi 2)\nadd_test_unit(pc)\nadd_test_unit(reduction)\nadd_test_unit(sort)\nadd_test_unit(static)\nadd_test_unit(traits)\nadd_test_unit(twoleveliteratora)\nadd_test_unit(wakeup-overhead)\nadd_test_unit(worklists-compile)\nadd_test_unit(morphgraph-removal)\n"
  },
  {
    "path": "libgalois/test/README.md",
    "content": "# Prerequisites\n\nSome tests use sample graphs as inputs, and these can be downloaded with:\n```bash\nmake input\n```\n\nIf you want to point the tests to an existing set of sample graphs, you\ncan use the `cmake -DGALOIS_GRAPH_LOCATION=...`.\n\n# Common ctest commands\n\n```bash\n# All ctest commands should be run from your build directory\ncd ${CMAKE_BINARY_DIR}\n\n# Run all tests in parallel with 4 jobs\nctest -j 4\n\n# Run all tests matching pattern\nctest -R regex\n\n# Run all tests matching label pattern\nctest -L regex\n\n# Show test output\nctest -V\n\nctest --rerun-failed\n\n# Run tests with valgrind memcheck\nctest --test-action memcheck\n\n# ctest state (e.g., last failed tests, test output) is stored in\n# ${CMAKE_BINARY_DIR}/Testing\nfind Testing/ -type f | xargs cat\n```\n\nTests are divided into several major labels:\n- **quick**: Quick tests have no external dependencies and can be run in parallel\n  with other quick tests. Each quick test should run in a second or less. These\n  tests are run as part of our continuous integration pipeline.\n- **nightly**: Nightly tests are tests that take longer (e.g., scalability tests).\n  They are run every night on the current master commit.\n"
  },
  {
    "path": "libgalois/test/acquire.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Timer.h\"\n#include \"galois/runtime/Context.h\"\n\n#include <cstdlib>\n#include <iostream>\n\nint main(int argc, char** argv) {\n  galois::runtime::SimpleRuntimeContext S;\n  galois::runtime::Lockable L;\n\n  int numAcquires = 1;\n  if (argc > 1)\n    numAcquires = atoi(argv[1]);\n  if (numAcquires <= 0)\n    numAcquires = 1024 * 1024 * 1024;\n\n  galois::Timer t;\n  t.start();\n\n  for (int x = 0; x < numAcquires; ++x)\n    galois::runtime::acquire(&L, galois::MethodFlag::WRITE);\n\n  t.stop();\n  std::cout << \"Locking time: \" << t.get() << \" ms after \" << numAcquires\n            << \"\\n\";\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/bandwidth.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n\n#include <random>\n#include <cstdio>\n#include <time.h>\n\ntemplate <typename Gen>\nvoid random_access(Gen& gen, int* buf, size_t size, size_t accesses) {\n  std::uniform_int_distribution<size_t> randIndex(0, size - 1);\n  for (unsigned i = 0; i < accesses; ++i) {\n    size_t idx = randIndex(gen);\n    buf[idx] += 1;\n  }\n}\n\nstruct run_local_helper {\n  int* block;\n  size_t seed;\n  size_t size;\n  run_local_helper(int* b, size_t s, size_t ss) : block(b), seed(s), size(ss) {}\n  run_local_helper() {}\n  void operator()(unsigned int tid, unsigned int num) {\n    std::mt19937 gen(seed + tid);\n    std::uniform_int_distribution<int> randSeed;\n    auto r   = galois::block_range(block, block + size, tid, num);\n    size_t d = std::distance(r.first, 
r.second);\n    random_access(gen, r.first, d, d);\n  }\n};\n\nvoid run_local(size_t seed, size_t mega) {\n  size_t size = mega * 1024 * 1024;\n  int* block  = (int*)malloc(size * sizeof(*block));\n\n  // Assuming first touch policy\n  run_local_helper r(block, seed, size);\n  galois::on_each(r);\n  free(block);\n}\n\nstruct run_interleaved_helper {\n  int* block;\n  size_t seed;\n  size_t size;\n  run_interleaved_helper(int* b, size_t s, size_t ss)\n      : block(b), seed(s), size(ss) {}\n  run_interleaved_helper() {}\n  void operator()(unsigned int tid, unsigned int num) {\n    std::mt19937 gen(seed + tid);\n    std::uniform_int_distribution<int> randSeed;\n    auto r   = galois::block_range(block, block + size, tid, num);\n    size_t d = std::distance(r.first, r.second);\n    random_access(gen, block, size, d);\n  }\n};\n\nvoid run_interleaved(size_t seed, size_t mega, bool full) {\n  size_t size = mega * 1024 * 1024;\n  auto ptr    = galois::substrate::largeMallocInterleaved(\n      size * sizeof(int),\n      full ? 
galois::substrate::getThreadPool().getMaxThreads()\n           : galois::runtime::activeThreads);\n  int* block = (int*)ptr.get();\n\n  run_interleaved_helper r(block, seed, size);\n  galois::on_each(r);\n}\n\ntemplate <typename Fn>\nlong time_run(Fn fn) {\n  galois::Timer t1;\n  t1.start();\n  fn();\n  t1.stop();\n  return t1.get();\n}\n\nstruct F1 {\n  size_t seed;\n  size_t mega;\n  F1(size_t s, size_t m) : seed(s), mega(m) {}\n  void operator()() { run_local(seed, mega); }\n};\n\nstruct F2 {\n  size_t seed;\n  size_t mega;\n  bool full;\n  F2(size_t s, size_t m, bool f) : seed(s), mega(m), full(f) {}\n  void operator()() { run_interleaved(seed, mega, full); }\n};\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys Galois_runtime;\n  unsigned M  = galois::substrate::getThreadPool().getMaxThreads() / 2;\n  size_t mega = 1;\n  if (argc > 1)\n    mega = atoi(argv[1]);\n  if (!mega)\n    mega = 200;\n\n  size_t seed = time(NULL);\n  printf(\"Working set: %zu MB\\n\\n\", mega);\n  printf(\"Effective random-access bandwidth (MB/s)\\n\");\n  printf(\"T    LOCAL    INTERLEAVE    FULL-INTERLEAVE\\n\");\n  for (unsigned threads = 1; threads <= M; ++threads) {\n    galois::setActiveThreads(threads);\n\n    long local_millis           = time_run(F1(seed, mega));\n    long interleave_millis      = time_run(F2(seed, mega, false));\n    long full_interleave_millis = time_run(F2(seed, mega, true));\n    double mb                   = mega / (double)sizeof(int);\n    // 4 + length of column header\n    printf(\"%4d %8.2f %13.2f %18.2f\\n\", threads, mb / local_millis * 1000.0,\n           mb / interleave_millis * 1000.0,\n           mb / full_interleave_millis * 1000.0);\n  }\n}\n"
  },
  {
    "path": "libgalois/test/barriers.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Timer.h\"\n#include \"galois/Galois.h\"\n#include \"galois/substrate/Barrier.h\"\n\n#include <iostream>\n#include <cstdlib>\n#include <unistd.h>\n\nunsigned iter       = 0;\nunsigned numThreads = 0;\n\nchar bname[100];\n\nstruct emp {\n  galois::substrate::Barrier& b;\n\n  void go() {\n    for (unsigned i = 0; i < iter; ++i) {\n      b.wait();\n    }\n  }\n\n  template <typename T>\n  void operator()(const T&) {\n    go();\n  }\n\n  template <typename T, typename C>\n  void operator()(const T&, const C&) {\n    go();\n  }\n};\n\nvoid test(std::unique_ptr<galois::substrate::Barrier> b) {\n  if (b == nullptr) {\n    std::cout << \"skipping \" << bname << \"\\n\";\n    return;\n  }\n\n  unsigned M = numThreads;\n  if (M > 16)\n    M /= 2;\n  while (M) {\n    galois::setActiveThreads(M);\n    
b->reinit(M);\n    galois::Timer t;\n    t.start();\n    emp e{*b.get()};\n    galois::on_each(e);\n    t.stop();\n    std::cout << bname << \",\" << b->name() << \",\" << M << \",\" << t.get()\n              << \"\\n\";\n    M -= 1;\n  }\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys Galois_runtime;\n  if (argc > 1)\n    iter = atoi(argv[1]);\n  else\n    iter = 16 * 1024;\n  if (argc > 2)\n    numThreads = atoi(argv[2]);\n  else\n    numThreads = galois::substrate::getThreadPool().getMaxThreads();\n\n  gethostname(bname, sizeof(bname));\n  using namespace galois::substrate;\n  test(createPthreadBarrier(1));\n  test(createCountingBarrier(1));\n  test(createMCSBarrier(1));\n  test(createTopoBarrier(1));\n  test(createDisseminationBarrier(1));\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/empty-member-lcgraph.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/graphs/LCGraph.h\"\n\nint main() {\n  constexpr size_t intvoid =\n      sizeof(galois::graphs::internal::EdgeInfoBase<int, void>);\n  constexpr size_t intint =\n      sizeof(galois::graphs::internal::EdgeInfoBase<int, int>);\n  static_assert(intvoid < intint, \"Failed to do empty member optimization\");\n  return intvoid < intint ? 0 : 1;\n}\n"
  },
  {
    "path": "libgalois/test/flatmap.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/FlatMap.h\"\n#include \"galois/Timer.h\"\n\n#include <boost/iterator/counting_iterator.hpp>\n#include <cstdlib>\n#include <iostream>\n#include <map>\n#include <random>\n\nstruct element {\n  volatile int val;\n  element() : val() {}\n  element(int x) : val(x) {}\n  operator int() const { return val; }\n};\n\nstd::ostream& operator<<(std::ostream& out, const element& e) {\n  out << e.val;\n  return out;\n}\n\ntemplate <typename MapTy>\nstruct Fn1 {\n  MapTy* m;\n  void operator()(const int& x) const { (*m)[x] = element(x); }\n};\n\ntemplate <typename MapTy>\nstruct Fn2 {\n  MapTy* m;\n  void operator()(const int& x) const {\n    int v = (*m)[x].val;\n    GALOIS_ASSERT(v == x || v == 0);\n  }\n};\n\ntemplate <typename MapTy>\nvoid timeMapParallel(std::string c, const 
std::vector<int>& keys) {\n  MapTy m;\n  galois::Timer t1, t2;\n  t1.start();\n  galois::do_all(galois::iterate(keys), Fn1<MapTy>{&m});\n  t1.stop();\n  t2.start();\n  galois::do_all(galois::iterate(keys), Fn2<MapTy>{&m});\n  t2.stop();\n  std::cout << c << \" \" << t1.get() << \" \" << t2.get() << \"\\n\";\n}\n\ntemplate <typename MapTy>\nvoid timeMap(std::string c, const std::vector<int>& keys) {\n  MapTy m;\n  galois::Timer t1, t2;\n  t1.start();\n  for (auto& x : keys) {\n    m[x] = element(x);\n  }\n  t1.stop();\n  t2.start();\n  for (auto& x : keys) {\n    int v = m[x].val;\n    GALOIS_ASSERT(v == x);\n  }\n  t2.stop();\n  std::cout << c << \" \" << t1.get() << \" \" << t2.get() << \"\\n\";\n}\n\ntemplate <typename MapTy>\nvoid testMap() {\n  MapTy m;\n  MapTy m2(m);\n  MapTy m3;\n\n  m3.insert(std::make_pair(10, 0));\n  m3.insert(std::make_pair(20, 0));\n\n  MapTy m4(m3.begin(), m3.end());\n\n  m2 = m3;\n  m3 = std::move(m2);\n\n  m[0] = 0;\n  m[1] = 1;\n  m[3] = 2;\n  m[3] = m[3] + 3;\n  m[4] = 4;\n\n  m.insert(std::make_pair(5, 4));\n  m.insert(m4.begin(), m4.end());\n\n  std::cout << \"10 == \" << m.find(10)->first << \"\\n\";\n\n  // m.erase(10);\n  // m.erase(1);\n\n  if (m.size() != 7 || m.empty())\n    abort();\n  std::swap(m, m3);\n  if (m.size() != 2 || m.empty())\n    abort();\n  m.clear();\n  if (m.size() != 0 || !m.empty())\n    abort();\n  std::swap(m, m3);\n  if (m.size() != 7 || m.empty())\n    abort();\n\n  for (auto ii = m.begin(), ee = m.end(); ii != ee; ++ii)\n    std::cout << ii->first << \" \" << ii->second << \" \";\n  std::cout << \"\\n\";\n\n  for (auto ii = m.cbegin(), ee = m.cend(); ii != ee; ++ii)\n    std::cout << ii->first << \" \" << ii->second << \" \";\n  std::cout << \"\\n\";\n\n  for (auto ii = m.rbegin(), ee = m.rend(); ii != ee; ++ii)\n    std::cout << ii->first << \" \" << ii->second << \" \";\n  std::cout << \"\\n\";\n\n  for (auto ii = m.crbegin(), ee = m.crend(); ii != ee; ++ii)\n    std::cout << ii->first << \" \" << 
ii->second << \" \";\n  std::cout << \"\\n\";\n}\n\nvoid timeTests(std::string prefix, const std::vector<int>& keys) {\n  for (int i = 0; i < 3; ++i)\n    timeMap<std::map<int, element>>(prefix + \"std::map\", keys);\n  for (int i = 0; i < 3; ++i)\n    timeMap<galois::flat_map<int, element>>(prefix + \"flat_map\", keys);\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys Galois_runtime;\n  testMap<std::map<int, element>>();\n  testMap<galois::flat_map<int, element>>();\n  galois::setActiveThreads(8);\n\n  int size = 100;\n  if (argc > 1)\n    size = atoi(argv[1]);\n  if (size <= 0)\n    size = 1000000;\n\n  std::mt19937 mt(0);\n  std::uniform_int_distribution<int> dist(0, size);\n  std::vector<int> randomKeys;\n  std::vector<int> keys;\n  for (int i = 0; i < size; ++i) {\n    randomKeys.push_back(dist(mt));\n    keys.push_back(i);\n  }\n\n  timeTests(\"seq \", keys);\n  timeTests(\"random \", randomKeys);\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/floatingPointErrors.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include <boost/rational.hpp>\n\n#include <random>\n#include <iostream>\n\n#include <cstdio>\n\nusing Rational = boost::rational<size_t>;\n\nvoid multiplyTest(const double mpcand, const double mplier, const double ans) {\n\n  double lim = mplier / 100.0;\n  assert(lim >= 1.0);\n\n  std::mt19937 eng;\n  eng.seed(0);\n\n  std::uniform_real_distribution<double> dist(0.0, lim);\n\n  double remainMplier = mplier;\n\n  double computed = 0.0;\n\n  while (remainMplier > 0.0) {\n\n    double partial = dist(eng);\n\n    if (partial > remainMplier) {\n      partial = remainMplier;\n    }\n\n    remainMplier -= partial;\n\n    computed += mpcand * partial;\n  }\n\n  std::printf(\"Error in multiplication with doubles = %g\\n\", (ans - computed));\n}\n\nvoid multiplyTestRational(const Rational& mpcand, const Rational& mplier,\n     
                     const Rational& ans) {\n\n  size_t lim = boost::rational_cast<size_t>(mplier / Rational(100));\n\n  std::mt19937 eng;\n  eng.seed(0);\n\n  std::uniform_int_distribution<size_t> dist(1, lim);\n\n  Rational remainMplier = mplier;\n\n  Rational computed(0);\n\n  while (remainMplier > Rational(0)) {\n\n    Rational partial(dist(eng), lim);\n\n    if (partial > remainMplier) {\n      partial = remainMplier;\n    }\n\n    remainMplier -= partial;\n\n    computed += mpcand * partial;\n  }\n\n  std::cout << \"Error in multiplication with Rational: \" << (ans - computed)\n            << \"\\n\";\n}\n\nvoid rationalConversionError(double fpVal) {\n\n  static const unsigned SIGNIFICANT_BITS = 40;\n\n  size_t q = (size_t(1) << SIGNIFICANT_BITS);\n  size_t p = size_t(fpVal * q);\n\n  Rational r(p, q);\n\n  std::printf(\"Conversion error = %g\\n\",\n              (fpVal - boost::rational_cast<double>(r)));\n}\n\nint main() {\n  multiplyTest(0.125, 1000.0, 125.0);\n\n  multiplyTestRational(Rational(125, 1000), Rational(1000), Rational(125));\n\n  rationalConversionError(boost::rational_cast<double>(Rational(1, 3)));\n\n  rationalConversionError(sqrt(2.0));\n  rationalConversionError(sqrt(3.0));\n  rationalConversionError(sqrt(1000.0));\n  rationalConversionError(sqrt(100000.0));\n  rationalConversionError(sqrt(15485867)); // prime number\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/foreach.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Bag.h\"\n#include <vector>\n#include <iostream>\n\nvoid function_pointer(int x, galois::UserContext<int>&) {\n  std::cout << x << \"\\n\";\n}\n\nstruct function_object {\n  void operator()(int x, galois::UserContext<int>& ctx) const {\n    function_pointer(x, ctx);\n  }\n};\n\nint main() {\n  galois::SharedMemSys Galois_runtime;\n  std::vector<int> v(10);\n  galois::InsertBag<int> b;\n\n  galois::for_each(galois::iterate(v), &function_pointer,\n                   galois::loopname(\"func-pointer\"));\n  galois::for_each(galois::iterate(v), function_object(),\n                   galois::loopname(\"with function object and options\"));\n  galois::do_all(galois::iterate(v), [&b](int x) { b.push(x); });\n  galois::for_each(galois::iterate(b), function_object());\n\n  // Works 
without context as well\n#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1400\n#else\n  // Don't support Context-free versions yet (gcc 4.7 problem)\n  //  galois::for_each(v.begin(), v.end(), [](int x) { std::cout << x << \"\\n\";\n  //  });\n  // galois::for_each(b, [](int x) { std::cout << x << \"\\n\"; });\n#endif\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/forward-declare-graph.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/graphs/Graph.h\"\n#include \"galois/graphs/LCGraph.h\"\n\nstruct Node1;\ntypedef galois::graphs::MorphGraph<Node1, void, true> Graph1;\nstruct Node1 {\n  Graph1::edge_iterator edge;\n  Graph1::GraphNode gnode;\n};\n\nstruct Node2;\ntypedef galois::graphs::LC_CSR_Graph<Node2, void> Graph2;\nstruct Node2 {\n  Graph2::edge_iterator edge;\n  Graph2::GraphNode gnode;\n};\n\nstruct Node3;\ntypedef galois::graphs::LC_InlineEdge_Graph<Node3, void> Graph3;\nstruct Node3 {\n  Graph3::edge_iterator edge;\n  Graph3::GraphNode gnode;\n};\n\nstruct Node4;\ntypedef galois::graphs::LC_Linear_Graph<Node4, void> Graph4;\nstruct Node4 {\n  Graph4::edge_iterator edge;\n  Graph4::GraphNode gnode;\n};\n\nstruct Node5;\ntypedef galois::graphs::LC_Morph_Graph<Node5, void> Graph5;\nstruct Node5 {\n  Graph5::edge_iterator 
edge;\n  Graph5::GraphNode gnode;\n};\n\nint main() {\n  galois::SharedMemSys Galois_runtime;\n  Graph1 g1;\n  Graph2 g2;\n  Graph3 g3;\n  Graph4 g4;\n  Graph5 g5;\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/gcollections.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Bag.h\"\n#include \"galois/Galois.h\"\n#include \"galois/gdeque.h\"\n#include \"galois/gslist.h\"\n#include \"galois/Timer.h\"\n#include \"galois/gIO.h\"\n#include \"galois/runtime/Mem.h\"\n\n#include <boost/iterator/counting_iterator.hpp>\n\n#include <iostream>\n#include <cassert>\n#include <string>\n#include <deque>\n#include <vector>\n#include <random>\n\ntemplate <typename C>\nauto constexpr needs_heap(int)\n    -> decltype(typename C::promise_to_dealloc(), bool()) {\n  return true;\n}\n\ntemplate <typename C>\nbool constexpr needs_heap(...) 
{\n  return false;\n}\n\ntemplate <typename C, typename HeapTy, typename V>\nauto addToCollection(C& c, HeapTy& heap, V&& v) ->\n    typename std::enable_if<needs_heap<C>(0)>::type {\n  c.push_front(heap.heap, std::forward<V>(v));\n}\n\ntemplate <typename C, typename HeapTy, typename V>\nauto addToCollection(C& c, HeapTy&, V&& v) ->\n    typename std::enable_if<!needs_heap<C>(0)>::type {\n  c.push_back(std::forward<V>(v));\n}\n\ntemplate <typename C>\nauto removeFromCollection(C& c) ->\n    typename std::enable_if<needs_heap<C>(0)>::type {\n  c.pop_front(typename C::promise_to_dealloc());\n}\n\ntemplate <typename C>\nauto removeFromCollection(C& c) ->\n    typename std::enable_if<!needs_heap<C>(0)>::type {\n  c.pop_back();\n}\n\ntemplate <typename C, bool Enable>\nstruct Heap {};\n\ntemplate <typename C>\nstruct Heap<C, true> {\n  galois::runtime::FixedSizeHeap heap;\n  Heap() : heap(sizeof(typename C::block_type)) {}\n};\n\ntemplate <typename C>\nvoid testBasic(std::string prefix, C&& collection, int N) {\n  Heap<C, needs_heap<C>(0)> heap;\n\n  assert(N > 0);\n  C c = std::move(collection);\n  for (int i = 0; i < N; ++i)\n    addToCollection(c, heap, i);\n\n  int i = 0;\n  for (auto it = c.begin(); it != c.end(); ++it, ++i) {\n    ;\n  }\n\n  GALOIS_ASSERT(N == std::distance(c.begin(), c.end()), prefix);\n\n  i = N - 1;\n  for (; !c.empty(); --i, removeFromCollection(c)) {\n    ;\n  }\n\n  GALOIS_ASSERT(0 == std::distance(c.begin(), c.end()), prefix);\n}\n\ntemplate <typename C>\nvoid testNormal(std::string prefix, C&& collection, int N) {\n  Heap<C, needs_heap<C>(0)> heap;\n\n  assert(N > 0);\n  C c = std::move(collection);\n  for (int i = 0; i < N; ++i)\n    addToCollection(c, heap, i);\n\n  int i = 0;\n  for (auto it = c.begin(); it != c.end(); ++it, ++i) {\n    GALOIS_ASSERT(*it == i, prefix);\n  }\n\n  i = N - 1;\n  for (auto it = c.rbegin(); it != c.rend(); ++it, --i) {\n    GALOIS_ASSERT(*it == i, prefix);\n  }\n\n  GALOIS_ASSERT(static_cast<int>(c.size()) 
== N, prefix);\n\n  GALOIS_ASSERT(static_cast<int>(c.size()) == std::distance(c.begin(), c.end()),\n                prefix);\n\n  i = N - 1;\n  for (; !c.empty(); --i, removeFromCollection(c)) {\n    GALOIS_ASSERT(c.back() == i, prefix);\n  }\n\n  GALOIS_ASSERT(static_cast<int>(c.size()) == 0, prefix);\n  GALOIS_ASSERT(static_cast<int>(c.size()) == std::distance(c.begin(), c.end()),\n                prefix);\n}\n\ntemplate <typename C>\nvoid testSort(std::string prefix, C&& collection, int N) {\n  Heap<C, needs_heap<C>(0)> heap;\n\n  assert(N > 0);\n  C c = std::move(collection);\n  std::mt19937 gen;\n  std::uniform_int_distribution<int> dist(0, 100);\n  for (int i = 0; i < N; ++i)\n    addToCollection(c, heap, dist(gen));\n\n  std::sort(c.begin(), c.end());\n\n  int last = c.front();\n  for (auto it = c.begin() + 1; it != c.end(); ++it) {\n    GALOIS_ASSERT(last <= *it, prefix);\n    last = *it;\n  }\n\n  last = c.back();\n  removeFromCollection(c);\n  for (; !c.empty(); removeFromCollection(c)) {\n    GALOIS_ASSERT(last >= c.back(), prefix);\n    last = c.back();\n  }\n}\n\ntemplate <typename C, typename Iterator>\nvoid timeAccess(std::string prefix, C&& c, Iterator first, Iterator last) {\n  Heap<C, needs_heap<C>(0)> heap;\n\n  galois::Timer t1, t2;\n  t1.start();\n  while (first != last) {\n    addToCollection(c, heap, *first++);\n  }\n  t1.stop();\n  t2.start();\n  for (auto ii = c.begin(), ei = c.end(); ii != ei; ++ii) {\n    (*ii).val;\n  }\n  t2.stop();\n  std::cout << prefix << \" insert: \" << t1.get() << \" traverse: \" << t2.get()\n            << \"\\n\";\n}\n\ntemplate <typename T>\nvoid timeAccesses(std::string prefix, T&& x, int size) {\n  for (int i = 0; i < 3; ++i)\n    timeAccess(prefix, std::forward<T>(x), boost::counting_iterator<int>(0),\n               boost::counting_iterator<int>(size));\n}\n\nstruct element {\n  volatile int val;\n  element(int x) : val(x) {}\n};\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys Galois_runtime;\n 
 testBasic(\"galois::gslist\", galois::gslist<int>(), 32 * 32);\n  testNormal(\"galois::gdeque\", galois::gdeque<int>(), 32 * 32);\n  // testSort(\"galois::gdeque\", galois::gdeque<int>(), 32 * 32);\n\n  int size = 100;\n  if (argc > 1)\n    size = atoi(argv[1]);\n  if (size <= 0)\n    size = 1000000;\n  timeAccesses(\"std::deque\", std::deque<element>(), size);\n  timeAccesses(\"std::vector\", std::vector<element>(), size);\n  timeAccesses(\"galois::gdeque\", galois::gdeque<element>(), size);\n  timeAccesses(\"galois::gslist\", galois::gslist<element>(), size);\n  timeAccesses(\"galois::concurrent_gslist\",\n               galois::concurrent_gslist<element>(), size);\n  timeAccesses(\"galois::InsertBag\", galois::InsertBag<element>(), size);\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/graph-compile.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include <iostream>\n\n#include \"galois/graphs/Graph.h\"\n\nstruct NoDefault {\n  int x;\n  explicit NoDefault(int x) : x(x) {}\n\nprivate:\n  NoDefault();\n};\n\ntemplate <typename GraphTy>\nvoid check() {\n  typedef typename GraphTy::GraphNode GNode;\n  int v = 0;\n\n  GraphTy g;\n  GNode n1 = g.createNode(1);\n  GNode n2 = g.createNode(2);\n  GNode n3 = g.createNode(3);\n  GNode n4 = g.createNode(4);\n  GNode n5 = g.createNode(5);\n  g.addNode(n1);\n  g.addNode(n2);\n  g.addNode(n3);\n  g.addNode(n4);\n  g.addNode(n5);\n  g.addMultiEdge(n1, n2, galois::MethodFlag::WRITE, v);\n  g.addMultiEdge(n5, n2, galois::MethodFlag::WRITE, v);\n  g.addMultiEdge(n2, n3, galois::MethodFlag::WRITE, v);\n  g.addMultiEdge(n2, n4, galois::MethodFlag::WRITE, v);\n  for (auto ii : g.edges(n2))\n    std::cout << \"o \" << 
g.getData(g.getEdgeDst(ii)).x << \"\\n\";\n  for (auto ii : g.in_edges(n2))\n    std::cout << \"i \" << g.getData(g.getEdgeDst(ii)).x << \"\\n\";\n  std::cout << \"** removing 2->3\\n\";\n  g.removeEdge(n2, g.findEdge(n2, n3));\n  for (auto ii : g.edges(n2))\n    std::cout << \"o \" << g.getData(g.getEdgeDst(ii)).x << \"\\n\";\n  for (auto ii : g.in_edges(n2))\n    std::cout << \"i \" << g.getData(g.getEdgeDst(ii)).x << \"\\n\";\n  std::cout << \"** removing 5->2\\n\";\n  g.removeEdge(n5, g.findEdge(n5, n2));\n  for (auto ii : g.edges(n2))\n    std::cout << \"o \" << g.getData(g.getEdgeDst(ii)).x << \"\\n\";\n  for (auto ii : g.in_edges(n2))\n    std::cout << \"i \" << g.getData(g.getEdgeDst(ii)).x << \"\\n\";\n  std::cout << \"\\n\\n\";\n}\n\nint main() {\n  galois::SharedMemSys Galois_runtime;\n  check<galois::graphs::MorphGraph<NoDefault, NoDefault, true>>();\n  check<galois::graphs::MorphGraph<NoDefault, NoDefault, false>>();\n  check<galois::graphs::MorphGraph<NoDefault, NoDefault, true, true>>();\n  check<galois::graphs::MorphGraph<NoDefault, NoDefault, false, true>>();\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/graph.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/graphs/Graph.h\"\n#include <string>\n\nint useGraph(std::string inputfile) {\n  //! [Using a graph]\n  typedef galois::graphs::LC_CSR_Graph<int, int> Graph;\n\n  // Create graph\n  Graph g;\n  galois::graphs::readGraph(g, inputfile);\n\n  int sum = 0;\n\n  // Traverse graph\n  for (Graph::iterator ii = g.begin(), ei = g.end(); ii != ei; ++ii) {\n    Graph::GraphNode src = *ii;\n    for (Graph::edge_iterator jj = g.edge_begin(src), ej = g.edge_end(src);\n         jj != ej; ++jj) {\n      Graph::GraphNode dst = g.getEdgeDst(jj);\n      int edgeData         = g.getEdgeData(jj);\n      int nodeData         = g.getData(dst);\n      sum += edgeData * nodeData;\n    }\n  }\n  //! [Using a graph]\n\n  return sum;\n}\n\nint useGraphCxx11(std::string inputfile) {\n  //! 
[Using a graph cxx11]\n  typedef galois::graphs::LC_CSR_Graph<int, int> Graph;\n\n  // Create graph\n  Graph g;\n  galois::graphs::readGraph(g, inputfile);\n\n  int sum = 0;\n\n  // Traverse graph\n  for (Graph::GraphNode src : g) {\n    for (Graph::edge_iterator edge : g.out_edges(src)) {\n      Graph::GraphNode dst = g.getEdgeDst(edge);\n      int edgeData         = g.getEdgeData(edge);\n      int nodeData         = g.getData(dst);\n      sum += edgeData * nodeData;\n    }\n  }\n  //! [Using a graph cxx11]\n\n  return sum;\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  if (argc > 1) {\n    useGraph(argv[1]);\n    useGraphCxx11(argv[1]);\n  }\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/gslist.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/gslist.h\"\n#include \"galois/gIO.h\"\n#include \"galois/runtime/Mem.h\"\n#include <map>\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys Galois_runtime;\n  typedef galois::runtime::FixedSizeHeap Heap;\n  typedef std::unique_ptr<Heap> HeapPtr;\n  typedef galois::substrate::PerThreadStorage<HeapPtr> Heaps;\n  typedef galois::concurrent_gslist<int> Collection;\n  int numThreads = 2;\n  unsigned size  = 100;\n  if (argc > 1)\n    numThreads = atoi(argv[1]);\n  if (numThreads <= 0)\n    numThreads = 2;\n  if (argc > 2)\n    size = atoi(argv[2]);\n  if (size <= 0)\n    size = 10000;\n\n  galois::setActiveThreads(numThreads);\n\n  Heaps heaps;\n  Collection c;\n\n  galois::on_each([&](unsigned int, unsigned int) {\n    HeapPtr& hp = *heaps.getLocal();\n    hp          = 
HeapPtr(new Heap(sizeof(Collection::block_type)));\n    for (unsigned i = 0; i < size; ++i)\n      c.push_front(*hp, i);\n  });\n\n  std::map<int, int> counter;\n  for (auto i : c) {\n    counter[i] += 1;\n  }\n  for (unsigned i = 0; i < size; ++i) {\n    GALOIS_ASSERT(counter[i] == numThreads);\n  }\n  GALOIS_ASSERT(counter.size() == size);\n\n  galois::on_each([&](unsigned int, unsigned int) {\n    while (c.pop_front(Collection::promise_to_dealloc()))\n      ;\n  });\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/hwtopo.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/HWTopo.h\"\n#include \"galois/gIO.h\"\n\n#include <iostream>\n\nvoid printMyTopo() {\n  auto t = galois::substrate::getHWTopo();\n  std::cout << \"T,C,P,N: \" << t.machineTopoInfo.maxThreads << \" \"\n            << t.machineTopoInfo.maxCores << \" \" << t.machineTopoInfo.maxSockets\n            << \" \" << t.machineTopoInfo.maxNumaNodes << \"\\n\";\n  for (unsigned i = 0; i < t.machineTopoInfo.maxThreads; ++i) {\n    auto& c = t.threadTopoInfo[i];\n    std::cout << \"tid: \" << c.tid << \" leader: \" << c.socketLeader\n              << \" socket: \" << c.socket << \" numaNode: \" << c.numaNode\n              << \" cumulativeMaxSocket: \" << c.cumulativeMaxSocket\n              << \" osContext: \" << c.osContext\n              << \" osNumaNode: \" << c.osNumaNode << \"\\n\";\n  }\n}\n\nvoid 
test(const std::string& name, const std::vector<int>& found,\n          const std::vector<int>& expected) {\n  if (found != expected) {\n    std::cerr << \"test \" << name << \" failed\\n\";\n\n    std::cerr << \"found: \";\n    for (auto i : found) {\n      std::cerr << i;\n    }\n    std::cerr << \"\\n\";\n\n    std::cerr << \"expected: \";\n    for (auto i : expected) {\n      std::cerr << i;\n    }\n    std::cerr << \"\\n\";\n    std::abort();\n  }\n}\n\nint main() {\n  printMyTopo();\n\n  using namespace galois::substrate;\n\n  test(\"parse with spaces\", parseCPUList(\"     0   \\n\"), std::vector<int>{0});\n  test(\"parse empty\", parseCPUList(\"        \\n\"), std::vector<int>{});\n  test(\"parse singletons\", parseCPUList(\"     0,1,2   \\n\"),\n       std::vector<int>{0, 1, 2});\n  test(\"parse mix of singletons and ranges\", parseCPUList(\"     0,1,2-4   \\n\"),\n       std::vector<int>{0, 1, 2, 3, 4});\n  test(\"parse multiple ranges\", parseCPUList(\"     0-1,2-4   \\n\"),\n       std::vector<int>{0, 1, 2, 3, 4});\n  test(\"parse range\", parseCPUList(\"     0-4   \\n\"),\n       std::vector<int>{0, 1, 2, 3, 4});\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/lc-adaptor.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/graphs/LC_Adaptor_Graph.h\"\n#include <boost/iterator/counting_iterator.hpp>\n\nstruct CSRArrays {\n  int* outIdx;\n  int* outs;\n  int* nodeData;\n  int numNodes;\n  int numEdges;\n};\n\nclass MyGraph\n    : public galois::graphs::LC_Adaptor_Graph<\n          int, void, MyGraph, int, boost::counting_iterator<int>, int*> {\n  CSRArrays m_instance;\n\npublic:\n  MyGraph(const CSRArrays& i) : m_instance(i) {}\n\n  size_t get_id(GraphNode n) const { return n; }\n\n  node_data_reference get_data(GraphNode n) { return m_instance.nodeData[n]; }\n\n  edge_data_reference get_edge_data(edge_iterator) { return {}; }\n\n  GraphNode get_edge_dst(edge_iterator n) { return *n; }\n\n  int get_size() const { return m_instance.numNodes; }\n  int get_size_edges() const { return m_instance.numEdges; }\n\n  iterator 
get_begin() const { return iterator(0); }\n  iterator get_end() const { return iterator(m_instance.numNodes); }\n\n  edge_iterator get_edge_begin(GraphNode n) {\n    return n == 0 ? &m_instance.outs[0]\n                  : &m_instance.outs[m_instance.outIdx[n - 1]];\n  }\n  edge_iterator get_edge_end(GraphNode n) {\n    return &m_instance.outs[m_instance.outIdx[n]];\n  }\n};\n\nint main() {\n  CSRArrays arrays;\n  MyGraph g(arrays);\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/lock.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/SimpleLock.h\"\n\n#include <cstdlib>\n\nvolatile int V;\n\nint main(int argc, char** argv) {\n  unsigned M = 1;\n  if (argc > 1)\n    M = atoi(argv[1]);\n  if (!M)\n    M = 1000000000;\n  galois::substrate::SimpleLock L;\n  for (unsigned x = 0; x < M; ++x) {\n    V = 0;\n    L.lock();\n    V = 1;\n    L.unlock();\n    V = 2;\n  }\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/lockmgr.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/Lockable.h\"\n\n#include <iostream>\n\nusing namespace galois::runtime;\n\nstruct simple : public Lockable {\n  int foo;\n};\n\nchar translate(int i) {\n  switch (i) {\n  case 0:\n    return 'F';\n  case 1:\n    return 'N';\n  case 3:\n    return 'O';\n  default:\n    return '?';\n  }\n}\n\n// FIXME: include ro tests\n\nint main(int argc, char** argv) {\n  simple s1, s2;\n  LockManagerBase b1, b2;\n\n  std::cout << translate(b1.tryAcquire(&s1, false)) << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  std::cout << translate(b1.tryAcquire(&s1, false)) << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  std::cout << translate(b1.tryAcquire(&s2, false)) << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << 
\"\\n\";\n  std::cout << translate(b2.tryAcquire(&s1, false)) << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  std::cout << translate(b2.tryAcquire(&s2, false)) << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  auto rb1 = b1.releaseAll();\n  std::cout << rb1.first << \" \" << rb1.second << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  std::cout << translate(b2.tryAcquire(&s1, false)) << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  std::cout << translate(b2.tryAcquire(&s2, false)) << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  // b1.forceAcquire(&s1);\n  // b1.dump(std::cout); b2.dump(std::cout); std::cout << \"\\n\";\n  // b1.forceAcquire(&s2);\n  // b1.dump(std::cout); b2.dump(std::cout); std::cout << \"\\n\";\n  std::cout << translate(b2.tryAcquire(&s1, false)) << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  std::cout << translate(b2.tryAcquire(&s2, false)) << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  auto rb2 = b2.releaseAll();\n  std::cout << rb2.first << \" \" << rb2.second << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n  rb1 = b1.releaseAll();\n  std::cout << rb1.first << \" \" << rb1.second << \"\\n\";\n  b1.dump(std::cout);\n  b2.dump(std::cout);\n  std::cout << \"\\n\";\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/loop-overhead.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include \"galois/substrate/ThreadPool.h\"\n\n#include <iostream>\n#include <cstdlib>\n#include <omp.h>\n\nint RandomNumber() { return (rand() % 1000000); }\nunsigned iter = 1;\n\nstruct emp {\n  template <typename T>\n  void operator()(const T& t) const {\n    galois::substrate::compilerBarrier();\n  }\n  template <typename T, typename C>\n  void operator()(const T& t, const C& c) const {\n    galois::substrate::compilerBarrier();\n  }\n};\n\nunsigned t_inline(std::vector<unsigned>& V, unsigned num) {\n  galois::Timer t;\n  t.start();\n  emp e;\n  for (unsigned x = 0; x < iter; ++x)\n    for (unsigned i = 0; i < num; ++i)\n      e(i);\n  t.stop();\n  return t.get();\n}\n\nunsigned t_stl(std::vector<unsigned>& V, unsigned num) {\n  galois::Timer t;\n  
t.start();\n  for (unsigned x = 0; x < iter; ++x)\n    std::for_each(V.begin(), V.begin() + num, emp());\n  t.stop();\n  return t.get();\n}\n\nunsigned t_omp(std::vector<unsigned>& V, unsigned num, unsigned th) {\n  omp_set_num_threads(th); // galois::runtime::LL::getMaxThreads());\n\n  galois::Timer t;\n  t.start();\n  for (unsigned x = 0; x < iter; ++x) {\n    emp f;\n#pragma omp parallel for schedule(guided)\n    for (unsigned n = 0; n < num; ++n)\n      f(n);\n  }\n  t.stop();\n  return t.get();\n}\n\nunsigned t_doall(bool burn, bool steal, std::vector<unsigned>& V, unsigned num,\n                 unsigned th) {\n  galois::setActiveThreads(th); // galois::runtime::LL::getMaxThreads());\n  if (burn)\n    galois::substrate::getThreadPool().burnPower(th);\n\n  galois::Timer t;\n  t.start();\n  for (unsigned x = 0; x < iter; ++x)\n    galois::do_all(galois::iterate(V.begin(), V.begin() + num), emp());\n  t.stop();\n  return t.get();\n}\n\nunsigned t_foreach(bool burn, std::vector<unsigned>& V, unsigned num,\n                   unsigned th) {\n  galois::setActiveThreads(th);\n  if (burn)\n    galois::substrate::getThreadPool().burnPower(th);\n\n  galois::Timer t;\n  t.start();\n  for (unsigned x = 0; x < iter; ++x)\n    galois::for_each(galois::iterate(V.begin(), V.begin() + num), emp(),\n                     galois::no_pushes(), galois::disable_conflict_detection(),\n                     galois::wl<galois::worklists::StableIterator<>>());\n  t.stop();\n  return t.get();\n}\n\nvoid test(\n    std::string header, unsigned maxThreads, unsigned minVec, unsigned maxVec,\n    std::function<unsigned(std::vector<unsigned>&, unsigned, unsigned)> func) {\n  std::cout << header << \"\";\n  for (unsigned M = maxThreads; M; M >>= 1)\n    std::cout << \",\\t\" << M;\n  std::cout << \"\\n\";\n  std::vector<unsigned> V(maxVec);\n  for (unsigned v = minVec; v < maxVec; v <<= 2) {\n    std::cout << v << \"\\t\";\n    for (unsigned M = maxThreads; M; M >>= 1) {\n      std::cout << 
\",\\t\" << func(V, v, M);\n    }\n    std::cout << \"\\n\";\n  }\n  std::cout << \"\\n\";\n}\n\nint main(int argc, char** argv) {\n  using namespace std::placeholders;\n#pragma omp parallel for\n  for (int x = 0; x < 100; ++x) {\n  }\n\n  unsigned maxVector = 16;\n  if (argc > 1)\n    iter = atoi(argv[1]);\n  if (!iter)\n    iter = 16 * 1024;\n  if (argc > 2)\n    maxVector = atoi(argv[2]);\n  if (!maxVector)\n    maxVector = 1024 * 1024;\n\n  unsigned M = galois::substrate::getThreadPool().getMaxThreads() / 2;\n  test(\"inline\\t\", 1, 16, maxVector,\n       [](std::vector<unsigned>& V, unsigned num, unsigned th) {\n         return t_inline(V, num);\n       });\n  test(\"stl\\t\", 1, 16, maxVector,\n       [](std::vector<unsigned>& V, unsigned num, unsigned th) {\n         return t_stl(V, num);\n       });\n  test(\"omp\\t\", M, 16, maxVector, t_omp);\n  test(\"doall N W\", M, 16, maxVector,\n       std::bind(t_doall, false, false, _1, _2, _3));\n  test(\"doall N S\", M, 16, maxVector,\n       std::bind(t_doall, false, true, _1, _2, _3));\n  test(\"foreach N\", M, 16, maxVector, std::bind(t_foreach, false, _1, _2, _3));\n  test(\"doall B W\", M, 16, maxVector,\n       std::bind(t_doall, true, false, _1, _2, _3));\n  test(\"doall B S\", M, 16, maxVector,\n       std::bind(t_doall, true, true, _1, _2, _3));\n  test(\"foreach B\", M, 16, maxVector, std::bind(t_foreach, true, _1, _2, _3));\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/mem.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/gIO.h\"\n#include \"galois/runtime/Mem.h\"\n\nusing namespace galois::runtime;\nusing namespace galois::substrate;\n\nstruct element {\n  unsigned val;\n  element* next;\n  element(int i) : val(i), next(0) {}\n};\n\nint main() {\n  galois::SharedMemSys Galois_runtime;\n  unsigned baseAllocSize = SystemHeap::AllocSize;\n\n  FixedSizeAllocator<element> falloc;\n  element* last = nullptr;\n  for (unsigned i = 0; i < baseAllocSize; ++i) {\n    element* ptr = falloc.allocate(1);\n    falloc.construct(ptr, i);\n    ptr->next = last;\n    last      = ptr;\n  }\n  for (unsigned i = 0; i < baseAllocSize; ++i) {\n    GALOIS_ASSERT(last);\n    GALOIS_ASSERT(last->val == baseAllocSize - 1 - i);\n    element* next = last->next;\n    falloc.destroy(last);\n    falloc.deallocate(last, 
1);\n    last = next;\n  }\n  GALOIS_ASSERT(!last);\n\n  VariableSizeHeap valloc;\n  size_t allocated;\n  GALOIS_ASSERT(1 < baseAllocSize);\n  valloc.allocate(1, allocated);\n  GALOIS_ASSERT(allocated == 1);\n\n  valloc.allocate(baseAllocSize + 1, allocated);\n  GALOIS_ASSERT(allocated <= baseAllocSize);\n\n  int toAllocate = baseAllocSize + 1;\n  while (toAllocate) {\n    valloc.allocate(toAllocate, allocated);\n    toAllocate -= allocated;\n    GALOIS_ASSERT(allocated);\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/morphgraph-removal.cpp",
    "content": "#include <iostream>\n#include <string>\n#include <cassert>\n#include <vector>\n#include <type_traits>\n\n#include \"galois/graphs/MorphGraph.h\"\n\nstatic unsigned int numNodes = 10;\nstatic bool verbose          = false;\n\n// only tracks out-going edges\nusing OutGraph =\n    galois::graphs::MorphGraph<unsigned int, unsigned int, true, false>;\n\n// tracks out-going and incoming edges w/ shared edge data\nusing InOutGraph =\n    galois::graphs::MorphGraph<unsigned int, unsigned int, true, true>;\n\n// tracks outgoing edges symmetrically w/ shared edge data\nusing SymGraph = galois::graphs::MorphGraph<unsigned int, unsigned int, false>;\n\ntemplate <class G>\nvoid traverseOutGraph(G& g) {\n  for (auto n : g) {\n    for (auto e : g.edges(n)) {\n      auto dst = g.getEdgeDst(e);\n      std::cout << \"(\" << g.getData(n) << \" -> \" << g.getData(dst) << \"): \";\n      std::cout << g.getEdgeData(e) << \"\\n\";\n    }\n  }\n}\n\ntemplate <class G>\nvoid traverseInGraph(G& g) {\n  for (auto n : g) {\n    for (auto ie : g.in_edges(n)) {\n      auto src = g.getEdgeDst(ie);\n      std::cout << \"(\" << g.getData(n) << \" <- \" << g.getData(src) << \"): \";\n      std::cout << g.getEdgeData(ie) << \"\\n\";\n    }\n  }\n}\n\ntemplate <>\nvoid traverseInGraph(OutGraph&) {}\n\n// construct a directed clique w/ (i, j) where i < j\ntemplate <class G>\nvoid constructGraph(G& g, std::vector<typename G::GraphNode>& v) {\n  // add nodes\n  for (unsigned int i = 0; i < numNodes; i++) {\n    auto n = g.createNode(i);\n    v.push_back(n);\n    g.addNode(n);\n  }\n\n  // add edges\n  for (unsigned int i = 0; i < numNodes; i++) {\n    for (unsigned int j = i + 1; j < numNodes; j++) {\n      g.getEdgeData(g.addEdge(v[i], v[j])) = (i + j);\n    }\n  }\n\n  if (verbose) {\n    std::cout << \"Original\\n\";\n    traverseOutGraph(g);\n    traverseInGraph(g);\n  }\n}\n\ntemplate <class G>\nvoid removeGraphOutEdge(G& g, typename G::GraphNode n1,\n                        
typename G::GraphNode n2) {\n  auto e = g.findEdge(n1, n2);\n  if (e != g.edge_end(n1)) {\n    g.removeEdge(n1, e);\n  }\n}\n\nvoid removeGraphInEdge(SymGraph& g, SymGraph::GraphNode n1,\n                       SymGraph::GraphNode n2) {\n  auto e12                            = g.findInEdge(n1, n2);\n  auto GALOIS_USED_ONLY_IN_DEBUG(e21) = g.findEdge(n2, n1);\n\n  if (e12 == g.in_edge_end(n1)) {\n    assert(e21 == g.edge_end(n2));\n  } else {\n    assert(e21 != g.edge_end(n2));\n    assert(n2 == g.getEdgeDst(e12));\n    assert(n1 == g.getEdgeDst(e21));\n    assert(g.getEdgeData(e12) == g.getEdgeData(e21));\n    g.removeEdge(n1, e12);\n    //    g.removeEdge(n2, e21); this is also OK\n  }\n}\n\nvoid removeGraphInEdge(InOutGraph& g, InOutGraph::GraphNode n1,\n                       InOutGraph::GraphNode n2) {\n  auto ie                           = g.findInEdge(n1, n2);\n  auto GALOIS_USED_ONLY_IN_DEBUG(e) = g.findEdge(n2, n1);\n  if (ie == g.in_edge_end(n1)) {\n    assert(e == g.edge_end(n2));\n  } else {\n    assert(e != g.edge_end(n2));\n    assert(n2 == g.getEdgeDst(ie));\n    assert(n1 == g.getEdgeDst(e));\n    assert(g.getEdgeData(ie) == g.getEdgeData(e));\n    //    g.removeEdge(n1, ie); // this leads to compile error\n    g.removeEdge(n2, e);\n  }\n}\n\nunsigned int countUnmatchedEdge(OutGraph& g,\n                                std::vector<typename OutGraph::GraphNode>& v,\n                                unsigned int i, unsigned int j) {\n  unsigned int unmatched = 0;\n\n  // nodes whose out edges are all removed\n  for (unsigned int ri = 0; ri < i; ri++) {\n    for (unsigned int rj = 0; rj < numNodes; rj++) {\n      unmatched += (g.edge_end(v[ri]) != g.findEdge(v[ri], v[rj]));\n    }\n  }\n\n  // the node whose out edge removed up to j\n  for (unsigned int rj = 0; rj < j + 1; rj++) {\n    unmatched += (g.edge_end(v[i]) != g.findEdge(v[i], v[rj]));\n  }\n  for (unsigned int rj = j + 1; rj < numNodes; rj++) {\n    unmatched += (g.edge_end(v[i]) == 
g.findEdge(v[i], v[rj]));\n  }\n\n  // nodes whose out edges are kept wholly\n  for (unsigned int ri = i + 1; ri < numNodes; ri++) {\n    for (unsigned int rj = 0; rj < ri + 1; rj++) {\n      unmatched += (g.edge_end(v[ri]) != g.findEdge(v[ri], v[rj]));\n    }\n    for (unsigned int rj = ri + 1; rj < numNodes; rj++) {\n      unmatched += (g.edge_end(v[ri]) == g.findEdge(v[ri], v[rj]));\n    }\n  }\n\n  return unmatched;\n}\n\nunsigned int countUnmatchedEdge(InOutGraph& g,\n                                std::vector<typename InOutGraph::GraphNode>& v,\n                                unsigned int i, unsigned int j) {\n  unsigned int unmatched = 0;\n\n  // nodes whose out edges are all removed\n  for (unsigned int ri = 0; ri < i; ri++) {\n    for (unsigned int rj = 0; rj < numNodes; rj++) {\n      unmatched += (g.edge_end(v[ri]) != g.findEdge(v[ri], v[rj]));\n      unmatched += (g.in_edge_end(v[rj]) != g.findInEdge(v[rj], v[ri]));\n    }\n  }\n\n  // the node whose out edge removed up to j\n  for (unsigned int rj = 0; rj < j + 1; rj++) {\n    unmatched += (g.edge_end(v[i]) != g.findEdge(v[i], v[rj]));\n    unmatched += (g.in_edge_end(v[rj]) != g.findInEdge(v[rj], v[i]));\n  }\n  for (unsigned int rj = j + 1; rj < numNodes; rj++) {\n    unmatched += (g.edge_end(v[i]) == g.findEdge(v[i], v[rj]));\n    unmatched += (g.in_edge_end(v[rj]) == g.findInEdge(v[rj], v[i]));\n  }\n\n  // nodes whose out edges are kept wholly\n  for (unsigned int ri = i + 1; ri < numNodes; ri++) {\n    for (unsigned int rj = 0; rj < ri + 1; rj++) {\n      unmatched += (g.edge_end(v[ri]) != g.findEdge(v[ri], v[rj]));\n      unmatched += (g.in_edge_end(v[rj]) != g.findInEdge(v[rj], v[ri]));\n    }\n    for (unsigned int rj = ri + 1; rj < numNodes; rj++) {\n      unmatched += (g.edge_end(v[ri]) == g.findEdge(v[ri], v[rj]));\n      unmatched += (g.in_edge_end(v[rj]) == g.findInEdge(v[rj], v[ri]));\n    }\n  }\n\n  return unmatched;\n}\n\nunsigned int countUnmatchedEdge(SymGraph& g,\n                
                std::vector<typename SymGraph::GraphNode>& v,\n                                unsigned int i, unsigned int j) {\n  unsigned int unmatched = 0;\n\n  // no self loops\n  for (unsigned int k = 0; k < numNodes; k++) {\n    unmatched += (g.edge_end(v[k]) != g.findEdge(v[k], v[k]));\n    unmatched += (g.in_edge_end(v[k]) != g.findInEdge(v[k], v[k]));\n  }\n\n  // nodes whose out edges are all removed\n  for (unsigned int ri = 0; ri < i; ri++) {\n    for (unsigned int rj = ri + 1; rj < numNodes; rj++) {\n      unmatched += (g.edge_end(v[ri]) != g.findEdge(v[ri], v[rj]));\n      unmatched += (g.in_edge_end(v[rj]) != g.findInEdge(v[rj], v[ri]));\n    }\n  }\n\n  // the node whose out edge removed up to j\n  for (unsigned int rj = i; rj < j + 1; rj++) {\n    unmatched += (g.edge_end(v[i]) != g.findEdge(v[i], v[rj]));\n    unmatched += (g.in_edge_end(v[rj]) != g.findInEdge(v[rj], v[i]));\n  }\n  for (unsigned int rj = j + 1; rj < numNodes; rj++) {\n    unmatched += (g.edge_end(v[i]) == g.findEdge(v[i], v[rj]));\n    unmatched += (g.in_edge_end(v[rj]) == g.findInEdge(v[rj], v[i]));\n  }\n\n  // nodes whose out edges are kept wholly\n  for (unsigned int ri = i + 1; ri < numNodes; ri++) {\n    for (unsigned int rj = ri + 1; rj < numNodes; rj++) {\n      unmatched += (g.edge_end(v[ri]) == g.findEdge(v[ri], v[rj]));\n      unmatched += (g.in_edge_end(v[rj]) == g.findInEdge(v[rj], v[ri]));\n    }\n  }\n\n  return unmatched;\n}\n\ntemplate <class G>\nunsigned int testGraphOutEdgeRemoval(G& g,\n                                     std::vector<typename G::GraphNode>& v) {\n  constructGraph(g, v);\n  unsigned int numFailedRemoval = 0;\n\n  for (unsigned int i = 0; i < numNodes; i++) {\n    for (unsigned int j = i + 1; j < numNodes; j++) {\n      removeGraphOutEdge(g, v[i], v[j]);\n      numFailedRemoval += (0 != countUnmatchedEdge(g, v, i, j));\n\n      if (verbose) {\n        std::cout << \"Removed edge (\" << i << \" -> \" << j << \")\\n\";\n        
traverseOutGraph(g);\n        traverseInGraph(g);\n      }\n    }\n  }\n\n  return numFailedRemoval;\n}\n\ntemplate <class G>\nunsigned int testGraphInEdgeRemoval(G& g,\n                                    std::vector<typename G::GraphNode>& v) {\n  constructGraph(g, v);\n  unsigned int numFailedRemoval = 0;\n\n  for (unsigned int i = 0; i < numNodes; i++) {\n    for (unsigned int j = i + 1; j < numNodes; j++) {\n      removeGraphInEdge(g, v[j], v[i]);\n      numFailedRemoval += (0 != countUnmatchedEdge(g, v, i, j));\n\n      if (verbose) {\n        std::cout << \"Removed in_edge (\" << j << \" <- \" << i << \")\\n\";\n        traverseOutGraph(g);\n        traverseInGraph(g);\n      }\n    }\n  }\n\n  return numFailedRemoval;\n}\n\nint main() {\n  galois::SharedMemSys G;\n  unsigned int numFailure = 0;\n\n  OutGraph outG;\n  std::vector<OutGraph::GraphNode> outV;\n  auto num = testGraphOutEdgeRemoval(outG, outV);\n  numFailure += num;\n  std::cout << \"OutGraph: Failed \" << num << \" edge removals\\n\";\n\n  SymGraph symG, symG2;\n  std::vector<SymGraph::GraphNode> symV, symV2;\n  num = testGraphOutEdgeRemoval(symG, symV);\n  numFailure += num;\n  std::cout << \"SymGraph: Failed \" << num << \" edge removals\\n\";\n  num = testGraphInEdgeRemoval(symG2, symV2);\n  numFailure += num;\n  std::cout << \"SymGraph: Failed \" << num << \" in_edge removals\\n\";\n\n  InOutGraph inOutG, inOutG2;\n  std::vector<InOutGraph::GraphNode> inOutV, inOutV2;\n  num = testGraphOutEdgeRemoval(inOutG, inOutV);\n  numFailure += num;\n  std::cout << \"InOutGraph: Failed \" << num << \" edge removals\\n\";\n  num = testGraphInEdgeRemoval(inOutG2, inOutV2);\n  numFailure += num;\n  std::cout << \"InOutGraph: Failed \" << num << \" in_edge removals\\n\";\n\n  return (numFailure > 0) ? -1 : 0;\n}\n"
  },
  {
    "path": "libgalois/test/morphgraph.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"galois/runtime/Profile.h\"\n\n#include <cstdint>\n#include <iostream>\n#include <string>\n\nusing OutGraph =\n    galois::graphs::MorphGraph<unsigned int, unsigned int, true, false>;\nusing InOutGraph =\n    galois::graphs::MorphGraph<unsigned int, unsigned int, true, true>;\nusing SymGraph = galois::graphs::MorphGraph<unsigned int, unsigned int, false>;\n\nstd::string filename;\nstd::string statfile;\nstd::string graphtype;\n\n// Writes 1, 2, 3, ... into the node data, following the graph's iteration\n// order.\ntemplate <typename Graph>\nvoid initGraph(Graph& g) {\n  unsigned int i = 1;\n  for (auto n : g) {\n    g.getData(n) = i++;\n  }\n}\n\n// Prints the sum of all out-edge data, then subtracts every in-edge datum from\n// that sum and prints what is left. The arithmetic is unsigned 64-bit.\ntemplate <typename Graph>\nvoid traverseGraph(Graph& g) {\n  std::uint64_t sum = 0;\n\n  for (auto n : g) {\n    for (auto oe : g.edges(n)) {\n      sum += g.getEdgeData(oe);\n    }\n  }\n  std::cout << \"  out sum = \" << sum << \"\\n\";\n\n  for (auto n : g) {\n    for (auto ie : g.in_edges(n)) {\n      sum -= g.getEdgeData(ie);\n    }\n  }\n  std::cout << \"  all sum = \" << sum << \"\\n\";\n}\n\n// Loads the file named by the global `filename` into g, starting and stopping\n// `timer` around the construction step only, then labels and traverses the\n// graph. `prompt` is printed first.\ntemplate <typename Graph>\nvoid run(Graph& g, galois::StatTimer& timer, const std::string& prompt) {\n  std::cout << prompt << \"\\n\";\n\n  galois::graphs::FileGraph f;\n  f.fromFileInterleaved<typename Graph::file_edge_data_type>(filename);\n\n  size_t approxGraphSize =\n      120 * f.sizeEdges() *\n      sizeof(typename Graph::edge_data_type); // 120*|E|*sizeof(E)\n  size_t numThreads = galois::getActiveThreads();\n  galois::preAlloc(numThreads +\n                   approxGraphSize / galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  timer.start();\n  galois::runtime::profileVtune(\n      [&g, &f]() {\n        galois::graphs::readGraphDispatch(g, typename Graph::read_tag(), f);\n      },\n      \"Construct MorphGraph\");\n  timer.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  initGraph(g);\n  traverseGraph(g);\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n\n  if (argc < 4) {\n    std::cout << \"Usage: ./test-morphgraph <input> <num_threads> \"\n                 \"<out|in-out|symmetric> [stat_file]\\n\";\n    return 0;\n  }\n\n  filename  = argv[1];\n  graphtype = argv[3];\n\n  auto numThreads = galois::setActiveThreads(std::stoul(argv[2]));\n  std::cout << \"Loading \" << filename << \" with \" << numThreads\n            << \" threads.\\n\";\n\n  if (argc >= 5) {\n    galois::runtime::setStatFile(argv[4]);\n  }\n\n  if (\"out\" == graphtype) {\n    galois::StatTimer outT(\"OutGraphTime\");\n    OutGraph outG;\n    run(outG, outT, \"out graph\");\n  } else if (\"in-out\" == graphtype) {\n    galois::StatTimer inoutT(\"InOutGraphTime\");\n    InOutGraph inoutG;\n    run(inoutG, inoutT, \"in-out graph\");\n  } else if (\"symmetric\" == graphtype) {\n    galois::StatTimer symT(\"SymGraphTime\");\n    SymGraph symG;\n    run(symG, symT, \"symmetric graph\");\n  } else {\n    // An unrecognized type used to be skipped without any message. Only the\n    // diagnostic is new; the exit status is left as it was.\n    std::cerr << \"Unknown graph type '\" << graphtype\n              << \"'; expected out, in-out or symmetric\\n\";\n  }\n\n  galois::runtime::reportParam(\"Load MorphGraph\", \"Threads\", numThreads);\n  galois::runtime::reportParam(\"Load MorphGraph\", \"File\", filename);\n  galois::runtime::reportParam(\"Load MorphGraph\", \"Graph Type\", graphtype);\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/move.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Bag.h\"\n#include \"galois/gdeque.h\"\n#include \"galois/gslist.h\"\n#include \"galois/FlatMap.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/runtime/Mem.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n\n#include <utility>\n\n// Element type that can be moved but not copied; containers under test must\n// never fall back to a copy.\nstruct MoveOnly {\n  MoveOnly()           = default;\n  MoveOnly(MoveOnly&&) = default;\n  MoveOnly& operator=(MoveOnly&&) = default;\n  MoveOnly(const MoveOnly&)       = delete;\n  MoveOnly& operator=(const MoveOnly&) = delete;\n};\n\n// Non-copyable type with user-provided constructor and destructor.\n// NOTE(review): no test below references this type.\nstruct MoveOnlyA {\n  int* x = nullptr;\n  MoveOnlyA() {}\n  MoveOnlyA(const MoveOnlyA&) = delete;\n  // The assignment operator returns a reference to its own class.\n  MoveOnlyA& operator=(const MoveOnlyA&) = delete;\n  ~MoveOnlyA() {}\n};\n\n// Move-constructs, default-constructs, swaps and move-assigns a T.\ntemplate <typename T>\nvoid test(T&& x) {\n  T a = std::move(x);\n  T b;\n  std::swap(a, b);\n  a = std::move(b);\n}\n\n// Moves the container around, then appends y with emplace_back.\ntemplate <typename T, typename U>\nvoid testContainerA(T&& x, U&& y) {\n  T a = std::move(x);\n  T b;\n  b = std::move(a);\n  b.emplace_back(std::move(y));\n}\n\n// Same as testContainerA for containers whose emplace_front and clear take an\n// explicit FixedSizeHeap sized for T::block_type.\ntemplate <typename T, typename U>\nvoid testContainerAA(T&& x, U&& y) {\n  galois::runtime::FixedSizeHeap heap(sizeof(typename T::block_type));\n\n  T a = std::move(x);\n  T b;\n  b = std::move(a);\n  b.emplace_front(heap, std::move(y));\n  b.clear(heap);\n}\n\n// Moves the container around, then adds y with insert.\ntemplate <typename T, typename U>\nvoid testContainerB(T&& x, U&& y) {\n  T a = std::move(x);\n  T b;\n  b = std::move(a);\n  b.insert(std::move(y));\n}\n\n// Moves the container around, then emplaces y at begin().\ntemplate <typename T, typename U>\nvoid testContainerC(T&& x, U&& y) {\n  T a = std::move(x);\n  T b;\n  b = std::move(a);\n  b.emplace(b.begin(), std::move(y));\n}\n\nint main() {\n  galois::SharedMemSys Galois_runtime;\n  // test(galois::FixedSizeBag<MoveOnly>());\n  // test(galois::ConcurrentFixedSizeBag<MoveOnly>());\n  // test(galois::FixedSizeRing<MoveOnly>());\n  test(galois::gdeque<MoveOnly>());\n  test(galois::gslist<MoveOnly>());\n  test(galois::concurrent_gslist<MoveOnly>());\n  test(galois::InsertBag<MoveOnly>());\n  test(galois::LargeArray<MoveOnly>());\n  test(galois::substrate::PerSocketStorage<MoveOnly>());\n  test(galois::substrate::PerThreadStorage<MoveOnly>());\n\n  testContainerA(galois::gdeque<MoveOnly>(), MoveOnly());\n  testContainerAA(galois::gslist<MoveOnly>(), MoveOnly());\n  // testContainerAA(galois::concurrent_gslist<MoveOnly>(), MoveOnly());\n  testContainerA(galois::InsertBag<MoveOnly>(), MoveOnly());\n  testContainerC(galois::gdeque<MoveOnly>(), MoveOnly());\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/oneach.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n\n#include <iostream>\n#include <mutex>\n\n// Runs one on_each loop in which every thread prints \"<t>,<num>\" on its own\n// line while holding a shared lock.\nint main() {\n  galois::SharedMemSys Galois_runtime;\n  galois::substrate::SimpleLock l;\n  // NOTE(review): 10000 presumably exceeds the machine's thread count on\n  // purpose -- confirm.\n  galois::setActiveThreads(10000);\n  galois::on_each(\n      [&l](int t, int num) {\n        // Scoped guard: the lock is released on every exit path of the\n        // lambda, including an exception thrown while printing.\n        std::lock_guard<galois::substrate::SimpleLock> guard(l);\n        std::cout << t << \",\" << num << \"\\n\";\n      },\n      galois::loopname(\"simple loop\"));\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/papi.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/runtime/Profile.h\"\n\n#include <iostream>\n#include <stdexcept>\n#include <string>\n#include <vector>\n\n// Fills vec with its own indices and returns the sum of the elements, both in\n// plain serial loops. Each phase runs inside profilePapi under the labels\n// \"vecInit\" and \"vecSum\".\ntemplate <typename V>\nsize_t vecSumSerial(V& vec) {\n  galois::runtime::profilePapi(\n      [&](void) {\n        for (size_t i = 0, sz = vec.size(); i < sz; ++i) {\n          vec[i] = i;\n        }\n      },\n      \"vecInit\");\n\n  size_t sum = 0;\n\n  galois::runtime::profilePapi(\n      [&](void) {\n        for (size_t i = 0, sz = vec.size(); i < sz; ++i) {\n          sum += vec[i];\n        }\n      },\n      \"vecSum\");\n\n  return sum;\n}\n\n// Same two phases as vecSumSerial, with each loop expressed as a\n// galois::do_all over the index range.\ntemplate <typename V>\nsize_t vecSumParallel(V& vec) {\n  galois::runtime::profilePapi(\n      [&](void) {\n        galois::do_all(galois::iterate(size_t{0}, vec.size()),\n                       [&](size_t i) { vec[i] = i; });\n      },\n      \"vecInit\");\n\n  // Every iteration adds to the total, so it is collected in a reducer\n  // rather than in a plain size_t shared by all iterations of the loop.\n  galois::GAccumulator<size_t> sum;\n\n  galois::runtime::profilePapi(\n      [&](void) {\n        galois::do_all(galois::iterate(size_t{0}, vec.size()),\n                       [&](size_t i) { sum += vec[i]; });\n      },\n      \"vecSum\");\n\n  return sum.reduce();\n}\n\n// Accepts an optional thread count as the only argument, then sums a\n// 1024 * 1024 element vector with the serial variant and prints the result.\nint main(int argc, char* argv[]) {\n\n  galois::SharedMemSys G;\n\n  unsigned long long numThreads;\n  if (argc == 1) {\n    numThreads = 1;\n  } else if (argc == 2) {\n    numThreads = galois::setActiveThreads(std::stoull(argv[1]));\n  } else {\n    throw std::invalid_argument(\n        \"Test received too many command line arguments\");\n  }\n\n  galois::runtime::reportParam(\"NULL\", \"Threads\", numThreads);\n\n  size_t vecSz = 1024 * 1024;\n\n  std::vector<size_t> vec(vecSz);\n\n  size_t sum = vecSumSerial(vec);\n\n  std::cout << \"Array Sum = \" << sum << \"\\n\";\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/pc.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/Timer.h\"\n#include \"galois/Galois.h\"\n\n#include <cstdlib>\n#include <iostream>\n\nusing namespace galois::substrate;\n\n// Number of additions each thread performs per phase.\nint num = 1;\n\n// Times `num` additions into each thread's own slot (reported as L) and `num`\n// additions into the slot of thread (t + 1) % n (reported as R).\ntemplate <typename T>\nvoid testf(const char* str) {\n  PerThreadStorage<T> storage;\n  std::cout << \"\\nRunning: \" << str << \" sizeof \" << sizeof(PerThreadStorage<T>)\n            << \"\\n\";\n\n  auto addLocal = [&storage](unsigned, unsigned) {\n    for (int step = 0; step < num; ++step) {\n      *storage.getLocal() += step;\n    }\n  };\n  auto addRemote = [&storage](unsigned tid, unsigned total) {\n    for (int step = 0; step < num; ++step) {\n      *storage.getRemote((tid + 1) % total) += step;\n    }\n  };\n\n  galois::Timer localTimer;\n  localTimer.start();\n  galois::on_each(addLocal);\n  localTimer.stop();\n\n  galois::Timer remoteTimer;\n  remoteTimer.start();\n  galois::on_each(addRemote);\n  remoteTimer.stop();\n\n  std::cout << str << \" L: \" << localTimer.get()\n            << \" R: \" << remoteTimer.get() << '\\n';\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys Galois_runtime;\n  if (argc > 1) {\n    num = atoi(argv[1]);\n  }\n  if (num <= 0) {\n    num = 1024 * 1024 * 1024;\n  }\n\n  // Start at the pool's maximum and halve the thread count each round until\n  // it reaches zero.\n  for (unsigned threads = galois::substrate::getThreadPool().getMaxThreads();\n       threads != 0; threads /= 2) {\n    galois::setActiveThreads(threads);\n    std::cout << \"Using \" << threads << \" threads\\n\";\n\n    testf<int>(\"int\");\n    testf<double>(\"double\");\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/reduction.cpp",
    "content": "#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/SharedMemSys.h\"\n\n#include <algorithm>\n#include <iostream>\n#include <functional>\n#include <map>\n#include <string>\n#include <utility>\n\n// Movable but not copyable value used to check that a reducible never copies.\nstruct Move {\n  Move()            = default;\n  ~Move()           = default;\n  Move(const Move&) = delete;\n  Move(Move&&) noexcept {}\n  Move& operator=(const Move&) = delete;\n  Move& operator               =(Move&&) noexcept { return *this; }\n};\n\n// Updates and reduces a reducible whose value type can only be moved.\nvoid test_move() {\n  auto merge_fn = [](Move& a, Move &&) -> Move& { return a; };\n\n  auto identity_fn = []() { return Move(); };\n\n  auto r = galois::make_reducible(merge_fn, identity_fn);\n\n  Move x;\n  r.update(std::move(x));\n  r.reduce();\n\n  // And as expected, this will not compile:\n  // r.update(x);\n}\n\n// Reduces string -> int maps by adding the counts of equal keys.\nvoid test_map() {\n  using Map = std::map<std::string, int>;\n\n  auto reduce = [](Map& a, Map&& b) -> Map& {\n    Map v{std::move(b)};\n\n    for (auto& kv : v) {\n      // operator[] value-initializes a missing entry to 0 before the add.\n      a[kv.first] += kv.second;\n    }\n\n    return a;\n  };\n\n  auto zero_fn = []() -> Map { return Map(); };\n\n  auto r = galois::make_reducible(reduce, zero_fn);\n  r.update(Map{std::make_pair(\"key\", 1)});\n  Map& result = r.reduce();\n\n  GALOIS_ASSERT(result[\"key\"] == 1);\n}\n\nvoid other() {}\n\n// Builds a max-reducer from std::max wrapped in a std::function.\nvoid test_max() {\n  const int& (*int_max)(const int&, const int&) = std::max<int>;\n  std::function<const int&(const int&, const int&)> fn{int_max};\n\n  auto r = galois::make_reducible(fn, []() { return 0; });\n\n  constexpr int num = 10;\n\n  r.update(num);\n  r.update(1);\n\n  GALOIS_ASSERT(r.reduce() == num);\n}\n\n// Adds 1 from each of `num` do_all iterations and checks the reduced total.\nvoid test_accum() {\n  galois::GAccumulator<int> accum;\n\n  constexpr int num = 123456;\n\n  galois::do_all(galois::iterate(0, num), [&](int) { accum += 1; });\n\n  GALOIS_ASSERT(accum.reduce() == num);\n}\n\nint main() {\n  galois::SharedMemSys sys;\n  galois::setActiveThreads(2);\n\n  static_assert(sizeof(galois::GAccumulator<int>) <=\n                sizeof(galois::substrate::PerThreadStorage<int>));\n\n  test_map();\n  test_move();\n  test_max();\n  test_accum();\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/sort.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"galois/Timer.h\"\n\n#include <algorithm>\n#include <iostream>\n#include <cstdlib>\n#include <numeric>\n#include <vector>\n\nint RandomNumber() { return (rand() % 1000000); }\nbool IsOdd(int i) { return ((i % 2) == 1); }\n\nstruct IsOddS {\n  bool operator()(int i) const { return ((i % 2) == 1); }\n};\n\n// Number of elements in every generated test vector.\nint vectorSize = 1;\n\n// Compares galois::ParallelSTL::sort with std::sort on the same random data,\n// halving the thread count each round. Returns 1 at the first mismatch (after\n// dumping both arrays), 0 otherwise.\nint do_sort() {\n\n  unsigned M = galois::substrate::getThreadPool().getMaxThreads();\n  std::cout << \"sort:\\n\";\n\n  while (M) {\n\n    galois::setActiveThreads(M); // galois::runtime::LL::getMaxThreads());\n    std::cout << \"Using \" << M << \" threads\\n\";\n\n    std::vector<unsigned> V(vectorSize);\n    std::generate(V.begin(), V.end(), RandomNumber);\n    std::vector<unsigned> C = V;\n\n    galois::Timer t;\n    t.start();\n    galois::ParallelSTL::sort(V.begin(), V.end());\n    t.stop();\n\n    galois::Timer t2;\n    t2.start();\n    std::sort(C.begin(), C.end());\n    t2.stop();\n\n    bool eq = std::equal(C.begin(), C.end(), V.begin());\n\n    std::cout << \"Galois: \" << t.get() << \" STL: \" << t2.get()\n              << \" Equal: \" << eq << \"\\n\";\n\n    if (!eq) {\n      std::vector<unsigned> R = V;\n      std::sort(R.begin(), R.end());\n      if (!std::equal(C.begin(), C.end(), R.begin()))\n        std::cout << \"Cannot be made equal, sort mutated array\\n\";\n      for (size_t x = 0; x < V.size(); ++x) {\n        std::cout << x << \"\\t\" << V[x] << \"\\t\" << C[x];\n        if (V[x] != C[x])\n          std::cout << \"\\tDiff\";\n        if (V[x] < C[x])\n          std::cout << \"\\tLT\";\n        if (V[x] > C[x])\n          std::cout << \"\\tGT\";\n        std::cout << \"\\n\";\n      }\n      return 1;\n    }\n\n    M >>= 1;\n  }\n\n  return 0;\n}\n\n// Compares galois::ParallelSTL::count_if with std::count_if on random data,\n// halving the thread count each round. Returns 1 at the first mismatch, 0\n// otherwise.\nint do_count_if() {\n\n  unsigned M = galois::substrate::getThreadPool().getMaxThreads();\n  std::cout << \"count_if:\\n\";\n\n  while (M) {\n\n    galois::setActiveThreads(M); // galois::runtime::LL::getMaxThreads());\n    std::cout << \"Using \" << M << \" threads\\n\";\n\n    std::vector<unsigned> V(vectorSize);\n    std::generate(V.begin(), V.end(), RandomNumber);\n\n    unsigned x1, x2;\n\n    galois::Timer t;\n    t.start();\n    x1 = galois::ParallelSTL::count_if(V.begin(), V.end(), IsOddS());\n    t.stop();\n\n    galois::Timer t2;\n    t2.start();\n    x2 = std::count_if(V.begin(), V.end(), IsOddS());\n    t2.stop();\n\n    std::cout << \"Galois: \" << t.get() << \" STL: \" << t2.get()\n              << \" Equal: \" << (x1 == x2) << \"\\n\";\n    // A mismatch must fail the test instead of only being printed.\n    if (x1 != x2) {\n      std::cout << x1 << \" \" << x2 << \"\\n\";\n      return 1;\n    }\n    M >>= 1;\n  }\n\n  return 0;\n}\n\ntemplate <typename T>\nstruct mymax {\n  T operator()(const T& x, const T& y) const { return std::max(x, y); }\n};\n\n// Compares galois::ParallelSTL::accumulate with std::accumulate using a max\n// reduction, halving the thread count each round. Returns 1 at the first\n// mismatch, 0 otherwise.\nint do_accumulate() {\n\n  unsigned M = galois::substrate::getThreadPool().getMaxThreads();\n  std::cout << \"accumulate:\\n\";\n\n  while (M) {\n    galois::setActiveThreads(M); // galois::runtime::LL::getMaxThreads());\n    std::cout << \"Using \" << M << \" threads\\n\";\n\n    std::vector<unsigned> V(vectorSize);\n    std::generate(V.begin(), V.end(), RandomNumber);\n\n    unsigned x1, x2;\n\n    galois::Timer t;\n    t.start();\n    x1 = galois::ParallelSTL::accumulate(V.begin(), V.end(), 0u,\n                                         mymax<unsigned>());\n    t.stop();\n\n    galois::Timer t2;\n    t2.start();\n    x2 = std::accumulate(V.begin(), V.end(), 0u, mymax<unsigned>());\n    t2.stop();\n\n    std::cout << \"Galois: \" << t.get() << \" STL: \" << t2.get()\n              << \" Equal: \" << (x1 == x2) << \"\\n\";\n    // A mismatch must fail the test instead of only being printed.\n    if (x1 != x2) {\n      std::cout << x1 << \" \" << x2 << \"\\n\";\n      return 1;\n    }\n    M >>= 1;\n  }\n\n  return 0;\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys Galois_runtime;\n  if (argc > 1)\n    vectorSize = atoi(argv[1]);\n  if (vectorSize <= 0)\n    vectorSize = 1024 * 1024 * 16;\n\n  int ret = 0;\n  //  ret |= do_sort();\n  //  ret |= do_count_if();\n  ret |= do_accumulate();\n  return ret;\n}\n"
  },
  {
    "path": "libgalois/test/static.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// std_tr1__type_traits__is_pod.cpp\n\n#include \"galois/substrate/PtrLock.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/substrate/StaticInstance.h\"\n\n#include <type_traits>\n#include <iostream>\n\nusing namespace galois::substrate;\n\nnamespace {\n\n// Prints `label` followed by std::is_pod<T>::value and a newline.\ntemplate <typename T>\nvoid printIsPod(const char* label) {\n  std::cout << label << std::is_pod<T>::value << \"\\n\";\n}\n\n} // namespace\n\nint main() {\n  // boolalpha stays set on the stream once inserted, so a single insertion\n  // makes every following bool print as true/false.\n  std::cout << std::boolalpha;\n\n  printIsPod<PtrLock<int>>(\"is_pod PtrLock<int> == \");\n\n  printIsPod<SimpleLock>(\"is_pod SimpleLock == \");\n  printIsPod<DummyLock>(\"is_pod DummyLock == \");\n\n  printIsPod<StaticInstance<int>>(\"is_pod StaticInstance<int> == \");\n  printIsPod<StaticInstance<std::iostream>>(\n      \"is_pod StaticInstance<std::iostream> == \");\n\n  printIsPod<volatile int>(\"is_pod volatile int == \");\n  printIsPod<int>(\"is_pod int == \");\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/traits.cpp",
    "content": "#include \"galois/gIO.h\"\n#include \"galois/Traits.h\"\n\n#include <cstddef>\n#include <iostream>\n#include <string>\n#include <tuple>\n#include <type_traits>\n#include <typeinfo>\n#include <utility>\n\nstruct A {};\n\n// Trait derived from A that carries a name, so a test can tell which instance\n// was selected.\nstruct B : public A {\n  std::string name_;\n  B(std::string name) : name_(std::move(name)) {}\n  B() : B(\"\") {}\n};\n\nstruct Unrelated {};\n\n// Prints the typeid name of every tuple element, separated by spaces.\n// typeid requires <typeinfo> to be included before it is used.\ntemplate <size_t... Ints, typename Tuple>\nvoid print(std::index_sequence<Ints...>, Tuple tup) {\n  (..., (std::cout << typeid(std::get<Ints>(tup)).name() << \" \")) << \"\\n\";\n}\n\ntemplate <typename Tuple>\nvoid print(Tuple tup) {\n  print(std::make_index_sequence<std::tuple_size<Tuple>::value>(), tup);\n}\n\nint main() {\n  auto pull_from_default = galois::get_default_trait_values(\n      std::make_tuple(Unrelated{}), std::make_tuple(A{}), std::make_tuple(B{}));\n  static_assert(\n      std::is_same<decltype(pull_from_default), std::tuple<B>>::value);\n\n  auto no_pull_from_default_when_same = galois::get_default_trait_values(\n      std::make_tuple(A{}), std::make_tuple(A{}), std::make_tuple(B{}));\n  static_assert(std::is_same<decltype(no_pull_from_default_when_same),\n                             std::tuple<>>::value);\n\n  auto no_pull_from_default_when_derived = galois::get_default_trait_values(\n      std::make_tuple(B{}), std::make_tuple(A{}), std::make_tuple(B{}));\n  static_assert(std::is_same<decltype(no_pull_from_default_when_derived),\n                             std::tuple<>>::value);\n\n  auto empty_tuple = galois::get_default_trait_values(\n      std::make_tuple(), std::make_tuple(), std::make_tuple());\n  static_assert(std::is_same<decltype(empty_tuple), std::tuple<>>::value);\n\n  auto value_from_default = galois::get_default_trait_values(\n      std::make_tuple(), std::make_tuple(A{}), std::make_tuple(B{\"name\"}));\n  GALOIS_ASSERT(std::get<0>(value_from_default).name_ == \"name\");\n\n  auto get_value = galois::get_trait_value<A>(std::tuple<B>(B{\"name\"}));\n  GALOIS_ASSERT(get_value.name_ == \"name\");\n}\n"
  },
  {
    "path": "libgalois/test/twoleveliteratora.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/TwoLevelIteratorA.h\"\n#include \"galois/gIO.h\"\n\n#include <algorithm>\n#include <boost/iterator/counting_iterator.hpp>\n#include <vector>\n#include <list>\n#include <iostream>\n#include <cstdlib>\n#include <random>\n\nint N = 10;\n\ntemplate <class D, class I>\nstruct GetBegin {\n  typename I::iterator operator()(typename D::reference x) const {\n    return x.begin();\n  }\n  typename I::const_iterator operator()(typename D::const_reference x) const {\n    return x.begin();\n  }\n};\n\ntemplate <class D, class I>\nstruct GetEnd {\n  typename I::iterator operator()(typename D::reference x) const {\n    return x.end();\n  }\n  typename I::const_iterator operator()(typename D::const_reference x) const {\n    return x.end();\n  }\n};\n\ntemplate <bool NonEmpty, class Tag, class D>\nvoid 
check_forward() {\n  D data;\n\n  for (int i = 0; i < N; ++i) {\n#ifdef GALOIS_CXX11_VECTOR_HAS_NO_EMPLACE\n    if (NonEmpty) {\n      data.push_back(typename D::value_type());\n      data.back().push_back(i);\n    } else {\n      data.push_back(typename D::value_type());\n      data.push_back(typename D::value_type());\n      data.back().push_back(i);\n      data.push_back(typename D::value_type());\n    }\n#else\n    if (NonEmpty) {\n      data.emplace_back();\n      data.back().push_back(i);\n    } else {\n      data.emplace_back();\n      data.emplace_back();\n      data.back().push_back(i);\n      data.emplace_back();\n    }\n#endif\n  }\n\n#if __cplusplus >= 201103L\n  auto r = galois::make_two_level_iterator<Tag>(data.begin(), data.end());\n#else\n  auto r =\n      galois::make_two_level_iterator<Tag, typename D::iterator,\n                                      typename I::iterator, GetBegin<D, I>,\n                                      GetEnd<D, I>>(data.begin(), data.end());\n#endif\n  GALOIS_ASSERT(\n      std::equal(r.first, r.second, boost::make_counting_iterator<int>(0)),\n      \"failed case: forward \", (NonEmpty ? \"non-empty\" : \"empty\"),\n      \" inner range\");\n  GALOIS_ASSERT(std::distance(r.first, r.second) == N, \"failed case: forward \",\n                (NonEmpty ? 
\"non-empty\" : \"empty\"),\n                \" inner range: \", std::distance(r.first, r.second), \" != \", N);\n}\n\ntemplate <bool NonEmpty, class Tag, class D>\nvoid check_backward() {\n  D data;\n\n  for (int i = N - 1; i >= 0; --i) {\n#ifdef GALOIS_CXX11_VECTOR_HAS_NO_EMPLACE\n    if (NonEmpty) {\n      data.push_back(typename D::value_type());\n      data.back().push_back(i);\n    } else {\n      data.push_back(typename D::value_type());\n      data.push_back(typename D::value_type());\n      data.back().push_back(i);\n      data.push_back(typename D::value_type());\n    }\n#else\n    if (NonEmpty) {\n      data.emplace_back();\n      data.back().push_back(i);\n    } else {\n      data.emplace_back();\n      data.emplace_back();\n      data.back().push_back(i);\n      data.emplace_back();\n    }\n#endif\n  }\n\n#if __cplusplus >= 201103L\n  auto r = galois::make_two_level_iterator<Tag>(data.begin(), data.end());\n#else\n  auto r =\n      galois::make_two_level_iterator<Tag, typename D::iterator,\n                                      typename I::iterator, GetBegin<D, I>,\n                                      GetEnd<D, I>>(data.begin(), data.end());\n#endif\n  auto c = boost::make_counting_iterator<int>(0);\n  GALOIS_ASSERT(std::distance(r.first, r.second) == N, \"failed case: backward \",\n                (NonEmpty ? \"non-empty\" : \"empty\"),\n                \" inner range: \", std::distance(r.first, r.second), \" != \", N);\n  if (r.first == r.second) {\n    return;\n  }\n\n  --r.second;\n  while (true) {\n    GALOIS_ASSERT(*r.second == *c, \"failed case: backward \",\n                  (NonEmpty ? 
\"non-empty\" : \"empty\"),\n                  \" inner range: \", *r.second, \" != \", *c);\n    if (r.first == r.second)\n      break;\n    --r.second;\n    ++c;\n  }\n}\n\ntemplate <bool NonEmpty, class Tag, class D>\nvoid check_strided() {\n  D data;\n\n  for (int i = 0; i < N; ++i) {\n#ifdef GALOIS_CXX11_VECTOR_HAS_NO_EMPLACE\n    if (NonEmpty) {\n      data.push_back(typename D::value_type());\n      data.back().push_back(i);\n    } else {\n      data.push_back(typename D::value_type());\n      data.push_back(typename D::value_type());\n      data.back().push_back(i);\n      data.push_back(typename D::value_type());\n    }\n#else\n    if (NonEmpty) {\n      data.emplace_back();\n      data.back().push_back(i);\n    } else {\n      data.emplace_back();\n      data.emplace_back();\n      data.back().push_back(i);\n      data.emplace_back();\n    }\n#endif\n  }\n\n#if __cplusplus >= 201103L\n  auto r = galois::make_two_level_iterator<Tag>(data.begin(), data.end());\n#else\n  auto r =\n      galois::make_two_level_iterator<Tag, typename D::iterator,\n                                      typename I::iterator, GetBegin<D, I>,\n                                      GetEnd<D, I>>(data.begin(), data.end());\n#endif\n  auto c = boost::make_counting_iterator<int>(0);\n  GALOIS_ASSERT(std::distance(r.first, r.second) == N, \"failed case: strided \",\n                (NonEmpty ? \"non-empty\" : \"empty\"),\n                \" inner range: \", std::distance(r.first, r.second), \" != \", N);\n  if (r.first == r.second) {\n    return;\n  }\n\n  while (r.first != r.second) {\n    GALOIS_ASSERT(*r.first == *c, \"failed case: strided \",\n                  (NonEmpty ? \"non-empty\" : \"empty\"),\n                  \" inner range: \", *r.first, \" != \", *c);\n\n    auto orig = r.first;\n\n    int k = std::max((N - *c) / 2, 1);\n    std::advance(r.first, k);\n    GALOIS_ASSERT(std::distance(orig, r.first) == k, \"failed case: strided \",\n                  (NonEmpty ? 
\"non-empty\" : \"empty\"),\n                  \" inner range: \", std::distance(orig, r.first), \" != \", k);\n    for (int i = 0; i < k - 1; ++i)\n      std::advance(r.first, -1);\n\n    GALOIS_ASSERT(std::distance(orig, r.first) == 1, \"failed case: strided \",\n                  (NonEmpty ? \"non-empty\" : \"empty\"),\n                  \" inner range: \", std::distance(orig, r.first), \" != 1\");\n\n    ++c;\n  }\n}\n\ntemplate <bool NonEmpty, class Tag, class D>\nvoid check_random() {\n  D data;\n  std::mt19937 gen;\n  std::uniform_int_distribution<int> dist(0, 100);\n\n  for (int i = 0; i < N; ++i) {\n#ifdef GALOIS_CXX11_VECTOR_HAS_NO_EMPLACE\n    if (NonEmpty) {\n      data.push_back(typename D::value_type());\n      data.back().push_back(dist(gen));\n    } else {\n      data.push_back(typename D::value_type());\n      data.push_back(typename D::value_type());\n      data.back().push_back(dist(gen));\n      data.push_back(typename D::value_type());\n    }\n#else\n    if (NonEmpty) {\n      data.emplace_back();\n      data.back().push_back(dist(gen));\n    } else {\n      data.emplace_back();\n      data.emplace_back();\n      data.back().push_back(dist(gen));\n      data.emplace_back();\n    }\n#endif\n  }\n\n#if __cplusplus >= 201103L\n  auto r = galois::make_two_level_iterator<Tag>(data.begin(), data.end());\n#else\n  auto r =\n      galois::make_two_level_iterator<Tag, typename D::iterator,\n                                      typename I::iterator, GetBegin<D, I>,\n                                      GetEnd<D, I>>(data.begin(), data.end());\n#endif\n\n  std::sort(r.first, r.second);\n\n  int last = *r.first;\n  for (auto ii = r.first + 1; ii != r.second; ++ii) {\n    GALOIS_ASSERT(last <= *ii, \"failed case: random \",\n                  (NonEmpty ? 
\"non-empty\" : \"empty\"), \" inner range: \", last,\n                  \" > \", *ii);\n    last = *ii;\n  }\n}\n\nvoid check_forward_iteration() {\n  check_forward<true, std::forward_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_forward<true, std::forward_iterator_tag, std::vector<std::list<int>>>();\n  check_forward<true, std::forward_iterator_tag, std::list<std::vector<int>>>();\n  check_forward<true, std::forward_iterator_tag, std::list<std::list<int>>>();\n\n  check_forward<true, std::bidirectional_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_forward<true, std::bidirectional_iterator_tag,\n                std::vector<std::list<int>>>();\n  check_forward<true, std::bidirectional_iterator_tag,\n                std::list<std::vector<int>>>();\n  check_forward<true, std::bidirectional_iterator_tag,\n                std::list<std::list<int>>>();\n\n  check_forward<true, std::random_access_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_forward<true, std::random_access_iterator_tag,\n                std::vector<std::list<int>>>();\n  check_forward<true, std::random_access_iterator_tag,\n                std::list<std::vector<int>>>();\n  check_forward<true, std::random_access_iterator_tag,\n                std::list<std::list<int>>>();\n\n  check_forward<false, std::forward_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_forward<false, std::forward_iterator_tag,\n                std::vector<std::list<int>>>();\n  check_forward<false, std::forward_iterator_tag,\n                std::list<std::vector<int>>>();\n  check_forward<false, std::forward_iterator_tag, std::list<std::list<int>>>();\n\n  check_forward<false, std::bidirectional_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_forward<false, std::bidirectional_iterator_tag,\n                std::vector<std::list<int>>>();\n  check_forward<false, std::bidirectional_iterator_tag,\n   
             std::list<std::vector<int>>>();\n  check_forward<false, std::bidirectional_iterator_tag,\n                std::list<std::list<int>>>();\n\n  check_forward<false, std::random_access_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_forward<false, std::random_access_iterator_tag,\n                std::vector<std::list<int>>>();\n  check_forward<false, std::random_access_iterator_tag,\n                std::list<std::vector<int>>>();\n  check_forward<false, std::random_access_iterator_tag,\n                std::list<std::list<int>>>();\n}\n\nvoid check_backward_iteration() {\n  check_backward<true, std::bidirectional_iterator_tag,\n                 std::vector<std::vector<int>>>();\n  check_backward<true, std::bidirectional_iterator_tag,\n                 std::vector<std::list<int>>>();\n  check_backward<true, std::bidirectional_iterator_tag,\n                 std::list<std::vector<int>>>();\n  check_backward<true, std::bidirectional_iterator_tag,\n                 std::list<std::list<int>>>();\n\n  check_backward<true, std::random_access_iterator_tag,\n                 std::vector<std::vector<int>>>();\n  check_backward<true, std::random_access_iterator_tag,\n                 std::vector<std::list<int>>>();\n  check_backward<true, std::random_access_iterator_tag,\n                 std::list<std::vector<int>>>();\n  check_backward<true, std::random_access_iterator_tag,\n                 std::list<std::list<int>>>();\n\n  check_backward<false, std::bidirectional_iterator_tag,\n                 std::vector<std::vector<int>>>();\n  check_backward<false, std::bidirectional_iterator_tag,\n                 std::vector<std::list<int>>>();\n  check_backward<false, std::bidirectional_iterator_tag,\n                 std::list<std::vector<int>>>();\n  check_backward<false, std::bidirectional_iterator_tag,\n                 std::list<std::list<int>>>();\n\n  check_backward<false, std::random_access_iterator_tag,\n                 
std::vector<std::vector<int>>>();\n  check_backward<false, std::random_access_iterator_tag,\n                 std::vector<std::list<int>>>();\n  check_backward<false, std::random_access_iterator_tag,\n                 std::list<std::vector<int>>>();\n  check_backward<false, std::random_access_iterator_tag,\n                 std::list<std::list<int>>>();\n}\n\nvoid check_strided_iteration() {\n  check_strided<true, std::bidirectional_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_strided<true, std::bidirectional_iterator_tag,\n                std::vector<std::list<int>>>();\n  check_strided<true, std::bidirectional_iterator_tag,\n                std::list<std::vector<int>>>();\n  check_strided<true, std::bidirectional_iterator_tag,\n                std::list<std::list<int>>>();\n\n  check_strided<true, std::random_access_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_strided<true, std::random_access_iterator_tag,\n                std::vector<std::list<int>>>();\n  check_strided<true, std::random_access_iterator_tag,\n                std::list<std::vector<int>>>();\n  check_strided<true, std::random_access_iterator_tag,\n                std::list<std::list<int>>>();\n\n  check_strided<false, std::bidirectional_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_strided<false, std::bidirectional_iterator_tag,\n                std::vector<std::list<int>>>();\n  check_strided<false, std::bidirectional_iterator_tag,\n                std::list<std::vector<int>>>();\n  check_strided<false, std::bidirectional_iterator_tag,\n                std::list<std::list<int>>>();\n\n  check_strided<false, std::random_access_iterator_tag,\n                std::vector<std::vector<int>>>();\n  check_strided<false, std::random_access_iterator_tag,\n                std::vector<std::list<int>>>();\n  check_strided<false, std::random_access_iterator_tag,\n                std::list<std::vector<int>>>();\n  
check_strided<false, std::random_access_iterator_tag,\n                std::list<std::list<int>>>();\n}\n\nvoid check_random_iteration() {\n  check_random<true, std::random_access_iterator_tag,\n               std::vector<std::vector<int>>>();\n  check_random<true, std::random_access_iterator_tag,\n               std::vector<std::list<int>>>();\n  check_random<true, std::random_access_iterator_tag,\n               std::list<std::vector<int>>>();\n  check_random<true, std::random_access_iterator_tag,\n               std::list<std::list<int>>>();\n\n  check_random<false, std::random_access_iterator_tag,\n               std::vector<std::vector<int>>>();\n  check_random<false, std::random_access_iterator_tag,\n               std::vector<std::list<int>>>();\n  check_random<false, std::random_access_iterator_tag,\n               std::list<std::vector<int>>>();\n  check_random<false, std::random_access_iterator_tag,\n               std::list<std::list<int>>>();\n}\n\nint main(int argc, char** argv) {\n  if (argc > 1)\n    N = atoi(argv[1]);\n  if (N <= 0)\n    N = 1024 * 4;\n\n  typedef std::vector<std::vector<int>> NestedVector;\n\n  // Static checks\n  NestedVector data;\n  const NestedVector& d(data);\n#if __cplusplus >= 201103L\n  auto r = galois::make_two_level_iterator(d.begin(), d.end());\n#else\n  auto r =\n      galois::make_two_level_iterator<std::forward_iterator_tag,\n                                      NestedVector::const_iterator,\n                                      std::vector<int>::const_iterator,\n                                      GetBegin<NestedVector, std::vector<int>>,\n                                      GetEnd<NestedVector, std::vector<int>>>(\n          d.begin(), d.end());\n#endif\n  static_assert(std::is_same<decltype(*r.first), const int&>::value,\n                \"failed case: preserve constness\");\n\n  // Runtime checks\n  check_forward_iteration();\n  check_backward_iteration();\n  check_strided_iteration();\n  
check_random_iteration();\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/wakeup-overhead.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n#include <boost/iterator/counting_iterator.hpp>\n\n#include <chrono>\n#include <cmath>\n#include <iostream>\n#include <vector>\n\ntypedef galois::GAccumulator<double> AccumDouble;\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<int> size(\"size\", cll::desc(\"length of vectors\"),\n                          cll::init(1000));\nstatic cll::opt<int> rounds(\"rounds\", cll::desc(\"number of rounds\"),\n                            cll::init(10000));\nstatic cll::opt<int> trials(\"trials\", cll::desc(\"number of trials\"),\n                            cll::init(1));\nstatic cll::opt<unsigned> threads(\"threads\", cll::desc(\"number of threads\"),\n        
                          cll::init(2));\n\nvoid runDoAllBurn(int num) {\n  galois::substrate::getThreadPool().burnPower(galois::getActiveThreads());\n\n  for (int r = 0; r < rounds; ++r) {\n    galois::do_all(galois::iterate(0, num), [&](int) {\n      asm volatile(\"\" ::: \"memory\");\n    });\n  }\n\n  galois::substrate::getThreadPool().beKind();\n}\n\nvoid runDoAll(int num) {\n  for (int r = 0; r < rounds; ++r) {\n    galois::do_all(galois::iterate(0, num), [&](int) {\n      asm volatile(\"\" ::: \"memory\");\n    });\n  }\n}\n\nvoid runExplicitThread(int num) {\n  galois::substrate::Barrier& barrier =\n      galois::runtime::getBarrier(galois::getActiveThreads());\n\n  galois::on_each([&](unsigned tid, unsigned total) {\n    auto range =\n        galois::block_range(boost::counting_iterator<int>(0),\n                            boost::counting_iterator<int>(num), tid, total);\n    for (int r = 0; r < rounds; ++r) {\n      for (auto ii = range.first, ei = range.second; ii != ei; ++ii) {\n        asm volatile(\"\" ::: \"memory\");\n      }\n      barrier();\n    }\n  });\n}\n\nvoid run(std::function<void(int)> fn, std::string name) {\n  galois::Timer t;\n  t.start();\n  fn(size);\n  t.stop();\n  std::cout << name << \" time: \" << t.get() << \"\\n\";\n}\n\nstd::atomic<int> EXIT;\n\nint main(int argc, char* argv[]) {\n  galois::SharedMemSys Galois_runtime;\n  LonestarStart(argc, argv);\n\n  galois::setActiveThreads(threads);\n\n  EXIT                        = 0;\n  std::function<void(void)> f = []() {\n    while (!EXIT) {\n      std::cerr << \".\";\n      std::this_thread::sleep_for(std::chrono::milliseconds(100));\n    }\n  };\n  galois::substrate::getThreadPool().runDedicated(f);\n\n  for (int t = 0; t < trials; ++t) {\n    run(runDoAll, \"DoAll\");\n    run(runDoAllBurn, \"DoAllBurn\");\n    run(runExplicitThread, \"ExplicitThread\");\n  }\n  EXIT = 1;\n\n  std::cout << \"threads: \" << galois::getActiveThreads() << \" usable threads: \"\n            << 
galois::substrate::getThreadPool().getMaxUsableThreads()\n            << \" rounds: \" << rounds << \" size: \" << size << \"\\n\";\n\n  return 0;\n}\n"
  },
  {
    "path": "libgalois/test/worklists-compile.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/runtime/Range.h\"\n\n#include <cstdlib>\n\nint run = 1;\n\ntemplate <typename T2>\nstruct checker {\n  typedef typename T2::template retype<int> T;\n  T wl;\n  typename T::template rethread<true> wl2;\n  typename T::template rethread<false> wl3;\n\n  checker() {\n    int a[4] = {1, 2, 3, 0};\n\n    // Don't actually run this code as some worklists don't support\n    // the full worklist API\n    if (run)\n      return;\n\n    wl.push(0);\n    wl.push_initial(galois::runtime::makeStandardRange(&a[0], &a[4]));\n    wl.push(&a[0], &a[4]);\n    wl.pop();\n\n    wl2.push(0);\n    wl2.push_initial(galois::runtime::makeStandardRange(&a[0], &a[4]));\n    wl2.push(&a[0], &a[4]);\n    wl2.pop();\n\n    wl3.push(0);\n    wl3.push_initial(galois::runtime::makeStandardRange(&a[0], 
&a[4]));\n    wl3.push(&a[0], &a[4]);\n    wl3.pop();\n  }\n};\n\n#undef GALOIS_WLCOMPILECHECK\n#define GALOIS_WLCOMPILECHECK(name) checker<name<>> ck_##name;\n#include \"galois/worklists/WorkList.h\"\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys Galois_runtime;\n  if (argc > 1)\n    run = atoi(argv[1]);\n\n  return 0;\n}\n"
  },
  {
    "path": "libgluon/CMakeLists.txt",
    "content": "add_library(galois_gluon STATIC)\nadd_library(Galois::gluon ALIAS galois_gluon)\nset_target_properties(galois_gluon PROPERTIES EXPORT_NAME gluon)\nadd_dependencies(lib galois_gluon)\n\ntarget_sources(galois_gluon PRIVATE\n        src/cuda_device.cpp\n        src/SyncStructures.cpp\n        src/GlobalObj.cpp\n        src/GluonSubstrate.cpp\n)\n\ntarget_link_libraries(galois_gluon PUBLIC galois_dist_async)\n\ntarget_include_directories(galois_gluon PUBLIC\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  $<INSTALL_INTERFACE:include>\n)\n\nif (GALOIS_COMM_STATS)\n  target_compile_definitions(galois_gluon PRIVATE GALOIS_COMM_STATS=1)\nendif()\n\nif (GALOIS_USE_BARE_MPI)\n  target_compile_definitions(galois_gluon PRIVATE GALOIS_USE_BARE_MPI=1)\nendif()\n\ninstall(\n  DIRECTORY include/\n  DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n  COMPONENT dev\n  FILES_MATCHING PATTERN \"*.h\"\n)\n\ninstall(TARGETS galois_gluon\n  EXPORT GaloisTargets\n  LIBRARY\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT shlib\n  ARCHIVE\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT lib\n  INCLUDES DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n)\n"
  },
  {
    "path": "libgluon/include/galois/cuda/Context.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n */\n\n/**\n * @file cuda/Context.h\n *\n * Contains definition of CUDA context structures.\n *\n * @todo document this file\n */\n\n#pragma once\n#include <cuda.h>\n#include \"gg.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nstruct CUDA_Context_Shared {\n  unsigned int* num_nodes;         // per host\n  DeviceOnly<unsigned int>* nodes; // per host\n};\n\nstruct CUDA_Context_Common {\n  int device;\n  int id;\n  unsigned int numOwned;    // Number of nodes owned (masters) by this host\n  unsigned int beginMaster; // local id of the beginning of master nodes\n  unsigned int numNodesWithEdges; // Number of nodes (masters + mirrors) that\n                                  // have outgoing edges\n  CSRGraphTy gg;\n  struct CUDA_Context_Shared master;\n  struct CUDA_Context_Shared mirror;\n  DeviceOnly<unsigned int> 
offsets; // union across master/mirror of all hosts\n  Shared<DynamicBitset> is_updated; // union across master/mirror of all hosts\n};\n\ntemplate <typename Type>\nstruct CUDA_Context_Field {\n  Shared<Type> data;\n  Shared<DynamicBitset> is_updated;\n  DeviceOnly<Type> shared_data; // union across master/mirror of all hosts\n};\n\nbool init_CUDA_context_common(struct CUDA_Context_Common* ctx, int device) {\n  struct cudaDeviceProp dev;\n  if (device == -1) {\n    check_cuda(cudaGetDevice(&device));\n  } else {\n    int count;\n    check_cuda(cudaGetDeviceCount(&count));\n    if (device > count) {\n      fprintf(stderr, \"Error: Out-of-range GPU %d specified (%d total GPUs)\",\n              device, count);\n      return false;\n    }\n    check_cuda(cudaSetDevice(device));\n  }\n  ctx->device = device;\n  check_cuda(cudaGetDeviceProperties(&dev, device));\n  printf(\"[%d] Using GPU %d: %s\\n\", ctx->id, device, dev.name);\n  return true;\n}\n\nvoid load_graph_CUDA_common(struct CUDA_Context_Common* ctx, MarshalGraph& g,\n                            unsigned num_hosts) {\n  CSRGraphTy graph;\n  ctx->numOwned          = g.numOwned;\n  ctx->beginMaster       = g.beginMaster;\n  ctx->numNodesWithEdges = g.numNodesWithEdges;\n  assert(ctx->id == g.id);\n\n  size_t mem_usage = ((g.nnodes + 1) + g.nedges) * sizeof(index_type) +\n                     (g.nnodes) * sizeof(node_data_type);\n  if (!g.edge_data)\n    mem_usage += (g.nedges) * sizeof(edge_data_type);\n  printf(\"[%d] Host memory for graph: %3u MB\\n\", ctx->id, mem_usage / 1048756);\n\n  // copy the graph to the GPU\n  graph.nnodes    = g.nnodes;\n  graph.nedges    = g.nedges;\n  graph.row_start = g.row_start;\n  graph.edge_dst  = g.edge_dst;\n  graph.node_data = g.node_data;\n  graph.edge_data = g.edge_data;\n  graph.copy_to_gpu(ctx->gg);\n\n  size_t max_shared_size = 0; // for union across master/mirror of all hosts\n  ctx->master.num_nodes =\n      (unsigned int*)calloc(num_hosts, sizeof(unsigned int));\n  
memcpy(ctx->master.num_nodes, g.num_master_nodes,\n         sizeof(unsigned int) * num_hosts);\n  ctx->master.nodes = (DeviceOnly<unsigned int>*)calloc(\n      num_hosts, sizeof(Shared<unsigned int>));\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (ctx->master.num_nodes[h] > 0) {\n      ctx->master.nodes[h].alloc(ctx->master.num_nodes[h]);\n      ctx->master.nodes[h].copy_to_gpu(g.master_nodes[h],\n                                       ctx->master.num_nodes[h]);\n    }\n    if (ctx->master.num_nodes[h] > max_shared_size) {\n      max_shared_size = ctx->master.num_nodes[h];\n    }\n  }\n  ctx->mirror.num_nodes =\n      (unsigned int*)calloc(num_hosts, sizeof(unsigned int));\n  memcpy(ctx->mirror.num_nodes, g.num_mirror_nodes,\n         sizeof(unsigned int) * num_hosts);\n  ctx->mirror.nodes = (DeviceOnly<unsigned int>*)calloc(\n      num_hosts, sizeof(Shared<unsigned int>));\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (ctx->mirror.num_nodes[h] > 0) {\n      ctx->mirror.nodes[h].alloc(ctx->mirror.num_nodes[h]);\n      ctx->mirror.nodes[h].copy_to_gpu(g.mirror_nodes[h],\n                                       ctx->mirror.num_nodes[h]);\n    }\n    if (ctx->mirror.num_nodes[h] > max_shared_size) {\n      max_shared_size = ctx->mirror.num_nodes[h];\n    }\n  }\n  ctx->offsets.alloc(max_shared_size);\n  ctx->is_updated.alloc(1);\n  ctx->is_updated.cpu_wr_ptr()->alloc(max_shared_size);\n  // printf(\"[%u] load_graph_GPU: %u owned nodes of total %u resident, %lu\n  // edges\\n\", ctx->id, ctx->nowned, graph.nnodes, graph.nedges);\n}\n\nsize_t mem_usage_CUDA_common(MarshalGraph& g, unsigned num_hosts) {\n  size_t mem_usage       = 0;\n  size_t max_shared_size = 0; // for union across master/mirror of all hosts\n  mem_usage += num_hosts * sizeof(unsigned int);\n  mem_usage += num_hosts * sizeof(Shared<unsigned int>);\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (g.num_master_nodes[h] > 0) {\n      mem_usage += g.num_master_nodes[h] * 
sizeof(unsigned int);\n    }\n    if (g.num_master_nodes[h] > max_shared_size) {\n      max_shared_size = g.num_master_nodes[h];\n    }\n  }\n  mem_usage += num_hosts * sizeof(unsigned int);\n  mem_usage += num_hosts * sizeof(Shared<unsigned int>);\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (g.num_mirror_nodes[h] > 0) {\n      mem_usage += g.num_mirror_nodes[h] * sizeof(unsigned int);\n    }\n    if (g.num_mirror_nodes[h] > max_shared_size) {\n      max_shared_size = g.num_mirror_nodes[h];\n    }\n  }\n  mem_usage += max_shared_size * sizeof(unsigned int);\n  mem_usage += ((max_shared_size + 63) / 64) * sizeof(unsigned long long int);\n  return mem_usage;\n}\n\ntemplate <typename Type>\nvoid load_graph_CUDA_field(struct CUDA_Context_Common* ctx,\n                           struct CUDA_Context_Field<Type>* field,\n                           unsigned num_hosts) {\n  field->data.alloc(ctx->gg.nnodes);\n  size_t max_shared_size = 0; // for union across master/mirror of all hosts\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (ctx->master.num_nodes[h] > max_shared_size) {\n      max_shared_size = ctx->master.num_nodes[h];\n    }\n  }\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (ctx->mirror.num_nodes[h] > max_shared_size) {\n      max_shared_size = ctx->mirror.num_nodes[h];\n    }\n  }\n  field->shared_data.alloc(max_shared_size);\n  field->is_updated.alloc(1);\n  field->is_updated.cpu_wr_ptr()->alloc(ctx->gg.nnodes);\n}\n\ntemplate <typename Type>\nsize_t mem_usage_CUDA_field(struct CUDA_Context_Field<Type>* field,\n                            MarshalGraph& g, unsigned num_hosts) {\n  size_t mem_usage = 0;\n  mem_usage += g.nnodes * sizeof(Type);\n  size_t max_shared_size = 0; // for union across master/mirror of all hosts\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (g.num_master_nodes[h] > max_shared_size) {\n      max_shared_size = g.num_master_nodes[h];\n    }\n  }\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if 
(g.num_mirror_nodes[h] > max_shared_size) {\n      max_shared_size = g.num_mirror_nodes[h];\n    }\n  }\n  mem_usage += max_shared_size * sizeof(Type);\n  mem_usage += ((g.nnodes + 63) / 64) * sizeof(unsigned long long int);\n  return mem_usage;\n}\n"
  },
  {
    "path": "libgluon/include/galois/cuda/DynamicBitset.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n */\n\n/**\n * @file cuda/DynamicBitset.h\n *\n * Contains implementation of CUDA dynamic bitset and iterators for it.\n */\n\n// thread-safe dynamic bitset in CUDA\n#pragma once\n#include <cuda.h>\n#include <math.h>\n#include <iterator>\n\n/**\n * Dynamic Bitset, CUDA version. 
See galois/DynamicBitset.h.\n *\n * @todo document this file\n */\nclass DynamicBitset {\n  size_t num_bits_capacity;\n  size_t num_bits;\n  uint64_t* bit_vector;\n\npublic:\n  DynamicBitset() {\n    num_bits_capacity = 0;\n    num_bits          = 0;\n    bit_vector        = NULL;\n  }\n\n  DynamicBitset(size_t nbits) { alloc(nbits); }\n\n  ~DynamicBitset() {\n    if (bit_vector != NULL)\n      cudaFree(bit_vector);\n  }\n\n  void alloc(size_t nbits) {\n    assert(num_bits == 0);\n    assert(sizeof(unsigned long long int) * 8 == 64);\n    assert(sizeof(uint64_t) * 8 == 64);\n    num_bits_capacity = nbits;\n    num_bits          = nbits;\n    CUDA_SAFE_CALL(cudaMalloc(&bit_vector, vec_size() * sizeof(uint64_t)));\n    reset();\n  }\n\n  void resize(size_t nbits) {\n    assert(nbits <= num_bits_capacity);\n    num_bits = nbits;\n  }\n\n  __device__ __host__ size_t size() const { return num_bits; }\n\n  __device__ __host__ size_t vec_size() const {\n    size_t bit_vector_size = (num_bits + 63) / 64;\n    return bit_vector_size;\n  }\n\n  __device__ __host__ size_t alloc_size() const {\n    return vec_size() * sizeof(uint64_t);\n  }\n\n  void reset() {\n    CUDA_SAFE_CALL(cudaMemset(bit_vector, 0, vec_size() * sizeof(uint64_t)));\n  }\n\n  // assumes bit_vector is not updated (set) in parallel\n  __device__ bool test(const size_t id) const {\n    size_t bit_index    = id / 64;\n    uint64_t bit_offset = 1;\n    bit_offset <<= (id % 64);\n    return ((bit_vector[bit_index] & bit_offset) != 0);\n  }\n\n  __device__ void set(const size_t id) {\n    size_t bit_index                  = id / 64;\n    unsigned long long int bit_offset = 1;\n    bit_offset <<= (id % 64);\n    if ((bit_vector[bit_index] & bit_offset) == 0) { // test and set\n      atomicOr((unsigned long long int*)&bit_vector[bit_index], bit_offset);\n    }\n  }\n\n  // different indices can be updated in parallel\n  __device__ void batch_reset(const size_t bit_index) {\n    bit_vector[bit_index] = 0;\n  }\n\n  
// different indices can be updated in parallel\n  // but assumes same index is not updated in parallel\n  __device__ void batch_bitwise_and(const size_t bit_index,\n                                    const uint64_t mask) {\n    bit_vector[bit_index] &= mask;\n  }\n\n  void copy_to_cpu(uint64_t* bit_vector_cpu_copy) {\n    assert(bit_vector_cpu_copy != NULL);\n    CUDA_SAFE_CALL(cudaMemcpy(bit_vector_cpu_copy, bit_vector,\n                              vec_size() * sizeof(uint64_t),\n                              cudaMemcpyDeviceToHost));\n  }\n\n  void copy_to_gpu(uint64_t* cpu_bit_vector) {\n    assert(cpu_bit_vector != NULL);\n    CUDA_SAFE_CALL(cudaMemcpy(bit_vector, cpu_bit_vector,\n                              vec_size() * sizeof(uint64_t),\n                              cudaMemcpyHostToDevice));\n  }\n};\n\nclass DynamicBitsetIterator\n    : public std::iterator<std::random_access_iterator_tag, bool> {\n  DynamicBitset* bitset;\n  size_t offset;\n\npublic:\n  __device__ __host__ __forceinline__ DynamicBitsetIterator(DynamicBitset* b,\n                                                            size_t i = 0)\n      : bitset(b), offset(i) {}\n\n  __device__ __host__ __forceinline__ DynamicBitsetIterator& operator++() {\n    offset++;\n    return *this;\n  }\n\n  __device__ __host__ __forceinline__ DynamicBitsetIterator& operator--() {\n    offset--;\n    return *this;\n  }\n\n  __device__ __host__ __forceinline__ bool\n  operator<(const DynamicBitsetIterator& bi) {\n    return (offset < bi.offset);\n  }\n\n  __device__ __host__ __forceinline__ bool\n  operator<=(const DynamicBitsetIterator& bi) {\n    return (offset <= bi.offset);\n  }\n\n  __device__ __host__ __forceinline__ bool\n  operator>(const DynamicBitsetIterator& bi) {\n    return (offset > bi.offset);\n  }\n\n  __device__ __host__ __forceinline__ bool\n  operator>=(const DynamicBitsetIterator& bi) {\n    return (offset >= bi.offset);\n  }\n\n  __device__ __host__ __forceinline__ 
DynamicBitsetIterator&\n  operator+=(size_t i) {\n    offset += i;\n    return *this;\n  }\n\n  __device__ __host__ __forceinline__ DynamicBitsetIterator&\n  operator-=(size_t i) {\n    offset -= i;\n    return *this;\n  }\n\n  __device__ __host__ __forceinline__ DynamicBitsetIterator\n  operator+(size_t i) {\n    return DynamicBitsetIterator(bitset, offset + i);\n  }\n\n  __device__ __host__ __forceinline__ DynamicBitsetIterator\n  operator-(size_t i) {\n    return DynamicBitsetIterator(bitset, offset - i);\n  }\n\n  __device__ __host__ __forceinline__ difference_type\n  operator-(const DynamicBitsetIterator& bi) {\n    return (offset - bi.offset);\n  }\n\n  __device__ __forceinline__ bool operator*() const {\n    return bitset->test(offset);\n  }\n\n  __device__ __forceinline__ bool operator[](const size_t id) const {\n    return bitset->test(offset + id);\n  }\n};\n\nclass IdentityIterator\n    : public std::iterator<std::random_access_iterator_tag, size_t> {\n  size_t offset;\n\npublic:\n  __device__ __host__ __forceinline__ IdentityIterator(size_t i = 0)\n      : offset(i) {}\n\n  __device__ __host__ __forceinline__ IdentityIterator& operator++() {\n    offset++;\n    return *this;\n  }\n\n  __device__ __host__ __forceinline__ IdentityIterator& operator--() {\n    offset--;\n    return *this;\n  }\n\n  __device__ __host__ __forceinline__ bool\n  operator<(const IdentityIterator& bi) {\n    return (offset < bi.offset);\n  }\n\n  __device__ __host__ __forceinline__ bool\n  operator<=(const IdentityIterator& bi) {\n    return (offset <= bi.offset);\n  }\n\n  __device__ __host__ __forceinline__ bool\n  operator>(const IdentityIterator& bi) {\n    return (offset > bi.offset);\n  }\n\n  __device__ __host__ __forceinline__ bool\n  operator>=(const IdentityIterator& bi) {\n    return (offset >= bi.offset);\n  }\n\n  __device__ __host__ __forceinline__ IdentityIterator& operator+=(size_t i) {\n    offset += i;\n    return *this;\n  }\n\n  __device__ __host__ 
__forceinline__ IdentityIterator& operator-=(size_t i) {\n    offset -= i;\n    return *this;\n  }\n\n  __device__ __host__ __forceinline__ IdentityIterator operator+(size_t i) {\n    return IdentityIterator(offset + i);\n  }\n\n  __device__ __host__ __forceinline__ IdentityIterator operator-(size_t i) {\n    return IdentityIterator(offset - i);\n  }\n\n  __device__ __host__ __forceinline__ difference_type\n  operator-(const IdentityIterator& bi) {\n    return (offset - bi.offset);\n  }\n\n  __device__ __forceinline__ size_t operator*() const { return offset; }\n\n  __device__ __forceinline__ size_t operator[](const size_t id) const {\n    return offset + id;\n  }\n};\n"
  },
  {
    "path": "libgluon/include/galois/cuda/EdgeContext.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file cuda/EdgeContext.h\n *\n * Contains definition of CUDA context structures.\n *\n * @todo document this file\n */\n\n#pragma once\n\n#pragma once\n#include <cuda.h>\n#include \"gg.h\"\n#pragma once\n#include \"galois/cuda/EdgeHostDecls.h\"\n\nstruct CUDA_Context_Shared_Edges {\n  unsigned int* num_edges;         // per host\n  DeviceOnly<unsigned int>* edges; // per host\n};\n\nstruct CUDA_Context_Common_Edges {\n  int device;\n  int id;\n  unsigned int numOwned;    // Number of nodes owned (masters) by this host\n  unsigned int beginMaster; // local id of the beginning of master nodes\n  unsigned int numNodesWithEdges; // Number of nodes (masters + mirrors) that\n                                  // have outgoing edges\n  CSRGraphTy gg;\n  struct CUDA_Context_Shared_Edges master;\n  struct 
CUDA_Context_Shared_Edges mirror;\n  DeviceOnly<unsigned int> offsets; // union across master/mirror of all hosts\n  Shared<DynamicBitset> is_updated; // union across master/mirror of all hosts\n};\n\ntemplate <typename Type>\nstruct CUDA_Context_Field_Edges {\n  Shared<Type> data;\n  Shared<DynamicBitset> is_updated; // size of edges\n  DeviceOnly<Type> shared_data;     // union across master/mirror of all hosts\n};\n\nbool init_CUDA_context_common_edges(struct CUDA_Context_Common_Edges* ctx,\n                                    int device) {\n  struct cudaDeviceProp dev;\n  if (device == -1) {\n    check_cuda(cudaGetDevice(&device));\n  } else {\n    int count;\n    check_cuda(cudaGetDeviceCount(&count));\n    if (device >= count) { // valid device ordinals are 0 .. count - 1\n      fprintf(stderr, \"Error: Out-of-range GPU %d specified (%d total GPUs)\",\n              device, count);\n      return false;\n    }\n    check_cuda(cudaSetDevice(device));\n  }\n  ctx->device = device;\n  check_cuda(cudaGetDeviceProperties(&dev, device));\n  printf(\"[%d] Using GPU %d: %s\\n\", ctx->id, device, dev.name);\n  return true;\n}\n\nvoid load_graph_CUDA_common_edges(struct CUDA_Context_Common_Edges* ctx,\n                                  EdgeMarshalGraph& g, unsigned num_hosts,\n                                  bool LoadProxyEdges = true) {\n  CSRGraphTy graph;\n  ctx->numOwned          = g.numOwned;\n  ctx->beginMaster       = g.beginMaster;\n  ctx->numNodesWithEdges = g.numNodesWithEdges;\n  assert(ctx->id == g.id);\n\n  size_t mem_usage = ((g.nnodes + 1) + g.nedges) * sizeof(index_type) +\n                     (g.nnodes) * sizeof(node_data_type);\n  // edge data only contributes when the graph actually carries it\n  if (g.edge_data)\n    mem_usage += (g.nedges) * sizeof(edge_data_type);\n  // mem_usage is size_t: print with %zu; 1048576 == 2^20 bytes per MB\n  printf(\"[%d] Host memory for graph: %3zu MB\\n\", ctx->id, mem_usage / 1048576);\n\n  // copy the graph to the GPU\n  graph.nnodes    = g.nnodes;\n  graph.nedges    = g.nedges;\n  graph.row_start = g.row_start;\n  graph.edge_dst  = g.edge_dst;\n  graph.node_data = g.node_data;\n  
graph.edge_data = g.edge_data;\n  graph.copy_to_gpu(ctx->gg);\n\n  if (LoadProxyEdges) {\n    size_t max_shared_size = 0; // for union across master/mirror of all hosts\n    ctx->master.num_edges =\n        (unsigned int*)calloc(num_hosts, sizeof(unsigned int));\n    memcpy(ctx->master.num_edges, g.num_master_edges,\n           sizeof(unsigned int) * num_hosts);\n    ctx->master.edges = (DeviceOnly<unsigned int>*)calloc(\n        num_hosts, sizeof(Shared<unsigned int>));\n    for (uint32_t h = 0; h < num_hosts; ++h) {\n      if (ctx->master.num_edges[h] > 0) {\n        ctx->master.edges[h].alloc(ctx->master.num_edges[h]);\n        ctx->master.edges[h].copy_to_gpu(g.master_edges[h],\n                                         ctx->master.num_edges[h]);\n      }\n      if (ctx->master.num_edges[h] > max_shared_size) {\n        max_shared_size = ctx->master.num_edges[h];\n      }\n    }\n    ctx->mirror.num_edges =\n        (unsigned int*)calloc(num_hosts, sizeof(unsigned int));\n    memcpy(ctx->mirror.num_edges, g.num_mirror_edges,\n           sizeof(unsigned int) * num_hosts);\n    ctx->mirror.edges = (DeviceOnly<unsigned int>*)calloc(\n        num_hosts, sizeof(Shared<unsigned int>));\n    for (uint32_t h = 0; h < num_hosts; ++h) {\n      if (ctx->mirror.num_edges[h] > 0) {\n        ctx->mirror.edges[h].alloc(ctx->mirror.num_edges[h]);\n        ctx->mirror.edges[h].copy_to_gpu(g.mirror_edges[h],\n                                         ctx->mirror.num_edges[h]);\n      }\n      if (ctx->mirror.num_edges[h] > max_shared_size) {\n        max_shared_size = ctx->mirror.num_edges[h];\n      }\n    }\n    ctx->offsets.alloc(max_shared_size);\n    ctx->is_updated.alloc(1);\n    ctx->is_updated.cpu_wr_ptr()->alloc(max_shared_size);\n  }\n  // printf(\"[%u] load_graph_GPU: %u owned nodes of total %u resident, %lu\n  // edges\\n\", ctx->id, ctx->nowned, graph.nnodes, graph.nedges);\n}\n\nsize_t mem_usage_CUDA_common_edges(EdgeMarshalGraph& g, unsigned num_hosts) {\n  size_t 
mem_usage       = 0;\n  size_t max_shared_size = 0; // for union across master/mirror of all hosts\n  mem_usage += num_hosts * sizeof(unsigned int);\n  mem_usage += num_hosts * sizeof(Shared<unsigned int>);\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (g.num_master_edges[h] > 0) {\n      mem_usage += g.num_master_edges[h] * sizeof(unsigned int);\n    }\n    if (g.num_master_edges[h] > max_shared_size) {\n      max_shared_size = g.num_master_edges[h];\n    }\n  }\n  mem_usage += num_hosts * sizeof(unsigned int);\n  mem_usage += num_hosts * sizeof(Shared<unsigned int>);\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (g.num_mirror_edges[h] > 0) {\n      mem_usage += g.num_mirror_edges[h] * sizeof(unsigned int);\n    }\n    if (g.num_mirror_edges[h] > max_shared_size) {\n      max_shared_size = g.num_mirror_edges[h];\n    }\n  }\n  mem_usage += max_shared_size * sizeof(unsigned int);\n  mem_usage += ((max_shared_size + 63) / 64) * sizeof(unsigned long long int);\n  return mem_usage;\n}\n\ntemplate <typename Type>\nvoid load_graph_CUDA_field_edges(struct CUDA_Context_Common_Edges* ctx,\n                                 struct CUDA_Context_Field_Edges<Type>* field,\n                                 unsigned num_hosts) {\n  field->data.alloc(ctx->gg.nedges);\n  size_t max_shared_size = 0; // for union across master/mirror of all hosts\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (ctx->master.num_edges[h] > max_shared_size) {\n      max_shared_size = ctx->master.num_edges[h];\n    }\n  }\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (ctx->mirror.num_edges[h] > max_shared_size) {\n      max_shared_size = ctx->mirror.num_edges[h];\n    }\n  }\n  field->shared_data.alloc(max_shared_size);\n  field->is_updated.alloc(1);\n  field->is_updated.cpu_wr_ptr()->alloc(ctx->gg.nedges);\n}\n\ntemplate <typename Type>\nsize_t mem_usage_CUDA_field_edges(struct CUDA_Context_Field_Edges<Type>* field,\n                                  EdgeMarshalGraph& g, 
unsigned num_hosts) {\n  size_t mem_usage = 0;\n  mem_usage += g.nedges * sizeof(Type);\n  size_t max_shared_size = 0; // for union across master/mirror of all hosts\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (g.num_master_edges[h] > max_shared_size) {\n      max_shared_size = g.num_master_edges[h];\n    }\n  }\n  for (uint32_t h = 0; h < num_hosts; ++h) {\n    if (g.num_mirror_edges[h] > max_shared_size) {\n      max_shared_size = g.num_mirror_edges[h];\n    }\n  }\n  mem_usage += max_shared_size * sizeof(Type);\n  mem_usage += ((g.nedges + 63) / 64) * sizeof(unsigned long long int);\n  return mem_usage;\n}\n"
  },
  {
    "path": "libgluon/include/galois/cuda/EdgeHostDecls.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file EdgeHostDecls.h\n *\n * Contains forward declarations and the definition of the EdgeMarshalGraph\n * class, which is used to marshal a graph to GPUs.\n *\n * @todo document this file\n */\n\n#pragma once\n#include <string>\n\n#ifndef LSG_CSR_GRAPH\ntypedef unsigned int index_type; // GPU kernels choke on size_t\ntypedef unsigned int node_data_type;\ntypedef unsigned edge_data_type;\n#endif\n\nstruct EdgeMarshalGraph {\n  size_t nnodes;\n  size_t nedges;\n  unsigned int numOwned;    // Number of nodes owned (masters) by this host\n  unsigned int beginMaster; // local id of the beginning of master nodes\n  unsigned int numNodesWithEdges; // Number of nodes (masters + mirrors) that\n                                  // have outgoing edges\n  int id;\n  unsigned numHosts;\n  index_type* row_start;\n  
index_type* edge_dst;\n  node_data_type* node_data;\n  edge_data_type* edge_data;\n  unsigned int* num_master_edges;\n  unsigned int** master_edges;\n  unsigned int* num_mirror_edges;\n  unsigned int** mirror_edges;\n\n  EdgeMarshalGraph()\n      : nnodes(0), nedges(0), numOwned(0), beginMaster(0), numNodesWithEdges(0),\n        id(-1), numHosts(0), row_start(NULL), edge_dst(NULL), node_data(NULL),\n        edge_data(NULL), num_master_edges(NULL), master_edges(NULL),\n        num_mirror_edges(NULL), mirror_edges(NULL) {}\n\n  ~EdgeMarshalGraph() {\n    // free(NULL) is a no-op, so no null guards are needed; the previous\n    // `if (!ptr)` guards only ever freed null pointers and leaked the rest.\n    free(row_start);\n    free(edge_dst);\n    free(node_data);\n    free(edge_data);\n    free(num_master_edges);\n    if (master_edges != NULL) {\n      for (unsigned i = 0; i < numHosts; ++i) {\n        free(master_edges[i]);\n      }\n      free(master_edges);\n    }\n    free(num_mirror_edges);\n    if (mirror_edges != NULL) {\n      for (unsigned i = 0; i < numHosts; ++i) {\n        free(mirror_edges[i]);\n      }\n      free(mirror_edges);\n    }\n  }\n};\n\n// to determine the GPU device id\nint get_gpu_device_id(std::string personality_set,\n                      int num_nodes); // defined on the host\n\nstruct CUDA_Context; // forward declaration only because rest is dependent on\n                     // the dist_app\n\n// defined on the device\nstruct CUDA_Context* get_CUDA_context(int id);\nbool init_CUDA_context(struct CUDA_Context* ctx, int device);\nvoid load_graph_CUDA(struct CUDA_Context* ctx, EdgeMarshalGraph& g,\n                     unsigned num_hosts);\nvoid reset_CUDA_context(struct CUDA_Context* ctx);\n"
  },
  {
    "path": "libgluon/include/galois/cuda/HostDecls.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file HostDecls.h\n *\n * Contains forward declarations and the definition of the MarshalGraph\n * class, which is used to marshal a graph to GPUs.\n *\n * @todo document this file\n */\n\n#pragma once\n#include <string>\n\n#ifndef LSG_CSR_GRAPH\ntypedef unsigned int index_type; // GPU kernels choke on size_t\ntypedef unsigned int node_data_type;\ntypedef unsigned int edge_data_type;\n#endif\n\nstruct MarshalGraph {\n  size_t nnodes;\n  size_t nedges;\n  unsigned int numOwned;    // Number of nodes owned (masters) by this host\n  unsigned int beginMaster; // local id of the beginning of master nodes\n  unsigned int numNodesWithEdges; // Number of nodes (masters + mirrors) that\n                                  // have outgoing edges\n  int id;\n  unsigned numHosts;\n  index_type* row_start;\n  index_type* 
edge_dst;\n  node_data_type* node_data;\n  edge_data_type* edge_data;\n  unsigned int* num_master_nodes;\n  unsigned int** master_nodes;\n  unsigned int* num_mirror_nodes;\n  unsigned int** mirror_nodes;\n\n  MarshalGraph()\n      : nnodes(0), nedges(0), numOwned(0), beginMaster(0), numNodesWithEdges(0),\n        id(-1), numHosts(0), row_start(NULL), edge_dst(NULL), node_data(NULL),\n        edge_data(NULL), num_master_nodes(NULL), master_nodes(NULL),\n        num_mirror_nodes(NULL), mirror_nodes(NULL) {}\n\n  ~MarshalGraph() {\n    // free(NULL) is a no-op, so no null guards are needed; the previous\n    // `if (!ptr)` guards only ever freed null pointers and leaked the rest.\n    free(row_start);\n    free(edge_dst);\n    free(node_data);\n    free(edge_data);\n    free(num_master_nodes);\n    // the per-host tables must be non-null before their rows are walked\n    if (master_nodes != NULL) {\n      for (unsigned i = 0; i < numHosts; ++i) {\n        free(master_nodes[i]);\n      }\n      free(master_nodes);\n    }\n    free(num_mirror_nodes);\n    if (mirror_nodes != NULL) {\n      for (unsigned i = 0; i < numHosts; ++i) {\n        free(mirror_nodes[i]);\n      }\n      free(mirror_nodes);\n    }\n  }\n};\n\n// to determine the GPU device id\nint get_gpu_device_id(std::string personality_set,\n                      int num_nodes); // defined on the host\n\nstruct CUDA_Context; // forward declaration only because rest is dependent on\n                     // the dist_app\n\n// defined on the device\nstruct CUDA_Context* get_CUDA_context(int id);\nbool init_CUDA_context(struct CUDA_Context* ctx, int device);\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph& g,\n                     unsigned num_hosts);\nvoid reset_CUDA_context(struct CUDA_Context* ctx);\n"
  },
  {
    "path": "libgluon/include/galois/graphs/GluonEdgeSubstrate.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file GluonEdgeSubstrate.h\n *\n * Contains the implementation for GluonEdgeSubstrate.\n */\n\n// TODO merge with GluonSubstrate; way too much code duplication\n\n#ifndef _GALOIS_GLUONEDGESUB_H_\n#define _GALOIS_GLUONEDGESUB_H_\n\n#include <unordered_map>\n#include <fstream>\n\n#include \"galois/runtime/GlobalObj.h\"\n#include \"galois/runtime/DistStats.h\"\n#include \"galois/runtime/SyncStructures.h\"\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/DynamicBitset.h\"\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"galois/cuda/EdgeHostDecls.h\"\n#endif\n\n#include \"galois/runtime/BareMPI.h\"\n\n// TODO make not global\n//! Specifies what format to send metadata in\nextern DataCommMode enforcedDataMode;\n\n#ifdef GALOIS_USE_BARE_MPI\n//! 
bare_mpi type to use; see options in runtime/BareMPI.h\nextern BareMPI bare_mpi;\n#endif\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Gluon communication substrate that handles communication given a user graph.\n * User graph should provide certain things the substrate expects.\n *\n * TODO documentation on expected things\n *\n * @tparam GraphTy User graph to handle communication for\n */\ntemplate <typename GraphTy>\nclass GluonEdgeSubstrate : public galois::runtime::GlobalObject {\nprivate:\n  //! Synchronization type\n  enum SyncType {\n    syncReduce,   //!< Reduction sync\n    syncBroadcast //!< Broadcast sync\n  };\n\n  //! Graph name used for printing things\n  constexpr static const char* const RNAME = \"GluonEdges\";\n\n  //! The graph to handle communication for\n  GraphTy& userGraph;\n  const unsigned id; //!< Copy of net.ID, which is the ID of the machine.\n  DataCommMode substrateDataMode; //!< datamode to enforce\n  const uint32_t\n      numHosts;     //!< Copy of net.Num, which is the total number of machines\n  uint32_t num_run; //!< Keep track of number of runs.\n  uint32_t num_round; //!< Keep track of number of rounds.\n\n  // memoization optimization\n  //! Master edges on different hosts. For broadcast;\n  std::vector<std::vector<size_t>> masterEdges;\n  //! Mirror edges on different hosts. For reduce; comes from the user graph\n  //! during initialization (we expect user to give to us)\n  std::vector<std::vector<size_t>>& mirrorEdges;\n  //! Maximum size of master or mirror edges on different hosts\n  size_t maxSharedSize;\n\n#ifdef GALOIS_USE_BARE_MPI\n  std::vector<MPI_Group> mpi_identity_groups;\n#endif\n  // Used for efficient comms\n  galois::DynamicBitSet syncBitset;\n  galois::PODResizeableArray<unsigned int> syncOffsets;\n\n  void reset_bitset(void (*bitset_reset_range)(size_t, size_t)) {\n    if (userGraph.sizeEdges() > 0) {\n      bitset_reset_range(0, userGraph.sizeEdges() - 1);\n    }\n  }\n\n  //! 
Increments evilPhase, a phase counter used by communication.\n  void inline incrementEvilPhase() {\n    ++galois::runtime::evilPhase;\n    // limit defined by MPI or LCI\n    if (galois::runtime::evilPhase >=\n        uint32_t{std::numeric_limits<int16_t>::max()}) {\n      galois::runtime::evilPhase = 1;\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Proxy communication setup\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Let other hosts know about which host has what mirrors/masters;\n   * used for later communication of mirrors/masters.\n   */\n  void exchangeProxyInfo() {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    // send off the mirror edges\n    for (unsigned x = 0; x < numHosts; ++x) {\n      if (x == id)\n        continue;\n\n      galois::runtime::SendBuffer b;\n      gSerialize(b, mirrorEdges[x]);\n      net.sendTagged(x, galois::runtime::evilPhase, b);\n    }\n\n    // receive the mirror edges\n    for (unsigned x = 0; x < numHosts; ++x) {\n      if (x == id)\n        continue;\n\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n\n      galois::runtime::gDeserialize(p->second, masterEdges[p->first]);\n    }\n    incrementEvilPhase();\n  }\n\n  /**\n   * Send statistics about master/mirror edges to each host, and\n   * report the statistics.\n   */\n  void sendInfoToHost() {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    uint64_t totalMirrorEdges =\n        userGraph.sizeEdges() - userGraph.numOwnedEdges();\n    uint64_t totalOwnedEdges = userGraph.numOwnedEdges();\n\n    // send info to host\n    for (unsigned x = 0; x < numHosts; ++x) {\n      if (x == id)\n        continue;\n\n      galois::runtime::SendBuffer b;\n      gSerialize(b, totalMirrorEdges, totalOwnedEdges);\n    
  net.sendTagged(x, galois::runtime::evilPhase, b);\n    }\n\n    // receive\n    for (unsigned x = 0; x < numHosts; ++x) {\n      if (x == id)\n        continue;\n\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n\n      uint64_t totalMirrorFromOther;\n      uint64_t totalOwnedFromOther;\n      galois::runtime::gDeserialize(p->second, totalMirrorFromOther,\n                                    totalOwnedFromOther);\n      totalMirrorEdges += totalMirrorFromOther;\n      totalOwnedEdges += totalOwnedFromOther;\n    }\n    incrementEvilPhase();\n\n    assert(userGraph.globalEdges() == totalOwnedEdges);\n\n    // report stats\n    if (net.ID == 0) {\n      reportProxyStats(totalMirrorEdges);\n    }\n  }\n\n  /**\n   * Sets up the communication between the different hosts that contain\n   * different parts of the graph by exchanging master/mirror information.\n   */\n  void setupCommunication() {\n    galois::CondStatTimer<MORE_DIST_STATS> Tcomm_setup(\"CommunicationSetupTime\",\n                                                       RNAME);\n\n    // barrier so that all hosts start the timer together\n    galois::runtime::getHostBarrier().wait();\n\n    Tcomm_setup.start();\n\n    // Exchange information for memoization optimization.\n    exchangeProxyInfo();\n    // convert the global ids stored in the master/mirror edges arrays to local\n    // ids\n    // TODO: use 32-bit distinct vectors for masters and mirrors from here on\n    for (uint32_t h = 0; h < masterEdges.size(); ++h) {\n      galois::do_all(\n          galois::iterate(size_t{0}, masterEdges[h].size()),\n          [&](size_t n) {\n            masterEdges[h][n] = userGraph.getEdgeLID(masterEdges[h][n]);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(\"MasterEdges\").c_str()),\n#endif\n          galois::no_stats());\n    }\n\n    for (uint32_t 
h = 0; h < mirrorEdges.size(); ++h) {\n      galois::do_all(\n          galois::iterate(size_t{0}, mirrorEdges[h].size()),\n          [&](size_t n) {\n            mirrorEdges[h][n] = userGraph.getEdgeLID(mirrorEdges[h][n]);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(\"MirrorEdges\").c_str()),\n#endif\n          galois::no_stats());\n    }\n\n    Tcomm_setup.stop();\n\n    maxSharedSize = 0;\n    // report masters/mirrors to/from other hosts as statistics\n    for (auto x = 0U; x < masterEdges.size(); ++x) {\n      if (x == id)\n        continue;\n      std::string master_edges_str =\n          \"MasterEdgesFrom_\" + std::to_string(id) + \"_To_\" + std::to_string(x);\n      galois::runtime::reportStatCond_Tsum<MORE_DIST_STATS>(\n          RNAME, master_edges_str, masterEdges[x].size());\n      if (masterEdges[x].size() > maxSharedSize) {\n        maxSharedSize = masterEdges[x].size();\n      }\n    }\n\n    for (auto x = 0U; x < mirrorEdges.size(); ++x) {\n      if (x == id)\n        continue;\n      std::string mirror_edges_str =\n          \"MirrorEdgesFrom_\" + std::to_string(x) + \"_To_\" + std::to_string(id);\n      galois::runtime::reportStatCond_Tsum<MORE_DIST_STATS>(\n          RNAME, mirror_edges_str, mirrorEdges[x].size());\n      if (mirrorEdges[x].size() > maxSharedSize) {\n        maxSharedSize = mirrorEdges[x].size();\n      }\n    }\n\n    sendInfoToHost();\n\n    // do not track memory usage of partitioning\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    net.resetMemUsage();\n  }\n\n  /**\n   * Reports master/mirror stats.\n   * Assumes that communication has already occured so that the host\n   * calling it actually has the info required.\n   *\n   * @param totalMirrorEdges number of mirror edges on all hosts\n   */\n  void reportProxyStats(uint64_t totalMirrorEdges) {\n    float replication_factor =\n        (float)(totalMirrorEdges + userGraph.globalEdges()) /\n        
(float)userGraph.globalEdges();\n    galois::runtime::reportStat_Single(RNAME, \"ReplicationFactorEdges\",\n                                       replication_factor);\n    galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(\n        RNAME, \"TotalGlobalMirrorEdges\", totalMirrorEdges);\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Initializers\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Initalize MPI related things. The MPI layer itself should have been\n   * initialized when the network interface was initiailized.\n   */\n  void initBareMPI() {\n#ifdef GALOIS_USE_BARE_MPI\n    if (bare_mpi == noBareMPI)\n      return;\n\n#ifdef GALOIS_USE_LCI\n    // sanity check of ranks\n    int taskRank;\n    MPI_Comm_rank(MPI_COMM_WORLD, &taskRank);\n    if ((unsigned)taskRank != id)\n      GALOIS_DIE(\"mismatch in MPI rank\");\n    int numTasks;\n    MPI_Comm_size(MPI_COMM_WORLD, &numTasks);\n    if ((unsigned)numTasks != numHosts)\n      GALOIS_DIE(\"mismatch in MPI rank\");\n#endif\n    // group setup\n    MPI_Group world_group;\n    MPI_Comm_group(MPI_COMM_WORLD, &world_group);\n    mpi_identity_groups.resize(numHosts);\n\n    for (unsigned x = 0; x < numHosts; ++x) {\n      const int g[1] = {(int)x};\n      MPI_Group_incl(world_group, 1, g, &mpi_identity_groups[x]);\n    }\n\n    if (id == 0) {\n      switch (bare_mpi) {\n      case nonBlockingBareMPI:\n        galois::gPrint(\"Using non-blocking bare MPI\\n\");\n        break;\n      case oneSidedBareMPI:\n        galois::gPrint(\"Using one-sided bare MPI\\n\");\n        break;\n      case noBareMPI:\n      default:\n        GALOIS_DIE(\"unsupported bare MPI\");\n      }\n    }\n#endif\n  }\n\npublic:\n  /**\n   * Delete default constructor: this class NEEDS to have a graph passed into\n   * it.\n   */\n  GluonEdgeSubstrate() = delete;\n\n  /**\n   * Constructor for GluonEdgeSubstrate. 
Initializes metadata fields.\n   *\n   * @param _userGraph Graph to build substrate on\n   * @param host host number that this graph resides on\n   * @param numHosts total number of hosts in the currently executing program\n   * @param doNothing\n   * @param _substrateDataMode\n   */\n  GluonEdgeSubstrate(GraphTy& _userGraph, unsigned host, unsigned numHosts,\n                     bool doNothing                  = false,\n                     DataCommMode _substrateDataMode = DataCommMode::noData)\n      : galois::runtime::GlobalObject(this), userGraph(_userGraph), id(host),\n        substrateDataMode(_substrateDataMode), numHosts(numHosts), num_run(0),\n        num_round(0), mirrorEdges(userGraph.getMirrorEdges()) {\n    if (!doNothing) {\n      galois::StatTimer edgeSubstrateSetupTimer(\n          \"GluonEdgeSubstrateConstructTime\", RNAME);\n      edgeSubstrateSetupTimer.start();\n\n      // set global\n      enforcedDataMode = _substrateDataMode;\n\n      initBareMPI();\n      // master setup from mirrors done by setupCommunication call\n      masterEdges.resize(numHosts);\n\n      // setup proxy communication\n      galois::CondStatTimer<MORE_DIST_STATS> Tgraph_construct_comm(\n          \"GraphCommSetupTime\", RNAME);\n      Tgraph_construct_comm.start();\n      setupCommunication();\n      Tgraph_construct_comm.stop();\n\n      edgeSubstrateSetupTimer.stop();\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Data extraction from bitsets\n  ////////////////////////////////////////////////////////////////////////////////\n\nprivate:\n  /**\n   * Given a bitset, determine the indices of the bitset that are currently\n   * set.\n   *\n   * @tparam syncType either reduce or broadcast; only used to name the timer\n   *\n   * @param loopName string used to name the timer for this function\n   * @param bitset_comm the bitset to get the offsets of\n   * @param offsets output: the offset vector that will contain 
indices into\n   * the bitset that are set\n   * @param bit_set_count output: will be set to the number of bits set in the\n   * bitset\n   */\n  template <SyncType syncType>\n  void getOffsetsFromBitset(const std::string& loopName,\n                            const galois::DynamicBitSet& bitset_comm,\n                            galois::PODResizeableArray<unsigned int>& offsets,\n                            size_t& bit_set_count) const {\n    // timer creation\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    std::string offsets_timer_str(syncTypeStr + \"Offsets_\" +\n                                  get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Toffsets(offsets_timer_str.c_str(),\n                                                      RNAME);\n\n    Toffsets.start();\n\n    auto activeThreads = galois::getActiveThreads();\n    std::vector<unsigned int> t_prefix_bit_counts(activeThreads);\n\n    // count how many bits are set on each thread\n    galois::on_each([&](unsigned tid, unsigned nthreads) {\n      // TODO use block_range instead\n      unsigned int block_size = bitset_comm.size() / nthreads;\n      if ((bitset_comm.size() % nthreads) > 0)\n        ++block_size;\n      assert((block_size * nthreads) >= bitset_comm.size());\n\n      unsigned int start = tid * block_size;\n      unsigned int end   = (tid + 1) * block_size;\n      if (end > bitset_comm.size())\n        end = bitset_comm.size();\n\n      unsigned int count = 0;\n      for (unsigned int i = start; i < end; ++i) {\n        if (bitset_comm.test(i))\n          ++count;\n      }\n\n      t_prefix_bit_counts[tid] = count;\n    });\n\n    // calculate prefix sum of bits per thread\n    for (unsigned int i = 1; i < activeThreads; ++i) {\n      t_prefix_bit_counts[i] += t_prefix_bit_counts[i - 1];\n    }\n    // total num of set bits\n    bit_set_count = t_prefix_bit_counts[activeThreads - 1];\n\n    // calculate the indices of 
the set bits and save them to the offset\n    // vector\n    if (bit_set_count > 0) {\n      offsets.resize(bit_set_count);\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n        // TODO use block_range instead\n        // TODO this is same calculation as above; maybe refactor it\n        // into function?\n        unsigned int block_size = bitset_comm.size() / nthreads;\n        if ((bitset_comm.size() % nthreads) > 0)\n          ++block_size;\n        assert((block_size * nthreads) >= bitset_comm.size());\n\n        unsigned int start = tid * block_size;\n        unsigned int end   = (tid + 1) * block_size;\n        if (end > bitset_comm.size())\n          end = bitset_comm.size();\n\n        unsigned int count = 0;\n        unsigned int t_prefix_bit_count;\n        if (tid == 0) {\n          t_prefix_bit_count = 0;\n        } else {\n          t_prefix_bit_count = t_prefix_bit_counts[tid - 1];\n        }\n\n        for (unsigned int i = start; i < end; ++i) {\n          if (bitset_comm.test(i)) {\n            offsets[t_prefix_bit_count + count] = i;\n            ++count;\n          }\n        }\n      });\n    }\n    Toffsets.stop();\n  }\n\n  /**\n   * Determine what data needs to be synchronized based on the passed in\n   * bitset_compute and returns information regarding these need-to-be-sync'd\n   * edges.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done;\n   * only used to get the size of the type being synchronized in this function\n   * @tparam syncType type of synchronization this function is being called\n   * for; only used to name a timer\n   *\n   * @param loopName loopname used to name the timer for the function\n   * @param indices A vector that contains the local ids of the edges that\n   * you want to potentially synchronize\n   * @param bitset_compute Contains the full bitset of all edges in this\n   * graph\n   * @param bitset_comm OUTPUT: bitset that marks which indices in the passed\n   * in 
indices array need to be synchronized\n   * @param offsets OUTPUT: contains indices into bitset_comm that are set\n   * @param bit_set_count OUTPUT: contains number of bits set in bitset_comm\n   * @param data_mode OUTPUT: the way that this data should be communicated\n   * based on how much data needs to be sent out\n   */\n  template <typename FnTy, SyncType syncType>\n  void getBitsetAndOffsets(const std::string& loopName,\n                           const std::vector<size_t>& indices,\n                           const galois::DynamicBitSet& bitset_compute,\n                           galois::DynamicBitSet& bitset_comm,\n                           galois::PODResizeableArray<unsigned int>& offsets,\n                           size_t& bit_set_count,\n                           DataCommMode& data_mode) const {\n    if (substrateDataMode != onlyData) {\n      bitset_comm.reset();\n      std::string syncTypeStr =\n          (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n      std::string doall_str(syncTypeStr + \"Bitset_\" + loopName);\n\n      bitset_comm.reset();\n      // determine which local edges in the indices array need to be\n      // synchronized\n      galois::do_all(\n          galois::iterate(size_t{0}, indices.size()),\n          [&](size_t n) {\n            // assumes each lid is unique as test is not thread safe\n            size_t lid = indices[n];\n            if (bitset_compute.test(lid)) {\n              bitset_comm.set(n);\n            }\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n\n      // get the number of set bits and the offsets into the comm bitset\n      getOffsetsFromBitset<syncType>(loopName, bitset_comm, offsets,\n                                     bit_set_count);\n    }\n\n    data_mode =\n        get_data_mode<typename FnTy::ValTy>(bit_set_count, indices.size());\n  }\n\n  
////////////////////////////////////////////////////////////////////////////////\n  // Local to global ID conversion\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Converts LIDs of edges we are interested in into GIDs.\n   *\n   * @tparam syncType either reduce or broadcast; only used to name the timer\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of edges that we are interested in\n   * @param offsets INPUT/OUTPUT holds offsets into \"indices\" that we should\n   * use; after function completion, holds global ids of edges we are interested\n   * in\n   */\n  template <SyncType syncType>\n  void convertLIDToGID(const std::string& loopName,\n                       const std::vector<size_t>& indices,\n                       galois::PODResizeableArray<unsigned int>& offsets) {\n    galois::gWarn(\"LID to GID edge conversion is extremely inefficient at the \"\n                  \"moment!\");\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string doall_str(syncTypeStr + \"_LID2GID_\" +\n                          get_run_identifier(loopName));\n    galois::do_all(\n        galois::iterate(size_t{0}, offsets.size()),\n        [&](size_t n) {\n          offsets[n] =\n              static_cast<uint32_t>(userGraph.getEdgeGID(indices[offsets[n]]));\n        },\n#if GALOIS_COMM_STATS\n        galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n        galois::no_stats());\n  }\n\n  /**\n   * Converts a vector of GIDs into local ids.\n   *\n   * @tparam syncType either reduce or broadcast; only used to name the timer\n   *\n   * @param loopName name of loop used to name timer\n   * @param offsets holds GIDs to convert to LIDs\n   */\n  template <SyncType syncType>\n  void convertGIDToLID(const std::string& loopName,\n                       galois::PODResizeableArray<unsigned int>& offsets) {\n    galois::gWarn(\"convert GID to LID used in sync call (not optimized)\");\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string doall_str(syncTypeStr + \"_GID2LID_\" +\n                          get_run_identifier(loopName));\n\n    galois::do_all(\n        galois::iterate(size_t{0}, offsets.size()),\n        [&](size_t n) { offsets[n] = userGraph.getEdgeLID(offsets[n]); },\n#if GALOIS_COMM_STATS\n        galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n        galois::no_stats());\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Message prep functions (buffering, send buffer getting, etc.)\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Get data that is going to be sent for synchronization and returns\n   * it in a send buffer.\n   *\n   * @tparam syncType synchronization type\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has information needed to access bitset\n   *\n   * @param loopName Name to give timer\n   * @param x Host to send to\n   * @param b OUTPUT: Buffer that will hold data to send\n   */\n  template <\n      SyncType syncType, typename SyncFnTy, typename BitsetFnTy, bool async,\n      typename std::enable_if<!BitsetFnTy::is_vector_bitset()>::type* = nullptr>\n  void getSendBuffer(std::string loopName, unsigned x,\n                     galois::runtime::SendBuffer& b) {\n    auto& sharedEdges = (syncType == syncReduce) ? mirrorEdges : masterEdges;\n\n    if (BitsetFnTy::is_valid()) {\n      syncExtract<syncType, SyncFnTy, BitsetFnTy, async>(loopName, x,\n                                                         sharedEdges[x], b);\n    } else {\n      syncExtract<syncType, SyncFnTy, async>(loopName, x, sharedEdges[x], b);\n    }\n\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string statSendBytes_str(syncTypeStr + \"SendBytes_\" +\n                                  get_run_identifier(loopName));\n\n    galois::runtime::reportStat_Tsum(RNAME, statSendBytes_str, b.size());\n  }\n\n  /**\n   * Given data to serialize in val_vec, serialize it into the send buffer\n   * depending on the mode of data communication selected for the data.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam VecType type of val_vec, which stores the data to send\n   *\n   * @param loopName loop name used for timers\n   * @param data_mode the way that the data should be communicated\n   * @param bit_set_count the number of items we are sending in this message\n   * @param indices list of all edges that we are potentially interested in\n   * sending things to\n   * @param offsets contains indicies into \"indices\" that we are interested in\n   * @param val_vec contains the data that we are serializing to send\n   * @param b the buffer in which to serialize the message we are sending\n   * to\n   */\n  template <bool async, SyncType syncType, typename VecType>\n  void serializeMessage(std::string loopName, DataCommMode data_mode,\n                        size_t bit_set_count, std::vector<size_t>& indices,\n                        galois::PODResizeableArray<unsigned int>& offsets,\n                        galois::DynamicBitSet& bit_set_comm, VecType& val_vec,\n                        galois::runtime::SendBuffer& b) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string serialize_timer_str(syncTypeStr + \"SerializeMessage_\" +\n                                    get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Tserialize(\n        serialize_timer_str.c_str(), RNAME);\n    if (data_mode == noData) {\n      if (!async) {\n        Tserialize.start();\n        gSerialize(b, data_mode);\n        Tserialize.stop();\n      }\n    } else if (data_mode == gidsData) {\n      offsets.resize(bit_set_count);\n      convertLIDToGID<syncType>(loopName, indices, offsets);\n      val_vec.resize(bit_set_count);\n      Tserialize.start();\n      gSerialize(b, data_mode, bit_set_count, offsets, val_vec);\n      Tserialize.stop();\n    } else if (data_mode == offsetsData) {\n      offsets.resize(bit_set_count);\n      val_vec.resize(bit_set_count);\n      Tserialize.start();\n      gSerialize(b, data_mode, bit_set_count, offsets, val_vec);\n      Tserialize.stop();\n    } else if (data_mode == bitsetData) {\n      val_vec.resize(bit_set_count);\n      Tserialize.start();\n      gSerialize(b, data_mode, bit_set_count, bit_set_comm, val_vec);\n      Tserialize.stop();\n    } else { // onlyData\n      Tserialize.start();\n      gSerialize(b, data_mode, val_vec);\n      Tserialize.stop();\n    }\n  }\n\n  /**\n   * Given the data mode, deserialize the rest of a message in a Receive Buffer.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam VecType type of val_vec, which data will be deserialized into\n   *\n   * @param loopName used to name timers for statistics\n   * @param data_mode data mode with which the original message was sent;\n   * determines how to deserialize the rest of the message\n   * @param buf buffer which contains the received message to deserialize\n   *\n   * The rest of the arguments are output arguments (they are passed by\n   * reference)\n   *\n   * @param bit_set_count Var that holds number of bits set (i.e. 
number of\n   * node changed) after deserialization\n   * @param offsets holds offsets data after deserialization if data mode is\n   * offsets + data\n   * @param bit_set_comm holds the bitset representing changed edges after\n   * deserialization of data mode is bitset + data\n   * @param buf_start\n   * @param retval\n   * @param val_vec The data proper will be deserialized into this vector\n   */\n  template <SyncType syncType, typename VecType>\n  void deserializeMessage(std::string loopName, DataCommMode data_mode,\n                          uint32_t num, galois::runtime::RecvBuffer& buf,\n                          size_t& bit_set_count,\n                          galois::PODResizeableArray<unsigned int>& offsets,\n                          galois::DynamicBitSet& bit_set_comm,\n                          size_t& buf_start, size_t& retval, VecType& val_vec) {\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    std::string serialize_timer_str(syncTypeStr + \"DeserializeMessage_\" +\n                                    get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Tdeserialize(\n        serialize_timer_str.c_str(), RNAME);\n    Tdeserialize.start();\n\n    // get other metadata associated with message if mode isn't OnlyData\n    if (data_mode != onlyData) {\n      galois::runtime::gDeserialize(buf, bit_set_count);\n\n      if (data_mode == gidsData) {\n        galois::runtime::gDeserialize(buf, offsets);\n        convertGIDToLID<syncType>(loopName, offsets);\n      } else if (data_mode == offsetsData) {\n        galois::runtime::gDeserialize(buf, offsets);\n      } else if (data_mode == bitsetData) {\n        bit_set_comm.resize(num);\n        galois::runtime::gDeserialize(buf, bit_set_comm);\n      } else if (data_mode == dataSplit) {\n        galois::runtime::gDeserialize(buf, buf_start);\n      } else if (data_mode == dataSplitFirst) {\n        galois::runtime::gDeserialize(buf, retval);\n   
   }\n    }\n\n    // get data itself\n    galois::runtime::gDeserialize(buf, val_vec);\n\n    Tdeserialize.stop();\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Other helper functions\n  ////////////////////////////////////////////////////////////////////////////////\n  // Requirement: For all X and Y,\n  // On X, nothingToSend(Y) <=> On Y, nothingToRecv(X)\n  /**\n   * Determine if we have anything that we need to send to a particular host\n   *\n   * @param host Host number that we may or may not send to\n   * @param syncType Synchronization type to determine which edges on a\n   * host need to be considered\n   * @returns true if there is nothing to send to a host, false otherwise\n   */\n  bool nothingToSend(unsigned host, SyncType syncType) {\n    auto& sharedEdges = (syncType == syncReduce) ? mirrorEdges : masterEdges;\n    return (sharedEdges[host].size() == 0);\n  }\n\n  /**\n   * Determine if we have anything that we need to receive from a particular\n   * host\n   *\n   * @param host Host number that we may or may not receive from\n   * @param syncType Synchronization type to determine which edges on a\n   * host need to be considered\n   * @returns true if there is nothing to receive from a host, false otherwise\n   */\n  bool nothingToRecv(unsigned host, SyncType syncType) {\n    auto& sharedEdges = (syncType == syncReduce) ? 
masterEdges : mirrorEdges;\n    return (sharedEdges[host].size() == 0);\n  }\n\n  /**\n   * Reports bytes saved by using the bitset to only selectively load data\n   * to send.\n   *\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize;\n   * used for size calculation\n   *\n   * @param loopName loop name used for timers\n   * @param syncTypeStr String used to name timers\n   * @param totalToSend Total amount of edges that are potentially sent (not\n   * necessarily all nodees will be sent)\n   * @param bitSetCount Number of edges that will actually be sent\n   * @param bitSetComm bitset used to send data\n   */\n  template <typename SyncFnTy>\n  void reportRedundantSize(std::string loopName, std::string syncTypeStr,\n                           uint32_t totalToSend, size_t bitSetCount,\n                           const galois::DynamicBitSet& bitSetComm) {\n    size_t redundant_size =\n        (totalToSend - bitSetCount) * sizeof(typename SyncFnTy::ValTy);\n    size_t bit_set_size = (bitSetComm.get_vec().size() * sizeof(uint64_t));\n\n    if (redundant_size > bit_set_size) {\n      std::string statSavedBytes_str(syncTypeStr + \"SavedBytes_\" +\n                                     get_run_identifier(loopName));\n\n      galois::runtime::reportStatCond_Tsum<MORE_DIST_STATS>(\n          RNAME, statSavedBytes_str, (redundant_size - bit_set_size));\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Extract data from edges (for reduce and broadcast)\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Extracts data at provided lid.\n   *\n   * This version (reduce) resets the value after extract.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType either reduce or broadcast; determines if reset is\n   * necessary\n   *\n   * @param lid local id of node to get data from\n   * @returns data 
(specified by FnTy) of node with local id lid\n   */\n  /* Reduction extract resets the value afterwards */\n  template <typename FnTy, SyncType syncType>\n  inline typename FnTy::ValTy extractWrapper(size_t lid) {\n    if (syncType == syncReduce) {\n      auto val = FnTy::extract(lid, userGraph.getEdgeData(lid));\n      FnTy::reset(lid, userGraph.getEdgeData(lid));\n      return val;\n    } else {\n      return FnTy::extract(lid, userGraph.getEdgeData(lid));\n    }\n  }\n\n  /**\n   * Extracts data at provided lid; uses vecIndex to get the correct element\n   * from the vector.\n   *\n   * This version (reduce) resets the value after extract.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType either reduce or broadcast; determines if reset is\n   * necessary\n   *\n   * @param lid local id of node to get data from\n   * @param vecIndex index to grab from vector in node\n   * @returns data (specified by FnTy) of node with local id lid\n   */\n  /* Reduction extract resets the value afterwards */\n  template <typename FnTy, SyncType syncType>\n  inline typename FnTy::ValTy extractWrapper(size_t lid, unsigned vecIndex) {\n    if (syncType == syncReduce) {\n      auto val = FnTy::extract(lid, userGraph.getEdgeData(lid), vecIndex);\n      FnTy::reset(lid, userGraph.getEdgeData(lid), vecIndex);\n      return val;\n    } else {\n      return FnTy::extract(lid, userGraph.getEdgeData(lid), vecIndex);\n    }\n  }\n\n  /**\n   * Based on provided arguments, extracts the data that we are interested\n   * in sending into val_vec.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType either reduce or broadcast; used to determine if reseting\n   * the extracted field is necessary\n   * @tparam identity_offsets If this is true, then ignore the offsets\n   * array and just grab directly from indices (i.e. 
don't pick out\n   * particular elements, just grab contiguous chunk)\n   * @tparam parallelize Determines if parallelizing the extraction is done or\n   * not\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of edges that we are interested in\n   * @param size Number of elements to extract\n   * @param offsets Holds offsets into \"indices\" of the data that we are\n   * interested in\n   * @param val_vec OUTPUT: holds the extracted data\n   * @param start Offset into val_vec to start saving data to\n   */\n  template <typename FnTy, SyncType syncType, bool identity_offsets = false,\n            bool parallelize = true>\n  void extractSubset(const std::string& loopName,\n                     const std::vector<size_t>& indices, size_t size,\n                     const galois::PODResizeableArray<unsigned int>& offsets,\n                     galois::PODResizeableArray<typename FnTy::ValTy>& val_vec,\n                     size_t start = 0) {\n    if (parallelize) {\n      std::string syncTypeStr =\n          (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n      std::string doall_str(syncTypeStr + \"ExtractVal_\" + loopName);\n\n      galois::do_all(\n          galois::iterate(start, start + size),\n          [&](unsigned int n) {\n            unsigned int offset;\n            if (identity_offsets)\n              offset = n;\n            else\n              offset = offsets[n];\n            size_t lid         = indices[offset];\n            val_vec[n - start] = extractWrapper<FnTy, syncType>(lid);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n    } else { // non-parallel version\n      for (unsigned n = start; n < start + size; ++n) {\n        unsigned int offset;\n        if (identity_offsets)\n          offset = n;\n        else\n          offset = offsets[n];\n\n        size_t lid         = indices[offset];\n        val_vec[n - start] = extractWrapper<FnTy, syncType>(lid);\n      }\n    }\n  }\n\n  /**\n   * Based on provided arguments, extracts the data that we are interested\n   * in sending into val_vec. Same as above, except it has the vecIndex\n   * arguments and requires vecSync to be true\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType either reduce or broadcast; used to determine if reseting\n   * the extracted field is necessary\n   * @tparam identity_offsets If this is true, then ignore the offsets\n   * array and just grab directly from indices (i.e. don't pick out\n   * particular elements, just grab contiguous chunk)\n   * @tparam parallelize Determines if parallelizing the extraction is done or\n   * not\n   * @tparam vecSync Only set to true if the field being synchronized is a\n   * vector and synchronization is occuring element by element. 
MUST BE SET\n   * TO TRUE IN ORDER FOR THIS FUNCTION TO COMPILE.\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of edges that we are interested in\n   * @param size Number of elements to extract\n   * @param offsets Holds offsets into \"indices\" of the data that we are\n   * interested in\n   * @param val_vec OUTPUT: holds the extracted data\n   * @param vecIndex which element of the vector to extract from node\n   * @param start Offset into val_vec to start saving data to\n   */\n  // TODO find a better way to have this variant without code duplication\n  template <typename FnTy, SyncType syncType, bool identity_offsets = false,\n            bool parallelize = true, bool vecSync = false,\n            typename std::enable_if<vecSync>::type* = nullptr>\n  void extractSubset(const std::string& loopName,\n                     const std::vector<size_t>& indices, size_t size,\n                     const galois::PODResizeableArray<unsigned int>& offsets,\n                     galois::PODResizeableArray<typename FnTy::ValTy>& val_vec,\n                     unsigned vecIndex, size_t start = 0) {\n    val_vec.resize(size); // resize val vec for this vecIndex\n\n    if (parallelize) {\n      std::string syncTypeStr =\n          (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n      std::string doall_str(syncTypeStr + \"ExtractValVector_\" + loopName);\n\n      galois::do_all(\n          galois::iterate(start, start + size),\n          [&](unsigned int n) {\n            unsigned int offset;\n            if (identity_offsets)\n              offset = n;\n            else\n              offset = offsets[n];\n            size_t lid         = indices[offset];\n            val_vec[n - start] = extractWrapper<FnTy, syncType>(lid, vecIndex);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n    } else { // non-parallel version\n      for (unsigned n = start; n < start + size; ++n) {\n        unsigned int offset;\n        if (identity_offsets)\n          offset = n;\n        else\n          offset = offsets[n];\n        size_t lid         = indices[offset];\n        val_vec[n - start] = extractWrapper<FnTy, syncType>(lid, vecIndex);\n      }\n    }\n  }\n\n  /**\n   * Based on provided arguments, extracts the data that we are interested\n   * in sending into a send buffer. Lazy serialize variant that works with\n   * certain SeqTy.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SeqTy Type of sequence that we are getting data from\n   * @tparam syncType either reduce or broadcast; used to determine if reseting\n   * the extracted field is necessary\n   * @tparam identity_offsets If this is true, then ignore the offsets\n   * array and just grab directly from indices (i.e. 
don't pick out\n   * particular elements, just grab contiguous chunk)\n   * @tparam parallelize Determines if parallelizing the extraction is done or\n   * not\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of edges that we are interested in\n   * @param size Number of elements to extract\n   * @param offsets Holds offsets into \"indices\" of the data that we are\n   * interested in\n   * @param b send buffer to extract data into\n   * @param lseq sequence to get data from\n   * @param start Offset into send buffer to start saving data to\n   */\n  template <typename FnTy, typename SeqTy, SyncType syncType,\n            bool identity_offsets = false, bool parallelize = true>\n  void extractSubset(const std::string& loopName,\n                     const std::vector<size_t>& indices, size_t size,\n                     const galois::PODResizeableArray<unsigned int>& offsets,\n                     galois::runtime::SendBuffer& b, SeqTy lseq,\n                     size_t start = 0) {\n    if (parallelize) {\n      std::string syncTypeStr =\n          (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n      std::string doall_str(syncTypeStr + \"ExtractVal_\" + loopName);\n\n      galois::do_all(\n          galois::iterate(start, start + size),\n          [&](unsigned int n) {\n            unsigned int offset;\n            if (identity_offsets)\n              offset = n;\n            else\n              offset = offsets[n];\n\n            size_t lid = indices[offset];\n            gSerializeLazy(b, lseq, n - start,\n                           extractWrapper<FnTy, syncType>(lid));\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n    } else {\n      for (unsigned int n = start; n < start + size; ++n) {\n        unsigned int offset;\n        if (identity_offsets)\n          offset = n;\n        else\n          offset = offsets[n];\n        size_t lid = indices[offset];\n        gSerializeLazy(b, lseq, n - start, extractWrapper<FnTy, syncType>(lid));\n      }\n    }\n  }\n\n  /**\n   * GPU wrap function: extracts data from edges and resets them to the\n   * reduction identity value as specified by the sync structure. (Reduce only)\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Must be reduce\n   *\n   * @param x node id to extract from\n   * @param v vector to extract data to\n   *\n   * @returns true if called on GPU device\n   */\n  template <typename FnTy, SyncType syncType>\n  inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b) {\n    if (syncType == syncReduce) {\n      return FnTy::extract_reset_batch(x, b.getVec().data());\n    } else {\n      return FnTy::extract_batch(x, b.getVec().data());\n    }\n  }\n\n  /**\n   * GPU wrap function: extracts data from edges and resets them to the\n   * reduction identity value as specified by the sync structure. 
(Reduce only)\n   *\n   * This version specifies more arguments.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Must be reduce\n   *\n   * @param x node id to extract from\n   * @param b\n   * @param o\n   * @param v\n   * @param s\n   * @param data_mode\n   *\n   * @returns true if called on GPU device\n   */\n  template <typename FnTy, SyncType syncType>\n  inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b,\n                                  size_t& s, DataCommMode& data_mode) {\n    if (syncType == syncReduce) {\n      return FnTy::extract_reset_batch(x, b.getVec().data(), &s, &data_mode);\n    } else {\n      return FnTy::extract_batch(x, b.getVec().data(), &s, &data_mode);\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Reduce/sets on node (for broadcast)\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Reduce variant. 
Takes a value and reduces it according to the sync\n   * structure provided to the function.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType Reduce sync or broadcast sync\n   *\n   * @param lid local id of node to reduce to\n   * @param val value to reduce to\n   * @param bit_set_compute bitset indicating which edges have changed; updated\n   * if reduction causes a change\n   */\n  template <typename FnTy, SyncType syncType, bool async>\n  inline void setWrapper(size_t lid, typename FnTy::ValTy val,\n                         galois::DynamicBitSet& bit_set_compute) {\n    if (syncType == syncReduce) {\n      if (FnTy::reduce(lid, userGraph.getEdgeData(lid), val)) {\n        if (bit_set_compute.size() != 0) {\n          bit_set_compute.set(lid);\n        }\n      }\n    } else {\n      if (async) {\n        FnTy::reduce(lid, userGraph.getEdgeData(lid), val);\n      } else {\n        FnTy::setVal(lid, userGraph.getEdgeData(lid), val);\n        assert(FnTy::extract(lid, userGraph.getEdgeData(lid)) == val);\n      }\n    }\n  }\n\n  /**\n   * Given data received from another host and information on which edges\n   * to update, do the reduce/set of the received data to update local edges.\n   *\n   * Complement function, in some sense, of extractSubset.\n   *\n   * @tparam VecTy type of indices variable\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Reduce or broadcast\n   * @tparam identity_offsets If this is true, then ignore the offsets\n   * array and just grab directly from indices (i.e. 
don't pick out\n   * particular elements, just grab contiguous chunk)\n   * @tparam parallelize True if updates to edges are to be parallelized\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of edges that we are interested in\n   * @param size Number of elements to set\n   * @param offsets Holds offsets into \"indices\" of the data that we are\n   * interested in\n   * @param val_vec holds data we will use to set\n   * @param bit_set_compute bitset indicating which edges have changed\n   * @param start Offset into val_vec to start saving data to\n   */\n  template <typename VecTy, typename FnTy, SyncType syncType, bool async,\n            bool identity_offsets = false, bool parallelize = true>\n  void setSubset(const std::string& loopName, const VecTy& indices, size_t size,\n                 const galois::PODResizeableArray<unsigned int>& offsets,\n                 galois::PODResizeableArray<typename FnTy::ValTy>& val_vec,\n                 galois::DynamicBitSet& bit_set_compute, size_t start = 0) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string doall_str(syncTypeStr + \"SetVal_\" +\n                          get_run_identifier(loopName));\n\n    if (parallelize) {\n      galois::do_all(\n          galois::iterate(start, start + size),\n          [&](unsigned int n) {\n            unsigned int offset;\n            if (identity_offsets)\n              offset = n;\n            else\n              offset = offsets[n];\n            auto lid = indices[offset];\n            setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],\n                                              bit_set_compute);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n    } else {\n      for (unsigned int n = start; n < start + size; ++n) {\n        unsigned int offset;\n        if (identity_offsets)\n          offset = n;\n        else\n          offset = offsets[n];\n        auto lid = indices[offset];\n        setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],\n                                          bit_set_compute);\n      }\n    }\n  }\n\n  /**\n   * GPU wrapper function to reduce multiple edges at once.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Must be reduce\n   *\n   * @param x node id to set\n   * @param v\n   *\n   * @returns true if called on GPU device\n   */\n  template <typename FnTy, SyncType syncType, bool async>\n  inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b) {\n    if (syncType == syncReduce) {\n      return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset());\n    } else {\n      if (async) {\n        return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset());\n      } else {\n        return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset());\n      }\n    }\n  }\n\n  /**\n   * GPU wrapper function to reduce multiple edges at once. 
More detailed\n   * arguments.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Must be reduce\n   *\n   * @param x node id to set\n   * @param b\n   * @param o\n   * @param v\n   * @param s\n   * @param data_mode\n   *\n   * @returns true if called on GPU device\n   */\n  template <typename FnTy, SyncType syncType, bool async>\n  inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b,\n                              DataCommMode& data_mode) {\n    if (syncType == syncReduce) {\n      return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset(),\n                                data_mode);\n    } else {\n      if (async) {\n        return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset(),\n                                         data_mode);\n      } else {\n        return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset(),\n                                  data_mode);\n      }\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Sends\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Non-bitset extract that uses serializelazy to copy data over to the\n   * buffer. 
REQUIRES that the ValTy be memory copyable.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam syncFnTy struct that has info on how to do synchronization\n   *\n   * @param loopName loop name used for timers\n   * @param from_id\n   * @param indices Vector that contains node ids of edges that we will\n   * potentially send things to\n   * @param b OUTPUT: buffer that will be sent over the network; contains data\n   * based on set bits in bitset\n   */\n  template <SyncType syncType, typename SyncFnTy, bool async,\n            typename std::enable_if<galois::runtime::is_memory_copyable<\n                typename SyncFnTy::ValTy>::value>::type* = nullptr>\n  void syncExtract(std::string loopName, unsigned from_id,\n                   std::vector<size_t>& indices,\n                   galois::runtime::SendBuffer& b) {\n    uint32_t num = indices.size();\n    static galois::PODResizeableArray<typename SyncFnTy::ValTy>\n        val_vec; // sometimes wasteful\n    galois::PODResizeableArray<unsigned int>& offsets = syncOffsets;\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string extract_timer_str(syncTypeStr + \"Extract_\" +\n                                  get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textract(extract_timer_str.c_str(),\n                                                      RNAME);\n    std::string extract_batch_timer_str(syncTypeStr + \"ExtractBatch_\" +\n                                        get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textractbatch(\n        extract_batch_timer_str.c_str(), RNAME);\n\n    DataCommMode data_mode;\n\n    Textract.start();\n\n    if (num > 0) {\n      data_mode = onlyData;\n      b.reserve(sizeof(DataCommMode) + sizeof(size_t) +\n                (num * sizeof(typename SyncFnTy::ValTy)));\n\n      Textractbatch.start();\n      bool batch_succeeded =\n          extractBatchWrapper<SyncFnTy, syncType>(from_id, b);\n      Textractbatch.stop();\n\n      if (!batch_succeeded) {\n        b.resize(0);\n        val_vec.reserve(maxSharedSize);\n        val_vec.resize(num);\n        gSerialize(b, onlyData);\n        auto lseq = gSerializeLazySeq(\n            b, num,\n            (galois::PODResizeableArray<typename SyncFnTy::ValTy>*)nullptr);\n        extractSubset<SyncFnTy, decltype(lseq), syncType, true, true>(\n            loopName, indices, num, offsets, b, lseq);\n      } else {\n        b.resize(sizeof(DataCommMode) + sizeof(size_t) +\n                 (num * sizeof(typename SyncFnTy::ValTy)));\n      }\n    } else {\n      b.resize(0);\n      if (!async) {\n        data_mode = noData;\n        gSerialize(b, noData);\n      }\n    }\n\n    Textract.stop();\n\n    std::string metadata_str(syncTypeStr + \"MetadataMode_\" +\n                             std::to_string(data_mode) + \"_\" +\n                             get_run_identifier(loopName));\n    galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(RNAME, metadata_str,\n                                                         
   1);\n  }\n\n  /**\n   * Non-bitset extract for when the type of the item being sync'd isn't\n   * memory copyable.\n   *\n   * Extracts all of the data for all edges in indices and saves it into\n   * a send buffer for return.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam syncFnTy struct that has info on how to do synchronization\n   *\n   * @param loopName loop name used for timers\n   * @param from_id\n   * @param indices Vector that contains node ids of edges that we will\n   * potentially send things to\n   * @param b OUTPUT: buffer that will be sent over the network; contains data\n   * based on set bits in bitset\n   */\n  template <SyncType syncType, typename SyncFnTy, bool async,\n            typename std::enable_if<!galois::runtime::is_memory_copyable<\n                typename SyncFnTy::ValTy>::value>::type* = nullptr>\n  void syncExtract(std::string loopName, unsigned from_id,\n                   std::vector<size_t>& indices,\n                   galois::runtime::SendBuffer& b) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string extract_timer_str(syncTypeStr + \"Extract_\" +\n                                  get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textract(extract_timer_str.c_str(),\n                                                      RNAME);\n    std::string extract_batch_timer_str(syncTypeStr + \"ExtractBatch_\" +\n                                        get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textractbatch(\n        extract_batch_timer_str.c_str(), RNAME);\n\n    DataCommMode data_mode;\n\n    uint32_t num = indices.size();\n    static galois::PODResizeableArray<typename SyncFnTy::ValTy> val_vec;\n    static galois::PODResizeableArray<unsigned int> dummyVector;\n\n    Textract.start();\n\n    if (num > 0) {\n      data_mode = onlyData;\n      b.reserve(sizeof(DataCommMode) + sizeof(size_t) +\n                (num * sizeof(typename SyncFnTy::ValTy)));\n\n      Textractbatch.start();\n      bool batch_succeeded =\n          extractBatchWrapper<SyncFnTy, syncType>(from_id, b);\n      Textractbatch.stop();\n\n      if (!batch_succeeded) {\n        b.resize(0);\n        val_vec.reserve(maxSharedSize);\n        val_vec.resize(num);\n        // get everything (note I pass in \"indices\" as offsets as it won't\n        // even get used anyways)\n        extractSubset<SyncFnTy, syncType, true, true>(loopName, indices, num,\n                                                      dummyVector, val_vec);\n        gSerialize(b, onlyData, val_vec);\n      } else {\n        b.resize(sizeof(DataCommMode) + sizeof(size_t) +\n                 (num * sizeof(typename SyncFnTy::ValTy)));\n      }\n\n    } else {\n      b.resize(0);\n      if (!async) {\n        data_mode = noData;\n        gSerialize(b, noData);\n      }\n    }\n\n    Textract.stop();\n\n    std::string metadata_str(syncTypeStr + \"MetadataMode_\" +\n                             std::to_string(data_mode) + \"_\" +\n        
                     get_run_identifier(loopName));\n    galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(RNAME, metadata_str,\n                                                            1);\n  }\n\n  /**\n   * Extracts the data that will be sent to a host in this round of\n   * synchronization based on the passed in bitset and saves it to a\n   * send buffer.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam syncFnTy struct that has info on how to do synchronization\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   * being used for the extraction\n   *\n   * @param loopName loop name used for timers\n   * @param from_id\n   * @param indices Vector that contains node ids of edges that we will\n   * potentially send things to\n   * @param b OUTPUT: buffer that will be sent over the network; contains data\n   * based on set bits in bitset\n   */\n  template <\n      SyncType syncType, typename SyncFnTy, typename BitsetFnTy, bool async,\n      typename std::enable_if<!BitsetFnTy::is_vector_bitset()>::type* = nullptr>\n  void syncExtract(std::string loopName, unsigned from_id,\n                   std::vector<size_t>& indices,\n                   galois::runtime::SendBuffer& b) {\n    const galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get();\n    uint64_t manualBitsetCount                   = bit_set_compute.count();\n    uint32_t num                                 = indices.size();\n    galois::DynamicBitSet& bit_set_comm          = syncBitset;\n    static galois::PODResizeableArray<typename SyncFnTy::ValTy> val_vec;\n    galois::PODResizeableArray<unsigned int>& offsets = syncOffsets;\n\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string extract_timer_str(syncTypeStr + \"Extract_\" +\n                                  get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textract(extract_timer_str.c_str(),\n                                                      RNAME);\n    std::string extract_alloc_timer_str(syncTypeStr + \"ExtractAlloc_\" +\n                                        get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textractalloc(\n        extract_alloc_timer_str.c_str(), RNAME);\n    std::string extract_batch_timer_str(syncTypeStr + \"ExtractBatch_\" +\n                                        get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textractbatch(\n        extract_batch_timer_str.c_str(), RNAME);\n\n    DataCommMode data_mode;\n\n    Textract.start();\n\n    if (num > 0 && manualBitsetCount > 0) {\n      // if (num > 0) {\n      size_t bit_set_count = 0;\n      Textractalloc.start();\n      if (substrateDataMode == gidsData) {\n        b.reserve(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                  sizeof(size_t) + (num * sizeof(unsigned int)) +\n                  sizeof(size_t) + (num * sizeof(typename SyncFnTy::ValTy)));\n      } else if (substrateDataMode == offsetsData) {\n        b.reserve(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                  sizeof(size_t) + (num * sizeof(unsigned int)) +\n                  sizeof(size_t) + (num * sizeof(typename SyncFnTy::ValTy)));\n      } else if (substrateDataMode == bitsetData) {\n        size_t bitset_alloc_size = ((num + 63) / 64) * sizeof(uint64_t);\n        b.reserve(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                  sizeof(size_t)   // bitset size\n                  + sizeof(size_t) // bitset vector size\n                  + bitset_alloc_size + sizeof(size_t) +\n                  (num * sizeof(typename SyncFnTy::ValTy)));\n      } else { // onlyData or noData 
(auto)\n        size_t bitset_alloc_size = ((num + 63) / 64) * sizeof(uint64_t);\n        b.reserve(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                  sizeof(size_t)   // bitset size\n                  + sizeof(size_t) // bitset vector size\n                  + bitset_alloc_size + sizeof(size_t) +\n                  (num * sizeof(typename SyncFnTy::ValTy)));\n      }\n      Textractalloc.stop();\n\n      Textractbatch.start();\n      bool batch_succeeded = extractBatchWrapper<SyncFnTy, syncType>(\n          from_id, b, bit_set_count, data_mode);\n      Textractbatch.stop();\n\n      // GPUs have a batch function they can use; CPUs do not; therefore,\n      // CPUS always enter this if block\n      if (!batch_succeeded) {\n        Textractalloc.start();\n        b.resize(0);\n        bit_set_comm.reserve(maxSharedSize);\n        offsets.reserve(maxSharedSize);\n        val_vec.reserve(maxSharedSize);\n        bit_set_comm.resize(num);\n        offsets.resize(num);\n        val_vec.resize(num);\n        Textractalloc.stop();\n\n        getBitsetAndOffsets<SyncFnTy, syncType>(\n            loopName, indices, bit_set_compute, bit_set_comm, offsets,\n            bit_set_count, data_mode);\n\n        if (data_mode == onlyData) {\n          bit_set_count = indices.size();\n          extractSubset<SyncFnTy, syncType, true, true>(\n              loopName, indices, bit_set_count, offsets, val_vec);\n        } else if (data_mode !=\n                   noData) { // bitsetData or offsetsData or gidsData\n          extractSubset<SyncFnTy, syncType, false, true>(\n              loopName, indices, bit_set_count, offsets, val_vec);\n        }\n        serializeMessage<async, syncType>(loopName, data_mode, bit_set_count,\n                                          indices, offsets, bit_set_comm,\n                                          val_vec, b);\n      } else {\n        if (data_mode == noData) {\n          b.resize(0);\n          if (!async) {\n            
gSerialize(b, data_mode);\n          }\n        } else if (data_mode == gidsData) {\n          b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                   sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) +\n                   sizeof(size_t) +\n                   (bit_set_count * sizeof(typename SyncFnTy::ValTy)));\n        } else if (data_mode == offsetsData) {\n          b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                   sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) +\n                   sizeof(size_t) +\n                   (bit_set_count * sizeof(typename SyncFnTy::ValTy)));\n        } else if (data_mode == bitsetData) {\n          size_t bitset_alloc_size = ((num + 63) / 64) * sizeof(uint64_t);\n          b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                   sizeof(size_t)   // bitset size\n                   + sizeof(size_t) // bitset vector size\n                   + bitset_alloc_size + sizeof(size_t) +\n                   (bit_set_count * sizeof(typename SyncFnTy::ValTy)));\n        } else { // onlyData\n          b.resize(sizeof(DataCommMode) + sizeof(size_t) +\n                   (num * sizeof(typename SyncFnTy::ValTy)));\n        }\n      }\n\n      reportRedundantSize<SyncFnTy>(loopName, syncTypeStr, num, bit_set_count,\n                                    bit_set_comm);\n    } else {\n      b.resize(0);\n      if (!async) {\n        data_mode = noData;\n        gSerialize(b, noData);\n      }\n    }\n\n    Textract.stop();\n\n    std::string metadata_str(syncTypeStr + \"MetadataMode_\" +\n                             std::to_string(data_mode) + \"_\" +\n                             get_run_identifier(loopName));\n    galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(RNAME, metadata_str,\n                                                            1);\n  }\n\n#ifdef GALOIS_USE_BARE_MPI\n  /**\n   * Sync using MPI instead of network layer.\n   */\n  template <SyncType 
syncType, typename SyncFnTy, typename BitsetFnTy>\n  void sync_mpi_send(std::string loopName) {\n    static std::vector<galois::runtime::SendBuffer> b;\n    static std::vector<MPI_Request> request;\n    b.resize(numHosts);\n    request.resize(numHosts, MPI_REQUEST_NULL);\n\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + h) % numHosts;\n\n      if (nothingToSend(x, syncType))\n        continue;\n\n      int ready = 0;\n      MPI_Test(&request[x], &ready, MPI_STATUS_IGNORE);\n      if (!ready) {\n        assert(b[x].size() > 0);\n        MPI_Wait(&request[x], MPI_STATUS_IGNORE);\n      }\n      if (b[x].size() > 0) {\n        b[x].getVec().clear();\n      }\n\n      getSendBuffer<syncType, SyncFnTy, BitsetFnTy>(loopName, x, b[x]);\n\n      MPI_Isend((uint8_t*)b[x].linearData(), b[x].size(), MPI_BYTE, x, 32767,\n                MPI_COMM_WORLD, &request[x]);\n    }\n\n    if (BitsetFnTy::is_valid() && syncType == syncBroadcast) {\n      reset_bitset(&BitsetFnTy::reset_range);\n    }\n  }\n\n  /**\n   * Sync put using MPI instead of network layer\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy>\n  void sync_mpi_put(std::string loopName, const MPI_Group& mpi_access_group,\n                    const std::vector<MPI_Win>& window) {\n\n    MPI_Win_start(mpi_access_group, 0, window[id]);\n\n    std::vector<galois::runtime::SendBuffer> b(numHosts);\n    std::vector<size_t> size(numHosts);\n    uint64_t send_buffers_size = 0;\n\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + h) % numHosts;\n\n      if (nothingToSend(x, syncType))\n        continue;\n\n      getSendBuffer<syncType, SyncFnTy, BitsetFnTy>(loopName, x, b[x]);\n\n      size[x] = b[x].size();\n      send_buffers_size += size[x];\n      MPI_Put((uint8_t*)&size[x], sizeof(size_t), MPI_BYTE, x, 0,\n              sizeof(size_t), MPI_BYTE, window[id]);\n      MPI_Put((uint8_t*)b[x].linearData(), size[x], MPI_BYTE, x, sizeof(size_t),\n         
     size[x], MPI_BYTE, window[id]);\n    }\n\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    net.incrementMemUsage(send_buffers_size);\n\n    MPI_Win_complete(window[id]);\n    net.decrementMemUsage(send_buffers_size);\n\n    if (BitsetFnTy::is_valid() && syncType == syncBroadcast) {\n      reset_bitset(&BitsetFnTy::reset_range);\n    }\n  }\n#endif\n\n  /**\n   * Sends data to all hosts (if there is anything that needs to be sent\n   * to that particular host) and adjusts bitset according to sync type.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has information needed to access bitset\n   *\n   * @param loopName used to name timers created by this sync send\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            bool async>\n  void syncNetSend(std::string loopName) {\n    static galois::runtime::SendBuffer\n        b; // although a static variable, allocation not reused\n           // due to std::move in net.sendTagged()\n\n    auto& net               = galois::runtime::getSystemNetworkInterface();\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string statNumMessages_str(syncTypeStr + \"NumMessages_\" +\n                                    get_run_identifier(loopName));\n\n    size_t numMessages = 0;\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + h) % numHosts;\n\n      if (nothingToSend(x, syncType))\n        continue;\n\n      getSendBuffer<syncType, SyncFnTy, BitsetFnTy, async>(loopName, x, b);\n\n      if ((!async) || (b.size() > 0)) {\n        size_t syncTypePhase = 0;\n        if (async && (syncType == syncBroadcast))\n          syncTypePhase = 1;\n        net.sendTagged(x, galois::runtime::evilPhase, b, syncTypePhase);\n        ++numMessages;\n      }\n    }\n    if (!async) {\n      // Will force all messages to be processed before continuing\n      net.flush();\n    }\n\n    if (BitsetFnTy::is_valid() && syncType == syncBroadcast) {\n      reset_bitset(&BitsetFnTy::reset_range);\n    }\n\n    galois::runtime::reportStat_Tsum(RNAME, statNumMessages_str, numMessages);\n  }\n\n  /**\n   * Sends data over the network to other hosts based on the provided template\n   * arguments.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            bool async>\n  void syncSend(std::string loopName) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    galois::CondStatTimer<GALOIS_COMM_STATS> TSendTime(\n        (syncTypeStr + \"Send_\" + get_run_identifier(loopName)).c_str(), RNAME);\n\n    TSendTime.start();\n    syncNetSend<syncType, SyncFnTy, BitsetFnTy, async>(loopName);\n    TSendTime.stop();\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Receives\n  ////////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Deserializes messages from other hosts and applies them to update local\n   * data based on the provided sync structures.\n   *\n   * Complement of syncExtract.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param from_id ID of host which the message we are processing was received\n   * from\n   * @param buf Buffer that contains received message from other host\n   * @param loopName used to name timers for statistics\n   */\n  template <\n      SyncType syncType, typename SyncFnTy, typename BitsetFnTy, bool async,\n      typename std::enable_if<!BitsetFnTy::is_vector_bitset()>::type* = nullptr>\n  size_t syncRecvApply(uint32_t from_id, galois::runtime::RecvBuffer& buf,\n                       std::string loopName) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string set_timer_str(syncTypeStr + \"Set_\" +\n                              get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Tset(set_timer_str.c_str(), RNAME);\n    std::string set_batch_timer_str(syncTypeStr + \"SetBatch_\" +\n                                    get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Tsetbatch(\n        set_batch_timer_str.c_str(), RNAME);\n\n    galois::DynamicBitSet& bit_set_comm = syncBitset;\n    static galois::PODResizeableArray<typename SyncFnTy::ValTy> val_vec;\n    galois::PODResizeableArray<unsigned int>& offsets = syncOffsets;\n\n    auto& sharedEdges = (syncType == syncReduce) ? masterEdges : mirrorEdges;\n    uint32_t num      = sharedEdges[from_id].size();\n    size_t retval     = 0;\n    Tset.start();\n\n    if (num > 0) { // only enter if we expect message from that host\n      DataCommMode data_mode;\n      // 1st deserialize gets data mode\n      galois::runtime::gDeserialize(buf, data_mode);\n\n      if (data_mode != noData) {\n        // GPU update call\n        Tsetbatch.start();\n        bool batch_succeeded =\n            setBatchWrapper<SyncFnTy, syncType, async>(from_id, buf, data_mode);\n        Tsetbatch.stop();\n\n        // cpu always enters this block\n        if (!batch_succeeded) {\n          size_t bit_set_count = num;\n          size_t buf_start     = 0;\n\n          // deserialize the rest of the data in the buffer depending on the\n          // data mode; arguments passed in here are mostly output vars\n          deserializeMessage<syncType>(loopName, data_mode, num, buf,\n                                       bit_set_count, offsets, bit_set_comm,\n                                       buf_start, retval, val_vec);\n\n          bit_set_comm.reserve(maxSharedSize);\n          offsets.reserve(maxSharedSize);\n          val_vec.reserve(maxSharedSize);\n\n          galois::DynamicBitSet& bit_set_compute = 
BitsetFnTy::get();\n\n          if (data_mode == bitsetData) {\n            size_t bit_set_count2;\n            getOffsetsFromBitset<syncType>(loopName, bit_set_comm, offsets,\n                                           bit_set_count2);\n            assert(bit_set_count == bit_set_count2);\n          }\n\n          if (data_mode == onlyData) {\n            setSubset<decltype(sharedEdges[from_id]), SyncFnTy, syncType, async,\n                      true, true>(loopName, sharedEdges[from_id], bit_set_count,\n                                  offsets, val_vec, bit_set_compute);\n          } else if (data_mode == dataSplit || data_mode == dataSplitFirst) {\n            setSubset<decltype(sharedEdges[from_id]), SyncFnTy, syncType, async,\n                      true, true>(loopName, sharedEdges[from_id], bit_set_count,\n                                  offsets, val_vec, bit_set_compute, buf_start);\n          } else if (data_mode == gidsData) {\n            setSubset<decltype(offsets), SyncFnTy, syncType, async, true, true>(\n                loopName, offsets, bit_set_count, offsets, val_vec,\n                bit_set_compute);\n          } else { // bitsetData or offsetsData\n            setSubset<decltype(sharedEdges[from_id]), SyncFnTy, syncType, async,\n                      false, true>(loopName, sharedEdges[from_id],\n                                   bit_set_count, offsets, val_vec,\n                                   bit_set_compute);\n          }\n          // TODO: reduce could update the bitset, so it needs to be copied\n          // back to the device\n        }\n      }\n    }\n\n    Tset.stop();\n\n    return retval;\n  }\n\n#ifdef GALOIS_USE_BARE_MPI\n  /**\n   * MPI Irecv wrapper for sync\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy>\n  void sync_mpi_recv_post(std::string loopName,\n                          std::vector<MPI_Request>& request,\n                          const std::vector<std::vector<uint8_t>>& rb) {\n    
for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + numHosts - h) % numHosts;\n      if (nothingToRecv(x, syncType))\n        continue;\n\n      MPI_Irecv((uint8_t*)rb[x].data(), rb[x].size(), MPI_BYTE, x, 32767,\n                MPI_COMM_WORLD, &request[x]);\n    }\n  }\n\n  /**\n   * MPI receive wrapper for sync\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy>\n  void sync_mpi_recv_wait(std::string loopName,\n                          std::vector<MPI_Request>& request,\n                          const std::vector<std::vector<uint8_t>>& rb) {\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + numHosts - h) % numHosts;\n      if (nothingToRecv(x, syncType))\n        continue;\n\n      MPI_Status status;\n      MPI_Wait(&request[x], &status);\n\n      int size = 0;\n      MPI_Get_count(&status, MPI_BYTE, &size);\n\n      galois::runtime::RecvBuffer rbuf(rb[x].begin(), rb[x].begin() + size);\n\n      syncRecvApply<syncType, SyncFnTy, BitsetFnTy>(x, rbuf, loopName);\n    }\n  }\n\n  /**\n   * MPI get wrapper for sync\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy>\n  void sync_mpi_get(std::string loopName, const std::vector<MPI_Win>& window,\n                    const std::vector<std::vector<uint8_t>>& rb) {\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + numHosts - h) % numHosts;\n      if (nothingToRecv(x, syncType))\n        continue;\n\n      MPI_Win_wait(window[x]);\n\n      size_t size = 0;\n      memcpy(&size, rb[x].data(), sizeof(size_t));\n\n      galois::runtime::RecvBuffer rbuf(rb[x].begin() + sizeof(size_t),\n                                       rb[x].begin() + sizeof(size_t) + size);\n\n      MPI_Win_post(mpi_identity_groups[x], 0, window[x]);\n\n      syncRecvApply<syncType, SyncFnTy, BitsetFnTy>(x, rbuf, loopName);\n    }\n  }\n#endif\n\n  /**\n   * Determines if there is anything to receive from a host and receives/applies\n   * 
the messages.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            bool async>\n  void syncNetRecv(std::string loopName) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    std::string wait_timer_str(\"Wait_\" + get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Twait(wait_timer_str.c_str(),\n                                                   RNAME);\n\n    if (async) {\n      size_t syncTypePhase = 0;\n      if (syncType == syncBroadcast)\n        syncTypePhase = 1;\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr,\n                                 syncTypePhase)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr,\n                              syncTypePhase);\n\n        if (p) {\n          syncRecvApply<syncType, SyncFnTy, BitsetFnTy, async>(\n              p->first, p->second, loopName);\n        }\n      } while (p);\n    } else {\n      for (unsigned x = 0; x < numHosts; ++x) {\n        if (x == id)\n          continue;\n        if (nothingToRecv(x, syncType))\n          continue;\n\n        Twait.start();\n        decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n        do {\n          p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n        } while (!p);\n        Twait.stop();\n\n        syncRecvApply<syncType, SyncFnTy, BitsetFnTy, async>(\n            p->first, p->second, loopName);\n      }\n      incrementEvilPhase();\n    }\n  }\n\n  /**\n   * Receives messages from all other hosts and \"applies\" the message (reduce\n   * or set) based on the sync structure provided.\n   *\n   * @tparam syncType either reduce or 
broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            bool async>\n  void syncRecv(std::string loopName) {\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    galois::CondStatTimer<GALOIS_COMM_STATS> TRecvTime(\n        (syncTypeStr + \"Recv_\" + get_run_identifier(loopName)).c_str(), RNAME);\n\n    TRecvTime.start();\n    syncNetRecv<syncType, SyncFnTy, BitsetFnTy, async>(loopName);\n    TRecvTime.stop();\n  }\n\n////////////////////////////////////////////////////////////////////////////////\n// MPI sync variants\n////////////////////////////////////////////////////////////////////////////////\n#ifdef GALOIS_USE_BARE_MPI\n  /**\n   * Nonblocking MPI sync\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy>\n  void syncNonblockingMPI(std::string loopName,\n                          bool use_bitset_to_send = true) {\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    galois::CondStatTimer<GALOIS_COMM_STATS> TSendTime(\n        (syncTypeStr + \"Send_\" + get_run_identifier(loopName)).c_str(), RNAME);\n    galois::CondStatTimer<GALOIS_COMM_STATS> TRecvTime(\n        (syncTypeStr + \"Recv_\" + get_run_identifier(loopName)).c_str(), RNAME);\n\n    static std::vector<std::vector<uint8_t>> rb;\n    static std::vector<MPI_Request> request;\n\n    if (rb.size() == 0) { // create the receive buffers\n      TRecvTime.start();\n      auto& sharedEdges = (syncType == syncReduce) ? 
masterEdges : mirrorEdges;\n      rb.resize(numHosts);\n      request.resize(numHosts, MPI_REQUEST_NULL);\n\n      for (unsigned h = 1; h < numHosts; ++h) {\n        unsigned x = (id + numHosts - h) % numHosts;\n        if (nothingToRecv(x, syncType))\n          continue;\n\n        size_t size =\n            (sharedEdges[x].size() * sizeof(typename SyncFnTy::ValTy));\n        size += sizeof(size_t);       // vector size\n        size += sizeof(DataCommMode); // data mode\n\n        rb[x].resize(size);\n      }\n      TRecvTime.stop();\n    }\n\n    TRecvTime.start();\n    sync_mpi_recv_post<syncType, SyncFnTy, BitsetFnTy>(loopName, request, rb);\n    TRecvTime.stop();\n\n    TSendTime.start();\n    if (use_bitset_to_send) {\n      sync_mpi_send<syncType, SyncFnTy, BitsetFnTy>(loopName);\n    } else {\n      sync_mpi_send<syncType, SyncFnTy, galois::InvalidBitsetFnTy>(loopName);\n    }\n    TSendTime.stop();\n\n    TRecvTime.start();\n    sync_mpi_recv_wait<syncType, SyncFnTy, BitsetFnTy>(loopName, request, rb);\n    TRecvTime.stop();\n  }\n\n  /**\n   * Onesided MPI sync\n   */\n  template <SyncType syncType, typename SyncFnTy, typename BitsetFnTy>\n  void syncOnesidedMPI(std::string loopName, bool use_bitset_to_send = true) {\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    galois::CondStatTimer<GALOIS_COMM_STATS> TSendTime(\n        (syncTypeStr + \"Send_\" + get_run_identifier(loopName)).c_str(), RNAME);\n    galois::CondStatTimer<GALOIS_COMM_STATS> TRecvTime(\n        (syncTypeStr + \"Recv_\" + get_run_identifier(loopName)).c_str(), RNAME);\n\n    static std::vector<MPI_Win> window;\n    static MPI_Group mpi_access_group;\n    static std::vector<std::vector<uint8_t>> rb;\n\n    if (window.size() == 0) { // create the windows\n      TRecvTime.start();\n      auto& sharedEdges = (syncType == syncReduce) ? 
masterEdges : mirrorEdges;\n      window.resize(numHosts);\n      rb.resize(numHosts);\n\n      uint64_t recv_buffers_size = 0;\n      for (unsigned x = 0; x < numHosts; ++x) {\n        size_t size = sharedEdges[x].size() * sizeof(typename SyncFnTy::ValTy);\n        size += sizeof(size_t);       // vector size\n        size += sizeof(DataCommMode); // data mode\n        size += sizeof(size_t);       // buffer size\n        recv_buffers_size += size;\n\n        rb[x].resize(size);\n\n        MPI_Info info;\n        MPI_Info_create(&info);\n        MPI_Info_set(info, \"no_locks\", \"true\");\n        MPI_Info_set(info, \"same_disp_unit\", \"true\");\n\n        MPI_Win_create(rb[x].data(), size, 1, info, MPI_COMM_WORLD, &window[x]);\n\n        MPI_Info_free(&info);\n      }\n      auto& net = galois::runtime::getSystemNetworkInterface();\n      net.incrementMemUsage(recv_buffers_size);\n\n      for (unsigned h = 1; h < numHosts; ++h) {\n        unsigned x = (id + numHosts - h) % numHosts;\n        if (nothingToRecv(x, syncType))\n          continue;\n        // exposure group of each window is same as identity group of that\n        // window\n        MPI_Win_post(mpi_identity_groups[x], 0, window[x]);\n      }\n      TRecvTime.stop();\n\n      TSendTime.start();\n      std::vector<int> access_hosts;\n      for (unsigned h = 1; h < numHosts; ++h) {\n        unsigned x = (id + h) % numHosts;\n\n        if (nothingToSend(x, syncType))\n          continue;\n\n        access_hosts.push_back(x);\n      }\n      MPI_Group world_group;\n      MPI_Comm_group(MPI_COMM_WORLD, &world_group);\n      // access group for only one window since only one window is accessed\n      MPI_Group_incl(world_group, access_hosts.size(), access_hosts.data(),\n                     &mpi_access_group);\n      TSendTime.stop();\n    }\n\n    TSendTime.start();\n    if (use_bitset_to_send) {\n      sync_mpi_put<syncType, SyncFnTy, BitsetFnTy>(loopName, mpi_access_group,\n                             
                      window);\n    } else {\n      sync_mpi_put<syncType, SyncFnTy, galois::InvalidBitsetFnTy>(\n          loopName, mpi_access_group, window);\n    }\n    TSendTime.stop();\n\n    TRecvTime.start();\n    sync_mpi_get<syncType, SyncFnTy, BitsetFnTy>(loopName, window, rb);\n    TRecvTime.stop();\n  }\n#endif\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Higher Level Sync Calls (broadcast/reduce, etc)\n  ////////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Does a reduction of data from mirror edges to master edges.\n   *\n   * @tparam ReduceFnTy reduce sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename ReduceFnTy, typename BitsetFnTy, bool async>\n  inline void reduce(std::string loopName) {\n    std::string timer_str(\"Reduce_\" + get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> TsyncReduce(timer_str.c_str(),\n                                                         RNAME);\n    TsyncReduce.start();\n\n#ifdef GALOIS_USE_BARE_MPI\n    switch (bare_mpi) {\n    case noBareMPI:\n#endif\n      syncSend<syncReduce, ReduceFnTy, BitsetFnTy, async>(loopName);\n      syncRecv<syncReduce, ReduceFnTy, BitsetFnTy, async>(loopName);\n#ifdef GALOIS_USE_BARE_MPI\n      break;\n    case nonBlockingBareMPI:\n      syncNonblockingMPI<syncReduce, ReduceFnTy, BitsetFnTy>(loopName);\n      break;\n    case oneSidedBareMPI:\n      syncOnesidedMPI<syncReduce, ReduceFnTy, BitsetFnTy>(loopName);\n      break;\n    default:\n      GALOIS_DIE(\"unsupported bare MPI\");\n    }\n#endif\n\n    TsyncReduce.stop();\n  }\n\n  /**\n   * Does a broadcast of data from master to mirror edges.\n   *\n   * @tparam BroadcastFnTy broadcast sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the 
bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename BroadcastFnTy, typename BitsetFnTy, bool async>\n  inline void broadcast(std::string loopName) {\n    std::string timer_str(\"Broadcast_\" + get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> TsyncBroadcast(timer_str.c_str(),\n                                                            RNAME);\n\n    TsyncBroadcast.start();\n\n    bool use_bitset = true;\n\n#ifdef GALOIS_USE_BARE_MPI\n    switch (bare_mpi) {\n    case noBareMPI:\n#endif\n      if (use_bitset) {\n        syncSend<syncBroadcast, BroadcastFnTy, BitsetFnTy, async>(loopName);\n      } else {\n        syncSend<syncBroadcast, BroadcastFnTy, galois::InvalidBitsetFnTy,\n                 async>(loopName);\n      }\n      syncRecv<syncBroadcast, BroadcastFnTy, BitsetFnTy, async>(loopName);\n#ifdef GALOIS_USE_BARE_MPI\n      break;\n    case nonBlockingBareMPI:\n      syncNonblockingMPI<syncBroadcast, BroadcastFnTy, BitsetFnTy>(loopName,\n                                                                   use_bitset);\n      break;\n    case oneSidedBareMPI:\n      syncOnesidedMPI<syncBroadcast, BroadcastFnTy, BitsetFnTy>(loopName,\n                                                                use_bitset);\n      break;\n    default:\n      GALOIS_DIE(\"unsupported bare MPI\");\n    }\n#endif\n\n    TsyncBroadcast.stop();\n  }\n\n  /**\n   * Do sync necessary for write any, read any.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_any_to_any(std::string loopName) {\n    // reduce and broadcast for OEC, IEC, CVC, UVC\n    reduce<SyncFnTy, BitsetFnTy, async>(loopName);\n    broadcast<SyncFnTy, BitsetFnTy, async>(loopName);\n  }\n\n  
////////////////////////////////////////////////////////////////////////////////\n  // Public iterface: sync\n  ////////////////////////////////////////////////////////////////////////////////\n\npublic:\n  /**\n   * Main sync call exposed to the user that calls the correct sync function\n   * based on provided template arguments. Must provide information through\n   * structures on how to do synchronization/which fields to synchronize.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy = galois::InvalidBitsetFnTy,\n            bool async = false>\n  inline void sync(std::string loopName) {\n    std::string timer_str(\"Sync_\" + loopName + \"_\" + get_run_identifier());\n    galois::StatTimer Tsync(timer_str.c_str(), RNAME);\n\n    Tsync.start();\n    sync_any_to_any<SyncFnTy, BitsetFnTy, async>(loopName);\n    Tsync.stop();\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // GPU marshaling\n  ////////////////////////////////////////////////////////////////////////////////\n\n#ifdef GALOIS_ENABLE_GPU\nprivate:\n  using GraphNode     = typename GraphTy::GraphNode;\n  using edge_iterator = typename GraphTy::edge_iterator;\n  using EdgeTy        = typename GraphTy::EdgeType;\n\n  // Code that handles getting the graph onto the GPU\n  template <bool isVoidType,\n            typename std::enable_if<isVoidType>::type* = nullptr>\n  inline void setMarshalEdge(EdgeMarshalGraph& GALOIS_UNUSED(m),\n                             const size_t GALOIS_UNUSED(index),\n                             const edge_iterator& GALOIS_UNUSED(e)) {\n    // do nothing\n  }\n\n  template <bool isVoidType,\n            typename std::enable_if<!isVoidType>::type* = nullptr>\n  inline void setMarshalEdge(EdgeMarshalGraph& m, const size_t index,\n               
              const edge_iterator& e) {\n    m.edge_data[index] = userGraph.getEdgeData(e);\n  }\n\npublic:\n  void getEdgeMarshalGraph(EdgeMarshalGraph& m, bool loadProxyEdges = true) {\n    m.nnodes   = userGraph.size();\n    m.nedges   = userGraph.sizeEdges();\n    m.numOwned = userGraph.numMasters();\n    //// Assumption: master occurs at beginning in contiguous range\n    m.beginMaster       = 0;\n    m.numNodesWithEdges = userGraph.getNumNodesWithEdges();\n    m.id                = id;\n    m.numHosts          = numHosts;\n    m.row_start         = (index_type*)calloc(m.nnodes + 1, sizeof(index_type));\n    m.edge_dst          = (index_type*)calloc(m.nedges, sizeof(index_type));\n    m.node_data         = (index_type*)calloc(m.nnodes, sizeof(node_data_type));\n\n    //// TODO deal with edgety\n    if (std::is_void<EdgeTy>::value) {\n      m.edge_data = NULL;\n    } else {\n      if (!std::is_same<EdgeTy, edge_data_type>::value) {\n        galois::gWarn(\"Edge data type mismatch between CPU and GPU\\n\");\n      }\n      m.edge_data = (edge_data_type*)calloc(m.nedges, sizeof(edge_data_type));\n    }\n\n    galois::do_all(\n        // TODO not using thread ranges, can be optimized if I can iterate\n        // directly over userGraph\n        galois::iterate(userGraph.allNodesRange()),\n        [&](const GraphNode& nodeID) {\n          // initialize node_data with localID-to-globalID mapping\n          m.node_data[nodeID] =\n              userGraph.getGID(nodeID); // this may not be required.\n          m.row_start[nodeID] = *(userGraph.edge_begin(nodeID));\n          for (auto e = userGraph.edge_begin(nodeID);\n               e != userGraph.edge_end(nodeID); e++) {\n            auto edgeID = *e;\n            setMarshalEdge<std::is_void<EdgeTy>::value>(m, edgeID, e);\n            m.edge_dst[edgeID] = userGraph.getEdgeDst(e);\n          }\n        },\n        galois::steal());\n\n    m.row_start[m.nnodes] = m.nedges;\n\n    // TODO?\n    // copy memoization 
meta-data\n    if (loadProxyEdges) {\n      m.num_master_edges =\n          (unsigned int*)calloc(masterEdges.size(), sizeof(unsigned int));\n      ;\n      m.master_edges =\n          (unsigned int**)calloc(masterEdges.size(), sizeof(unsigned int*));\n      ;\n\n      for (uint32_t h = 0; h < masterEdges.size(); ++h) {\n        m.num_master_edges[h] = masterEdges[h].size();\n\n        if (masterEdges[h].size() > 0) {\n          m.master_edges[h] = (unsigned int*)calloc(masterEdges[h].size(),\n                                                    sizeof(unsigned int));\n          ;\n          std::copy(masterEdges[h].begin(), masterEdges[h].end(),\n                    m.master_edges[h]);\n        } else {\n          m.master_edges[h] = NULL;\n        }\n      }\n\n      m.num_mirror_edges =\n          (unsigned int*)calloc(mirrorEdges.size(), sizeof(unsigned int));\n      ;\n      m.mirror_edges =\n          (unsigned int**)calloc(mirrorEdges.size(), sizeof(unsigned int*));\n      ;\n      for (uint32_t h = 0; h < mirrorEdges.size(); ++h) {\n        m.num_mirror_edges[h] = mirrorEdges[h].size();\n\n        if (mirrorEdges[h].size() > 0) {\n          m.mirror_edges[h] = (unsigned int*)calloc(mirrorEdges[h].size(),\n                                                    sizeof(unsigned int));\n          ;\n          std::copy(mirrorEdges[h].begin(), mirrorEdges[h].end(),\n                    m.mirror_edges[h]);\n        } else {\n          m.mirror_edges[h] = NULL;\n        }\n      }\n    }\n\n    //// user needs to provide method of freeing up graph (it can do nothing\n    //// if they wish)\n    // userGraph.deallocate();\n  }\n#endif // het galois def\n\npublic:\n  ////////////////////////////////////////////////////////////////////////////////\n  // Metadata settings/getters\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Set the run number.\n   *\n   * @param runNum Number to set the run to\n   */\n  inline void 
set_num_run(const uint32_t runNum) { num_run = runNum; }\n\n  /**\n   * Get the set run number.\n   *\n   * @returns The set run number saved in the graph\n   */\n  inline uint32_t get_run_num() const { return num_run; }\n\n  /**\n   * Set the round number for use in the run identifier.\n   *\n   * @param round round number to set to\n   */\n  inline void set_num_round(const uint32_t round) { num_round = round; }\n\n  /**\n   * Get a run identifier using the set run and set round.\n   *\n   * @returns a string run identifier\n   * @deprecated We want to move away from calling this by itself; use ones\n   * that take an argument; will be removed once we eliminate all instances\n   * of its use from code\n   */\n  inline std::string get_run_identifier() const {\n#if GALOIS_PER_ROUND_STATS\n    return std::string(std::to_string(num_run) + \"_\" +\n                       std::to_string(num_round));\n#else\n    return std::string(std::to_string(num_run));\n#endif\n  }\n\n  /**\n   * Get a run identifier using the set run and set round and\n   * append to the passed in string.\n   *\n   * @param loop_name String to append the run identifier\n   * @returns String with run identifier appended to passed in loop name\n   */\n  inline std::string get_run_identifier(std::string loop_name) const {\n#if GALOIS_PER_ROUND_STATS\n    return std::string(std::string(loop_name) + \"_\" + std::to_string(num_run) +\n                       \"_\" + std::to_string(num_round));\n#else\n    return std::string(std::string(loop_name) + \"_\" + std::to_string(num_run));\n#endif\n  }\n\n  /**\n   * Get a run identifier using the set run and set round and\n   * append to the passed in string in addition to the number identifier passed\n   * in.\n   *\n   * @param loop_name String to append the run identifier\n   * @param alterID another ID with which to add to the timer name.\n   *\n   * @returns String with run identifier appended to passed in loop name +\n   * alterID\n   */\n  inline 
std::string get_run_identifier(std::string loop_name,\n                                        unsigned alterID) const {\n#if GALOIS_PER_ROUND_STATS\n    return std::string(std::string(loop_name) + \"_\" + std::to_string(alterID) +\n                       \"_\" + std::to_string(num_run) + \"_\" +\n                       std::to_string(num_round));\n#else\n    return std::string(std::string(loop_name) + \"_\" + std::to_string(alterID) +\n                       \"_\" + std::to_string(num_run));\n#endif\n  }\n};\n\ntemplate <typename GraphTy>\nconstexpr const char* const galois::graphs::GluonEdgeSubstrate<GraphTy>::RNAME;\n} // end namespace graphs\n} // end namespace galois\n\n#endif // header guard\n"
  },
  {
    "path": "libgluon/include/galois/graphs/GluonSubstrate.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file GluonSubstrate.h\n *\n * Contains the implementation for GluonSubstrate.\n */\n\n#ifndef _GALOIS_GLUONSUB_H_\n#define _GALOIS_GLUONSUB_H_\n\n#include <unordered_map>\n#include <fstream>\n\n#include \"galois/runtime/GlobalObj.h\"\n#include \"galois/runtime/DistStats.h\"\n#include \"galois/runtime/SyncStructures.h\"\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/DynamicBitset.h\"\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"galois/cuda/HostDecls.h\"\n#endif\n\n#include \"galois/runtime/BareMPI.h\"\n\n// TODO find a better way to do this without globals\n//! Specifies what format to send metadata in\nextern DataCommMode enforcedDataMode;\n\n#ifdef GALOIS_USE_BARE_MPI\nextern BareMPI bare_mpi;\n#endif\n\n//! Enumeration for specifiying write location for sync calls\nenum WriteLocation {\n  //! 
write at source\n  writeSource,\n  //! write at destination\n  writeDestination,\n  //! write at source and/or destination\n  writeAny\n};\n//! Enumeration for specifiying read location for sync calls\nenum ReadLocation {\n  //! read at source\n  readSource,\n  //! read at destination\n  readDestination,\n  //! read at source and/or destination\n  readAny\n};\n\nnamespace galois {\nnamespace graphs {\n\n/**\n * Gluon communication substrate that handles communication given a user graph.\n * User graph should provide certain things the substrate expects.\n *\n * TODO documentation on expected things\n *\n * @tparam GraphTy User graph to handle communication for\n */\ntemplate <typename GraphTy>\nclass GluonSubstrate : public galois::runtime::GlobalObject {\nprivate:\n  //! Synchronization type\n  enum SyncType {\n    syncReduce,   //!< Reduction sync\n    syncBroadcast //!< Broadcast sync\n  };\n\n  //! Graph name used for printing things\n  constexpr static const char* const RNAME = \"Gluon\";\n\n  //! The graph to handle communication for\n  GraphTy& userGraph;\n  const unsigned id; //!< Copy of net.ID, which is the ID of the machine.\n  bool transposed;   //!< Marks if passed in graph is transposed or not.\n  bool isVertexCut;  //!< Marks if passed in graph's partitioning is vertex cut.\n  std::pair<unsigned, unsigned> cartesianGrid; //!< cartesian grid (if any)\n  bool partitionAgnostic; //!< true if communication should ignore partitioning\n  DataCommMode substrateDataMode; //!< datamode to enforce\n  const uint32_t\n      numHosts;     //!< Copy of net.Num, which is the total number of machines\n  uint32_t num_run; //!< Keep track of number of runs.\n  uint32_t num_round; //!< Keep track of number of rounds.\n  bool isCartCut;     //!< True if graph is a cartesian cut\n\n  // bitvector status hasn't been maintained\n  //! Typedef used so galois::runtime::BITVECTOR_STATUS doesn't have to be\n  //! 
written\n  using BITVECTOR_STATUS = galois::runtime::BITVECTOR_STATUS;\n  //! A pointer set during syncOnDemand calls that points to the status\n  //! of a bitvector with regard to where data has been synchronized\n  //! @todo pass the flag as function paramater instead\n  BITVECTOR_STATUS* currentBVFlag;\n\n  // memoization optimization\n  //! Master nodes on different hosts. For broadcast;\n  std::vector<std::vector<size_t>> masterNodes;\n  //! Mirror nodes on different hosts. For reduce; comes from the user graph\n  //! during initialization (we expect user to give to us)\n  std::vector<std::vector<size_t>>& mirrorNodes;\n  //! Maximum size of master or mirror nodes on different hosts\n  size_t maxSharedSize;\n\n#ifdef GALOIS_USE_BARE_MPI\n  std::vector<MPI_Group> mpi_identity_groups;\n#endif\n  // Used for efficient comms\n  galois::DynamicBitSet syncBitset;\n  galois::PODResizeableArray<unsigned int> syncOffsets;\n\n  /**\n   * Reset a provided bitset given the type of synchronization performed\n   *\n   * @param syncType Type of synchronization to consider when doing reset\n   * @param bitset_reset_range Function to reset range with\n   */\n  void reset_bitset(SyncType syncType,\n                    void (*bitset_reset_range)(size_t, size_t)) {\n    size_t numMasters = userGraph.numMasters();\n    if (numMasters > 0) {\n      // note this assumes masters are from 0 -> a number; CuSP should\n      // do this automatically\n      if (syncType == syncBroadcast) { // reset masters\n        bitset_reset_range(0, numMasters - 1);\n      } else {\n        assert(syncType == syncReduce);\n        // mirrors occur after masters\n        if (numMasters < userGraph.size()) {\n          bitset_reset_range(numMasters, userGraph.size() - 1);\n        }\n      }\n    } else { // all things are mirrors\n      // only need to reset if reduce\n      if (syncType == syncReduce) {\n        if (userGraph.size() > 0) {\n          bitset_reset_range(0, userGraph.size() - 1);\n      
  }\n      }\n    }\n  }\n\n  //! Increments evilPhase, a phase counter used by communication.\n  void inline incrementEvilPhase() {\n    ++galois::runtime::evilPhase;\n    // limit defined by MPI or LCI\n    if (galois::runtime::evilPhase >=\n        static_cast<uint32_t>(std::numeric_limits<int16_t>::max())) {\n      galois::runtime::evilPhase = 1;\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Proxy communication setup\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Let other hosts know about which host has what mirrors/masters;\n   * used for later communication of mirrors/masters.\n   */\n  void exchangeProxyInfo() {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    // send off the mirror nodes\n    for (unsigned x = 0; x < numHosts; ++x) {\n      if (x == id)\n        continue;\n\n      galois::runtime::SendBuffer b;\n      gSerialize(b, mirrorNodes[x]);\n      net.sendTagged(x, galois::runtime::evilPhase, b);\n    }\n\n    // receive the mirror nodes\n    for (unsigned x = 0; x < numHosts; ++x) {\n      if (x == id)\n        continue;\n\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n\n      galois::runtime::gDeserialize(p->second, masterNodes[p->first]);\n    }\n    incrementEvilPhase();\n  }\n\n  /**\n   * Send statistics about master/mirror nodes to each host, and\n   * report the statistics.\n   */\n  void sendInfoToHost() {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n\n    uint64_t global_total_mirror_nodes =\n        userGraph.size() - userGraph.numMasters();\n    uint64_t global_total_owned_nodes = userGraph.numMasters();\n\n    // send info to host\n    for (unsigned x = 0; x < numHosts; ++x) {\n      if (x == id)\n        continue;\n\n      galois::runtime::SendBuffer b;\n      
gSerialize(b, global_total_mirror_nodes, global_total_owned_nodes);\n      net.sendTagged(x, galois::runtime::evilPhase, b);\n    }\n\n    // receive\n    for (unsigned x = 0; x < numHosts; ++x) {\n      if (x == id)\n        continue;\n\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n      } while (!p);\n\n      uint64_t total_mirror_nodes_from_others;\n      uint64_t total_owned_nodes_from_others;\n      galois::runtime::gDeserialize(p->second, total_mirror_nodes_from_others,\n                                    total_owned_nodes_from_others);\n      global_total_mirror_nodes += total_mirror_nodes_from_others;\n      global_total_owned_nodes += total_owned_nodes_from_others;\n    }\n    incrementEvilPhase();\n\n    assert(userGraph.globalSize() == global_total_owned_nodes);\n    // report stats\n    if (net.ID == 0) {\n      reportProxyStats(global_total_mirror_nodes, global_total_owned_nodes);\n    }\n  }\n\n  /**\n   * Sets up the communication between the different hosts that contain\n   * different parts of the graph by exchanging master/mirror information.\n   */\n  void setupCommunication() {\n    galois::CondStatTimer<MORE_DIST_STATS> Tcomm_setup(\"CommunicationSetupTime\",\n                                                       RNAME);\n\n    // barrier so that all hosts start the timer together\n    galois::runtime::getHostBarrier().wait();\n\n    Tcomm_setup.start();\n\n    // Exchange information for memoization optimization.\n    exchangeProxyInfo();\n    // convert the global ids stored in the master/mirror nodes arrays to local\n    // ids\n    // TODO: use 32-bit distinct vectors for masters and mirrors from here on\n    for (uint32_t h = 0; h < masterNodes.size(); ++h) {\n      galois::do_all(\n          galois::iterate(size_t{0}, masterNodes[h].size()),\n          [&](size_t n) {\n            masterNodes[h][n] = 
userGraph.getLID(masterNodes[h][n]);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(\"MasterNodes\").c_str()),\n#endif\n          galois::no_stats());\n    }\n\n    for (uint32_t h = 0; h < mirrorNodes.size(); ++h) {\n      galois::do_all(\n          galois::iterate(size_t{0}, mirrorNodes[h].size()),\n          [&](size_t n) {\n            mirrorNodes[h][n] = userGraph.getLID(mirrorNodes[h][n]);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(\"MirrorNodes\").c_str()),\n#endif\n          galois::no_stats());\n    }\n\n    Tcomm_setup.stop();\n\n    maxSharedSize = 0;\n    // report masters/mirrors to/from other hosts as statistics\n    for (auto x = 0U; x < masterNodes.size(); ++x) {\n      if (x == id)\n        continue;\n      std::string master_nodes_str =\n          \"MasterNodesFrom_\" + std::to_string(id) + \"_To_\" + std::to_string(x);\n      galois::runtime::reportStatCond_Tsum<MORE_DIST_STATS>(\n          RNAME, master_nodes_str, masterNodes[x].size());\n      if (masterNodes[x].size() > maxSharedSize) {\n        maxSharedSize = masterNodes[x].size();\n      }\n    }\n\n    for (auto x = 0U; x < mirrorNodes.size(); ++x) {\n      if (x == id)\n        continue;\n      std::string mirror_nodes_str =\n          \"MirrorNodesFrom_\" + std::to_string(x) + \"_To_\" + std::to_string(id);\n      galois::runtime::reportStatCond_Tsum<MORE_DIST_STATS>(\n          RNAME, mirror_nodes_str, mirrorNodes[x].size());\n      if (mirrorNodes[x].size() > maxSharedSize) {\n        maxSharedSize = mirrorNodes[x].size();\n      }\n    }\n\n    sendInfoToHost();\n\n    // do not track memory usage of partitioning\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    net.resetMemUsage();\n  }\n\n  /**\n   * Reports master/mirror stats.\n   * Assumes that communication has already occured so that the host\n   * calling it actually has the info required.\n   *\n   * @param 
global_total_mirror_nodes number of mirror nodes on all hosts\n   * @param global_total_owned_nodes number of \"owned\" nodes on all hosts\n   */\n  void reportProxyStats(uint64_t global_total_mirror_nodes,\n                        uint64_t GALOIS_UNUSED(global_total_owned_nodes)) {\n    float replication_factor =\n        (float)(global_total_mirror_nodes + userGraph.globalSize()) /\n        (float)userGraph.globalSize();\n    galois::runtime::reportStat_Single(RNAME, \"ReplicationFactor\",\n                                       replication_factor);\n\n    galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(\n        RNAME, \"TotalNodes\", userGraph.globalSize());\n    galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(\n        RNAME, \"TotalGlobalMirrorNodes\", global_total_mirror_nodes);\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Initializers\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Initialize MPI related things. 
The MPI layer itself should have been\n   * initialized when the network interface was initialized.\n   */\n  void initBareMPI() {\n#ifdef GALOIS_USE_BARE_MPI\n    if (bare_mpi == noBareMPI)\n      return;\n\n#ifdef GALOIS_USE_LCI\n    // sanity check of ranks\n    int taskRank;\n    MPI_Comm_rank(MPI_COMM_WORLD, &taskRank);\n    if ((unsigned)taskRank != id)\n      GALOIS_DIE(\"mismatch in MPI rank\");\n    int numTasks;\n    MPI_Comm_size(MPI_COMM_WORLD, &numTasks);\n    if ((unsigned)numTasks != numHosts)\n      GALOIS_DIE(\"mismatch in MPI rank\");\n#endif\n    // group setup\n    MPI_Group world_group;\n    MPI_Comm_group(MPI_COMM_WORLD, &world_group);\n    mpi_identity_groups.resize(numHosts);\n\n    for (unsigned x = 0; x < numHosts; ++x) {\n      const int g[1] = {(int)x};\n      MPI_Group_incl(world_group, 1, g, &mpi_identity_groups[x]);\n    }\n\n    if (id == 0) {\n      switch (bare_mpi) {\n      case nonBlockingBareMPI:\n        galois::gPrint(\"Using non-blocking bare MPI\\n\");\n        break;\n      case oneSidedBareMPI:\n        galois::gPrint(\"Using one-sided bare MPI\\n\");\n        break;\n      case noBareMPI:\n      default:\n        GALOIS_DIE(\"unsupported bare MPI\");\n      }\n    }\n#endif\n  }\n\npublic:\n  /**\n   * Delete default constructor: this class NEEDS to have a graph passed into\n   * it.\n   */\n  GluonSubstrate() = delete;\n\n  /**\n   * Constructor for GluonSubstrate. 
Initializes metadata fields.\n   *\n   * @param _userGraph graph to build substrate on\n   * @param host host number that this graph resides on\n   * @param numHosts total number of hosts in the currently executing program\n   * @param _transposed True if the graph is transposed\n   * @param _cartesianGrid cartesian grid for sync\n   * @param _partitionAgnostic determines if sync should be partition agnostic\n   * or not\n   * @param _enforcedDataMode Forced data comm mode for sync\n   */\n  GluonSubstrate(\n      GraphTy& _userGraph, unsigned host, unsigned numHosts, bool _transposed,\n      std::pair<unsigned, unsigned> _cartesianGrid = std::make_pair(0u, 0u),\n      bool _partitionAgnostic                      = false,\n      DataCommMode _enforcedDataMode               = DataCommMode::noData)\n      : galois::runtime::GlobalObject(this), userGraph(_userGraph), id(host),\n        transposed(_transposed), isVertexCut(userGraph.is_vertex_cut()),\n        cartesianGrid(_cartesianGrid), partitionAgnostic(_partitionAgnostic),\n        substrateDataMode(_enforcedDataMode), numHosts(numHosts), num_run(0),\n        num_round(0), currentBVFlag(nullptr),\n        mirrorNodes(userGraph.getMirrorNodes()) {\n    if (cartesianGrid.first != 0 && cartesianGrid.second != 0) {\n      GALOIS_ASSERT(cartesianGrid.first * cartesianGrid.second == numHosts,\n                    \"Cartesian split doesn't equal number of hosts\");\n      if (id == 0) {\n        galois::gInfo(\"Gluon optimizing communication for 2-D cartesian cut: \",\n                      cartesianGrid.first, \" x \", cartesianGrid.second);\n      }\n      isCartCut = true;\n    } else {\n      assert(cartesianGrid.first == 0 && cartesianGrid.second == 0);\n      isCartCut = false;\n    }\n\n    // set this global value for use on GPUs mostly\n    enforcedDataMode = _enforcedDataMode;\n\n    initBareMPI();\n    // master setup from mirrors done by setupCommunication call\n    masterNodes.resize(numHosts);\n    // setup 
proxy communication\n    galois::CondStatTimer<MORE_DIST_STATS> Tgraph_construct_comm(\n        \"GraphCommSetupTime\", RNAME);\n    Tgraph_construct_comm.start();\n    setupCommunication();\n    Tgraph_construct_comm.stop();\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Data extraction from bitsets\n  ////////////////////////////////////////////////////////////////////////////////\n\nprivate:\n  /**\n   * Given a bitset, determine the indices of the bitset that are currently\n   * set.\n   *\n   * @tparam syncType either reduce or broadcast; only used to name the timer\n   *\n   * @param loopName string used to name the timer for this function\n   * @param bitset_comm the bitset to get the offsets of\n   * @param offsets output: the offset vector that will contain indices into\n   * the bitset that are set\n   * @param bit_set_count output: will be set to the number of bits set in the\n   * bitset\n   */\n  template <SyncType syncType>\n  void getOffsetsFromBitset(const std::string& loopName,\n                            const galois::DynamicBitSet& bitset_comm,\n                            galois::PODResizeableArray<unsigned int>& offsets,\n                            size_t& bit_set_count) const {\n    // timer creation\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string offsets_timer_str(syncTypeStr + \"Offsets_\" +\n                                  get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Toffsets(offsets_timer_str.c_str(),\n                                                      RNAME);\n\n    Toffsets.start();\n\n    auto activeThreads = galois::getActiveThreads();\n    std::vector<unsigned int> t_prefix_bit_counts(activeThreads);\n\n    // count how many bits are set on each thread\n    galois::on_each([&](unsigned tid, unsigned nthreads) {\n      // TODO use block_range instead\n      unsigned int block_size = bitset_comm.size() / nthreads;\n      if ((bitset_comm.size() % nthreads) > 0)\n        ++block_size;\n      assert((block_size * nthreads) >= bitset_comm.size());\n\n      unsigned int start = tid * block_size;\n      unsigned int end   = (tid + 1) * block_size;\n      if (end > bitset_comm.size())\n        end = bitset_comm.size();\n\n      unsigned int count = 0;\n      for (unsigned int i = start; i < end; ++i) {\n        if (bitset_comm.test(i))\n          ++count;\n      }\n\n      t_prefix_bit_counts[tid] = count;\n    });\n\n    // calculate prefix sum of bits per thread\n    for (unsigned int i = 1; i < activeThreads; ++i) {\n      t_prefix_bit_counts[i] += t_prefix_bit_counts[i - 1];\n    }\n    // total num of set bits\n    bit_set_count = t_prefix_bit_counts[activeThreads - 1];\n\n    // calculate the indices of the set bits and save them to the offset\n    // vector\n    if (bit_set_count > 0) {\n      offsets.resize(bit_set_count);\n      galois::on_each([&](unsigned tid, unsigned nthreads) {\n        // TODO use block_range instead\n        // TODO this is same calculation as above; maybe refactor it\n        // into function?\n        unsigned int block_size = bitset_comm.size() / nthreads;\n        if ((bitset_comm.size() % nthreads) > 0)\n          ++block_size;\n        assert((block_size * nthreads) >= 
bitset_comm.size());\n\n        unsigned int start = tid * block_size;\n        unsigned int end   = (tid + 1) * block_size;\n        if (end > bitset_comm.size())\n          end = bitset_comm.size();\n\n        unsigned int count = 0;\n        unsigned int t_prefix_bit_count;\n        if (tid == 0) {\n          t_prefix_bit_count = 0;\n        } else {\n          t_prefix_bit_count = t_prefix_bit_counts[tid - 1];\n        }\n\n        for (unsigned int i = start; i < end; ++i) {\n          if (bitset_comm.test(i)) {\n            offsets[t_prefix_bit_count + count] = i;\n            ++count;\n          }\n        }\n      });\n    }\n    Toffsets.stop();\n  }\n\n  /**\n   * Determine what data needs to be synchronized based on the passed in\n   * bitset_compute and returns information regarding these need-to-be-sync'd\n   * nodes.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done;\n   * only used to get the size of the type being synchronized in this function\n   * @tparam syncType type of synchronization this function is being called\n   * for; only used to name a timer\n   *\n   * @param loopName loopname used to name the timer for the function\n   * @param indices A vector that contains the local ids of the nodes that\n   * you want to potentially synchronize\n   * @param bitset_compute Contains the full bitset of all nodes in this\n   * graph\n   * @param bitset_comm OUTPUT: bitset that marks which indices in the passed\n   * in indices array need to be synchronized\n   * @param offsets OUTPUT: contains indices into bitset_comm that are set\n   * @param bit_set_count OUTPUT: contains number of bits set in bitset_comm\n   * @param data_mode OUTPUT: the way that this data should be communicated\n   * based on how much data needs to be sent out\n   */\n  template <typename FnTy, SyncType syncType>\n  void getBitsetAndOffsets(const std::string& loopName,\n                           const std::vector<size_t>& indices,\n              
             const galois::DynamicBitSet& bitset_compute,\n                           galois::DynamicBitSet& bitset_comm,\n                           galois::PODResizeableArray<unsigned int>& offsets,\n                           size_t& bit_set_count,\n                           DataCommMode& data_mode) const {\n    if (substrateDataMode != onlyData) {\n      bitset_comm.reset();\n      std::string syncTypeStr =\n          (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n      std::string doall_str(syncTypeStr + \"Bitset_\" + loopName);\n\n      bitset_comm.reset();\n      // determine which local nodes in the indices array need to be\n      // sychronized\n      galois::do_all(\n          galois::iterate(size_t{0}, indices.size()),\n          [&](size_t n) {\n            // assumes each lid is unique as test is not thread safe\n            size_t lid = indices[n];\n            if (bitset_compute.test(lid)) {\n              bitset_comm.set(n);\n            }\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n\n      // get the number of set bits and the offsets into the comm bitset\n      getOffsetsFromBitset<syncType>(loopName, bitset_comm, offsets,\n                                     bit_set_count);\n    }\n\n    data_mode =\n        get_data_mode<typename FnTy::ValTy>(bit_set_count, indices.size());\n  }\n\n  template <typename SyncFnTy>\n  size_t getMaxSendBufferSize(uint32_t numShared) {\n    if (substrateDataMode == gidsData) {\n      return sizeof(DataCommMode) + sizeof(size_t) + sizeof(size_t) +\n             (numShared * sizeof(unsigned int)) + sizeof(size_t) +\n             (numShared * sizeof(typename SyncFnTy::ValTy));\n    } else if (substrateDataMode == offsetsData) {\n      return sizeof(DataCommMode) + sizeof(size_t) + sizeof(size_t) +\n             (numShared * sizeof(unsigned int)) + sizeof(size_t) +\n             (numShared * sizeof(typename 
SyncFnTy::ValTy));\n    } else if (substrateDataMode == bitsetData) {\n      size_t bitset_alloc_size = ((numShared + 63) / 64) * sizeof(uint64_t);\n      return sizeof(DataCommMode) + sizeof(size_t) +\n             sizeof(size_t)   // bitset size\n             + sizeof(size_t) // bitset vector size\n             + bitset_alloc_size + sizeof(size_t) +\n             (numShared * sizeof(typename SyncFnTy::ValTy));\n    } else { // onlyData or noData (auto)\n      size_t bitset_alloc_size = ((numShared + 63) / 64) * sizeof(uint64_t);\n      return sizeof(DataCommMode) + sizeof(size_t) +\n             sizeof(size_t)   // bitset size\n             + sizeof(size_t) // bitset vector size\n             + bitset_alloc_size + sizeof(size_t) +\n             (numShared * sizeof(typename SyncFnTy::ValTy));\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Local to global ID conversion\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Converts LIDs of nodes we are interested in into GIDs.\n   *\n   * @tparam syncType either reduce or broadcast; only used to name the timer\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of nodes that we are interested in\n   * @param offsets INPUT/OUTPUT holds offsets into \"indices\" that we should\n   * use; after function completion, holds global ids of nodes we are interested\n   * in\n   */\n  template <SyncType syncType>\n  void convertLIDToGID(const std::string& loopName,\n                       const std::vector<size_t>& indices,\n                       galois::PODResizeableArray<unsigned int>& offsets) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string doall_str(syncTypeStr + \"_LID2GID_\" +\n                          get_run_identifier(loopName));\n    galois::do_all(\n        galois::iterate(size_t{0}, offsets.size()),\n        [&](size_t n) {\n          offsets[n] =\n              static_cast<uint32_t>(userGraph.getGID(indices[offsets[n]]));\n        },\n#if GALOIS_COMM_STATS\n        galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n        galois::no_stats());\n  }\n\n  /**\n   * Converts a vector of GIDs into local ids.\n   *\n   * @tparam syncType either reduce or broadcast; only used to name the timer\n   *\n   * @param loopName name of loop used to name timer\n   * @param offsets holds GIDs to convert to LIDs\n   */\n  template <SyncType syncType>\n  void convertGIDToLID(const std::string& loopName,\n                       galois::PODResizeableArray<unsigned int>& offsets) {\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    std::string doall_str(syncTypeStr + \"_GID2LID_\" +\n                          get_run_identifier(loopName));\n\n    galois::do_all(\n        galois::iterate(size_t{0}, offsets.size()),\n        [&](size_t n) { offsets[n] = userGraph.getLID(offsets[n]); },\n#if GALOIS_COMM_STATS\n        galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n        galois::no_stats());\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Message prep functions (buffering, send buffer getting, etc.)\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Get data that is going to be sent for synchronization and returns\n   * it in a send buffer.\n   *\n   * @tparam syncType synchronization type\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has information needed to access bitset\n   *\n   * @param loopName Name to give timer\n   * 
@param x Host to send to\n   * @param b OUTPUT: Buffer that will hold data to send\n   */\n  template <\n      SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy,\n      bool async,\n      typename std::enable_if<!BitsetFnTy::is_vector_bitset()>::type* = nullptr>\n  void getSendBuffer(std::string loopName, unsigned x,\n                     galois::runtime::SendBuffer& b) {\n    auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes;\n\n    if (BitsetFnTy::is_valid()) {\n      syncExtract<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(\n          loopName, x, sharedNodes[x], b);\n    } else {\n      syncExtract<syncType, SyncFnTy, VecTy, async>(loopName, x, sharedNodes[x],\n                                                    b);\n    }\n\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    std::string statSendBytes_str(syncTypeStr + \"SendBytes_\" +\n                                  get_run_identifier(loopName));\n\n    galois::runtime::reportStat_Tsum(RNAME, statSendBytes_str, b.size());\n  }\n  template <\n      SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy,\n      bool async,\n      typename std::enable_if<BitsetFnTy::is_vector_bitset()>::type* = nullptr>\n  void getSendBuffer(std::string loopName, unsigned x,\n                     galois::runtime::SendBuffer& b) {\n    auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes;\n\n    syncExtract<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(\n        loopName, x, sharedNodes[x], b);\n\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string statSendBytes_str(syncTypeStr + \"SendBytesVector_\" +\n                                  get_run_identifier(loopName));\n\n    galois::runtime::reportStat_Tsum(RNAME, statSendBytes_str, b.size());\n  }\n\n  /**\n   * Given data to serialize in val_vec, serialize it into the send buffer\n   * depending on the mode of data communication selected for the data.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam VecType type of val_vec, which stores the data to send\n   *\n   * @param loopName loop name used for timers\n   * @param data_mode the way that the data should be communicated\n   * @param bit_set_count the number of items we are sending in this message\n   * @param indices list of all nodes that we are potentially interested in\n   * sending things to\n   * @param offsets contains indicies into \"indices\" that we are interested in\n   * @param val_vec contains the data that we are serializing to send\n   * @param b the buffer in which to serialize the message we are sending\n   * to\n   */\n  template <bool async, SyncType syncType, typename VecType>\n  void serializeMessage(std::string loopName, DataCommMode data_mode,\n                        size_t bit_set_count, std::vector<size_t>& indices,\n                        galois::PODResizeableArray<unsigned int>& offsets,\n                        galois::DynamicBitSet& bit_set_comm, VecType& val_vec,\n                        galois::runtime::SendBuffer& b) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string serialize_timer_str(syncTypeStr + \"SerializeMessage_\" +\n                                    get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Tserialize(\n        serialize_timer_str.c_str(), RNAME);\n    if (data_mode == noData) {\n      if (!async) {\n        Tserialize.start();\n        gSerialize(b, data_mode);\n        Tserialize.stop();\n      }\n    } else if (data_mode == gidsData) {\n      offsets.resize(bit_set_count);\n      convertLIDToGID<syncType>(loopName, indices, offsets);\n      val_vec.resize(bit_set_count);\n      Tserialize.start();\n      gSerialize(b, data_mode, bit_set_count, offsets, val_vec);\n      Tserialize.stop();\n    } else if (data_mode == offsetsData) {\n      offsets.resize(bit_set_count);\n      val_vec.resize(bit_set_count);\n      Tserialize.start();\n      gSerialize(b, data_mode, bit_set_count, offsets, val_vec);\n      Tserialize.stop();\n    } else if (data_mode == bitsetData) {\n      val_vec.resize(bit_set_count);\n      Tserialize.start();\n      gSerialize(b, data_mode, bit_set_count, bit_set_comm, val_vec);\n      Tserialize.stop();\n    } else { // onlyData\n      Tserialize.start();\n      gSerialize(b, data_mode, val_vec);\n      Tserialize.stop();\n    }\n  }\n\n  /**\n   * Given the data mode, deserialize the rest of a message in a Receive Buffer.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam VecType type of val_vec, which data will be deserialized into\n   *\n   * @param loopName used to name timers for statistics\n   * @param data_mode data mode with which the original message was sent;\n   * determines how to deserialize the rest of the message\n   * @param buf buffer which contains the received message to deserialize\n   *\n   * The rest of the arguments are output arguments (they are passed by\n   * reference)\n   *\n   * @param bit_set_count Var that holds number of bits set (i.e. 
number of\n   * nodes changed) after deserialization\n   * @param offsets holds offsets data after deserialization if data mode is\n   * offsets + data\n   * @param bit_set_comm holds the bitset representing changed nodes after\n   * deserialization if data mode is bitset + data\n   * @param buf_start read from the message when data mode is dataSplit\n   * @param retval read from the message when data mode is dataSplitFirst\n   * @param val_vec The data proper will be deserialized into this vector\n   */\n  template <SyncType syncType, typename VecType>\n  void deserializeMessage(std::string loopName, DataCommMode data_mode,\n                          uint32_t num, galois::runtime::RecvBuffer& buf,\n                          size_t& bit_set_count,\n                          galois::PODResizeableArray<unsigned int>& offsets,\n                          galois::DynamicBitSet& bit_set_comm,\n                          size_t& buf_start, size_t& retval, VecType& val_vec) {\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    std::string serialize_timer_str(syncTypeStr + \"DeserializeMessage_\" +\n                                    get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Tdeserialize(\n        serialize_timer_str.c_str(), RNAME);\n    Tdeserialize.start();\n\n    // get other metadata associated with message if mode isn't OnlyData\n    if (data_mode != onlyData) {\n      galois::runtime::gDeserialize(buf, bit_set_count);\n\n      if (data_mode == gidsData) {\n        galois::runtime::gDeserialize(buf, offsets);\n        convertGIDToLID<syncType>(loopName, offsets);\n      } else if (data_mode == offsetsData) {\n        galois::runtime::gDeserialize(buf, offsets);\n      } else if (data_mode == bitsetData) {\n        bit_set_comm.resize(num);\n        galois::runtime::gDeserialize(buf, bit_set_comm);\n      } else if (data_mode == dataSplit) {\n        galois::runtime::gDeserialize(buf, buf_start);\n      } else if (data_mode == dataSplitFirst) {\n        galois::runtime::gDeserialize(buf, retval);\n   
   }\n    }\n\n    // get data itself\n    galois::runtime::gDeserialize(buf, val_vec);\n\n    Tdeserialize.stop();\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Other helper functions\n  ////////////////////////////////////////////////////////////////////////////////\n\n  //! Returns the grid row ID of this host\n  unsigned gridRowID() const { return (id / cartesianGrid.second); }\n  //! Returns the grid row ID of the specified host\n  unsigned gridRowID(unsigned hid) const {\n    return (hid / cartesianGrid.second);\n  }\n  //! Returns the grid column ID of this host\n  unsigned gridColumnID() const { return (id % cartesianGrid.second); }\n  //! Returns the grid column ID of the specified host\n  unsigned gridColumnID(unsigned hid) const {\n    return (hid % cartesianGrid.second);\n  }\n\n  /**\n   * Determine if a host is a communication partner using cartesian grid.\n   */\n  bool isNotCommPartnerCVC(unsigned host, SyncType syncType,\n                           WriteLocation writeLocation,\n                           ReadLocation readLocation) {\n    assert(cartesianGrid.first != 0);\n    assert(cartesianGrid.second != 0);\n\n    if (transposed) {\n      if (syncType == syncReduce) {\n        switch (writeLocation) {\n        case writeSource:\n          return (gridColumnID() != gridColumnID(host));\n        case writeDestination:\n          return (gridRowID() != gridRowID(host));\n        case writeAny:\n          assert((gridRowID() == gridRowID(host)) ||\n                 (gridColumnID() == gridColumnID(host)));\n          return ((gridRowID() != gridRowID(host)) &&\n                  (gridColumnID() != gridColumnID(host))); // false\n        default:\n          GALOIS_DIE(\"unreachable\");\n        }\n      } else { // syncBroadcast\n        switch (readLocation) {\n        case readSource:\n          return (gridColumnID() != gridColumnID(host));\n        case readDestination:\n          return 
(gridRowID() != gridRowID(host));\n        case readAny:\n          assert((gridRowID() == gridRowID(host)) ||\n                 (gridColumnID() == gridColumnID(host)));\n          return ((gridRowID() != gridRowID(host)) &&\n                  (gridColumnID() != gridColumnID(host))); // false\n        default:\n          GALOIS_DIE(\"unreachable\");\n        }\n      }\n    } else {\n      if (syncType == syncReduce) {\n        switch (writeLocation) {\n        case writeSource:\n          return (gridRowID() != gridRowID(host));\n        case writeDestination:\n          return (gridColumnID() != gridColumnID(host));\n        case writeAny:\n          assert((gridRowID() == gridRowID(host)) ||\n                 (gridColumnID() == gridColumnID(host)));\n          return ((gridRowID() != gridRowID(host)) &&\n                  (gridColumnID() != gridColumnID(host))); // false\n        default:\n          GALOIS_DIE(\"unreachable\");\n        }\n      } else { // syncBroadcast, 1\n        switch (readLocation) {\n        case readSource:\n          return (gridRowID() != gridRowID(host));\n        case readDestination:\n          return (gridColumnID() != gridColumnID(host));\n        case readAny:\n          assert((gridRowID() == gridRowID(host)) ||\n                 (gridColumnID() == gridColumnID(host)));\n          return ((gridRowID() != gridRowID(host)) &&\n                  (gridColumnID() != gridColumnID(host))); // false\n        default:\n          GALOIS_DIE(\"unreachable\");\n        }\n      }\n      return false;\n    }\n  }\n\n  // Requirement: For all X and Y,\n  // On X, nothingToSend(Y) <=> On Y, nothingToRecv(X)\n  /**\n   * Determine if we have anything that we need to send to a particular host\n   *\n   * @param host Host number that we may or may not send to\n   * @param syncType Synchronization type to determine which nodes on a\n   * host need to be considered\n   * @param writeLocation If data is being written to on source or\n   * 
destination (or both)\n   * @param readLocation If data is being read from on source or\n   * destination (or both)\n   * @returns true if there is nothing to send to a host, false otherwise\n   */\n  bool nothingToSend(unsigned host, SyncType syncType,\n                     WriteLocation writeLocation, ReadLocation readLocation) {\n    auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes;\n    // TODO refactor (below)\n    if (!isCartCut) {\n      return (sharedNodes[host].size() == 0);\n    } else {\n      // TODO If CVC, call is not comm partner else use default above\n      if (sharedNodes[host].size() > 0) {\n        return isNotCommPartnerCVC(host, syncType, writeLocation, readLocation);\n      } else {\n        return true;\n      }\n    }\n  }\n\n  /**\n   * Determine if we have anything that we need to receive from a particular\n   * host\n   *\n   * @param host Host number that we may or may not receive from\n   * @param syncType Synchronization type to determine which nodes on a\n   * host need to be considered\n   * @param writeLocation If data is being written to on source or\n   * destination (or both)\n   * @param readLocation If data is being read from on source or\n   * destination (or both)\n   * @returns true if there is nothing to receive from a host, false otherwise\n   */\n  bool nothingToRecv(unsigned host, SyncType syncType,\n                     WriteLocation writeLocation, ReadLocation readLocation) {\n    auto& sharedNodes = (syncType == syncReduce) ? 
masterNodes : mirrorNodes;\n    // TODO refactor (above)\n    if (!isCartCut) {\n      return (sharedNodes[host].size() == 0);\n    } else {\n      if (sharedNodes[host].size() > 0) {\n        return isNotCommPartnerCVC(host, syncType, writeLocation, readLocation);\n      } else {\n        return true;\n      }\n    }\n  }\n\n  /**\n   * Reports bytes saved by using the bitset to only selectively load data\n   * to send.\n   *\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize;\n   * used for size calculation\n   *\n   * @param loopName loop name used for timers\n   * @param syncTypeStr String used to name timers\n   * @param totalToSend Total amount of nodes that are potentially sent (not\n   * necessarily all nodes will be sent)\n   * @param bitSetCount Number of nodes that will actually be sent\n   * @param bitSetComm bitset used to send data\n   */\n  template <typename SyncFnTy>\n  void reportRedundantSize(std::string loopName, std::string syncTypeStr,\n                           uint32_t totalToSend, size_t bitSetCount,\n                           const galois::DynamicBitSet& bitSetComm) {\n    size_t redundant_size =\n        (totalToSend - bitSetCount) * sizeof(typename SyncFnTy::ValTy);\n    size_t bit_set_size = (bitSetComm.get_vec().size() * sizeof(uint64_t));\n\n    if (redundant_size > bit_set_size) {\n      std::string statSavedBytes_str(syncTypeStr + \"SavedBytes_\" +\n                                     get_run_identifier(loopName));\n\n      galois::runtime::reportStatCond_Tsum<MORE_DIST_STATS>(\n          RNAME, statSavedBytes_str, (redundant_size - bit_set_size));\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Extract data from nodes (for reduce and broadcast)\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Extracts data at provided lid.\n   *\n   * This version (reduce) resets the value after extract.\n   *\n  
 * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType either reduce or broadcast; determines if reset is\n   * necessary\n   *\n   * @param lid local id of node to get data from\n   * @returns data (specified by FnTy) of node with local id lid\n   */\n  /* Reduction extract resets the value afterwards */\n  template <typename FnTy, SyncType syncType>\n  inline typename FnTy::ValTy extractWrapper(size_t lid) {\n    if (syncType == syncReduce) {\n      auto val = FnTy::extract(lid, userGraph.getData(lid));\n      FnTy::reset(lid, userGraph.getData(lid));\n      return val;\n    } else {\n      return FnTy::extract(lid, userGraph.getData(lid));\n    }\n  }\n\n  /**\n   * Extracts data at provided lid; uses vecIndex to get the correct element\n   * from the vector.\n   *\n   * This version (reduce) resets the value after extract.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType either reduce or broadcast; determines if reset is\n   * necessary\n   *\n   * @param lid local id of node to get data from\n   * @param vecIndex index to grab from vector in node\n   * @returns data (specified by FnTy) of node with local id lid\n   */\n  /* Reduction extract resets the value afterwards */\n  template <typename FnTy, SyncType syncType>\n  inline typename FnTy::ValTy extractWrapper(size_t lid, unsigned vecIndex) {\n    if (syncType == syncReduce) {\n      auto val = FnTy::extract(lid, userGraph.getData(lid), vecIndex);\n      FnTy::reset(lid, userGraph.getData(lid), vecIndex);\n      return val;\n    } else {\n      return FnTy::extract(lid, userGraph.getData(lid), vecIndex);\n    }\n  }\n\n  /**\n   * Based on provided arguments, extracts the data that we are interested\n   * in sending into val_vec.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType either reduce or broadcast; used to determine if resetting\n   * the 
extracted field is necessary\n   * @tparam identity_offsets If this is true, then ignore the offsets\n   * array and just grab directly from indices (i.e. don't pick out\n   * particular elements, just grab contiguous chunk)\n   * @tparam parallelize Determines if parallelizing the extraction is done or\n   * not\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of nodes that we are interested in\n   * @param size Number of elements to extract\n   * @param offsets Holds offsets into \"indices\" of the data that we are\n   * interested in\n   * @param val_vec OUTPUT: holds the extracted data\n   * @param start Offset into val_vec to start saving data to\n   */\n  template <typename FnTy, SyncType syncType, typename VecTy,\n            bool identity_offsets = false, bool parallelize = true>\n  void extractSubset(const std::string& loopName,\n                     const std::vector<size_t>& indices, size_t size,\n                     const galois::PODResizeableArray<unsigned int>& offsets,\n                     VecTy& val_vec, size_t start = 0) {\n    if (parallelize) {\n      std::string syncTypeStr =\n          (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n      std::string doall_str(syncTypeStr + \"ExtractVal_\" + loopName);\n\n      galois::do_all(\n          galois::iterate(start, start + size),\n          [&](unsigned int n) {\n            unsigned int offset;\n            if (identity_offsets)\n              offset = n;\n            else\n              offset = offsets[n];\n            size_t lid         = indices[offset];\n            val_vec[n - start] = extractWrapper<FnTy, syncType>(lid);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n    } else { // non-parallel version\n      for (unsigned n = start; n < start + size; ++n) {\n        unsigned int offset;\n        if (identity_offsets)\n          offset = n;\n        else\n          offset = offsets[n];\n\n        size_t lid         = indices[offset];\n        val_vec[n - start] = extractWrapper<FnTy, syncType>(lid);\n      }\n    }\n  }\n\n  /**\n   * Based on provided arguments, extracts the data that we are interested\n   * in sending into val_vec. Same as above, except it has the vecIndex\n   * arguments and requires vecSync to be true\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType either reduce or broadcast; used to determine if reseting\n   * the extracted field is necessary\n   * @tparam identity_offsets If this is true, then ignore the offsets\n   * array and just grab directly from indices (i.e. don't pick out\n   * particular elements, just grab contiguous chunk)\n   * @tparam parallelize Determines if parallelizing the extraction is done or\n   * not\n   * @tparam vecSync Only set to true if the field being synchronized is a\n   * vector and synchronization is occuring element by element. 
MUST BE SET\n   * TO TRUE IN ORDER FOR THIS FUNCTION TO COMPILE.\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of nodes that we are interested in\n   * @param size Number of elements to extract\n   * @param offsets Holds offsets into \"indices\" of the data that we are\n   * interested in\n   * @param val_vec OUTPUT: holds the extracted data\n   * @param vecIndex which element of the vector to extract from node\n   * @param start Offset into val_vec to start saving data to\n   */\n  // TODO find a better way to have this variant without code duplication\n  template <typename FnTy, SyncType syncType, typename VecTy,\n            bool identity_offsets = false, bool parallelize = true,\n            bool vecSync                            = false,\n            typename std::enable_if<vecSync>::type* = nullptr>\n  void extractSubset(const std::string& loopName,\n                     const std::vector<size_t>& indices, size_t size,\n                     const galois::PODResizeableArray<unsigned int>& offsets,\n                     VecTy& val_vec, unsigned vecIndex, size_t start = 0) {\n    val_vec.resize(size); // resize val vec for this vecIndex\n\n    if (parallelize) {\n      std::string syncTypeStr =\n          (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n      std::string doall_str(syncTypeStr + \"ExtractValVector_\" + loopName);\n\n      galois::do_all(\n          galois::iterate(start, start + size),\n          [&](unsigned int n) {\n            unsigned int offset;\n            if (identity_offsets)\n              offset = n;\n            else\n              offset = offsets[n];\n            size_t lid         = indices[offset];\n            val_vec[n - start] = extractWrapper<FnTy, syncType>(lid, vecIndex);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n    } else { // non-parallel version\n      for (unsigned n = start; n < start + size; ++n) {\n        unsigned int offset;\n        if (identity_offsets)\n          offset = n;\n        else\n          offset = offsets[n];\n        size_t lid         = indices[offset];\n        val_vec[n - start] = extractWrapper<FnTy, syncType>(lid, vecIndex);\n      }\n    }\n  }\n\n  /**\n   * Based on provided arguments, extracts the data that we are interested\n   * in sending into a send buffer. Lazy serialize variant that works with\n   * certain SeqTy.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SeqTy Type of sequence that we are getting data from\n   * @tparam syncType either reduce or broadcast; used to determine if reseting\n   * the extracted field is necessary\n   * @tparam identity_offsets If this is true, then ignore the offsets\n   * array and just grab directly from indices (i.e. 
don't pick out\n   * particular elements, just grab contiguous chunk)\n   * @tparam parallelize Determines if parallelizing the extraction is done or\n   * not\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of nodes that we are interested in\n   * @param size Number of elements to extract\n   * @param offsets Holds offsets into \"indices\" of the data that we are\n   * interested in\n   * @param b send buffer to extract data into\n   * @param lseq sequence to get data from\n   * @param start Offset into send buffer to start saving data to\n   */\n  template <typename FnTy, typename SeqTy, SyncType syncType,\n            bool identity_offsets = false, bool parallelize = true>\n  void extractSubset(const std::string& loopName,\n                     const std::vector<size_t>& indices, size_t size,\n                     const galois::PODResizeableArray<unsigned int>& offsets,\n                     galois::runtime::SendBuffer& b, SeqTy lseq,\n                     size_t start = 0) {\n    if (parallelize) {\n      std::string syncTypeStr =\n          (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n      std::string doall_str(syncTypeStr + \"ExtractVal_\" + loopName);\n\n      galois::do_all(\n          galois::iterate(start, start + size),\n          [&](unsigned int n) {\n            unsigned int offset;\n            if (identity_offsets)\n              offset = n;\n            else\n              offset = offsets[n];\n\n            size_t lid = indices[offset];\n            gSerializeLazy(b, lseq, n - start,\n                           extractWrapper<FnTy, syncType>(lid));\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n    } else {\n      for (unsigned int n = start; n < start + size; ++n) {\n        unsigned int offset;\n        if (identity_offsets)\n          offset = n;\n        else\n          offset = offsets[n];\n        size_t lid = indices[offset];\n        gSerializeLazy(b, lseq, n - start, extractWrapper<FnTy, syncType>(lid));\n      }\n    }\n  }\n\n  /**\n   * GPU wrap function: extracts data from nodes and resets them to the\n   * reduction identity value as specified by the sync structure. (Reduce only)\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Must be reduce\n   *\n   * @param x node id to extract from\n   * @param v vector to extract data to\n   *\n   * @returns true if called on GPU device\n   */\n  template <typename FnTy, SyncType syncType>\n  inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b) {\n    if (syncType == syncReduce) {\n      return FnTy::extract_reset_batch(x, b.getVec().data());\n    } else {\n      return FnTy::extract_batch(x, b.getVec().data());\n    }\n  }\n\n  /**\n   * GPU wrap function: extracts data from nodes and resets them to the\n   * reduction identity value as specified by the sync structure. 
(Reduce only)\n   *\n   * This version specifies more arguments.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Must be reduce\n   *\n   * @param x node id to extract from\n   * @param b\n   * @param o\n   * @param v\n   * @param s\n   * @param data_mode\n   *\n   * @returns true if called on GPU device\n   */\n  template <typename FnTy, SyncType syncType>\n  inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b,\n                                  size_t& s, DataCommMode& data_mode) {\n    if (syncType == syncReduce) {\n      return FnTy::extract_reset_batch(x, b.getVec().data(), &s, &data_mode);\n    } else {\n      return FnTy::extract_batch(x, b.getVec().data(), &s, &data_mode);\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Reduce/sets on node (for broadcast)\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Reduce variant. 
Takes a value and reduces it according to the sync\n   * structure provided to the function.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType Reduce sync or broadcast sync\n   *\n   * @param lid local id of node to reduce to\n   * @param val value to reduce to\n   * @param bit_set_compute bitset indicating which nodes have changed; updated\n   * if reduction causes a change\n   */\n  template <typename FnTy, SyncType syncType, bool async>\n  inline void setWrapper(size_t lid, typename FnTy::ValTy val,\n                         galois::DynamicBitSet& bit_set_compute) {\n    if (syncType == syncReduce) {\n      if (FnTy::reduce(lid, userGraph.getData(lid), val)) {\n        if (bit_set_compute.size() != 0)\n          bit_set_compute.set(lid);\n      }\n    } else {\n      if (async)\n        FnTy::reduce(lid, userGraph.getData(lid), val);\n      else\n        FnTy::setVal(lid, userGraph.getData(lid), val);\n    }\n  }\n\n  /**\n   * VECTOR VARIANT.\n   *\n   * Reduce variant. Takes a value and reduces it according to the sync\n   * structure provided to the function. 
Only reduces the element at a\n   * particular index of the vector field being sychronized.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam syncType Reduce sync or broadcast sync\n   *\n   * @param lid local id of node to reduce to\n   * @param val value to reduce to\n   * @param bit_set_compute bitset indicating which nodes have changed; updated\n   * if reduction causes a change\n   * @param vecIndex which element of the vector to reduce in the node\n   */\n  template <typename FnTy, SyncType syncType, bool async>\n  inline void setWrapper(size_t lid, typename FnTy::ValTy val,\n                         galois::DynamicBitSet& bit_set_compute,\n                         unsigned vecIndex) {\n    if (syncType == syncReduce) {\n      if (FnTy::reduce(lid, userGraph.getData(lid), val, vecIndex)) {\n        if (bit_set_compute.size() != 0)\n          bit_set_compute.set(lid);\n      }\n    } else {\n      if (async)\n        FnTy::reduce(lid, userGraph.getData(lid), val, vecIndex);\n      else\n        FnTy::setVal(lid, userGraph.getData(lid), val, vecIndex);\n    }\n  }\n\n  /**\n   * Given data received from another host and information on which nodes\n   * to update, do the reduce/set of the received data to update local nodes.\n   *\n   * Complement function, in some sense, of extractSubset.\n   *\n   * @tparam VecTy type of indices variable\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Reduce or broadcast\n   * @tparam identity_offsets If this is true, then ignore the offsets\n   * array and just grab directly from indices (i.e. 
don't pick out\n   * particular elements, just grab contiguous chunk)\n   * @tparam parallelize True if updates to nodes are to be parallelized\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of nodes that we are interested in\n   * @param size Number of elements to set\n   * @param offsets Holds offsets into \"indices\" of the data that we are\n   * interested in\n   * @param val_vec holds data we will use to set\n   * @param bit_set_compute bitset indicating which nodes have changed\n   * @param start Offset into val_vec to start saving data to\n   */\n  template <typename IndicesVecTy, typename FnTy, SyncType syncType,\n            typename VecTy, bool async, bool identity_offsets = false,\n            bool parallelize = true>\n  void setSubset(const std::string& loopName, const IndicesVecTy& indices,\n                 size_t size,\n                 const galois::PODResizeableArray<unsigned int>& offsets,\n                 VecTy& val_vec, galois::DynamicBitSet& bit_set_compute,\n                 size_t start = 0) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string doall_str(syncTypeStr + \"SetVal_\" +\n                          get_run_identifier(loopName));\n\n    if (parallelize) {\n      galois::do_all(\n          galois::iterate(start, start + size),\n          [&](unsigned int n) {\n            unsigned int offset;\n            if (identity_offsets)\n              offset = n;\n            else\n              offset = offsets[n];\n            auto lid = indices[offset];\n            setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],\n                                              bit_set_compute);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n    } else {\n      for (unsigned int n = start; n < start + size; ++n) {\n        unsigned int offset;\n        if (identity_offsets)\n          offset = n;\n        else\n          offset = offsets[n];\n        auto lid = indices[offset];\n        setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],\n                                          bit_set_compute);\n      }\n    }\n  }\n\n  /**\n   * VECTOR BITSET VARIANT.\n   *\n   * Given data received from another host and information on which nodes\n   * to update, do the reduce/set of the received data to update local nodes.\n   * It will only update a single index of the vector specified by the\n   * sync structures at a time.\n   *\n   * Complement function, in some sense, of extractSubset, vector bitset\n   * variant.\n   *\n   * @tparam VecTy type of indices variable\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Reduce or broadcast\n   * @tparam identity_offsets If this is true, then ignore the offsets\n   * array and just grab directly from indices (i.e. 
don't pick out\n   * particular elements, just grab contiguous chunk)\n   * @tparam parallelize True if updates to nodes are to be parallelized\n   * @tparam vecSync Only set to true if the field being synchronized is a\n   * vector. MUST BE SET TO TRUE FOR THIS FUNCTION TO COMPILE\n   *\n   * @param loopName name of loop used to name timer\n   * @param indices Local ids of nodes that we are interested in\n   * @param size Number of elements to set\n   * @param offsets Holds offsets into \"indices\" of the data that we are\n   * interested in\n   * @param val_vec holds data we will use to set\n   * @param bit_set_compute bitset indicating which nodes have changed\n   * @param vecIndex which element of the vector to set in the node\n   * @param start Offset into val_vec to start saving data to\n   */\n  // TODO find a better way to have this variant without code duplication\n  template <typename IndicesVecTy, typename FnTy, SyncType syncType,\n            typename VecTy, bool async, bool identity_offsets = false,\n            bool parallelize = true, bool vecSync = false,\n            typename std::enable_if<vecSync>::type* = nullptr>\n  void setSubset(const std::string& loopName, const IndicesVecTy& indices,\n                 size_t size,\n                 const galois::PODResizeableArray<unsigned int>& offsets,\n                 VecTy& val_vec, galois::DynamicBitSet& bit_set_compute,\n                 unsigned vecIndex, size_t start = 0) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string doall_str(syncTypeStr + \"SetValVector_\" +\n                          get_run_identifier(loopName));\n\n    if (parallelize) {\n      galois::do_all(\n          galois::iterate(start, start + size),\n          [&](unsigned int n) {\n            unsigned int offset;\n            if (identity_offsets)\n              offset = n;\n            else\n              offset = offsets[n];\n            auto lid = indices[offset];\n            setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],\n                                              bit_set_compute, vecIndex);\n          },\n#if GALOIS_COMM_STATS\n          galois::loopname(get_run_identifier(doall_str).c_str()),\n#endif\n          galois::no_stats());\n    } else {\n      for (unsigned int n = start; n < start + size; ++n) {\n        unsigned int offset;\n        if (identity_offsets)\n          offset = n;\n        else\n          offset = offsets[n];\n        auto lid = indices[offset];\n        setWrapper<FnTy, syncType, async>(lid, val_vec[n - start],\n                                          bit_set_compute, vecIndex);\n      }\n    }\n  }\n\n  /**\n   * GPU wrapper function to reduce multiple nodes at once.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Must be reduce\n   *\n   * @param x node id to set\n   * @param v\n   *\n   * @returns true if called on GPU device\n   */\n  template <typename FnTy, SyncType syncType, bool async>\n  inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b) {\n    if (syncType == syncReduce) {\n      return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset());\n    } else {\n      if (async) {\n        return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset());\n      } else {\n        return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset());\n      }\n    }\n  }\n\n  /**\n   * GPU wrapper function to reduce multiple nodes at 
once. More detailed\n   * arguments.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   * @tparam SyncType Must be reduce\n   *\n   * @param x node id to set\n   * @param b\n   * @param o\n   * @param v\n   * @param s\n   * @param data_mode\n   *\n   * @returns true if called on GPU device\n   */\n  template <typename FnTy, SyncType syncType, bool async>\n  inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b,\n                              DataCommMode& data_mode) {\n    if (syncType == syncReduce) {\n      return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset(),\n                                data_mode);\n    } else {\n      if (async) {\n        return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset(),\n                                         data_mode);\n      } else {\n        return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset(),\n                                  data_mode);\n      }\n    }\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Sends\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Non-bitset extract that uses serializelazy to copy data over to the\n   * buffer. 
REQUIRES that the ValTy be memory copyable.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam syncFnTy struct that has info on how to do synchronization\n   *\n   * @param loopName loop name used for timers\n   * @param from_id\n   * @param indices Vector that contains node ids of nodes that we will\n   * potentially send things to\n   * @param b OUTPUT: buffer that will be sent over the network; contains data\n   * based on set bits in bitset\n   */\n  template <SyncType syncType, typename SyncFnTy, typename VecTy, bool async,\n            typename std::enable_if<galois::runtime::is_memory_copyable<\n                typename SyncFnTy::ValTy>::value>::type* = nullptr>\n  void syncExtract(std::string loopName, unsigned from_id,\n                   std::vector<size_t>& indices,\n                   galois::runtime::SendBuffer& b) {\n    uint32_t num = indices.size();\n    static VecTy val_vec; // sometimes wasteful\n    galois::PODResizeableArray<unsigned int>& offsets = syncOffsets;\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string extract_timer_str(syncTypeStr + \"Extract_\" +\n                                  get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textract(extract_timer_str.c_str(),\n                                                      RNAME);\n    std::string extract_batch_timer_str(syncTypeStr + \"ExtractBatch_\" +\n                                        get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textractbatch(\n        extract_batch_timer_str.c_str(), RNAME);\n\n    DataCommMode data_mode;\n\n    Textract.start();\n\n    if (num > 0) {\n      data_mode = onlyData;\n      b.reserve(sizeof(DataCommMode) + sizeof(size_t) +\n                (num * sizeof(typename SyncFnTy::ValTy)));\n\n      Textractbatch.start();\n      bool batch_succeeded =\n          extractBatchWrapper<SyncFnTy, syncType>(from_id, b);\n      Textractbatch.stop();\n\n      if (!batch_succeeded) {\n        b.resize(0);\n        val_vec.reserve(maxSharedSize);\n        val_vec.resize(num);\n        gSerialize(b, onlyData);\n        auto lseq = gSerializeLazySeq(\n            b, num,\n            (galois::PODResizeableArray<typename SyncFnTy::ValTy>*)nullptr);\n        extractSubset<SyncFnTy, decltype(lseq), syncType, true, true>(\n            loopName, indices, num, offsets, b, lseq);\n      } else {\n        b.resize(sizeof(DataCommMode) + sizeof(size_t) +\n                 (num * sizeof(typename SyncFnTy::ValTy)));\n      }\n    } else {\n      data_mode = noData;\n      b.resize(0);\n      if (!async) {\n        gSerialize(b, noData);\n      }\n    }\n\n    Textract.stop();\n\n    std::string metadata_str(syncTypeStr + \"MetadataMode_\" +\n                             std::to_string(data_mode) + \"_\" +\n                             get_run_identifier(loopName));\n    galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(RNAME, metadata_str,\n                                                           
 1);\n  }\n\n  /**\n   * Non-bitset extract for when the type of the item being sync'd isn't\n   * memory copyable.\n   *\n   * Extracts all of the data for all nodes in indices and saves it into\n   * a send buffer for return.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam syncFnTy struct that has info on how to do synchronization\n   *\n   * @param loopName loop name used for timers\n   * @param from_id\n   * @param indices Vector that contains node ids of nodes that we will\n   * potentially send things to\n   * @param b OUTPUT: buffer that will be sent over the network; contains data\n   * based on set bits in bitset\n   */\n  template <SyncType syncType, typename SyncFnTy, typename VecTy, bool async,\n            typename std::enable_if<!galois::runtime::is_memory_copyable<\n                typename SyncFnTy::ValTy>::value>::type* = nullptr>\n  void syncExtract(std::string loopName, unsigned from_id,\n                   std::vector<size_t>& indices,\n                   galois::runtime::SendBuffer& b) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string extract_timer_str(syncTypeStr + \"Extract_\" +\n                                  get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textract(extract_timer_str.c_str(),\n                                                      RNAME);\n    std::string extract_batch_timer_str(syncTypeStr + \"ExtractBatch_\" +\n                                        get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textractbatch(\n        extract_batch_timer_str.c_str(), RNAME);\n\n    DataCommMode data_mode;\n\n    uint32_t num = indices.size();\n    static VecTy val_vec; // sometimes wasteful\n    static galois::PODResizeableArray<unsigned int> dummyVector;\n\n    Textract.start();\n\n    if (num > 0) {\n      data_mode = onlyData;\n      b.reserve(sizeof(DataCommMode) + sizeof(size_t) +\n                (num * sizeof(typename SyncFnTy::ValTy)));\n\n      Textractbatch.start();\n      bool batch_succeeded =\n          extractBatchWrapper<SyncFnTy, syncType>(from_id, b);\n      Textractbatch.stop();\n\n      if (!batch_succeeded) {\n        b.resize(0);\n        val_vec.reserve(maxSharedSize);\n        val_vec.resize(num);\n        // get everything (note I pass in \"indices\" as offsets as it won't\n        // even get used anyways)\n        extractSubset<SyncFnTy, syncType, VecTy, true, true>(\n            loopName, indices, num, dummyVector, val_vec);\n        gSerialize(b, onlyData, val_vec);\n      } else {\n        b.resize(sizeof(DataCommMode) + sizeof(size_t) +\n                 (num * sizeof(typename SyncFnTy::ValTy)));\n      }\n\n    } else {\n      b.resize(0);\n      if (!async) {\n        data_mode = noData;\n        gSerialize(b, noData);\n      }\n    }\n\n    Textract.stop();\n\n    std::string metadata_str(syncTypeStr + \"MetadataMode_\" +\n                             std::to_string(data_mode) + \"_\" +\n                             get_run_identifier(loopName));\n    
galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(RNAME, metadata_str,\n                                                            1);\n  }\n\n  /**\n   * Extracts the data that will be sent to a host in this round of\n   * synchronization based on the passed in bitset and saves it to a\n   * send buffer.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam syncFnTy struct that has info on how to do synchronization\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   * being used for the extraction\n   *\n   * @param loopName loop name used for timers\n   * @param from_id\n   * @param indices Vector that contains node ids of nodes that we will\n   * potentially send things to\n   * @param b OUTPUT: buffer that will be sent over the network; contains data\n   * based on set bits in bitset\n   */\n  template <\n      SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy,\n      bool async,\n      typename std::enable_if<!BitsetFnTy::is_vector_bitset()>::type* = nullptr>\n  void syncExtract(std::string loopName, unsigned from_id,\n                   std::vector<size_t>& indices,\n                   galois::runtime::SendBuffer& b) {\n    uint32_t num                        = indices.size();\n    galois::DynamicBitSet& bit_set_comm = syncBitset;\n    static VecTy val_vec; // sometimes wasteful\n    galois::PODResizeableArray<unsigned int>& offsets = syncOffsets;\n\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string extract_timer_str(syncTypeStr + \"Extract_\" +\n                                  get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textract(extract_timer_str.c_str(),\n                                                      RNAME);\n    std::string extract_alloc_timer_str(syncTypeStr + \"ExtractAlloc_\" +\n                                        get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textractalloc(\n        extract_alloc_timer_str.c_str(), RNAME);\n    std::string extract_batch_timer_str(syncTypeStr + \"ExtractBatch_\" +\n                                        get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textractbatch(\n        extract_batch_timer_str.c_str(), RNAME);\n\n    DataCommMode data_mode;\n\n    Textract.start();\n\n    if (num > 0) {\n      size_t bit_set_count = 0;\n      Textractalloc.start();\n      b.reserve(getMaxSendBufferSize<SyncFnTy>(num));\n      Textractalloc.stop();\n\n      Textractbatch.start();\n      bool batch_succeeded = extractBatchWrapper<SyncFnTy, syncType>(\n          from_id, b, bit_set_count, data_mode);\n      Textractbatch.stop();\n\n      // GPUs have a batch function they can use; CPUs do not; therefore,\n      // CPUS always enter this if block\n      if (!batch_succeeded) {\n        Textractalloc.start();\n        b.resize(0);\n        bit_set_comm.reserve(maxSharedSize);\n        offsets.reserve(maxSharedSize);\n        val_vec.reserve(maxSharedSize);\n        bit_set_comm.resize(num);\n        offsets.resize(num);\n        val_vec.resize(num);\n        Textractalloc.stop();\n        const galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get();\n\n        getBitsetAndOffsets<SyncFnTy, syncType>(\n            loopName, indices, bit_set_compute, bit_set_comm, offsets,\n            bit_set_count, data_mode);\n\n        if (data_mode == onlyData) {\n          bit_set_count = 
indices.size();\n          extractSubset<SyncFnTy, syncType, VecTy, true, true>(\n              loopName, indices, bit_set_count, offsets, val_vec);\n        } else if (data_mode !=\n                   noData) { // bitsetData or offsetsData or gidsData\n          extractSubset<SyncFnTy, syncType, VecTy, false, true>(\n              loopName, indices, bit_set_count, offsets, val_vec);\n        }\n        serializeMessage<async, syncType>(loopName, data_mode, bit_set_count,\n                                          indices, offsets, bit_set_comm,\n                                          val_vec, b);\n      } else {\n        if (data_mode == noData) {\n          b.resize(0);\n          if (!async) {\n            gSerialize(b, data_mode);\n          }\n        } else if (data_mode == gidsData) {\n          b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                   sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) +\n                   sizeof(size_t) +\n                   (bit_set_count * sizeof(typename SyncFnTy::ValTy)));\n        } else if (data_mode == offsetsData) {\n          b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                   sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) +\n                   sizeof(size_t) +\n                   (bit_set_count * sizeof(typename SyncFnTy::ValTy)));\n        } else if (data_mode == bitsetData) {\n          size_t bitset_alloc_size = ((num + 63) / 64) * sizeof(uint64_t);\n          b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) +\n                   sizeof(size_t)   // bitset size\n                   + sizeof(size_t) // bitset vector size\n                   + bitset_alloc_size + sizeof(size_t) +\n                   (bit_set_count * sizeof(typename SyncFnTy::ValTy)));\n        } else { // onlyData\n          b.resize(sizeof(DataCommMode) + sizeof(size_t) +\n                   (num * sizeof(typename SyncFnTy::ValTy)));\n        }\n      }\n\n      
reportRedundantSize<SyncFnTy>(loopName, syncTypeStr, num, bit_set_count,\n                                    bit_set_comm);\n    } else {\n      data_mode = noData;\n      b.resize(0);\n      if (!async) {\n        gSerialize(b, noData);\n      }\n    }\n\n    Textract.stop();\n\n    std::string metadata_str(syncTypeStr + \"MetadataMode_\" +\n                             std::to_string(data_mode) + \"_\" +\n                             get_run_identifier(loopName));\n    galois::runtime::reportStatCond_Single<MORE_DIST_STATS>(RNAME, metadata_str,\n                                                            1);\n  }\n\n  /**\n   * Vector bitset variant.\n   *\n   * Extracts the data that will be sent to a host in this round of\n   * synchronization based on the passed in bitset and saves it to a\n   * send buffer. Unlike other variants, this will extract an entire\n   * vector element by element.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam syncFnTy struct that has info on how to do synchronization\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   * being used for the extraction. 
MUST BE A VECTOR BITSET\n   *\n   * @param loopName loop name used for timers\n   * @param from_id\n   * @param indices Vector that contains node ids of nodes that we will\n   * potentially send things to\n   * @param b OUTPUT: buffer that will be sent over the network; contains data\n   * based on set bits in bitset\n   */\n  template <\n      SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy,\n      bool async,\n      typename std::enable_if<BitsetFnTy::is_vector_bitset()>::type* = nullptr>\n  void syncExtract(std::string loopName, unsigned, std::vector<size_t>& indices,\n                   galois::runtime::SendBuffer& b) {\n    uint32_t num                        = indices.size();\n    galois::DynamicBitSet& bit_set_comm = syncBitset;\n    static VecTy val_vec; // sometimes wasteful\n    galois::PODResizeableArray<unsigned int>& offsets = syncOffsets;\n\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    std::string extract_timer_str(syncTypeStr + \"ExtractVector_\" +\n                                  get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Textract(extract_timer_str.c_str(),\n                                                      RNAME);\n\n    Textract.start();\n\n    if (num > 0) {\n      bit_set_comm.reserve(maxSharedSize);\n      offsets.reserve(maxSharedSize);\n      val_vec.reserve(maxSharedSize);\n      bit_set_comm.resize(num);\n      offsets.resize(num);\n      val_vec.resize(num);\n    }\n\n    DataCommMode data_mode;\n    // loop over all bitsets in the vector of bitsets; each one corresponds to\n    // a different index in the vector field we are synchronizing\n    for (unsigned i = 0; i < BitsetFnTy::numBitsets(); i++) {\n      if (num > 0) {\n        bit_set_comm.reset();\n\n        size_t bit_set_count = 0;\n\n        // No GPU support currently\n        const galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(i);\n\n        
getBitsetAndOffsets<SyncFnTy, syncType>(\n            loopName, indices, bit_set_compute, bit_set_comm, offsets,\n            bit_set_count, data_mode);\n\n        // note the extra template argument which specifies that this is a\n        // vector extract, i.e. get element i of the vector (i passed in as\n        // argument as well)\n        if (data_mode == onlyData) {\n          // galois::gInfo(id, \" node \", i, \" has data to send\");\n          bit_set_count = indices.size();\n          extractSubset<SyncFnTy, syncType, VecTy, true, true, true>(\n              loopName, indices, bit_set_count, offsets, val_vec, i);\n        } else if (data_mode !=\n                   noData) { // bitsetData or offsetsData or gidsData\n          // galois::gInfo(id, \" node \", i, \" has data to send\");\n          extractSubset<SyncFnTy, syncType, VecTy, false, true, true>(\n              loopName, indices, bit_set_count, offsets, val_vec, i);\n        }\n\n        reportRedundantSize<SyncFnTy>(loopName, syncTypeStr, num, bit_set_count,\n                                      bit_set_comm);\n        serializeMessage<async, syncType>(loopName, data_mode, bit_set_count,\n                                          indices, offsets, bit_set_comm,\n                                          val_vec, b);\n      } else {\n        if (!async) { // TODO: is this fine?\n          // append noData for however many bitsets there are\n          gSerialize(b, noData);\n        }\n      }\n    }\n\n    Textract.stop();\n\n    // FIXME report metadata mode for the different bitsets?\n    // std::string metadata_str(syncTypeStr + \"_METADATA_MODE\" +\n    //                         std::to_string(data_mode) +\n    //                         get_run_identifier(loopName));\n    // galois::runtime::reportStat_Single(RNAME, metadata_str, 1);\n  }\n\n#ifdef GALOIS_USE_BARE_MPI\n  /**\n   * Sync using MPI instead of network layer.\n   */\n  template <WriteLocation writeLocation, ReadLocation 
readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void sync_mpi_send(std::string loopName) {\n    static std::vector<galois::runtime::SendBuffer> b;\n    static std::vector<MPI_Request> request;\n    b.resize(numHosts);\n    request.resize(numHosts, MPI_REQUEST_NULL);\n\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + h) % numHosts;\n\n      if (nothingToSend(x, syncType, writeLocation, readLocation))\n        continue;\n\n      int ready = 0;\n      MPI_Test(&request[x], &ready, MPI_STATUS_IGNORE);\n      if (!ready) {\n        assert(b[x].size() > 0);\n        MPI_Wait(&request[x], MPI_STATUS_IGNORE);\n      }\n      if (b[x].size() > 0) {\n        b[x].getVec().clear();\n      }\n\n      getSendBuffer<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(loopName, x,\n                                                                  b[x]);\n\n      MPI_Isend((uint8_t*)b[x].linearData(), b[x].size(), MPI_BYTE, x, 32767,\n                MPI_COMM_WORLD, &request[x]);\n    }\n\n    if (BitsetFnTy::is_valid()) {\n      reset_bitset(syncType, &BitsetFnTy::reset_range);\n    }\n  }\n\n  /**\n   * Sync put using MPI instead of network layer\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void sync_mpi_put(std::string loopName, const MPI_Group& mpi_access_group,\n                    const std::vector<MPI_Win>& window) {\n\n    MPI_Win_start(mpi_access_group, 0, window[id]);\n\n    std::vector<galois::runtime::SendBuffer> b(numHosts);\n    std::vector<size_t> size(numHosts);\n    uint64_t send_buffers_size = 0;\n\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + h) % numHosts;\n\n      if (nothingToSend(x, syncType, writeLocation, readLocation))\n        continue;\n\n      getSendBuffer<syncType, SyncFnTy, 
BitsetFnTy, VecTy, async>(loopName, x,\n                                                                  b[x]);\n\n      size[x] = b[x].size();\n      send_buffers_size += size[x];\n      MPI_Put((uint8_t*)&size[x], sizeof(size_t), MPI_BYTE, x, 0,\n              sizeof(size_t), MPI_BYTE, window[id]);\n      MPI_Put((uint8_t*)b[x].linearData(), size[x], MPI_BYTE, x, sizeof(size_t),\n              size[x], MPI_BYTE, window[id]);\n    }\n\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    net.incrementMemUsage(send_buffers_size);\n\n    MPI_Win_complete(window[id]);\n    net.decrementMemUsage(send_buffers_size);\n\n    if (BitsetFnTy::is_valid()) {\n      reset_bitset(syncType, &BitsetFnTy::reset_range);\n    }\n  }\n#endif\n\n  /**\n   * Sends data to all hosts (if there is anything that needs to be sent\n   * to that particular host) and adjusts bitset according to sync type.\n   *\n   * @tparam writeLocation Location data is written (src or dst)\n   * @tparam readLocation Location data is read (src or dst)\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has information needed to access bitset\n   *\n   * @param loopName used to name timers created by this sync send\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void syncNetSend(std::string loopName) {\n    static galois::runtime::SendBuffer\n        b; // although a static variable, allocation not reused\n           // due to std::move in net.sendTagged()\n\n    auto& net               = galois::runtime::getSystemNetworkInterface();\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string statNumMessages_str(syncTypeStr + \"NumMessages_\" +\n                                    get_run_identifier(loopName));\n\n    size_t numMessages = 0;\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + h) % numHosts;\n\n      if (nothingToSend(x, syncType, writeLocation, readLocation))\n        continue;\n\n      getSendBuffer<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(loopName, x,\n                                                                  b);\n\n      if ((!async) || (b.size() > 0)) {\n        size_t syncTypePhase = 0;\n        if (async && (syncType == syncBroadcast))\n          syncTypePhase = 1;\n        net.sendTagged(x, galois::runtime::evilPhase, b, syncTypePhase);\n        ++numMessages;\n      }\n    }\n    if (!async) {\n      // Will force all messages to be processed before continuing\n      net.flush();\n    }\n\n    if (BitsetFnTy::is_valid()) {\n      reset_bitset(syncType, &BitsetFnTy::reset_range);\n    }\n\n    galois::runtime::reportStat_Tsum(RNAME, statNumMessages_str, numMessages);\n  }\n\n  /**\n   * Sends data over the network to other hosts based on the provided template\n   * arguments.\n   *\n   * @tparam writeLocation Location data is written (src or dst)\n   * @tparam readLocation Location data is read (src or dst)\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void syncSend(std::string loopName) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    galois::CondStatTimer<GALOIS_COMM_STATS> TSendTime(\n        (syncTypeStr + \"Send_\" + get_run_identifier(loopName)).c_str(), RNAME);\n\n    TSendTime.start();\n    syncNetSend<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,\n                VecTy, async>(loopName);\n    TSendTime.stop();\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Receives\n  ////////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Deserializes messages from other hosts and applies them to update local\n   * data based on the provided sync structures.\n   *\n   * Complement of syncExtract.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param from_id ID of host which the message we are processing was received\n   * from\n   * @param buf Buffer that contains received message from other host\n   * @param loopName used to name timers for statistics\n   */\n  template <\n      SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy,\n      bool async,\n      typename std::enable_if<!BitsetFnTy::is_vector_bitset()>::type* = nullptr>\n  size_t syncRecvApply(uint32_t from_id, galois::runtime::RecvBuffer& buf,\n                       std::string loopName) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    std::string set_timer_str(syncTypeStr + \"Set_\" +\n                              get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Tset(set_timer_str.c_str(), RNAME);\n    std::string set_batch_timer_str(syncTypeStr + \"SetBatch_\" +\n                                    get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Tsetbatch(\n        set_batch_timer_str.c_str(), RNAME);\n\n    galois::DynamicBitSet& bit_set_comm = syncBitset;\n    static VecTy val_vec;\n    galois::PODResizeableArray<unsigned int>& offsets = syncOffsets;\n\n    auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes;\n    uint32_t num      = sharedNodes[from_id].size();\n    size_t retval     = 0;\n\n    Tset.start();\n\n    if (num > 0) { // only enter if we expect message from that host\n      DataCommMode data_mode;\n      // 1st deserialize gets data mode\n      galois::runtime::gDeserialize(buf, data_mode);\n\n      if (data_mode != noData) {\n        // GPU update call\n        Tsetbatch.start();\n        bool batch_succeeded =\n            setBatchWrapper<SyncFnTy, syncType, async>(from_id, buf, data_mode);\n        Tsetbatch.stop();\n\n        // cpu always enters this block\n        if (!batch_succeeded) {\n          size_t bit_set_count = num;\n          size_t buf_start     = 0;\n\n          // deserialize the rest of the data in the buffer depending on the\n          // data mode; arguments passed in here are mostly output vars\n          deserializeMessage<syncType>(loopName, data_mode, num, buf,\n                                       bit_set_count, offsets, bit_set_comm,\n                                       buf_start, retval, val_vec);\n\n          bit_set_comm.reserve(maxSharedSize);\n          offsets.reserve(maxSharedSize);\n          val_vec.reserve(maxSharedSize);\n\n          galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get();\n\n          if (data_mode == 
bitsetData) {\n            size_t bit_set_count2;\n            getOffsetsFromBitset<syncType>(loopName, bit_set_comm, offsets,\n                                           bit_set_count2);\n            assert(bit_set_count == bit_set_count2);\n          }\n\n          if (data_mode == onlyData) {\n            setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,\n                      async, true, true>(loopName, sharedNodes[from_id],\n                                         bit_set_count, offsets, val_vec,\n                                         bit_set_compute);\n          } else if (data_mode == dataSplit || data_mode == dataSplitFirst) {\n            setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,\n                      async, true, true>(loopName, sharedNodes[from_id],\n                                         bit_set_count, offsets, val_vec,\n                                         bit_set_compute, buf_start);\n          } else if (data_mode == gidsData) {\n            setSubset<decltype(offsets), SyncFnTy, syncType, VecTy, async, true,\n                      true>(loopName, offsets, bit_set_count, offsets, val_vec,\n                            bit_set_compute);\n          } else { // bitsetData or offsetsData\n            setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,\n                      async, false, true>(loopName, sharedNodes[from_id],\n                                          bit_set_count, offsets, val_vec,\n                                          bit_set_compute);\n          }\n          // TODO: reduce could update the bitset, so it needs to be copied\n          // back to the device\n        }\n      }\n    }\n\n    Tset.stop();\n\n    return retval;\n  }\n\n  /**\n   * VECTOR BITSET VARIANT.\n   *\n   * Deserializes messages from other hosts and applies them to update local\n   * data based on the provided sync structures. 
Each message will contain\n   * a series of messages that must be deserialized (the number of such\n   * messages corresponds to the size of the vector that is being synchronized).\n   *\n   * Complement of syncExtract, vector bitset version.\n   *\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   * MUST BE VECTOR BITSET\n   *\n   * @param from_id ID of host which the message we are processing was received\n   * from\n   * @param buf Buffer that contains received message from other host\n   * @param loopName used to name timers for statistics\n   */\n  template <\n      SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy,\n      bool async,\n      typename std::enable_if<BitsetFnTy::is_vector_bitset()>::type* = nullptr>\n  size_t syncRecvApply(uint32_t from_id, galois::runtime::RecvBuffer& buf,\n                       std::string loopName) {\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    std::string set_timer_str(syncTypeStr + \"SetVector_\" +\n                              get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Tset(set_timer_str.c_str(), RNAME);\n\n    galois::DynamicBitSet& bit_set_comm = syncBitset;\n    static VecTy val_vec;\n    galois::PODResizeableArray<unsigned int>& offsets = syncOffsets;\n\n    auto& sharedNodes = (syncType == syncReduce) ? 
masterNodes : mirrorNodes;\n    uint32_t num      = sharedNodes[from_id].size();\n    size_t retval     = 0;\n\n    Tset.start();\n\n    if (num > 0) { // only enter if we expect message from that host\n      for (unsigned i = 0; i < BitsetFnTy::numBitsets(); i++) {\n        DataCommMode data_mode;\n        // 1st deserialize gets data mode\n        galois::runtime::gDeserialize(buf, data_mode);\n\n        if (data_mode != noData) {\n          size_t bit_set_count = num;\n          size_t buf_start     = 0;\n\n          // deserialize the rest of the data in the buffer depending on the\n          // data mode; arguments passed in here are mostly output vars\n          deserializeMessage<syncType>(loopName, data_mode, num, buf,\n                                       bit_set_count, offsets, bit_set_comm,\n                                       buf_start, retval, val_vec);\n\n          galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(i);\n\n          if (data_mode == bitsetData) {\n            size_t bit_set_count2;\n            getOffsetsFromBitset<syncType>(loopName, bit_set_comm, offsets,\n                                           bit_set_count2);\n            assert(bit_set_count == bit_set_count2);\n          }\n\n          // Note the extra template argument and i argument which cause\n          // execution to deal with a particular element of the vector field\n          // we are synchronizing\n          if (data_mode == onlyData) {\n            setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,\n                      async, true, true, true>(loopName, sharedNodes[from_id],\n                                               bit_set_count, offsets, val_vec,\n                                               bit_set_compute, i);\n          } else if (data_mode == dataSplit || data_mode == dataSplitFirst) {\n            setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,\n                      true, async, true, true, 
true>(\n                loopName, sharedNodes[from_id], bit_set_count, offsets, val_vec,\n                bit_set_compute, i, buf_start);\n          } else if (data_mode == gidsData) {\n            setSubset<decltype(offsets), SyncFnTy, syncType, VecTy, async, true,\n                      true, true>(loopName, offsets, bit_set_count, offsets,\n                                  val_vec, bit_set_compute, i);\n          } else { // bitsetData or offsetsData\n            setSubset<decltype(sharedNodes[from_id]), SyncFnTy, syncType, VecTy,\n                      async, false, true, true>(loopName, sharedNodes[from_id],\n                                                bit_set_count, offsets, val_vec,\n                                                bit_set_compute, i);\n          }\n        }\n      }\n    }\n\n    Tset.stop();\n\n    return retval;\n  }\n\n#ifdef GALOIS_USE_BARE_MPI\n  /**\n   * MPI Irecv wrapper for sync\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy>\n  void sync_mpi_recv_post(std::vector<MPI_Request>& request,\n                          const std::vector<std::vector<uint8_t>>& rb) {\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + numHosts - h) % numHosts;\n      if (nothingToRecv(x, syncType, writeLocation, readLocation))\n        continue;\n\n      MPI_Irecv((uint8_t*)rb[x].data(), rb[x].size(), MPI_BYTE, x, 32767,\n                MPI_COMM_WORLD, &request[x]);\n    }\n  }\n\n  /**\n   * MPI receive wrapper for sync\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void sync_mpi_recv_wait(std::string loopName,\n                          std::vector<MPI_Request>& request,\n                          const std::vector<std::vector<uint8_t>>& rb) {\n    for (unsigned h = 1; h < numHosts; ++h) 
{\n      unsigned x = (id + numHosts - h) % numHosts;\n      if (nothingToRecv(x, syncType, writeLocation, readLocation))\n        continue;\n\n      MPI_Status status;\n      MPI_Wait(&request[x], &status);\n\n      int size = 0;\n      MPI_Get_count(&status, MPI_BYTE, &size);\n\n      galois::runtime::RecvBuffer rbuf(rb[x].begin(), rb[x].begin() + size);\n\n      syncRecvApply<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(x, rbuf,\n                                                                  loopName);\n    }\n  }\n\n  /**\n   * MPI get wrapper for sync\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void sync_mpi_get(std::string loopName, const std::vector<MPI_Win>& window,\n                    const std::vector<std::vector<uint8_t>>& rb) {\n    for (unsigned h = 1; h < numHosts; ++h) {\n      unsigned x = (id + numHosts - h) % numHosts;\n      if (nothingToRecv(x, syncType, writeLocation, readLocation))\n        continue;\n\n      MPI_Win_wait(window[x]);\n\n      size_t size = 0;\n      memcpy(&size, rb[x].data(), sizeof(size_t));\n\n      galois::runtime::RecvBuffer rbuf(rb[x].begin() + sizeof(size_t),\n                                       rb[x].begin() + sizeof(size_t) + size);\n\n      MPI_Win_post(mpi_identity_groups[x], 0, window[x]);\n\n      syncRecvApply<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(x, rbuf,\n                                                                  loopName);\n    }\n  }\n#endif\n\n  /**\n   * Determines if there is anything to receive from a host and receives/applies\n   * the messages.\n   *\n   * @tparam writeLocation Location data is written (src or dst)\n   * @tparam readLocation Location data is read (src or dst)\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct 
that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void syncNetRecv(std::string loopName) {\n    auto& net = galois::runtime::getSystemNetworkInterface();\n    std::string wait_timer_str(\"Wait_\" + get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> Twait(wait_timer_str.c_str(),\n                                                   RNAME);\n\n    if (async) {\n      size_t syncTypePhase = 0;\n      if (syncType == syncBroadcast)\n        syncTypePhase = 1;\n      decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr,\n                                 syncTypePhase)) p;\n      do {\n        p = net.recieveTagged(galois::runtime::evilPhase, nullptr,\n                              syncTypePhase);\n\n        if (p) {\n          syncRecvApply<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(\n              p->first, p->second, loopName);\n        }\n      } while (p);\n    } else {\n      for (unsigned x = 0; x < numHosts; ++x) {\n        if (x == id)\n          continue;\n        if (nothingToRecv(x, syncType, writeLocation, readLocation))\n          continue;\n\n        Twait.start();\n        decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p;\n        do {\n          p = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n        } while (!p);\n        Twait.stop();\n\n        syncRecvApply<syncType, SyncFnTy, BitsetFnTy, VecTy, async>(\n            p->first, p->second, loopName);\n      }\n      incrementEvilPhase();\n    }\n  }\n\n  /**\n   * Receives messages from all other hosts and \"applies\" the message (reduce\n   * or set) based on the sync structure provided.\n   *\n   * @tparam writeLocation Location data is written (src or dst)\n   * @tparam readLocation Location 
data is read (src or dst)\n   * @tparam syncType either reduce or broadcast\n   * @tparam SyncFnTy synchronization structure with info needed to synchronize\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void syncRecv(std::string loopName) {\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    galois::CondStatTimer<GALOIS_COMM_STATS> TRecvTime(\n        (syncTypeStr + \"Recv_\" + get_run_identifier(loopName)).c_str(), RNAME);\n\n    TRecvTime.start();\n    syncNetRecv<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,\n                VecTy, async>(loopName);\n    TRecvTime.stop();\n  }\n\n////////////////////////////////////////////////////////////////////////////////\n// MPI sync variants\n////////////////////////////////////////////////////////////////////////////////\n#ifdef GALOIS_USE_BARE_MPI\n  /**\n   * Nonblocking MPI sync\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void syncNonblockingMPI(std::string loopName,\n                          bool use_bitset_to_send = true) {\n    std::string syncTypeStr = (syncType == syncReduce) ? 
\"Reduce\" : \"Broadcast\";\n    galois::CondStatTimer<GALOIS_COMM_STATS> TSendTime(\n        (syncTypeStr + \"Send_\" + get_run_identifier(loopName)).c_str(), RNAME);\n    galois::CondStatTimer<GALOIS_COMM_STATS> TRecvTime(\n        (syncTypeStr + \"Recv_\" + get_run_identifier(loopName)).c_str(), RNAME);\n\n    static std::vector<std::vector<uint8_t>> rb;\n    static std::vector<MPI_Request> request;\n\n    if (rb.size() == 0) { // create the receive buffers\n      TRecvTime.start();\n      auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes;\n      rb.resize(numHosts);\n      request.resize(numHosts, MPI_REQUEST_NULL);\n\n      for (unsigned h = 1; h < numHosts; ++h) {\n        unsigned x = (id + numHosts - h) % numHosts;\n        if (nothingToRecv(x, syncType, writeLocation, readLocation))\n          continue;\n\n        size_t size = getMaxSendBufferSize<SyncFnTy>(sharedNodes[x].size());\n        rb[x].resize(size);\n      }\n      TRecvTime.stop();\n    }\n\n    TRecvTime.start();\n    sync_mpi_recv_post<writeLocation, readLocation, syncType, SyncFnTy,\n                       BitsetFnTy>(request, rb);\n    TRecvTime.stop();\n\n    TSendTime.start();\n    if (use_bitset_to_send) {\n      sync_mpi_send<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,\n                    VecTy, async>(loopName);\n    } else {\n      sync_mpi_send<writeLocation, readLocation, syncType, SyncFnTy,\n                    galois::InvalidBitsetFnTy, VecTy, async>(loopName);\n    }\n    TSendTime.stop();\n\n    TRecvTime.start();\n    sync_mpi_recv_wait<writeLocation, readLocation, syncType, SyncFnTy,\n                       BitsetFnTy, VecTy, async>(loopName, request, rb);\n    TRecvTime.stop();\n  }\n\n  /**\n   * Onesided MPI sync\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            SyncType syncType, typename SyncFnTy, typename BitsetFnTy,\n            typename VecTy, bool async>\n  void syncOnesidedMPI(std::string 
loopName, bool use_bitset_to_send = true) {\n    std::string syncTypeStr = (syncType == syncReduce) ? \"Reduce\" : \"Broadcast\";\n    galois::CondStatTimer<GALOIS_COMM_STATS> TSendTime(\n        (syncTypeStr + \"Send_\" + get_run_identifier(loopName)).c_str(), RNAME);\n    galois::CondStatTimer<GALOIS_COMM_STATS> TRecvTime(\n        (syncTypeStr + \"Recv_\" + get_run_identifier(loopName)).c_str(), RNAME);\n\n    static std::vector<MPI_Win> window;\n    static MPI_Group mpi_access_group;\n    static std::vector<std::vector<uint8_t>> rb;\n\n    if (window.size() == 0) { // create the windows\n      TRecvTime.start();\n      auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes;\n      window.resize(numHosts);\n      rb.resize(numHosts);\n\n      uint64_t recv_buffers_size = 0;\n      for (unsigned x = 0; x < numHosts; ++x) {\n        size_t size = getMaxSendBufferSize<SyncFnTy>(sharedNodes[x].size());\n        rb[x].resize(size);\n        recv_buffers_size += size;\n\n        MPI_Info info;\n        MPI_Info_create(&info);\n        MPI_Info_set(info, \"no_locks\", \"true\");\n        MPI_Info_set(info, \"same_disp_unit\", \"true\");\n\n        MPI_Win_create(rb[x].data(), size, 1, info, MPI_COMM_WORLD, &window[x]);\n\n        MPI_Info_free(&info);\n      }\n      auto& net = galois::runtime::getSystemNetworkInterface();\n      net.incrementMemUsage(recv_buffers_size);\n\n      for (unsigned h = 1; h < numHosts; ++h) {\n        unsigned x = (id + numHosts - h) % numHosts;\n        if (nothingToRecv(x, syncType, writeLocation, readLocation))\n          continue;\n        // exposure group of each window is same as identity group of that\n        // window\n        MPI_Win_post(mpi_identity_groups[x], 0, window[x]);\n      }\n      TRecvTime.stop();\n\n      TSendTime.start();\n      std::vector<int> access_hosts;\n      for (unsigned h = 1; h < numHosts; ++h) {\n        unsigned x = (id + h) % numHosts;\n\n        if (nothingToSend(x, syncType, 
writeLocation, readLocation))\n          continue;\n\n        access_hosts.push_back(x);\n      }\n      MPI_Group world_group;\n      MPI_Comm_group(MPI_COMM_WORLD, &world_group);\n      // access group for only one window since only one window is accessed\n      MPI_Group_incl(world_group, access_hosts.size(), access_hosts.data(),\n                     &mpi_access_group);\n      TSendTime.stop();\n    }\n\n    TSendTime.start();\n    if (use_bitset_to_send) {\n      sync_mpi_put<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,\n                   VecTy, async>(loopName, mpi_access_group, window);\n    } else {\n      sync_mpi_put<writeLocation, readLocation, syncType, SyncFnTy,\n                   galois::InvalidBitsetFnTy, VecTy, async>(\n          loopName, mpi_access_group, window);\n    }\n    TSendTime.stop();\n\n    TRecvTime.start();\n    sync_mpi_get<writeLocation, readLocation, syncType, SyncFnTy, BitsetFnTy,\n                 VecTy, async>(loopName, window, rb);\n    TRecvTime.stop();\n  }\n#endif\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Higher Level Sync Calls (broadcast/reduce, etc)\n  ////////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Does a reduction of data from mirror nodes to master nodes.\n   *\n   * @tparam writeLocation Location data is written (src or dst)\n   * @tparam readLocation Location data is read (src or dst)\n   * @tparam ReduceFnTy reduce sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            typename ReduceFnTy, typename BitsetFnTy, bool async>\n  inline void reduce(std::string loopName) {\n    std::string timer_str(\"Reduce_\" + get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> 
TsyncReduce(timer_str.c_str(),\n                                                         RNAME);\n\n    typedef typename ReduceFnTy::ValTy T;\n    typedef\n        typename std::conditional<galois::runtime::is_memory_copyable<T>::value,\n                                  galois::PODResizeableArray<T>,\n                                  galois::gstl::Vector<T>>::type VecTy;\n\n    TsyncReduce.start();\n\n#ifdef GALOIS_USE_BARE_MPI\n    switch (bare_mpi) {\n    case noBareMPI:\n#endif\n      syncSend<writeLocation, readLocation, syncReduce, ReduceFnTy, BitsetFnTy,\n               VecTy, async>(loopName);\n      syncRecv<writeLocation, readLocation, syncReduce, ReduceFnTy, BitsetFnTy,\n               VecTy, async>(loopName);\n#ifdef GALOIS_USE_BARE_MPI\n      break;\n    case nonBlockingBareMPI:\n      syncNonblockingMPI<writeLocation, readLocation, syncReduce, ReduceFnTy,\n                         BitsetFnTy, VecTy, async>(loopName);\n      break;\n    case oneSidedBareMPI:\n      syncOnesidedMPI<writeLocation, readLocation, syncReduce, ReduceFnTy,\n                      BitsetFnTy, VecTy, async>(loopName);\n      break;\n    default:\n      GALOIS_DIE(\"unsupported bare MPI\");\n    }\n#endif\n\n    TsyncReduce.stop();\n  }\n\n  /**\n   * Does a broadcast of data from master to mirror nodes.\n   *\n   * @tparam writeLocation Location data is written (src or dst)\n   * @tparam readLocation Location data is read (src or dst)\n   * @tparam BroadcastFnTy broadcast sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            typename BroadcastFnTy, typename BitsetFnTy, bool async>\n  inline void broadcast(std::string loopName) {\n    std::string timer_str(\"Broadcast_\" + get_run_identifier(loopName));\n    galois::CondStatTimer<GALOIS_COMM_STATS> 
TsyncBroadcast(timer_str.c_str(),\n                                                            RNAME);\n\n    typedef typename BroadcastFnTy::ValTy T;\n    typedef\n        typename std::conditional<galois::runtime::is_memory_copyable<T>::value,\n                                  galois::PODResizeableArray<T>,\n                                  galois::gstl::Vector<T>>::type VecTy;\n\n    TsyncBroadcast.start();\n\n    bool use_bitset = true;\n\n    if (currentBVFlag != nullptr) {\n      if (readLocation == readSource &&\n          galois::runtime::src_invalid(*currentBVFlag)) {\n        use_bitset     = false;\n        *currentBVFlag = BITVECTOR_STATUS::NONE_INVALID;\n        currentBVFlag  = nullptr;\n      } else if (readLocation == readDestination &&\n                 galois::runtime::dst_invalid(*currentBVFlag)) {\n        use_bitset     = false;\n        *currentBVFlag = BITVECTOR_STATUS::NONE_INVALID;\n        currentBVFlag  = nullptr;\n      } else if (readLocation == readAny &&\n                 *currentBVFlag != BITVECTOR_STATUS::NONE_INVALID) {\n        // the bitvector flag being non-null means this call came from\n        // sync on demand; sync on demand will NEVER use readAny\n        // if location is read Any + one of src or dst is invalid\n        GALOIS_DIE(\"readAny + use of bitvector flag without none_invalid \"\n                   \"should never happen\");\n      }\n    }\n\n#ifdef GALOIS_USE_BARE_MPI\n    switch (bare_mpi) {\n    case noBareMPI:\n#endif\n      if (use_bitset) {\n        syncSend<writeLocation, readLocation, syncBroadcast, BroadcastFnTy,\n                 BitsetFnTy, VecTy, async>(loopName);\n      } else {\n        syncSend<writeLocation, readLocation, syncBroadcast, BroadcastFnTy,\n                 galois::InvalidBitsetFnTy, VecTy, async>(loopName);\n      }\n      syncRecv<writeLocation, readLocation, syncBroadcast, BroadcastFnTy,\n               BitsetFnTy, VecTy, async>(loopName);\n#ifdef GALOIS_USE_BARE_MPI\n      
break;\n    case nonBlockingBareMPI:\n      syncNonblockingMPI<writeLocation, readLocation, syncBroadcast,\n                         BroadcastFnTy, BitsetFnTy, VecTy, async>(loopName,\n                                                                  use_bitset);\n      break;\n    case oneSidedBareMPI:\n      syncOnesidedMPI<writeLocation, readLocation, syncBroadcast, BroadcastFnTy,\n                      BitsetFnTy, VecTy, async>(loopName, use_bitset);\n      break;\n    default:\n      GALOIS_DIE(\"unsupported bare MPI\");\n    }\n#endif\n\n    TsyncBroadcast.stop();\n  }\n\n  /**\n   * Do sync necessary for write source, read source.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_src_to_src(std::string loopName) {\n    // do nothing for OEC\n    // reduce and broadcast for IEC, CVC, UVC\n    if (transposed || isVertexCut) {\n      reduce<writeSource, readSource, SyncFnTy, BitsetFnTy, async>(loopName);\n      broadcast<writeSource, readSource, SyncFnTy, BitsetFnTy, async>(loopName);\n    }\n  }\n\n  /**\n   * Do sync necessary for write source, read destination.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_src_to_dst(std::string loopName) {\n    // only broadcast for OEC\n    // only reduce for IEC\n    // reduce and broadcast for CVC, UVC\n    if (transposed) {\n      reduce<writeSource, readDestination, SyncFnTy, BitsetFnTy, async>(\n          loopName);\n      if (isVertexCut) {\n        broadcast<writeSource, readDestination, SyncFnTy, BitsetFnTy, async>(\n            loopName);\n      
}\n    } else {\n      if (isVertexCut) {\n        reduce<writeSource, readDestination, SyncFnTy, BitsetFnTy, async>(\n            loopName);\n      }\n      broadcast<writeSource, readDestination, SyncFnTy, BitsetFnTy, async>(\n          loopName);\n    }\n  }\n\n  /**\n   * Do sync necessary for write source, read any.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_src_to_any(std::string loopName) {\n    // only broadcast for OEC\n    // reduce and broadcast for IEC, CVC, UVC\n    if (transposed || isVertexCut) {\n      reduce<writeSource, readAny, SyncFnTy, BitsetFnTy, async>(loopName);\n    }\n    broadcast<writeSource, readAny, SyncFnTy, BitsetFnTy, async>(loopName);\n  }\n\n  /**\n   * Do sync necessary for write dest, read source.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_dst_to_src(std::string loopName) {\n    // only reduce for OEC\n    // only broadcast for IEC\n    // reduce and broadcast for CVC, UVC\n    if (transposed) {\n      if (isVertexCut) {\n        reduce<writeDestination, readSource, SyncFnTy, BitsetFnTy, async>(\n            loopName);\n      }\n      broadcast<writeDestination, readSource, SyncFnTy, BitsetFnTy, async>(\n          loopName);\n    } else {\n      reduce<writeDestination, readSource, SyncFnTy, BitsetFnTy, async>(\n          loopName);\n      if (isVertexCut) {\n        broadcast<writeDestination, readSource, SyncFnTy, BitsetFnTy, async>(\n            loopName);\n      }\n    }\n  }\n\n  /**\n   * Do sync necessary for write dest, read dest.\n   *\n   * 
@tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_dst_to_dst(std::string loopName) {\n    // do nothing for IEC\n    // reduce and broadcast for OEC, CVC, UVC\n    if (!transposed || isVertexCut) {\n      reduce<writeDestination, readDestination, SyncFnTy, BitsetFnTy, async>(\n          loopName);\n      broadcast<writeDestination, readDestination, SyncFnTy, BitsetFnTy, async>(\n          loopName);\n    }\n  }\n\n  /**\n   * Do sync necessary for write dest, read any.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_dst_to_any(std::string loopName) {\n    // only broadcast for IEC\n    // reduce and broadcast for OEC, CVC, UVC\n    if (!transposed || isVertexCut) {\n      reduce<writeDestination, readAny, SyncFnTy, BitsetFnTy, async>(loopName);\n    }\n    broadcast<writeDestination, readAny, SyncFnTy, BitsetFnTy, async>(loopName);\n  }\n\n  /**\n   * Do sync necessary for write any, read src.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_any_to_src(std::string loopName) {\n    // only reduce for OEC\n    // reduce and broadcast for IEC, CVC, UVC\n    reduce<writeAny, readSource, SyncFnTy, BitsetFnTy, async>(loopName);\n    if (transposed || isVertexCut) {\n      broadcast<writeAny, readSource, SyncFnTy, BitsetFnTy, async>(loopName);\n    }\n  }\n\n  /**\n   * Do sync 
necessary for write any, read dst.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_any_to_dst(std::string loopName) {\n    // only reduce for IEC\n    // reduce and broadcast for OEC, CVC, UVC\n    reduce<writeAny, readDestination, SyncFnTy, BitsetFnTy, async>(loopName);\n\n    if (!transposed || isVertexCut) {\n      broadcast<writeAny, readDestination, SyncFnTy, BitsetFnTy, async>(\n          loopName);\n    }\n  }\n\n  /**\n   * Do sync necessary for write any, read any.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <typename SyncFnTy, typename BitsetFnTy, bool async>\n  inline void sync_any_to_any(std::string loopName) {\n    // reduce and broadcast for OEC, IEC, CVC, UVC\n    reduce<writeAny, readAny, SyncFnTy, BitsetFnTy, async>(loopName);\n    broadcast<writeAny, readAny, SyncFnTy, BitsetFnTy, async>(loopName);\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Public interface: sync\n  ////////////////////////////////////////////////////////////////////////////////\n\npublic:\n  /**\n   * Main sync call exposed to the user that calls the correct sync function\n   * based on provided template arguments. 
Must provide information through\n   * structures on how to do synchronization/which fields to synchronize.\n   *\n   * @tparam writeLocation Location data is written (src or dst)\n   * @tparam readLocation Location data is read (src or dst)\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct that has info on how to access the bitset\n   *\n   * @param loopName used to name timers for statistics\n   */\n  template <WriteLocation writeLocation, ReadLocation readLocation,\n            typename SyncFnTy, typename BitsetFnTy = galois::InvalidBitsetFnTy,\n            bool async = false>\n  inline void sync(std::string loopName) {\n    std::string timer_str(\"Sync_\" + loopName + \"_\" + get_run_identifier());\n    galois::StatTimer Tsync(timer_str.c_str(), RNAME);\n\n    Tsync.start();\n\n    if (partitionAgnostic) {\n      sync_any_to_any<SyncFnTy, BitsetFnTy, async>(loopName);\n    } else {\n      if (writeLocation == writeSource) {\n        if (readLocation == readSource) {\n          sync_src_to_src<SyncFnTy, BitsetFnTy, async>(loopName);\n        } else if (readLocation == readDestination) {\n          sync_src_to_dst<SyncFnTy, BitsetFnTy, async>(loopName);\n        } else { // readAny\n          sync_src_to_any<SyncFnTy, BitsetFnTy, async>(loopName);\n        }\n      } else if (writeLocation == writeDestination) {\n        if (readLocation == readSource) {\n          sync_dst_to_src<SyncFnTy, BitsetFnTy, async>(loopName);\n        } else if (readLocation == readDestination) {\n          sync_dst_to_dst<SyncFnTy, BitsetFnTy, async>(loopName);\n        } else { // readAny\n          sync_dst_to_any<SyncFnTy, BitsetFnTy, async>(loopName);\n        }\n      } else { // writeAny\n        if (readLocation == readSource) {\n          sync_any_to_src<SyncFnTy, BitsetFnTy, async>(loopName);\n        } else if (readLocation == readDestination) {\n          sync_any_to_dst<SyncFnTy, BitsetFnTy, async>(loopName);\n        } else { // readAny\n 
         sync_any_to_any<SyncFnTy, BitsetFnTy, async>(loopName);\n        }\n      }\n    }\n\n    Tsync.stop();\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Sync on demand code (unmaintained, may not work)\n  ////////////////////////////////////////////////////////////////////////////////\nprivate:\n  /**\n   * Generic Sync on demand handler. Should NEVER get to this (hence\n   * the galois die).\n   */\n  template <ReadLocation rl, typename SyncFnTy, typename BitsetFnTy>\n  struct SyncOnDemandHandler {\n    // note this call function signature is diff. from specialized versions:\n    // will cause compile time error if this struct is used (which is what\n    // we want)\n    void call() { GALOIS_DIE(\"invalid read location for sync on demand\"); }\n  };\n\n  /**\n   * Sync on demand handler specialized for read source.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy tells program what data needs to be sync'd\n   */\n  template <typename SyncFnTy, typename BitsetFnTy>\n  struct SyncOnDemandHandler<readSource, SyncFnTy, BitsetFnTy> {\n    /**\n     * Based on sync flags, handles syncs for cases when you need to read\n     * at source\n     *\n     * @param substrate sync substrate\n     * @param fieldFlags the flags structure specifying what needs to be\n     * sync'd\n     * @param loopName loopname used to name timers\n     * @param bvFlag Copy of the bitvector status (valid/invalid at particular\n     * locations)\n     */\n    static inline void call(GluonSubstrate* substrate,\n                            galois::runtime::FieldFlags& fieldFlags,\n                            std::string loopName, const BITVECTOR_STATUS&) {\n      if (fieldFlags.src_to_src() && fieldFlags.dst_to_src()) {\n        substrate->sync_any_to_src<SyncFnTy, BitsetFnTy>(loopName);\n      } else if (fieldFlags.src_to_src()) {\n        substrate->sync_src_to_src<SyncFnTy, BitsetFnTy>(loopName);\n      } 
else if (fieldFlags.dst_to_src()) {\n        substrate->sync_dst_to_src<SyncFnTy, BitsetFnTy>(loopName);\n      }\n\n      fieldFlags.clear_read_src();\n    }\n  };\n\n  /**\n   * Sync on demand handler specialized for read destination.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy tells program what data needs to be sync'd\n   */\n  template <typename SyncFnTy, typename BitsetFnTy>\n  struct SyncOnDemandHandler<readDestination, SyncFnTy, BitsetFnTy> {\n    /**\n     * Based on sync flags, handles syncs for cases when you need to read\n     * at destination\n     *\n     * @param substrate sync substrate\n     * @param fieldFlags the flags structure specifying what needs to be\n     * sync'd\n     * @param loopName loopname used to name timers\n     * @param bvFlag Copy of the bitvector status (valid/invalid at particular\n     * locations)\n     */\n    static inline void call(GluonSubstrate* substrate,\n                            galois::runtime::FieldFlags& fieldFlags,\n                            std::string loopName, const BITVECTOR_STATUS&) {\n      if (fieldFlags.src_to_dst() && fieldFlags.dst_to_dst()) {\n        substrate->sync_any_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n      } else if (fieldFlags.src_to_dst()) {\n        substrate->sync_src_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n      } else if (fieldFlags.dst_to_dst()) {\n        substrate->sync_dst_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n      }\n\n      fieldFlags.clear_read_dst();\n    }\n  };\n\n  /**\n   * Sync on demand handler specialized for read any.\n   *\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy tells program what data needs to be sync'd\n   */\n  template <typename SyncFnTy, typename BitsetFnTy>\n  struct SyncOnDemandHandler<readAny, SyncFnTy, BitsetFnTy> {\n    /**\n     * Based on sync flags, handles syncs for cases when you need to read\n     * at both source and destination\n     *\n     * @param substrate sync 
substrate\n     * @param fieldFlags the flags structure specifying what needs to be\n     * sync'd\n     * @param loopName loopname used to name timers\n     * @param bvFlag Copy of the bitvector status (valid/invalid at particular\n     * locations)\n     */\n    static inline void call(GluonSubstrate* substrate,\n                            galois::runtime::FieldFlags& fieldFlags,\n                            std::string loopName,\n                            const BITVECTOR_STATUS& bvFlag) {\n      bool src_write = fieldFlags.src_to_src() || fieldFlags.src_to_dst();\n      bool dst_write = fieldFlags.dst_to_src() || fieldFlags.dst_to_dst();\n\n      if (!(src_write && dst_write)) {\n        // src or dst write flags aren't set (potentially both are not set),\n        // but it's NOT the case that both are set, meaning \"any\" isn't\n        // required in the \"from\"; can work at granularity of just src\n        // write or dst write\n\n        if (src_write) {\n          if (fieldFlags.src_to_src() && fieldFlags.src_to_dst()) {\n            if (bvFlag == BITVECTOR_STATUS::NONE_INVALID) {\n              substrate->sync_src_to_any<SyncFnTy, BitsetFnTy>(loopName);\n            } else if (galois::runtime::src_invalid(bvFlag)) {\n              // src invalid bitset; sync individually so it can be called\n              // without bitset\n              substrate->sync_src_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n              substrate->sync_src_to_src<SyncFnTy, BitsetFnTy>(loopName);\n            } else if (galois::runtime::dst_invalid(bvFlag)) {\n              // dst invalid bitset; sync individually so it can be called\n              // without bitset\n              substrate->sync_src_to_src<SyncFnTy, BitsetFnTy>(loopName);\n              substrate->sync_src_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n            } else {\n              GALOIS_DIE(\"invalid bitvector flag setting in syncOnDemand\");\n            }\n          } else if (fieldFlags.src_to_src()) {\n    
        substrate->sync_src_to_src<SyncFnTy, BitsetFnTy>(loopName);\n          } else { // src to dst is set\n            substrate->sync_src_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n          }\n        } else if (dst_write) {\n          if (fieldFlags.dst_to_src() && fieldFlags.dst_to_dst()) {\n            if (bvFlag == BITVECTOR_STATUS::NONE_INVALID) {\n              substrate->sync_dst_to_any<SyncFnTy, BitsetFnTy>(loopName);\n            } else if (galois::runtime::src_invalid(bvFlag)) {\n              substrate->sync_dst_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n              substrate->sync_dst_to_src<SyncFnTy, BitsetFnTy>(loopName);\n            } else if (galois::runtime::dst_invalid(bvFlag)) {\n              substrate->sync_dst_to_src<SyncFnTy, BitsetFnTy>(loopName);\n              substrate->sync_dst_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n            } else {\n              GALOIS_DIE(\"invalid bitvector flag setting in syncOnDemand\");\n            }\n          } else if (fieldFlags.dst_to_src()) {\n            substrate->sync_dst_to_src<SyncFnTy, BitsetFnTy>(loopName);\n          } else { // dst to dst is set\n            substrate->sync_dst_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n          }\n        }\n\n        // note the \"no flags are set\" case will enter into this block\n        // as well, and it is correctly handled by doing nothing since\n        // both src/dst_write will be false\n      } else {\n        // it is the case that both src/dst write flags are set, so \"any\"\n        // is required in the \"from\"; what remains to be determined is\n        // the use of src, dst, or any for the destination of the sync\n        bool src_read = fieldFlags.src_to_src() || fieldFlags.dst_to_src();\n        bool dst_read = fieldFlags.src_to_dst() || fieldFlags.dst_to_dst();\n\n        if (src_read && dst_read) {\n          if (bvFlag == BITVECTOR_STATUS::NONE_INVALID) {\n            substrate->sync_any_to_any<SyncFnTy, BitsetFnTy>(loopName);\n     
     } else if (galois::runtime::src_invalid(bvFlag)) {\n            substrate->sync_any_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n            substrate->sync_any_to_src<SyncFnTy, BitsetFnTy>(loopName);\n          } else if (galois::runtime::dst_invalid(bvFlag)) {\n            substrate->sync_any_to_src<SyncFnTy, BitsetFnTy>(loopName);\n            substrate->sync_any_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n          } else {\n            GALOIS_DIE(\"invalid bitvector flag setting in syncOnDemand\");\n          }\n        } else if (src_read) {\n          substrate->sync_any_to_src<SyncFnTy, BitsetFnTy>(loopName);\n        } else { // dst_read\n          substrate->sync_any_to_dst<SyncFnTy, BitsetFnTy>(loopName);\n        }\n      }\n\n      fieldFlags.clear_read_src();\n      fieldFlags.clear_read_dst();\n    }\n  };\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // GPU marshaling\n  ////////////////////////////////////////////////////////////////////////////////\n\n#ifdef GALOIS_ENABLE_GPU\nprivate:\n  using GraphNode     = typename GraphTy::GraphNode;\n  using edge_iterator = typename GraphTy::edge_iterator;\n  using EdgeTy        = typename GraphTy::EdgeType;\n\n  // Code that handles getting the graph onto the GPU\n  template <bool isVoidType,\n            typename std::enable_if<isVoidType>::type* = nullptr>\n  inline void setMarshalEdge(MarshalGraph& GALOIS_UNUSED(m),\n                             const size_t GALOIS_UNUSED(index),\n                             const edge_iterator& GALOIS_UNUSED(e)) {\n    // do nothing\n  }\n\n  template <bool isVoidType,\n            typename std::enable_if<!isVoidType>::type* = nullptr>\n  inline void setMarshalEdge(MarshalGraph& m, const size_t index,\n                             const edge_iterator& e) {\n    m.edge_data[index] = userGraph.getEdgeData(e);\n  }\n\npublic:\n  void getMarshalGraph(MarshalGraph& m) {\n    m.nnodes   = userGraph.size();\n    m.nedges   = 
userGraph.sizeEdges();\n    m.numOwned = userGraph.numMasters();\n    // Assumption: master occurs at beginning in contiguous range\n    m.beginMaster       = 0;\n    m.numNodesWithEdges = userGraph.getNumNodesWithEdges();\n    m.id                = id;\n    m.numHosts          = numHosts;\n    m.row_start         = (index_type*)calloc(m.nnodes + 1, sizeof(index_type));\n    m.edge_dst          = (index_type*)calloc(m.nedges, sizeof(index_type));\n    m.node_data         = (index_type*)calloc(m.nnodes, sizeof(node_data_type));\n\n    // TODO deal with edgety\n    if (std::is_void<EdgeTy>::value) {\n      m.edge_data = NULL;\n    } else {\n      if (!std::is_same<EdgeTy, edge_data_type>::value) {\n        galois::gWarn(\"Edge data type mismatch between CPU and GPU\\n\");\n      }\n      m.edge_data = (edge_data_type*)calloc(m.nedges, sizeof(edge_data_type));\n    }\n\n    galois::do_all(\n        // TODO not using thread ranges, can be optimized if I can iterate\n        // directly over userGraph\n        galois::iterate(userGraph.allNodesRange()),\n        [&](const GraphNode& nodeID) {\n          // initialize node_data with localID-to-globalID mapping\n          m.node_data[nodeID] = userGraph.getGID(nodeID);\n          m.row_start[nodeID] = *(userGraph.edge_begin(nodeID));\n          for (auto e = userGraph.edge_begin(nodeID);\n               e != userGraph.edge_end(nodeID); e++) {\n            auto edgeID = *e;\n            setMarshalEdge<std::is_void<EdgeTy>::value>(m, edgeID, e);\n            m.edge_dst[edgeID] = userGraph.getEdgeDst(e);\n          }\n        },\n        galois::steal());\n\n    m.row_start[m.nnodes] = m.nedges;\n\n    ////// TODO\n\n    // copy memoization meta-data\n    m.num_master_nodes =\n        (unsigned int*)calloc(masterNodes.size(), sizeof(unsigned int));\n    ;\n    m.master_nodes =\n        (unsigned int**)calloc(masterNodes.size(), sizeof(unsigned int*));\n    ;\n\n    for (uint32_t h = 0; h < masterNodes.size(); ++h) {\n      
m.num_master_nodes[h] = masterNodes[h].size();\n\n      if (masterNodes[h].size() > 0) {\n        m.master_nodes[h] =\n            (unsigned int*)calloc(masterNodes[h].size(), sizeof(unsigned int));\n        ;\n        std::copy(masterNodes[h].begin(), masterNodes[h].end(),\n                  m.master_nodes[h]);\n      } else {\n        m.master_nodes[h] = NULL;\n      }\n    }\n\n    m.num_mirror_nodes =\n        (unsigned int*)calloc(mirrorNodes.size(), sizeof(unsigned int));\n    ;\n    m.mirror_nodes =\n        (unsigned int**)calloc(mirrorNodes.size(), sizeof(unsigned int*));\n    ;\n    for (uint32_t h = 0; h < mirrorNodes.size(); ++h) {\n      m.num_mirror_nodes[h] = mirrorNodes[h].size();\n\n      if (mirrorNodes[h].size() > 0) {\n        m.mirror_nodes[h] =\n            (unsigned int*)calloc(mirrorNodes[h].size(), sizeof(unsigned int));\n        ;\n        std::copy(mirrorNodes[h].begin(), mirrorNodes[h].end(),\n                  m.mirror_nodes[h]);\n      } else {\n        m.mirror_nodes[h] = NULL;\n      }\n    }\n\n    // user needs to provide method of freeing up graph (it can do nothing\n    // if they wish)\n    userGraph.deallocate();\n  }\n#endif // het galois def\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Public sync interface\n  ////////////////////////////////////////////////////////////////////////////////\n\npublic:\n  /**\n   * Given a structure that contains flags signifying what needs to be\n   * synchronized, syncOnDemand will synchronize what is necessary based\n   * on the read location of the field.\n   *\n   * @tparam readLocation Location in which field will need to be read\n   * @tparam SyncFnTy sync structure for the field\n   * @tparam BitsetFnTy struct which holds a bitset which can be used\n   * to control synchronization at a more fine grain level\n   * @param fieldFlags structure for field you are syncing\n   * @param loopName Name of loop this sync is for; used for naming timers\n   
*/\n  template <ReadLocation readLocation, typename SyncFnTy,\n            typename BitsetFnTy = galois::InvalidBitsetFnTy>\n  inline void syncOnDemand(galois::runtime::FieldFlags& fieldFlags,\n                           std::string loopName) {\n    std::string timer_str(\"Sync_\" + get_run_identifier(loopName));\n    galois::StatTimer Tsync(timer_str.c_str(), RNAME);\n    Tsync.start();\n\n    currentBVFlag = &(fieldFlags.bitvectorStatus);\n\n    // call a template-specialized function depending on the read location\n    SyncOnDemandHandler<readLocation, SyncFnTy, BitsetFnTy>::call(\n        this, fieldFlags, loopName, *currentBVFlag);\n\n    currentBVFlag = nullptr;\n\n    Tsync.stop();\n  }\n\n  ////////////////////////////////////////////////////////////////////////////////\n  // Metadata settings/getters\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Set the run number.\n   *\n   * @param runNum Number to set the run to\n   */\n  inline void set_num_run(const uint32_t runNum) { num_run = runNum; }\n\n  /**\n   * Get the set run number.\n   *\n   * @returns The set run number saved in the graph\n   */\n  inline uint32_t get_run_num() const { return num_run; }\n\n  /**\n   * Set the round number for use in the run identifier.\n   *\n   * @param round round number to set to\n   */\n  inline void set_num_round(const uint32_t round) { num_round = round; }\n\n  /**\n   * Get a run identifier using the set run and set round.\n   *\n   * @returns a string run identifier\n   * @deprecated We want to move away from calling this by itself; use ones\n   * that take an argument; will be removed once we eliminate all instances\n   * of its use from code\n   */\n  inline std::string get_run_identifier() const {\n#if GALOIS_PER_ROUND_STATS\n    return std::string(std::to_string(num_run) + \"_\" +\n                       std::to_string(num_round));\n#else\n    return std::string(std::to_string(num_run));\n#endif\n  }\n\n  /**\n 
  * Get a run identifier using the set run and set round and\n   * append to the passed in string.\n   *\n   * @param loop_name String to append the run identifier\n   * @returns String with run identifier appended to passed in loop name\n   */\n  inline std::string get_run_identifier(std::string loop_name) const {\n#if GALOIS_PER_ROUND_STATS\n    return std::string(std::string(loop_name) + \"_\" + std::to_string(num_run) +\n                       \"_\" + std::to_string(num_round));\n#else\n    return std::string(std::string(loop_name) + \"_\" + std::to_string(num_run));\n#endif\n  }\n\n  /**\n   * Get a run identifier using the set run and set round and\n   * append to the passed in string in addition to the number identifier passed\n   * in.\n   *\n   * @param loop_name String to append the run identifier\n   * @param alterID another ID with which to add to the timer name.\n   *\n   * @returns String with run identifier appended to passed in loop name +\n   * alterID\n   */\n  inline std::string get_run_identifier(std::string loop_name,\n                                        unsigned alterID) const {\n#if GALOIS_PER_ROUND_STATS\n    return std::string(std::string(loop_name) + \"_\" + std::to_string(alterID) +\n                       \"_\" + std::to_string(num_run) + \"_\" +\n                       std::to_string(num_round));\n#else\n    return std::string(std::string(loop_name) + \"_\" + std::to_string(alterID) +\n                       \"_\" + std::to_string(num_run));\n#endif\n  }\n\n  /**\n   * Given a sync structure, reset the field specified by the structure\n   * to the 0 of the reduction on mirrors.\n   *\n   * @tparam FnTy structure that specifies how synchronization is to be done\n   */\n  template <typename FnTy>\n  void reset_mirrorField() {\n    // TODO make sure this is correct still\n    auto mirrorRanges = userGraph.getMirrorRanges();\n    for (auto r : mirrorRanges) {\n      if (r.first == r.second)\n        continue;\n      assert(r.first < 
r.second);\n\n      // GPU call\n      bool batch_succeeded = FnTy::reset_batch(r.first, r.second - 1);\n\n      // CPU always enters this block\n      if (!batch_succeeded) {\n        galois::do_all(\n            galois::iterate(r.first, r.second),\n            [&](uint32_t lid) { FnTy::reset(lid, userGraph.getData(lid)); },\n            galois::no_stats(),\n            galois::loopname(get_run_identifier(\"RESET:MIRRORS\").c_str()));\n      }\n    }\n  }\n\n////////////////////////////////////////////////////////////////////////////////\n// Checkpointing code for graph\n////////////////////////////////////////////////////////////////////////////////\n\n// @todo Checkpointing code needs updates to make it work.\n#ifdef GALOIS_CHECKPOINT\n///*\n// * Headers for boost serialization\n// */\n//#include <boost/archive/binary_oarchive.hpp>\n//#include <boost/archive/binary_iarchive.hpp>\n//#include <boost/serialization/split_member.hpp>\n//#include <boost/serialization/binary_object.hpp>\n//#include <boost/serialization/serialization.hpp>\n//#include <boost/serialization/vector.hpp>\n//#include <boost/serialization/unordered_map.hpp>\n//\n// public:\n//  /**\n//   * Checkpoint the complete structure on the node to disk\n//   */\n//  void checkpointSaveNodeData(std::string checkpointFileName = \"checkpoint\") {\n//    using namespace boost::archive;\n//    galois::StatTimer TimerSaveCheckPoint(\n//        get_run_identifier(\"TimerSaveCheckpoint\").c_str(), RNAME);\n//\n//    TimerSaveCheckPoint.start();\n//    std::string checkpointFileName_local =\n//        checkpointFileName + \"_\" + std::to_string(id);\n//\n//    std::ofstream outputStream(checkpointFileName_local, std::ios::binary);\n//    if (!outputStream.is_open()) {\n//      galois::gPrint(\"ERROR: Could not open \", checkpointFileName_local,\n//                     \" to save checkpoint!!!\\n\");\n//    }\n//    galois::gPrint(\"[\", id,\n//                   \"] Saving local checkpoint to :\", 
checkpointFileName_local,\n//                   \"\\n\");\n//\n//    boost::archive::binary_oarchive ar(outputStream,\n//    boost::archive::no_header);\n//\n//    // TODO handle this with CuSP\n//    userGraph.serializeNodeData(ar);\n//\n//    std::string statSendBytes_str(\"CheckpointBytesTotal\");\n//    constexpr static const char* const RREGION = \"RECOVERY\";\n//    size_t cp_size                             = outputStream.tellp();\n//    galois::runtime::reportStat_Tsum(RREGION, statSendBytes_str, cp_size);\n//\n//    outputStream.flush();\n//    outputStream.close();\n//    TimerSaveCheckPoint.stop();\n//  }\n//\n//  /**\n//   * Load checkpointed data from disk.\n//   */\n//  void checkpointApplyNodeData(std::string checkpointFileName = \"checkpoint\")\n//  {\n//    using namespace boost::archive;\n//    galois::StatTimer TimerApplyCheckPoint(\n//        get_run_identifier(\"TimerApplyCheckpoint\").c_str(), RNAME);\n//\n//    TimerApplyCheckPoint.start();\n//    std::string checkpointFileName_local =\n//        checkpointFileName + \"_\" + std::to_string(id);\n//\n//    std::ifstream inputStream(checkpointFileName_local, std::ios::binary);\n//\n//    if (!inputStream.is_open()) {\n//      galois::gPrint(\"ERROR: Could not open \", checkpointFileName_local,\n//                     \" to read checkpoint!!!\\n\");\n//    }\n//    galois::gPrint(\"[\", id, \"] reading local checkpoint from: \",\n//                   checkpointFileName_local, \"\\n\");\n//\n//    boost::archive::binary_iarchive ar(inputStream,\n//    boost::archive::no_header);\n//\n//    // TODO handle this with CuSP\n//    userGraph.deSerializeNodeData(ar);\n//\n//    inputStream.close();\n//    TimerApplyCheckPoint.stop();\n//  }\n#endif\n};\n\ntemplate <typename GraphTy>\nconstexpr const char* const galois::graphs::GluonSubstrate<GraphTy>::RNAME;\n} // end namespace graphs\n} // end namespace galois\n\n#endif // header guard\n"
  },
  {
    "path": "libgluon/include/galois/runtime/DataCommMode.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n */\n\n/**\n * @file DataCommMode.h\n *\n * Contains the DataCommMode enumeration and a function that chooses a data\n * comm mode based on its arguments.\n */\n#pragma once\n\n//! Enumeration of data communication modes that can be used in synchronization\n//! @todo document the enums in doxygen\nenum DataCommMode {\n  noData, //!< send no data\n  bitsetData,\n  offsetsData,\n  gidsData,\n  onlyData,\n  dataSplitFirst, // NOT USED\n  dataSplit       // NOT USED\n};\n\n//! If some mode is to be enforced, set this variable\n//! @todo using a global is not great, but current problem is that GPU code\n//! 
assumes variable and would take some reorg to fix\nextern DataCommMode enforcedDataMode;\n\n/**\n * Given a size of a subset of elements to send and the total number of\n * elements, determine an appropriate data mode to use for sending out the data\n * during synchronization.\n *\n * @tparam DataType type of the data to be synchronized\n *\n * @param num_selected number of elements to send out (subset of num_total)\n * @param num_total total number of elements that exist\n *\n * @returns an appropriate DataCommMode to use for synchronization\n */\ntemplate <typename DataType>\nDataCommMode get_data_mode(size_t num_selected, size_t num_total) {\n  DataCommMode data_mode = noData;\n  if (enforcedDataMode != noData) {\n    data_mode = enforcedDataMode;\n  } else { // no enforced mode, so find an appropriate mode\n    if (num_selected == 0) {\n      data_mode = noData;\n    } else if (num_selected == num_total) {\n      data_mode = onlyData;\n    } else {\n      size_t bitset_alloc_size =\n          ((num_total + 63) / 64) * sizeof(uint64_t) + (2 * sizeof(size_t));\n\n      size_t bitsetDataSize = (num_selected * sizeof(DataType)) +\n                              bitset_alloc_size + sizeof(num_selected);\n      size_t offsetsDataSize = (num_selected * sizeof(DataType)) +\n                               (num_selected * sizeof(unsigned int)) +\n                               sizeof(size_t) + sizeof(num_selected);\n      // find the minimum size one\n      if (bitsetDataSize < offsetsDataSize) {\n        data_mode = bitsetData;\n      } else {\n        data_mode = offsetsData;\n      }\n    }\n  }\n  return data_mode;\n}\n"
  },
  {
    "path": "libgluon/include/galois/runtime/GlobalObj.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file GlobalObj.h\n *\n * Defines the GlobalObject class, which is a base class that other\n * classes inherit from to be assigned a unique global id.\n */\n\n#include <vector>\n#include <cstdint>\n#include <cassert>\n\n#ifndef _GALOIS_DIST_GLOBAL_OBJECT_H\n#define _GALOIS_DIST_GLOBAL_OBJECT_H\n\nnamespace galois {\nnamespace runtime {\n\n/**\n * A class to be inherited from so that all child classes will have a tracked\n * unique ID.\n *\n * @warning Not thread safe: do not concurrently construct GlobalObjects\n */\nclass GlobalObject {\n  //! Vector that points to all GlobalObject instances\n  //! @todo make a pointer to avoid static initialization?\n  static std::vector<uintptr_t> allobjs;\n  //! 
ID of a global object\n  uint32_t objID;\n\nprotected:\n  GlobalObject(const GlobalObject&) = delete;\n  GlobalObject(GlobalObject&&)      = delete;\n\n  /**\n   * Returns the pointer for a global object\n   *\n   * @param oid Global object id to get\n   * @returns pointer to requested global object\n   */\n  static uintptr_t ptrForObj(unsigned oid);\n\n  /**\n   * Constructs a global object given a pointer to the object you want to make\n   * a global object.\n   *\n   * @tparam T type of the object to make a GlobalObject\n   * @param ptr pointer to object to make a GlobalObject\n   *\n   * @todo lock needed if multiple GlobalObjects are being constructed in\n   * parallel\n   */\n  template <typename T>\n  GlobalObject(const T* ptr) {\n    objID = allobjs.size();\n    allobjs.push_back(reinterpret_cast<uintptr_t>(ptr));\n  }\n\n  /**\n   * Returns own global id\n   *\n   * @returns this object's global id\n   */\n  uint32_t idForSelf() const { return objID; }\n};\n\n} // end namespace runtime\n} // end namespace galois\n\n#endif //_GALOIS_DIST_GLOBAL_OBJECT_H\n"
  },
  {
    "path": "libgluon/include/galois/runtime/SyncStructures.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file SyncStructures.h\n *\n * Contains macros for easily defining common Galois sync structures and the\n * field flags class used for on-demand synchronization.\n */\n\n#ifndef _SYNC_STRUCT_MACROS_\n#define _SYNC_STRUCT_MACROS_\n\n#include <cstdint>                       // for uint types used below\n#include <galois/AtomicHelpers.h>        // for galois::max, min\n#include <galois/runtime/DataCommMode.h> // for galois::max, min\n#include <galois/gIO.h>                  // for GALOIS DIE\n\n////////////////////////////////////////////////////////////////////////////////\n// Field flag class\n////////////////////////////////////////////////////////////////////////////////\n\nnamespace galois {\nnamespace runtime {\n\n/**\n * Bitvector status enum specifying validness of certain things in bitvector.\n */\nenum 
BITVECTOR_STATUS {\n  NONE_INVALID, //!< none of the bitvector is invalid\n  SRC_INVALID,  //!< sources on bitvector are invalid\n  DST_INVALID,  //!< destinations on bitvector are invalid\n  BOTH_INVALID  //!< both sources and destinations on bitvector are invalid\n};\n\n//! Return true if the sources are invalid in bitvector flag\nbool src_invalid(BITVECTOR_STATUS bv_flag);\n//! Return true if the destinations are invalid in bitvector flag\nbool dst_invalid(BITVECTOR_STATUS bv_flag);\n//! Marks sources invalid on passed in bitvector flag\nvoid make_src_invalid(BITVECTOR_STATUS* bv_flag);\n//! Marks destinations invalid on passed in bitvector flag\nvoid make_dst_invalid(BITVECTOR_STATUS* bv_flag);\n\n/**\n * Each field has a FieldFlags object that indicates synchronization status\n * of that field.\n */\nclass FieldFlags {\nprivate:\n  uint8_t _s2s;\n  uint8_t _s2d;\n  uint8_t _d2s;\n  uint8_t _d2d;\n\npublic:\n  /**\n   * Status of the bitvector in terms of if it can be used to sync the field\n   */\n  BITVECTOR_STATUS bitvectorStatus;\n  /**\n   * Field Flags constructor. Sets all flags to false and bitvector\n   * status to none invalid.\n   */\n  FieldFlags() {\n    _s2s            = false;\n    _s2d            = false;\n    _d2s            = false;\n    _d2d            = false;\n    bitvectorStatus = BITVECTOR_STATUS::NONE_INVALID;\n  }\n\n  //! Return true if src2src is set\n  bool src_to_src() const { return _s2s; }\n\n  //! Return true if src2dst is set\n  bool src_to_dst() const { return _s2d; }\n\n  //! Return true if dst2src is set\n  bool dst_to_src() const { return _d2s; }\n\n  //! Return true if dst2dst is set\n  bool dst_to_dst() const { return _d2d; }\n\n  //! Sets write src flags to true\n  void set_write_src() {\n    _s2s = true;\n    _s2d = true;\n  }\n\n  //! Sets write dst flags to true\n  void set_write_dst() {\n    _d2s = true;\n    _d2d = true;\n  }\n\n  //! 
Sets all write flags to true\n  void set_write_any() {\n    _s2s = true;\n    _s2d = true;\n    _d2s = true;\n    _d2d = true;\n  }\n\n  //! Sets read src flags (src2src and dst2src) to false\n  void clear_read_src() {\n    _s2s = false;\n    _d2s = false;\n  }\n\n  //! Sets read dst flags (src2dst and dst2dst) to false\n  void clear_read_dst() {\n    _s2d = false;\n    _d2d = false;\n  }\n\n  //! Sets all write flags to false\n  void clear_read_any() {\n    _s2d = false;\n    _d2d = false;\n    _s2s = false;\n    _d2s = false;\n  }\n\n  //! Sets all write flags to false and sets bitvector status to none invalid\n  void clear_all() {\n    _s2s            = false;\n    _s2d            = false;\n    _d2s            = false;\n    _d2d            = false;\n    bitvectorStatus = BITVECTOR_STATUS::NONE_INVALID;\n  }\n};\n\n} // end namespace runtime\n} // end namespace galois\n\n////////////////////////////////////////////////////////////////////////////////\n// Reduce Add, Edges\n////////////////////////////////////////////////////////////////////////////////\n#ifdef GALOIS_ENABLE_GPU\n#define GALOIS_SYNC_STRUCTURE_ADD_EDGES(fieldtype)                             \\\n  struct EdgeAddReduce {                                                       \\\n    using ValTy = fieldtype;                                                   \\\n                                                                               \\\n    static ValTy extract(uint64_t edgeID, ValTy& edgeData) {                   \\\n      if (personality == GPU_CUDA)                                             \\\n        return get_edge_cuda(cuda_ctx, edgeID);                                \\\n      assert(personality == CPU);                                              \\\n      return edgeData;                                                         \\\n    }                                                                          \\\n                                                                               \\\n    static bool 
extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_edge_cuda(cuda_ctx, from_id, y, s, data_mode);               \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_edge_cuda(cuda_ctx, from_id, y);                             \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_reset_edge_cuda(cuda_ctx, from_id, y, s, data_mode,          \\\n                                  (ValTy)0);                                   \\\n        return true;    
                                                       \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_reset_edge_cuda(cuda_ctx, from_id, y, (ValTy)0);             \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint64_t edgeID, ValTy& edgeData, ValTy y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        add_edge_cuda(cuda_ctx, edgeID, y);                                    \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      edgeData += y;                                                           \\\n      return true;                                                             \\\n    }                                                                          \\\n                                
                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_add_edge_cuda(cuda_ctx, from_id, y, data_mode);                  \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_add_mirror_edge_cuda(cuda_ctx, from_id, y, data_mode);           \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint64_t edgeID, ValTy& edgeData) {                      \\\n      if (personality == GPU_CUDA) {                                           \\\n        set_edge_cuda(cuda_ctx, edgeID, (ValTy)0);                             \\\n      }                                 
                                       \\\n      assert(personality == CPU);                                              \\\n      edgeData = 0;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_reset_edge_cuda(cuda_ctx, begin, end, (ValTy)0);                 \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void setVal(uint64_t edgeID, ValTy& edgeData, ValTy y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        set_edge_cuda(cuda_ctx, edgeID, (ValTy)0);                             \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      edgeData = y;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode 
data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_edge_cuda(cuda_ctx, from_id, y, data_mode);           \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n  };\n#else\n#define GALOIS_SYNC_STRUCTURE_ADD_EDGES(fieldtype)                             \\\n  struct EdgeAddReduce {                                                       \\\n    using ValTy = fieldtype;                                                   \\\n                                                                               \\\n    static ValTy extract(uint64_t edgeID, ValTy& edgeData) {                   \\\n      return edgeData;                                                         \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) {    \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*) { return false; }            \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*, size_t*,               \\\n                                    DataCommMode*) {                           \\\n      return false;                  
                                          \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*) { return false; }      \\\n                                                                               \\\n    static bool reduce(uint64_t edgeID, ValTy& edgeData, ValTy y) {            \\\n      edgeData += y;                                                           \\\n      return true;                                                             \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) {        \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint64_t edgeID, ValTy& edgeData) { edgeData = 0; }      \\\n                                                                               \\\n    static void setVal(uint64_t edgeID, ValTy& edgeData, ValTy y) {            \\\n      edgeData = y;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned, 
uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n  };\n#endif\n\n/**\n * Sync structure for dynamic bitsets, edges.\n *\n * Bitsets are expected to have the following naming scheme:\n * bitset_edges\n *\n * In addition, you will have to declare and appropriately resize the bitset\n * in your main program as well as set the bitset appropriately (i.e. when you\n * do a write to a particular node).\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_BITSET_EDGES                                     \\\n  struct Bitset_edges {                                                        \\\n    static constexpr bool is_vector_bitset() { return false; }                 \\\n    static bool is_valid() { return true; }                                    \\\n                                                                               \\\n    static galois::DynamicBitSet& get() {                                      \\\n      if (personality == GPU_CUDA)                                             \\\n        get_bitset_edge_cuda(cuda_ctx,                                         \\\n                             (uint64_t*)bitset_edges.get_vec().data());        \\\n      return bitset_edges;                                                     \\\n    }                                                                          \\\n                                                                               \\\n    static void reset_range(size_t begin, size_t end) {                        \\\n      if (personality == GPU_CUDA) {                                           \\\n        bitset_edge_reset_cuda(cuda_ctx, begin, end);                          \\\n      } else {                                                                 \\\n        assert(personality == CPU);                       
                     \\\n        bitset_edges.reset(begin, end);                                        \\\n      }                                                                        \\\n    }                                                                          \\\n  }\n#else\n// no GPU code\n#define GALOIS_SYNC_STRUCTURE_BITSET_EDGES                                     \\\n  struct Bitset_edges {                                                        \\\n    static constexpr bool is_vector_bitset() { return false; }                 \\\n                                                                               \\\n    static constexpr bool is_valid() { return true; }                          \\\n                                                                               \\\n    static galois::DynamicBitSet& get() { return bitset_edges; }               \\\n                                                                               \\\n    static void reset_range(size_t begin, size_t end) {                        \\\n      bitset_edges.reset(begin, end);                                          \\\n    }                                                                          \\\n  }\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// Reduce Add\n////////////////////////////////////////////////////////////////////////////////\n\n/**\n * Creates a Galois reduction sync structure that does a sum reduction.\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_REDUCE_ADD(fieldname, fieldtype)                 \\\n  struct Reduce_add_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      if (personality == GPU_CUDA)         
                                    \\\n        return get_node_##fieldname##_cuda(cuda_ctx, node_id);                 \\\n      assert(personality == CPU);                                              \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y, s, data_mode); \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y);               \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                   
                            \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_reset_node_##fieldname##_cuda(cuda_ctx, from_id, y, s,       \\\n                                                data_mode, (ValTy)0);          \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_reset_node_##fieldname##_cuda(cuda_ctx, from_id, y,          \\\n                                                (ValTy)0);                     \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t begin, size_t end) {                        \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_reset_node_##fieldname##_cuda(cuda_ctx, 
begin, end, (ValTy)0);   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA) {                                           \\\n        add_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      {                                                                        \\\n        galois::add(node.fieldname, y);                                        \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_add_node_##fieldname##_cuda(cuda_ctx, from_id, y, data_mode);    \\\n        return true;                                                           \\\n      }                                                       
                 \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_add_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t node_id, struct NodeData& node) {               \\\n      if (personality == GPU_CUDA) {                                           \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, (ValTy)0);              \\\n      } else if (personality == CPU)                                           \\\n        galois::set(node.fieldname, (ValTy)0);                                 \\\n    }                                                                          \\\n                                                                               \\\n    static void setVal(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA)                                    
         \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n      else if (personality == CPU)                                             \\\n        node.fieldname = y;                                                    \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#else\n// Non-GPU code\n#define GALOIS_SYNC_STRUCTURE_REDUCE_ADD(fieldname, fieldtype)                 \\\n  struct Reduce_add_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t, const struct NodeData& node) {              \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, 
uint8_t*, size_t*, DataCommMode*) {    \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*) { return false; }            \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*, size_t*,               \\\n                                    DataCommMode*) {                           \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*) { return false; }      \\\n                                                                               \\\n    static bool reset_batch(size_t, size_t) { return false; }                  \\\n                                                                               \\\n    static bool reduce(uint32_t, struct NodeData& node, ValTy y) {             \\\n      {                                                                        \\\n        galois::add(node.fieldname, y);                                        \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                           
                               \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) {        \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t, struct NodeData& node) {                       \\\n      galois::set(node.fieldname, (ValTy)0);                                   \\\n    }                                                                          \\\n                                                                               \\\n    static void setVal(uint32_t, struct NodeData& node, ValTy y) {             \\\n      node.fieldname = y;                                                      \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#endif\n\n/**\n * Creates a Galois reduction sync structure that does a sum reduction\n * on a field that is represented by an array.\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_REDUCE_ADD_ARRAY(fieldname, fieldtype)           \\\n  struct Reduce_add_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      if (personality == 
GPU_CUDA)                                             \\\n        return get_node_##fieldname##_cuda(cuda_ctx, node_id);                 \\\n      assert(personality == CPU);                                              \\\n      return fieldname[node_id];                                               \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y, s, data_mode); \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y);               \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                 
                                              \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_reset_node_##fieldname##_cuda(cuda_ctx, from_id, y, s,       \\\n                                                data_mode, (ValTy)0);          \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_reset_node_##fieldname##_cuda(cuda_ctx, from_id, y,          \\\n                                                (ValTy)0);                     \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t begin, size_t end) {                        \\\n      if (personality == GPU_CUDA) {                                           \\\n        
batch_reset_node_##fieldname##_cuda(cuda_ctx, begin, end, (ValTy)0);   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      if (personality == GPU_CUDA) {                                           \\\n        add_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      {                                                                        \\\n        galois::add(fieldname[node_id], y);                                    \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_add_node_##fieldname##_cuda(cuda_ctx, from_id, y, data_mode);    \\\n        return 
true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_add_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t node_id,                                        \\\n                      struct NodeData& GALOIS_UNUSED(node)) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, (ValTy)0);              \\\n      } else if (personality == CPU)                                           \\\n        galois::set(fieldname[node_id], (ValTy)0);                             \\\n    }                                                                          \\\n                       
                                                        \\\n    static void setVal(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      if (personality == GPU_CUDA)                                             \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n      else if (personality == CPU)                                             \\\n        fieldname[node_id] = y;                                                \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#else\n// Non-GPU code\n#define GALOIS_SYNC_STRUCTURE_REDUCE_ADD_ARRAY(fieldname, fieldtype)           \\\n  struct Reduce_add_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n  
    return fieldname[node_id];                                               \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) { return false; }  \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    
static bool reduce(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      {                                                                        \\\n        galois::add(fieldname[node_id], y);                                    \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t node_id,                                        \\\n                      struct NodeData& GALOIS_UNUSED(node)) {                  \\\n      galois::set(fieldname[node_id], (ValTy)0);                               \\\n    }                                                                          \\\n                                                                               \\\n    static void setVal(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n            
           ValTy y) {                                              \\\n      fieldname[node_id] = y;                                                  \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// Reduce Set\n////////////////////////////////////////////////////////////////////////////////\n\n/**\n * Creates a Galois reduction sync structure that does a set as a reduction.\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_REDUCE_SET(fieldname, fieldtype)                 \\\n  struct Reduce_set_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      if (personality == GPU_CUDA)                                             \\\n        return get_node_##fieldname##_cuda(cuda_ctx, node_id);                 \\\n      assert(personality == CPU);                                              \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                    
          DataCommMode* data_mode) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y, s, data_mode); \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y);               \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y, s,      \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                     
                                                   \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y);        \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return true;                                                             \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA) {                                           \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);   
                                           \\\n      {                                                                        \\\n        galois::set(node.fieldname, y);                                        \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_node_##fieldname##_cuda(cuda_ctx, from_id, y, data_mode);    \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);           
                                   \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id),                         \\\n                      struct NodeData& GALOIS_UNUSED(node)) {}                 \\\n                                                                               \\\n    static void setVal(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA)                                             \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n      else if (personality == CPU)                                             \\\n        node.fieldname = y;                                                    \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#else\n// Non-GPU code\n#define 
GALOIS_SYNC_STRUCTURE_REDUCE_SET(fieldname, fieldtype)                 \\\n  struct Reduce_set_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t, const struct NodeData& node) {              \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) {    \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*) { return false; }            \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*, size_t*,               \\\n                                    DataCommMode*) {                           \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*) { return false; }      \\\n                                                                               \\\n    static bool reset_batch(size_t, size_t) { return true; }                   \\\n                                                                               \\\n    static bool reduce(uint32_t, struct NodeData& node, ValTy y) {             \\\n      {         
                                                               \\\n        galois::set(node.fieldname, y);                                        \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) {        \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t, struct NodeData&) {}                           \\\n                                                                               \\\n    static void setVal(uint32_t, struct NodeData& node, ValTy y) {             \\\n      node.fieldname = y;                                                      \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#endif\n\n/**\n * Creates a Galois reduction sync structure that does a set as a reduction\n * on a 
field represented by an array.\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_REDUCE_SET_ARRAY(fieldname, fieldtype)           \\\n  struct Reduce_set_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      if (personality == GPU_CUDA)                                             \\\n        return get_node_##fieldname##_cuda(cuda_ctx, node_id);                 \\\n      assert(personality == CPU);                                              \\\n      return fieldname[node_id];                                               \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y, s, data_mode); \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n      
  batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y);               \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y, s,      \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y);        \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return 
false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return true;                                                             \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      if (personality == GPU_CUDA) {                                           \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      {                                                                        \\\n        galois::set(fieldname[node_id], y);                                    \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        
batch_set_node_##fieldname##_cuda(cuda_ctx, from_id, y, data_mode);    \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id),                         \\\n                      struct NodeData& GALOIS_UNUSED(node)) {}                 \\\n                                                                               \\\n    static void setVal(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      if (personality == GPU_CUDA)                                             \\\n        
set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n      else if (personality == CPU)                                             \\\n        fieldname[node_id] = y;                                                \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#else\n// Non-GPU code\n#define GALOIS_SYNC_STRUCTURE_REDUCE_SET_ARRAY(fieldname, fieldtype)           \\\n  struct Reduce_set_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      return fieldname[node_id];                                               \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* 
s,         \\\n                              DataCommMode* data_mode) {                       \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) { return false; }  \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return true;                                                             \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      {                                                                        \\\n        galois::set(fieldname[node_id], y);                                 
   \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id),                         \\\n                      struct NodeData& GALOIS_UNUSED(node)) {}                 \\\n                                                                               \\\n    static void setVal(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      fieldname[node_id] = y;                                                  \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n 
     return false;                                                            \\\n    }                                                                          \\\n  }\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// Reduce Min\n////////////////////////////////////////////////////////////////////////////////\n\n/**\n * Creates a Galois reduction sync structure that does a min reduction.\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_REDUCE_MIN(fieldname, fieldtype)                 \\\n  struct Reduce_min_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      if (personality == GPU_CUDA)                                             \\\n        return get_node_##fieldname##_cuda(cuda_ctx, node_id);                 \\\n      assert(personality == CPU);                                              \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y, s, data_mode); \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return 
false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y);               \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y, s,      \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      if 
(personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y);        \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return true;                                                             \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA) {                                           \\\n        return y < min_node_##fieldname##_cuda(cuda_ctx, node_id, y);          \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      { return y < galois::min(node.fieldname, y); }                           \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        
batch_min_node_##fieldname##_cuda(cuda_ctx, from_id, y, data_mode);    \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_min_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id),                         \\\n                      struct NodeData& GALOIS_UNUSED(node)) {}                 \\\n                                                                               \\\n    static void setVal(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA)                                             \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n      else if 
(personality == CPU)                                             \\\n        node.fieldname = y;                                                    \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#else\n// Non-GPU code\n#define GALOIS_SYNC_STRUCTURE_REDUCE_MIN(fieldname, fieldtype)                 \\\n  struct Reduce_min_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t, const struct NodeData& node) {              \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) {    \\\n      return false;                                                         
   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*) { return false; }            \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*, size_t*,               \\\n                                    DataCommMode*) {                           \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*) { return false; }      \\\n                                                                               \\\n    static bool reset_batch(size_t, size_t) { return true; }                   \\\n                                                                               \\\n    static bool reduce(uint32_t, struct NodeData& node, ValTy y) {             \\\n      { return y < galois::min(node.fieldname, y); }                           \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) {        \\\n      return false;                                                            \\\n    }                                                                          \\\n 
                                                                              \\\n    static void reset(uint32_t, struct NodeData&) {}                           \\\n                                                                               \\\n    static void setVal(uint32_t, struct NodeData& node, ValTy y) {             \\\n      node.fieldname = y;                                                      \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// Reduce Max\n////////////////////////////////////////////////////////////////////////////////\n\n/**\n * Creates a Galois reduction sync structure that does a max reduction.\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_REDUCE_MAX(fieldname, fieldtype)                 \\\n  struct Reduce_max_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      if (personality == GPU_CUDA)                                             \\\n        return get_node_##fieldname##_cuda(cuda_ctx, node_id);                 \\\n      assert(personality == CPU);                                              \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n              
                                                                 \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y, s, data_mode); \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y);               \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y, s,      \\\n                      
                           data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y);        \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return true;                                                             \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA) {                                           \\\n        return y > max_node_##fieldname##_cuda(cuda_ctx, node_id, y);          \\\n      }                       
                                                 \\\n      assert(personality == CPU);                                              \\\n      { return y > galois::max(node.fieldname, y); }                           \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_max_node_##fieldname##_cuda(cuda_ctx, from_id, y, data_mode);    \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_max_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                 
                                         \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id),                         \\\n                      struct NodeData& GALOIS_UNUSED(node)) {}                 \\\n                                                                               \\\n    static void setVal(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA)                                             \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n      else if (personality == CPU)                                             \\\n        node.fieldname = y;                                                    \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#else\n// Non-GPU code\n#define GALOIS_SYNC_STRUCTURE_REDUCE_MAX(fieldname, fieldtype)                 \\\n  struct Reduce_max_##fieldname {                                              \\\n    typedef 
fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) { return false; }  \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return true;  
                                                           \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t GALOIS_UNUSED(node_id), struct NodeData& node, \\\n                       ValTy y) {                                              \\\n      { return y > galois::max(node.fieldname, y); }                           \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id),                         \\\n                      struct NodeData& GALOIS_UNUSED(node)) {}                 \\\n                                                                               \\\n    static void setVal(uint32_t GALOIS_UNUSED(node_id), struct NodeData& node, \\\n                       ValTy y) {                                              \\\n      node.fieldname = y;                                                      \\\n    }                       
                                                   \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#endif\n\n/**\n * Creates a Galois reduction sync structure that does a pairwise\n * min reduction on an array.\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_REDUCE_MIN_ARRAY(fieldname, fieldtype)           \\\n  struct Reduce_min_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      if (personality == GPU_CUDA)                                             \\\n        return get_node_##fieldname##_cuda(cuda_ctx, node_id);                 \\\n      assert(personality == CPU);                                              \\\n      return fieldname[node_id];                                               \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y, s, data_mode); \\\n        return true;                                                           \\\n      }                        
                                                \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y);               \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y, s,      \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                       
                                        \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y);        \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return true;                                                             \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      if (personality == GPU_CUDA) {                                           \\\n        return y < min_node_##fieldname##_cuda(cuda_ctx, node_id, y);          \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      { return y < galois::min(fieldname[node_id], y); }                       \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, 
uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_min_node_##fieldname##_cuda(cuda_ctx, from_id, y, data_mode);    \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_min_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id),                         \\\n                      struct NodeData& GALOIS_UNUSED(node)) {}                 \\\n                                                                               \\\n    static void setVal(uint32_t node_id, struct 
NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      if (personality == GPU_CUDA)                                             \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n      else if (personality == CPU)                                             \\\n        fieldname[node_id] = y;                                                \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#else\n// Non-GPU code\n#define GALOIS_SYNC_STRUCTURE_REDUCE_MIN_ARRAY(fieldname, fieldtype)           \\\n  struct Reduce_min_##fieldname {                                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      return fieldname[node_id];                                               \\\n    }                      
                                                    \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) { return false; }  \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return true;                                                             \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {  
                                            \\\n      { return y < galois::min(fieldname[node_id], y); }                       \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id),                         \\\n                      struct NodeData& GALOIS_UNUSED(node)) {}                 \\\n                                                                               \\\n    static void setVal(uint32_t node_id, struct NodeData& GALOIS_UNUSED(node), \\\n                       ValTy y) {                                              \\\n      fieldname[node_id] = y;                                                  \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      return false;                        
                                    \\\n    }                                                                          \\\n  }\n#endif\n\n/**\n * Creates a Galois reduction sync structure that does a pairwise\n * average on an array.\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_REDUCE_PAIR_WISE_AVG_ARRAY(fieldname, fieldtype) \\\n  struct Reduce_pair_wise_avg_array_##fieldname {                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node) {      \\\n      if (personality == GPU_CUDA)                                             \\\n        return get_node_##fieldname##_cuda(cuda_ctx, node_id);                 \\\n      assert(personality == CPU);                                              \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y, s, data_mode); \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                    
                           \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_node_##fieldname##_cuda(cuda_ctx, from_id, y);               \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y, size_t* s,   \\\n                                    DataCommMode* data_mode) {                 \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y, s,      \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned from_id, uint8_t* y) {            \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_get_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y);        \\\n        return true;                                        
                   \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                       \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA) {                                           \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      {                                                                        \\\n        galois::pairWiseAvg_vec(node.fieldname, y);                            \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {              
           \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_node_##fieldname##_cuda(cuda_ctx, from_id, y, data_mode);    \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id),                         \\\n                      struct NodeData& node) {                                 \\\n      { galois::resetVec(node.fieldname); }                                    \\\n    }                                                                          \\\n                                                                            
   \\\n    static void setVal(uint32_t node_id, struct NodeData& node, ValTy y) {     \\\n      if (personality == GPU_CUDA)                                             \\\n        set_node_##fieldname##_cuda(cuda_ctx, node_id, y);                     \\\n      else if (personality == CPU)                                             \\\n        node.fieldname = y;                                                    \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned from_id, uint8_t* y,                     \\\n                             DataCommMode data_mode) {                         \\\n      if (personality == GPU_CUDA) {                                           \\\n        batch_set_mirror_node_##fieldname##_cuda(cuda_ctx, from_id, y,         \\\n                                                 data_mode);                   \\\n        return true;                                                           \\\n      }                                                                        \\\n      assert(personality == CPU);                                              \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#else\n// Non-GPU code\n#define GALOIS_SYNC_STRUCTURE_REDUCE_PAIR_WISE_AVG_ARRAY(fieldname, fieldtype) \\\n  struct Reduce_pair_wise_avg_array_##fieldname {                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t, const struct NodeData& node) {              \\\n      return node.fieldname;                                                   \\\n    }                                                  
                        \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) {    \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*) { return false; }            \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*, size_t*,               \\\n                                    DataCommMode*) {                           \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*) { return false; }      \\\n                                                                               \\\n    static bool reset_batch(size_t, size_t) { return false; }                  \\\n                                                                               \\\n    static bool reduce(uint32_t, struct NodeData& node, ValTy y) {             \\\n      {                                                                        \\\n        galois::pairWiseAvg_vec(node.fieldname, y);                            \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned, uint8_t*, DataCommMode) 
{               \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) {        \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t, struct NodeData& node) {                       \\\n      { galois::resetVec(node.fieldname); }                                    \\\n    }                                                                          \\\n                                                                               \\\n    static void setVal(uint32_t, struct NodeData& node, ValTy y) {             \\\n      node.fieldname = y;                                                      \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n#endif\n\n/**\n * Creates a Galois reduction sync structure that does a pairwise\n * sum reduction on an array.\n */\n#define GALOIS_SYNC_STRUCTURE_REDUCE_PAIR_WISE_ADD_ARRAY(fieldname, fieldtype) \\\n  struct Reduce_pair_wise_add_array_##fieldname {                              \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy 
extract(uint32_t, const struct NodeData& node) {              \\\n      return node.fieldname;                                                   \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) {    \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned, uint8_t*) { return false; }            \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*, size_t*,               \\\n                                    DataCommMode*) {                           \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*) { return false; }      \\\n                                                                               \\\n    static bool reset_batch(size_t, size_t) { return false; }                  \\\n                                                                               \\\n    static bool reduce(uint32_t, struct NodeData& node, ValTy y) {             \\\n      {                                                                        \\\n        galois::addArray(node.fieldname, y);                                   \\\n        return true;                                                           \\\n      }                                                                        \\\n    }                    
                                                      \\\n                                                                               \\\n    static bool reduce_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) {        \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t, struct NodeData& node) {                       \\\n      { galois::resetVec(node.fieldname); }                                    \\\n    }                                                                          \\\n                                                                               \\\n    static void setVal(uint32_t, struct NodeData& node, ValTy y) {             \\\n      node.fieldname = y;                                                      \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned, uint8_t*, DataCommMode) {               \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n\n/**\n * Creates a Galois reduction sync structure that does a pairwise\n * sum reduction on an array on a SINGLE element.\n */\n#define GALOIS_SYNC_STRUCTURE_REDUCE_PAIR_WISE_ADD_ARRAY_SINGLE(fieldname,     \\\n                                                                
fieldtype)     \\\n  struct Reduce_pair_wise_add_array_single_##fieldname {                       \\\n    typedef fieldtype ValTy;                                                   \\\n                                                                               \\\n    static ValTy extract(uint32_t node_id, const struct NodeData& node,        \\\n                         unsigned vecIndex) {                                  \\\n      return node.fieldname[vecIndex];                                         \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y, size_t* s,         \\\n                              DataCommMode* data_mode) {                       \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_batch(unsigned from_id, uint8_t* y) { return false; }  \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*, size_t*,               \\\n                                    DataCommMode*) {                           \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool extract_reset_batch(unsigned, uint8_t*) { return false; }      \\\n                                                                               \\\n    static bool reset_batch(size_t GALOIS_UNUSED(begin),                       \\\n                            size_t GALOIS_UNUSED(end)) {                
       \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce(uint32_t GALOIS_UNUSED(node_id), struct NodeData& node, \\\n                       ValTy y, unsigned vecIndex) {                           \\\n      node.fieldname[vecIndex] = node.fieldname[vecIndex] + y;                 \\\n      return true;                                                             \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_batch(unsigned, uint8_t*, size_t, DataCommMode) {       \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static bool reduce_mirror_batch(unsigned from_id, uint8_t* y,              \\\n                                    DataCommMode data_mode) {                  \\\n      return false;                                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void reset(uint32_t GALOIS_UNUSED(node_id), struct NodeData& node,  \\\n                      unsigned vecIndex) {                                     \\\n      node.fieldname[vecIndex] = 0;                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void setVal(uint32_t GALOIS_UNUSED(node_id), struct NodeData& node, 
\\\n                       ValTy y, unsigned vecIndex) {                           \\\n      node.fieldname[vecIndex] = y;                                            \\\n    }                                                                          \\\n                                                                               \\\n    static void setVal(uint32_t GALOIS_UNUSED(node_id),                        \\\n                       struct NodeData& GALOIS_UNUSED(node), ValTy y) {        \\\n      GALOIS_DIE(\"execution shouldn't get here; needs index arg\");             \\\n    }                                                                          \\\n                                                                               \\\n    static bool setVal_batch(unsigned uint8_t*, DataCommMode) {                \\\n      return false;                                                            \\\n    }                                                                          \\\n  }\n\n////////////////////////////////////////////////////////////////////////////////\n// Bitset struct\n////////////////////////////////////////////////////////////////////////////////\n\n/**\n * Sync structure for dynamic bitsets.\n *\n * Bitsets are expected to have the following naming scheme:\n * bitset_<fieldname>\n *\n * In addition, you will have to declare and appropriately resize the bitset\n * in your main program as well as set the bitset appropriately (i.e. 
when you\n * do a write to a particular node).\n */\n#ifdef GALOIS_ENABLE_GPU\n// GPU code included\n#define GALOIS_SYNC_STRUCTURE_BITSET(fieldname)                                \\\n  struct Bitset_##fieldname {                                                  \\\n    static constexpr bool is_vector_bitset() { return false; }                 \\\n    static bool is_valid() { return true; }                                    \\\n                                                                               \\\n    static galois::DynamicBitSet& get() {                                      \\\n      if (personality == GPU_CUDA)                                             \\\n        get_bitset_##fieldname##_cuda(                                         \\\n            cuda_ctx, (uint64_t*)bitset_##fieldname.get_vec().data());         \\\n      return bitset_##fieldname;                                               \\\n    }                                                                          \\\n                                                                               \\\n    static void reset_range(size_t begin, size_t end) {                        \\\n      if (personality == GPU_CUDA) {                                           \\\n        bitset_##fieldname##_reset_cuda(cuda_ctx, begin, end);                 \\\n      } else {                                                                 \\\n        assert(personality == CPU);                                            \\\n        bitset_##fieldname.reset(begin, end);                                  \\\n      }                                                                        \\\n    }                                                                          \\\n  }\n#else\n// no GPU code\n#define GALOIS_SYNC_STRUCTURE_BITSET(fieldname)                                \\\n  struct Bitset_##fieldname {                                                  \\\n    static constexpr bool is_vector_bitset() 
{ return false; }                 \\\n                                                                               \\\n    static constexpr bool is_valid() { return true; }                          \\\n                                                                               \\\n    static galois::DynamicBitSet& get() { return bitset_##fieldname; }         \\\n                                                                               \\\n    static void reset_range(size_t begin, size_t end) {                        \\\n      bitset_##fieldname.reset(begin, end);                                    \\\n    }                                                                          \\\n  }\n#endif\n\n/**\n * Sync structure for a vector of dynamic bitsets. Function signatures\n * allow indexing into this vector to get the correct bitset\n *\n * Bitsets are expected to have the following naming scheme:\n * bitset_<fieldname>\n *\n * In addition, you will have to declare and appropriately resize the bitset\n * in your main program as well as set the bitset appropriately (i.e. 
when you\n * do a write to a particular node).\n */\n#define GALOIS_SYNC_STRUCTURE_VECTOR_BITSET(fieldname)                         \\\n  struct Bitset_##fieldname {                                                  \\\n    static unsigned numBitsets() { return vbitset_##fieldname.size(); }        \\\n                                                                               \\\n    static constexpr bool is_vector_bitset() { return true; }                  \\\n                                                                               \\\n    static constexpr bool is_valid() { return true; }                          \\\n                                                                               \\\n    static galois::DynamicBitSet& get(unsigned i) {                            \\\n      return vbitset_##fieldname[i];                                           \\\n    }                                                                          \\\n                                                                               \\\n    static void reset_range(size_t begin, size_t end) {                        \\\n      for (unsigned i = 0; i < vbitset_##fieldname.size(); i++) {              \\\n        vbitset_##fieldname[i].reset(begin, end);                              \\\n      }                                                                        \\\n    }                                                                          \\\n  }\n\n#endif // header guard\n"
  },
  {
    "path": "libgluon/include/galois/runtime/cuda/DeviceEdgeSync.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#pragma once\n/**\n * @file DeviceEdgeSync.h\n *\n * CUDA header for GPU runtime\n *\n * @todo better file description + document this file\n */\n#pragma once\n#include \"galois/cuda/DynamicBitset.h\"\n#include \"galois/cuda/EdgeContext.h\"\n#include \"galois/runtime/DataCommMode.h\"\n#include \"cub/util_allocator.cuh\"\n\n#ifdef GALOIS_CUDA_CHECK_ERROR\n#define check_cuda_kernel                                                      \\\n  check_cuda(cudaDeviceSynchronize());                                         \\\n  check_cuda(cudaGetLastError());\n#else\n#define check_cuda_kernel check_cuda(cudaGetLastError());\n#endif\n\nenum SharedType { sharedMaster, sharedMirror };\nenum UpdateOp { setOp, addOp, minOp };\n\nvoid kernel_sizing(dim3& blocks, dim3& threads) {\n  threads.x = 256;\n  threads.y = threads.z = 1;\n  
blocks.x              = ggc_get_nSM() * 8;\n  blocks.y = blocks.z = 1;\n}\n\ntemplate <typename DataType>\n__global__ void batch_get_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 DataType* __restrict__ subset,\n                                 const DataType* __restrict__ array) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    subset[src]    = array[index];\n  }\n}\n\ntemplate <typename DataType, typename OffsetIteratorType>\n__global__ void batch_get_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 DataType* __restrict__ subset,\n                                 const DataType* __restrict__ array) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    subset[src]    = array[index];\n  }\n}\n\ntemplate <typename DataType>\n__global__ void batch_get_reset_subset(index_type subset_size,\n                                       const unsigned int* __restrict__ indices,\n                                       DataType* __restrict__ subset,\n                                       DataType* __restrict__ array,\n                                       DataType reset_value) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    subset[src]    = array[index];\n    array[index]   = reset_value;\n  }\n}\n\ntemplate 
<typename DataType, typename OffsetIteratorType>\n__global__ void batch_get_reset_subset(index_type subset_size,\n                                       const unsigned int* __restrict__ indices,\n                                       const OffsetIteratorType offsets,\n                                       DataType* __restrict__ subset,\n                                       DataType* __restrict__ array,\n                                       DataType reset_value) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    subset[src]    = array[index];\n    array[index]   = reset_value;\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType>\n__global__ void batch_set_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    array[index]   = subset[src];\n    if (sharedType != sharedMirror) {\n      is_array_updated->set(index);\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType, typename OffsetIteratorType>\n__global__ void batch_set_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* 
__restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    array[index]   = subset[src];\n    if (sharedType != sharedMirror) {\n      is_array_updated->set(index);\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType>\n__global__ void batch_add_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    array[index] += subset[src];\n    if (sharedType != sharedMirror) {\n      is_array_updated->set(index);\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType, typename OffsetIteratorType>\n__global__ void batch_add_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    array[index] += subset[src];\n    if (sharedType != sharedMirror) {\n      is_array_updated->set(index);\n    }\n  }\n}\n\ntemplate 
<typename DataType, SharedType sharedType>\n__global__ void batch_min_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    if (array[index] > subset[src]) {\n      array[index] = subset[src];\n      if (sharedType != sharedMirror) {\n        is_array_updated->set(index);\n      }\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType, typename OffsetIteratorType>\n__global__ void batch_min_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    if (array[index] > subset[src]) {\n      array[index] = subset[src];\n      if (sharedType != sharedMirror) {\n        is_array_updated->set(index);\n      }\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType>\n__global__ void batch_max_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const DataType* __restrict__ subset,\n                                 DataType* 
__restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    if (array[index] < subset[src]) {\n      array[index] = subset[src];\n      if (sharedType != sharedMirror) {\n        is_array_updated->set(index);\n      }\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType, typename OffsetIteratorType>\n__global__ void batch_max_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    if (array[index] < subset[src]) {\n      array[index] = subset[src];\n      if (sharedType != sharedMirror) {\n        is_array_updated->set(index);\n      }\n    }\n  }\n}\n\ntemplate <typename DataType>\n__global__ void batch_reset(DataType* __restrict__ array, index_type begin,\n                            index_type end, DataType val) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = end;\n  for (index_type src = begin + tid; src < src_end; src += nthreads) {\n    array[src] = val;\n  }\n}\n\n__global__ void\nbatch_get_subset_bitset(index_type subset_size,\n                        const unsigned int* __restrict__ indices,\n                        DynamicBitset* __restrict__ is_subset_updated,\n                
        DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    if (is_array_updated->test(index)) {\n      is_subset_updated->set(src);\n    }\n  }\n}\n\n// inclusive range\n__global__ void bitset_reset_range(DynamicBitset* __restrict__ bitset,\n                                   size_t vec_begin, size_t vec_end, bool test1,\n                                   size_t bit_index1, uint64_t mask1,\n                                   bool test2, size_t bit_index2,\n                                   uint64_t mask2) {\n  unsigned tid      = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  for (size_t src = vec_begin + tid; src < vec_end; src += nthreads) {\n    bitset->batch_reset(src);\n  }\n\n  if (tid == 0) {\n    if (test1) {\n      bitset->batch_bitwise_and(bit_index1, mask1);\n    }\n    if (test2) {\n      bitset->batch_bitwise_and(bit_index2, mask2);\n    }\n  }\n}\n\ntemplate <typename DataType>\nvoid reset_bitset_field(struct CUDA_Context_Field_Edges<DataType>* field,\n                        size_t begin, size_t end) {\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n  const DynamicBitset* bitset_cpu = field->is_updated.cpu_rd_ptr();\n  assert(begin <= (bitset_cpu->size() - 1));\n  assert(end <= (bitset_cpu->size() - 1));\n\n  size_t vec_begin = (begin + 63) / 64;\n  size_t vec_end;\n\n  if (end == (bitset_cpu->size() - 1))\n    vec_end = bitset_cpu->vec_size();\n  else\n    vec_end = (end + 1) / 64; // floor\n\n  size_t begin2 = vec_begin * 64;\n  size_t end2   = vec_end * 64;\n\n  bool test1;\n  size_t bit_index1;\n  uint64_t mask1;\n\n  bool test2;\n  size_t bit_index2;\n  uint64_t mask2;\n\n  if (begin2 > end2) {\n    test2 = false;\n\n    if (begin < begin2) {\n      test1       = true;\n      bit_index1  = begin / 64;\n 
     size_t diff = begin2 - begin;\n      assert(diff < 64);\n      mask1 = ((uint64_t)1 << (64 - diff)) - 1;\n\n      // create or mask\n      size_t diff2 = end - end2 + 1;\n      assert(diff2 < 64);\n      mask2 = ~(((uint64_t)1 << diff2) - 1);\n      mask1 |= ~mask2;\n    } else {\n      test1 = false;\n    }\n  } else {\n    if (begin < begin2) {\n      test1       = true;\n      bit_index1  = begin / 64;\n      size_t diff = begin2 - begin;\n      assert(diff < 64);\n      mask1 = ((uint64_t)1 << (64 - diff)) - 1;\n    } else {\n      test1 = false;\n    }\n\n    if (end >= end2) {\n      test2       = true;\n      bit_index2  = end / 64;\n      size_t diff = end - end2 + 1;\n      assert(diff < 64);\n      mask2 = ~(((uint64_t)1 << diff) - 1);\n    } else {\n      test2 = false;\n    }\n  }\n\n  bitset_reset_range<<<blocks, threads>>>(field->is_updated.gpu_rd_ptr(),\n                                          vec_begin, vec_end, test1, bit_index1,\n                                          mask1, test2, bit_index2, mask2);\n}\n\ntemplate <typename DataType>\nvoid reset_data_field(struct CUDA_Context_Field_Edges<DataType>* field,\n                      size_t begin, size_t end, DataType val) {\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n\n  batch_reset<DataType><<<blocks, threads>>>(\n      field->data.gpu_wr_ptr(), (index_type)begin, (index_type)end, val);\n}\n\nvoid get_offsets_from_bitset(index_type bitset_size,\n                             unsigned int* __restrict__ offsets,\n                             DynamicBitset* __restrict__ bitset,\n                             size_t* __restrict__ num_set_bits) {\n  cub::CachingDeviceAllocator g_allocator(\n      true); // Caching allocator for device memory\n  DynamicBitsetIterator flag_iterator(bitset);\n  IdentityIterator offset_iterator;\n  Shared<size_t> num_set_bits_ptr;\n  num_set_bits_ptr.alloc(1);\n  void* d_temp_storage      = NULL;\n  size_t temp_storage_bytes = 0;\n  
cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,\n                             offset_iterator, flag_iterator, offsets,\n                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);\n  check_cuda_kernel;\n  CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));\n  // CUDA_SAFE_CALL(cudaMalloc(&d_temp_storage, temp_storage_bytes));\n  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,\n                             offset_iterator, flag_iterator, offsets,\n                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);\n  check_cuda_kernel;\n  // CUDA_SAFE_CALL(cudaFree(d_temp_storage));\n  if (d_temp_storage)\n    CubDebugExit(g_allocator.DeviceFree(d_temp_storage));\n  *num_set_bits = *num_set_bits_ptr.cpu_rd_ptr();\n}\n\ntemplate <typename DataType, SharedType sharedType, bool reset>\nvoid batch_get_shared_edge(struct CUDA_Context_Common_Edges* ctx,\n                           struct CUDA_Context_Field_Edges<DataType>* field,\n                           unsigned from_id, uint8_t* send_buffer,\n                           DataType i = 0) {\n  struct CUDA_Context_Shared_Edges* shared;\n  if (sharedType == sharedMaster) {\n    shared = &ctx->master;\n  } else { // sharedMirror\n    shared = &ctx->mirror;\n  }\n  DeviceOnly<DataType>* shared_data = &field->shared_data;\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n\n  // ggc::Timer timer(\"timer\"), timer1(\"timer1\"), timer2(\"timer2\");\n  // timer.start();\n  // timer1.start();\n  size_t v_size = shared->num_edges[from_id];\n  if (reset) {\n    batch_get_reset_subset<DataType><<<blocks, threads>>>(\n        v_size, shared->edges[from_id].device_ptr(), shared_data->device_ptr(),\n        field->data.gpu_wr_ptr(), i);\n  } else {\n    batch_get_subset<DataType><<<blocks, threads>>>(\n        v_size, shared->edges[from_id].device_ptr(), shared_data->device_ptr(),\n        field->data.gpu_rd_ptr());\n  }\n  
check_cuda_kernel;\n  // timer1.stop();\n  // timer2.start();\n  DataCommMode data_mode = onlyData;\n  memcpy(send_buffer, &data_mode, sizeof(data_mode));\n  memcpy(send_buffer + sizeof(data_mode), &v_size, sizeof(v_size));\n  shared_data->copy_to_cpu(\n      (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), v_size);\n  // timer2.stop();\n  // timer.stop();\n  // fprintf(stderr, \"Get %u->%u: Time (ms): %llu + %llu = %llu\\n\",\n  //  ctx->id, from_id,\n  //  timer1.duration_ms(), timer2.duration_ms(),\n  //  timer.duration_ms());\n}\n\ntemplate <typename DataType>\nvoid serializeMessage(struct CUDA_Context_Common_Edges* ctx,\n                      DataCommMode data_mode, size_t bit_set_count,\n                      size_t num_shared, DeviceOnly<DataType>* shared_data,\n                      uint8_t* send_buffer) {\n  if (data_mode == noData) {\n    // do nothing\n    return;\n  }\n\n  size_t offset = 0;\n\n  // serialize data_mode\n  memcpy(send_buffer, &data_mode, sizeof(data_mode));\n  offset += sizeof(data_mode);\n\n  if (data_mode != onlyData) {\n    // serialize bit_set_count\n    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));\n    offset += sizeof(bit_set_count);\n  }\n\n  if ((data_mode == gidsData) || (data_mode == offsetsData)) {\n    // serialize offsets vector\n    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));\n    offset += sizeof(bit_set_count);\n    ctx->offsets.copy_to_cpu((unsigned int*)(send_buffer + offset),\n                             bit_set_count);\n    offset += bit_set_count * sizeof(unsigned int);\n  } else if ((data_mode == bitsetData)) {\n    // serialize bitset\n    memcpy(send_buffer + offset, &num_shared, sizeof(num_shared));\n    offset += sizeof(num_shared);\n    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();\n    memcpy(send_buffer + offset, &vec_size, sizeof(vec_size));\n    offset += sizeof(vec_size);\n    ctx->is_updated.cpu_rd_ptr()->copy_to_cpu(\n        
(uint64_t*)(send_buffer + offset));\n    offset += vec_size * sizeof(uint64_t);\n  }\n\n  // serialize data vector\n  memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));\n  offset += sizeof(bit_set_count);\n  shared_data->copy_to_cpu((DataType*)(send_buffer + offset), bit_set_count);\n  // offset += bit_set_count * sizeof(DataType);\n}\n\ntemplate <typename DataType, SharedType sharedType, bool reset>\nvoid batch_get_shared_edge(struct CUDA_Context_Common_Edges* ctx,\n                           struct CUDA_Context_Field_Edges<DataType>* field,\n                           unsigned from_id, uint8_t* send_buffer,\n                           size_t* v_size, DataCommMode* data_mode,\n                           DataType i = 0) {\n  struct CUDA_Context_Shared_Edges* shared;\n  if (sharedType == sharedMaster) {\n    shared = &ctx->master;\n  } else { // sharedMirror\n    shared = &ctx->mirror;\n  }\n  DeviceOnly<DataType>* shared_data = &field->shared_data;\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n\n  // ggc::Timer timer(\"timer\"), timer1(\"timer1\"), timer2(\"timer2\"),\n  // timer3(\"timer3\"), timer4(\"timer 4\"); timer.start();\n  if (enforcedDataMode != onlyData) {\n    // timer1.start();\n    ctx->is_updated.cpu_rd_ptr()->resize(shared->num_edges[from_id]);\n    ctx->is_updated.cpu_rd_ptr()->reset();\n    batch_get_subset_bitset<<<blocks, threads>>>(\n        shared->num_edges[from_id], shared->edges[from_id].device_ptr(),\n        ctx->is_updated.gpu_rd_ptr(), field->is_updated.gpu_rd_ptr());\n    check_cuda_kernel;\n    // timer1.stop();\n    // timer2.start();\n    get_offsets_from_bitset(shared->num_edges[from_id],\n                            ctx->offsets.device_ptr(),\n                            ctx->is_updated.gpu_rd_ptr(), v_size);\n    // timer2.stop();\n  }\n  *data_mode = get_data_mode<DataType>(*v_size, shared->num_edges[from_id]);\n  // timer3.start();\n  if ((*data_mode) == onlyData) {\n    *v_size = 
shared->num_edges[from_id];\n    if (reset) {\n      batch_get_reset_subset<DataType><<<blocks, threads>>>(\n          *v_size, shared->edges[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_wr_ptr(), i);\n    } else {\n      batch_get_subset<DataType><<<blocks, threads>>>(\n          *v_size, shared->edges[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_rd_ptr());\n    }\n  } else { // bitsetData || offsetsData\n    if (reset) {\n      batch_get_reset_subset<DataType><<<blocks, threads>>>(\n          *v_size, shared->edges[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), i);\n    } else {\n      batch_get_subset<DataType><<<blocks, threads>>>(\n          *v_size, shared->edges[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_rd_ptr());\n    }\n  }\n  check_cuda_kernel;\n  // timer3.stop();\n  // timer4.start();\n  serializeMessage(ctx, *data_mode, *v_size, shared->num_edges[from_id],\n                   shared_data, send_buffer);\n  // timer4.stop();\n  // timer.stop();\n  // fprintf(stderr, \"Get %u->%u: %d mode %u bitset %u indices. 
Time (ms): %llu\n  // + %llu + %llu + %llu = %llu\\n\",\n  //  ctx->id, from_id, *data_mode,\n  //  ctx->is_updated.cpu_rd_ptr()->alloc_size(), sizeof(unsigned int) *\n  //  (*v_size), timer1.duration_ms(), timer2.duration_ms(),\n  //  timer3.duration_ms(), timer4.duration_ms(), timer.duration_ms());\n}\n\ntemplate <typename DataType>\nvoid deserializeMessage(struct CUDA_Context_Common_Edges* ctx,\n                        DataCommMode data_mode, size_t& bit_set_count,\n                        size_t num_shared, DeviceOnly<DataType>* shared_data,\n                        uint8_t* recv_buffer) {\n  size_t offset = 0; // data_mode is already deserialized\n\n  if (data_mode != onlyData) {\n    // deserialize bit_set_count\n    memcpy(&bit_set_count, recv_buffer + offset, sizeof(bit_set_count));\n    offset += sizeof(bit_set_count);\n  } else {\n    bit_set_count = num_shared;\n  }\n\n  assert(data_mode != gidsData); // not supported for deserialization on GPUs\n  if (data_mode == offsetsData) {\n    // deserialize offsets vector\n    offset += sizeof(bit_set_count);\n    ctx->offsets.copy_to_gpu((unsigned int*)(recv_buffer + offset),\n                             bit_set_count);\n    offset += bit_set_count * sizeof(unsigned int);\n  } else if ((data_mode == bitsetData)) {\n    // deserialize bitset\n    ctx->is_updated.cpu_rd_ptr()->resize(num_shared);\n    offset += sizeof(num_shared);\n    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();\n    offset += sizeof(vec_size);\n    ctx->is_updated.cpu_rd_ptr()->copy_to_gpu(\n        (uint64_t*)(recv_buffer + offset));\n    offset += vec_size * sizeof(uint64_t);\n    // get offsets\n    size_t v_size;\n    get_offsets_from_bitset(num_shared, ctx->offsets.device_ptr(),\n                            ctx->is_updated.gpu_rd_ptr(), &v_size);\n\n    assert(bit_set_count == v_size);\n  }\n\n  // deserialize data vector\n  offset += sizeof(bit_set_count);\n  shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), 
bit_set_count);\n  // offset += bit_set_count * sizeof(DataType);\n}\n\ntemplate <typename DataType, SharedType sharedType, UpdateOp op>\nvoid batch_set_shared_edge(struct CUDA_Context_Common_Edges* ctx,\n                           struct CUDA_Context_Field_Edges<DataType>* field,\n                           unsigned from_id, uint8_t* recv_buffer,\n                           DataCommMode data_mode) {\n  assert(data_mode != noData);\n  struct CUDA_Context_Shared_Edges* shared;\n  if (sharedType == sharedMaster) {\n    shared = &ctx->master;\n  } else { // sharedMirror\n    shared = &ctx->mirror;\n  }\n  DeviceOnly<DataType>* shared_data = &field->shared_data;\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n  size_t v_size;\n\n  // ggc::Timer timer(\"timer\"), timer1(\"timer1\"), timer2(\"timer2\");\n  // timer.start();\n  // timer1.start();\n  deserializeMessage(ctx, data_mode, v_size, shared->num_edges[from_id],\n                     shared_data, recv_buffer);\n  // timer1.stop();\n  // timer2.start();\n  if (data_mode == onlyData) {\n    if (op == setOp) {\n      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->edges[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_wr_ptr(),\n          field->is_updated.gpu_wr_ptr());\n    } else if (op == addOp) {\n      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->edges[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_wr_ptr(),\n          field->is_updated.gpu_wr_ptr());\n    } else if (op == minOp) {\n      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->edges[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_wr_ptr(),\n          field->is_updated.gpu_wr_ptr());\n    }\n  } else if (data_mode == gidsData) {\n    if (op == setOp) {\n      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(\n        
  v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    } else if (op == addOp) {\n      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    } else if (op == minOp) {\n      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    }\n  } else { // bitsetData || offsetsData\n    if (op == setOp) {\n      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->edges[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    } else if (op == addOp) {\n      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->edges[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    } else if (op == minOp) {\n      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->edges[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    }\n  }\n  check_cuda_kernel;\n  // timer2.stop();\n  // timer.stop();\n  // fprintf(stderr, \"Set %u<-%u: %d mode Time (ms): %llu + %llu = %llu\\n\",\n  //  ctx->id, from_id, data_mode,\n  //  timer1.duration_ms(), timer2.duration_ms(),\n  //  timer.duration_ms());\n}\n"
  },
  {
    "path": "libgluon/include/galois/runtime/cuda/DeviceSync.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n */\n\n/**\n * @file DeviceSync.h\n *\n * CUDA header for GPU runtime\n *\n * @todo better file description + document this file\n */\n#pragma once\n#include \"galois/cuda/DynamicBitset.h\"\n#include \"galois/cuda/Context.h\"\n#include \"galois/runtime/DataCommMode.h\"\n#include \"cub/util_allocator.cuh\"\n\n#ifdef GALOIS_CUDA_CHECK_ERROR\n#define check_cuda_kernel                                                      \\\n  check_cuda(cudaDeviceSynchronize());                                         \\\n  check_cuda(cudaGetLastError());\n#else\n#define check_cuda_kernel check_cuda(cudaGetLastError());\n#endif\n\nenum SharedType { sharedMaster, sharedMirror };\nenum UpdateOp { setOp, addOp, minOp };\n\nvoid kernel_sizing(dim3& blocks, dim3& threads) {\n  threads.x = 256;\n  threads.y = threads.z = 1;\n  blocks.x     
         = ggc_get_nSM() * 8;\n  blocks.y = blocks.z = 1;\n}\n\ntemplate <typename DataType>\n__global__ void batch_get_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 DataType* __restrict__ subset,\n                                 const DataType* __restrict__ array) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    subset[src]    = array[index];\n  }\n}\n\ntemplate <typename DataType, typename OffsetIteratorType>\n__global__ void batch_get_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 DataType* __restrict__ subset,\n                                 const DataType* __restrict__ array) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    subset[src]    = array[index];\n  }\n}\n\ntemplate <typename DataType>\n__global__ void batch_get_reset_subset(index_type subset_size,\n                                       const unsigned int* __restrict__ indices,\n                                       DataType* __restrict__ subset,\n                                       DataType* __restrict__ array,\n                                       DataType reset_value) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    subset[src]    = array[index];\n    array[index]   = reset_value;\n  }\n}\n\ntemplate <typename DataType, 
typename OffsetIteratorType>\n__global__ void batch_get_reset_subset(index_type subset_size,\n                                       const unsigned int* __restrict__ indices,\n                                       const OffsetIteratorType offsets,\n                                       DataType* __restrict__ subset,\n                                       DataType* __restrict__ array,\n                                       DataType reset_value) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    subset[src]    = array[index];\n    array[index]   = reset_value;\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType>\n__global__ void batch_set_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    array[index]   = subset[src];\n    if (sharedType != sharedMirror) {\n      is_array_updated->set(index);\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType, typename OffsetIteratorType>\n__global__ void batch_set_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ 
is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    array[index]   = subset[src];\n    if (sharedType != sharedMirror) {\n      is_array_updated->set(index);\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType>\n__global__ void batch_add_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    array[index] += subset[src];\n    if (sharedType != sharedMirror) {\n      is_array_updated->set(index);\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType, typename OffsetIteratorType>\n__global__ void batch_add_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    array[index] += subset[src];\n    if (sharedType != sharedMirror) {\n      is_array_updated->set(index);\n    }\n  }\n}\n\ntemplate <typename DataType, 
SharedType sharedType>\n__global__ void batch_min_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    if (array[index] > subset[src]) {\n      array[index] = subset[src];\n      if (sharedType != sharedMirror) {\n        is_array_updated->set(index);\n      }\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType, typename OffsetIteratorType>\n__global__ void batch_min_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    if (array[index] > subset[src]) {\n      array[index] = subset[src];\n      if (sharedType != sharedMirror) {\n        is_array_updated->set(index);\n      }\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType>\n__global__ void batch_max_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n   
                              DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    if (array[index] < subset[src]) {\n      array[index] = subset[src];\n      if (sharedType != sharedMirror) {\n        is_array_updated->set(index);\n      }\n    }\n  }\n}\n\ntemplate <typename DataType, SharedType sharedType, typename OffsetIteratorType>\n__global__ void batch_max_subset(index_type subset_size,\n                                 const unsigned int* __restrict__ indices,\n                                 const OffsetIteratorType offsets,\n                                 const DataType* __restrict__ subset,\n                                 DataType* __restrict__ array,\n                                 DynamicBitset* __restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[offsets[src]];\n    if (array[index] < subset[src]) {\n      array[index] = subset[src];\n      if (sharedType != sharedMirror) {\n        is_array_updated->set(index);\n      }\n    }\n  }\n}\n\ntemplate <typename DataType>\n__global__ void batch_reset(DataType* __restrict__ array, index_type begin,\n                            index_type end, DataType val) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = end;\n  for (index_type src = begin + tid; src < src_end; src += nthreads) {\n    array[src] = val;\n  }\n}\n\n__global__ void\nbatch_get_subset_bitset(index_type subset_size,\n                        const unsigned int* __restrict__ indices,\n                        DynamicBitset* __restrict__ is_subset_updated,\n                        DynamicBitset* 
__restrict__ is_array_updated) {\n  unsigned tid       = TID_1D;\n  unsigned nthreads  = TOTAL_THREADS_1D;\n  index_type src_end = subset_size;\n  for (index_type src = 0 + tid; src < src_end; src += nthreads) {\n    unsigned index = indices[src];\n    if (is_array_updated->test(index)) {\n      is_subset_updated->set(src);\n    }\n  }\n}\n\n// inclusive range\n__global__ void bitset_reset_range(DynamicBitset* __restrict__ bitset,\n                                   size_t vec_begin, size_t vec_end, bool test1,\n                                   size_t bit_index1, uint64_t mask1,\n                                   bool test2, size_t bit_index2,\n                                   uint64_t mask2) {\n  unsigned tid      = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  for (size_t src = vec_begin + tid; src < vec_end; src += nthreads) {\n    bitset->batch_reset(src);\n  }\n\n  if (tid == 0) {\n    if (test1) {\n      bitset->batch_bitwise_and(bit_index1, mask1);\n    }\n    if (test2) {\n      bitset->batch_bitwise_and(bit_index2, mask2);\n    }\n  }\n}\n\ntemplate <typename DataType>\nvoid reset_bitset_field(struct CUDA_Context_Field<DataType>* field,\n                        size_t begin, size_t end) {\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n  const DynamicBitset* bitset_cpu = field->is_updated.cpu_rd_ptr();\n  assert(begin <= (bitset_cpu->size() - 1));\n  assert(end <= (bitset_cpu->size() - 1));\n\n  size_t vec_begin = (begin + 63) / 64;\n  size_t vec_end;\n\n  if (end == (bitset_cpu->size() - 1))\n    vec_end = bitset_cpu->vec_size();\n  else\n    vec_end = (end + 1) / 64; // floor\n\n  size_t begin2 = vec_begin * 64;\n  size_t end2   = vec_end * 64;\n\n  bool test1;\n  size_t bit_index1;\n  uint64_t mask1;\n\n  bool test2;\n  size_t bit_index2;\n  uint64_t mask2;\n\n  if (begin2 > end2) {\n    test2 = false;\n\n    if (begin < begin2) {\n      test1       = true;\n      bit_index1  = begin / 64;\n      size_t diff = begin2 - 
begin;\n      assert(diff < 64);\n      mask1 = ((uint64_t)1 << (64 - diff)) - 1;\n\n      // create or mask\n      size_t diff2 = end - end2 + 1;\n      assert(diff2 < 64);\n      mask2 = ~(((uint64_t)1 << diff2) - 1);\n      mask1 |= ~mask2;\n    } else {\n      test1 = false;\n    }\n  } else {\n    if (begin < begin2) {\n      test1       = true;\n      bit_index1  = begin / 64;\n      size_t diff = begin2 - begin;\n      assert(diff < 64);\n      mask1 = ((uint64_t)1 << (64 - diff)) - 1;\n    } else {\n      test1 = false;\n    }\n\n    if (end >= end2) {\n      test2       = true;\n      bit_index2  = end / 64;\n      size_t diff = end - end2 + 1;\n      assert(diff < 64);\n      mask2 = ~(((uint64_t)1 << diff) - 1);\n    } else {\n      test2 = false;\n    }\n  }\n\n  bitset_reset_range<<<blocks, threads>>>(field->is_updated.gpu_rd_ptr(),\n                                          vec_begin, vec_end, test1, bit_index1,\n                                          mask1, test2, bit_index2, mask2);\n}\n\ntemplate <typename DataType>\nvoid reset_data_field(struct CUDA_Context_Field<DataType>* field, size_t begin,\n                      size_t end, DataType val) {\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n\n  batch_reset<DataType><<<blocks, threads>>>(\n      field->data.gpu_wr_ptr(), (index_type)begin, (index_type)end, val);\n}\n\nvoid get_offsets_from_bitset(index_type bitset_size,\n                             unsigned int* __restrict__ offsets,\n                             DynamicBitset* __restrict__ bitset,\n                             size_t* __restrict__ num_set_bits) {\n  cub::CachingDeviceAllocator g_allocator(\n      true); // Caching allocator for device memory\n  DynamicBitsetIterator flag_iterator(bitset);\n  IdentityIterator offset_iterator;\n  Shared<size_t> num_set_bits_ptr;\n  num_set_bits_ptr.alloc(1);\n  void* d_temp_storage      = NULL;\n  size_t temp_storage_bytes = 0;\n  cub::DeviceSelect::Flagged(d_temp_storage, 
temp_storage_bytes,\n                             offset_iterator, flag_iterator, offsets,\n                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);\n  check_cuda_kernel;\n  CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));\n  // CUDA_SAFE_CALL(cudaMalloc(&d_temp_storage, temp_storage_bytes));\n  cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,\n                             offset_iterator, flag_iterator, offsets,\n                             num_set_bits_ptr.gpu_wr_ptr(true), bitset_size);\n  check_cuda_kernel;\n  // CUDA_SAFE_CALL(cudaFree(d_temp_storage));\n  if (d_temp_storage)\n    CubDebugExit(g_allocator.DeviceFree(d_temp_storage));\n  *num_set_bits = *num_set_bits_ptr.cpu_rd_ptr();\n}\n\ntemplate <typename DataType, SharedType sharedType, bool reset>\nvoid batch_get_shared_field(struct CUDA_Context_Common* ctx,\n                            struct CUDA_Context_Field<DataType>* field,\n                            unsigned from_id, uint8_t* send_buffer,\n                            DataType i = 0) {\n  struct CUDA_Context_Shared* shared;\n  if (sharedType == sharedMaster) {\n    shared = &ctx->master;\n  } else { // sharedMirror\n    shared = &ctx->mirror;\n  }\n  DeviceOnly<DataType>* shared_data = &field->shared_data;\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n\n  // ggc::Timer timer(\"timer\"), timer1(\"timer1\"), timer2(\"timer2\");\n  // timer.start();\n  // timer1.start();\n  size_t v_size = shared->num_nodes[from_id];\n  if (reset) {\n    batch_get_reset_subset<DataType><<<blocks, threads>>>(\n        v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(),\n        field->data.gpu_wr_ptr(), i);\n  } else {\n    batch_get_subset<DataType><<<blocks, threads>>>(\n        v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(),\n        field->data.gpu_rd_ptr());\n  }\n  check_cuda_kernel;\n  // timer1.stop();\n  // timer2.start();\n  
DataCommMode data_mode = onlyData;\n  memcpy(send_buffer, &data_mode, sizeof(data_mode));\n  memcpy(send_buffer + sizeof(data_mode), &v_size, sizeof(v_size));\n  shared_data->copy_to_cpu(\n      (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), v_size);\n  // timer2.stop();\n  // timer.stop();\n  // fprintf(stderr, \"Get %u->%u: Time (ms): %llu + %llu = %llu\\n\",\n  //  ctx->id, from_id,\n  //  timer1.duration_ms(), timer2.duration_ms(),\n  //  timer.duration_ms());\n}\n\ntemplate <typename DataType>\nvoid serializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode,\n                      size_t bit_set_count, size_t num_shared,\n                      DeviceOnly<DataType>* shared_data, uint8_t* send_buffer) {\n  if (data_mode == noData) {\n    // do nothing\n    return;\n  }\n\n  size_t offset = 0;\n\n  // serialize data_mode\n  memcpy(send_buffer, &data_mode, sizeof(data_mode));\n  offset += sizeof(data_mode);\n\n  if (data_mode != onlyData) {\n    // serialize bit_set_count\n    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));\n    offset += sizeof(bit_set_count);\n  }\n\n  if ((data_mode == gidsData) || (data_mode == offsetsData)) {\n    // serialize offsets vector\n    memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));\n    offset += sizeof(bit_set_count);\n    ctx->offsets.copy_to_cpu((unsigned int*)(send_buffer + offset),\n                             bit_set_count);\n    offset += bit_set_count * sizeof(unsigned int);\n  } else if ((data_mode == bitsetData)) {\n    // serialize bitset\n    memcpy(send_buffer + offset, &num_shared, sizeof(num_shared));\n    offset += sizeof(num_shared);\n    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();\n    memcpy(send_buffer + offset, &vec_size, sizeof(vec_size));\n    offset += sizeof(vec_size);\n    ctx->is_updated.cpu_rd_ptr()->copy_to_cpu(\n        (uint64_t*)(send_buffer + offset));\n    offset += vec_size * sizeof(uint64_t);\n  }\n\n  // 
serialize data vector\n  memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count));\n  offset += sizeof(bit_set_count);\n  shared_data->copy_to_cpu((DataType*)(send_buffer + offset), bit_set_count);\n  // offset += bit_set_count * sizeof(DataType);\n}\n\ntemplate <typename DataType, SharedType sharedType, bool reset>\nvoid batch_get_shared_field(struct CUDA_Context_Common* ctx,\n                            struct CUDA_Context_Field<DataType>* field,\n                            unsigned from_id, uint8_t* send_buffer,\n                            size_t* v_size, DataCommMode* data_mode,\n                            DataType i = 0) {\n  struct CUDA_Context_Shared* shared;\n  if (sharedType == sharedMaster) {\n    shared = &ctx->master;\n  } else { // sharedMirror\n    shared = &ctx->mirror;\n  }\n  DeviceOnly<DataType>* shared_data = &field->shared_data;\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n\n  // ggc::Timer timer(\"timer\"), timer1(\"timer1\"), timer2(\"timer2\"),\n  // timer3(\"timer3\"), timer4(\"timer 4\"); timer.start();\n  if (enforcedDataMode != onlyData) {\n    // timer1.start();\n    ctx->is_updated.cpu_rd_ptr()->resize(shared->num_nodes[from_id]);\n    ctx->is_updated.cpu_rd_ptr()->reset();\n    batch_get_subset_bitset<<<blocks, threads>>>(\n        shared->num_nodes[from_id], shared->nodes[from_id].device_ptr(),\n        ctx->is_updated.gpu_rd_ptr(), field->is_updated.gpu_rd_ptr());\n    check_cuda_kernel;\n    // timer1.stop();\n    // timer2.start();\n    get_offsets_from_bitset(shared->num_nodes[from_id],\n                            ctx->offsets.device_ptr(),\n                            ctx->is_updated.gpu_rd_ptr(), v_size);\n    // timer2.stop();\n  }\n  *data_mode = get_data_mode<DataType>(*v_size, shared->num_nodes[from_id]);\n  // timer3.start();\n  if ((*data_mode) == onlyData) {\n    *v_size = shared->num_nodes[from_id];\n    if (reset) {\n      batch_get_reset_subset<DataType><<<blocks, threads>>>(\n    
      *v_size, shared->nodes[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_wr_ptr(), i);\n    } else {\n      batch_get_subset<DataType><<<blocks, threads>>>(\n          *v_size, shared->nodes[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_rd_ptr());\n    }\n  } else { // bitsetData || offsetsData\n    if (reset) {\n      batch_get_reset_subset<DataType><<<blocks, threads>>>(\n          *v_size, shared->nodes[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), i);\n    } else {\n      batch_get_subset<DataType><<<blocks, threads>>>(\n          *v_size, shared->nodes[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_rd_ptr());\n    }\n  }\n  check_cuda_kernel;\n  // timer3.stop();\n  // timer4.start();\n  serializeMessage(ctx, *data_mode, *v_size, shared->num_nodes[from_id],\n                   shared_data, send_buffer);\n  // timer4.stop();\n  // timer.stop();\n  // fprintf(stderr, \"Get %u->%u: %d mode %u bitset %u indices. 
Time (ms): %llu\n  // + %llu + %llu + %llu = %llu\\n\",\n  //  ctx->id, from_id, *data_mode,\n  //  ctx->is_updated.cpu_rd_ptr()->alloc_size(), sizeof(unsigned int) *\n  //  (*v_size), timer1.duration_ms(), timer2.duration_ms(),\n  //  timer3.duration_ms(), timer4.duration_ms(), timer.duration_ms());\n}\n\ntemplate <typename DataType>\nvoid deserializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode,\n                        size_t& bit_set_count, size_t num_shared,\n                        DeviceOnly<DataType>* shared_data,\n                        uint8_t* recv_buffer) {\n  size_t offset = 0; // data_mode is already deserialized\n\n  if (data_mode != onlyData) {\n    // deserialize bit_set_count\n    memcpy(&bit_set_count, recv_buffer + offset, sizeof(bit_set_count));\n    offset += sizeof(bit_set_count);\n  } else {\n    bit_set_count = num_shared;\n  }\n\n  assert(data_mode != gidsData); // not supported for deserialization on GPUs\n  if (data_mode == offsetsData) {\n    // deserialize offsets vector\n    offset += sizeof(bit_set_count);\n    ctx->offsets.copy_to_gpu((unsigned int*)(recv_buffer + offset),\n                             bit_set_count);\n    offset += bit_set_count * sizeof(unsigned int);\n  } else if ((data_mode == bitsetData)) {\n    // deserialize bitset\n    ctx->is_updated.cpu_rd_ptr()->resize(num_shared);\n    offset += sizeof(num_shared);\n    size_t vec_size = ctx->is_updated.cpu_rd_ptr()->vec_size();\n    offset += sizeof(vec_size);\n    ctx->is_updated.cpu_rd_ptr()->copy_to_gpu(\n        (uint64_t*)(recv_buffer + offset));\n    offset += vec_size * sizeof(uint64_t);\n    // get offsets\n    size_t v_size;\n    get_offsets_from_bitset(num_shared, ctx->offsets.device_ptr(),\n                            ctx->is_updated.gpu_rd_ptr(), &v_size);\n\n    assert(bit_set_count == v_size);\n  }\n\n  // deserialize data vector\n  offset += sizeof(bit_set_count);\n  shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), 
bit_set_count);\n  // offset += bit_set_count * sizeof(DataType);\n}\n\ntemplate <typename DataType, SharedType sharedType, UpdateOp op>\nvoid batch_set_shared_field(struct CUDA_Context_Common* ctx,\n                            struct CUDA_Context_Field<DataType>* field,\n                            unsigned from_id, uint8_t* recv_buffer,\n                            DataCommMode data_mode) {\n  assert(data_mode != noData);\n  struct CUDA_Context_Shared* shared;\n  if (sharedType == sharedMaster) {\n    shared = &ctx->master;\n  } else { // sharedMirror\n    shared = &ctx->mirror;\n  }\n  DeviceOnly<DataType>* shared_data = &field->shared_data;\n  dim3 blocks;\n  dim3 threads;\n  kernel_sizing(blocks, threads);\n  size_t v_size;\n\n  // ggc::Timer timer(\"timer\"), timer1(\"timer1\"), timer2(\"timer2\");\n  // timer.start();\n  // timer1.start();\n  deserializeMessage(ctx, data_mode, v_size, shared->num_nodes[from_id],\n                     shared_data, recv_buffer);\n  // timer1.stop();\n  // timer2.start();\n  if (data_mode == onlyData) {\n    if (op == setOp) {\n      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->nodes[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_wr_ptr(),\n          field->is_updated.gpu_wr_ptr());\n    } else if (op == addOp) {\n      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->nodes[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_wr_ptr(),\n          field->is_updated.gpu_wr_ptr());\n    } else if (op == minOp) {\n      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->nodes[from_id].device_ptr(),\n          shared_data->device_ptr(), field->data.gpu_wr_ptr(),\n          field->is_updated.gpu_wr_ptr());\n    }\n  } else if (data_mode == gidsData) {\n    if (op == setOp) {\n      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, 
ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    } else if (op == addOp) {\n      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    } else if (op == minOp) {\n      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    }\n  } else { // bitsetData || offsetsData\n    if (op == setOp) {\n      batch_set_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->nodes[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    } else if (op == addOp) {\n      batch_add_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->nodes[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    } else if (op == minOp) {\n      batch_min_subset<DataType, sharedType><<<blocks, threads>>>(\n          v_size, shared->nodes[from_id].device_ptr(),\n          ctx->offsets.device_ptr(), shared_data->device_ptr(),\n          field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr());\n    }\n  }\n  check_cuda_kernel;\n  // timer2.stop();\n  // timer.stop();\n  // fprintf(stderr, \"Set %u<-%u: %d mode Time (ms): %llu + %llu = %llu\\n\",\n  //  ctx->id, from_id, data_mode,\n  //  timer1.duration_ms(), timer2.duration_ms(),\n  //  timer.duration_ms());\n}\n"
  },
  {
    "path": "libgluon/src/GlobalObj.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file GlobalObj.cpp\n *\n * Defines GlobalObject's static vector that tracks all GlobalObjects\n * and the ptrForObj function.\n */\n\n#include \"galois/runtime/GlobalObj.h\"\n\n#include <cassert>\n\n//! Static vector that tracks all GlobalObjects; ptrForObj indexes it by oid.\nstd::vector<uintptr_t> galois::runtime::GlobalObject::allobjs;\n\n/**\n * Returns the entry of allobjs stored at position oid.\n *\n * @param oid index into allobjs; must be smaller than allobjs.size()\n * @returns the uintptr_t recorded for that index\n *\n * The bounds check is an assert, so it is compiled out when NDEBUG is\n * defined and an out-of-range oid is then undefined behavior.\n */\nuintptr_t galois::runtime::GlobalObject::ptrForObj(unsigned oid) {\n  assert(oid < allobjs.size());\n  return allobjs[oid];\n}\n"
  },
  {
    "path": "libgluon/src/GluonSubstrate.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file GluonSubstrate.cpp\n * Contains the enforced datamode global for use by GPUs.\n *\n * TODO get rid of this file/global.\n */\n\n#include \"galois/graphs/GluonSubstrate.h\"\n\nDataCommMode enforcedDataMode = DataCommMode::noData;\n\n#ifdef GALOIS_USE_BARE_MPI\n//! bare_mpi type to use; see options in runtime/BareMPI.h\n// BareMPI bare_mpi = BareMPI::noBareMPI;\nBareMPI bare_mpi = BareMPI::nonBlockingBareMPI;\n#endif\n"
  },
  {
    "path": "libgluon/src/SyncStructures.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file SyncStructures.cpp\n *\n * Contains implementations of the bitvector status setter/getter functions\n */\n\n#include <galois/runtime/SyncStructures.h>\n\nusing namespace galois::runtime; // for easy access to BITVECTOR_STATUS\n\n//! @returns true when the flag is SRC_INVALID or BOTH_INVALID, i.e. the\n//! source side is marked invalid; false for every other value\nbool galois::runtime::src_invalid(BITVECTOR_STATUS bv_flag) {\n  switch (bv_flag) {\n  case BITVECTOR_STATUS::SRC_INVALID:\n  case BITVECTOR_STATUS::BOTH_INVALID:\n    return true;\n  default:\n    return false;\n  }\n}\n\n//! @returns true when the flag is DST_INVALID or BOTH_INVALID, i.e. the\n//! destination side is marked invalid; false for every other value\nbool galois::runtime::dst_invalid(BITVECTOR_STATUS bv_flag) {\n  switch (bv_flag) {\n  case BITVECTOR_STATUS::DST_INVALID:\n  case BITVECTOR_STATUS::BOTH_INVALID:\n    return true;\n  default:\n    return false;\n  }\n}\n\n//! Marks the source side invalid in place: NONE_INVALID becomes SRC_INVALID\n//! and DST_INVALID becomes BOTH_INVALID; any other value is left unchanged.\nvoid galois::runtime::make_src_invalid(BITVECTOR_STATUS* bv_flag) {\n  BITVECTOR_STATUS& status = *bv_flag;\n  if (status == BITVECTOR_STATUS::NONE_INVALID) {\n    status = BITVECTOR_STATUS::SRC_INVALID;\n  } else if (status == BITVECTOR_STATUS::DST_INVALID) {\n    status = BITVECTOR_STATUS::BOTH_INVALID;\n  }\n  // SRC_INVALID and BOTH_INVALID already include the source side.\n}\n\n//! Marks the destination side invalid in place: NONE_INVALID becomes\n//! DST_INVALID and SRC_INVALID becomes BOTH_INVALID; any other value is left\n//! unchanged.\nvoid galois::runtime::make_dst_invalid(BITVECTOR_STATUS* bv_flag) {\n  BITVECTOR_STATUS& status = *bv_flag;\n  if (status == BITVECTOR_STATUS::NONE_INVALID) {\n    status = BITVECTOR_STATUS::DST_INVALID;\n  } else if (status == BITVECTOR_STATUS::SRC_INVALID) {\n    status = BITVECTOR_STATUS::BOTH_INVALID;\n  }\n  // DST_INVALID and BOTH_INVALID already include the destination side.\n}\n"
  },
  {
    "path": "libgluon/src/cuda_device.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n */\n\n/**\n * @file cuda_device.cpp\n *\n * Contains implementation for function that gets gpu device ID.\n */\n#include \"galois/cuda/HostDecls.h\"\n#include \"galois/Galois.h\"\n#include \"galois/runtime/Network.h\"\n#include <algorithm>\n#include <cassert>\n#include <string>\n\n/**\n * Computes a GPU device index for the calling host.\n *\n * The function counts 'g' characters in personality_set (presumably one\n * character per host on a node, with 'g' marking a GPU host). The result is\n * the number of 'g' entries in front of this host's slot, where the slot is\n * net.ID modulo the number of hosts per node, wrapped by the total number of\n * 'g' entries.\n *\n * @param personality_set string whose length must equal the number of hosts\n * per node\n * @param num_nodes number of nodes; must be positive and divide the number\n * of hosts evenly\n * @returns the device index, or -1 when personality_set contains no 'g'\n */\nint get_gpu_device_id(std::string personality_set, int num_nodes) {\n  auto& net          = galois::runtime::getSystemNetworkInterface();\n  unsigned host_id   = net.ID;\n  unsigned num_hosts = net.Num;\n  assert(num_nodes > 0);\n  unsigned num_hosts_per_node = num_hosts / num_nodes;\n  assert((num_hosts % num_nodes) == 0);\n  assert(personality_set.length() == num_hosts_per_node);\n  unsigned num_gpus_per_node =\n      std::count(personality_set.begin(), personality_set.end(), 'g');\n  if (num_gpus_per_node == 0) {\n    // The modulo at the end would divide by zero when there is no 'g' entry.\n    // NOTE(review): -1 is returned as a no-device result; confirm that the\n    // callers treat it that way.\n    return -1;\n  }\n  unsigned num_gpus_before =\n      std::count(personality_set.begin(),\n                 personality_set.begin() + (host_id % num_hosts_per_node), 'g');\n  return (num_gpus_before % num_gpus_per_node);\n}\n"
  },
  {
    "path": "libgpu/CMakeLists.txt",
    "content": "add_library(galois_gpu)\nadd_library(Galois::gpu ALIAS galois_gpu)\nset_target_properties(galois_gpu PROPERTIES EXPORT_NAME gpu)\nadd_dependencies(lib galois_gpu)\n\n#target_link_libraries(galois_gpu ${CUDA_cudadevrt_LIBRARY})\n\ntarget_sources(galois_gpu PRIVATE\n  src/csr_graph.cu\n  src/ggc_rt.cu\n  src/skelapp/skel.cu\n)\n\ntarget_include_directories(galois_gpu PUBLIC\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  $<INSTALL_INTERFACE:include/galois/gpu>\n)\n# PATTERN is matched against the complete file name, so every extension needs\n# its leading wildcard (a bare .cuh would only match a file named .cuh).\ninstall(\n  DIRECTORY include/\n  DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}/galois/gpu\"\n  COMPONENT dev\n  FILES_MATCHING PATTERN \"*.h\" PATTERN \"*.hpp\" PATTERN \"*.cuh\"\n)\n\nif(NOT EXISTS \"${PROJECT_SOURCE_DIR}/external/moderngpu\")\n  message(FATAL_ERROR \"Cannot find the moderngpu Git submodule. Please run `git submodule update --init --recursive`\")\nendif()\n\ntarget_include_directories(galois_gpu PUBLIC\n  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/external/moderngpu/src>\n  $<INSTALL_INTERFACE:include/galois/gpu/moderngpu/src>\n)\n# The trailing slash installs the directory contents directly into the\n# destination; without it CMake appends the last path component and the tree\n# would land in .../moderngpu/moderngpu, which does not match the\n# INSTALL_INTERFACE path above.\ninstall(\n  DIRECTORY ${PROJECT_SOURCE_DIR}/external/moderngpu/\n  DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}/galois/gpu/moderngpu\"\n  COMPONENT dev\n)\n\nif(NOT EXISTS \"${PROJECT_SOURCE_DIR}/external/cub\")\n  message(FATAL_ERROR \"Cannot find the cub Git submodule. Please run `git submodule update --init --recursive`\")\nendif()\n\ntarget_include_directories(galois_gpu PUBLIC\n  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/external/cub>\n  $<INSTALL_INTERFACE:include/galois/gpu/cub>\n)\n# Trailing slash for the same reason as the moderngpu install above.\ninstall(\n  DIRECTORY ${PROJECT_SOURCE_DIR}/external/cub/\n  DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}/galois/gpu/cub\"\n  COMPONENT dev\n)\ntarget_compile_definitions(galois_gpu PRIVATE _FORCE_INLINES)\ntarget_compile_options(galois_gpu PUBLIC \"$<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>\")\nset_property(TARGET galois_gpu PROPERTY CUDA_STANDARD 14)\n\ninstall(TARGETS galois_gpu\n  EXPORT GaloisTargets\n  LIBRARY\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT shlib\n  ARCHIVE\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT lib\n  INCLUDES DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n)\n"
  },
  {
    "path": "libgpu/include/Timer.h",
    "content": "#pragma once\n/*\n   Timer.h\n\n   Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#include <time.h>\n#include <unistd.h>\n#include <errno.h>\n#include <assert.h>\n\n#if !(_POSIX_TIMERS > 0)\n#error \"POSIX timers not available\"\n#endif\n\n#ifdef _POSIX_MONOTONIC_CLOCK\n#ifdef CLOCK_MONOTONIC_RAW\nstatic clockid_t CLOCKTYPE    = CLOCK_MONOTONIC_RAW;\nstatic const char* SCLOCKTYPE = \"CLOCK_MONOTONIC_RAW\";\n#else\nstatic clockid_t CLOCKTYPE = CLOCK_MONOTONIC static const char* SCLOCKTYPE =\n    \"CLOCK_MONOTONIC\";\n#endif /* CLOCK_MONOTONIC_RAW */\n#else\n#warning \"CLOCK_MONOTONIC is unavailable, using CLOCK_REALTIME\"\nstatic clockid CLOCKTYPE      = CLOCK_REALTIME;\nstatic const char* SCLOCKTYPE = \"CLOCK_REALTIME\";\n#endif /* _POSIX_MONOTONIC_CLOCK */\n\n#define NANOSEC 1000000000LL\n\nnamespace ggc {\nclass Timer {\n  char const* name;\n  struct timespec begin, end;\n  bool active, valid;\n  unsigned long long last;\n  unsigned long long total;\n\npublic:\n  Timer(const char* timer_name) {\n    name   = timer_name;\n    active = false;\n    valid  = false;\n    total  = 0;\n  }\n\n  unsigned long long normalize(const struct timespec& t) const {\n    return t.tv_sec * NANOSEC + t.tv_nsec;\n  }\n\n  void reset() {\n    assert(!active);\n    total = 0;\n    last  = 0;\n  }\n\n  void start() {\n    assert(!active);\n    active = true;\n    valid  = false;\n    if (clock_gettime(CLOCKTYPE, &begin) == -1) {\n      if (errno == EINVAL) {\n        fprintf(stderr, \"%s (%d) not available.\\n\", SCLOCKTYPE, CLOCKTYPE);\n        // exit?\n      }\n    }\n  }\n\n  void print() {\n    printf(\"%s %llu %llu\\n\", name, normalize(begin), normalize(end));\n  }\n  void stop() {\n    assert(active);\n\n    if (clock_gettime(CLOCKTYPE, &end) == -1) {\n      if (errno == EINVAL) {\n        fprintf(stderr, \"%s 
(%d) not available.\\n\", SCLOCKTYPE, CLOCKTYPE);\n        // exit?\n      }\n    }\n\n    // assert(normalize(end) > normalize(begin) // paranoid level 2\n\n    last = normalize(end) - normalize(begin);\n    total += last;\n    active = false;\n    valid  = true;\n  }\n\n  unsigned long long duration() const { return last; }\n\n  unsigned long long duration_ms() const { return last * 1000 / NANOSEC; }\n\n  unsigned long long duration_s() const { return last / NANOSEC; }\n\n  unsigned long long total_duration() const { return total; }\n\n  unsigned long long total_duration_ms() const {\n    return total * 1000 / NANOSEC;\n  }\n\n  unsigned long long total_duration_s() const { return total / NANOSEC; }\n};\n} // namespace ggc\n\n#if 0\n__attribute__((constructor)) static void upgrade_timer(void) {\n  struct timespec res;\n  \n// see if CLOCK_MONOTONIC_RAW is available at runtime\n#if defined(_POSIX_MONOTONIC_CLOCK) && defined(__linux__)\n    if(CLOCKTYPE == CLOCK_MONOTONIC) {\n      int rv;\n      clockid_t clockid;\n\n#ifdef CLOCK_MONOTONIC_RAW\n      clockid = CLOCK_MONOTONIC_RAW;\n#else\n      clockid = 4; // from bits/time.h\n#endif\n\n      rv = clock_getres(clockid, &res);\n      if(rv == 0) {\n\t//fprintf(stderr, \"Using CLOCK_MONOTONIC_RAW for Timer.\\n\");\n\tCLOCKTYPE = clockid;\n\tSCLOCKTYPE = \"CLOCK_MONOTONIC_RAW\";\n      }\n    }\n#endif\n}\n#endif\n"
  },
  {
    "path": "libgpu/include/abitset.h",
    "content": "#pragma once\n/*\n   abitset.h\n\n   Implements ApproxBitset. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\ntemplate <typename T>\nstruct Base {};\n\ntemplate <>\nstruct Base<unsigned int> {\n  enum {\n    BITS     = 32,\n    LOG_BITS = 5,\n  };\n};\n\ntemplate <>\nstruct Base<unsigned char> {\n  enum {\n    BITS     = 8,\n    LOG_BITS = 3,\n  };\n};\n\ntemplate <typename T>\nclass ApproxBitset {\n  int nbits;\n  Shared<T> bitset;\n  cudaTextureObject_t btx;\n  static const unsigned int bits_per_base = Base<T>::BITS,\n                            divby         = Base<T>::LOG_BITS,\n                            modby         = (1 << Base<T>::LOG_BITS) - 1;\n  T* bitarray;\n\npublic:\n  int size;\n\n  ApproxBitset() { nbits = 0; }\n\n  ApproxBitset(size_t nbits) {\n    this->nbits = nbits;\n    // bits_per_base = sizeof(unsigned int) * 8;\n    size = (nbits + bits_per_base - 1) / bits_per_base;\n\n    // int mask = bits_per_base, count = 0;\n    // while(!(mask & 1)) { mask >>=1; count++; }\n\n    // divby = count;\n    // modby = (1 << divby) - 1;\n\n    // printf(\"%d: %d divby: %d, modby = %d\\n\", count, bits_per_base, divby,\n    // modby);\n\n    bitset.alloc(size);\n    bitset.zero_gpu();\n    bitarray = bitset.gpu_wr_ptr();\n\n    cudaResourceDesc resDesc;\n\n    memset(&resDesc, 0, sizeof(resDesc));\n    resDesc.resType           = cudaResourceTypeLinear;\n    resDesc.res.linear.desc.f = cudaChannelFormatKindUnsigned;\n    resDesc.res.linear.desc.x = Base<T>::BITS; // bits per channel\n\n    cudaTextureDesc texDesc;\n    memset(&texDesc, 0, sizeof(texDesc));\n    texDesc.readMode = cudaReadModeElementType;\n\n    resDesc.res.linear.devPtr      = bitarray;\n    resDesc.res.linear.sizeInBytes = size;\n    check_cuda(cudaCreateTextureObject(&btx, &resDesc, &texDesc, NULL));\n  }\n\n  __device__ 
void set(int pos) {\n    int elem = pos >> divby, bitpos = pos & modby;\n    // printf(\"before %d %d: %x\\n\", pos, elem, bitarray[elem]);\n    bitarray[elem] |= (1 << bitpos);\n    // printf(\"after %d %d: %x\\n\", pos, elem, bitarray[elem]);\n  }\n\n  __device__ void unset(int pos) {\n    int elem = pos >> divby, bitpos = pos & modby;\n    bitarray[elem] &= ~(1 << bitpos);\n  }\n\n  __device__ int is_set(int pos) const {\n    int elem = pos >> divby, bitpos = pos & modby;\n\n    // printf(\"%d %d\\n\", bitarray[elem], tex1Dfetch<unsigned int>(btx, elem));\n\n    // return bitarray[elem] & (1 << bitpos);\n    // return tex1Dfetch<unsigned int>(btx, elem) & (1 << bitpos);\n    if (!(tex1Dfetch<T>(btx, elem) & (1 << bitpos)))\n      return bitarray[elem] & (1 << bitpos);\n    else\n      return 1;\n  }\n\n  void dump() {\n    T* x = bitset.cpu_rd_ptr();\n    for (int i = 0; i < size; i++) {\n      printf(\"%d: %x\\n\", i, x[i]);\n    }\n  }\n};\n\ntypedef ApproxBitset<unsigned int> ApproxBitsetInt;\ntypedef ApproxBitset<unsigned char> ApproxBitsetByte;\n"
  },
  {
    "path": "libgpu/include/aolist.h",
    "content": "#pragma once\n/*\n   aolist.h\n\n   Implements AppendOnlyList. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#include \"cub/cub.cuh\"\n#include \"cutil_subset.h\"\n#include \"bmk2.h\"\n#include <moderngpu/kernel_mergesort.hxx>\n\nstruct AppendOnlyList {\n  int* dl;\n  int *dindex, index;\n  int size;\n  bool f_will_write;\n\n  Shared<int> list;\n\n  AppendOnlyList() { size = 0; }\n\n  AppendOnlyList(size_t nsize) {\n    size = nsize;\n\n    if (nsize == 0) {\n      dl    = NULL;\n      index = 0;\n    } else {\n      list.alloc(nsize);\n      dl = list.gpu_wr_ptr();\n      CUDA_SAFE_CALL(cudaMalloc(&dindex, 1 * sizeof(int)));\n      CUDA_SAFE_CALL(cudaMemcpy((void*)dindex, &zero, 1 * sizeof(zero),\n                                cudaMemcpyHostToDevice));\n      index = 0;\n    }\n  }\n\n  void sort() {\n    mgpu::standard_context_t context;\n    mergesort(list.gpu_wr_ptr(), nitems(), mgpu::less_t<int>(), context);\n  }\n\n  void update_cpu() { list.cpu_rd_ptr(); }\n\n  void display_items() {\n    int nsize = nitems();\n    int* l    = list.cpu_rd_ptr();\n\n    printf(\"LIST: \");\n    for (int i = 0; i < nsize; i++)\n      printf(\"%d %d, \", i, l[i]);\n\n    printf(\"\\n\");\n    return;\n  }\n\n  void reset() {\n    CUDA_SAFE_CALL(cudaMemcpy((void*)dindex, &zero, 1 * sizeof(zero),\n                              cudaMemcpyHostToDevice));\n  }\n\n  __device__ __host__ int nitems() {\n#ifdef __CUDA_ARCH__\n    return *dindex;\n#else\n    CUDA_SAFE_CALL(cudaMemcpy(&index, (void*)dindex, 1 * sizeof(index),\n                              cudaMemcpyDeviceToHost));\n    return index;\n#endif\n  }\n\n  __device__ int push(int item) {\n    int lindex = atomicAdd((int*)dindex, 1);\n    assert(lindex <= size);\n\n    dl[lindex] = item;\n    return 1;\n  }\n\n  __device__ int pop_id(int id, int& item) {\n   
 if (id < *dindex) {\n      item = cub::ThreadLoad<cub::LOAD_CG>(dl + id);\n      // item = dwl[id];\n      return 1;\n    }\n\n    return 0;\n  }\n\n  __device__ int pop(int& item) {\n    int lindex = atomicSub((int*)dindex, 1);\n    if (lindex <= 0) {\n      *dindex = 0;\n      return 0;\n    }\n\n    item = dl[lindex - 1];\n    return 1;\n  }\n\n  __device__ int setup_push_warp_one() {\n    int first, total, offset, lindex = 0;\n\n    warp_active_count(first, offset, total);\n\n    if (offset == 0) {\n      lindex = atomicAdd((int*)dindex, total);\n      assert(lindex <= size);\n    }\n\n    lindex = cub::ShuffleIndex<32>(lindex, first, 0xffffffff);\n    // lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1\n    return lindex + offset;\n  }\n\n  __device__ int setup_push_thread(int nitems) {\n    int lindex = atomicAdd((int*)dindex, nitems);\n    assert(lindex <= size);\n\n    return lindex;\n  }\n\n  __device__ int do_push(int start, int id, int item) {\n    assert(id <= size);\n    dl[start + id] = item;\n    return 1;\n  }\n\n  template <typename T>\n  __device__ __forceinline__ int push_1item(int nitem, int item,\n                                            int threads_per_block) {\n    __shared__ typename T::TempStorage temp_storage;\n    __shared__ int queue_index;\n    int total_items = 0;\n    int thread_data = nitem;\n\n    T(temp_storage).ExclusiveSum(thread_data, thread_data, total_items);\n\n    if (threadIdx.x == 0) {\n      if (debug)\n        printf(\"t: %d\\n\", total_items);\n      queue_index = atomicAdd((int*)dindex, total_items);\n      // printf(\"queueindex: %d %d %d %d %d\\n\", blockIdx.x, threadIdx.x,\n      // queue_index, thread_data + n_items, total_items);\n    }\n\n    __syncthreads();\n\n    if (nitem == 1) {\n      if (queue_index + thread_data >= size) {\n        printf(\"GPU: exceeded length: %d %d %d\\n\", queue_index, thread_data,\n               size);\n        return 0;\n      }\n\n      // dwl[queue_index + 
thread_data] = item;\n      cub::ThreadStore<cub::STORE_CG>(dl + queue_index + thread_data, item);\n    }\n\n    return total_items;\n  }\n\n  void save(const char* f, const unsigned iteration) {\n    char n[255];\n    int ret;\n\n    ret = snprintf(n, 255, \"%s%s-%05d-%s.wl\", instr_trace_dir(), f, iteration,\n                   instr_uniqid());\n\n    if (ret < 0 || ret >= 255) {\n      fprintf(stderr, \"Error creating filename for kernel '%s', iteration %d\\n\",\n              f, iteration);\n      exit(1);\n    }\n\n    int nsize = nitems();\n    int* wl   = list.cpu_rd_ptr();\n\n    TRACE of = trace_open(n, \"w\");\n    instr_write_array(n, of, sizeof(int), nsize, wl);\n    trace_close(of);\n    bmk2_log_collect(\"ggc/wlcontents\", n);\n    return;\n  }\n\n  void load(const char* f, const unsigned iteration) {\n    char n[255];\n    int ret;\n\n    ret = snprintf(n, 255, \"%s%s-%05d-%s.wl\", instr_trace_dir(), f, iteration,\n                   instr_uniqid());\n\n    if (ret < 0 || ret >= 255) {\n      fprintf(stderr, \"Error creating filename for kernel '%s', iteration %d\\n\",\n              f, iteration);\n      exit(1);\n    }\n\n    TRACE of = trace_open(n, \"w\");\n    int nsize =\n        instr_read_array(n, of, sizeof(int), size, list.cpu_wr_ptr(true));\n    list.gpu_rd_ptr();\n    check_cuda(cudaMemcpy((void*)dindex, &nsize, 1 * sizeof(nsize),\n                          cudaMemcpyHostToDevice));\n    trace_close(of);\n    return;\n  }\n};\n"
  },
  {
    "path": "libgpu/include/atomic_helpers.h",
    "content": "#pragma once\n\n// TODO: re-implement all these using atomicCAS()\n\n__device__ static double atomicTestAdd(double* address, double val) {\n#if __CUDA_ARCH__ >= 600\n  return (val == 0.0) ? *address : atomicAdd(address, val);\n#else\n  unsigned long long int* address_ull = (unsigned long long int*)address;\n  unsigned long long int old          = *address_ull;\n  unsigned long long int assumed;\n  do {\n    assumed                          = old;\n    double value                     = val + __longlong_as_double(assumed);\n    unsigned long long int value_ull = __double_as_longlong(value);\n    old = atomicCAS(address_ull, assumed, value_ull);\n  } while (assumed != old);\n  return __longlong_as_double(assumed);\n#endif\n}\n\n__device__ static float atomicMax(float* address, float val) {\n  int* address_as_i = (int*)address;\n  int val_as_i      = __float_as_int(val);\n  int old_as_i      = *address_as_i;\n  float old         = __int_as_float(old_as_i);\n  while (old < val) {\n    old_as_i = atomicCAS(address_as_i, old_as_i, val_as_i);\n    old      = __int_as_float(old_as_i);\n  }\n  return old;\n}\n\n__device__ static float atomicMin(float* address, float val) {\n  int* address_as_i = (int*)address;\n  int val_as_i      = __float_as_int(val);\n  int old_as_i      = *address_as_i;\n  float old         = __int_as_float(old_as_i);\n  while (old > val) {\n    old_as_i = atomicCAS(address_as_i, old_as_i, val_as_i);\n    old      = __int_as_float(old_as_i);\n  }\n  return old;\n}\n\n__device__ static int atomicTestAdd(int* address, int val) {\n  return (val == 0) ? *address : atomicAdd(address, val);\n}\n\n__device__ static float atomicTestAdd(float* address, float val) {\n  return (val == 0.0) ? 
*address : atomicAdd(address, val);\n}\n\n__device__ static float atomicTestMin(float* address, float val) {\n  return atomicMin(address, val);\n}\n\n__device__ static float atomicTestMax(float* address, float val) {\n  return atomicMax(address, val);\n}\n\n__device__ static uint32_t atomicTestAdd(uint32_t* address, uint32_t val) {\n  return (val == 0) ? *address : atomicAdd(address, val);\n}\n\n__device__ static uint32_t atomicTestMin(uint32_t* address, uint32_t val) {\n  uint32_t old_val = *address;\n  return (old_val <= val) ? old_val : atomicMin(address, val);\n}\n\n__device__ static uint32_t atomicTestMax(uint32_t* address, uint32_t val) {\n  uint32_t old_val = *address;\n  return (old_val >= val) ? old_val : atomicMax(address, val);\n}\n\n__device__ static uint64_t atomicTestAdd(uint64_t* address, uint64_t val) {\n  return (val == 0) ? *address\n                    : atomicAdd((unsigned long long int*)address, val);\n}\n\n__device__ static uint64_t atomicTestMin(uint64_t* address, uint64_t val) {\n  uint64_t old_val            = *address;\n  unsigned long long int val2 = val;\n  return (old_val <= val) ? old_val\n                          : atomicMin((unsigned long long int*)address, val2);\n}\n\n__device__ static uint64_t atomicTestMax(uint64_t* address, uint64_t val) {\n  uint64_t old_val            = *address;\n  unsigned long long int val2 = val;\n  return (old_val >= val) ? old_val\n                          : atomicMax((unsigned long long int*)address, val2);\n}\n"
  },
  {
    "path": "libgpu/include/bmk2.h",
    "content": "#pragma once\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\nchar* bmk2_get_binid();\nchar* bmk2_get_inputid();\nchar* bmk2_get_runid();\nint bmk2_log_collect(const char* component, const char* file);\n\n#ifdef __cplusplus\n}\n#endif\n"
  },
  {
    "path": "libgpu/include/component.h",
    "content": "/*\n   component.h\n\n   Implements ComponentSpace. Part of the GGC source code.\n   Originally derived from the LonestarGPU 2.0 source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   TODO: relicense\n*/\n\nstruct ComponentSpace {\n  ComponentSpace(unsigned nelements);\n\n  __device__ unsigned numberOfElements();\n  __device__ unsigned numberOfComponents();\n  __device__ bool isBoss(unsigned element);\n  __device__ unsigned find(unsigned lelement, bool compresspath = true);\n  __device__ bool unify(unsigned one, unsigned two);\n  __device__ void print1x1();\n  __host__ void print();\n  __host__ void copy(ComponentSpace& two);\n  void dump_to_file(const char* F);\n  void allocate();\n  void init();\n  unsigned numberOfComponentsHost();\n\n  unsigned nelements;\n  unsigned *ncomponents, // number of components.\n      *complen,          // lengths of components.\n      *ele2comp;         // components of elements.\n};\nComponentSpace::ComponentSpace(unsigned nelements) {\n  this->nelements = nelements;\n\n  allocate();\n  init();\n}\n\nvoid ComponentSpace::dump_to_file(const char* F) {\n  static FILE* f;\n  static unsigned* mem;\n\n  if (!f) {\n    f   = fopen(F, \"w\");\n    mem = (unsigned*)calloc(nelements, sizeof(unsigned));\n  }\n\n  assert(cudaMemcpy(mem, ele2comp, nelements * sizeof(unsigned),\n                    cudaMemcpyDeviceToHost) == cudaSuccess);\n\n  int i;\n  for (i = 0; i < nelements; i++) {\n    int boss = i;\n    do {\n      boss = mem[boss];\n    } while (boss != mem[boss]);\n    fprintf(f, \"%d %d %d\\n\", i, mem[i], boss);\n  }\n\n  fprintf(f, \"\\n\");\n}\n\nvoid ComponentSpace::copy(ComponentSpace& two) {\n  assert(cudaMemcpy(two.ncomponents, ncomponents, sizeof(unsigned),\n                    cudaMemcpyDeviceToDevice) == 0);\n  assert(cudaMemcpy(two.ele2comp, ele2comp, sizeof(unsigned) * nelements,\n                    cudaMemcpyDeviceToDevice) == 
0);\n  assert(cudaMemcpy(two.complen, complen, sizeof(unsigned) * nelements,\n                    cudaMemcpyDeviceToDevice) == 0);\n}\n__device__ void ComponentSpace::print1x1() {\n  printf(\"\\t\\t-----------------\\n\");\n  for (unsigned ii = 0; ii < nelements; ++ii) {\n    printf(\"\\t\\t%d -> %d\\n\", ii, ele2comp[ii]);\n  }\n  printf(\"\\t\\t-----------------\\n\");\n}\n__global__ void print1x1(ComponentSpace cs) { cs.print1x1(); }\n__host__ void ComponentSpace::print() { ::print1x1<<<1, 1>>>(*this); }\n__device__ unsigned ComponentSpace::numberOfElements() { return nelements; }\n__device__ unsigned ComponentSpace::numberOfComponents() {\n  return *ncomponents;\n}\nunsigned ComponentSpace::numberOfComponentsHost() {\n  unsigned hncomponents = 0;\n  check_cuda(cudaMemcpy(&hncomponents, ncomponents, sizeof(unsigned),\n                        cudaMemcpyDeviceToHost));\n  return hncomponents;\n}\nvoid ComponentSpace::allocate() {\n  check_cuda(cudaMalloc((void**)&ncomponents, 1 * sizeof(unsigned)));\n  check_cuda(cudaMalloc((void**)&complen, nelements * sizeof(unsigned)));\n  check_cuda(cudaMalloc((void**)&ele2comp, nelements * sizeof(unsigned)));\n}\n__global__ void dinitcs(unsigned nelements, unsigned* complen,\n                        unsigned* ele2comp) {\n  unsigned id = blockIdx.x * blockDim.x + threadIdx.x;\n  if (id < nelements) {\n    // elements[id] \t= id;\n    complen[id]  = 1;\n    ele2comp[id] = id;\n  }\n}\nvoid ComponentSpace::init() {\n  // init the elements.\n  unsigned blocksize = 256; ////\n  unsigned nblocks   = (nelements + blocksize - 1) / blocksize;\n  dinitcs<<<nblocks, blocksize>>>(nelements, complen, ele2comp);\n  // init number of components.\n  check_cuda(cudaMemcpy(ncomponents, &nelements, sizeof(unsigned),\n                        cudaMemcpyHostToDevice));\n}\n__device__ bool ComponentSpace::isBoss(unsigned element) {\n  return atomicCAS(&ele2comp[element], element, element) == element;\n}\n__device__ unsigned 
ComponentSpace::find(unsigned lelement,\n                                         bool compresspath /*= true*/) {\n  // do we need to worry about concurrency in this function?\n  // for other finds, no synchronization necessary as the data-structure is a\n  // tree. for other unifys, synchronization is not required considering that\n  // unify is going to affect only bosses, while find is going to affect only\n  // non-bosses.\n  unsigned element = lelement;\n  while (isBoss(element) == false) {\n    element = ele2comp[element];\n  }\n  if (compresspath)\n    ele2comp[lelement] = element; // path compression.\n  return element;\n}\n__device__ bool ComponentSpace::unify(unsigned one, unsigned two) {\n  // if the client makes sure that one component is going to get unified as a\n  // source with another destination only once, then synchronization is\n  // unnecessary. while this is true for MST, due to load-balancing in if-block\n  // below, a node may be source multiple times. if a component is source in one\n  // thread and destination in another, then it is okay for MST.\n  do {\n    if (!isBoss(one))\n      return false;\n    if (!isBoss(two))\n      return false;\n\n    unsigned onecomp = one;\n    unsigned twocomp = two;\n    // unsigned onecomp = find(one, false);\n    // unsigned twocomp = find(two, false);\n\n    if (onecomp == twocomp)\n      return false; // \"duplicate\" edges due to symmetry\n\n    unsigned boss        = twocomp;\n    unsigned subordinate = onecomp;\n    // if (complen[onecomp] > complen[twocomp]) {\t// one is larger, make it\n    // the representative: can create cycles.\n    if (boss < subordinate) { // break cycles by id.\n      boss        = onecomp;\n      subordinate = twocomp;\n    }\n    // merge subordinate into the boss.\n    // ele2comp[subordinate] = boss;\n\n    unsigned oldboss = atomicCAS(&ele2comp[subordinate], subordinate, boss);\n    if (oldboss != subordinate) { // someone else updated the boss.\n      // we need not 
restore the ele2comp[subordinate], as union-find ensures\n      // correctness and complen of subordinate doesn't matter.\n      one = oldboss;\n      two = boss;\n      return false;\n    } else {\n      dprintf(\"\\t\\tunifying %d -> %d (%d)\\n\", subordinate, boss);\n      atomicAdd(&complen[boss], complen[subordinate]);\n      // complen[boss] += complen[subordinate];\n      // complen[subordinate] doesn't matter now, since find() will find its\n      // boss.\n\n      // a component has reduced.\n      unsigned ncomp = atomicSub(ncomponents, 1);\n      // atomicDec(ncomponents, nelements);\n      dprintf(\"\\t%d: ncomponents = %d\\n\", threadIdx.x, ncomp);\n      return true;\n    }\n  } while (true);\n}\n"
  },
  {
    "path": "libgpu/include/counter.h",
    "content": "#pragma once\n/*\n   counter.h\n\n   Implements instrumentation counters. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#include <stdio.h>\n#include <cassert>\n#include <cub/util_device.cuh>\n\n// timeblocks 2.0\n\nconst int MAGIC = 0x5a5e5a61; // random\n\n// from http://forums.nvidia.com/index.php?showtopic=186669\nstatic __device__ uint get_smid_reg(void) {\n  uint ret;\n  asm(\"mov.u32 %0, %smid;\" : \"=r\"(ret));\n  return ret;\n}\n\nclass GPUCounter {\n  unsigned count;\n  unsigned value;\n\npublic:\n  unsigned* tvalues;\n  unsigned* tcounts;\n  unsigned* smids; // make this a char?\n  clock_t* start;\n\n  __device__ void init() {\n    count = 0;\n    value = 0;\n  }\n\n  __device__ void record(int value) { this->value += value; }\n\n  __device__ void count_iter() { count++; }\n\n  __device__ void begin(unsigned tid) {\n    this->start[tid] = clock64();\n    if (threadIdx.x == 0)\n      smids[blockIdx.x] = get_smid_reg();\n  }\n\n  __device__ void end(unsigned tid) {\n    clock_t e = clock64();\n    value     = (e - start[tid]);\n  }\n\n  __device__ void finish(unsigned tid) {\n    tvalues[tid] = value;\n    tcounts[tid] = count;\n  }\n};\n\nclass Counter {\npublic:\n  GPUCounter gc;\n  FILE* f;\n  Shared<unsigned> tvalues;\n  Shared<unsigned> tcounts;\n  Shared<unsigned> smids;\n  Shared<clock_t> start;\n\n  int threads;\n  int blocks, tpblock, dynsmem, residency;\n  const void* function;\n\n  int get_residency(int tpb, int dynsmem) {\n    int res;\n\n#if CUDA_VERSION < 6050\n    assert(dynsmem == 0);\n    cub::MaxSmOccupancy(res, function, tpb);\n#else\n    assert(cudaOccupancyMaxActiveBlocksPerMultiprocessor(\n               &res, function, tpb, dynsmem) == cudaSuccess);\n#endif\n\n    return res;\n  }\n\n  __host__ void init(const char* fname, const void* fn, int blocks, int tpb,\n       
              int dynsmem) {\n    this->blocks  = blocks;\n    this->tpblock = tpb;\n\n    threads = blocks * tpb;\n\n    function  = fn;\n    residency = get_residency(tpb, dynsmem);\n    assert(residency > 0);\n\n    f = fopen(fname, \"w\");\n    if (!f) {\n      fprintf(stderr, \"Could not open '%s'\", fname);\n      assert(false);\n    }\n\n    assert(fwrite(&MAGIC, sizeof(MAGIC), 1, f) == 1);\n    assert(fwrite(&blocks, sizeof(blocks), 1, f) == 1);\n    assert(fwrite(&tpb, sizeof(tpb), 1, f) == 1);\n\n    tvalues.alloc(threads);\n    tcounts.alloc(threads);\n    smids.alloc(blocks);\n    start.alloc(threads);\n\n    gc.tvalues = tvalues.gpu_wr_ptr();\n    gc.tcounts = tcounts.gpu_wr_ptr();\n    gc.smids   = smids.gpu_wr_ptr();\n    gc.start   = start.gpu_wr_ptr();\n  }\n\n  __host__ GPUCounter& get_gpu() {\n    tvalues.gpu_wr_ptr();\n    tcounts.gpu_wr_ptr();\n    smids.gpu_wr_ptr();\n    start.gpu_wr_ptr();\n\n    return gc;\n  }\n\n  __host__ void write_data(int iteration, unsigned work, int iblocks = 0,\n                           int ithreads = 0, int idynsmem = -1) {\n    int zero     = 0;\n    unsigned* tv = tvalues.cpu_rd_ptr();\n    unsigned* tc = tcounts.cpu_rd_ptr();\n    int res;\n\n    if (iblocks == 0)\n      iblocks = blocks;\n    if (ithreads == 0)\n      ithreads = tpblock;\n    if (idynsmem == -1)\n      idynsmem = dynsmem;\n\n    if (ithreads != tpblock) {\n      res = get_residency(ithreads, idynsmem);\n      assert(res > 0);\n    } else {\n      res = residency;\n    }\n\n    assert(fwrite(&zero, sizeof(zero), 1, f) == 1);\n    assert(fwrite(&iteration, sizeof(iteration), 1, f) == 1);\n    assert(fwrite(&work, sizeof(work), 1, f) == 1);\n    assert(fwrite(&iblocks, sizeof(iblocks), 1, f) == 1);\n    assert(fwrite(&ithreads, sizeof(ithreads), 1, f) == 1);\n    assert(fwrite(&idynsmem, sizeof(idynsmem), 1, f) == 1);\n    assert(fwrite(&res, sizeof(res), 1, f) == 1);\n\n    // reserved for type identifiers\n    assert(fwrite(&zero, 
sizeof(zero), 1, f) == 1);\n    assert(fwrite(&zero, sizeof(zero), 1, f) == 1);\n\n    assert(fwrite(smids.cpu_rd_ptr(), sizeof(unsigned), blocks, f) == blocks);\n    assert(fwrite(start.cpu_rd_ptr(), sizeof(clock_t), threads, f) == threads);\n\n    assert(fwrite(tv, sizeof(tv[0]), threads, f) == threads);\n    assert(fwrite(tc, sizeof(tc[0]), threads, f) == threads);\n  }\n  __host__ void zero_gpu() {\n    tvalues.zero_gpu();\n    tcounts.zero_gpu();\n    smids.zero_gpu();\n    start.zero_gpu();\n  }\n};\n"
  },
  {
    "path": "libgpu/include/csr_graph.h",
    "content": "/*\n   csr_graph.h\n\n   Implements a CSR Graph. Part of the GGC source code.\n   Interface derived from LonestarGPU.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#ifndef LSG_CSR_GRAPH\n#define LSG_CSR_GRAPH\n\n#include <fstream>\n\n// Adapted from LSG CSRGraph.h\n\n// TODO: make this template data\ntypedef unsigned index_type; // should be size_t, but GPU chokes on size_t\ntypedef int edge_data_type;\ntypedef int node_data_type;\n\n// very simple implementation\nstruct CSRGraph {\n  unsigned read(char file[], bool read_edge_data = true);\n  void copy_to_gpu(struct CSRGraph& copygraph);\n  void copy_to_cpu(struct CSRGraph& copygraph);\n\n  CSRGraph();\n\n  unsigned init();\n  unsigned allocOnHost(bool no_edge_data = false);\n  unsigned allocOnDevice(bool no_edge_data = false);\n  void progressPrint(unsigned maxii, unsigned ii);\n  unsigned readFromGR(char file[], bool read_edge_data = true);\n\n  unsigned deallocOnHost();\n  unsigned deallocOnDevice();\n  void dealloc();\n\n  __device__ __host__ bool valid_node(index_type node) {\n    return (node < nnodes);\n  }\n\n  __device__ __host__ bool valid_edge(index_type edge) {\n    return (edge < nedges);\n  }\n\n  __device__ __host__ index_type getOutDegree(unsigned src) {\n    assert(src < nnodes);\n    return row_start[src + 1] - row_start[src];\n  };\n\n  __device__ __host__ index_type getDestination(unsigned src, unsigned edge) {\n    assert(src < nnodes);\n    assert(edge < getOutDegree(src));\n\n    index_type abs_edge = row_start[src] + edge;\n    assert(abs_edge < nedges);\n\n    return edge_dst[abs_edge];\n  };\n\n  __device__ __host__ index_type getAbsDestination(unsigned abs_edge) {\n    assert(abs_edge < nedges);\n\n    return edge_dst[abs_edge];\n  };\n\n  __device__ __host__ index_type getFirstEdge(unsigned src) {\n    assert(src <= nnodes); // <= is okay\n    
return row_start[src];\n  };\n\n  __device__ __host__ edge_data_type getWeight(unsigned src, unsigned edge) {\n    assert(src < nnodes);\n    assert(edge < getOutDegree(src));\n\n    index_type abs_edge = row_start[src] + edge;\n    assert(abs_edge < nedges);\n\n    return edge_data[abs_edge];\n  };\n\n  __device__ __host__ edge_data_type getAbsWeight(unsigned abs_edge) {\n    assert(abs_edge < nedges);\n\n    return edge_data[abs_edge];\n  };\n\n  void init_from_mgraph(int m, int nnz, index_type* h_row_offsets,\n                        index_type* h_column_indices,\n                        node_data_type* h_labels) {\n    nnodes = m;\n    nedges = nnz;\n    check_cuda(cudaMalloc((void**)&row_start, (m + 1) * sizeof(index_type)));\n    check_cuda(cudaMalloc((void**)&edge_dst, nnz * sizeof(index_type)));\n    check_cuda(cudaMemcpy(row_start, h_row_offsets,\n                          (m + 1) * sizeof(index_type),\n                          cudaMemcpyHostToDevice));\n    check_cuda(cudaMemcpy(edge_dst, h_column_indices, nnz * sizeof(index_type),\n                          cudaMemcpyHostToDevice));\n#ifdef ENABLE_LABEL\n    check_cuda(cudaMalloc((void**)&node_data, m * sizeof(node_data_type)));\n    check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type),\n                          cudaMemcpyHostToDevice));\n#endif\n    // int *h_degrees = (int *)malloc(m * sizeof(int));\n    // for (int i = 0; i < m; i++) h_degrees[i] = h_row_offsets[i + 1] -\n    // h_row_offsets[i]; check_cuda(cudaMalloc((void **)&d_degrees, m *\n    // sizeof(int))); check_cuda(cudaMemcpy(d_degrees, h_degrees, m *\n    // sizeof(int), cudaMemcpyHostToDevice));\n  }\n\n  inline __device__ __host__ index_type getEdgeDst(unsigned edge) {\n    assert(edge < nedges);\n    return edge_dst[edge];\n  };\n  inline __device__ __host__ node_data_type getData(unsigned vid) {\n    return node_data[vid];\n  }\n  inline __device__ __host__ index_type edge_begin(unsigned src) {\n    assert(src <= 
nnodes);\n    return row_start[src];\n  };\n  inline __device__ __host__ index_type edge_end(unsigned src) {\n    assert(src <= nnodes);\n    return row_start[src + 1];\n  };\n\n  index_type nnodes, nedges;\n  index_type* row_start; // row_start[node] points into edge_dst, node starts at\n                         // 0, row_start[nnodes] = nedges\n  index_type* edge_dst;\n  edge_data_type* edge_data;\n  node_data_type* node_data;\n  bool device_graph;\n};\n\nstruct CSRGraphTex : CSRGraph {\n  cudaTextureObject_t edge_dst_tx;\n  cudaTextureObject_t row_start_tx;\n  cudaTextureObject_t node_data_tx;\n\n  void copy_to_gpu(struct CSRGraphTex& copygraph);\n  unsigned allocOnDevice(bool no_edge_data = false);\n\n  __device__ __host__ index_type getOutDegree(unsigned src) {\n#ifdef __CUDA_ARCH__\n    assert(src < nnodes);\n    return tex1Dfetch<index_type>(row_start_tx, src + 1) -\n           tex1Dfetch<index_type>(row_start_tx, src);\n#else\n    return CSRGraph::getOutDegree(src);\n#endif\n  };\n\n  __device__ node_data_type node_data_ro(index_type node) {\n    assert(node < nnodes);\n    return tex1Dfetch<node_data_type>(node_data_tx, node);\n  }\n\n  __device__ __host__ index_type getDestination(unsigned src, unsigned edge) {\n#ifdef __CUDA_ARCH__\n    assert(src < nnodes);\n    assert(edge < getOutDegree(src));\n\n    index_type abs_edge = tex1Dfetch<index_type>(row_start_tx, src + edge);\n    assert(abs_edge < nedges);\n\n    return tex1Dfetch<index_type>(edge_dst_tx, abs_edge);\n#else\n    return CSRGraph::getDestination(src, edge);\n#endif\n  };\n\n  __device__ __host__ index_type getAbsDestination(unsigned abs_edge) {\n#ifdef __CUDA_ARCH__\n    assert(abs_edge < nedges);\n\n    return tex1Dfetch<index_type>(edge_dst_tx, abs_edge);\n#else\n    return CSRGraph::getAbsDestination(abs_edge);\n#endif\n  };\n\n  __device__ __host__ index_type getFirstEdge(unsigned src) {\n#ifdef __CUDA_ARCH__\n    assert(src <= nnodes); // <= is okay\n    return 
tex1Dfetch<index_type>(row_start_tx, src);\n#else\n    return CSRGraph::getFirstEdge(src);\n#endif\n  };\n};\n\n#ifdef CSRG_TEX\ntypedef CSRGraphTex CSRGraphTy;\n#else\ntypedef CSRGraph CSRGraphTy;\n#endif\n\n#endif\n"
  },
  {
    "path": "libgpu/include/cuda_launch_config.hpp",
    "content": "/*\n *  Copyright 2008-2012 NVIDIA Corporation\n *\n *  Licensed under the Apache License, Version 2.0 (the \"License\");\n *  you may not use this file except in compliance with the License.\n *  You may obtain a copy of the License at\n *\n *      http://www.apache.org/licenses/LICENSE-2.0\n *\n *  Unless required by applicable law or agreed to in writing, software\n *  distributed under the License is distributed on an \"AS IS\" BASIS,\n *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n *  See the License for the specific language governing permissions and\n *  limitations under the License.\n */\n\n/*\n   Modified by Sreepathi Pai <sreepai@ices.utexas.edu>\n   to remove dependency on Thrust and add maximum_residency()\n*/\n\n#pragma once\n\n#include <cstddef>\n#include <cuda_runtime_api.h>\n\n#define VERBOSE_ERRORS 1\n#if VERBOSE_ERRORS == 1\n#include <stdio.h>\n#endif\n/*! Computes a block size in number of threads for a CUDA kernel using a\n * occupancy-promoting heuristic. \\param attributes The cudaFuncAttributes\n * corresponding to a __global__ function of interest on a GPU of interest.\n *  \\param properties The cudaDeviceProp corresponding to a GPU on which to\n * launch the __global__ function of interest. \\return A CUDA block size, in\n * number of threads, which the resources of the GPU's streaming multiprocessor\n * can accomodate and which is intended to promote occupancy. The result is\n * equivalent to the one performed by the \"CUDA Occupancy Calculator\". \\note The\n * __global__ function of interest is presumed to use 0 bytes of\n * dynamically-allocated __shared__ memory.\n */\ninline __host__ __device__ std::size_t\nblock_size_with_maximum_potential_occupancy(\n    const cudaFuncAttributes& attributes, const cudaDeviceProp& properties);\n\n/*! Computes a block size in number of threads for a CUDA kernel using a\n * occupancy-promoting heuristic. 
Use this version of the function when a CUDA\n * block's dynamically-allocated __shared__ memory requirements vary with the\n * size of the block. \\param attributes The cudaFuncAttributes corresponding to\n * a __global__ function of interest on a GPU of interest. \\param properties The\n * cudaDeviceProp corresponding to a GPU on which to launch the __global__\n * function of interest. \\param block_size_to_dynamic_smem_bytes A unary\n * function which maps an integer CUDA block size to the number of bytes of\n * dynamically-allocated __shared__ memory required by a CUDA block of that\n * size. \\return A CUDA block size, in number of threads, which the resources of\n * the GPU's streaming multiprocessor can accomodate and which is intended to\n * promote occupancy. The result is equivalent to the one performed by the \"CUDA\n * Occupancy Calculator\".\n */\ntemplate <typename UnaryFunction>\ninline __host__ __device__ std::size_t\nblock_size_with_maximum_potential_occupancy(\n    const cudaFuncAttributes& attributes, const cudaDeviceProp& properties,\n    UnaryFunction block_size_to_dynamic_smem_size);\n\n/*! Computes the maximum residency for a CUDA kernel function\n *  \\param t The CUDA kernel function\n *  \\param CTA_SIZE The size of the CTA in threads\n *  \\param dynamic_smem_bytes The size of dynamic shared memory\n *  \\return Returns the maximum number of thread blocks per SM or 0 on error\n */\ntemplate <typename T>\ninline __host__ std::size_t maximum_residency(T t, const size_t CTA_SIZE,\n                                              const size_t dynamic_smem_bytes);\n\n/*! Computes the maximum residency for a CUDA kernel function\n *  \\param attributes The cudaFuncAttributes corresponding to a __global__\n * function of interest on a GPU of interest. \\param properties The\n * cudaDeviceProp corresponding to a GPU on which to launch the __global__\n * function of interest. 
\\param CTA_SIZE The size of the CTA in threads \\param\n * dynamic_smem_bytes The size of dynamic shared memory \\return Returns the\n * maximum number of thread blocks per SM or 0 on error\n */\ninline __host__ std::size_t\nmaximum_residency(const cudaFuncAttributes& attributes,\n                  const cudaDeviceProp& properties, size_t CTA_SIZE,\n                  size_t dynamic_smem_bytes);\n\nnamespace __cuda_launch_config_detail {\n\nusing std::size_t;\n\nnamespace util {\n\ntemplate <typename T>\ninline __host__ __device__ T min_(const T& lhs, const T& rhs) {\n  return rhs < lhs ? rhs : lhs;\n}\n\ntemplate <typename T>\nstruct zero_function {\n  inline __host__ __device__ T operator()(T) { return 0; }\n};\n\n// x/y rounding towards +infinity for integers, used to determine # of\n// blocks/warps etc.\ntemplate <typename L, typename R>\ninline __host__ __device__ L divide_ri(const L x, const R y) {\n  return (x + (y - 1)) / y;\n}\n\n// x/y rounding towards zero for integers, used to determine # of blocks/warps\n// etc.\ntemplate <typename L, typename R>\ninline __host__ __device__ L divide_rz(const L x, const R y) {\n  return x / y;\n}\n\n// round x towards infinity to the next multiple of y\ntemplate <typename L, typename R>\ninline __host__ __device__ L round_i(const L x, const R y) {\n  return y * divide_ri(x, y);\n}\n\n// round x towards zero to the next multiple of y\ntemplate <typename L, typename R>\ninline __host__ __device__ L round_z(const L x, const R y) {\n  return y * divide_rz(x, y);\n}\n\n} // end namespace util\n\n// granularity of shared memory allocation\ninline __host__ __device__ size_t\nsmem_allocation_unit(const cudaDeviceProp& properties) {\n  switch (properties.major) {\n  case 1:\n    return 512;\n  case 2:\n    return 128;\n  case 3:\n    return 256;\n  default:\n    return 256; // unknown GPU; have to guess\n  }\n}\n\n// granularity of register allocation\ninline __host__ __device__ size_t reg_allocation_unit(\n    const 
cudaDeviceProp& properties, const size_t regsPerThread) {\n  switch (properties.major) {\n  case 1:\n    return (properties.minor <= 1) ? 256 : 512;\n  case 2:\n    switch (regsPerThread) {\n    case 21:\n    case 22:\n    case 29:\n    case 30:\n    case 37:\n    case 38:\n    case 45:\n    case 46:\n      return 128;\n    default:\n      return 64;\n    }\n  case 3:\n    return 256;\n  default:\n    return 256; // unknown GPU; have to guess\n  }\n}\n\n// granularity of warp allocation\ninline __host__ __device__ size_t\nwarp_allocation_multiple(const cudaDeviceProp& properties) {\n  return (properties.major <= 1) ? 2 : 1;\n}\n\n// number of \"sides\" into which the multiprocessor is partitioned\ninline __host__ __device__ size_t\nnum_sides_per_multiprocessor(const cudaDeviceProp& properties) {\n  switch (properties.major) {\n  case 1:\n    return 1;\n  case 2:\n    return 2;\n  case 3:\n    return 4;\n  default:\n    return 4; // unknown GPU; have to guess\n  }\n}\n\ninline __host__ __device__ size_t\nmax_blocks_per_multiprocessor(const cudaDeviceProp& properties) {\n  return (properties.major <= 2) ? 8 : 16;\n}\n\ninline __host__ __device__ size_t max_active_blocks_per_multiprocessor(\n    const cudaDeviceProp& properties, const cudaFuncAttributes& attributes,\n    size_t CTA_SIZE, size_t dynamic_smem_bytes) {\n  // Determine the maximum number of CTAs that can be run simultaneously per SM\n  // This is equivalent to the calculation done in the CUDA Occupancy Calculator\n  // spreadsheet\n\n  //////////////////////////////////////////\n  // Limits due to threads/SM or blocks/SM\n  //////////////////////////////////////////\n  const size_t maxThreadsPerSM =\n      properties.maxThreadsPerMultiProcessor; // 768, 1024, 1536, etc.\n  const size_t maxBlocksPerSM = max_blocks_per_multiprocessor(properties);\n\n  // Calc limits\n  const size_t ctaLimitThreads = (CTA_SIZE <= properties.maxThreadsPerBlock)\n                                     ? 
maxThreadsPerSM / CTA_SIZE\n                                     : 0;\n  const size_t ctaLimitBlocks = maxBlocksPerSM;\n\n  //////////////////////////////////////////\n  // Limits due to shared memory/SM\n  //////////////////////////////////////////\n  const size_t smemAllocationUnit = smem_allocation_unit(properties);\n  const size_t smemBytes  = attributes.sharedSizeBytes + dynamic_smem_bytes;\n  const size_t smemPerCTA = util::round_i(smemBytes, smemAllocationUnit);\n\n  // Calc limit\n  const size_t ctaLimitSMem = smemPerCTA > 0\n                                  ? properties.sharedMemPerBlock / smemPerCTA\n                                  : maxBlocksPerSM;\n\n  //////////////////////////////////////////\n  // Limits due to registers/SM\n  //////////////////////////////////////////\n  const size_t regAllocationUnit =\n      reg_allocation_unit(properties, attributes.numRegs);\n  const size_t warpAllocationMultiple = warp_allocation_multiple(properties);\n  const size_t numWarps               = util::round_i(\n      util::divide_ri(CTA_SIZE, properties.warpSize), warpAllocationMultiple);\n\n  // Calc limit\n  size_t ctaLimitRegs;\n  if (properties.major <= 1) {\n    // GPUs of compute capability 1.x allocate registers to CTAs\n    // Number of regs per block is regs per thread times number of warps times\n    // warp size, rounded up to allocation unit\n    const size_t regsPerCTA = util::round_i(\n        attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit);\n    ctaLimitRegs =\n        regsPerCTA > 0 ? 
properties.regsPerBlock / regsPerCTA : maxBlocksPerSM;\n  } else {\n    // GPUs of compute capability 2.x and higher allocate registers to warps\n    // Number of regs per warp is regs per thread times times warp size, rounded\n    // up to allocation unit\n    const size_t regsPerWarp = util::round_i(\n        attributes.numRegs * properties.warpSize, regAllocationUnit);\n    const size_t numSides       = num_sides_per_multiprocessor(properties);\n    const size_t numRegsPerSide = properties.regsPerBlock / numSides;\n    ctaLimitRegs                = regsPerWarp > 0\n                       ? ((numRegsPerSide / regsPerWarp) * numSides) / numWarps\n                       : maxBlocksPerSM;\n  }\n\n  //////////////////////////////////////////\n  // Overall limit is min() of limits due to above reasons\n  //////////////////////////////////////////\n  return util::min_(\n      ctaLimitRegs,\n      util::min_(ctaLimitSMem, util::min_(ctaLimitThreads, ctaLimitBlocks)));\n}\n\ntemplate <typename UnaryFunction>\ninline __host__ __device__ size_t default_block_size(\n    const cudaDeviceProp& properties, const cudaFuncAttributes& attributes,\n    UnaryFunction block_size_to_smem_size) {\n  size_t max_occupancy = properties.maxThreadsPerMultiProcessor;\n  size_t largest_blocksize =\n      util::min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);\n  size_t granularity       = properties.warpSize;\n  size_t max_blocksize     = 0;\n  size_t highest_occupancy = 0;\n\n  for (size_t blocksize = largest_blocksize; blocksize != 0;\n       blocksize -= granularity) {\n    size_t occupancy = blocksize * max_active_blocks_per_multiprocessor(\n                                       properties, attributes, blocksize,\n                                       block_size_to_smem_size(blocksize));\n\n    if (occupancy > highest_occupancy) {\n      max_blocksize     = blocksize;\n      highest_occupancy = occupancy;\n    }\n\n    // early out, can't do better\n    if 
(highest_occupancy == max_occupancy)\n      break;\n  }\n\n  return max_blocksize;\n}\n\n} // end namespace __cuda_launch_config_detail\n\ntemplate <typename UnaryFunction>\ninline __host__ __device__ std::size_t\nblock_size_with_maximum_potential_occupancy(\n    const cudaFuncAttributes& attributes, const cudaDeviceProp& properties,\n    UnaryFunction block_size_to_dynamic_smem_size) {\n  return __cuda_launch_config_detail::default_block_size(\n      properties, attributes, block_size_to_dynamic_smem_size);\n}\n\ninline __host__ __device__ std::size_t\nblock_size_with_maximum_potential_occupancy(\n    const cudaFuncAttributes& attributes, const cudaDeviceProp& properties) {\n  return block_size_with_maximum_potential_occupancy(\n      attributes, properties,\n      __cuda_launch_config_detail::util::zero_function<std::size_t>());\n}\n\ntemplate <typename T>\ninline __host__ std::size_t block_size_with_maximum_potential_occupancy(T t) {\n  cudaError_t err;\n  cudaFuncAttributes attributes;\n  err = cudaFuncGetAttributes(&attributes, t);\n\n  if (err != cudaSuccess)\n    return 0;\n\n  int device;\n  err = cudaGetDevice(&device);\n\n  if (err != cudaSuccess)\n    return 0;\n\n  cudaDeviceProp properties;\n  err = cudaGetDeviceProperties(&properties, device);\n\n  if (err != cudaSuccess)\n    return 0;\n\n  return block_size_with_maximum_potential_occupancy(attributes, properties);\n}\n\ninline __host__ std::size_t\nmaximum_residency(const cudaFuncAttributes& attributes,\n                  const cudaDeviceProp& properties, size_t CTA_SIZE,\n                  size_t dynamic_smem_bytes) {\n  return __cuda_launch_config_detail::max_active_blocks_per_multiprocessor(\n      properties, attributes, CTA_SIZE, dynamic_smem_bytes);\n}\n\ntemplate <typename T>\ninline __host__ std::size_t maximum_residency(T t, size_t CTA_SIZE,\n                                              size_t dynamic_smem_bytes) {\n  cudaError_t err;\n  cudaFuncAttributes attributes;\n  err = 
cudaFuncGetAttributes(&attributes, t);\n\n  if (err != cudaSuccess) {\n#if VERBOSE_ERRORS == 1\n    fprintf(stderr, \"Failed to get function attributes (%d: %s)\\n\", err,\n            cudaGetErrorString(err));\n#endif\n    return 0;\n  }\n\n  if (CTA_SIZE > attributes.maxThreadsPerBlock) {\n#if VERBOSE_ERRORS == 1\n    fprintf(stderr,\n            \"WARNING: function CTA size (%d) is greater than can be \"\n            \"accomodated: (%d)\\n\",\n            CTA_SIZE, attributes.maxThreadsPerBlock);\n#endif\n    return 0;\n  }\n\n  int device;\n  err = cudaGetDevice(&device);\n\n  if (err != cudaSuccess) {\n#if VERBOSE_ERRORS == 1\n    fprintf(stderr, \"Failed to get current CUDA device (%d: %s)\\n\", err,\n            cudaGetErrorString(err));\n#endif\n    return 0;\n  }\n\n  cudaDeviceProp properties;\n  err = cudaGetDeviceProperties(&properties, device);\n\n  if (err != cudaSuccess) {\n#if VERBOSE_ERRORS == 1\n    fprintf(stderr, \"Failed to get current CUDA device properties (%d: %s)\\n\",\n            err, cudaGetErrorString(err));\n#endif\n    return 0;\n  }\n\n  size_t mr =\n      maximum_residency(attributes, properties, CTA_SIZE, dynamic_smem_bytes);\n\n#if VERBOSE_ERRORS == 1\n  if (mr == 0) {\n    fprintf(stderr, \"WARNING: Maximum Residency is 0\\n\");\n  }\n#endif\n  return mr;\n}\n\ntemplate <typename T>\ninline __host__ std::size_t all_resident(T t, const dim3& grid,\n                                         const dim3& threads,\n                                         size_t dynamic_smem_bytes) {\n  cudaError_t err;\n  cudaFuncAttributes attributes;\n  err = cudaFuncGetAttributes(&attributes, t);\n\n  if (err != cudaSuccess)\n    return 0;\n\n  int device;\n  err = cudaGetDevice(&device);\n\n  if (err != cudaSuccess)\n    return 0;\n\n  cudaDeviceProp properties;\n  err = cudaGetDeviceProperties(&properties, device);\n\n  if (err != cudaSuccess)\n    return 0;\n\n  return (maximum_residency(attributes, properties,\n                            
threads.x * threads.y * threads.z,\n                            dynamic_smem_bytes) *\n          properties.multiProcessorCount) >= grid.x * grid.y * grid.z;\n}\n"
  },
  {
    "path": "libgpu/include/cutil_subset.h",
    "content": "/*\n   cutil_subset.h\n\n   Implements a subset of the CUDA utilities. Part of the GGC source code.\n\n   TODO: actual owner copyright (NVIDIA) and license.\n*/\n\n#pragma once\n#include \"cub/cub.cuh\"\n\n#define CUDA_SAFE_CALL_NO_SYNC(call)                                           \\\n  {                                                                            \\\n    cudaError err = call;                                                      \\\n    if (cudaSuccess != err) {                                                  \\\n      fprintf(stderr, \"Cuda error in file '%s' in line %i : %s.\\n\", __FILE__,  \\\n              __LINE__, cudaGetErrorString(err));                              \\\n      exit(EXIT_FAILURE);                                                      \\\n    }                                                                          \\\n  }\n\n#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);\n\n#define CUDA_SAFE_THREAD_SYNC()                                                \\\n  {                                                                            \\\n    cudaError err = CUT_DEVICE_SYNCHRONIZE();                                  \\\n    if (cudaSuccess != err) {                                                  \\\n      fprintf(stderr, \"Cuda error in file '%s' in line %i : %s.\\n\", __FILE__,  \\\n              __LINE__, cudaGetErrorString(err));                              \\\n    }                                                                          \\\n  }\n\n// from http://forums.nvidia.com/index.php?showtopic=186669\nstatic __device__ uint get_smid(void) {\n  uint ret;\n  asm(\"mov.u32 %0, %smid;\" : \"=r\"(ret));\n  return ret;\n}\n\nstatic __device__ uint get_warpid(void) {\n  uint ret;\n  asm(\"mov.u32 %0, %warpid;\" : \"=r\"(ret));\n  return ret;\n}\n\n// since cub::WarpScan doesn't work very well with disabled threads in the warp\n__device__ __forceinline__ void warp_active_count(int& first, 
int& offset,\n                                                  int& total) {\n  unsigned int active = __ballot_sync(0xffffffff, 1);\n  total               = __popc(active);\n  offset              = __popc(active & cub::LaneMaskLt());\n  first               = __ffs(active) - 1; // we know active != 0\n}\n\n// since cub::WarpScan doesn't work very well with disabled threads in the warp\n__device__ __forceinline__ void\nwarp_active_count_zero_active(int& first, int& offset, int& total) {\n  unsigned int active = __ballot_sync(0xffffffff, 1);\n  total               = __popc(active);\n  offset              = __popc(active & cub::LaneMaskLt());\n  first               = 0;\n}\n"
  },
  {
    "path": "libgpu/include/exclusive.h",
    "content": "/*\n   exclusive.h\n\n   Runtime implementation for Exclusive. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#include \"sharedptr.h\"\n#include <cassert>\n#include <cub/cub.cuh>\n\n#ifndef LOADCV\n#define LOADCV(x) cub::ThreadLoad<cub::LOAD_CV>((x))\n#endif\n\n#ifndef LOADCG\n#define LOADCG(x) cub::ThreadLoad<cub::LOAD_CG>((x))\n#endif\n\n#ifndef STORECG\n#define STORECG(x, y) cub::ThreadStore<cub::STORE_CG>((x), (y))\n#endif\n\nclass ExclusiveLocks {\npublic:\n  Shared<int> locks; // need not be shared, GPU-only is fine.\n  int* lk;\n  int nitems;\n\n  ExclusiveLocks() { nitems = 0; }\n\n  ExclusiveLocks(size_t nitems) {\n    this->nitems = nitems;\n    locks.alloc(nitems);\n    locks.cpu_wr_ptr();\n    lk = locks.gpu_wr_ptr();\n  }\n\n  void alloc(size_t nitems) {\n    // to be called once if default constructor was used\n    assert(this->nitems == 0);\n    locks.alloc(nitems);\n    locks.cpu_wr_ptr();\n    lk = locks.gpu_wr_ptr();\n  }\n\n  __device__ void mark_p1(int n, int* a, int id) {\n    // try to claim ownership\n    for (int i = 0; i < n; i++)\n      STORECG(lk + a[i], id);\n  }\n\n  __device__ void mark_p1_iterator(int start, int n, int step, int* a, int id) {\n    // try to claim ownership\n    for (int i = start; i < n; i += step)\n      STORECG(lk + a[i], id);\n  }\n\n  __device__ void mark_p2(int n, int* a, int id) {\n    for (int i = 0; i < n; i++)\n      if (LOADCG(lk + a[i]) != id)\n        atomicMin(lk + a[i], id);\n  }\n\n  __device__ void mark_p2_iterator(int start, int n, int step, int* a, int id) {\n    for (int i = start; i < n; i += step)\n      if (LOADCG(lk + a[i]) != id)\n        atomicMin(lk + a[i], id);\n  }\n\n  __device__ bool owns(int n, int* a, int id) {\n    for (int i = 0; i < n; i++)\n      if (LOADCG(lk + a[i]) != id)\n        return false;\n\n    return 
true;\n  }\n\n  __device__ bool owns_iterator(int start, int n, int step, int* a, int id) {\n    for (int i = start; i < n; i += step)\n      if (LOADCG(lk + a[i]) != id)\n        return false;\n\n    return true;\n  }\n};\n"
  },
  {
    "path": "libgpu/include/failfast.h",
    "content": "/*\n   failfast.h\n\n   Implements debug routines. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#pragma once\n\n#include <stdio.h>\n#include <stdarg.h>\n\nstatic void ff_fprintf(const char* file, const int line, FILE* stream,\n                       const char* fmt, ...) {\n  va_list ap;\n  va_start(ap, fmt);\n  int err = vfprintf(stream, fmt, ap);\n  if (err < 0) {\n    fprintf(stderr, \"%s:%d:fprintf failed.\\n\", file, line);\n    exit(1);\n  }\n}\n\n#define check_fprintf(...) ff_fprintf(__FILE__, __LINE__, __VA_ARGS__)\n"
  },
  {
    "path": "libgpu/include/gbar.cuh",
    "content": "/******************************************************************************\n * Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.\n * Copyright (c) 2011-2013, NVIDIA CORPORATION.  All rights reserved.\n * \n * Redistribution and use in source and binary forms, with or without\n * modification, are permitted provided that the following conditions are met:\n *     * Redistributions of source code must retain the above copyright\n *       notice, this list of conditions and the following disclaimer.\n *     * Redistributions in binary form must reproduce the above copyright\n *       notice, this list of conditions and the following disclaimer in the\n *       documentation and/or other materials provided with the distribution.\n *     * Neither the name of the NVIDIA CORPORATION nor the\n *       names of its contributors may be used to endorse or promote products\n *       derived from this software without specific prior written permission.\n * \n * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY\n * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES\n * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\n * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND\n * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n *\n ******************************************************************************/\n\n/******************************************************************************\n * Software Global Barrier\n ******************************************************************************/\n\n#pragma once\n\n#include <cub/cub.cuh>\n#include \"cutil_subset.h\"\n\n/**\n * Manages device storage needed for implementing a global software barrier\n * between CTAs in a single grid\n */\nclass GlobalBarrier\n{\npublic:\n\n\ttypedef unsigned int SyncFlag;\n\nprotected :\n\n\n\t// Counters in global device memory\n\tSyncFlag* d_sync;\n\n\t/**\n\t * Simple wrapper for returning a CG-loaded SyncFlag at the specified pointer\n\t */\n\t__device__ __forceinline__ SyncFlag LoadCG(SyncFlag* d_ptr) const\n\t{\n\t\tSyncFlag retval;\n\t\tretval = cub::ThreadLoad<cub::LOAD_CG>(d_ptr);\n\t\treturn retval;\n\t}\n\npublic:\n\n\t/**\n\t * Constructor\n\t */\n\tGlobalBarrier() : d_sync(NULL) {}\n\n\n\t/**\n\t * Synchronize\n\t */\n\t__device__ __forceinline__ void Sync() const\n\t{\n        volatile SyncFlag *d_vol_sync = d_sync;\n\n        // Threadfence and syncthreads to make sure global writes are visible before\n\t\t// thread-0 reports in with its sync counter\n\t\t__threadfence();\n\t\t__syncthreads();\n\n\t\tif (blockIdx.x == 0) {\n\n\t\t\t// Report in ourselves\n\t\t\tif (threadIdx.x == 0) {\n\t\t\t    d_vol_sync[blockIdx.x] = 1;\n\t\t\t}\n\n\t\t\t__syncthreads();\n\n\t\t\t// Wait 
for everyone else to report in\n\t\t\tfor (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) {\n\t\t\t\twhile (LoadCG(d_sync + peer_block) == 0) {\n\t\t\t\t\t__threadfence_block();\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t__syncthreads();\n\n\t\t\t// Let everyone know it's safe to read their prefix sums\n\t\t\tfor (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) {\n\t\t\t    d_vol_sync[peer_block] = 0;\n\t\t\t}\n\n\t\t} else {\n\n\t\t\tif (threadIdx.x == 0) {\n\t\t\t\t// Report in\n\t\t\t    d_vol_sync[blockIdx.x] = 1;\n\n\t\t\t\t// Wait for acknowledgement\n\t\t\t\twhile (LoadCG(d_sync + blockIdx.x) == 1) {\n\t\t\t\t\t__threadfence_block();\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t__syncthreads();\n\t\t}\n\t}\n};\n\n\n/**\n * Version of global barrier with storage lifetime management.\n *\n * We can use this in host enactors, and pass the base GlobalBarrier\n * as parameters to kernels.\n */\nclass GlobalBarrierLifetime : public GlobalBarrier\n{\nprotected:\n\n\t// Number of bytes backed by d_sync\n\tsize_t sync_bytes;\n\npublic:\n\n\t/**\n\t * Constructor\n\t */\n\tGlobalBarrierLifetime() : GlobalBarrier(), sync_bytes(0) {}\n\n\n\t/**\n\t * Deallocates and resets the progress counters\n\t */\n\tcudaError_t HostReset()\n\t{\n\t\tcudaError_t retval = cudaSuccess;\n\t\tif (d_sync) {\n\t\t\tCUDA_SAFE_CALL(cudaFree(d_sync));\n\t\t\td_sync = NULL;\n\t\t}\n\t\tsync_bytes = 0;\n\t\treturn retval;\n\t}\n\n\n\t/**\n\t * Destructor\n\t */\n\tvirtual ~GlobalBarrierLifetime()\n\t{\n\t\tHostReset();\n\t}\n\n\n\t/**\n\t * Sets up the progress counters for the next kernel launch (lazily\n\t * allocating and initializing them if necessary)\n\t */\n\tcudaError_t Setup(int sweep_grid_size)\n\t{\n\t\tcudaError_t retval = cudaSuccess;\n\t\tdo {\n\t\t\tsize_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);\n\t\t\tif (new_sync_bytes > sync_bytes) {\n\n\t\t\t\tif (d_sync) {\n\t\t\t\t\tCUDA_SAFE_CALL(cudaFree(d_sync));\n\t\t\t\t\tretval = 
cudaSuccess;\n\t\t\t\t}\n\n\t\t\t\tsync_bytes = new_sync_bytes;\n\n\t\t\t\tCUDA_SAFE_CALL(cudaMalloc((void**) &d_sync, sync_bytes));\n\t\t\t\tretval = cudaSuccess;\n\n\t\t\t\t// Initialize to zero\n\t\t\t\tCUDA_SAFE_CALL(cudaMemset(d_sync, 0, sweep_grid_size * sizeof(SyncFlag)));\n\n\t\t\t}\n\t\t} while (0);\n\n\t\treturn retval;\n\t}\n};\n"
  },
  {
    "path": "libgpu/include/gg.h",
    "content": "/*\n   gg.h\n\n   Implements the main GG header file. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n\n   TODO: RTLICENSE\n*/\n\n#ifndef GALOIS_GPU\n#define GALOIS_GPU\n\n#include <fstream>\n#include <stdint.h>\n#include <sys/mman.h>\n#include <sys/types.h>\n#include <sys/stat.h>\n#include <fcntl.h>\n#include <unistd.h>\n#include <cassert>\n\n#ifndef GGDEBUG\n#define GGDEBUG 0\n#endif\n\n#define dprintf                                                                \\\n  if (debug)                                                                   \\\n  printf\nunsigned const debug = GGDEBUG;\n\n#include \"Timer.h\"\n\nstatic void check_cuda_error(const cudaError_t e, const char* file,\n                             const int line) {\n  if (e != cudaSuccess) {\n    fprintf(stderr, \"%s:%d: %s (%d)\\n\", file, line, cudaGetErrorString(e), e);\n    exit(1);\n  }\n}\n\ntemplate <typename T>\nstatic void check_retval(const T retval, const T expected, const char* file,\n                         const int line) {\n  if (retval != expected) {\n    fprintf(stderr, \"%s:%d: Got %d, expected %d\\n\", file, line, retval,\n            expected);\n    exit(1);\n  }\n}\n\ninline static __device__ __host__ int roundup(int a, int r) {\n  return ((a + r - 1) / r) * r;\n}\n\ninline static __device__ __host__ int GG_MIN(int x, int y) {\n  if (x > y)\n    return y;\n  else\n    return x;\n}\n\n#define check_cuda(x) check_cuda_error(x, __FILE__, __LINE__)\n#define check_rv(r, x) check_retval(r, x, __FILE__, __LINE__)\n\n#include \"bmk2.h\"\n#include \"csr_graph.h\"\n#include \"sharedptr.h\"\n#include \"worklist.h\"\n#include \"aolist.h\"\n#include \"lockarray.h\"\n#include \"abitset.h\"\n#include \"gbar.cuh\"\n#include \"cuda_launch_config.hpp\"\n#include \"pipe.h\"\n#include \"exclusive.h\"\n#include \"internal.h\"\n#include 
\"rv.h\"\n#include \"failfast.h\"\n#include \"ggc_rt.h\"\n#include \"instr.h\"\n\n#include <moderngpu/context.hxx>\n\nextern mgpu::context_t* mgc;\n#endif\n"
  },
  {
    "path": "libgpu/include/ggc_rt.h",
    "content": "#pragma once\n\nstruct ggc_rt_dev_info {\n  int dev;\n  int nSM;\n};\n\nvoid ggc_init_dev_info();\nvoid ggc_set_gpu_device(int dev);\nint ggc_get_nSM();\n"
  },
  {
    "path": "libgpu/include/ggcuda.h",
    "content": "/*\n   ggcuda.h\n\n   Implements GG CUDA runtime bits. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#pragma once\n\n#define TID_1D (threadIdx.x + blockIdx.x * blockDim.x)\n#define TOTAL_THREADS_1D (gridDim.x * blockDim.x)\n#define BLOCK_DIM_X blockDim.x\n\n#define CONDITION enable_lb\n#define MAX_INT 2147483647\n#define THRESHOLD TOTAL_THREADS_1D\n#define DEGREE_LIMIT ((CONDITION) ? (THRESHOLD) : (MAX_INT))\n"
  },
  {
    "path": "libgpu/include/instr.h",
    "content": "#pragma once\n\ntypedef struct trace_file* TRACE;\nstruct instr_trace;\n\nvoid instr_set_saved_uniqid(const char* id);\nvoid instr_load_uniqid();\nconst char* instr_saved_uniqid();\nconst char* instr_uniqid();\nconst char* instr_trace_dir();\n\nTRACE trace_open(const char* name, const char* mode);\nvoid trace_close(TRACE t);\n\nvoid instr_write_array(const char* n, TRACE f, size_t elemsz, size_t nelems,\n                       void* p);\n\n/* gp is the gpu pointer, cp can be the associated CPU pointer if available */\nvoid instr_write_array_gpu(const char* n, TRACE f, size_t elemsz, size_t nelems,\n                           void* gp, void* cp);\n\nsize_t instr_read_array(const char* n, TRACE f, size_t elemsz, size_t maxnelems,\n                        void* p);\n\nsize_t instr_read_array_gpu(const char* n, TRACE f, size_t elemsz,\n                            size_t maxnelems, void* gp, void* cp);\n\nvoid instr_save_array_gpu(const char* kernel, const int invocation,\n                          const int pos, const char* arg, void* gp, void* cp,\n                          size_t sz, size_t num);\n\nvoid instr_save_array(const char* kernel, const int invocation, const int pos,\n                      const char* arg, void* cp, size_t sz, size_t num);\n\nvoid instr_save_primitive(const char* name, const int invocation, const int pos,\n                          const char* arg, void* p, size_t sp);\n\nsize_t instr_load_array_gpu(const char* kernel, const int invocation,\n                            const int pos, const char* arg, void* gp, void* cp,\n                            size_t sz, size_t maxnum);\n\nsize_t instr_load_array(const char* kernel, const int invocation, const int pos,\n                        const char* arg, void* cp, size_t sz, size_t maxnum);\n\nvoid instr_load_primitive(const char* name, const int invocation, const int pos,\n                          const char* arg, void* p, size_t sp);\n\nstruct instr_trace* 
instr_trace_file(const char* prefix, int mode);\n\nvoid instr_pipe_iterate(struct instr_trace* f, int depth, int index);\n\nvoid instr_pipe_exit(struct instr_trace* f, int depth, int index);\n\nvoid instr_load_trace(const char* n, struct instr_trace* it);\nbool instr_match_pipe(struct instr_trace* it, int what, int depth, int index);\nbool instr_match_pipe_iterate(struct instr_trace* it, int depth, int index);\nbool instr_match_pipe_exit(struct instr_trace* it, int depth, int index);\nvoid instr_pipe_iterate(struct instr_trace* it, int depth, int index);\nvoid instr_pipe_exit(struct instr_trace* it, int depth, int index);\n\n#ifdef USE_SNAPPY\n#include \"snfile.h\"\nSNAPPY_FILE trace_snappy_handle(TRACE f);\n#endif\n\n#define INSTR_TRACE_ITER 0\n#define INSTR_TRACE_EXIT 1\n"
  },
  {
    "path": "libgpu/include/internal.h",
    "content": "/*\n   internal.h\n\n   Implements internal runtime routines. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#pragma once\n\ntypedef int cuda_size_t;\n\n// TODO: specialize this\nconst int MAX_TB_SIZE     = 1024;\nconst int LOG_MAX_TB_SIZE = 10;\n\n/* container to perform multiple independent sums (scans) */\ntemplate <int items, typename T>\nstruct multiple_sum {\n  T el[items];\n\n  // https://nvlabs.github.io/cub/classcub_1_1_block_scan.html#a6ed3f77795e582df31d3d6d9d950615e\n  // \"This operation assumes the value of obtained by the T's default\n  // constructor (or by zero-initialization if no user-defined default\n  // constructor exists) is suitable as the identity value zero for addition.\"\n  __device__ __host__ multiple_sum() : multiple_sum(T()) {}\n\n  __device__ __host__ multiple_sum(const T e) {\n    for (int i = 0; i < items; i++)\n      el[i] = e;\n  }\n\n  __device__ __host__ multiple_sum& operator=(const T rhs) {\n    for (int i = 0; i < items; i++)\n      el[i] = rhs;\n\n    return *this;\n  }\n\n  __device__ __host__ multiple_sum& operator+=(const multiple_sum& rhs) {\n    for (int i = 0; i < items; i++)\n      el[i] += rhs.el[i];\n\n    return *this;\n  }\n\n  __device__ __host__ friend multiple_sum operator+(multiple_sum lhs,\n                                                    const multiple_sum& rhs) {\n    return lhs += rhs;\n  }\n};\n\n/* for two scans */\nstruct pair {\n  int x, y, z;\n\n  __device__ __host__ pair& operator+=(const pair& rhs) {\n    x += rhs.x;\n    y += rhs.y;\n    z += rhs.z;\n\n    return *this;\n  }\n\n  __device__ __host__ friend pair operator+(pair lhs, const pair& rhs) {\n    return lhs += rhs;\n  }\n};\n\ntemplate <const int WARPS_PER_TB>\nstruct warp_np {\n  volatile index_type owner[WARPS_PER_TB];\n  volatile index_type start[WARPS_PER_TB];\n  
volatile index_type size[WARPS_PER_TB];\n  volatile index_type offset[WARPS_PER_TB]; // task offset\n  volatile index_type src[WARPS_PER_TB];\n};\n\nstruct tb_np {\n  index_type owner;\n  index_type start;\n  index_type size;\n  index_type offset;\n  index_type src;\n};\n\ntemplate <const int ITSIZE>\nstruct fg_np {\n  index_type itvalue[ITSIZE];\n  index_type src[ITSIZE];\n};\n\nstruct empty_np {};\n\ntemplate <typename ts_type, typename index_type, typename TTB, typename TWP,\n          typename TFG>\nunion np_shared {\n  // for scans\n  ts_type temp_storage;\n\n  // for tb-level np\n  TTB tb;\n\n  // for warp-level np\n  TWP warp;\n\n  TFG fg;\n};\n\nstruct NPInspector1 {\n  cuda_size_t total;   // total work across all threads\n  cuda_size_t done;    // total work done across all threads\n  cuda_size_t size;    // size of this thread's work\n  cuda_size_t start;   // this thread's iteration start value\n  cuda_size_t offset;  // offset within flattened iteration space\n  cuda_size_t my_done; // items completed within this thread's space\n\n  // inspect should be inspect_begin, inspect_end, inspect_update really?\n  // especially for custom closures...\n\n  template <typename T>\n  __device__ __host__ cuda_size_t inspect(T* itvalue,\n                                          const cuda_size_t ITSIZE) {\n    cuda_size_t _np_i;\n    for (_np_i = 0;\n         (my_done + _np_i) < size && (offset - done + _np_i) < ITSIZE;\n         _np_i++) {\n      itvalue[offset - done + _np_i] = start + my_done + _np_i;\n    }\n\n    my_done += _np_i;\n    offset += _np_i;\n\n    return _np_i;\n  }\n\n  template <typename T>\n  __device__ __host__ cuda_size_t inspect2(T* itvalue, T* source,\n                                           const cuda_size_t ITSIZE,\n                                           const cuda_size_t src) {\n    cuda_size_t _np_i;\n    for (_np_i = 0;\n         (my_done + _np_i) < size && (offset - done + _np_i) < ITSIZE;\n         _np_i++) {\n      
itvalue[offset - done + _np_i] = start + my_done + _np_i;\n      source[offset - done + _np_i]  = src;\n    }\n\n    my_done += _np_i;\n    offset += _np_i;\n\n    return _np_i;\n  }\n\n  __device__ __host__ bool work() const { return total > 0; }\n\n  __device__ __host__ bool valid(const cuda_size_t ltid) const {\n    return ltid < total; // remember total decreases every round\n  }\n\n  __device__ __host__ void execute_round_done(const cuda_size_t ITSIZE) {\n    total -= ITSIZE;\n    done += ITSIZE;\n  }\n};\n"
  },
  {
    "path": "libgpu/include/lockarray.h",
    "content": "/*\n   lockarray.h\n\n   Implements LockArray*. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#include \"sharedptr.h\"\n#include <cassert>\n\n#define UNLOCKED 0\n#define LOCKED 1\n\nclass LockArraySimple {\npublic:\n  Shared<int> locks;\n  int* glocks;\n  int nlocks;\n\n  LockArraySimple(size_t nlocks) {\n    locks        = Shared<int>(nlocks);\n    this->nlocks = nlocks;\n    glocks       = locks.zero_gpu();\n  }\n\n  // do not use this\n  __device__ bool acquire(int ndx) {\n    assert(ndx >= 0 && ndx < nlocks);\n    while (atomicCAS(glocks + ndx, UNLOCKED, LOCKED) == LOCKED) {\n      __threadfence();\n    }\n    return glocks[ndx] == LOCKED;\n  }\n\n  __device__ bool acquire_or_fail(int ndx) {\n    assert(ndx >= 0 && ndx < nlocks);\n    return atomicCAS(glocks + ndx, UNLOCKED, LOCKED) == UNLOCKED;\n  }\n\n  __device__ bool is_locked(int ndx) {\n    // TODO: atomic reads?\n    assert(ndx >= 0 && ndx < nlocks);\n    return glocks[ndx] == LOCKED;\n  }\n\n  __device__ void release(int ndx) {\n    __threadfence();\n    bool was_locked = atomicCAS(glocks + ndx, LOCKED, UNLOCKED) == LOCKED;\n    assert(was_locked);\n  }\n};\n\nclass LockArrayTicket : public LockArraySimple {\npublic:\n  Shared<int> tickets;\n\n  int* gtickets;\n\n  LockArrayTicket(size_t nlocks) : LockArraySimple(nlocks) {\n    tickets  = Shared<int>(nlocks);\n    gtickets = tickets.gpu_wr_ptr();\n    assert(cudaMemset(gtickets, 0, nlocks * sizeof(int)) == cudaSuccess);\n  }\n\n  __device__ int reserve(int ndx) {\n    assert(ndx >= 0 && ndx < nlocks);\n    return atomicAdd(gtickets + ndx, 1);\n  }\n\n  __device__ bool acquire_or_fail(int ndx, int ticket) {\n    assert(ndx >= 0 && ndx < nlocks);\n    return glocks[ndx] == ticket;\n  }\n\n  __device__ bool is_locked(int ndx) {\n    assert(ndx >= 0 && ndx < nlocks);\n    return 
glocks[ndx] < gtickets[ndx];\n  }\n\n  __device__ void release(int ndx) {\n    __threadfence();\n    bool was_locked = glocks[ndx]++ < gtickets[ndx];\n    assert(was_locked);\n  }\n};\n\ntypedef LockArraySimple LockArray;\n"
  },
  {
    "path": "libgpu/include/pipe.h",
    "content": "/*\n   pipe.h\n\n   Implements PipeContext*. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#pragma once\n#include <cuda.h>\n\nclass Worklist2Light;\n\nstruct oi_save {\n  int in;\n  int out;\n  int re;\n  // should be if-def'ed?\n  int in_currslot;\n  int out_currslot;\n  int re_currslot;\n};\n\ntemplate <class WLT>\nstruct PipeContextT {\n  WLT wl[3];\n  int in, out, re;\n  struct oi_save* ois;\n\n  PipeContextT() {}\n\n  PipeContextT(size_t size) {\n    wl[0] = WLT(size);\n    wl[1] = WLT(size);\n    wl[2] = WLT(size);\n    in    = 0;\n    out   = 1;\n    re    = 2;\n    ois   = 0;\n  }\n\n  __device__ __host__ WLT& in_wl() { return wl[in]; }\n\n  __device__ __host__ WLT& out_wl() { return wl[out]; }\n\n  __device__ __host__ WLT& re_wl() { return wl[re]; }\n\n  __device__ __host__ void swap(int& x, int& y) {\n    int t;\n    t = x;\n    x = y;\n    y = t;\n  }\n  __device__ __host__ inline void advance() {\n    wl[in].reset();\n    swap(in, out);\n  }\n\n  __device__ __host__ inline void advance2() { swap(in, out); }\n\n  __device__ __host__ inline void retry() {\n    wl[in].reset();\n    swap(re, in);\n  }\n\n  __device__ __host__ inline void retry2() { swap(re, in); }\n\n  __host__ void prep() { check_cuda(cudaMalloc(&ois, sizeof(struct oi_save))); }\n\n  __device__ void save() {\n    ois->in  = in;\n    ois->out = out;\n    ois->re  = re;\n\n    ois->in_currslot  = wl[in].currslot;\n    ois->out_currslot = wl[out].currslot;\n    ois->re_currslot  = wl[re].currslot;\n  }\n\n  __host__ void restore() {\n    struct oi_save local;\n    check_cuda(cudaMemcpy(&local, ois, sizeof(struct oi_save),\n                          cudaMemcpyDeviceToHost));\n\n    in  = local.in;\n    out = local.out;\n    re  = local.re;\n\n    wl[in].set_slot(local.in_currslot);\n    wl[out].set_slot(local.out_currslot);\n 
   wl[re].set_slot(local.re_currslot);\n\n    check_cuda(cudaFree(ois));\n  }\n\n  __host__ void free() {\n    for (int i = 0; i < 3; i++) {\n      wl[i].free();\n    }\n  }\n};\n\nstruct PipeContextLight {\n  Worklist2Light wl[2];\n  int index;\n  struct oi_save* ois;\n\n  template <typename T>\n  __device__ PipeContextLight(PipeContextT<T> pipe) {\n    wl[0].fromWL2(pipe.in_wl());\n    wl[1].fromWL2(pipe.out_wl());\n    // wl[2].fromWL2(pipe.re_wl());   // not used\n    index = 0;\n    ois   = 0;\n  }\n\n  /* __device__ __host__ __forceinline__ */\n  /* Worklist2Light &in_wl() { */\n  /*   //assert(in != re && in != out && re != out); */\n  /*   return wl[index]; */\n  /* } */\n\n  /* __device__ __host__ __forceinline__ */\n  /* Worklist2Light &out_wl() { */\n  /*   //assert(out != re && in != out  && re != in); */\n  /*   return wl[index ^ 1]; */\n  /* } */\n\n  /* __device__ __host__ __forceinline__ */\n  /* Worklist2Light &re_wl() { */\n  /*   //assert(in != re && re != out  && in != out); */\n  /*   //return wl[2]; */\n  /* } */\n\n  /* __device__ __host__  */\n  /* void swap(int &x, int &y) { */\n  /*   int t; */\n  /*   t = x; */\n  /*   x = y; */\n  /*   y = t; */\n  /* } */\n  /* __device__ __host__ inline */\n  /* void advance() { */\n  /*   //wl[in].reset(); */\n  /*   //swap(in, out); */\n  /* } */\n\n  /* __device__ __host__ inline */\n  /* void advance2() { */\n  /*   //swap(in, out); */\n  /*   index ^= 1; */\n  /* } */\n\n  /* __device__ __host__ inline */\n  /* void retry() { */\n  /*   //wl[in].reset(); */\n  /*   //swap(re, in); */\n  /* } */\n\n  /* __device__ __host__ inline */\n  /* void retry2() { */\n  /*   //swap(re, in); */\n  /* } */\n\n  /* __host__ void prep() { */\n  /*   check_cuda(cudaMalloc(&ois, sizeof(struct oi_save))); */\n  /* } */\n\n  template <typename T>\n  __device__ void save(PipeContextT<T>& pipe, int index) {\n    pipe.ois->in  = index;\n    pipe.ois->out = index ^ 1;\n    pipe.ois->re  = 2;\n\n    pipe.ois->in_currslot 
 = wl[index].currslot;\n    pipe.ois->out_currslot = wl[index ^ 1].currslot;\n    // pipe.ois->re_currslot = wl[2].currslot;\n  }\n\n  /* __host__ void restore() { */\n  /*   struct oi_save local; */\n  /*   check_cuda(cudaMemcpy(&local, ois, sizeof(struct oi_save),\n   * cudaMemcpyDeviceToHost)); */\n\n  /*   index = local.in; */\n  /*   // = local.out; */\n  /*   //re = local.re; */\n\n  /*   wl[index].set_slot(local.in_currslot); */\n  /*   wl[index ^ 1].set_slot(local.out_currslot); */\n  /*   //wl[2].set_slot(local.re_currslot); */\n\n  /*   check_cuda(cudaFree(ois)); */\n  /* }     */\n\n  /* __host__ void free() { */\n  /*   for(int i = 0; i < 3; i++) { */\n  /*     //wl[i].free(); */\n  /*   } */\n  /* } */\n};\n\ntypedef PipeContextT<Worklist2> PipeContext;\ntypedef PipeContextT<WorklistT> PipeContextWT;\n"
  },
  {
    "path": "libgpu/include/rv.h",
    "content": "/*\n   rv.h\n\n   Implements Reduce on the GPU. Adapted from the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n   Author: Roshan Dathathri <roshan@cs.utexas.edu>\n*/\n\n#pragma once\n#include \"cub/cub.cuh\"\n#include \"atomic_helpers.h\"\n\ntemplate <typename Type>\nclass HGReducible {\npublic:\n  Type* rv; // allocated by the user\n\n  __device__ void thread_entry() {}\n\n  template <typename T>\n  __device__ void thread_exit(typename T::TempStorage& temp_storage) {}\n\n  __device__ void reduce(Type value) {}\n};\n\ntemplate <typename Type>\nclass HGAccumulator : public HGReducible<Type> {\n\npublic:\n  Type local;\n  __device__ void thread_entry() { local = 0; }\n\n  template <typename T>\n  __device__ void thread_exit(typename T::TempStorage& temp_storage) {\n    local = T(temp_storage).Sum(local);\n\n    if (threadIdx.x == 0 && local) {\n      atomicTestAdd((Type*)HGReducible<Type>::rv, local);\n    }\n  }\n\n  __device__ void reduce(Type value) {\n    if (value)\n      local += value;\n  }\n};\n\ntemplate <typename Type>\nclass HGReduceMax : public HGReducible<Type> {\n  Type local;\n\n  struct MaxOp {\n    __device__ Type operator()(const Type& a, const Type& b) {\n      return (a > b) ? 
a : b;\n    }\n  };\n  MaxOp maxOp;\n\npublic:\n  __device__ void thread_entry() { local = 0; } // assumes positive numbers\n\n  template <typename T>\n  __device__ void thread_exit(typename T::TempStorage& temp_storage) {\n    local = T(temp_storage).Reduce(local, maxOp);\n\n    if (threadIdx.x == 0 && local) {\n      atomicTestMax((Type*)HGReducible<Type>::rv, local);\n    }\n  }\n\n  __device__ void reduce(Type value) {\n    if (local < value)\n      local = value;\n  }\n};\n\ntemplate <typename Type>\nclass HGReduceMin : public HGReducible<Type> {\n  Type local;\n\n  struct MinOp {\n    __device__ Type operator()(const Type& a, const Type& b) {\n      return (a < b) ? a : b;\n    }\n  };\n  MinOp minOp;\n\npublic:\n  __device__ void thread_entry() {\n    local = 1073741823;\n  } // assumes Type can hold this number\n\n  template <typename T>\n  __device__ void thread_exit(typename T::TempStorage& temp_storage) {\n    local = T(temp_storage).Reduce(local, minOp);\n\n    if (threadIdx.x == 0 && (local != 1073741823)) {\n      atomicTestMin((Type*)HGReducible<Type>::rv, local);\n    }\n  }\n\n  __device__ void reduce(Type value) {\n    if (local > value)\n      local = value;\n  }\n};\n"
  },
  {
    "path": "libgpu/include/sharedptr.h",
    "content": "/*\n  sharedptr.h\n\n  Convenience class for shared CPU/GPU allocations.\n  Based on the X10 Runtime ideas described in Pai et al. in PACT 2012.\n  Also see NVIDIA Hemi's array.h at <https://github.com/harrism/hemi>\n\n  Copyright (C) 2014--2016, The University of Texas at Austin\n\n  Author: Sreepathi Pai  <sreepai@ices.utexas.edu>\n*/\n\n#pragma once\n#include <cstdlib>\n#include <cstdio>\n#include <cuda.h>\n#include <assert.h>\n#include \"cutil_subset.h\"\n\ntemplate <typename T>\nclass Shared {\n  T** ptrs;\n  bool* owner;\n  bool* isCPU;\n  int max_devices;\n  size_t nmemb;\n\npublic:\n  Shared() { nmemb = 0; }\n\n  Shared(size_t nmemb) {\n    this->nmemb = nmemb;\n    max_devices = 2;\n    ptrs        = (T**)calloc(max_devices, sizeof(T*));\n    owner       = (bool*)calloc(max_devices, sizeof(bool));\n    isCPU       = (bool*)calloc(max_devices, sizeof(bool));\n\n    isCPU[0] = true;\n\n    for (int i = 0; i < max_devices; i++)\n      owner[i] = true;\n  }\n\n  size_t size() const { return this->nmemb; }\n\n  void alloc(size_t nmemb) {\n    assert(this->nmemb == 0);\n\n    this->nmemb = nmemb;\n\n    max_devices = 2;\n    ptrs        = (T**)calloc(max_devices, sizeof(T*));\n    owner       = (bool*)calloc(max_devices, sizeof(bool));\n    isCPU       = (bool*)calloc(max_devices, sizeof(bool));\n\n    isCPU[0] = true;\n\n    for (int i = 0; i < max_devices; i++)\n      owner[i] = true;\n  }\n\n  void free() {\n    for (int i = 0; i < max_devices; i++)\n      free_device(i);\n  }\n\n  bool free_device(int device = 0) {\n    assert(device < max_devices);\n\n    if (!ptrs[device])\n      return true;\n\n    if (isCPU[device])\n      ::free(ptrs[device]);\n    else {\n      if (cudaFree(ptrs[device]) == cudaSuccess)\n        ptrs[device] = NULL;\n      else\n        return false;\n    }\n\n    return true;\n  }\n\n  bool find_owner(int& o) {\n    int i;\n    for (i = 0; i < max_devices; i++)\n      if (owner[i]) {\n        o = i;\n        break;\n   
   }\n\n    return i < max_devices;\n  }\n\n  T* cpu_rd_ptr() {\n    if (ptrs[0] == NULL)\n      ptrs[0] = (T*)calloc(nmemb, sizeof(T));\n\n    if (!owner[0]) {\n      int o;\n      if (find_owner(o))\n        copy(o, 0);\n\n      owner[0] = true;\n    }\n\n    return ptrs[0];\n  }\n\n  T* cpu_wr_ptr(bool overwrite = false) {\n    if (ptrs[0] == NULL)\n      ptrs[0] = (T*)calloc(nmemb, sizeof(T));\n\n    if (!owner[0]) {\n      if (!overwrite) {\n        int o;\n        if (find_owner(o))\n          copy(o, 0);\n      }\n\n      owner[0] = true;\n    }\n\n    for (int i = 1; i < max_devices; i++)\n      owner[i] = false;\n\n    return ptrs[0];\n  }\n\n  T* gpu_rd_ptr(int device = 1) /* device >= 1 */\n  {\n    assert(device >= 1);\n\n    if (ptrs[device] == NULL)\n      CUDA_SAFE_CALL(cudaMalloc(&ptrs[device], nmemb * sizeof(T)));\n\n    if (!owner[device]) {\n      int o;\n      if (find_owner(o))\n        copy(o, device);\n\n      owner[device] = true;\n    }\n\n    return ptrs[device];\n  }\n\n  T* gpu_wr_ptr(bool overwrite = false, int device = 1) {\n    assert(device >= 1);\n\n    if (ptrs[device] == NULL) {\n      CUDA_SAFE_CALL(cudaMalloc(&ptrs[device], nmemb * sizeof(T)));\n    }\n\n    if (!owner[device]) {\n      if (!overwrite) {\n        int o;\n        if (find_owner(o))\n          copy(o, device);\n      }\n\n      owner[device] = true;\n    }\n\n    for (int i = 0; i < max_devices; i++)\n      if (i != device)\n        owner[i] = false;\n\n    return ptrs[device];\n  }\n\n  T* zero_gpu(int device = 1) {\n    T* p = gpu_wr_ptr(true, device);\n    CUDA_SAFE_CALL(cudaMemset(p, 0, sizeof(T) * nmemb));\n    return p;\n  }\n\n  void copy(int src, int dst) {\n    if (!ptrs[src])\n      return;\n\n    assert(ptrs[dst]);\n\n    if (isCPU[dst] && !isCPU[src]) {\n      CUDA_SAFE_CALL(cudaMemcpy(ptrs[dst], ptrs[src], nmemb * sizeof(T),\n                                cudaMemcpyDeviceToHost));\n    } else if (!isCPU[dst] && !isCPU[src]) {\n      
CUDA_SAFE_CALL(cudaMemcpy(ptrs[dst], ptrs[src], nmemb * sizeof(T),\n                                cudaMemcpyDeviceToDevice));\n    } else if (!isCPU[dst] && isCPU[src]) {\n      CUDA_SAFE_CALL(cudaMemcpy(ptrs[dst], ptrs[src], nmemb * sizeof(T),\n                                cudaMemcpyHostToDevice));\n    } else\n      abort(); // cpu-to-cpu not implemented\n  }\n\n  __device__ __host__ T* ptr() {\n#ifdef __CUDA_ARCH__\n    return ptrs[1]; // TODO: this is invalid beyond one gpu device!\n#else\n    return ptrs[0];\n#endif\n  }\n};\n\ntemplate <typename T>\nclass DeviceOnly {\n  T* ptr;\n  size_t nmemb;\n\npublic:\n  DeviceOnly() {\n    ptr   = NULL;\n    nmemb = 0;\n  }\n\n  DeviceOnly(size_t nmemb) {\n    ptr = NULL;\n    alloc(nmemb);\n  }\n\n  size_t size() const { return nmemb; }\n\n  void alloc(size_t nmemb) {\n    assert(this->nmemb == 0);\n    this->nmemb = nmemb;\n    CUDA_SAFE_CALL(cudaMalloc(&ptr, nmemb * sizeof(T)));\n  }\n\n  bool free() {\n    if (ptr == NULL)\n      return true;\n    if (cudaFree(ptr) == cudaSuccess) {\n      ptr = NULL;\n      return true;\n    }\n    return false;\n  }\n\n  T* zero_gpu() {\n    CUDA_SAFE_CALL(cudaMemset(ptr, 0, sizeof(T) * nmemb));\n    return ptr;\n  }\n\n  void copy_to_gpu(T* cpu_ptr) { copy_to_gpu(cpu_ptr, nmemb); }\n\n  void copy_to_gpu(T* cpu_ptr, size_t nuseb) {\n    if (cpu_ptr == NULL)\n      return;\n    assert(ptr != NULL);\n    assert(nuseb <= nmemb);\n    CUDA_SAFE_CALL(\n        cudaMemcpy(ptr, cpu_ptr, nuseb * sizeof(T), cudaMemcpyHostToDevice));\n  }\n\n  void copy_to_cpu(T* cpu_ptr) { copy_to_cpu(cpu_ptr, nmemb); }\n\n  void copy_to_cpu(T* cpu_ptr, size_t nuseb) {\n    if (ptr == NULL)\n      return;\n    assert(cpu_ptr != NULL);\n    assert(nuseb <= nmemb);\n    CUDA_SAFE_CALL(\n        cudaMemcpy(cpu_ptr, ptr, nuseb * sizeof(T), cudaMemcpyDeviceToHost));\n  }\n\n  __device__ __host__ T* device_ptr() {\n#ifdef __CUDA_ARCH__\n    return ptr; // TODO: this is invalid beyond one gpu 
device!\n#else\n    return ptr;\n#endif\n  }\n};\n"
  },
  {
    "path": "libgpu/include/snfile.h",
    "content": "#pragma once\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nstruct snappy_file;\n\ntypedef struct snappy_file* SNAPPY_FILE;\n\nSNAPPY_FILE snopen(const char* name, const char* mode);\nsize_t snwrite(SNAPPY_FILE f, void* p, size_t sz);\nsize_t snread(SNAPPY_FILE f, void* p, size_t sz);\nint sneof(SNAPPY_FILE f);\nvoid snclose(SNAPPY_FILE f);\n\n#ifdef __cplusplus\n}\n#endif\n"
  },
  {
    "path": "libgpu/include/thread_work.h",
    "content": "/*\n  thread_work.h\n\n  Copyright (C) 20XX--20XX, The University of Texas at Austin\n\n  Author: Vishwesh Jatala  <vishwesh.jatala@austin.utexas.edu>\n*/\n\nstruct ThreadWork {\n\n  PipeContextT<Worklist2> thread_work_wl;\n  PipeContextT<Worklist2> thread_src_wl;\n  Shared<int> thread_prefix_work_wl;\n  bool initialized = false;\n\n  void init_thread_work(int size) {\n    if (!initialized) {\n      thread_work_wl = PipeContextT<Worklist2>(size);\n      thread_src_wl  = PipeContextT<Worklist2>(size);\n\n      thread_prefix_work_wl.alloc(size);\n      thread_prefix_work_wl.zero_gpu();\n      initialized = true;\n    }\n  }\n\n  void compute_prefix_sum() {\n\n    cub::CachingDeviceAllocator g_allocator(\n        true); // Caching allocator for device memory\n    // Determine temporary device storage requirements for inclusive prefix sum\n    void* d_temp_storage      = NULL;\n    size_t temp_storage_bytes = 0;\n\n    cub::DeviceScan::InclusiveSum(\n        d_temp_storage, temp_storage_bytes, thread_work_wl.in_wl().dwl,\n        thread_prefix_work_wl.gpu_wr_ptr(), thread_work_wl.in_wl().nitems());\n    // Allocate temporary storage for inclusive prefix sum\n    CubDebugExit(\n        g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));\n    // Run inclusive prefix sum\n    cub::DeviceScan::InclusiveSum(\n        d_temp_storage, temp_storage_bytes, thread_work_wl.in_wl().dwl,\n        thread_prefix_work_wl.gpu_wr_ptr(), thread_work_wl.in_wl().nitems());\n  }\n\n  void reset_thread_work() {\n    thread_prefix_work_wl.zero_gpu();\n    thread_work_wl.in_wl().reset();\n    thread_src_wl.in_wl().reset();\n  }\n};\n\n__device__ unsigned compute_src_and_offset(unsigned first, unsigned last,\n                                           unsigned index,\n                                           int* thread_prefix_work_wl,\n                                           unsigned num_items,\n                                           unsigned int& offset) 
{\n\n  unsigned middle = (first + last) / 2;\n\n  if (index <= thread_prefix_work_wl[first]) {\n    if (first == 0) {\n      offset = index - 1;\n      return first;\n    } else {\n      offset = index - thread_prefix_work_wl[first - 1] - 1;\n      return first;\n    }\n  }\n  while (first + 1 != last) {\n    middle = (first + last) / 2;\n    if (index > thread_prefix_work_wl[middle]) {\n      first = middle;\n    } else {\n      last = middle;\n    }\n  }\n  offset = index - thread_prefix_work_wl[first] - 1;\n  return last;\n}\n"
  },
  {
    "path": "libgpu/include/worklist.h",
    "content": "/*\n   worklist.h\n\n   Implements Worklist classes. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n#pragma once\n\n#include \"sharedptr.h\"\n#include \"cub/cub.cuh\"\n#include \"cutil_subset.h\"\n#include \"bmk2.h\"\n#include \"instr.h\"\n#include <moderngpu/kernel_mergesort.hxx>\n#include <stdio.h>\n#include <stdlib.h>\n\n#define SLOTS 1\n\nstatic int zero = 0;\n\nextern mgpu::context_t* mgc;\n\nstatic __global__ void reset_wl(volatile int* dindex) { *dindex = 0; }\n\nstatic __global__ void init_wl(int size, int* dsize, volatile int* dindex) {\n  *dsize        = size;\n  *dindex       = 0;\n  *(dindex + 1) = 0;\n}\n\n/*   int *dwl;\n  int *dindex;\n  int *dcounters;\n  int currslot;\n  int length;\n*/\nstruct Worklist {\n  int* dwl;\n  int* dindex;\n#ifdef SLOTS\n  int* dcounters;\n  int currslot;\n#endif\n  int length, index;\n\n  int* wl;\n  int* dnsize;\n\n  int* dprio;\n\n#ifdef COUNT_ATOMICS\n  int* atomic_counter;\n#endif\n\n#ifdef ATOMIC_DENSITY\n  unsigned int* atomic_density;\n#endif\n\n  Shared<int> prio;\n  bool f_will_write;\n\n  Worklist(size_t nsize) {\n#ifdef SLOTS\n    currslot = 0;\n#endif\n    if (nsize == 0) {\n      wl  = NULL;\n      dwl = NULL;\n    } else {\n      wl = (int*)calloc(nsize, sizeof(int));\n      CUDA_SAFE_CALL(cudaMalloc(&dwl, nsize * sizeof(int)));\n    }\n    CUDA_SAFE_CALL(cudaMalloc(&dnsize, 1 * sizeof(int)));\n#ifdef SLOTS\n    CUDA_SAFE_CALL(cudaMalloc(&dcounters, 2 * sizeof(int)));\n    dindex = &dcounters[currslot];\n#else\n    CUDA_SAFE_CALL(cudaMalloc(&dindex, 1 * sizeof(int)));\n#endif\n    // CUDA_SAFE_CALL(cudaMalloc(&dindex, 2 * sizeof(int)));\n\n    init_wl<<<1, 1>>>(nsize, dnsize, dindex);\n\n    // CUDA_SAFE_CALL(cudaMemcpy(dnsize, &nsize, 1 * sizeof(int),\n    // cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpy((void *) dindex,\n    // 
&zero, 1 * sizeof(zero), cudaMemcpyHostToDevice));\n\n#ifdef COUNT_ATOMICS\n    CUDA_SAFE_CALL(cudaMalloc(&atomic_counter, sizeof(int) * 1));\n    CUDA_SAFE_CALL(cudaMemcpy((void*)atomic_counter, &zero, 1 * sizeof(zero),\n                              cudaMemcpyHostToDevice));\n#endif\n\n#ifdef ATOMIC_DENSITY\n    CUDA_SAFE_CALL(\n        cudaMalloc(&atomic_density, sizeof(unsigned int) * (32 + 1)));\n    CUDA_SAFE_CALL(\n        cudaMemset(atomic_density, 0, sizeof(unsigned int) * (32 + 1)));\n#endif\n\n    // CUDA_SAFE_CALL(cudaMalloc(&rcounter, 1 * sizeof(int)));\n    // CUDA_SAFE_CALL(cudaMemcpy((void *) rcounter, &zero, 1 * sizeof(zero),\n    // cudaMemcpyHostToDevice));\n\n    prio.alloc(nsize);\n    // prio.cpu_wr_ptr();\n    dprio        = prio.gpu_wr_ptr(true);\n    length       = nsize;\n    f_will_write = false;\n    index        = 0;\n  }\n\n  void free() {\n    ::free(wl);\n    CUDA_SAFE_CALL(cudaFree(dwl));\n    CUDA_SAFE_CALL(cudaFree(dnsize));\n#ifdef SLOTS\n    CUDA_SAFE_CALL(cudaFree(dcounters));\n#else\n    CUDA_SAFE_CALL(cudaFree(dindex));\n#endif\n\n#ifdef COUNT_ATOMICS\n    CUDA_SAFE_CALL(cudaFree(atomic_counter));\n#endif\n\n    prio.free();\n  }\n\n  void will_write() { f_will_write = true; }\n\n  void sort() { mergesort(dwl, nitems(), mgpu::less_t<int>(), *mgc); }\n\n  void sort_prio() {\n    mergesort(dprio, dwl, nitems(), mgpu::less_t<int>(), *mgc);\n  }\n\n  void update_gpu(int nsize) {\n#ifdef SLOTS\n    int index[2] = {nsize, 0};\n    currslot     = 0;\n    dindex       = &dcounters[currslot];\n\n    CUDA_SAFE_CALL(cudaMemcpy((void*)dcounters, &index, 2 * sizeof(nsize),\n                              cudaMemcpyHostToDevice));\n#else\n    CUDA_SAFE_CALL(cudaMemcpy((void*)dindex, &nsize, 1 * sizeof(nsize),\n                              cudaMemcpyHostToDevice));\n#endif\n    CUDA_SAFE_CALL(\n        cudaMemcpy(dwl, wl, nsize * sizeof(int), cudaMemcpyHostToDevice));\n  }\n\n  void update_cpu() {\n    int nsize = nitems();\n    
CUDA_SAFE_CALL(\n        cudaMemcpy(wl, dwl, nsize * sizeof(int), cudaMemcpyDeviceToHost));\n  }\n\n  void display_items() {\n    int nsize = nitems();\n    CUDA_SAFE_CALL(\n        cudaMemcpy(wl, dwl, nsize * sizeof(int), cudaMemcpyDeviceToHost));\n\n    printf(\"WL: \");\n    for (int i = 0; i < nsize; i++)\n      printf(\"%d %d, \", i, wl[i]);\n\n    printf(\"\\n\");\n    return;\n  }\n\n  void save(const char* f, const unsigned iteration) {\n    char n[255];\n    int ret;\n\n    ret = snprintf(n, 255, \"%s%s-%05d-%s.wl\", instr_trace_dir(), f, iteration,\n                   instr_uniqid());\n\n    if (ret < 0 || ret >= 255) {\n      fprintf(stderr, \"Error creating filename for kernel '%s', iteration %d\\n\",\n              f, iteration);\n      exit(1);\n    }\n\n    int nsize = nitems();\n    TRACE of  = trace_open(n, \"w\");\n    instr_write_array_gpu(n, of, sizeof(wl[0]), nsize, dwl, wl);\n    trace_close(of);\n    bmk2_log_collect(\"ggc/wlcontents\", n);\n    return;\n  }\n\n  void load(const char* f, const unsigned iteration) {\n    char n[255];\n    int ret;\n\n    ret = snprintf(n, 255, \"%s%s-%05d-%s.wl\", instr_trace_dir(), f, iteration,\n                   instr_saved_uniqid());\n\n    if (ret < 0 || ret >= 255) {\n      fprintf(stderr, \"Error creating filename for kernel '%s', iteration %d\\n\",\n              f, iteration);\n      exit(1);\n    }\n\n    TRACE of  = trace_open(n, \"r\");\n    int nsize = instr_read_array_gpu(n, of, sizeof(wl[0]), length, dwl, wl);\n    CUDA_SAFE_CALL(cudaMemcpy((void*)dindex, &nsize, 1 * sizeof(nsize),\n                              cudaMemcpyHostToDevice));\n    trace_close(of);\n    return;\n  }\n\n#ifdef SLOTS\n  __device__ __host__ inline void reset_next_slot() const {\n#ifdef __CUDA_ARCH__\n    dcounters[1 ^ currslot] = 0;\n#else\n    reset_wl<<<1, 1>>>(&dcounters[1 ^ currslot]);\n#endif\n  }\n\n  __device__ __host__ inline void set_slot(int slot) {\n    currslot = slot;\n    dindex   = &dcounters[currslot];\n 
 }\n\n  __device__ __host__ inline void swap_slots() {\n    currslot ^= 1;\n    dindex = &dcounters[currslot];\n  }\n#endif /* SLOTS */\n\n  __device__ __host__ inline void reset() {\n#ifdef __CUDA_ARCH__\n    *(volatile int*)dindex = 0;\n    // atomicAdd(rcounter, 1);\n#else\n    // CUDA_SAFE_CALL(cudaMemcpy((void *) dindex, &zero, 1 * sizeof(zero),\n    // cudaMemcpyHostToDevice));\n    reset_wl<<<1, 1>>>(dindex);\n#endif\n  }\n\n  __device__ __host__ inline int nitems() {\n#ifdef __CUDA_ARCH__\n    // return atomicAdd(dindex, 0);\n    // return *dindex;\n    return *((volatile int*)dindex);\n#else\n    // if(f_will_write)\n\n    CUDA_SAFE_CALL(cudaMemcpy(&index, (void*)dindex, 1 * sizeof(index),\n                              cudaMemcpyDeviceToHost));\n\n    // f_will_write = 0;\n    return index;\n#endif\n  }\n\n  __device__ int push(int item) {\n    int lindex = atomicAdd((int*)dindex, 1);\n    assert(lindex <= *dnsize);\n\n#ifdef ATOMIC_DENSITY\n    int first, offset, total;\n    warp_active_count(first, offset, total);\n\n    if (offset == 0) {\n      atomicAdd(&atomic_density[total], 1);\n    }\n#endif\n\n#ifdef COUNT_ATOMICS\n    atomicAdd(atomic_counter, 1);\n#endif\n\n    dwl[lindex] = item;\n    return 1;\n  }\n\n  __device__ int push_range(int nitems) const {\n    int lindex = atomicAdd((int*)dindex, nitems);\n    assert(lindex <= *dnsize);\n\n#ifdef COUNT_ATOMICS\n    atomicAdd(atomic_counter, 1);\n#endif\n\n    return lindex;\n  }\n\n  __device__ int push(int item, int prio) {\n    int lindex = atomicAdd((int*)dindex, 1);\n    assert(lindex <= *dnsize);\n\n#ifdef COUNT_ATOMICS\n    atomicAdd(atomic_counter, 1);\n#endif\n\n#ifdef ATOMIC_DENSITY\n    int first, offset, total;\n    warp_active_count(first, offset, total);\n\n    if (offset == 0) {\n      atomicAdd(&atomic_density[total], 1);\n    }\n#endif\n\n    dwl[lindex]   = item;\n    dprio[lindex] = prio;\n\n    return 1;\n  }\n\n  __device__ int push_id(int id, int item) {\n    assert(id <= 
*dnsize);\n    dwl[id] = item;\n    return 1;\n  }\n\n  __device__ int setup_push_warp_one() {\n    int first, total, offset, lindex = 0;\n\n    warp_active_count(first, offset, total);\n\n    if (offset == 0) {\n      lindex = atomicAdd((int*)dindex, total);\n      assert(lindex <= *dnsize);\n#ifdef COUNT_ATOMICS\n      atomicAdd(atomic_counter, 1);\n#endif\n\n      // counting density makes no sense -- it is always 1\n    }\n\n    lindex = cub::ShuffleIndex<32>(lindex, first, 0xffffffff);\n    // lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1\n\n    return lindex + offset;\n  }\n\n  __device__ int setup_push_warp_one_za() {\n    int first, total, offset, lindex = 0;\n\n    // test function, not part of API\n\n    warp_active_count_zero_active(first, offset, total);\n\n    if (offset == 0) {\n      lindex = atomicAdd((int*)dindex, total);\n      assert(lindex <= *dnsize);\n#ifdef COUNT_ATOMICS\n      atomicAdd(atomic_counter, 1);\n#endif\n    }\n\n    lindex = cub::ShuffleIndex<32>(lindex, first, 0xffffffff);\n    // lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1\n\n    return lindex + offset;\n  }\n\n  // must be warp uniform ... i.e. 
all threads in warp must be active\n  template <typename T>\n  __device__ int setup_push_warp(typename T::TempStorage* ts, int nitems) {\n    int total, offset, lindex;\n    T(ts[threadIdx.x / 32]).ExclusiveSum(nitems, offset, total);\n\n    if (threadIdx.x % 32 == 0) {\n      lindex = atomicAdd((int*)dindex, total);\n      assert(lindex <= *dnsize);\n#ifdef COUNT_ATOMICS\n      atomicAdd(atomic_counter, 1);\n#endif\n    }\n\n    lindex = cub::ShuffleIndex<32>(lindex, 0, 0xffffffff);\n    // lindex = cub::ShuffleIndex(lindex, 0); // CUB > 1.3.1\n\n    return lindex + offset;\n  }\n\n  __device__ int do_push(int start, int id, int item) const {\n    assert(id <= *dnsize);\n    dwl[start + id] = item;\n    return 1;\n  }\n\n  __device__ int pop(int& item) const {\n    int lindex = atomicSub((int*)dindex, 1);\n    if (lindex <= 0) {\n      *dindex = 0;\n      return 0;\n    }\n\n    item = dwl[lindex - 1];\n    return 1;\n  }\n};\n\nstruct Worklist2 : public Worklist {\n  Worklist2() : Worklist(0) {}\n  Worklist2(int nsize) : Worklist(nsize) {}\n\n  template <typename T>\n  __device__ __forceinline__ int push_1item(int nitem, int item,\n                                            int threads_per_block) {\n    __shared__ typename T::TempStorage temp_storage;\n    __shared__ int queue_index;\n    int total_items = 0;\n    int thread_data = nitem;\n\n    T(temp_storage).ExclusiveSum(thread_data, thread_data, total_items);\n\n    if (threadIdx.x == 0) {\n      if (debug)\n        printf(\"t: %d\\n\", total_items);\n      queue_index = atomicAdd((int*)dindex, total_items);\n      // printf(\"queueindex: %d %d %d %d %d\\n\", blockIdx.x, threadIdx.x,\n      // queue_index, thread_data + n_items, total_items);\n#ifdef COUNT_ATOMICS\n      atomicAdd(atomic_counter, 1);\n#endif\n    }\n\n    __syncthreads();\n\n    if (nitem == 1) {\n      if (queue_index + thread_data >= *dnsize) {\n        printf(\"GPU: exceeded length: %d %d %d\\n\", queue_index, thread_data,\n               
*dnsize);\n        return 0;\n      }\n\n      // dwl[queue_index + thread_data] = item;\n      cub::ThreadStore<cub::STORE_CG>(dwl + queue_index + thread_data, item);\n    }\n\n    return total_items;\n  }\n\n  template <typename T>\n  __device__ __forceinline__ int push_1item(int nitem, int item, int prio,\n                                            int threads_per_block) {\n    __shared__ typename T::TempStorage temp_storage;\n    __shared__ int queue_index;\n    int total_items = 0;\n    int thread_data = nitem;\n\n    T(temp_storage).ExclusiveSum(thread_data, thread_data, total_items);\n\n    if (threadIdx.x == 0) {\n      if (debug)\n        printf(\"t: %d\\n\", total_items);\n      queue_index = atomicAdd((int*)dindex, total_items);\n      // printf(\"queueindex: %d %d %d %d %d\\n\", blockIdx.x, threadIdx.x,\n      // queue_index, thread_data + n_items, total_items);\n#ifdef COUNT_ATOMICS\n      atomicAdd(atomic_counter, 1);\n#endif\n    }\n\n    __syncthreads();\n\n    if (nitem == 1) {\n      if (queue_index + thread_data >= *dnsize) {\n        printf(\"GPU: exceeded length: %d %d %d\\n\", queue_index, thread_data,\n               *dnsize);\n        return 0;\n      }\n\n      // dwl[queue_index + thread_data] = item;\n      cub::ThreadStore<cub::STORE_CG>(dwl + queue_index + thread_data, item);\n      cub::ThreadStore<cub::STORE_CG>(dprio + queue_index + thread_data, prio);\n    }\n\n    return total_items;\n  }\n\n  template <typename T>\n  __device__ __forceinline__ int push_nitems(int n_items, int* items,\n                                             int threads_per_block) {\n    __shared__ typename T::TempStorage temp_storage;\n    __shared__ int queue_index;\n    int total_items;\n\n    int thread_data = n_items;\n\n    T(temp_storage).ExclusiveSum(thread_data, thread_data, total_items);\n\n    if (threadIdx.x == 0) {\n      queue_index = atomicAdd((int*)dindex, total_items);\n      // printf(\"queueindex: %d %d %d %d %d\\n\", blockIdx.x, 
threadIdx.x,\n      // queue_index, thread_data + n_items, total_items);\n#ifdef COUNT_ATOMICS\n      atomicAdd(atomic_counter, 1);\n#endif\n    }\n\n    __syncthreads();\n\n    for (int i = 0; i < n_items; i++) {\n      // printf(\"pushing %d to %d\\n\", items[i], queue_index + thread_data + i);\n      if (queue_index + thread_data + i >= *dnsize) {\n        printf(\"GPU: exceeded length: %d %d %d %d\\n\", queue_index, thread_data,\n               i, *dnsize);\n        return 0;\n      }\n\n      dwl[queue_index + thread_data + i] = items[i];\n    }\n\n    return total_items;\n  }\n\n  __device__ int pop_id(int id, int& item) const {\n    if (id < *dindex) {\n      item = cub::ThreadLoad<cub::LOAD_CG>(dwl + id);\n      // item = dwl[id];\n      return 1;\n    }\n\n    return 0;\n  }\n\n  __device__ int pop_id_len(int id, int len, int& item) const {\n    if (id < len) {\n      item = cub::ThreadLoad<cub::LOAD_CG>(dwl + id);\n      // item = dwl[id];\n      return 1;\n    }\n\n    return 0;\n  }\n};\n\nstruct WorklistT : public Worklist2 {\n  cudaTextureObject_t tx;\n\n  WorklistT() : Worklist2() {}\n\n  WorklistT(size_t nsize) : Worklist2(nsize) {\n    // from here:\n    // http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-kepler-texture-objects-improve-performance-and-flexibility/\n\n    cudaResourceDesc resDesc;\n    memset(&resDesc, 0, sizeof(resDesc));\n    resDesc.resType                = cudaResourceTypeLinear;\n    resDesc.res.linear.devPtr      = dwl;\n    resDesc.res.linear.desc.f      = cudaChannelFormatKindSigned;\n    resDesc.res.linear.desc.x      = 32; // bits per channel\n    resDesc.res.linear.sizeInBytes = length * sizeof(int);\n\n    cudaTextureDesc texDesc;\n    memset(&texDesc, 0, sizeof(texDesc));\n    texDesc.readMode = cudaReadModeElementType;\n\n    // create texture object: we only have to do this once!\n    CUDA_SAFE_CALL(cudaCreateTextureObject(&tx, &resDesc, &texDesc, NULL));\n  }\n\n  void free() {\n    
CUDA_SAFE_CALL(cudaDestroyTextureObject(tx));\n    Worklist2::free();\n  }\n\n  __device__ int pop_id(int id, int& item) {\n    if (id < *dindex) {\n      item = tex1Dfetch<int>(tx, id);\n      // item = cub::ThreadLoad<cub::LOAD_CG>(dwl + id);\n      return 1;\n    }\n\n    return 0;\n  }\n\n  __device__ int pop_id_len(int id, int len, int& item) {\n    if (id < len) {\n      item = tex1Dfetch<int>(tx, id);\n      // item = cub::ThreadLoad<cub::LOAD_CG>(dwl + id);\n      return 1;\n    }\n\n    return 0;\n  }\n};\n\nstruct Worklist2Light {\n  int* dwl;\n  int* dindex;\n  int* dcounters;\n  int currslot;\n  int length;\n\n  __device__ void fromWL2(Worklist2 wl) {\n    dwl       = wl.dwl;\n    dindex    = wl.dindex;\n    dcounters = wl.dcounters;\n    currslot  = wl.currslot;\n    length    = *wl.dnsize;\n  }\n\n  __device__ __host__ inline int nitems() {\n#ifdef __CUDA_ARCH__\n    // return atomicAdd(dindex, 0);\n    // return *dindex;\n    return *((volatile int*)dindex);\n#else\n    assert(false);\n    return 0;\n    // if(f_will_write)\n\n    // CUDA_SAFE_CALL(cudaMemcpy(&index, (void *) dindex, 1 * sizeof(index),\n    // cudaMemcpyDeviceToHost));\n\n    // f_will_write = 0;\n    // return index;\n#endif\n  }\n\n#ifdef SLOTS\n  __device__ __host__ inline void swap_slots() {\n    currslot ^= 1;\n    dindex = &dcounters[currslot];\n  }\n\n  __device__ __host__ inline void set_slot(int slot) {\n    currslot = slot;\n    dindex   = &dcounters[currslot];\n  }\n#endif /* SLOTS */\n\n#ifdef SLOTS\n  __device__ __host__ inline void reset_next_slot() const {\n#ifdef __CUDA_ARCH__\n    dcounters[1 ^ currslot] = 0;\n#else\n    reset_wl<<<1, 1>>>(&dcounters[1 ^ currslot]);\n#endif\n  }\n#endif\n\n  __device__ int do_push(int start, int id, int item) {\n    assert(id <= length);\n    dwl[start + id] = item;\n    return 1;\n  }\n\n  __device__ int push_range(int nitems) const {\n    int lindex = atomicAdd((int*)dindex, nitems);\n    assert(lindex <= length);\n\n#ifdef 
COUNT_ATOMICS\n    // atomicAdd(atomic_counter, 1);\n#endif\n\n    return lindex;\n  }\n\n  __device__ int setup_push_warp_one() {\n    int first, total, offset, lindex = 0;\n\n    warp_active_count(first, offset, total);\n\n    if (offset == 0) {\n      lindex = atomicAdd((int*)dindex, total);\n      assert(lindex <= length);\n#ifdef COUNT_ATOMICS\n      atomicAdd(atomic_counter, 1);\n#endif\n\n      // counting density makes no sense -- it is always 1\n    }\n\n    lindex = cub::ShuffleIndex<32>(lindex, first, 0xffffffff);\n    // lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1\n\n    return lindex + offset;\n  }\n\n  __device__ int pop_id(int id, int& item) {\n    if (id < *dindex) {\n      item = cub::ThreadLoad<cub::LOAD_CG>(dwl + id);\n      // item = dwl[id];\n      return 1;\n    }\n\n    return 0;\n  }\n};\n\n#ifdef COUNT_ATOMICS\nstatic __device__ __host__ int get_atomic_count(Worklist wl) {\n#ifdef __CUDA_ARCH__\n  return *wl.atomic_counter;\n#else\n  int count = 0;\n  CUDA_SAFE_CALL(cudaMemcpy(&count, wl.atomic_counter, sizeof(int) * 1,\n                            cudaMemcpyDeviceToHost));\n  return count;\n#endif\n}\n#endif\n\n#ifdef ATOMIC_DENSITY\nstatic __device__ __host__ void print_atomic_density(const char* name,\n                                                     Worklist wl) {\n#ifdef __CUDA_ARCH__\n  assert(false);\n#else\n  unsigned count[32 + 1];\n  CUDA_SAFE_CALL(cudaMemcpy(&count, wl.atomic_density,\n                            sizeof(unsigned int) * (32 + 1),\n                            cudaMemcpyDeviceToHost));\n\n  for (int i = 0; i < 32 + 1; i++) {\n    fprintf(stderr, \"INSTR atomic_density_%s_%d %u\\n\", name, i, count[i]);\n  }\n#endif\n}\n#endif\n"
  },
  {
    "path": "libgpu/src/bmk2.c",
    "content": "#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include \"bmk2.h\"\n\nstatic int inited = 0;\nstatic int bmk2   = 0;\nstatic char* binid;\nstatic char* inputid;\nstatic char* runid;\n\nchar* bmk2_get_binid() { return binid; }\n\nchar* bmk2_get_inputid() { return inputid; }\n\nchar* bmk2_get_runid() { return runid; }\n\nint bmk2_log_collect(const char* component, const char* file) {\n  if (bmk2 && binid && inputid && runid) {\n    fprintf(stderr, \"COLLECT %s/%s %s %s %s\\n\", binid, inputid, runid,\n            component, file);\n    return 1;\n  }\n\n  return 0;\n}\n\n__attribute__((constructor)) void init_bmk2() {\n  char* p;\n\n  inited = 1;\n\n  if (p = getenv(\"BMK2\")) {\n    if (atoi(p) == 1) {\n      bmk2 = 1;\n    }\n  }\n\n  if (bmk2) {\n    if (p = getenv(\"BMK2_BINID\")) {\n      binid = strdup(p);\n    }\n\n    if (p = getenv(\"BMK2_INPUTID\")) {\n      inputid = strdup(p);\n    }\n\n    if (p = getenv(\"BMK2_RUNID\")) {\n      runid = strdup(p);\n    }\n  }\n}\n"
  },
  {
    "path": "libgpu/src/csr_graph.cu",
    "content": "/*\n   csr_graph.cu\n\n   Implements CSR Graph. Part of the GGC source code.\n\n   Copyright (C) 2014--2016, The University of Texas at Austin\n\n   See LICENSE.TXT for copyright license.\n\n   Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n*/\n\n/* -*- mode: c++ -*- */\n\n#include \"gg.h\"\n#include \"csr_graph.h\"\n\nunsigned CSRGraph::init() {\n  row_start = edge_dst = NULL;\n  edge_data            = NULL;\n  node_data            = NULL;\n  nnodes = nedges = 0;\n  device_graph    = false;\n\n  return 0;\n}\n\nunsigned CSRGraph::allocOnHost(bool no_edge_data) {\n  assert(nnodes > 0);\n  assert(!device_graph);\n\n  if (row_start != NULL) // already allocated\n    return true;\n\n  size_t mem_usage = ((nnodes + 1) + nedges) * sizeof(index_type) +\n                     (nnodes) * sizeof(node_data_type);\n  if (!no_edge_data)\n    mem_usage += (nedges) * sizeof(edge_data_type);\n\n  printf(\"Host memory for graph: %3u MB\\n\", mem_usage / 1048756);\n\n  row_start = (index_type*)calloc(nnodes + 1, sizeof(index_type));\n  edge_dst  = (index_type*)calloc(nedges, sizeof(index_type));\n  if (!no_edge_data)\n    edge_data = (edge_data_type*)calloc(nedges, sizeof(edge_data_type));\n  node_data = (node_data_type*)calloc(nnodes, sizeof(node_data_type));\n\n  return ((no_edge_data || edge_data) && row_start && edge_dst && node_data);\n}\n\nunsigned CSRGraph::allocOnDevice(bool no_edge_data) {\n  if (edge_dst != NULL) // already allocated\n    return true;\n\n  assert(edge_dst == NULL); // make sure not already allocated\n\n  if (nedges > 0)\n    check_cuda(cudaMalloc((void**)&edge_dst, nedges * sizeof(index_type)));\n  check_cuda(cudaMalloc((void**)&row_start, (nnodes + 1) * sizeof(index_type)));\n\n  if (!no_edge_data && (nedges > 0))\n    check_cuda(cudaMalloc((void**)&edge_data, nedges * sizeof(edge_data_type)));\n  if (nnodes > 0)\n    check_cuda(cudaMalloc((void**)&node_data, nnodes * sizeof(node_data_type)));\n\n  device_graph = true;\n\n  
assert(((nedges == 0) || edge_dst) &&\n         (no_edge_data || (nedges == 0) || edge_data) && row_start &&\n         ((nnodes == 0) || node_data));\n  return true;\n}\n\nvoid CSRGraphTex::copy_to_gpu(struct CSRGraphTex& copygraph) {\n  copygraph.nnodes = nnodes;\n  copygraph.nedges = nedges;\n\n  copygraph.allocOnDevice(edge_data == NULL);\n\n  check_cuda(cudaMemcpy(copygraph.edge_dst, edge_dst,\n                        nedges * sizeof(index_type), cudaMemcpyHostToDevice));\n  if (edge_data != NULL)\n    check_cuda(cudaMemcpy(copygraph.edge_data, edge_data,\n                          nedges * sizeof(edge_data_type),\n                          cudaMemcpyHostToDevice));\n  check_cuda(cudaMemcpy(copygraph.node_data, node_data,\n                        nnodes * sizeof(node_data_type),\n                        cudaMemcpyHostToDevice));\n\n  check_cuda(cudaMemcpy(copygraph.row_start, row_start,\n                        (nnodes + 1) * sizeof(index_type),\n                        cudaMemcpyHostToDevice));\n}\n\nunsigned CSRGraphTex::allocOnDevice(bool no_edge_data) {\n  if (CSRGraph::allocOnDevice(no_edge_data)) {\n    assert(sizeof(index_type) <= 4);     // 32-bit only!\n    assert(sizeof(node_data_type) <= 4); // 32-bit only!\n\n    cudaResourceDesc resDesc;\n\n    memset(&resDesc, 0, sizeof(resDesc));\n    resDesc.resType           = cudaResourceTypeLinear;\n    resDesc.res.linear.desc.f = cudaChannelFormatKindUnsigned;\n    resDesc.res.linear.desc.x = 32; // bits per channel\n\n    cudaTextureDesc texDesc;\n    memset(&texDesc, 0, sizeof(texDesc));\n    texDesc.readMode = cudaReadModeElementType;\n\n    resDesc.res.linear.devPtr      = edge_dst;\n    resDesc.res.linear.sizeInBytes = nedges * sizeof(index_type);\n    check_cuda(cudaCreateTextureObject(&edge_dst_tx, &resDesc, &texDesc, NULL));\n\n    resDesc.res.linear.devPtr      = row_start;\n    resDesc.res.linear.sizeInBytes = (nnodes + 1) * sizeof(index_type);\n    check_cuda(\n        
cudaCreateTextureObject(&row_start_tx, &resDesc, &texDesc, NULL));\n\n    resDesc.res.linear.devPtr      = node_data;\n    resDesc.res.linear.sizeInBytes = (nnodes) * sizeof(node_data_type);\n    check_cuda(\n        cudaCreateTextureObject(&node_data_tx, &resDesc, &texDesc, NULL));\n\n    return 1;\n  }\n\n  return 0;\n}\n\nunsigned CSRGraph::deallocOnHost() {\n  if (!device_graph) {\n    free(row_start);\n    free(edge_dst);\n    if (edge_data != NULL)\n      free(edge_data);\n    free(node_data);\n  }\n\n  return 0;\n}\nunsigned CSRGraph::deallocOnDevice() {\n  if (device_graph) {\n    cudaFree(edge_dst);\n    if (edge_data != NULL)\n      cudaFree(edge_data);\n    cudaFree(row_start);\n    cudaFree(node_data);\n  }\n\n  return 0;\n}\n\nCSRGraph::CSRGraph() { init(); }\n\nvoid CSRGraph::progressPrint(unsigned maxii, unsigned ii) {\n  const unsigned nsteps = 10;\n  unsigned ineachstep   = (maxii / nsteps);\n  if (ineachstep == 0)\n    ineachstep = 1;\n  /*if (ii == maxii) {\n    printf(\"\\t100%%\\n\");\n    } else*/\n  if (ii % ineachstep == 0) {\n    int progress = ((size_t)ii * 100) / maxii + 1;\n\n    printf(\"\\t%3d%%\\r\", progress);\n    fflush(stdout);\n  }\n}\n\nunsigned CSRGraph::readFromGR(char file[], bool read_edge_data) {\n  std::ifstream cfile;\n  cfile.open(file);\n\n  // copied from GaloisCpp/trunk/src/FileGraph.h\n  int masterFD = open(file, O_RDONLY);\n  if (masterFD == -1) {\n    printf(\"FileGraph::structureFromFile: unable to open %s.\\n\", file);\n    return 1;\n  }\n\n  struct stat buf;\n  int f = fstat(masterFD, &buf);\n  if (f == -1) {\n    printf(\"FileGraph::structureFromFile: unable to stat %s.\\n\", file);\n    abort();\n  }\n  size_t masterLength = buf.st_size;\n\n  int _MAP_BASE = MAP_PRIVATE;\n  //#ifdef MAP_POPULATE\n  //  _MAP_BASE  |= MAP_POPULATE;\n  //#endif\n\n  void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0);\n  if (m == MAP_FAILED) {\n    m = 0;\n    printf(\"FileGraph::structureFromFile: mmap 
failed.\\n\");\n    abort();\n  }\n\n  ggc::Timer t(\"graphreader\");\n  t.start();\n\n  // parse file\n  uint64_t* fptr                           = (uint64_t*)m;\n  __attribute__((unused)) uint64_t version = le64toh(*fptr++);\n  assert(version == 1);\n  uint64_t sizeEdgeTy = le64toh(*fptr++);\n  uint64_t numNodes   = le64toh(*fptr++);\n  uint64_t numEdges   = le64toh(*fptr++);\n  uint64_t* outIdx    = fptr;\n  fptr += numNodes;\n  uint32_t* fptr32 = (uint32_t*)fptr;\n  uint32_t* outs   = fptr32;\n  fptr32 += numEdges;\n  if (numEdges % 2)\n    fptr32 += 1;\n  edge_data_type* edgeData = (edge_data_type*)fptr32;\n\n  // cuda.\n  nnodes = numNodes;\n  nedges = numEdges;\n\n  printf(\"nnodes=%d, nedges=%d, sizeEdge=%d.\\n\", nnodes, nedges, sizeEdgeTy);\n  allocOnHost(!read_edge_data);\n\n  row_start[0] = 0;\n\n  for (unsigned ii = 0; ii < nnodes; ++ii) {\n    row_start[ii + 1] = le64toh(outIdx[ii]);\n    //   //noutgoing[ii] = le64toh(outIdx[ii]) - le64toh(outIdx[ii - 1]);\n    index_type degree = row_start[ii + 1] - row_start[ii];\n\n    for (unsigned jj = 0; jj < degree; ++jj) {\n      unsigned edgeindex = row_start[ii] + jj;\n\n      unsigned dst = le32toh(outs[edgeindex]);\n      if (dst >= nnodes)\n        printf(\"\\tinvalid edge from %d to %d at index %d(%d).\\n\", ii, dst, jj,\n               edgeindex);\n\n      edge_dst[edgeindex] = dst;\n\n      if (sizeEdgeTy && read_edge_data)\n        edge_data[edgeindex] = edgeData[edgeindex];\n    }\n\n    progressPrint(nnodes, ii);\n  }\n\n  cfile.close(); // probably galois doesn't close its file due to mmap.\n  t.stop();\n\n  // TODO: fix MB/s\n  printf(\"read %lld bytes in %d ms (%0.2f MB/s)\\n\\r\\n\", masterLength,\n         t.duration_ms(), (masterLength / 1000.0) / (t.duration_ms()));\n\n  return 0;\n}\n\nunsigned CSRGraph::read(char file[], bool read_edge_data) {\n  return readFromGR(file, read_edge_data);\n}\n\nvoid CSRGraph::dealloc() {\n  if (device_graph)\n    deallocOnDevice();\n  else\n    
deallocOnHost();\n}\n\nvoid CSRGraph::copy_to_gpu(struct CSRGraph& copygraph) {\n  copygraph.nnodes = nnodes;\n  copygraph.nedges = nedges;\n\n  copygraph.allocOnDevice(edge_data == NULL);\n\n  check_cuda(cudaMemcpy(copygraph.edge_dst, edge_dst,\n                        nedges * sizeof(index_type), cudaMemcpyHostToDevice));\n  if (edge_data != NULL)\n    check_cuda(cudaMemcpy(copygraph.edge_data, edge_data,\n                          nedges * sizeof(edge_data_type),\n                          cudaMemcpyHostToDevice));\n  check_cuda(cudaMemcpy(copygraph.node_data, node_data,\n                        nnodes * sizeof(node_data_type),\n                        cudaMemcpyHostToDevice));\n\n  check_cuda(cudaMemcpy(copygraph.row_start, row_start,\n                        (nnodes + 1) * sizeof(index_type),\n                        cudaMemcpyHostToDevice));\n}\n\nvoid CSRGraph::copy_to_cpu(struct CSRGraph& copygraph) {\n  assert(device_graph);\n\n  // cpu graph is not allocated\n  assert(copygraph.nnodes = nnodes);\n  assert(copygraph.nedges = nedges);\n\n  check_cuda(cudaMemcpy(copygraph.edge_dst, edge_dst,\n                        nedges * sizeof(index_type), cudaMemcpyDeviceToHost));\n  if (edge_data != NULL)\n    check_cuda(cudaMemcpy(copygraph.edge_data, edge_data,\n                          nedges * sizeof(edge_data_type),\n                          cudaMemcpyDeviceToHost));\n  check_cuda(cudaMemcpy(copygraph.node_data, node_data,\n                        nnodes * sizeof(node_data_type),\n                        cudaMemcpyDeviceToHost));\n\n  check_cuda(cudaMemcpy(copygraph.row_start, row_start,\n                        (nnodes + 1) * sizeof(index_type),\n                        cudaMemcpyDeviceToHost));\n}\n\nstruct EdgeIterator {\n  CSRGraph* g;\n  index_type node;\n  index_type s;\n\n  __device__ EdgeIterator(CSRGraph& g, index_type node) {\n    this->g    = &g;\n    this->node = node;\n  }\n\n  __device__ index_type size() const {\n    return g->row_start[node + 1] 
- g->row_start[node];\n  }\n\n  __device__ index_type start() {\n    s = g->row_start[node];\n    return s;\n  }\n\n  __device__ index_type end() const { return g->row_start[node + 1]; }\n\n  __device__ void next() { s++; }\n\n  __device__ index_type dst() const { return g->edge_dst[s]; }\n\n  __device__ edge_data_type data() const { return g->edge_data[s]; }\n};\n"
  },
  {
    "path": "libgpu/src/ggc_rt.cu",
    "content": "/* -*- mode: c++ -*- */\n#include <cuda.h>\n#include \"gg.h\"\n\nstatic struct ggc_rt_dev_info dinfo = {-1, -1};\n\nvoid ggc_init_dev_info() {\n  int dev;\n  struct cudaDeviceProp p;\n\n  check_cuda(cudaGetDevice(&dev));\n  dinfo.dev = dev;\n  \n  check_cuda(cudaGetDeviceProperties(&p, dev));\n  dinfo.nSM = p.multiProcessorCount;\n}\n\nvoid ggc_set_gpu_device(int dev) {\n  check_cuda(cudaSetDevice(dev));\n  ggc_init_dev_info();\n}\n\nint ggc_get_nSM() {\n  if(dinfo.dev == -1)\n    ggc_init_dev_info();\n\n  return dinfo.nSM;\n}\n"
  },
  {
    "path": "libgpu/src/instr.cu",
    "content": "/* -*- mode: c++ -*- */\n#include <cuda.h>\n#include <stdio.h>\n#include <string.h>\n#include <assert.h>\n#include <stdlib.h>\n#include <sys/types.h>\n#include <sys/stat.h>\n#include <unistd.h>\n#include \"gg.h\"\n#include \"bmk2.h\"\n#include <zlib.h>\n\n#ifdef USE_SNAPPY\n#include \"snfile.h\"\n#endif\n\nenum compformat {\n  UNCOMPRESSED = 0,\n  GZIP = 1,\n  SNAPPY = 2\n};\n\nstruct trace_file {\n  int format;\n  union {\n    FILE *f;\n    gzFile z;\n#ifdef USE_SNAPPY\n    SNAPPY_FILE s;\n#endif    \n  };\n};\n\nstatic const char *saved_uniqid;\n\nTRACE trace_open(const char *name, const char *mode) {\n  trace_file *t;\n  int use_compress = 1;\n  const char *c;\n\n  t = (trace_file *) malloc(sizeof(trace_file) * 1);\n\n  if(!t) {\n    fprintf(stderr, \"(Internal) Unable to allocate memory for TRACE '%s' (mode: %s)\\n\", name, mode);\n    exit(1);\n  }\n\n  if(c = getenv(\"INSTR_COMPRESS\")) {\n    use_compress = atoi(c);\n    fprintf(stderr, \"Instr Compression enabled: %d\\n\", use_compress);\n  }\n\n  if(!use_compress) {\n    t->format = UNCOMPRESSED;\n    t->f = fopen(name, mode);\n  \n    if(!t->f) {\n      fprintf(stderr, \"Unable to open trace data file '%s' (mode: %s)\\n\", name, mode);\n      exit(1);\n    }\n  } else {\n#ifdef USE_SNAPPY\n    t->format = SNAPPY;\n    t->s = snopen(name, mode);\n    if(!t->s) {\n      fprintf(stderr, \"Unable to open compressed trace data file '%s' (mode: %s)\\n\", name, mode);\n      exit(1);      \n    }\n#else\n    t->format = GZIP;\n    t->z = gzopen(name, mode);\n    \n    gzbuffer(t->z, 1048576);\n\n    int gzip_level = 3;\n\n    if(c = getenv(\"INSTR_GZIP_LEVEL\")) {\n      gzip_level = atoi(c);\n      fprintf(stderr, \"Using GZIP level: %d\\n\", gzip_level);\n    }\n\n    gzsetparams(t->z, gzip_level, Z_DEFAULT_STRATEGY);\n\n    if(!t->z) {\n      fprintf(stderr, \"Unable to open compressed trace data file '%s' (mode: %s)\\n\", name, mode);\n      exit(1);\n    }\n#endif\n  }\n\n  return 
t;\n}\n\nvoid trace_close(TRACE t) {\n\n  if(t->format == UNCOMPRESSED) {\n    fclose(t->f);\n  } else if (t->format == GZIP) {\n    gzclose(t->z);\n  } else if (t->format == SNAPPY) {\n#ifdef USE_SNAPPY\n    snclose(t->s);\n#endif \n  }\n\n  free(t);\n}\n\n\nvoid instr_set_saved_uniqid(const char *id) {\n  saved_uniqid = id;\n}\n\nvoid instr_load_uniqid() {\n  const int SZ=255;\n  static char id[SZ];\n  const char *r = NULL;\n\n  r = getenv(\"INSTR_UNIQID\");\n\n  if(r) {\n    strncpy(id, r, SZ);\n    assert(id[SZ - 1] == '\\0');\n    instr_set_saved_uniqid(id);\n  } else {\n    fprintf(stderr, \"Unable to read environment variable INSTR_UNIQID\\n\");\n    exit(1);\n  }\n}\n\nconst char *instr_trace_dir() {\n  const int SZ=255;\n  static char dir[SZ];\n  static bool checked;\n  const char *r = NULL;\n  \n  if(!checked) {\n    r = getenv(\"INSTR_TRACE_DIR\");\n    \n    if(r) {\n      strncpy(dir, r, SZ);\n      assert(dir[SZ - 1] == '\\0');\n      //TODO: append a \"/\"?\n    } else {    \n      dir[0] = '\\0';\n    }\n    checked = true;\n  }\n\n  return dir;\n}\n\nconst char *instr_saved_uniqid() {\n  return saved_uniqid;\n}\n\nconst char *instr_uniqid() {\n  const char *runid;\n  static char spid[32];\n  int ret;\n\n  runid = bmk2_get_runid();\n  if(!runid) {\n    ret = snprintf(spid, 32, \"%d\", getpid());\n    assert(ret > 0 && ret < 32);\n    runid = spid;\n  }\n\n  return runid;\n}\n\nvoid instr_write_array(const char *n, \n\t\t      TRACE f, size_t elemsz, size_t nelems, void *p) \n{\n  assert(f != NULL);\n\n  if(f->format == UNCOMPRESSED) {\n    if(fwrite(&nelems, sizeof(nelems), 1, f->f) != 1) {\n      fprintf(stderr, \"Error writing size to '%s'\\n\", n);\n      exit(1);\n    }\n\n    if(fwrite(p, elemsz, nelems, f->f) != nelems) {\n      fprintf(stderr, \"Error writing items to '%s'\\n\", n);\n      exit(1);\n    }\n  } else if(f->format == GZIP) {\n    if(gzwrite(f->z, &nelems, sizeof(nelems) * 1) < sizeof(nelems) * 1) {\n      fprintf(stderr, \"Error 
writing size to compressed '%s'\\n\", n);\n      exit(1);\n    }\n    \n    if(gzwrite(f->z, p, elemsz * nelems) < elemsz * nelems) {\n      fprintf(stderr, \"Error writing items to compressed '%s'\\n\", n);\n      exit(1);\n    }\n  } else if(f->format == SNAPPY) {\n#ifdef USE_SNAPPY\n    if(snwrite(f->s, &nelems, sizeof(nelems) * 1) < sizeof(nelems) * 1) {\n      fprintf(stderr, \"Error writing size to compressed '%s'\\n\", n);\n      exit(1);\n    }\n    \n    if(snwrite(f->s, p, elemsz * nelems) < elemsz * nelems) {\n      fprintf(stderr, \"Error writing items to compressed '%s'\\n\", n);\n      exit(1);\n    }\n#endif \n  }\n}\n\n#ifdef USE_SNAPPY\nSNAPPY_FILE trace_snappy_handle(TRACE f) {\n  return f->s;\n}\n#endif\n\nsize_t instr_read_array(const char *n, \n\t\t\tTRACE f, \n\t\t\tsize_t elemsz, \n\t\t\tsize_t maxnelems, \n\t\t\tvoid *p) \n{\n  size_t nelems;\n\n  assert(f != NULL);\n  if(f->format == UNCOMPRESSED) {\n    if(fread(&nelems, sizeof(nelems), 1, f->f) != 1) {\n      fprintf(stderr, \"Error reading size from '%s'\\n\", n);\n      exit(1);\n    }\n\n    if(nelems > maxnelems) {\n      fprintf(stderr, \"Too many items to read from '%s'\\n\", n);\n      exit(1);\n    }\n\n    if(fread(p, elemsz, nelems, f->f) != nelems) {\n      fprintf(stderr, \"Error reading items from '%s'\\n\", n);\n      exit(1);\n    }\n  } else if(f->format == GZIP) {\n    if(gzread(f->z, &nelems, sizeof(nelems) * 1) < sizeof(nelems) * 1) {\n      fprintf(stderr, \"Error reading size from compressed '%s'\\n\", n);\n      exit(1);\n    }\n\n    if(nelems > maxnelems) {\n      fprintf(stderr, \"Too many items to read from '%s'\\n\", n);\n      exit(1);\n    }\n\n    if(gzread(f->z, p, elemsz * nelems) != elemsz * nelems) {\n      fprintf(stderr, \"Error reading items from compressed '%s'\\n\", n);\n      exit(1);\n    }\n  } else if(f->format == SNAPPY) {\n#ifdef USE_SNAPPY\n    if(snread(f->s, &nelems, sizeof(nelems) * 1) < sizeof(nelems) * 1) {\n      fprintf(stderr, \"Error 
reading size from compressed '%s'\\n\", n);\n      exit(1);\n    }\n\n    if(nelems > maxnelems) {\n      fprintf(stderr, \"Too many items to read from '%s'\\n\", n);\n      exit(1);\n    }\n\n    if(snread(f->s, p, elemsz * nelems) != elemsz * nelems) {\n      fprintf(stderr, \"Error reading items from compressed '%s'\\n\", n);\n      exit(1);\n    }\n#endif \n  }\n  return nelems;\n}\n\nsize_t instr_read_array_gpu(const char *n, \n\t\t\t    TRACE f, size_t elemsz, size_t maxnelems, \n\t\t\t    void *gp, void *cp) \n{\n  bool allocated = false;\n\n  if(!cp) {\n    cp = malloc(elemsz * maxnelems);\n    allocated = true;\n    assert(cp != NULL);\n  }\n\n  size_t nelems = instr_read_array(n, f, elemsz, maxnelems, cp);\n\n  check_cuda(cudaMemcpy(gp, cp, nelems * elemsz, cudaMemcpyHostToDevice));\n  \n  if(allocated) \n    free(cp);\n\n  return nelems;\n}\n\n\nvoid instr_write_array_gpu(const char *n, \n\t\t\t   TRACE f, size_t elemsz, size_t nelems, \n\t\t\t   void *gp, void *cp) \n{\n  bool allocated = false;\n\n  if(!cp) {\n    cp = malloc(elemsz * nelems);\n    allocated = true;\n    assert(cp != NULL);\n  }\n\n  check_cuda(cudaMemcpy(cp, gp, nelems * elemsz, cudaMemcpyDeviceToHost));\n  \n  instr_write_array(n, f, elemsz, nelems, cp);\n\n  if(allocated) \n    free(cp);\n}\n\n\nvoid instr_save_primitive(const char *name, \n\t\t\t  const int invocation,\n\t\t\t  const int pos,\n\t\t\t  const char *arg,\n\t\t\t  void *p, size_t sp)\n{\n  const int SZ=255;\n  char fname[SZ];\n  int written;\n\n  written = snprintf(fname, SZ, \"%s%s.%s.%s.arg\", instr_trace_dir(), name, arg, instr_uniqid());\n  \n  assert(written > 0 && written < SZ);\n  \n  FILE *o;\n  o = fopen(fname, \"w+\");\n\n  if(o == NULL) {\n    fprintf(stderr, \"Failed to open '%s'\\n\", fname);\n    exit(1);\n  }\n\n  if(fseek(o, invocation * sp, SEEK_SET) == 0) {\n    if(fwrite(p, sp, 1, o) != 1) {\n      fprintf(stderr, \"instr_save_primitive: Write failed!\\n\");\n      exit(1);\n    }\n  }\n  else {\n    
  fprintf(stderr, \"instr_save_primitive: fseek failed!\\n\");\n      exit(1);    \n  }\n\n  if(invocation == 0) {\n    bmk2_log_collect(\"ggc/kstate\", fname);\n  }\n\n  fclose(o);\n}\n\nvoid instr_load_primitive(const char *name, \n\t\t\t  const int invocation,\n\t\t\t  const int pos,\n\t\t\t  const char *arg,\n\t\t\t  void *p, size_t sp)\n{\n  const int SZ=255;\n  char fname[SZ];\n  int written;\n\n  written = snprintf(fname, SZ, \"%s%s.%s.%s.arg\", instr_trace_dir(), name, arg, instr_saved_uniqid());\n  \n  assert(written > 0 && written < SZ);\n  \n  FILE *o;\n  o = fopen(fname, \"r\");\n\n  if(o == NULL) {\n    fprintf(stderr, \"Failed to open '%s'\\n\", fname);\n    exit(1);\n  }\n\n  if(fseek(o, invocation * sp, SEEK_SET) == 0) {\n    if(fread(p, sp, 1, o) != 1) {\n      fprintf(stderr, \"instr_load_primitive: Read failed!\\n\");\n      exit(1);\n    }\n  }\n  else {\n      fprintf(stderr, \"instr_load_primitive: fseek failed!\\n\");\n      exit(1);    \n  }\n\n  fclose(o);\n}\n\n\nvoid instr_save_array_gpu(const char *kernel,\n\t\t\t  const int invocation,\n\t\t\t  const int pos,\n\t\t\t  const char *arg,\n\t\t\t  void *gp,\n\t\t\t  void *cp,\n\t\t\t  size_t sz,\n\t\t\t  size_t num)\n{\n  const int SZ=255;\n  char fname[SZ];\n  int written;\n\n  written = snprintf(fname, SZ, \"%s%s.%s.%d.%s.arg\", instr_trace_dir(), kernel, arg, invocation, instr_uniqid());\n  \n  assert(written >0 && written < SZ);\n  \n  TRACE o;\n  o = trace_open(fname, \"w\");\n\n  instr_write_array_gpu(fname, o, sz, num, gp, cp);\n\n  bmk2_log_collect(\"ggc/kstate\", fname);\n\n  trace_close(o);\n}\n\nvoid instr_save_array(const char *kernel,\n\t\t      const int invocation,\n\t\t      const int pos,\n\t\t      const char *arg,\n\t\t      void *cp,\n\t\t      size_t sz,\n\t\t      size_t num)\n{\n  const int SZ=255;\n  char fname[SZ];\n  int written;\n\n  written = snprintf(fname, SZ, \"%s%s.%s.%d.%s.arg\", instr_trace_dir(), kernel, arg, invocation, instr_uniqid());\n  \n  
assert(written >0 && written < SZ);\n  \n  TRACE o;\n  o = trace_open(fname, \"w\");\n\n  instr_write_array(fname, o, sz, num, cp);\n\n  bmk2_log_collect(\"ggc/kstate\", fname);\n\n  trace_close(o);\n}\n\n\nsize_t instr_load_array_gpu(const char *kernel,\n\t\t\t    const int invocation,\n\t\t\t    const int pos,\n\t\t\t    const char *arg,\n\t\t\t    void *gp,\n\t\t\t    void *cp,\n\t\t\t    size_t sz,\n\t\t\t    size_t maxnum)\n{\n  const int SZ=255;\n  char fname[SZ];\n  int written;\n\n  written = snprintf(fname, SZ, \"%s%s.%s.%d.%s.arg\", instr_trace_dir(), kernel, arg, invocation, \n\t\t     instr_saved_uniqid());\n  \n  assert(written >0 && written < SZ);\n  \n  TRACE o;\n  o = trace_open(fname, \"r\");\n\n  if(o == NULL) {\n    fprintf(stderr, \"Failed to open '%s'\\n\", fname);\n    exit(1);\n  }\n\n  assert(o != NULL);\n\n  size_t nelems = instr_read_array_gpu(fname, o, sz, maxnum, gp, cp);\n  \n  trace_close(o);\n\n  return nelems;\n}\n\nsize_t instr_load_array(const char *kernel,\n\t\t\tconst int invocation,\n\t\t\tconst int pos,\n\t\t\tconst char *arg,\n\t\t\tvoid *cp,\n\t\t\tsize_t sz,\n\t\t\tsize_t maxnum)\n{\n  const int SZ=255;\n  char fname[SZ];\n  int written;\n\n  written = snprintf(fname, SZ, \"%s%s.%s.%d.%s.arg\", instr_trace_dir(), kernel, arg, invocation, \n\t\t     instr_saved_uniqid());\n  \n  assert(written >0 && written < SZ);\n  \n  TRACE o;\n  o = trace_open(fname, \"r\");\n\n  size_t nelems = instr_read_array(fname, o, sz, maxnum, cp);\n  \n  trace_close(o);\n\n  return nelems;\n}\n\n\nstruct instr_trace_record {\n  int ty;\n  int depth;\n  int index;\n};\n\nstruct instr_trace {\n  FILE *f;  \n  struct instr_trace_record *r;\n  int records;\n  int index;\n};\n\n\nstruct instr_trace * instr_trace_file(const char *prefix, int mode) {\n  struct instr_trace *it;\n  const int SZ=255;\n  char fname[SZ];\n  int written;\n  const char *id;\n\n  it = (struct instr_trace *) malloc(sizeof(instr_trace));\n  it->r = NULL;\n\n  if(mode == 0) {\n    
id = instr_saved_uniqid(); // read\n  } else {\n    id = instr_uniqid(); //write\n  }\n\n  written = snprintf(fname, SZ, \"%s%s.%s.trace\", instr_trace_dir(), prefix, id);\n  assert(written >0 && written < SZ);\n\n  if(mode == 0) \n    it->f = fopen(fname, \"r\");\n  else \n    it->f = fopen(fname, \"w\");\n\n  if(!it->f) {\n    fprintf(stderr, \"Failed to open '%s' for %s\\n\", fname, mode ? \"writing\" : \"reading\");\n    exit(1);  \n  }\n\n  if(mode == 0) {\n    instr_load_trace(fname, it);\n  } else {\n    bmk2_log_collect(\"ggc/trace\", fname);\n  }\n\n  return it;\n}\n\nvoid instr_load_trace(const char *n, struct instr_trace *it) {\n  assert(it != NULL);\n  assert(it->f != NULL);\n  assert(it->r == NULL);\n\n  int wr = 0;\n  int N = 1024;\n  int ty, depth, index;\n\n  it->r = (struct instr_trace_record *) malloc(N*sizeof(instr_trace_record));\n\n  assert(it->r);\n\n  int fd = fileno(it->f);\n  assert(fd != -1);\n\n  struct stat s;\n\n  if(fstat(fd, &s) != 0) {\n    fprintf(stderr, \"Error when stat'ing trace file '%s'\\n\", n);\n    exit(1);\n  }\n\n  if(s.st_size > 0) {\n    while(!feof(it->f)) {\n      if(fscanf(it->f, \"%d %d %d\\n\", &ty, &depth, &index) == 3) {\n\tit->r[wr].ty = ty;\n\tit->r[wr].depth = depth;\n\tit->r[wr].index = index;\n\twr++;\n\tif(wr >= N) {\n\t  N *= 2;\n\t  it->r = (struct instr_trace_record *) realloc(it->r, N*sizeof(instr_trace_record));\n\t  assert(it->r != NULL);\n\t}\n      }\n      else {\n\tfprintf(stderr, \"Error when reading trace file '%s'\\n\", n);\n\texit(1);\n      }\n    }\n  }\n\n  it->records = wr;\n  it->index = 0;\n}\n\nbool instr_match_pipe(struct instr_trace *it, int what, int depth, int index) {\n  assert(it != NULL);\n  assert(it->f != NULL);\n  assert(it->r != NULL);\n  \n  if(it->index < it->records) {\n    struct instr_trace_record *r = &it->r[it->index];\n    if (r->ty == what && r->depth == depth && r->index == index) {\n      it->index++;\n      return true;\n    }\n  }\n\n  return false;\n}\n\nbool 
debug_match(struct instr_trace *it, bool succeeded, int depth, int index, int what, const char *swhat) {\n  if(!succeeded) {\n    assert(it != NULL);\n    assert(it->f != NULL);\n    assert(it->r != NULL);\n\n    if(it->index < it->records) {\n      struct instr_trace_record *r = &it->r[it->index];\n      \n      fprintf(stderr, \"Match attempt failed: (ty: %d/%s, depth: %d, index: %d) does not equal stored (ty: %d, depth: %d, index: %d)\\n\", \n\t      what, swhat, depth, index, \n\t      r->ty, r->depth, r->index);      \n    } else {\n      fprintf(stderr, \"Match attempt failed: (ty: %d/%s, depth: %d, index: %d) beyond end of stored trace\\n\", \n\t      what, swhat, depth, index);\n    }\n  }\n\n  return succeeded;\n}\n\nbool instr_match_pipe_iterate(struct instr_trace *it, int depth, int index) {\n  bool x = instr_match_pipe(it, INSTR_TRACE_ITER, depth, index);\n\n  return debug_match(it, x, depth, index, INSTR_TRACE_ITER, \"ITER\");\n}\n\nbool instr_match_pipe_exit(struct instr_trace *it, int depth, int index) {\n  bool x = instr_match_pipe(it, INSTR_TRACE_EXIT, depth, index);\n\n  return debug_match(it, x, depth, index, INSTR_TRACE_EXIT, \"EXIT\");\n}\n\nvoid instr_pipe_iterate(struct instr_trace *it, int depth, int index) {\n  assert(it != NULL);\n  assert(it->f != NULL);\n  fprintf(it->f, \"%d %d %d\\n\", INSTR_TRACE_ITER, depth, index);\n}\n\nvoid instr_pipe_exit(struct instr_trace *it, int depth, int index) {\n  assert(it != NULL);\n  assert(it->f != NULL);\n  fprintf(it->f, \"%d %d %d\\n\", INSTR_TRACE_EXIT, depth, index);\n}\n"
  },
  {
    "path": "libgpu/src/skelapp/skel.cu",
    "content": "/* -*- mode: c++ -*- */\n\n#include <cuda.h>\n#include <cstdio>\n#include <unistd.h>\n#include <getopt.h>\n#include <errno.h>\n\n#include \"gg.h\"\n#include \"Timer.h\"\n\nextern void gg_main(CSRGraphTy &, CSRGraphTy &);\nextern void output(CSRGraphTy &, const char *output_file);\nextern const char *GGC_OPTIONS;\n\nint QUIET = 0;\nchar *INPUT, *OUTPUT;\nextern int SKELAPP_RETVAL;\nextern unsigned long DISCOUNT_TIME_NS;\n\nunsigned long DISCOUNT_TIME_NS = 0;\nint SKELAPP_RETVAL = 0;\n\nint CUDA_DEVICE = 0;\n\n//mgpu::ContextPtr mgc;\n\nextern const char *prog_opts;\nextern const char *prog_usage;\nextern const char *prog_args_usage;\nextern void process_prog_opt(char optchar, char *optarg);\nextern int process_prog_arg(int argc, char *argv[], int arg_start);\n\n__global__ void initialize_skel_kernel() {\n}\n\nvoid kernel_sizing(CSRGraphTy & g, dim3 &blocks, dim3 &threads) {\n  threads.x = 256;\n  threads.y = threads.z = 1;\n\n  blocks.x = ggc_get_nSM() * 8;\n  blocks.y = blocks.z = 1;\n}\n\nint load_graph_and_run_kernel(char *graph_file) {\n  CSRGraphTy g, gg;\n  \n  ggc::Timer k(\"gg_main\");\n  fprintf(stderr, \"OPTIONS: %s\\n\", GGC_OPTIONS);\n  g.read(graph_file);\n\n  g.copy_to_gpu(gg);\n\n  int *d;\n  check_cuda(cudaMalloc(&d, sizeof(int) * 1));\n  //check_cuda(cudaFree(d));\n\n  //initialize_skel_kernel<<<1,1>>>();\n\n  k.start();\n  gg_main(g, gg);\n  check_cuda(cudaDeviceSynchronize());\n  k.stop();\n  k.print();\n  fprintf(stderr, \"Total time: %llu ms\\n\", k.duration_ms());\n  fprintf(stderr, \"Total time: %llu ns\\n\", k.duration());\n\n  if(DISCOUNT_TIME_NS > 0) {\n    fprintf(stderr, \"Total time (discounted): %llu ns\\n\", k.duration() - DISCOUNT_TIME_NS);\n  }\n\n  gg.copy_to_cpu(g);\n\n  if(!QUIET)\n    output(g, OUTPUT);\n\n  return SKELAPP_RETVAL;\n}\n\nvoid usage(int argc, char *argv[]) \n{\n  if(strlen(prog_usage)) \n    fprintf(stderr, \"usage: %s [-q] [-g gpunum] [-o output-file] %s graph-file \\n %s\\n\", argv[0], prog_usage, 
prog_args_usage);\n  else\n    fprintf(stderr, \"usage: %s [-q] [-g gpunum] [-o output-file] graph-file %s\\n\", argv[0], prog_args_usage);\n}\n\nvoid parse_args(int argc, char *argv[]) \n{\n  int c;\n  const char *skel_opts = \"g:qo:\";\n  char *opts;\n  int len = 0;\n  \n  len = strlen(skel_opts) + strlen(prog_opts) + 1;\n  opts = (char *) calloc(1, len);\n  strcat(strcat(opts, skel_opts), prog_opts);\n\n  while((c = getopt(argc, argv, opts)) != -1) {\n    switch(c) \n      {\n      case 'q':\n\tQUIET = 1;\n\tbreak;\n      case 'o':\n\tOUTPUT = optarg; //TODO: copy?\n\tbreak;\n      case 'g':\n\tchar *end;\n\terrno = 0;\n\tCUDA_DEVICE = strtol(optarg, &end, 10);\n\tif(errno != 0 || *end != '\\0') {\n\t  fprintf(stderr, \"Invalid GPU device '%s'. An integer must be specified.\\n\", optarg);\n\t  exit(EXIT_FAILURE);\n\t}\n\tbreak;\n      case '?':\n\tusage(argc, argv);\n\texit(EXIT_FAILURE);\n      default:\n\tprocess_prog_opt(c, optarg);\n\tbreak;\n    }\n  }\n\n  if(optind < argc) {\n    INPUT = argv[optind];\n    if(!process_prog_arg(argc, argv, optind + 1)) {\n      usage(argc, argv);\n      exit(EXIT_FAILURE);\n    }\n  }\n  else {\n    usage(argc, argv);\n    exit(EXIT_FAILURE);      \n  }\n}\n\nint main(int argc, char *argv[]) {\n  if(argc == 1) {\n    usage(argc, argv);\n    exit(1);\n  }\n\n  parse_args(argc, argv);\n  ggc_set_gpu_device(CUDA_DEVICE);\n  //mgc = mgpu::CreateCudaDevice(CUDA_DEVICE);\n  //printf(\"Using GPU: %s\\n\", mgc->DeviceString().c_str());\n  int r = load_graph_and_run_kernel(INPUT);\n  return r;\n}\n"
  },
  {
    "path": "libgpu/src/snappy.c",
    "content": "#include <snappy-c.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <assert.h>\n#include <string.h>\n#include \"snfile.h\"\n#include <errno.h>\n#include <string.h>\n\nconst int BUFSIZE = 10485760;\n\nstruct snappy_file {\n  FILE* f;\n  char* buf;\n  size_t bufsize;\n  size_t bufhead;\n  size_t buflen;\n};\n\nSNAPPY_FILE snopen(const char* name, const char* mode) {\n  SNAPPY_FILE x;\n\n  x = (SNAPPY_FILE)malloc(sizeof(struct snappy_file) * 1);\n\n  if (!x)\n    return NULL;\n\n  x->f = fopen(name, mode);\n  if (!x->f) {\n    free(x);\n    return NULL;\n  }\n\n  x->buf = (char*)malloc(BUFSIZE);\n  if (!x->buf) {\n    fclose(x->f);\n    free(x);\n    return NULL;\n  }\n\n  x->bufsize = BUFSIZE;\n  x->bufhead = 0;\n  x->buflen  = 0;\n\n  return x;\n}\n\nsize_t snwrite(SNAPPY_FILE f, void* p, size_t sz) {\n  size_t clen;\n\n  clen = snappy_max_compressed_length(sz);\n\n  if (clen > f->bufsize) {\n    f->buf = (char*)realloc(f->buf, clen);\n    if (!f->buf) {\n      fprintf(stderr, \"snwrite: Out of memory!\\n\");\n      return 0;\n    }\n\n    f->bufsize = clen;\n  }\n\n  if (snappy_compress(p, sz, f->buf, &clen) == SNAPPY_OK) {\n    if (fwrite(&clen, sizeof(clen), 1, f->f) != 1)\n      return 0;\n\n    if (fwrite(f->buf, 1, clen, f->f) < clen)\n      return 0;\n\n    return sz;\n  } else {\n    return 0;\n  }\n}\n\nstatic size_t snmin(size_t a, size_t b) { return a > b ? 
b : a; }\n\nsize_t snread(SNAPPY_FILE f, void* p, size_t sz) {\n  size_t handled = 0;\n  size_t read;\n  size_t clen, unclen;\n\n  // is there uncompressed data in the buffer?\n  assert(f->buflen >= f->bufhead);\n\n  if (f->buflen - f->bufhead) {\n    handled = snmin(sz, (f->buflen - f->bufhead));\n    memcpy(p, f->buf + f->bufhead, handled);\n    sz -= handled;\n    p += handled;\n    f->bufhead += handled;\n  }\n\n  while (sz > 0) {\n    assert(f->bufhead == f->buflen);\n\n    f->bufhead = 0;\n    f->buflen  = 0;\n\n    if (fread(&clen, sizeof(clen), 1, f->f) != 1) {\n      fprintf(stderr, \"Failed to read clen (errno: %d, %d, %d)\\n\", errno,\n              ferror(f->f), feof(f->f));\n      return handled;\n    }\n\n    char* cbuf = (char*)malloc(clen * 1);\n    if (!cbuf) {\n      fprintf(stderr, \"Failed to allocate buffer clen\\n\");\n      return handled;\n    }\n\n    if (fread(cbuf, 1, clen, f->f) < clen) {\n      fprintf(stderr, \"Failed to read complete cbuf of length %u\\n\", clen);\n      // almost certainly an error!\n      free(cbuf);\n      return handled;\n    }\n\n    if (snappy_uncompressed_length(cbuf, clen, &unclen) != SNAPPY_OK) {\n      fprintf(stderr, \"Failed to decompress uncompressed length %u\\n\", clen);\n      return handled;\n    }\n\n    if (unclen > f->bufsize) {\n      f->buf = realloc(f->buf, unclen);\n      if (!f->buf) {\n        fprintf(stderr, \"snread: Out of memory!\\n\");\n        return handled;\n      }\n      f->bufsize = unclen;\n    }\n\n    if (snappy_uncompress(cbuf, clen, f->buf, &unclen) != SNAPPY_OK) {\n      fprintf(stderr, \"Failed to decompress cbuf of length %u\\n\", clen);\n      return handled;\n    }\n    free(cbuf);\n\n    f->buflen = unclen;\n\n    size_t tocopy;\n    tocopy = snmin(sz, f->buflen);\n\n    memcpy(p, f->buf, tocopy);\n    sz -= tocopy;\n    p += tocopy;\n    f->bufhead += tocopy;\n    handled += tocopy;\n  }\n\n  return handled;\n}\n\nint sneof(SNAPPY_FILE f) {\n  assert(f);\n  return 
feof(f->f);\n}\n\nvoid snclose(SNAPPY_FILE f) {\n  fclose(f->f);\n  free(f->buf);\n  free(f);\n}\n"
  },
  {
    "path": "libgpu/src/snappy_test.c",
    "content": "#include <stdio.h>\n#include <stdlib.h>\n#include <assert.h>\n#include \"snfile.h\"\n\nvoid compress(char* in, char* out) {\n  FILE* i;\n  SNAPPY_FILE s;\n  char* buf;\n  size_t N = 1024 * 1024 * 100;\n  size_t R = 0;\n\n  buf = (char*)malloc(N);\n  assert(buf);\n\n  i = fopen(in, \"r\");\n  s = snopen(out, \"w\");\n\n  assert(i);\n\n  while (!feof(i)) {\n    R = fread(buf, 1, N, i);\n\n    if (R > 0) {\n      snwrite(s, buf, R);\n    } else {\n      fprintf(stderr, \"Error?\\n\");\n    }\n  }\n\n  fclose(i);\n  snclose(s);\n}\n\nvoid decompress(char* in, char* out) {\n  FILE* o;\n  SNAPPY_FILE s;\n  char* buf;\n  size_t N = 1024;\n  size_t R = 0;\n\n  buf = (char*)malloc(N);\n  assert(buf);\n\n  s = snopen(in, \"r\");\n  o = fopen(out, \"w\");\n\n  assert(s);\n  assert(o);\n\n  while (!sneof(s)) {\n    R = snread(s, buf, N);\n    if (R > 0) {\n      if (fwrite(buf, 1, R, o) < R) {\n        fprintf(stderr, \"Error writing\\n\");\n        exit(1);\n      }\n    } else {\n      if (!sneof(s))\n        fprintf(stderr, \"Error?\\n\");\n      break;\n    }\n  }\n\n  snclose(s);\n  fclose(o);\n}\n\nint main(int argc, char* argv[]) {\n  if (argc != 4) {\n    fprintf(stderr, \"Usage: %s cmd input output\\n\", argv[0]);\n    exit(1);\n  }\n\n  char* cmd = argv[1];\n  char* inp = argv[2];\n  char* out = argv[3];\n\n  if (strcmp(cmd, \"compress\") == 0) {\n    compress(inp, out);\n  } else {\n    decompress(inp, out);\n  }\n}\n"
  },
  {
    "path": "libpangolin/CMakeLists.txt",
    "content": "add_library(pangolin STATIC)\nadd_library(Galois::pangolin ALIAS pangolin)\nadd_dependencies(lib pangolin)\n\ntarget_sources(pangolin PRIVATE\n  src/equivalence.cpp\n  src/quick_pattern.cpp\n  src/base_embedding.cpp\n  src/vertex_embedding.cpp\n  src/BfsMining/embedding_list.cpp\n)\n\ntarget_include_directories(pangolin PUBLIC\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  $<INSTALL_INTERFACE:include>\n)\n\ntarget_include_directories(pangolin PUBLIC\n  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/external/bliss>\n  $<INSTALL_INTERFACE:include>\n)\n\ntarget_link_libraries(pangolin PUBLIC galois_shmem)\n\nif (GALOIS_ENABLE_GPU)\n  add_library(pangolin_gpu INTERFACE)\n  add_library(Galois::pangolin_gpu ALIAS pangolin_gpu)\n  add_dependencies(lib pangolin_gpu)\n\n  target_include_directories(pangolin_gpu INTERFACE\n    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/gpu>\n    $<INSTALL_INTERFACE:include>\n  )\n\n  target_link_libraries(pangolin_gpu INTERFACE galois_shmem galois_gpu)\nendif()\n"
  },
  {
    "path": "libpangolin/README.md",
    "content": "Overview of Graph Pattern Mining (GPM) in Galois\n================================================================================\n\nThis is the Pangolin framework [1] for efficient and flexible graph mining. \nIt uses the bliss library [2][3] for graph isomorphism checks. \nThe license for this library is in the bliss directory: \nnote that **it does not use the same license as the rest of Galois**.\nTo run Pangolin applications, please go to ../lonestarmine/README.md\nfor more details.\n\n[1] Xuhao Chen, Roshan Dathathri, Gurbinder Gill, Keshav Pingali, \nPangolin: An Efficient and Flexible Graph Pattern Mining System on CPU and GPU, VLDB 2020\n\n[2] Bliss: A tool for computing automorphism groups and canonical \nlabelings of graphs. http://www.tcs.hut.fi/Software/bliss/, 2017.\n\n[3] Tommi Junttila and Petteri Kaski. 2007. Engineering an efficient \ncanonical labeling tool for large and sparse graphs. In Proceedings \nof the Meeting on Algorithm Engineering & Experiments, 135-149.\n\nINPUT\n===========\n\nWe support four input graph formats: **gr**, **txt**, **adj**, **mtx**.\nFor unlabeled graphs, we use the gr graph format, same as other Galois benchmarks.\n**Make sure that the graph is symmetric and contains no self-loops or redundant edges**.\nIf not, use the convert tool in tools/graph-convert/ to convert the graph.\nWe use **adj** format for labeled graphs as also used by Arabesque and RStream.\nThe **adj** format takes input graphs in the following format (vertex labeled):\n\n```\n# <num vertices> <num edges>\n<vertex id> <vertex label> [<neighbour id1> <neighbour id2> ... <neighbour id n>]\n<vertex id> <vertex label> [<neighbour id1> <neighbour id2> ... 
<neighbour id n>]\n...\n```\n\nWe currently do not support graphs with labels on edges.\nVertex ids are expected to be sequential integers between 0 and (total number of vertices - 1).\nFor testing, we have prepared a test graph **citeseer** in $GALOIS_HOME/lonestarmine/test_data.\n\nBUILD\n===========\n\n1. Run cmake in the BUILD directory: \n\n`cd build; cmake -DUSE_PANGOLIN=1 ../`\n\nTo enable GPU mining, use: \n\n`cmake -DUSE_PANGOLIN=1 -DUSE_GPU=1 ../`\n\n2. Run make:\n\n`cd <BUILD>/lonestar/experimental/fsm; make -j`\n\nRUN\n===========\n\nThe following are a few example command lines.\n\n- `$ ./tc_mine gr $GALOIS_HOME/lonestarmine/test_data/citeseer.csgr -t 28`\n- `$ ./kcl gr $GALOIS_HOME/lonestarmine/test_data/citeseer.csgr -k=3 -t 28`\n- `$ ./motif gr $GALOIS_HOME/lonestarmine/test_data/citeseer.csgr -k=3 -t 56`\n- `$ ./fsm adj $GALOIS_HOME/lonestarmine/test_data/citeseer.sadj -k=2 -ms=300 -t 28`\n\nPERFORMANCE\n===========\n\nPlease see details in the paper.\n\nCITATION\n==========\n\nPlease cite the following paper if you use Pangolin:\n\n```\n@article{Pangolin,\n\ttitle={Pangolin: An Efficient and Flexible Graph Mining System on CPU and GPU},\n\tauthor={Xuhao Chen and Roshan Dathathri and Gurbinder Gill and Keshav Pingali},\n\tyear={2020},\n\tjournal = {Proc. VLDB Endow.},\n\tissue_date = {August 2020},\n\tvolume = {13},\n\tnumber = {8},\n\tmonth = aug,\n\tnumpages = {12},\n\tpublisher = {VLDB Endowment},\n}\n```\n\n"
  },
  {
    "path": "libpangolin/gpu/pangolin/bitsets.h",
    "content": "\n#pragma once\n#include <cuda.h>\n#include <assert.h>\n#include \"pangolin/types.cuh\"\n#include \"pangolin/cutils.h\"\n\nclass Bitsets {\npublic:\n  int num_sets;\n  int num_bits_capacity;\n  int num_bits;\n  uint64_t** h_bit_vectors;\n  uint64_t** d_bit_vectors;\n  Bitsets() {}\n  Bitsets(int n, int nbits) { alloc(n, nbits); }\n  ~Bitsets() {}\n  void set_size(int n, int nbits) {\n    num_sets          = n;\n    num_bits_capacity = nbits;\n    num_bits          = nbits;\n  }\n  void alloc(int n, int nbits) {\n    assert(sizeof(unsigned long long int) * 8 == 64);\n    assert(sizeof(uint64_t) * 8 == 64);\n    num_sets          = n;\n    num_bits_capacity = nbits;\n    num_bits          = nbits;\n    h_bit_vectors     = (uint64_t**)malloc(n * sizeof(uint64_t*));\n    for (int i = 0; i < n; i++) {\n      CUDA_SAFE_CALL(\n          cudaMalloc(&h_bit_vectors[i], vec_size() * sizeof(uint64_t)));\n      reset(i);\n    }\n    CUDA_SAFE_CALL(cudaMalloc(&d_bit_vectors, n * sizeof(uint64_t*)));\n    CUDA_SAFE_CALL(cudaMemcpy(d_bit_vectors, h_bit_vectors,\n                              n * sizeof(uint64_t*), cudaMemcpyHostToDevice));\n  }\n  void clear() {\n    for (int i = 0; i < num_sets; i++)\n      reset(i);\n    CUDA_SAFE_CALL(cudaMemcpy(d_bit_vectors, h_bit_vectors,\n                              num_sets * sizeof(uint64_t*),\n                              cudaMemcpyHostToDevice));\n  }\n  void clean() {\n    for (int i = 0; i < num_sets; i++)\n      if (h_bit_vectors[i] != NULL)\n        cudaFree(h_bit_vectors[i]);\n    if (d_bit_vectors != NULL)\n      cudaFree(d_bit_vectors);\n    if (h_bit_vectors != NULL)\n      free(h_bit_vectors);\n  }\n  void reset(int i) {\n    CUDA_SAFE_CALL(\n        cudaMemset(h_bit_vectors[i], 0, vec_size() * sizeof(uint64_t)));\n  }\n  __device__ void set(int sid, int bid) {\n    if (sid >= num_sets)\n      printf(\"sid=%d, num_sets=%d\\n\", sid, num_sets);\n    assert(sid < num_sets);\n    assert(bid < num_bits);\n    int 
bit_index                     = bid / 64;\n    unsigned long long int bit_offset = 1;\n    bit_offset <<= (bid % 64);\n    if ((d_bit_vectors[sid][bit_index] & bit_offset) == 0) { // test and set\n      atomicOr((unsigned long long int*)&d_bit_vectors[sid][bit_index],\n               bit_offset);\n    }\n  }\n  __device__ int count_num_ones(int sid, size_t bid) {\n    return __popcll(d_bit_vectors[sid][bid]);\n  }\n  __device__ __host__ size_t vec_size() const {\n    size_t bit_vector_size = (num_bits + 63) / 64;\n    return bit_vector_size;\n  }\n};\n"
  },
  {
    "path": "libpangolin/gpu/pangolin/checker.h",
    "content": "#ifndef CHECKER_H\n#define CHECKER_H\n#include <cuda.h>\n#include <cuda_runtime.h>\n\nstatic void check_cuda_error(const cudaError_t e, const char* file,\n                             const int line) {\n  if (e != cudaSuccess) {\n    fprintf(stderr, \"%s:%d: %s (%d)\\n\", file, line, cudaGetErrorString(e), e);\n    exit(1);\n  }\n}\n#define check_cuda(x) check_cuda_error(x, __FILE__, __LINE__)\n\n#endif\n"
  },
  {
    "path": "libpangolin/gpu/pangolin/cutils.h",
    "content": "#ifndef CUTIL_SUBSET_H\n#define CUTIL_SUBSET_H\n\n#define CUDA_SAFE_CALL_NO_SYNC(call)                                           \\\n  {                                                                            \\\n    cudaError err = call;                                                      \\\n    if (cudaSuccess != err) {                                                  \\\n      fprintf(stderr, \"error %d: Cuda error in file '%s' in line %i : %s.\\n\",  \\\n              err, __FILE__, __LINE__, cudaGetErrorString(err));               \\\n      exit(EXIT_FAILURE);                                                      \\\n    }                                                                          \\\n  }\n\n#define CUDA_SAFE_CALL(call) CUDA_SAFE_CALL_NO_SYNC(call);\n\n#define CUDA_SAFE_THREAD_SYNC()                                                \\\n  {                                                                            \\\n    cudaError err = CUT_DEVICE_SYNCHRONIZE();                                  \\\n    if (cudaSuccess != err) {                                                  \\\n      fprintf(stderr, \"Cuda error in file '%s' in line %i : %s.\\n\", __FILE__,  \\\n              __LINE__, cudaGetErrorString(err));                              \\\n    }                                                                          \\\n  }\n\n// from http://forums.nvidia.com/index.php?showtopic=186669\nstatic __device__ unsigned get_smid(void) {\n  unsigned ret;\n  asm(\"mov.u32 %0, %smid;\" : \"=r\"(ret));\n  return ret;\n}\n\ninline unsigned CudaTest(const char* msg) {\n  cudaError_t e;\n  // cudaThreadSynchronize();\n  if (cudaSuccess != (e = cudaGetLastError())) {\n    fprintf(stderr, \"%s: %d\\n\", msg, e);\n    fprintf(stderr, \"%s\\n\", cudaGetErrorString(e));\n    exit(-1);\n    // return 1;\n  }\n  return 0;\n}\n#endif\n"
  },
  {
    "path": "libpangolin/gpu/pangolin/element.cuh",
    "content": "#ifndef ELEMENT_CUH_\n#define ELEMENT_CUH_\n#include \"types.cuh\"\n\nstruct SimpleElement {\nprotected:\n\tIndexT vertex_id;\npublic:\n\tSimpleElement() : vertex_id(0) { }\n\tSimpleElement(IndexT _vertex_id) : vertex_id(_vertex_id) { }\n\tSimpleElement(IndexT _vertex_id, edge_data_type _edge_label, node_data_type _vertex_label, history_type _history) : vertex_id(_vertex_id) { }\n\tSimpleElement(IndexT _vertex_id, key_type _key_index, edge_data_type _edge_label, node_data_type _vertex_label, history_type _history) : vertex_id(_vertex_id) { }\n\t~SimpleElement() { }\n\tinline __device__ void set_vertex_id(IndexT new_id) { vertex_id = new_id; }\n\tinline __device__ void set_history_info(history_type his) { }\n\tinline __device__ void set_vertex_label(node_data_type lab) { }\n\tinline __device__ IndexT get_vid() const { return vertex_id; }\n\tinline __device__ history_type get_his() const { return 0; }\n\tinline __device__ key_type get_key() const { return 0; }\n\tinline __device__ int cmp(const SimpleElement& other) const {\n\t\tif(vertex_id < other.get_vid()) return -1;\n\t\tif(vertex_id > other.get_vid()) return 1;\n\t\treturn 0;\n\t}\n\t//friend bool operator==(const SimpleElement &e1, const SimpleElement &e2) {\n\t//\treturn e1.get_vid() == e2.get_vid();\n\t//}\n};\n\n#ifdef USE_SIMPLE\ntypedef SimpleElement ElementType;\n#endif\n#endif\n"
  },
  {
    "path": "libpangolin/gpu/pangolin/embedding.cuh",
    "content": "#ifndef EMBEDDING_CUH_\n#define EMBEDDING_CUH_\n\n#include \"element.cuh\"\n\ntemplate <typename ElementTy>\nclass Embedding {\npublic:\n\tEmbedding() { size_ = 0; }\n\tEmbedding(size_t n) { size_ = n; elements = new ElementTy[size_]; } // TODO\n\t//Embedding(const Embedding &emb) { size_ = emb.size(); elements = emb.elements; }\n\t~Embedding() { }\n\t__device__ IndexT get_vertex(unsigned i) const { return elements[i].get_vid(); }\n\t__device__ history_type get_history(unsigned i) const { return elements[i].get_his(); }\n\t__device__ node_data_type get_label(unsigned i) const { return elements[i].get_vlabel(); }\n\t__device__ key_type get_key(unsigned i) const { return elements[i].get_key(); }\n\t__device__ bool empty() const { return size_ == 0; }\n\t__device__ size_t size() const { return size_; }\n\t__device__ ElementTy& back() { return elements[size_-1]; }\n\t__device__ const ElementTy& back() const { return elements[size_-1]; }\n\t__device__ ElementTy get_element(unsigned i) const { return elements[i]; }\n\t__device__ void set_element(unsigned i, ElementTy &ele) { elements[i] = ele; }\n\t__device__ void set_vertex(unsigned i, IndexT vid) { elements[i].set_vertex_id(vid); }\n\t//__device__ unsigned insert(unsigned pos, const ElementTy& value ) { return elements[pos] = value; }\n\t//__device__ ElementTy* data() { return elements; }\n\t//__device__ const ElementTy* data() const { return elements; }\n\t//__device__ ElementTy* get_elements() const { return elements; }\nprotected:\n\tElementTy *elements;\n\tsize_t size_;\n};\n\n\nclass BaseEmbedding : public Embedding<SimpleElement> {\npublic:\n\tBaseEmbedding() {}\n\tBaseEmbedding(size_t n) : Embedding(n) {}\n\t~BaseEmbedding() {}\n};\n\n#ifdef USE_BASE_TYPES\ntypedef BaseEmbedding EmbeddingType;\n#endif\n\ntemplate <typename EmbeddingTy>\nclass EmbeddingQueue{\npublic:\n\tEmbeddingQueue() {}\n\t~EmbeddingQueue() {}\n\tvoid init(int nedges, unsigned max_size = 2, bool use_dag = true) {\n\t\tint nnz 
= nedges;\n\t\tif (!use_dag) nnz = nnz / 2;\n\t\tsize = nedges;\n\t}\n\tEmbeddingTy *queue;\n\tint size;\n};\n\nclass EmbeddingList {\npublic:\n\tEmbeddingList() {}\n\t~EmbeddingList() {}\n\tvoid init(int nedges, unsigned max_size = 2, bool use_dag = true) {\n\t\tlast_level = 1;\n\t\tassert(max_size > 1);\n\t\tmax_level = max_size;\n\t\th_vid_lists = (IndexT **)malloc(max_level * sizeof(IndexT*));\n\t\th_idx_lists = (IndexT **)malloc(max_level * sizeof(IndexT*));\n\t\tcheck_cuda(cudaMalloc(&d_vid_lists, max_level * sizeof(IndexT*)));\n\t\tcheck_cuda(cudaMalloc(&d_idx_lists, max_level * sizeof(IndexT*)));\n\t\t#ifdef ENABLE_LABEL\n\t\th_his_lists = (history_type **)malloc(max_level * sizeof(history_type*));\n\t\tcheck_cuda(cudaMalloc(&d_his_lists, max_level * sizeof(history_type*)));\n\t\t#endif\n\t\tsizes = new size_t[max_level];\n\t\tsizes[0] = 0;\n\t\tint nnz = nedges;\n\t\tif (!use_dag) nnz = nnz / 2;\n\t\tsizes[1] = nnz;\n\t\tcheck_cuda(cudaMalloc((void **)&h_vid_lists[1], nnz * sizeof(IndexT)));\n\t\tcheck_cuda(cudaMalloc((void **)&h_idx_lists[1], nnz * sizeof(IndexT)));\n\t\tcheck_cuda(cudaMemcpy(d_vid_lists, h_vid_lists, max_level * sizeof(IndexT*), cudaMemcpyHostToDevice));\n\t\tcheck_cuda(cudaMemcpy(d_idx_lists, h_idx_lists, max_level * sizeof(IndexT*), cudaMemcpyHostToDevice));\n\t\t#ifdef ENABLE_LABEL\n\t\tcheck_cuda(cudaMalloc((void **)&h_his_lists[1], nnz * sizeof(history_type)));\n\t\tcheck_cuda(cudaMemcpy(d_his_lists, h_his_lists, max_level * sizeof(history_type*), cudaMemcpyHostToDevice));\n\t\t#endif\n\t}\n\tvoid init_cpu(CSRGraph *graph, bool is_dag = false) {\n\t\tint nnz = graph->get_nedges();\n\t\tif (!is_dag) nnz = nnz / 2;\n\t\tIndexT *vid_list = (IndexT *)malloc(nnz*sizeof(IndexT));\n\t\tIndexT *idx_list = (IndexT *)malloc(nnz*sizeof(IndexT));\n\t\tint eid = 0;\n\t\tfor (int src = 0; src < graph->get_nnodes(); src ++) {\n\t\t\tIndexT row_begin = graph->edge_begin(src);\n\t\t\tIndexT row_end = graph->edge_end(src);\n\t\t\tfor (IndexT e = 
row_begin; e < row_end; e++) {\n\t\t\t\tIndexT dst = graph->getEdgeDst(e);\n\t\t\t\tif (is_dag || src < dst) {\n\t\t\t\t\tvid_list[eid] = dst;\n\t\t\t\t\tidx_list[eid] = src;\n\t\t\t\t\teid ++;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tcheck_cuda(cudaMemcpy(h_vid_lists[1], vid_list, nnz * sizeof(IndexT), cudaMemcpyHostToDevice));\n\t\tcheck_cuda(cudaMemcpy(h_idx_lists[1], idx_list, nnz * sizeof(IndexT), cudaMemcpyHostToDevice));\n\t\t#ifdef ENABLE_LABEL\n\t\tcheck_cuda(cudaMemset(h_his_lists[1], 0, nnz * sizeof(history_type)));\n\t\t#endif\n\t}\n\t__device__ IndexT get_vid(unsigned level, IndexT id) const { return d_vid_lists[level][id]; }\n\t__device__ IndexT get_idx(unsigned level, IndexT id) const { return d_idx_lists[level][id]; }\n\t__device__ history_type get_his(unsigned level, IndexT id) const { return d_his_lists[level][id]; }\n\t__device__ unsigned get_pid(IndexT id) const { return pid_list[id]; }\n\t__device__ void set_vid(unsigned level, IndexT id, IndexT vid) { d_vid_lists[level][id] = vid; }\n\t__device__ void set_idx(unsigned level, IndexT id, IndexT idx) { d_idx_lists[level][id] = idx; }\n\t__device__ void set_his(unsigned level, IndexT id, history_type lab) { d_his_lists[level][id] = lab; }\n\t__device__ void set_pid(IndexT id, unsigned pid) { pid_list[id] = pid; }\n\tsize_t size() const { return sizes[last_level]; }\n\tsize_t size(unsigned level) const { return sizes[level]; }\n\t//__device__ VertexList get_vid_list(unsigned level) { return vid_lists[level]; }\n\t//__device__ UintList get_idx_list(unsigned level) { return idx_lists[level]; }\n\t//__device__ ByteList get_his_list(unsigned level) { return his_lists[level]; }\n\tvoid add_level(unsigned size) { // TODO: this size could be larger than 2^32, when running LiveJournal and even larger graphs\n\t\tlast_level ++;\n\t\tassert(last_level < max_level);\n\t\tcheck_cuda(cudaMalloc((void **)&h_vid_lists[last_level], size * sizeof(IndexT)));\n\t\tcheck_cuda(cudaMalloc((void **)&h_idx_lists[last_level], size 
* sizeof(IndexT)));\n\t\t#ifdef ENABLE_LABEL\n\t\tcheck_cuda(cudaMalloc((void **)&h_his_lists[last_level], size * sizeof(history_type)));\n\t\t#endif\n\t\t#ifdef USE_PID\n\t\tcheck_cuda(cudaMalloc((void **)&pid_list, size * sizeof(unsigned)));\n\t\t#endif\n\t\tcheck_cuda(cudaMemcpy(d_vid_lists, h_vid_lists, max_level * sizeof(IndexT*), cudaMemcpyHostToDevice));\n\t\tcheck_cuda(cudaMemcpy(d_idx_lists, h_idx_lists, max_level * sizeof(IndexT*), cudaMemcpyHostToDevice));\n\t\t#ifdef ENABLE_LABEL\n\t\tcheck_cuda(cudaMemcpy(d_his_lists, h_his_lists, max_level * sizeof(history_type*), cudaMemcpyHostToDevice));\n\t\t#endif\n\t\tsizes[last_level] = size;\n\t}\n\tvoid remove_tail(unsigned idx) { sizes[last_level] = idx; }\n\tvoid reset_level() {\n\t\tfor (size_t i = 2; i <= last_level; i ++) {\n\t\t\tcheck_cuda(cudaFree(h_vid_lists[i]));\n\t\t\tcheck_cuda(cudaFree(h_idx_lists[i]));\n\t\t}\n\t\tlast_level = 1;\n\t}\n\n\t/*\n\tvoid printout_embeddings(int level, bool verbose = false) {\n\t\tstd::cout << \"number of embeddings in level \" << level << \": \" << size() << std::endl;\n\t\tif(verbose) {\n\t\t\tfor (size_t pos = 0; pos < size(); pos ++) {\n\t\t\t\tembeddingtype emb(last_level+1);\n\t\t\t\tget_embedding(last_level, pos, emb);\n\t\t\t\tstd::cout << emb << \"\\n\";\n\t\t\t}\n\t\t}\n\t}\n\t*/\n\t__device__ void get_embedding(unsigned level, unsigned pos, IndexT *emb) {\n\t\tIndexT vid = get_vid(level, pos);\n\t\tIndexT idx = get_idx(level, pos);\n\t\temb[level] = vid;\n\t\tfor (unsigned l = 1; l < level; l ++) {\n\t\t\tvid = get_vid(level-l, idx);\n\t\t\temb[level-l] = vid;\n\t\t\tidx = get_idx(level-l, idx);\n\t\t}\n\t\temb[0] = idx;\n\t}\n\t__device__ void get_edge_embedding(unsigned level, unsigned pos, IndexT *vids, history_type *hiss) {\n\t\tIndexT vid = get_vid(level, pos);\n\t\tIndexT idx = get_idx(level, pos);\n\t\thistory_type his = get_his(level, pos);\n\t\tvids[level] = vid;\n\t\thiss[level] = his;\n\t\tfor (unsigned l = 1; l < level; l ++) {\n\t\t\tvid = 
get_vid(level-l, idx);\n\t\t\this = get_his(level-l, idx);\n\t\t\tvids[level-l] = vid;\n\t\t\thiss[level-l] = his;\n\t\t\tidx = get_idx(level-l, idx);\n\t\t}\n\t\tvids[0] = idx;\n\t\thiss[0] = 0;\n\t}\n\nprivate:\n\tunsigned max_level;\n\tunsigned last_level;\n\tsize_t *sizes;\n\tunsigned *pid_list;\n\tIndexT** h_idx_lists;\n\tIndexT** h_vid_lists;\n\thistory_type** h_his_lists;\n\tIndexT** d_idx_lists;\n\tIndexT** d_vid_lists;\n\thistory_type** d_his_lists;\n};\n\n__global__ void init_gpu_dag(int m, CSRGraph graph, EmbeddingList emb_list) {\n\tunsigned src = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (src < m) {\n\t\tIndexT row_begin = graph.edge_begin(src);\n\t\tIndexT row_end = graph.edge_end(src);\n\t\tfor (IndexT e = row_begin; e < row_end; e++) {\n\t\t\tIndexT dst = graph.getEdgeDst(e);\n\t\t\temb_list.set_vid(1, e, dst);\n\t\t\temb_list.set_idx(1, e, src);\n\t\t}\n\t}\n}\n\n__global__ void init_alloc(int m, CSRGraph graph, EmbeddingList emb_list, IndexT *num_emb) {\n\tunsigned src = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (src < m) {\n\t\tnum_emb[src] = 0;\n\t\t#ifdef ENABLE_LABEL\n\t\tnode_data_type src_label = graph.getData(src);\n\t\t#endif\n\t\tIndexT row_begin = graph.edge_begin(src);\n\t\tIndexT row_end = graph.edge_end(src);\n\t\tfor (IndexT e = row_begin; e < row_end; e++) {\n\t\t\tIndexT dst = graph.getEdgeDst(e);\n\t\t\t#ifdef ENABLE_LABEL\n\t\t\tnode_data_type dst_label = graph.getData(dst);\n\t\t\t#endif\n\t\t\t#ifdef ENABLE_LABEL\n\t\t\tif (src_label <= dst_label) num_emb[src] ++;\n\t\t\t#else\n\t\t\tif (src < dst) num_emb[src] ++;\n\t\t\t#endif\n\t\t}\n\t}\n}\n\n__global__ void init_insert(int m, CSRGraph graph, EmbeddingList emb_list, IndexT *indices) {\n\tunsigned src = blockIdx.x * blockDim.x + threadIdx.x;\n\tif (src < m) {\n\t\t#ifdef ENABLE_LABEL\n\t\tnode_data_type src_label = graph.getData(src);\n\t\t#endif\n\t\tIndexT start = indices[src];\n\t\tIndexT row_begin = graph.edge_begin(src);\n\t\tIndexT row_end = 
graph.edge_end(src);\n\t\tfor (IndexT e = row_begin; e < row_end; e++) {\n\t\t\tIndexT dst = graph.getEdgeDst(e);\n\t\t\t#ifdef ENABLE_LABEL\n\t\t\tnode_data_type dst_label = graph.getData(dst);\n\t\t\t#endif\n\t\t\t#ifdef ENABLE_LABEL\n\t\t\tif (src_label <= dst_label) {\n\t\t\t#else\n\t\t\tif (src < dst) {\n\t\t\t#endif\n\t\t\t\temb_list.set_vid(1, start, dst);\n\t\t\t\temb_list.set_idx(1, start, src);\n\t\t\t\t#ifdef ENABLE_LABEL\n\t\t\t\temb_list.set_his(1, start, 0);\n\t\t\t\t#endif\n\t\t\t\tstart ++;\n\t\t\t}\n\t\t}\n\t}\n}\n\n#endif // EMBEDDING_CUH_\n"
  },
  {
    "path": "libpangolin/gpu/pangolin/graph_gpu.h",
    "content": "#pragma once\n#include <set>\n#include <vector>\n#include <string>\n#include <cassert>\n#include <fstream>\n#include <fcntl.h>\n#include <cassert>\n#include <unistd.h>\n#include <stdint.h>\n#include <algorithm>\n#include <sys/stat.h>\n#include <sys/mman.h>\n#include <sys/types.h>\n#include \"pangolin/types.cuh\"\n#include \"pangolin/checker.h\"\n#include \"pangolin/timer.h\"\n\nstruct Edge {\n  Edge(IndexT from, IndexT to) : src(from), dst(to) {}\n  IndexT src;\n  IndexT dst;\n};\n\nstd::vector<std::vector<Edge>> vertices;\n\nclass CSRGraph {\nprotected:\n  IndexT* row_start;\n  IndexT* edge_dst;\n  node_data_type* node_data;\n  int nnodes;\n  int nedges;\n  bool need_dag;\n  bool device_graph;\n  bool use_node_data;\n\npublic:\n  CSRGraph() { init(); }\n  //~CSRGraph() {}\n  void init() {\n    row_start = edge_dst = NULL;\n    node_data            = NULL;\n    nnodes = nedges = 0;\n    need_dag        = false;\n    device_graph    = false;\n    use_node_data   = false;\n  }\n  void enable_dag() { need_dag = true; }\n  int get_nnodes() { return nnodes; }\n  int get_nedges() { return nedges; }\n  void clean() {\n    check_cuda(cudaFree(row_start));\n    check_cuda(cudaFree(edge_dst));\n  }\n  __device__ __host__ bool valid_node(IndexT node) { return (node < nnodes); }\n  __device__ __host__ bool valid_edge(IndexT edge) { return (edge < nedges); }\n  __device__ __host__ IndexT getOutDegree(unsigned src) {\n    assert(src < nnodes);\n    return row_start[src + 1] - row_start[src];\n  };\n  __device__ __host__ IndexT getDestination(unsigned src, unsigned edge) {\n    assert(src < nnodes);\n    assert(edge < getOutDegree(src));\n    IndexT abs_edge = row_start[src] + edge;\n    assert(abs_edge < nedges);\n    return edge_dst[abs_edge];\n  };\n  __device__ __host__ IndexT getAbsDestination(unsigned abs_edge) {\n    assert(abs_edge < nedges);\n    return edge_dst[abs_edge];\n  };\n  inline __device__ __host__ IndexT getEdgeDst(unsigned edge) {\n    
assert(edge < nedges);\n    return edge_dst[edge];\n  };\n  inline __device__ __host__ node_data_type getData(unsigned vid) {\n    return node_data[vid];\n  }\n  inline __device__ __host__ IndexT edge_begin(unsigned src) {\n    assert(src <= nnodes);\n    return row_start[src];\n  };\n  inline __device__ __host__ IndexT edge_end(unsigned src) {\n    assert(src <= nnodes);\n    return row_start[src + 1];\n  };\n  int read(std::string file, bool read_node_data = true, bool dag = false) {\n    std::cout << \"Reading graph from file: \" << file << \"\\n\";\n    need_dag = dag;\n    if (read_node_data) {\n      use_node_data = true;\n      return read_adj(file.c_str());\n    } else {\n      use_node_data = false;\n      readFromGR(file.c_str());\n    }\n    return 0;\n  }\n  void readFromGR(const char file[]) {\n    std::ifstream cfile;\n    cfile.open(file);\n    int masterFD = open(file, O_RDONLY);\n    if (masterFD == -1) {\n      printf(\"FileGraph::structureFromFile: unable to open %s.\\n\", file);\n      return;\n    }\n    struct stat buf;\n    int f = fstat(masterFD, &buf);\n    if (f == -1) {\n      printf(\"FileGraph::structureFromFile: unable to stat %s.\\n\", file);\n      abort();\n    }\n    size_t masterLength = buf.st_size;\n    int _MAP_BASE       = MAP_PRIVATE;\n    void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0);\n    if (m == MAP_FAILED) {\n      m = 0;\n      printf(\"FileGraph::structureFromFile: mmap failed.\\n\");\n      abort();\n    }\n    Timer t;\n    t.Start();\n    uint64_t* fptr                           = (uint64_t*)m;\n    __attribute__((unused)) uint64_t version = le64toh(*fptr++);\n    assert(version == 1);\n    uint64_t sizeEdgeTy = le64toh(*fptr++);\n    uint64_t numNodes   = le64toh(*fptr++);\n    uint64_t numEdges   = le64toh(*fptr++);\n    uint64_t* outIdx    = fptr;\n    fptr += numNodes;\n    uint32_t* fptr32 = (uint32_t*)fptr;\n    uint32_t* outs   = fptr32;\n    fptr32 += numEdges;\n    if (numEdges % 2)\n   
   fptr32 += 1;\n    nnodes = numNodes;\n    nedges = numEdges;\n    printf(\"nnodes=%d, nedges=%d, sizeEdge=%d.\\n\", nnodes, nedges, sizeEdgeTy);\n    row_start    = (index_type*)calloc(nnodes + 1, sizeof(index_type));\n    edge_dst     = (index_type*)calloc(nedges, sizeof(index_type));\n    row_start[0] = 0;\n    for (unsigned ii = 0; ii < nnodes; ++ii) {\n      row_start[ii + 1] = le64toh(outIdx[ii]);\n      index_type degree = row_start[ii + 1] - row_start[ii];\n      for (unsigned jj = 0; jj < degree; ++jj) {\n        unsigned edgeindex = row_start[ii] + jj;\n        unsigned dst       = le32toh(outs[edgeindex]);\n        if (dst >= nnodes)\n          printf(\"\\tinvalid edge from %d to %d at index %d(%d).\\n\", ii, dst, jj,\n                 edgeindex);\n        edge_dst[edgeindex] = dst;\n      }\n    }\n    cfile.close(); // probably galois doesn't close its file due to mmap.\n    t.Stop();\n    double runtime = t.Millisecs();\n    printf(\"read %lld bytes in %.1f ms (%0.2f MB/s)\\n\\r\\n\", masterLength,\n           runtime, (masterLength / 1000.0) / runtime);\n    if (need_dag) {\n      reconstruct_from_csr();\n      SquishGraph();\n      MakeCSR(vertices);\n      vertices.clear();\n    }\n    return;\n  }\n  void reconstruct_from_csr() {\n    vertices.resize(nnodes);\n    std::cout << \"Reconstructing from CSR graph ... 
\";\n    for (int i = 0; i < nnodes; i++) {\n      std::vector<Edge> neighbors;\n      for (IndexT j = row_start[i]; j < row_start[i + 1]; j++)\n        neighbors.push_back(Edge(i, edge_dst[j]));\n      vertices[i] = neighbors;\n    }\n    std::cout << \"Done\\n\";\n  }\n  int read_adj(const char* filename) {\n    FILE* fd = fopen(filename, \"r\");\n    assert(fd != NULL);\n    char buf[2048];\n    unsigned size = 0, maxsize = 0;\n    int numNodes = 0;\n    while (fgets(buf, 2048, fd) != NULL) {\n      auto len = strlen(buf);\n      size += len;\n      if (buf[len - 1] == '\\n') {\n        maxsize = std::max(size, maxsize);\n        size    = 0;\n        numNodes++;\n      }\n    }\n    fclose(fd);\n    nnodes = numNodes;\n    printf(\"nnodes=%d.\\n\", nnodes);\n    std::ifstream is;\n    is.open(filename, std::ios::in);\n    char* line = new char[maxsize + 1];\n    std::vector<std::string> result;\n    nedges    = 0;\n    node_data = (node_data_type*)calloc(nnodes, sizeof(node_data_type));\n    vertices.resize(nnodes);\n    std::vector<Edge> neighbors;\n    for (size_t i = 0; i < nnodes; i++)\n      vertices.push_back(neighbors);\n    int line_count = 0;\n    while (is.getline(line, maxsize + 1)) {\n      result.clear();\n      split(line, result);\n      IndexT src = atoi(result[0].c_str());\n      assert(src == line_count);\n      assert(src < nnodes);\n      node_data[src] = atoi(result[1].c_str());\n      for (size_t i = 2; i < result.size(); i++) {\n        IndexT dst = atoi(result[i].c_str());\n        if (src == dst)\n          continue; // remove self-loop\n        vertices[src].push_back(Edge(src, dst));\n        nedges++;\n      }\n      line_count++;\n    }\n    is.close();\n    printf(\"nedges=%d\\n\", nedges);\n    int num_labels = count_unique_labels();\n    std::cout << \"Number of unique vertex label values: \" << num_labels\n              << std::endl;\n    SquishGraph();\n    printf(\"nedges after clean: %d\\n\", nedges);\n    row_start = 
(index_type*)calloc(nnodes + 1, sizeof(index_type));\n    edge_dst  = (index_type*)calloc(nedges, sizeof(index_type));\n    MakeCSR(vertices);\n    vertices.clear();\n    return num_labels;\n  }\n\n  void MakeCSR(const std::vector<std::vector<Edge>> vert) {\n    printf(\"Constructing CSR graph ... \");\n    std::vector<IndexT> offsets(nnodes + 1);\n    IndexT total = 0;\n    for (int i = 0; i < nnodes; i++) {\n      offsets[i] = total;\n      total += vert[i].size();\n    }\n    offsets[nnodes] = total;\n    assert(nedges == offsets[nnodes]);\n    assert(row_start != NULL);\n    for (size_t i = 0; i < nnodes + 1; i++)\n      row_start[i] = offsets[i];\n    for (size_t i = 0; i < nnodes; i++) {\n      for (auto e : vert[i]) {\n        if (i != e.src)\n          std::cout << \"[debug] i = \" << i << \", src = \" << e.src\n                    << \", dst = \" << e.dst << \"\\n\";\n        assert(i == e.src);\n        edge_dst[offsets[e.src]++] = e.dst;\n      }\n    }\n    printf(\"Done\\n\");\n  }\n\n  static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); }\n  void SquishGraph(bool remove_selfloops  = true,\n                   bool remove_redundents = true) {\n    printf(\"Sorting the neighbor lists...\");\n    for (size_t i = 0; i < nnodes; i++)\n      std::sort(vertices[i].begin(), vertices[i].end(), compare_id);\n    printf(\" Done\\n\");\n    // remove self loops\n    int num_selfloops = 0;\n    if (remove_selfloops) {\n      printf(\"Removing self loops...\");\n      for (size_t i = 0; i < nnodes; i++) {\n        for (unsigned j = 0; j < vertices[i].size(); j++) {\n          if (i == vertices[i][j].dst) {\n            vertices[i].erase(vertices[i].begin() + j);\n            num_selfloops++;\n            j--;\n          }\n        }\n      }\n      printf(\" %d selfloops are removed\\n\", num_selfloops);\n      nedges -= num_selfloops;\n    }\n    // remove redundent\n    int num_redundents = 0;\n    if (remove_redundents) {\n      printf(\"Removing 
redundent edges...\");\n      for (size_t i = 0; i < nnodes; i++) {\n        for (unsigned j = 1; j < vertices[i].size(); j++) {\n          if (vertices[i][j].dst == vertices[i][j - 1].dst) {\n            vertices[i].erase(vertices[i].begin() + j);\n            num_redundents++;\n            j--;\n          }\n        }\n      }\n      printf(\" %d redundent edges are removed\\n\", num_redundents);\n      nedges -= num_redundents;\n    }\n    if (need_dag) {\n      int num_dag = 0;\n      std::cout << \"Constructing DAG...\";\n      IndexT* degrees = new IndexT[nnodes];\n      for (size_t i = 0; i < nnodes; i++)\n        degrees[i] = vertices[i].size();\n      for (size_t i = 0; i < nnodes; i++) {\n        for (unsigned j = 0; j < vertices[i].size(); j++) {\n          IndexT to = vertices[i][j].dst;\n          auto di   = degrees[i];\n          if (degrees[to] < di || (degrees[to] == di && to < i)) {\n            vertices[i].erase(vertices[i].begin() + j);\n            num_dag++;\n            j--;\n          }\n        }\n      }\n      delete degrees;\n      printf(\" %d dag edges are removed\\n\", num_dag);\n      nedges -= num_dag;\n    }\n  }\n\n  int count_unique_labels() {\n    std::set<node_data_type> s;\n    int res = 0;\n    for (int i = 0; i < nnodes; i++) {\n      if (s.find(node_data[i]) == s.end()) {\n        s.insert(node_data[i]);\n        res++;\n      }\n    }\n    return res;\n  }\n\n  inline void split(const std::string& str, std::vector<std::string>& tokens,\n                    const std::string& delimiters = \" \") {\n    std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);\n    std::string::size_type pos     = str.find_first_of(delimiters, lastPos);\n    while (std::string::npos != pos || std::string::npos != lastPos) {\n      tokens.push_back(str.substr(lastPos, pos - lastPos));\n      lastPos = str.find_first_not_of(delimiters, pos);\n      pos     = str.find_first_of(delimiters, lastPos);\n    }\n  }\n\n  void 
copy_to_gpu(struct CSRGraph& copygraph) {\n    copygraph.nnodes = nnodes;\n    copygraph.nedges = nedges;\n    auto error       = copygraph.allocOnDevice(use_node_data);\n    if (error == 0) {\n      std::cout << \"GPU memory allocation failed\\n\";\n      exit(0);\n    }\n    printf(\"edge_dst: host_ptr %x device_ptr %x \\n\", edge_dst,\n           copygraph.edge_dst);\n    check_cuda(cudaMemcpy(copygraph.edge_dst, edge_dst,\n                          nedges * sizeof(index_type), cudaMemcpyHostToDevice));\n    check_cuda(cudaMemcpy(copygraph.row_start, row_start,\n                          (nnodes + 1) * sizeof(index_type),\n                          cudaMemcpyHostToDevice));\n    if (use_node_data)\n      check_cuda(cudaMemcpy(copygraph.node_data, node_data,\n                            nnodes * sizeof(node_data_type),\n                            cudaMemcpyHostToDevice));\n  }\n\n  unsigned allocOnHost() {\n    assert(nnodes > 0);\n    assert(!device_graph);\n    if (row_start != NULL)\n      return true;\n    std::cout << \"Allocating memory on CPU\\n\";\n    if (use_node_data)\n      std::cout << \"Need node data\\n\";\n    size_t mem_usage = ((nnodes + 1) + nedges) * sizeof(index_type);\n    if (use_node_data)\n      mem_usage += (nnodes) * sizeof(node_data_type);\n    printf(\"Host memory for graph: %3u MB\\n\", mem_usage / 1048756);\n    row_start = (index_type*)calloc(nnodes + 1, sizeof(index_type));\n    edge_dst  = (index_type*)calloc(nedges, sizeof(index_type));\n    if (use_node_data)\n      node_data = (node_data_type*)calloc(nnodes, sizeof(node_data_type));\n    std::cout << \"Memory allocation done\\n\";\n    return ((!use_node_data || node_data) && row_start && edge_dst);\n  }\n\n  unsigned allocOnDevice(bool use_label) {\n    if (edge_dst != NULL) {\n      std::cout << \"already allocated\\n\";\n      exit(0);\n    }\n    assert(edge_dst == NULL); // make sure not already allocated\n    device_graph = true;\n    std::cout << \"Allocating memory on 
GPU\\n\";\n    check_cuda(cudaMalloc((void**)&edge_dst, nedges * sizeof(index_type)));\n    check_cuda(\n        cudaMalloc((void**)&row_start, (nnodes + 1) * sizeof(index_type)));\n    if (use_label)\n      check_cuda(\n          cudaMalloc((void**)&node_data, nnodes * sizeof(node_data_type)));\n    return (edge_dst && (!use_node_data || node_data) && row_start);\n  }\n};\n"
  },
  {
    "path": "libpangolin/gpu/pangolin/miner.cuh",
    "content": "#ifndef GMINER_HPP_\n#define GMINER_HPP_\n\n#include \"graph_gpu.h\"\n#include \"embedding.cuh\"\n\n__device__ void printout_embedding(unsigned level, IndexT *emb) {\n\tprintf(\"embedding[\");\n\tfor (unsigned i = 0; i < level; i ++) {\n\t\tprintf(\"%d, \", emb[i]);\n\t}\n\tprintf(\"%d]\\n\", emb[level]);\n}\n\ninline __device__ bool binary_search(CSRGraph graph, IndexT key, IndexT begin, IndexT end) {\n\tassert(begin < end);\n\tint l = begin;\n\tint r = end-1;\n\twhile (r >= l) { \n\t\tint mid = l + (r - l) / 2; \n\t\tIndexT value = graph.getEdgeDst(mid);\n\t\tif (value == key) return true;\n\t\tif (value < key) l = mid + 1;\n\t\telse r = mid - 1;\n\t}\n\treturn false;\n}\n\ninline __device__ bool binary_search(IndexT *column_indices, IndexT key, IndexT begin, IndexT end) {\n\tassert(begin < end);\n\tint l = begin;\n\tint r = end-1;\n\twhile (r >= l) { \n\t\tint mid = l + (r - l) / 2; \n\t\tIndexT value = column_indices[mid];\n\t\tif (value == key) return true;\n\t\tif (value < key) l = mid + 1;\n\t\telse r = mid - 1;\n\t}\n\treturn false;\n}\n\ninline __device__ unsigned intersect_dag_merge(IndexT p, IndexT q, CSRGraph graph) {\n\tunsigned count = 0;\n\tIndexT p_start, p_end, q_start, q_end, p_it, q_it;\n\tp_start = graph.edge_begin(p);\n\tp_end = graph.edge_end(p);\n\tq_start = graph.edge_begin(q);\n\tq_end = graph.edge_end(q);\n\tp_it = p_start;\n\tq_it = q_start;\n\tIndexT a, b;\n\twhile (p_it < p_end && q_it < q_end) {\n\t\ta = graph.getEdgeDst(p_it);\n\t\tb = graph.getEdgeDst(q_it);\n\t\tIndexT d = a - b;\n\t\tif (d <= 0) p_it ++;\n\t\tif (d >= 0) q_it ++;\n\t\tif (d == 0) count ++;\n\t}\n\treturn count;\n}\n\ninline __device__ bool is_connected(IndexT a, IndexT b, CSRGraph graph) {\n\tif (graph.getOutDegree(a) == 0 || graph.getOutDegree(b) == 0) return false;\n\tIndexT key = a;\n\tIndexT search = b;\n\tif (graph.getOutDegree(a) < graph.getOutDegree(b)) {\n\t\tkey = b;\n\t\tsearch = a;\n\t} \n\tIndexT begin = 
graph.edge_begin(search);\n\tIndexT end = graph.edge_end(search);\n\tIndexT l = begin;\n\tIndexT r = end-1;\n\twhile (r >= l) { \n\t\tIndexT mid = l + (r - l) / 2; \n\t\tIndexT value = graph.getEdgeDst(mid);\n\t\tif (value == key) return true;\n\t\tif (value < key) l = mid + 1; \n\t\telse r = mid - 1; \n\t} \n\treturn false;\n}\n\ninline __device__ bool is_connected_dag(IndexT key, IndexT search, CSRGraph graph) {\n\tif (graph.getOutDegree(search) == 0) return false;\n\tIndexT begin = graph.edge_begin(search);\n\tIndexT end = graph.edge_end(search);\n\tIndexT l = begin;\n\tIndexT r = end-1;\n\twhile (r >= l) { \n\t\tIndexT mid = l + (r - l) / 2; \n\t\tIndexT value = graph.getEdgeDst(mid);\n\t\tif (value == key) return true;\n\t\tif (value < key) l = mid + 1; \n\t\telse r = mid - 1; \n\t} \n\treturn false;\n}\n\ninline __device__ bool is_vertexInduced_automorphism(unsigned n, IndexT *emb, unsigned idx, IndexT src, IndexT dst, CSRGraph g) {\n\t// the new vertex id should be larger than the first vertex id\n\tif (dst <= emb[0]) return true;\n\t// the new vertex should not already exist in the embedding\n\tfor (unsigned i = 1; i < n; ++i)\n\t\tif (dst == emb[i]) return true;\n\t// the new vertex should not already be extended by any previous vertex in the embedding\n\tfor (unsigned i = 0; i < idx; ++i)\n\t\tif (is_connected(emb[i], dst, g)) return true;\n\t// the new vertex id should be larger than any vertex id after its source vertex in the embedding\n\tfor (unsigned i = idx+1; i < n; ++i)\n\t\tif (dst < emb[i]) return true;\n\treturn false;\n}\n\n// count 3-motifs\ninline __device__ unsigned find_3motif_pattern_id(unsigned idx, IndexT dst, IndexT* emb, CSRGraph g, unsigned pos = 0) {\n\tunsigned pid = 1; // 3-chain\n\tif (idx == 0) {\n\t\tif (is_connected(emb[1], dst, g)) pid = 0; // triangle\n\t\t#ifdef USE_WEDGE\n\t\t//else if (max_size == 4) is_wedge[pos] = 1; // wedge; used for 4-motif\n\t\t#endif\n\t}\n\treturn pid;\n}\n// count 4-motifs\ninline __device__ 
unsigned find_4motif_pattern_id(unsigned n, unsigned idx, IndexT dst, IndexT* emb, unsigned pattern, CSRGraph g, unsigned pos = 0) {\n\tunsigned pid = pattern;\n\tunsigned num_edges = 1;\n\tif (pid == 0) { // extending a triangle\n\t\tfor (unsigned j = idx+1; j < n; j ++)\n\t\t\tif (is_connected(emb[j], dst, g)) num_edges ++;\n\t\tpid = num_edges + 2; // p3: tailed-triangle; p4: diamond; p5: 4-clique\n\t} else { // extending a 3-chain\n\t\tassert(pid == 1);\n\t\tbool connected[3];\n\t\tfor (int i = 0; i < 3; i ++) connected[i] = false;\n\t\tconnected[idx] = true;\n\t\tfor (unsigned j = idx+1; j < n; j ++) {\n\t\t\tif (is_connected(emb[j], dst, g)) {\n\t\t\t\tnum_edges ++;\n\t\t\t\tconnected[j] = true;\n\t\t\t}\n\t\t}\n\t\tif (num_edges == 1) {\n\t\t\tpid = 0; // p0: 3-path\n\t\t\tunsigned center = 1;\n\t\t\t#ifdef USE_WEDGE\n\t\t\t//if (is_wedge[pos]) center = 0;\n\t\t\t#else\n\t\t\tcenter = is_connected(emb[1], emb[2], g) ? 1 : 0;\n\t\t\t#endif\n\t\t\tif (idx == center) pid = 1; // p1: 3-star\n\t\t} else if (num_edges == 2) {\n\t\t\tpid = 2; // p2: 4-cycle\n\t\t\tunsigned center = 1;\n\t\t\t#ifdef USE_WEDGE\n\t\t\t//if (is_wedge[pos]) center = 0;\n\t\t\t#else\n\t\t\tcenter = is_connected(emb[1], emb[2], g) ? 1 : 0;\n\t\t\t#endif\n\t\t\tif (connected[center]) pid = 3; // p3: tailed-triangle\n\t\t} else {\n\t\t\tpid = 4; // p4: diamond\n\t\t}\n\t}\n\treturn pid;\n}\n\ninline __device__ unsigned intersect_dag(IndexT a, IndexT b, CSRGraph g) {\n\treturn intersect_dag_merge(a, b, g);\n}\n\ninline __device__ bool is_all_connected_dag(IndexT dst, IndexT *emb, IndexT end, CSRGraph g) {\n\tbool all_connected = true;\n\tfor(IndexT i = 0; i < end; ++i) {\n\t\tIndexT from = emb[i];\n\t\tif (!is_connected_dag(dst, from, g)) {\n\t\t\tall_connected = false;\n\t\t\tbreak;\n\t\t}\n\t}\n\treturn all_connected;\n}\n\n#endif\n"
  },
  {
    "path": "libpangolin/gpu/pangolin/timer.h",
    "content": "// Copyright (c) 2015, The Regents of the University of California (Regents)\n// See LICENSE.txt for license details\n\n#ifndef TIMER_H_\n#define TIMER_H_\n\n#include <sys/time.h>\n\n/*\nGAP Benchmark Suite\nClass:  Timer\nAuthor: Scott Beamer\n\nSimple timer that wraps gettimeofday\n*/\n\nclass Timer {\npublic:\n  Timer() {}\n\n  void Start() { gettimeofday(&start_time_, NULL); }\n\n  void Stop() {\n    gettimeofday(&elapsed_time_, NULL);\n    elapsed_time_.tv_sec -= start_time_.tv_sec;\n    elapsed_time_.tv_usec -= start_time_.tv_usec;\n  }\n\n  double Seconds() const {\n    return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1e6;\n  }\n\n  double Millisecs() const {\n    return 1000 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1000;\n  }\n\n  double Microsecs() const {\n    return 1e6 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec;\n  }\n\nprivate:\n  struct timeval start_time_;\n  struct timeval elapsed_time_;\n};\n\n// Times op's execution using the timer t\n#define TIME_OP(t, op)                                                         \\\n  {                                                                            \\\n    t.Start();                                                                 \\\n    (op);                                                                      \\\n    t.Stop();                                                                  \\\n  }\n\n#endif // TIMER_H_\n"
  },
  {
    "path": "libpangolin/gpu/pangolin/types.cuh",
    "content": "#ifndef TYPES_H_\n#define TYPES_H_\n\ntypedef int IndexT;\ntypedef int index_type;\ntypedef uint8_t edge_data_type;\ntypedef uint8_t node_data_type;\ntypedef uint8_t key_type;\ntypedef uint8_t history_type;\ntypedef unsigned char SetType;\ntypedef unsigned long long AccType;\n\n#define PANGOLIN_MAX_SIZE     5\n#define WARP_SIZE   32\n#define BLOCK_SIZE 256\n#define DIVIDE_INTO(x,y) ((x + y - 1)/y)\n#define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE)\n\n#endif\n"
  },
  {
    "path": "libpangolin/include/pangolin/BfsMining/edge_miner.h",
    "content": "#ifndef EDGE_MINER_H\n#define EDGE_MINER_H\n#include \"pangolin/miner.h\"\n#include \"pangolin/quick_pattern.h\"\n#include \"pangolin/canonical_graph.h\"\n#include \"pangolin/domain_support.h\"\n#include \"pangolin/BfsMining/embedding_list.h\"\n\ntemplate <typename ElementTy, typename EmbeddingTy, typename API,\n          bool report_num_pattern>\nclass EdgeMiner : public Miner<ElementTy, EmbeddingTy, false> {\n  typedef EmbeddingList<ElementTy, EmbeddingTy> EmbeddingListTy;\n  typedef QuickPattern<EmbeddingTy, ElementTy> QPattern;\n  typedef CanonicalGraph<EmbeddingTy, ElementTy> CPattern;\n  // quick pattern map (mapping quick pattern to its frequency)\n  typedef std::unordered_map<QPattern, Frequency> QpMapFreq;\n  // canonical pattern map (mapping canonical pattern to its frequency)\n  typedef std::unordered_map<CPattern, Frequency> CgMapFreq;\n  // quick pattern map (mapping quick pattern to its domain support)\n  typedef std::unordered_map<QPattern, DomainSupport*> QpMapDomain;\n  // canonical pattern map (mapping canonical pattern to its domain support)\n  typedef std::unordered_map<CPattern, DomainSupport*> CgMapDomain;\n  // PerThreadStorage: thread-local quick pattern map\n  typedef galois::substrate::PerThreadStorage<QpMapFreq> LocalQpMapFreq;\n  // PerThreadStorage: thread-local canonical pattern map\n  typedef galois::substrate::PerThreadStorage<CgMapFreq> LocalCgMapFreq;\n  typedef galois::substrate::PerThreadStorage<QpMapDomain> LocalQpMapDomain;\n  typedef galois::substrate::PerThreadStorage<CgMapDomain> LocalCgMapDomain;\n\npublic:\n  EdgeMiner(unsigned max_sz, int nt)\n      : Miner<ElementTy, EmbeddingTy, false>(max_sz, nt) {}\n  virtual ~EdgeMiner() {}\n  void clean() {\n    edge_map.clear();\n    freq_edge_set.clear();\n    is_frequent_edge.clear();\n    clean_maps();\n  }\n  void clean_maps() {\n    id_map.clear();\n    domain_support_map.clear();\n    for (auto ele : qp_map)\n      ele.second->clean();\n    for (auto ele : 
cg_map)\n      ele.second->clean();\n    for (auto ele : init_map)\n      ele.second->clean();\n    qp_map.clear();\n    cg_map.clear();\n    init_map.clear();\n    for (auto i = 0; i < this->num_threads; i++) {\n      auto qp_map_ptr = qp_localmaps.getLocal(i);\n      for (auto ele : *qp_map_ptr)\n        ele.second->clean();\n      qp_map_ptr->clear();\n      auto cg_map_ptr = cg_localmaps.getLocal(i);\n      for (auto ele : *cg_map_ptr)\n        ele.second->clean();\n      cg_map_ptr->clear();\n      auto init_map_ptr = init_pattern_maps.getLocal(i);\n      for (auto ele : *init_map_ptr)\n        ele.second->clean();\n      init_map_ptr->clear();\n    }\n  }\n  void initialize(std::string) { init_emb_list(); }\n  void init_emb_list() {\n    this->emb_list.init(this->graph, this->max_size + 1);\n    construct_edgemap();\n  }\n  void inc_total_num(int value) { total_num += value; }\n  void solver() {\n    std::cout << \"Minimum support: \" << threshold << \"\\n\";\n    unsigned level = 1;\n    // this->emb_list.printout_embeddings(1);\n    int num_freq_patterns = init_aggregator();\n    if (num_freq_patterns == 0) {\n      std::cout << \"No frequent pattern found\\n\\n\";\n      return;\n    }\n    inc_total_num(num_freq_patterns);\n    std::cout << \"Number of frequent single-edge patterns: \"\n              << num_freq_patterns << \"\\n\";\n    init_filter();\n    // this->emb_list.printout_embeddings(level);\n\n    while (1) {\n      extend_edge(level);\n      level++;\n      // this->emb_list.printout_embeddings(level, debug);\n      quick_aggregate(level);\n      merge_qp_map(level + 1);\n      canonical_aggregate();\n      merge_cg_map(level + 1);\n      num_freq_patterns = support_count();\n      // std::cout << \"num_frequent_patterns: \" << num_freq_patterns << \"\\n\";\n      // printout_agg();\n      inc_total_num(num_freq_patterns);\n      if (num_freq_patterns == 0)\n        break;\n      if (level == this->max_size)\n        break;\n      
filter(level);\n      // this->emb_list.printout_embeddings(level, debug);\n      clean_maps();\n    }\n  }\n\n  void extend_edge(unsigned level) {\n    UintList num_new_emb(this->emb_list.size());\n    galois::do_all(\n        galois::iterate((size_t)0, this->emb_list.size()),\n        [&](const size_t& pos) {\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          num_new_emb[pos] = 0;\n          unsigned n       = emb.size();\n          VertexSet vert_set;\n          if (n > 3)\n            for (unsigned i = 0; i < n; i++)\n              vert_set.insert(emb.get_vertex(i));\n          for (unsigned i = 0; i < n; ++i) {\n            auto src = emb.get_vertex(i);\n            if (emb.get_key(i) == 0) { // TODO: need to fix this\n              for (auto e : this->graph.edges(src)) {\n                GNode dst    = this->graph.getEdgeDst(e);\n                BYTE existed = 0;\n                if (is_frequent_edge[*e])\n                  if (API::toAdd(n, emb, i, src, dst, existed, vert_set))\n                    num_new_emb[pos]++;\n              }\n            }\n          }\n          emb.clean();\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-alloc\"));\n    Ulong new_size =\n        std::accumulate(num_new_emb.begin(), num_new_emb.end(), (Ulong)0);\n    UlongList indices = parallel_prefix_sum<unsigned, Ulong>(num_new_emb);\n    new_size          = indices[indices.size() - 1];\n    this->emb_list.add_level(new_size);\n    galois::do_all(\n        galois::iterate((size_t)0, this->emb_list.size(level)),\n        [&](const size_t& pos) {\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          unsigned start = indices[pos];\n          unsigned n     = emb.size();\n          VertexSet vert_set;\n          if (n > 3)\n            for (unsigned i = 0; i < n; i++)\n              vert_set.insert(emb.get_vertex(i));\n          for 
(unsigned i = 0; i < n; ++i) {\n            auto src = emb.get_vertex(i);\n            if (emb.get_key(i) == 0) {\n              for (auto e : this->graph.edges(src)) {\n                GNode dst    = this->graph.getEdgeDst(e);\n                BYTE existed = 0;\n                if (is_frequent_edge[*e]) {\n                  if (API::toAdd(n, emb, i, src, dst, existed, vert_set)) {\n                    this->emb_list.set_idx(level + 1, start, pos);\n                    this->emb_list.set_his(level + 1, start, i);\n                    this->emb_list.set_vid(level + 1, start++, dst);\n                  }\n                }\n              }\n            }\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-write\"));\n  }\n  inline unsigned init_aggregator() {\n    init_map.clear();\n    galois::do_all(\n        galois::iterate(this->graph.begin(), this->graph.end()),\n        [&](const GNode& src) {\n          InitMap* lmap   = init_pattern_maps.getLocal();\n          auto& src_label = this->graph.getData(src);\n          for (auto e : this->graph.edges(src)) {\n            GNode dst       = this->graph.getEdgeDst(e);\n            auto& dst_label = this->graph.getData(dst);\n            if (src_label <= dst_label) {\n              InitPattern key = get_init_pattern(src_label, dst_label);\n              if (lmap->find(key) == lmap->end()) {\n                (*lmap)[key] = new DomainSupport(2);\n                (*lmap)[key]->set_threshold(threshold);\n              }\n              (*lmap)[key]->add_vertex(0, src);\n              (*lmap)[key]->add_vertex(1, dst);\n            }\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"InitAggregation\"));\n    merge_init_map();\n    std::cout << \"Number of single-edge patterns: \" << init_map.size() << \"\\n\";\n    unsigned count = 0;\n    for (auto it = init_map.begin(); it != init_map.end(); 
++it)\n      if (it->second->get_support())\n        count++;\n    return count; // return number of frequent single-edge patterns\n  }\n  inline void quick_aggregate(unsigned level) {\n    for (auto i = 0; i < this->num_threads; i++)\n      qp_localmaps.getLocal(i)->clear();\n    galois::do_all(\n        galois::iterate((size_t)0, this->emb_list.size()),\n        [&](const size_t& pos) {\n          QpMapDomain* lmap = qp_localmaps.getLocal();\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          unsigned n = emb.size();\n          QPattern qp(emb, true);\n          bool qp_existed = false;\n          auto it         = lmap->find(qp);\n          if (it == lmap->end()) {\n            (*lmap)[qp] = new DomainSupport(n);\n            (*lmap)[qp]->set_threshold(threshold);\n            this->emb_list.set_pid(pos, qp.get_id());\n          } else {\n            qp_existed = true;\n            this->emb_list.set_pid(pos, (it->first).get_id());\n          }\n          for (unsigned i = 0; i < n; i++) {\n            if ((*lmap)[qp]->has_domain_reached_support(i) == false)\n              (*lmap)[qp]->add_vertex(i, emb.get_vertex(i));\n          }\n          if (qp_existed)\n            qp.clean();\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"QuickAggregation\"));\n  }\n  // aggregate quick patterns into canonical patterns.\n  // construct id_map from quick pattern ID (qp_id) to canonical pattern ID\n  // (cg_id)\n  void canonical_aggregate() {\n    id_map.clear();\n    for (auto i = 0; i < this->num_threads; i++)\n      cg_localmaps.getLocal(i)->clear();\n    galois::do_all(\n        galois::iterate(qp_map),\n        [&](std::pair<QPattern, DomainSupport*> element) {\n          CgMapDomain* lmap    = cg_localmaps.getLocal();\n          unsigned num_domains = element.first.get_size();\n          CPattern cg(element.first);\n          int qp_id = element.first.get_id();\n          int 
cg_id = cg.get_id();\n          slock.lock();\n          id_map.insert(std::make_pair(qp_id, cg_id));\n          slock.unlock();\n          auto it = lmap->find(cg);\n          if (it == lmap->end()) {\n            (*lmap)[cg] = new DomainSupport(num_domains);\n            (*lmap)[cg]->set_threshold(threshold);\n            element.first.set_cgid(cg.get_id());\n          } else {\n            element.first.set_cgid((it->first).get_id());\n          }\n          VertexPositionEquivalences equivalences;\n          element.first.get_equivalences(equivalences);\n          for (unsigned i = 0; i < num_domains; i++) {\n            if ((*lmap)[cg]->has_domain_reached_support(i) == false) {\n              unsigned qp_idx = cg.get_quick_pattern_index(i);\n              assert(qp_idx < num_domains);\n              UintSet equ_set = equivalences.get_equivalent_set(qp_idx);\n              for (unsigned idx : equ_set) {\n                DomainSupport* support = element.second;\n                if (support->has_domain_reached_support(idx) == false) {\n                  bool reached_threshold =\n                      (*lmap)[cg]->add_vertices(i, support->domain_sets[idx]);\n                  if (reached_threshold)\n                    break;\n                } else {\n                  (*lmap)[cg]->set_domain_frequent(i);\n                  break;\n                }\n              }\n            }\n          }\n          cg.clean();\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"CanonicalAggregation\"));\n  }\n  inline void merge_qp_map(LocalQpMapFreq& qp_localmap, QpMapFreq& qp_map) {\n    for (auto i = 0; i < this->num_threads; i++) {\n      for (auto element : *qp_localmap.getLocal(i)) {\n        if (qp_map.find(element.first) != qp_map.end())\n          qp_map[element.first] += element.second;\n        else\n          qp_map[element.first] = element.second;\n      }\n    }\n  }\n  inline void merge_cg_map(LocalCgMapFreq& 
localmaps, CgMapFreq& cg_map) {\n    for (auto i = 0; i < this->num_threads; i++) {\n      for (auto element : *localmaps.getLocal(i)) {\n        if (cg_map.find(element.first) != cg_map.end())\n          cg_map[element.first] += element.second;\n        else\n          cg_map[element.first] = element.second;\n      }\n    }\n  }\n  inline void merge_init_map() {\n    init_map = *(init_pattern_maps.getLocal(0));\n    for (auto i = 1; i < this->num_threads; i++) {\n      for (auto element : *init_pattern_maps.getLocal(i)) {\n        DomainSupport* support = element.second;\n        if (init_map.find(element.first) == init_map.end()) {\n          init_map[element.first] = support;\n        } else {\n          for (unsigned i = 0; i < 2; i++) {\n            if (!init_map[element.first]->has_domain_reached_support(i)) {\n              if (support->has_domain_reached_support(i))\n                init_map[element.first]->set_domain_frequent(i);\n              else\n                init_map[element.first]->add_vertices(i,\n                                                      support->domain_sets[i]);\n            }\n          }\n        }\n      }\n    }\n  }\n  inline void merge_qp_map(unsigned num_domains) {\n    qp_map.clear();\n    qp_map = *(qp_localmaps.getLocal(0));\n    for (auto i = 1; i < this->num_threads; i++) {\n      const QpMapDomain* lmap = qp_localmaps.getLocal(i);\n      for (auto element : *lmap) {\n        if (qp_map.find(element.first) == qp_map.end())\n          qp_map[element.first] = element.second;\n      }\n      galois::do_all(\n          galois::iterate(*lmap),\n          [&](std::pair<QPattern, DomainSupport*> element) {\n            DomainSupport* support = element.second;\n            for (unsigned i = 0; i < num_domains; i++) {\n              if (!qp_map[element.first]->has_domain_reached_support(i) &&\n                  qp_map[element.first] != support) {\n                if (support->has_domain_reached_support(i))\n                  
qp_map[element.first]->set_domain_frequent(i);\n                else\n                  qp_map[element.first]->add_vertices(i,\n                                                      support->domain_sets[i]);\n              }\n            }\n          },\n          galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n          galois::loopname(\"MergeQuickPatterns\"));\n    }\n  }\n  inline void merge_cg_map(unsigned num_domains) {\n    cg_map.clear();\n    cg_map = *(cg_localmaps.getLocal(0));\n    for (auto i = 1; i < this->num_threads; i++) {\n      const CgMapDomain* lmap = cg_localmaps.getLocal(i);\n      for (auto element : *lmap) {\n        if (cg_map.find(element.first) == cg_map.end())\n          cg_map[element.first] = element.second;\n      }\n      galois::do_all(\n          galois::iterate(*lmap),\n          [&](std::pair<CPattern, DomainSupport*> element) {\n            DomainSupport* support = element.second;\n            for (unsigned i = 0; i < num_domains; i++) {\n              if (!cg_map[element.first]->has_domain_reached_support(i) &&\n                  cg_map[element.first] != support) {\n                if (support->has_domain_reached_support(i))\n                  cg_map[element.first]->set_domain_frequent(i);\n                else\n                  cg_map[element.first]->add_vertices(i,\n                                                      support->domain_sets[i]);\n              }\n            }\n          },\n          galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n          galois::loopname(\"MergeCanonicalPatterns\"));\n    }\n  }\n\n  // Filtering for FSM\n  inline void init_filter() {\n    UintList is_frequent_emb(this->emb_list.size(), 0);\n    galois::do_all(\n        galois::iterate((size_t)0, this->emb_list.size()),\n        [&](const size_t& pos) {\n          auto src        = this->emb_list.get_idx(1, pos);\n          auto dst        = this->emb_list.get_vid(1, pos);\n          auto& src_label = this->graph.getData(src);\n  
        auto& dst_label = this->graph.getData(dst);\n          InitPattern key = get_init_pattern(src_label, dst_label);\n          if (init_map[key]->get_support())\n            is_frequent_emb[pos] = 1;\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"InitFilter\"));\n\n    assert(this->emb_list.size() * 2 ==\n           this->graph.sizeEdges()); // symmetric graph\n    is_frequent_edge.resize(this->graph.sizeEdges());\n    std::fill(is_frequent_edge.begin(), is_frequent_edge.end(), 0);\n    galois::do_all(\n        galois::iterate((size_t)0, this->emb_list.size()),\n        [&](const size_t& pos) {\n          if (is_frequent_emb[pos]) {\n            auto src  = this->emb_list.get_idx(1, pos);\n            auto dst  = this->emb_list.get_vid(1, pos);\n            auto eid0 = edge_map[OrderedEdge(src, dst)];\n            auto eid1 = edge_map[OrderedEdge(dst, src)];\n            // std::cout << \"src=\" << src << \", dst=\" << dst\n            //\t<< \", eid_sd=\" << eid0 << \", eid_ds=\" << eid1 << \"\\n\";\n            __sync_bool_compare_and_swap(&is_frequent_edge[eid0], 0, 1);\n            __sync_bool_compare_and_swap(&is_frequent_edge[eid1], 0, 1);\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"InitFrquentEdges\"));\n    std::cout << \"Number of frequent edges: \"\n              << count(is_frequent_edge.begin(), is_frequent_edge.end(), 1)\n              << \"\\n\";\n\n    UintList indices     = parallel_prefix_sum(is_frequent_emb);\n    VertexList vid_list0 = this->emb_list.get_idx_list(1);\n    VertexList vid_list1 = this->emb_list.get_vid_list(1);\n    galois::do_all(\n        galois::iterate((size_t)0, this->emb_list.size()),\n        [&](const size_t& pos) {\n          if (is_frequent_emb[pos]) {\n            auto src   = vid_list0[pos];\n            auto dst   = vid_list1[pos];\n            auto start = indices[pos];\n            
this->emb_list.set_vid(1, start, dst);\n            this->emb_list.set_idx(1, start, src);\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"InitEmbeddingList\"));\n    this->emb_list.remove_tail(indices.back());\n  }\n\n  // Check if the pattern of a given embedding is frequent, if yes, insert it to\n  // the queue\n  inline void filter(unsigned level) {\n    UintList is_frequent_emb(this->emb_list.size(), 0);\n    galois::do_all(\n        galois::iterate((size_t)0, this->emb_list.size()),\n        [&](const size_t& pos) {\n          unsigned qp_id = this->emb_list.get_pid(pos);\n          unsigned cg_id = id_map.at(qp_id);\n          if (domain_support_map.at(cg_id))\n            is_frequent_emb[pos] = 1;\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Filter-alloc\"));\n    UlongList indices   = parallel_prefix_sum<unsigned, Ulong>(is_frequent_emb);\n    VertexList vid_list = this->emb_list.get_vid_list(level);\n    UintList idx_list   = this->emb_list.get_idx_list(level);\n    ByteList his_list   = this->emb_list.get_his_list(level);\n    galois::do_all(\n        galois::iterate((size_t)0, this->emb_list.size()),\n        [&](const size_t& pos) {\n          if (is_frequent_emb[pos]) {\n            auto start = indices[pos];\n            auto vid   = vid_list[pos];\n            auto idx   = idx_list[pos];\n            auto his   = his_list[pos];\n            this->emb_list.set_idx(level, start, idx);\n            this->emb_list.set_vid(level, start, vid);\n            this->emb_list.set_his(level, start, his);\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Filter-write\"));\n    this->emb_list.remove_tail(indices.back());\n  }\n  void set_threshold(const unsigned minsup) { threshold = minsup; }\n  inline void printout_agg(const CgMapFreq& cg_map) {\n    for (auto it = cg_map.begin(); 
it != cg_map.end(); ++it)\n      std::cout << \"{\" << it->first << \" --> \" << it->second << std::endl;\n  }\n  inline void printout_agg() {\n    BoolVec support(cg_map.size());\n    int i = 0;\n    for (auto it = cg_map.begin(); it != cg_map.end(); ++it) {\n      support[i] = it->second->get_support();\n      i++;\n    }\n    i = 0;\n    for (auto it = cg_map.begin(); it != cg_map.end(); ++it) {\n      std::cout << \"{\" << it->first << \" --> \" << support[i] << std::endl;\n      i++;\n    }\n  }\n  inline unsigned support_count() {\n    domain_support_map.clear();\n    unsigned count = 0;\n    for (auto it = cg_map.begin(); it != cg_map.end(); ++it) {\n      bool support = it->second->get_support();\n      domain_support_map.insert(std::make_pair(it->first.get_id(), support));\n      if (support)\n        count++;\n    }\n    return count;\n  }\n  // construct edge-map for later use. May not be necessary if Galois has this\n  // support\n  void construct_edgemap() {\n    // std::cout << \"Constructing edge map ...\\n\";\n    for (auto src : this->graph) {\n      for (auto e : this->graph.edges(src)) {\n        GNode dst = this->graph.getEdgeDst(e);\n        OrderedEdge edge(src, dst);\n        // std::cout << \"src=\" << src << \", dst=\" << dst\n        //\t<< \", eid=\" << *e << \"\\n\";\n        edge_map.insert(std::pair<OrderedEdge, unsigned>(edge, *e));\n      }\n    }\n  }\n\nprotected:\n  int total_num; // total number of frequent patterns\n  unsigned threshold;\n  EmbeddingListTy emb_list;\n\nprivate:\n  InitMap init_map;\n  UintMap id_map;\n  DomainMap domain_support_map;\n  galois::gstl::Map<OrderedEdge, unsigned> edge_map;\n  std::set<std::pair<VertexId, VertexId>> freq_edge_set;\n  std::vector<unsigned> is_frequent_edge;\n  InitMaps init_pattern_maps; // initialization map, only used for once, no need\n                              // to clear\n  LocalQpMapDomain qp_localmaps; // quick pattern local map for each thread\n  LocalCgMapDomain 
cg_localmaps; // canonical pattern local map for each thread\n  QpMapDomain qp_map;            // quick pattern map\n  CgMapDomain cg_map;            // canonical graph map\n  galois::substrate::SimpleLock slock;\n\n  inline InitPattern get_init_pattern(BYTE src_label, BYTE dst_label) {\n    if (src_label <= dst_label)\n      return std::make_pair(src_label, dst_label);\n    else\n      return std::make_pair(dst_label, src_label);\n  }\n  inline void get_embedding(unsigned level, unsigned pos, EmbeddingTy& emb) {\n    auto vid = this->emb_list.get_vid(level, pos);\n    auto idx = this->emb_list.get_idx(level, pos);\n    auto his = this->emb_list.get_his(level, pos);\n    auto lab = this->graph.getData(vid);\n    ElementTy ele(vid, 0, lab, his);\n    emb.set_element(level, ele);\n    for (unsigned l = 1; l < level; l++) {\n      vid = this->emb_list.get_vid(level - l, idx);\n      his = this->emb_list.get_his(level - l, idx);\n      lab = this->graph.getData(vid);\n      ElementTy ele(vid, 0, lab, his);\n      emb.set_element(level - l, ele);\n      idx = this->emb_list.get_idx(level - l, idx);\n    }\n    lab = this->graph.getData(idx);\n    ElementTy ele0(idx, 0, lab, 0);\n    emb.set_element(0, ele0);\n  }\n};\n\n#endif // EDGE_MINER_HPP_\n"
  },
  {
    "path": "libpangolin/include/pangolin/BfsMining/edge_miner_api.h",
    "content": "#pragma once\n#include \"pangolin/gtypes.h\"\n\ntemplate <typename EmbeddingTy>\nclass EdgeMinerAPI {\npublic:\n  EdgeMinerAPI() {}\n  ~EdgeMinerAPI() {}\n\n  // toExtend\n  static inline bool toExtend(unsigned, const EmbeddingTy&, unsigned) {\n    return true;\n  }\n  // toAdd (only add non-automorphisms)\n  static inline bool toAdd(unsigned n, const EmbeddingTy& emb, unsigned pos,\n                           VertexId src, VertexId dst, BYTE& existed,\n                           const VertexSet& vertex_set) {\n    return !is_edge_automorphism(n, emb, pos, src, dst, existed, vertex_set);\n  }\n  // customized pattern classification method\n  static inline unsigned getPattern(unsigned GALOIS_USED_ONLY_IN_DEBUG(n),\n                                    const EmbeddingTy&, unsigned, VertexId,\n                                    VertexId) {\n    assert(n < 4);\n    return 0;\n  }\n\nprotected:\n  static inline bool is_quick_automorphism(unsigned size,\n                                           const EmbeddingTy& emb,\n                                           unsigned history, VertexId dst,\n                                           BYTE& existed) {\n    if (dst <= emb.get_vertex(0))\n      return true;\n    if (dst == emb.get_vertex(1))\n      return true;\n    if (history == 0 && dst < emb.get_vertex(1))\n      return true;\n    if (size == 2) {\n    } else if (size == 3) {\n      if (history == 0 && emb.get_history(2) == 0 && dst <= emb.get_vertex(2))\n        return true;\n      if (history == 0 && emb.get_history(2) == 1 && dst == emb.get_vertex(2))\n        return true;\n      if (history == 1 && emb.get_history(2) == 1 && dst <= emb.get_vertex(2))\n        return true;\n      if (dst == emb.get_vertex(2))\n        existed = 1;\n    } else {\n      std::cout << \"Error: should go to detailed check\\n\";\n    }\n    return false;\n  }\n\n  static inline bool is_edge_automorphism(unsigned size, const EmbeddingTy& emb,\n                            
              unsigned history, VertexId src,\n                                          VertexId dst, BYTE& existed,\n                                          const VertexSet& vertex_set) {\n    if (size < 3)\n      return is_quick_automorphism(size, emb, history, dst, existed);\n    // check with the first element\n    if (dst <= emb.get_vertex(0))\n      return true;\n    if (history == 0 && dst <= emb.get_vertex(1))\n      return true;\n    // check loop edge\n    if (dst == emb.get_vertex(emb.get_history(history)))\n      return true;\n    if (vertex_set.find(dst) != vertex_set.end())\n      existed = 1;\n    // check to see if there already exists the vertex added;\n    // if so, just allow to add edge which is (smaller id -> bigger id)\n    if (existed && src > dst)\n      return true;\n    std::pair<VertexId, VertexId> added_edge(src, dst);\n    for (unsigned index = history + 1; index < emb.size(); ++index) {\n      std::pair<VertexId, VertexId> edge;\n      edge.first  = emb.get_vertex(emb.get_history(index));\n      edge.second = emb.get_vertex(index);\n      // assert(edge.first != edge.second);\n      int cmp = compare(added_edge, edge);\n      if (cmp <= 0)\n        return true;\n    }\n    return false;\n  }\n  static inline void swap(std::pair<VertexId, VertexId>& pair) {\n    if (pair.first > pair.second) {\n      auto tmp    = pair.first;\n      pair.first  = pair.second;\n      pair.second = tmp;\n    }\n  }\n  static inline int compare(std::pair<VertexId, VertexId>& oneEdge,\n                            std::pair<VertexId, VertexId>& otherEdge) {\n    swap(oneEdge);\n    swap(otherEdge);\n    if (oneEdge.first == otherEdge.first)\n      return oneEdge.second - otherEdge.second;\n    else\n      return oneEdge.first - otherEdge.first;\n  }\n};\n"
  },
  {
    "path": "libpangolin/include/pangolin/BfsMining/embedding_list.h",
    "content": "#ifndef EMBEDDING_LIST_H_\n#define EMBEDDING_LIST_H_\n#include \"pangolin/gtypes.h\"\n#include \"pangolin/base_embedding.h\"\n#include \"pangolin/vertex_embedding.h\"\n#include \"pangolin/edge_embedding.h\"\n\n// Embedding list: SoA structure\ntemplate <typename ElementType, typename EmbeddingType>\nclass EmbeddingList {\npublic:\n  EmbeddingList() {}\n  ~EmbeddingList() {}\n  void init(PangolinGraph& graph, unsigned max_size = 2, bool is_dag = false);\n  VertexId get_vid(unsigned level, size_t id) const {\n    return vid_lists[level][id];\n  }\n  IndexTy get_idx(unsigned level, size_t id) const {\n    return idx_lists[level][id];\n  }\n  BYTE get_his(unsigned level, size_t id) const { return his_lists[level][id]; }\n  unsigned get_pid(size_t id) const { return pid_list[id]; }\n  void set_vid(unsigned level, size_t id, VertexId vid) {\n    vid_lists[level][id] = vid;\n  }\n  void set_idx(unsigned level, size_t id, IndexTy idx) {\n    idx_lists[level][id] = idx;\n  }\n  void set_his(unsigned level, size_t id, BYTE lab) {\n    his_lists[level][id] = lab;\n  }\n  void set_pid(size_t id, unsigned pid) { pid_list[id] = pid; }\n  size_t size() const { return vid_lists[last_level].size(); }\n  size_t size(unsigned level) const { return vid_lists[level].size(); }\n  VertexList get_vid_list(unsigned level) { return vid_lists[level]; }\n  UintList get_idx_list(unsigned level) { return idx_lists[level]; }\n  ByteList get_his_list(unsigned level) { return his_lists[level]; }\n  void remove_tail(size_t idx) {\n    vid_lists[last_level].erase(vid_lists[last_level].begin() + idx,\n                                vid_lists[last_level].end());\n    if (std::is_same<ElementType, LabeledElement>::value)\n      his_lists[last_level].erase(his_lists[last_level].begin() + idx,\n                                  his_lists[last_level].end());\n  }\n  void add_level(Ulong size) {\n    last_level++;\n    assert(last_level < max_level);\n    
vid_lists[last_level].resize(size);\n    idx_lists[last_level].resize(size);\n    if (std::is_same<ElementType, LabeledElement>::value)\n      his_lists[last_level].resize(size);\n    if (std::is_same<EmbeddingType, VertexEmbedding>::value ||\n        std::is_same<EmbeddingType, EdgeEmbedding>::value) // multi-pattern\n      pid_list.resize(size);\n  }\n  void reset_level() {\n    for (size_t i = 2; i <= last_level; i++) {\n      vid_lists[i].clear();\n      idx_lists[i].clear();\n    }\n    last_level = 1;\n  }\n  void printout_embeddings(int level, bool verbose = false) {\n    std::cout << \"Number of embeddings in level \" << level << \": \" << size()\n              << std::endl;\n    if (verbose) {\n      for (size_t pos = 0; pos < size(); pos++) {\n        EmbeddingType emb(last_level + 1);\n        get_embedding(last_level, pos, emb);\n        std::cout << emb << \"\\n\";\n      }\n    }\n  }\n  void clean() {\n    pid_list.clear();\n    for (size_t i = 0; i < vid_lists.size(); i++) {\n      if (std::is_same<ElementType, LabeledElement>::value)\n        his_lists[i].clear();\n      idx_lists[i].clear();\n      vid_lists[i].clear();\n    }\n    his_lists.clear();\n    idx_lists.clear();\n    vid_lists.clear();\n  }\n\nprivate:\n  UintList pid_list;\n  ByteLists his_lists;\n  IndexLists idx_lists;\n  VertexLists vid_lists;\n  unsigned last_level;\n  unsigned max_level;\n  void get_embedding(unsigned level, size_t pos, EmbeddingType& emb) {\n    auto vid    = get_vid(level, pos);\n    IndexTy idx = get_idx(level, pos);\n    BYTE his    = 0;\n    if (std::is_same<ElementType, LabeledElement>::value)\n      his = get_his(level, pos);\n    ElementType ele(vid, 0, 0, his);\n    emb.set_element(level, ele);\n    for (unsigned l = 1; l < level; l++) {\n      vid = get_vid(level - l, idx);\n      if (std::is_same<ElementType, LabeledElement>::value)\n        his = get_his(level - l, idx);\n      ElementType ele(vid, 0, 0, his);\n      emb.set_element(level - l, ele);\n 
     idx = get_idx(level - l, idx);\n    }\n    ElementType ele0(idx, 0, 0, 0);\n    emb.set_element(0, ele0);\n  }\n};\n\n#endif // EMBEDDING_LIST_HPP_\n"
  },
  {
    "path": "libpangolin/include/pangolin/BfsMining/engine.h",
    "content": "#include \"galois/Galois.h\"\n#include \"pangolin/res_man.h\"\n#include \"pangolin/BfsMining/embedding_list.h\"\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarMineStart(argc, argv, name, desc, url);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  AppMiner miner(k, numThreads);\n  galois::StatTimer Tinitial(\"GraphReadingTime\");\n  Tinitial.start();\n  miner.read_graph(filetype, inputFile);\n  Tinitial.stop();\n  ResourceManager rm;\n  for (unsigned nt = 0; nt < num_trials; nt++) {\n    // std::cout << \"\\nStart running trial \" << nt + 1 << \": \";\n    galois::StatTimer Tinitemb(\"EmbInitTime\");\n    Tinitemb.start();\n    miner.initialize(pattern_filename);\n    Tinitemb.stop();\n\n    galois::StatTimer execTime(\"Timer_0\");\n    execTime.start();\n#ifdef TRIANGLE\n    miner.tc_solver();\n#else\n    miner.solver();\n#endif // TRIANGLE\n    execTime.stop();\n    miner.print_output();\n    miner.clean();\n  }\n  std::cout << \"\\n\\t\" << rm.get_peak_memory() << \"\\n\\n\";\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "libpangolin/include/pangolin/BfsMining/vertex_miner.h",
    "content": "#ifndef VERTEX_MINER_H\n#define VERTEX_MINER_H\n#include \"pangolin/miner.h\"\n#include \"pangolin/ptypes.h\"\n#include \"pangolin/quick_pattern.h\"\n#include \"pangolin/canonical_graph.h\"\n#include \"pangolin/BfsMining/embedding_list.h\"\n\ntemplate <typename ElementTy, typename EmbeddingTy, typename API,\n          bool enable_dag = false, bool is_single = true,\n          bool use_wedge = false, bool use_match_order = false>\nclass VertexMiner : public Miner<ElementTy, EmbeddingTy, enable_dag> {\n  typedef EmbeddingList<ElementTy, EmbeddingTy> EmbeddingListTy;\n\npublic:\n  VertexMiner(unsigned max_sz, int nt, unsigned nb)\n      : Miner<ElementTy, EmbeddingTy, enable_dag>(max_sz, nt), num_blocks(nb) {}\n  virtual ~VertexMiner() {}\n  void init_emb_list() {\n    this->emb_list.init(this->graph, this->max_size, enable_dag);\n  }\n  bool is_single_pattern() { return npatterns == 1; }\n  int get_num_patterns() { return npatterns; }\n  void set_num_patterns(int np = 1) {\n    npatterns = np;\n    accumulators.resize(npatterns);\n    for (int i = 0; i < npatterns; i++)\n      accumulators[i].reset();\n    if (!is_single)\n      for (auto i = 0; i < this->num_threads; i++)\n        qp_localmaps.getLocal(i)->clear();\n  }\n  void clean() {\n    is_wedge.clear();\n    accumulators.clear();\n    qp_map.clear();\n    cg_map.clear();\n    for (auto i = 0; i < this->num_threads; i++)\n      qp_localmaps.getLocal(i)->clear();\n    for (auto i = 0; i < this->num_threads; i++)\n      cg_localmaps.getLocal(i)->clear();\n    this->emb_list.clean();\n  }\n  void initialize(std::string pattern_filename) {\n    galois::on_each([&](unsigned tid, unsigned) {\n      auto& local_counters = *(counters.getLocal(tid));\n      local_counters.resize(npatterns);\n      std::fill(local_counters.begin(), local_counters.end(), 0);\n    });\n    init_emb_list();\n    if (use_match_order) {\n      if (pattern_filename == \"\") {\n        std::cout << \"need specify pattern file 
name using -p\\n\";\n        exit(1);\n      }\n      // unsigned pid = this->read_pattern(pattern_filename);\n      // unsigned pid = this->read_pattern(pattern_filename, \"gr\", true);\n      // std::cout << \"pattern id = \" << pid << \"\\n\";\n      // set_input_pattern(pid);\n    }\n  }\n  void set_input_pattern(unsigned GALOIS_UNUSED(pid)) {\n    // input_pid = pid;\n  }\n  virtual void print_output() {}\n\n  // extension for vertex-induced motif\n  inline void extend_vertex_multi(unsigned level, size_t chunk_begin,\n                                  size_t chunk_end) {\n    auto cur_size = this->emb_list.size();\n    size_t begin = 0, end = cur_size;\n    if (level == 1) {\n      begin    = chunk_begin;\n      end      = chunk_end;\n      cur_size = end - begin;\n      // std::cout << \"\\t chunk_begin = \" << chunk_begin << \", chunk_end \"\n      //          << chunk_end << \"\\n\";\n    }\n    // std::cout << \"\\t number of current embeddings in level \" << level << \": \"\n    // << cur_size << \"\\n\";\n    UintList num_new_emb(cur_size); // TODO: for large graph, wo need UlongList\n    // UlongList num_new_emb(cur_size);\n    galois::do_all(\n        galois::iterate(begin, end),\n        [&](const size_t& pos) {\n          auto& local_counters  = *(counters.getLocal());\n          unsigned n            = level + 1;\n          StrQpMapFreq* qp_lmap = nullptr;\n          if (n >= 4)\n            qp_lmap = qp_localmaps.getLocal();\n          EmbeddingTy emb(n);\n          get_embedding(level, pos, emb);\n          if (n < this->max_size - 1)\n            num_new_emb[pos - begin] = 0;\n          if (n == 3 && this->max_size == 4)\n            emb.set_pid(this->emb_list.get_pid(pos));\n          for (unsigned i = 0; i < n; ++i) {\n            if (!API::toExtend(n, emb, i))\n              continue;\n            auto src = emb.get_vertex(i);\n            for (auto e : this->graph.edges(src)) {\n              auto dst = this->graph.getEdgeDst(e);\n            
  if (API::toAdd(n, this->graph, emb, i, dst)) {\n                if (n < this->max_size - 1) {\n                  num_new_emb[pos - begin]++;\n                } else { // do reduction\n                  if (n < 4) {\n                    unsigned pid =\n                        this->find_motif_pattern_id(n, i, dst, emb, pos);\n                    local_counters[pid] += 1;\n                  } else\n                    quick_reduce(n, i, dst, emb, qp_lmap);\n                }\n              }\n            }\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-alloc\"));\n    if (level == this->max_size - 2) {\n      galois::on_each([&](unsigned tid, unsigned) {\n        auto& local_counters = *(counters.getLocal(tid));\n        for (int i = 0; i < this->npatterns; i++)\n          this->accumulators[i] += local_counters[i];\n      });\n      return;\n    }\n\n    UlongList indices = parallel_prefix_sum<unsigned, Ulong>(num_new_emb);\n    num_new_emb.clear();\n    Ulong new_size = indices.back();\n    this->emb_list.add_level(new_size);\n    if (use_wedge && level == 1 && this->max_size == 4) {\n      is_wedge.resize(this->emb_list.size());\n      std::fill(is_wedge.begin(), is_wedge.end(), 0);\n    }\n    galois::do_all(\n        galois::iterate(begin, end),\n        [&](const size_t& pos) {\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          auto start = indices[pos - begin];\n          auto n     = emb.size();\n          for (unsigned i = 0; i < n; ++i) {\n            if (!API::toExtend(n, emb, i))\n              continue;\n            auto src = emb.get_vertex(i);\n            for (auto e : this->graph.edges(src)) {\n              GNode dst = this->graph.getEdgeDst(e);\n              if (API::toAdd(n, this->graph, emb, i, dst)) {\n                if (!is_single && n == 2 && this->max_size == 4)\n                  this->emb_list.set_pid(start, 
this->find_motif_pattern_id(\n                                                    n, i, dst, emb, start));\n                this->emb_list.set_idx(level + 1, start, pos);\n                this->emb_list.set_vid(level + 1, start++, dst);\n              }\n            }\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-insert\"));\n    indices.clear();\n  }\n\n  // extension for vertex-induced clique\n  inline void extend_vertex_single(unsigned level, size_t chunk_begin,\n                                   size_t chunk_end) {\n    auto cur_size = this->emb_list.size();\n    size_t begin = 0, end = cur_size;\n    if (level == 1) {\n      begin    = chunk_begin;\n      end      = chunk_end;\n      cur_size = end - begin;\n      // std::cout << \"\\t chunk_begin = \" << chunk_begin << \", chunk_end \"\n      //          << chunk_end << \"\\n\";\n    }\n    // std::cout << \"\\t number of current embeddings in level \" << level << \": \"\n    //          << cur_size << \"\\n\";\n    UintList num_new_emb(cur_size);\n    galois::do_all(\n        galois::iterate(begin, end),\n        [&](const size_t& pos) {\n          auto& local_counters = *(counters.getLocal());\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          auto vid                 = this->emb_list.get_vid(level, pos);\n          num_new_emb[pos - begin] = 0;\n          for (auto e : this->graph.edges(vid)) {\n            GNode dst = this->graph.getEdgeDst(e);\n            if (API::toAdd(level + 1, this->graph, emb, level, dst)) {\n              if (level < this->max_size - 2) {\n                num_new_emb[pos - begin]++;\n              } else {\n                local_counters[0] += 1;\n              }\n            }\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-alloc\"));\n\n    if (level == this->max_size - 2) {\n      
galois::on_each([&](unsigned tid, unsigned) {\n        auto& local_counters = *(counters.getLocal(tid));\n        for (int i = 0; i < this->npatterns; i++)\n          this->accumulators[0] += local_counters[0];\n      });\n      return;\n    }\n\n    UlongList indices = parallel_prefix_sum<unsigned, Ulong>(num_new_emb);\n    num_new_emb.clear();\n    Ulong new_size = indices.back();\n    std::cout << \"\\t number of new embeddings: \" << new_size << \"\\n\";\n    this->emb_list.add_level(new_size);\n    galois::do_all(\n        galois::iterate(begin, end),\n        [&](const size_t& pos) {\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          auto vid   = this->emb_list.get_vid(level, pos);\n          auto start = indices[pos - begin];\n          for (auto e : this->graph.edges(vid)) {\n            GNode dst = this->graph.getEdgeDst(e);\n            if (API::toAdd(level + 1, this->graph, emb, level, dst)) {\n              this->emb_list.set_idx(level + 1, start, pos);\n              this->emb_list.set_vid(level + 1, start++, dst);\n            }\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-insert\"));\n    indices.clear();\n  }\n\n  inline void extend_single_ordered(unsigned level, size_t chunk_begin,\n                                    size_t chunk_end) {\n    auto cur_size = this->emb_list.size();\n    size_t begin = 0, end = cur_size;\n    if (level == 1) {\n      begin    = chunk_begin;\n      end      = chunk_end;\n      cur_size = end - begin;\n      // std::cout << \"\\t chunk_begin = \" << chunk_begin << \", chunk_end \" <<\n      // chunk_end << \"\\n\";\n    }\n    // std::cout << \"\\t number of embeddings in level \" << level << \": \" <<\n    // cur_size << \"\\n\";\n    UintList num_new_emb(cur_size);\n\n    galois::do_all(\n        galois::iterate(begin, end),\n        [&](const size_t& pos) {\n          auto& local_counters = 
*(counters.getLocal());\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          num_new_emb[pos - begin] = 0;\n          auto id                  = API::getExtendableVertex(level + 1);\n          auto src                 = emb.get_vertex(id);\n          for (auto e : this->graph.edges(src)) {\n            auto dst = this->graph.getEdgeDst(e);\n            if (API::toAdd(level + 1, this->graph, emb, src, dst)) {\n              if (level < this->max_size - 2) {\n                num_new_emb[pos - begin]++;\n              } else {\n                local_counters[0] += 1;\n              }\n            }\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-alloc\"));\n\n    if (level == this->max_size - 2) {\n      galois::on_each([&](unsigned tid, unsigned) {\n        auto& local_counters = *(counters.getLocal(tid));\n        for (int i = 0; i < this->npatterns; i++)\n          this->accumulators[0] += local_counters[0];\n      });\n      return;\n    }\n\n    UlongList indices = parallel_prefix_sum<unsigned, Ulong>(num_new_emb);\n    num_new_emb.clear();\n    Ulong new_size = indices.back();\n    // std::cout << \"number of new embeddings: \" << new_size << \"\\n\";\n    this->emb_list.add_level(new_size);\n\n    galois::do_all(\n        galois::iterate(begin, end),\n        [&](const size_t& pos) {\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          auto start = indices[pos - begin];\n          auto id    = API::getExtendableVertex(level + 1);\n          auto src   = emb.get_vertex(id);\n          // std::cout << \"current embedding: \" << emb << \"\\n\";\n          // std::cout << \"extending vertex \" << src << \"\\n\";\n          for (auto e : this->graph.edges(src)) {\n            auto dst = this->graph.getEdgeDst(e);\n            if (API::toAdd(level + 1, this->graph, emb, src, dst)) {\n              
this->emb_list.set_idx(level + 1, start, pos);\n              this->emb_list.set_vid(level + 1, start++, dst);\n            }\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-insert\"));\n    indices.clear();\n  }\n\n  inline void extend_ordered(unsigned level, size_t chunk_begin,\n                             size_t chunk_end) {\n    auto cur_size = this->emb_list.size();\n    size_t begin = 0, end = cur_size;\n    if (level == 1) {\n      begin    = chunk_begin;\n      end      = chunk_end;\n      cur_size = end - begin;\n      // std::cout << \"\\t chunk_begin = \" << chunk_begin << \", chunk_end \"\n      //          << chunk_end << \"\\n\";\n    }\n    // std::cout << \"\\t number of current embeddings in level \" << level << \": \"\n    //          << cur_size << \"\\n\";\n    UintList num_new_emb(cur_size);\n    galois::do_all(\n        galois::iterate(begin, end),\n        [&](const size_t& pos) {\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          num_new_emb[pos - begin] = 0;\n          // std::cout << \"current embedding: \" << emb << \"\\n\";\n          for (auto q_edge : this->pattern.edges(level + 1)) {\n            VertexId q_dst   = this->pattern.getEdgeDst(q_edge);\n            VertexId q_order = q_dst;\n            if (q_order < level + 1) {\n              VertexId d_vertex = emb.get_vertex(q_order);\n              for (auto d_edge : this->graph.edges(d_vertex)) {\n                GNode d_dst = this->graph.getEdgeDst(d_edge);\n                if (API::toAddOrdered(level + 1, this->graph, emb, q_order,\n                                      d_dst, this->pattern)) {\n                  if (level < this->max_size - 2)\n                    num_new_emb[pos - begin]++;\n                  else\n                    accumulators[0] += 1;\n                }\n              }\n              break;\n            }\n          }\n        },\n        
galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-alloc\"));\n\n    if (level == this->max_size - 2)\n      return;\n    UlongList indices = parallel_prefix_sum<unsigned, Ulong>(num_new_emb);\n    num_new_emb.clear();\n    Ulong new_size = indices.back();\n    std::cout << \"\\t number of new embeddings: \" << new_size << \"\\n\";\n    this->emb_list.add_level(new_size);\n    galois::do_all(\n        galois::iterate(begin, end),\n        [&](const size_t& pos) {\n          EmbeddingTy emb(level + 1);\n          get_embedding(level, pos, emb);\n          auto start = indices[pos - begin];\n          for (auto q_edge : this->pattern.edges(level + 1)) {\n            VertexId q_dst   = this->pattern.getEdgeDst(q_edge);\n            VertexId q_order = q_dst;\n            if (q_order < level + 1) {\n              VertexId d_vertex = emb.get_vertex(q_order);\n              for (auto d_edge : this->graph.edges(d_vertex)) {\n                GNode d_dst = this->graph.getEdgeDst(d_edge);\n                if (API::toAddOrdered(level + 1, this->graph, emb, q_order,\n                                      d_dst, this->pattern)) {\n                  this->emb_list.set_idx(level + 1, start, pos);\n                  this->emb_list.set_vid(level + 1, start++, d_dst);\n                }\n              }\n              break;\n            }\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"Extending-insert\"));\n    indices.clear();\n  }\n\n  // quick pattern reduction\n  inline void quick_reduce(unsigned n, unsigned i, VertexId dst,\n                           const EmbeddingTy& emb, StrQpMapFreq* qp_lmap) {\n    std::vector<bool> connected;\n    this->get_connectivity(n, i, dst, emb, connected);\n    StrQPattern qp(n + 1, connected);\n    if (qp_lmap->find(qp) != qp_lmap->end()) {\n      (*qp_lmap)[qp] += 1;\n      qp.clean();\n    } else\n      (*qp_lmap)[qp] = 1;\n  }\n  // 
canonical pattern reduction\n  inline void canonical_reduce() {\n    for (auto i = 0; i < this->num_threads; i++)\n      cg_localmaps.getLocal(i)->clear();\n    galois::do_all(\n        galois::iterate(qp_map),\n        [&](auto& element) {\n          StrCgMapFreq* cg_map = cg_localmaps.getLocal();\n          StrCPattern cg(element.first);\n          if (cg_map->find(cg) != cg_map->end())\n            (*cg_map)[cg] += element.second;\n          else\n            (*cg_map)[cg] = element.second;\n          cg.clean();\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::loopname(\"CanonicalReduce\"));\n    qp_map.clear();\n  }\n  inline void merge_qp_map() {\n    qp_map.clear();\n    for (unsigned i = 0; i < qp_localmaps.size(); i++) {\n      StrQpMapFreq qp_lmap = *qp_localmaps.getLocal(i);\n      for (auto element : qp_lmap) {\n        if (qp_map.find(element.first) != qp_map.end())\n          qp_map[element.first] += element.second;\n        else\n          qp_map[element.first] = element.second;\n      }\n    }\n  }\n  inline void merge_cg_map() {\n    cg_map.clear();\n    for (unsigned i = 0; i < cg_localmaps.size(); i++) {\n      StrCgMapFreq cg_lmap = *cg_localmaps.getLocal(i);\n      for (auto element : cg_lmap) {\n        if (cg_map.find(element.first) != cg_map.end())\n          cg_map[element.first] += element.second;\n        else\n          cg_map[element.first] = element.second;\n      }\n    }\n  }\n\n  // Utilities\n  Ulong get_total_count() { return accumulators[0].reduce(); }\n  void printout_motifs() {\n    std::cout << std::endl;\n    if (accumulators.size() == 2) {\n      std::cout << \"\\ttriangles \" << accumulators[0].reduce() << std::endl;\n      std::cout << \"\\twedges    \" << accumulators[1].reduce() << std::endl;\n    } else if (accumulators.size() == 6) {\n      std::cout << \"\\t4-paths --> \" << accumulators[0].reduce() << std::endl;\n      std::cout << \"\\t3-stars --> \" << accumulators[1].reduce() << std::endl;\n      
std::cout << \"\\t4-cycles --> \" << accumulators[2].reduce() << std::endl;\n      std::cout << \"\\ttailed-triangles --> \" << accumulators[3].reduce()\n                << std::endl;\n      std::cout << \"\\tdiamonds --> \" << accumulators[4].reduce() << std::endl;\n      std::cout << \"\\t4-cliques --> \" << accumulators[5].reduce() << std::endl;\n    } else {\n      if (this->max_size < 9) {\n        std::cout << std::endl;\n        for (auto it = cg_map.begin(); it != cg_map.end(); ++it)\n          std::cout << \"{\" << it->first << \"} --> \" << it->second << std::endl;\n      } else {\n        std::cout << std::endl;\n        for (auto it = cg_map.begin(); it != cg_map.end(); ++it)\n          std::cout << it->first << \" --> \" << it->second << std::endl;\n      }\n    }\n    // std::cout << std::endl;\n  }\n  void tc_vertex_solver() { // vertex parallel\n    galois::do_all(\n        galois::iterate(this->graph.begin(), this->graph.end()),\n        [&](const GNode& src) {\n          for (auto e : this->graph.edges(src)) {\n            auto dst = this->graph.getEdgeDst(e);\n            accumulators[0] += this->intersect(src, dst);\n          }\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"TC\"));\n  }\n\n  void tc_solver() { // edge parallel\n    galois::do_all(\n        galois::iterate((size_t)0, this->emb_list.size()),\n        [&](const size_t& id) {\n          auto src = this->emb_list.get_idx(1, id);\n          auto dst = this->emb_list.get_vid(1, id);\n          auto num = this->intersect_dag(src, dst);\n          accumulators[0] += num;\n        },\n        galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n        galois::loopname(\"TC\"));\n  }\n\n  void solver() {\n    size_t num          = this->emb_list.size();\n    size_t chunk_length = (num - 1) / num_blocks + 1;\n    // std::cout << \"number of single-edge embeddings: \" << num << \"\\n\";\n    for (size_t cid = 0; cid < num_blocks; cid++) 
{\n      size_t chunk_begin = cid * chunk_length;\n      size_t chunk_end   = std::min((cid + 1) * chunk_length, num);\n      // size_t cur_size    = chunk_end - chunk_begin;\n      // std::cout << \"Processing the \" << cid << \" chunk (\" << cur_size\n      //          << \" edges) of \" << num_blocks << \" blocks\\n\";\n      unsigned level = 1;\n      while (1) {\n        // this->emb_list.printout_embeddings(level);\n        if (use_match_order) {\n          extend_single_ordered(level, chunk_begin, chunk_end);\n        } else {\n          if (is_single_pattern())\n            extend_vertex_single(level, chunk_begin, chunk_end);\n          else\n            extend_vertex_multi(level, chunk_begin, chunk_end);\n        }\n        if (level == this->max_size - 2)\n          break;\n        level++;\n      }\n      this->emb_list.reset_level();\n    }\n    if (this->max_size >= 5 && !is_single_pattern()) {\n      merge_qp_map();\n      canonical_reduce();\n      merge_cg_map();\n    }\n  }\n\nprivate:\n  unsigned num_blocks;\n  StrQpMapFreq qp_map; // quick patterns map for counting the frequency\n  StrCgMapFreq cg_map; // canonical graph map for couting the frequency\n  LocalStrQpMapFreq qp_localmaps; // quick patterns local map for each thread\n  LocalStrCgMapFreq cg_localmaps; // canonical graph local map for each thread\n  std::vector<BYTE> is_wedge;     // indicate a 3-vertex embedding is a wedge or\n                                  // chain (v0-cntered or v1-centered)\n\n  inline void get_embedding(unsigned level, size_t pos, EmbeddingTy& emb) {\n    auto vid = this->emb_list.get_vid(level, pos);\n    auto idx = this->emb_list.get_idx(level, pos);\n    ElementTy ele(vid);\n    emb.set_element(level, ele);\n    // backward constructing the embedding\n    for (unsigned l = 1; l < level; l++) {\n      auto u = this->emb_list.get_vid(level - l, idx);\n      ElementTy ele(u);\n      emb.set_element(level - l, ele);\n      idx = this->emb_list.get_idx(level - l, 
idx);\n    }\n    ElementTy ele0(idx);\n    emb.set_element(0, ele0);\n  }\n\nprotected:\n  int npatterns;\n  galois::substrate::PerThreadStorage<std::vector<Ulong>> counters;\n  std::vector<UlongAccu> accumulators;\n  EmbeddingListTy emb_list;\n\n  inline unsigned find_motif_pattern_id(unsigned n, unsigned idx, VertexId dst,\n                                        const EmbeddingTy& emb,\n                                        unsigned pos = 0) {\n    unsigned pid = 0;\n    if (n == 2) { // count 3-motifs\n      pid = 1;    // 3-chain\n      if (idx == 0) {\n        if (this->is_connected(emb.get_vertex(1), dst))\n          pid = 0; // triangle\n        else if (use_wedge && this->max_size == 4)\n          is_wedge[pos] = 1; // wedge; used for 4-motif\n      }\n    } else if (n == 3) { // count 4-motifs\n      unsigned num_edges = 1;\n      pid                = emb.get_pid();\n      if (pid == 0) { // extending a triangle\n        for (unsigned j = idx + 1; j < n; j++)\n          if (this->is_connected(emb.get_vertex(j), dst))\n            num_edges++;\n        pid = num_edges + 2; // p3: tailed-triangle; p4: diamond; p5: 4-clique\n      } else {               // extending a 3-chain\n        std::vector<bool> connected(3, false);\n        connected[idx] = true;\n        for (unsigned j = idx + 1; j < n; j++) {\n          if (this->is_connected(emb.get_vertex(j), dst)) {\n            num_edges++;\n            connected[j] = true;\n          }\n        }\n        if (num_edges == 1) {\n          pid             = 0; // p0: 3-path\n          unsigned center = 1;\n          if (use_wedge) {\n            if (is_wedge[pos])\n              center = 0;\n          } else\n            center = this->is_connected(emb.get_vertex(1), emb.get_vertex(2))\n                         ? 
1\n                         : 0;\n          if (idx == center)\n            pid = 1; // p1: 3-star\n        } else if (num_edges == 2) {\n          pid             = 2; // p2: 4-cycle\n          unsigned center = 1;\n          if (use_wedge) {\n            if (is_wedge[pos])\n              center = 0;\n          } else\n            center = this->is_connected(emb.get_vertex(1), emb.get_vertex(2))\n                         ? 1\n                         : 0;\n          if (connected[center])\n            pid = 3; // p3: tailed-triangle\n        } else {\n          pid = 4; // p4: diamond\n        }\n      }\n    } else { // count 5-motif and beyond\n      pid = this->find_motif_pattern_id_eigen(n, idx, dst, emb);\n    }\n    return pid;\n  }\n};\n\n#endif // VERTEX_MINER_HPP_\n"
  },
  {
    "path": "libpangolin/include/pangolin/BfsMining/vertex_miner_api.h",
    "content": "#pragma once\n#include \"pangolin/gtypes.h\"\n\ntemplate <typename EmbeddingTy, bool use_wedge = true>\nclass VertexMinerAPI {\npublic:\n  VertexMinerAPI() {}\n  ~VertexMinerAPI() {}\n  // toExtend\n  static inline bool toExtend(unsigned, const EmbeddingTy&, unsigned) {\n    return true;\n  }\n\n  // toAdd (only add non-automorphisms)\n  static inline bool toAdd(unsigned n, PangolinGraph& g, const EmbeddingTy& emb,\n                           unsigned pos, VertexId dst) {\n    return !is_vertex_automorphism(n, g, emb, pos, dst);\n  }\n\n  static inline bool toAddOrdered(unsigned, PangolinGraph&, const EmbeddingTy&,\n                                  unsigned, VertexId, PangolinGraph&) {\n    return true;\n  }\n\n  // specify which vertex to extend when using matching order\n  static inline unsigned getExtendableVertex(unsigned n) { return n - 1; }\n\n  // given an embedding, return its pattern id (hash value)\n  static inline unsigned getPattern(unsigned, PangolinGraph&, unsigned,\n                                    VertexId, const EmbeddingTy&, unsigned) {\n    return 0;\n  }\n\nprotected:\n  static inline bool is_vertex_automorphism(unsigned n, PangolinGraph& g,\n                                            const EmbeddingTy& emb,\n                                            unsigned idx, VertexId dst) {\n    // unsigned n = emb.size();\n    // the new vertex id should be larger than the first vertex id\n    if (dst <= emb.get_vertex(0))\n      return true;\n    // the new vertex should not already exist in the embedding\n    for (unsigned i = 1; i < n; ++i)\n      if (dst == emb.get_vertex(i))\n        return true;\n    // the new vertex should not already be extended by any previous vertex in\n    // the embedding\n    for (unsigned i = 0; i < idx; ++i)\n      if (is_connected(g, emb.get_vertex(i), dst))\n        return true;\n    // the new vertex id must be larger than any vertex id after its source\n    // vertex in the embedding\n    for 
(unsigned i = idx + 1; i < n; ++i)\n      if (dst < emb.get_vertex(i))\n        return true;\n    return false;\n  }\n  static inline bool is_all_connected_dag(PangolinGraph& g, unsigned dst,\n                                          const EmbeddingTy& emb, unsigned end,\n                                          unsigned start = 0) {\n    assert(end > 0);\n    bool all_connected = true;\n    for (unsigned i = start; i < end; ++i) {\n      unsigned from = emb.get_vertex(i);\n      if (!is_connected_dag(g, dst, from)) {\n        all_connected = false;\n        break;\n      }\n    }\n    return all_connected;\n  }\n  static inline bool is_connected(PangolinGraph& g, unsigned a, unsigned b) {\n    if (g.get_degree(a) == 0 || g.get_degree(b) == 0)\n      return false;\n    unsigned key    = a;\n    unsigned search = b;\n    if (g.get_degree(a) < g.get_degree(b)) {\n      key    = b;\n      search = a;\n    }\n    auto begin = g.edge_begin(search);\n    auto end   = g.edge_end(search);\n    return binary_search(g, key, begin, end);\n  }\n  static inline int is_connected_dag(PangolinGraph& g, unsigned key,\n                                     unsigned search) {\n    if (g.get_degree(search) == 0)\n      return false;\n    auto begin = g.edge_begin(search);\n    auto end   = g.edge_end(search);\n    return binary_search(g, key, begin, end);\n  }\n  static inline bool binary_search(PangolinGraph& g, unsigned key,\n                                   PangolinGraph::edge_iterator begin,\n                                   PangolinGraph::edge_iterator end) {\n    auto l = begin;\n    auto r = end - 1;\n    while (r >= l) {\n      auto mid       = l + (r - l) / 2;\n      unsigned value = g.getEdgeDst(mid);\n      if (value == key)\n        return true;\n      if (value < key)\n        l = mid + 1;\n      else\n        r = mid - 1;\n    }\n    return false;\n  }\n  static inline unsigned find_motif_pattern_id(unsigned n, PangolinGraph& g,\n                                     
          unsigned idx, VertexId dst,\n                                               const EmbeddingTy& emb,\n                                               BYTE* pre_pid,\n                                               unsigned pos = 0) {\n    unsigned pid = 0;\n    if (n == 2) { // count 3-motifs\n      pid = 1;    // 3-chain\n      if (idx == 0) {\n        if (is_connected(g, emb.get_vertex(1), dst))\n          pid = 0; // triangle\n        else if (use_wedge)\n          pre_pid[pos] = 1; // wedge; used for 4-motif\n      }\n    } else if (n == 3) { // count 4-motifs\n      unsigned num_edges = 1;\n      pid                = emb.get_pid();\n      if (pid == 0) { // extending a triangle\n        for (unsigned j = idx + 1; j < n; j++)\n          if (is_connected(g, emb.get_vertex(j), dst))\n            num_edges++;\n        pid = num_edges + 2; // p3: tailed-triangle; p4: diamond; p5: 4-clique\n      } else {               // extending a 3-chain\n        assert(pid == 1);\n        std::vector<bool> connected(3, false);\n        connected[idx] = true;\n        for (unsigned j = idx + 1; j < n; j++) {\n          if (is_connected(g, emb.get_vertex(j), dst)) {\n            num_edges++;\n            connected[j] = true;\n          }\n        }\n        if (num_edges == 1) {\n          pid             = 0; // p0: 3-path\n          unsigned center = 1;\n          if (use_wedge) {\n            if (pre_pid[pos])\n              center = 0;\n          } else\n            center =\n                is_connected(g, emb.get_vertex(1), emb.get_vertex(2)) ? 1 : 0;\n          if (idx == center)\n            pid = 1; // p1: 3-star\n        } else if (num_edges == 2) {\n          pid             = 2; // p2: 4-cycle\n          unsigned center = 1;\n          if (use_wedge) {\n            if (pre_pid[pos])\n              center = 0;\n          } else\n            center =\n                is_connected(g, emb.get_vertex(1), emb.get_vertex(2)) ? 
1 : 0;\n          if (connected[center])\n            pid = 3; // p3: tailed-triangle\n        } else {\n          pid = 4; // p4: diamond\n        }\n      }\n    } else { // count 5-motif and beyond\n             // pid = find_motif_pattern_id_eigen(n, idx, dst, emb);\n    }\n    return pid;\n  }\n};\n"
  },
  {
    "path": "libpangolin/include/pangolin/base_embedding.h",
    "content": "#pragma once\n#include \"pangolin/embedding.h\"\n#include \"bliss/uintseqhash.hh\"\n\n// Basic Vertex-induced embedding\nclass BaseEmbedding : public Embedding<SimpleElement> {\n  friend std::ostream& operator<<(std::ostream& strm, const BaseEmbedding& emb);\n\npublic:\n  BaseEmbedding() {}\n  BaseEmbedding(size_t n) : Embedding(n) {}\n  ~BaseEmbedding() {}\n  inline unsigned get_hash() const {\n    bliss::UintSeqHash h;\n    for (unsigned i = 0; i < size(); ++i)\n      h.update(elements[i].get_vid());\n    return h.get_value();\n  }\n  BaseEmbedding& operator=(const BaseEmbedding& other) {\n    if (this == &other)\n      return *this;\n    elements = other.get_elements();\n    return *this;\n  }\n  inline unsigned get_pid() const { return 0; } // not used\n  inline void set_pid(unsigned) {}              // not used\n  friend bool operator==(const BaseEmbedding& e1, const BaseEmbedding& e2) {\n    return e1.elements == e2.elements;\n  }\n};\n\nnamespace std {\ntemplate <>\nstruct hash<BaseEmbedding> {\n  std::size_t operator()(const BaseEmbedding& emb) const {\n    return std::hash<int>()(emb.get_hash());\n  }\n};\n} // namespace std\n"
  },
  {
    "path": "libpangolin/include/pangolin/canonical_graph.h",
    "content": "#ifndef CANONICAL_GRAPH_HPP_\n#define CANONICAL_GRAPH_HPP_\n/**\n * Code from on below link. Modified under Galois.\n *\n * https://github.com/rstream-system/RStream/\n *\n * Copyright (c) 2018, Kai Wang and the respective contributors\n * All rights reserved.\n * Reused/revised under 3-BSD\n */\n\n#define USE_DOMAIN // use domain support\n\nGALOIS_IGNORE_EXTERNAL_UNUSED_PARAMETERS\n#include \"bliss/graph.hh\"\nGALOIS_END_IGNORE_EXTERNAL_UNUSED_PARAMETERS\n\n#include \"pangolin/embedding.h\"\n#include \"pangolin/edge_type.h\"\n\ntypedef std::unordered_map<VertexId, BYTE> VertexMap;\ntypedef std::vector<bliss::Graph::Vertex> BlissVertexList;\n\ntemplate <typename EmbeddingTy, typename ElementTy>\nclass CanonicalGraph;\ntemplate <typename EmbeddingTy, typename ElementTy>\nstd::ostream& operator<<(std::ostream& strm,\n                         const CanonicalGraph<EmbeddingTy, ElementTy>& cg);\n\ntemplate <typename EmbeddingTy, typename ElementTy>\nclass CanonicalGraph {\n  friend std::ostream&\n  operator<<<>(std::ostream& strm,\n               const CanonicalGraph<EmbeddingTy, ElementTy>& cg);\n\npublic:\n  CanonicalGraph() : number_of_vertices(0), hash_value(0) {}\n  CanonicalGraph(bliss::AbstractGraph* ag,\n                 bool GALOIS_USED_ONLY_IN_DEBUG(is_directed) = false) {\n    assert(!is_directed);\n    construct_cg(ag);\n  }\n  CanonicalGraph(const QuickPattern<EmbeddingTy, ElementTy>& qp,\n                 bool GALOIS_USED_ONLY_IN_DEBUG(is_directed) = false) {\n    assert(!is_directed);\n    bliss::AbstractGraph* ag = turn_abstract(qp);\n    construct_cg(ag);\n  }\n  ~CanonicalGraph() {}\n  int cmp(const CanonicalGraph& other_cg) const {\n    // compare the numbers of vertices\n    if (get_num_vertices() < other_cg.get_num_vertices())\n      return -1;\n    if (get_num_vertices() > other_cg.get_num_vertices())\n      return 1;\n    // compare hash value\n    if (get_hash() < other_cg.get_hash())\n      return -1;\n    if (get_hash() > 
other_cg.get_hash())\n      return 1;\n    // compare edges\n    assert(embedding.size() == other_cg.embedding.size());\n    for (unsigned i = 0; i < embedding.size(); ++i) {\n      const auto& t1  = embedding.get_element(i);\n      const auto& t2  = other_cg.embedding.get_element(i);\n      int cmp_element = t1.cmp(t2);\n      if (cmp_element != 0)\n        return cmp_element;\n    }\n    return 0;\n  }\n  inline unsigned get_hash() const { return hash_value; }\n  inline int get_num_vertices() const { return number_of_vertices; }\n  // operator for map\n  inline bool operator==(const CanonicalGraph& other) const {\n    return cmp(other) == 0;\n  }\n  // inline EmbeddingTy& get_embedding() { return embedding; }\n  inline EmbeddingTy get_embedding() const { return embedding; }\n  inline void set_number_vertices(int num_vertices) {\n    number_of_vertices = num_vertices;\n  }\n  inline void set_hash_value(unsigned int hash) { hash_value = hash; }\n  inline unsigned get_quick_pattern_index(unsigned i) { return qp_idx[i]; }\n  inline unsigned get_id() const { return hash_value; }\n  inline void clean() { embedding.clean(); }\n\nprivate:\n  EmbeddingTy embedding;\n  std::vector<int> qp_idx; // TODO: try gstl::Vector\n  int number_of_vertices;\n  unsigned hash_value;\n  unsigned support;\n  void construct_cg(bliss::AbstractGraph* ag) {\n    number_of_vertices = ag->get_nof_vertices();\n    hash_value         = ag->get_hash();\n    transform_to_embedding(ag);\n  }\n  void transform_to_embedding(bliss::AbstractGraph* ag) {\n    bliss::Graph* graph = (bliss::Graph*)ag;\n    VertexSet set;\n    VertexMap map;\n    EdgeHeap min_heap;\n    BlissVertexList vertices = graph->get_vertices_rstream();\n    VertexId first_src       = init_heapAndset(vertices, min_heap, set);\n    assert(first_src != (VertexId)-1);\n    push_first_element(first_src, map, vertices);\n#ifdef USE_DOMAIN\n    bool is_first_edge = true;\n#endif\n    while (!min_heap.empty()) {\n      Edge edge = 
min_heap.top();\n#ifdef USE_DOMAIN\n      if (is_first_edge) {\n        qp_idx.push_back(edge.src_domain);\n        is_first_edge = false;\n      }\n#endif\n      push_element(edge, map, vertices);\n      min_heap.pop();\n      add_neighbours(edge, min_heap, vertices, set);\n    }\n  }\n  VertexId init_heapAndset(BlissVertexList& vertices, EdgeHeap& min_heap,\n                           VertexSet& set) {\n    for (unsigned i = 0; i < vertices.size(); ++i) {\n      if (!vertices[i].edges.empty()) {\n        for (auto v : vertices[i].edges) {\n#ifdef USE_DOMAIN\n          min_heap.push(Edge(i, v.first, v.second.first, v.second.second));\n#else\n          min_heap.push(Edge(i, v));\n#endif\n        }\n        set.insert(i);\n        return i;\n      }\n    }\n    return -1;\n  }\n  void push_first_element(VertexId first, VertexMap& map,\n                          BlissVertexList& vertices) {\n    map[first] = 0;\n    embedding.push_back(\n        ElementTy(first + 1, (BYTE)0, (BYTE)vertices[first].color, (BYTE)0));\n  }\n  void push_element(Edge& edge, VertexMap& map, BlissVertexList& vertices) {\n    assert(edge.src < edge.dst);\n    if (map.find(edge.src) != map.end()) {\n      embedding.push_back(ElementTy(edge.dst + 1, (BYTE)0,\n                                    (BYTE)vertices[edge.dst].color,\n                                    (BYTE)map[edge.src]));\n#ifdef USE_DOMAIN\n      qp_idx.push_back(edge.dst_domain);\n#endif\n      if (map.find(edge.dst) == map.end()) {\n        int s         = embedding.size() - 1;\n        map[edge.dst] = s;\n      }\n    } else if (map.find(edge.dst) != map.end()) {\n      embedding.push_back(ElementTy(edge.src + 1, (BYTE)0,\n                                    (BYTE)vertices[edge.src].color,\n                                    (BYTE)map[edge.dst]));\n#ifdef USE_DOMAIN\n      qp_idx.push_back(edge.src_domain);\n#endif\n      if (map.find(edge.src) == map.end()) {\n        int s         = embedding.size() - 1;\n        
map[edge.src] = s;\n      }\n    } else {\n      // wrong case\n      std::cout << \"wrong case!!!\" << std::endl;\n      throw std::exception();\n    }\n  }\n  void add_neighbours(Edge& edge, EdgeHeap& min_heap, BlissVertexList& vertices,\n                      VertexSet& set) {\n    add_neighbours(edge.src, min_heap, vertices, set);\n    add_neighbours(edge.dst, min_heap, vertices, set);\n  }\n  void add_neighbours(VertexId srcId, EdgeHeap& min_heap,\n                      BlissVertexList& vertices, VertexSet& set) {\n    if (set.find(srcId) == set.end()) {\n      for (auto v : vertices[srcId].edges) {\n#ifdef USE_DOMAIN\n        VertexId dst = v.first;\n#else\n        VertexId dst = v;\n#endif\n        if (set.find(dst) == set.end()) {\n#ifdef USE_DOMAIN\n          Edge edge(srcId, dst, v.second.first, v.second.second);\n#else\n          Edge edge(srcId, dst);\n#endif\n          edge.swap();\n          min_heap.push(edge);\n        }\n      }\n      set.insert(srcId);\n    }\n  }\n  static void report_aut(void* GALOIS_USED_ONLY_IN_DEBUG(param),\n                         const unsigned GALOIS_UNUSED(n),\n                         const unsigned* GALOIS_UNUSED(aut)) {\n    assert(param);\n    // fprintf((FILE*) param, \"Generator: \");\n    // bliss::print_permutation((FILE*) param, n, aut, 1);\n    // fprintf((FILE*) param, \"\\n\");\n  }\n  bliss::AbstractGraph*\n  turn_abstract(const QuickPattern<EmbeddingTy, ElementTy>& qp) {\n    bliss::AbstractGraph* ag = 0;\n    // get the number of vertices\n    std::unordered_map<VertexId, BYTE> vertices;\n    for (unsigned index = 0; index < qp.get_size(); ++index) {\n      auto element = qp.at(index);\n      if (std::is_same<ElementTy, LabeledElement>::value)\n        vertices[element.get_vid()] = element.get_vlabel();\n      else\n        vertices[element.get_vid()] = 0;\n    }\n    // construct bliss graph\n    const unsigned number_vertices = vertices.size();\n    ag                             = new 
bliss::Graph(vertices.size());\n    // set vertices\n    for (unsigned i = 0; i < number_vertices; ++i)\n      ag->change_color(i, (unsigned)vertices[i + 1]);\n    // read edges\n    assert(qp.get_size() > 1);\n    for (unsigned index = 1; index < qp.get_size(); ++index) {\n      auto element = qp.at(index);\n      VertexId from, to;\n      from = qp.at(element.get_his()).get_vid();\n      to   = element.get_vid();\n      // std::cout << \"Adding edge: \" << from << \" --> \" << to << \"\\n\";\n      ag->add_edge(from - 1, to - 1,\n                   std::make_pair((unsigned)element.get_his(), index));\n    }\n    bliss::Stats stats;\n    const unsigned* cl = ag->canonical_form(\n        stats, &report_aut, stdout); // canonical labeling. This is expensive.\n    bliss::AbstractGraph* cf = ag->permute(cl); // permute to canonical form\n    delete ag;\n    return cf;\n  }\n};\n\ntemplate <typename EmbeddingTy, typename ElementTy>\nstd::ostream& operator<<(std::ostream& strm,\n                         const CanonicalGraph<EmbeddingTy, ElementTy>& cg) {\n  strm << \"{\" << cg.embedding << \"; \" << cg.get_num_vertices() << \"}\";\n  return strm;\n}\n\nnamespace std {\n// template<>\ntemplate <typename EmbeddingTy, typename ElementTy>\nstruct hash<CanonicalGraph<EmbeddingTy, ElementTy>> {\n  std::size_t\n  operator()(const CanonicalGraph<EmbeddingTy, ElementTy>& cg) const {\n    return std::hash<int>()(cg.get_hash());\n  }\n};\n} // namespace std\n#endif // CANONICAL_GRAPH_HPP_\n"
  },
  {
    "path": "libpangolin/include/pangolin/core.h",
    "content": "#pragma once\n\ntypedef struct {\n  unsigned key;\n  unsigned value;\n} keyvalue;\n\nclass bheap {\npublic:\n  unsigned n_max; // max number of nodes.\n  unsigned n;     // number of nodes.\n  unsigned* pt;   // pointers to nodes.\n  keyvalue* kv;   // nodes.\n  bheap() {\n    pt = NULL;\n    kv = NULL;\n  }\n  ~bheap() {\n    if (pt)\n      free(pt);\n    if (kv)\n      free(kv);\n  }\n  void construct(size_t n_max) {\n    n_max = n_max;\n    n     = 0;\n    pt    = (unsigned*)malloc(n_max * sizeof(unsigned));\n    for (unsigned i = 0; i < n_max; i++)\n      pt[i] = (unsigned)-1;\n    kv = (keyvalue*)malloc(n_max * sizeof(keyvalue));\n  }\n  void swap(unsigned i, unsigned j) {\n    keyvalue kv_tmp = kv[i];\n    unsigned pt_tmp = pt[kv_tmp.key];\n    pt[kv[i].key]   = pt[kv[j].key];\n    kv[i]           = kv[j];\n    pt[kv[j].key]   = pt_tmp;\n    kv[j]           = kv_tmp;\n  }\n  void bubble_up(unsigned i) {\n    unsigned j = (i - 1) / 2;\n    while (i > 0) {\n      if (kv[j].value > kv[i].value) {\n        swap(i, j);\n        i = j;\n        j = (i - 1) / 2;\n      } else\n        break;\n    }\n  }\n  void bubble_down() {\n    unsigned i = 0, j1 = 1, j2 = 2, j;\n    while (j1 < n) {\n      j = ((j2 < n) && (kv[j2].value < kv[j1].value)) ? 
j2 : j1;\n      if (kv[j].value < kv[i].value) {\n        swap(i, j);\n        i  = j;\n        j1 = 2 * i + 1;\n        j2 = j1 + 1;\n        continue;\n      }\n      break;\n    }\n  }\n  void insert(keyvalue item) {\n    pt[item.key] = (n)++;\n    kv[n - 1]    = item;\n    bubble_up(n - 1);\n  }\n  void update(unsigned key) {\n    unsigned i = pt[key];\n    if (i != (unsigned)-1) {\n      ((kv[i]).value)--;\n      bubble_up(i);\n    }\n  }\n  keyvalue popmin() {\n    keyvalue min  = kv[0];\n    pt[min.key]   = (unsigned)-1;\n    kv[0]         = kv[--(n)];\n    pt[kv[0].key] = 0;\n    bubble_down();\n    return min;\n  }\n  // Building the heap structure with (key,value)=(node,degree) for each node\n  void mkheap(size_t n, std::vector<VertexId> v) {\n    construct(n);\n    for (size_t i = 0; i < n; i++) {\n      keyvalue item;\n      item.key   = i;\n      item.value = v[i];\n      insert(item);\n    }\n  }\n};\n"
  },
  {
    "path": "libpangolin/include/pangolin/domain_support.h",
    "content": "#ifndef DOMAIN_SUPPORT_H\n#define DOMAIN_SUPPORT_H\n/**\n * Code ported from on below link. Modified under Galois.\n *\n * https://github.com/qcri/Arabesque\n *\n * Copyright (c) 2015 Qatar Computing Research Institute\n * All rights reserved.\n * Reused/revised under 3-BSD\n */\n\n#include \"pangolin/gtypes.h\"\n\nclass DomainSupport {\npublic:\n  DomainSupport() {\n    num_domains    = 0;\n    enough_support = false;\n  }\n  DomainSupport(unsigned n) {\n    num_domains    = n;\n    enough_support = false;\n    domains_reached_support.resize(n);\n    std::fill(domains_reached_support.begin(), domains_reached_support.end(),\n              0);\n    domain_sets.resize(n);\n  }\n  ~DomainSupport() {}\n  void set_threshold(unsigned minsup) {\n    minimum_support = minsup;\n    // for (auto i = 0; i < num_domains; i++)\n    //\tdomain_sets[i].get_allocator().allocate(minsup+1);\n  }\n  void clean() {\n    domains_reached_support.clear();\n    domain_sets.clear();\n  }\n  void resize(unsigned n) {\n    num_domains    = n;\n    enough_support = false;\n    domains_reached_support.resize(n);\n    std::fill(domains_reached_support.begin(), domains_reached_support.end(),\n              0);\n    domain_sets.resize(n);\n  }\n  bool is_frequent() { return enough_support; }\n  void set_frequent() {\n    if (get_support())\n      enough_support = true;\n  }\n  bool has_domain_reached_support(int i) {\n    assert(i < num_domains);\n    return domains_reached_support[i];\n    // return enough_support || domains_reached_support[i];\n  }\n  void set_domain_frequent(int i) {\n    domains_reached_support[i] = 1;\n    domain_sets[i].clear();\n  }\n  void add_vertex(int i, VertexId vid) {\n    domain_sets[i].insert(vid);\n    if (domain_sets[i].size() >= minimum_support)\n      set_domain_frequent(i);\n  }\n  bool add_vertices(int i, IntSet& vertices) {\n    domains_reached_support[i] = 0;\n    domain_sets[i].insert(vertices.begin(), vertices.end());\n    if 
(domain_sets[i].size() >= minimum_support) {\n      set_domain_frequent(i);\n      return true;\n    }\n    return false;\n  }\n  // counting the minimal image based support\n  inline bool get_support() {\n    return std::all_of(domains_reached_support.begin(),\n                       domains_reached_support.end(), [](bool v) { return v; });\n  }\n\n  // private:\n  unsigned minimum_support;\n  int num_domains;\n  bool enough_support;\n  BoolVec domains_reached_support;\n  IntSets domain_sets;\n};\n\n// typedef galois::gstl::Map<InitPattern, DomainSupport> InitMap;\ntypedef std::map<InitPattern, DomainSupport*> InitMap;\ntypedef galois::substrate::PerThreadStorage<InitMap> InitMaps;\n#endif\n"
  },
  {
    "path": "libpangolin/include/pangolin/edge_embedding.h",
    "content": "#pragma once\n#include \"pangolin/embedding.h\"\n\n// Edge induced embedding\ntemplate <typename ElementTy>\nclass EdgeInducedEmbedding;\ntemplate <typename ElementTy>\nstd::ostream& operator<<(std::ostream& strm,\n                         const EdgeInducedEmbedding<ElementTy>& emb);\n\ntemplate <typename ElementTy>\nclass EdgeInducedEmbedding : public Embedding<ElementTy> {\n  friend std::ostream& operator<<<>(std::ostream& strm,\n                                    const EdgeInducedEmbedding<ElementTy>& emb);\n\npublic:\n  EdgeInducedEmbedding() { qp_id = 0xFFFFFFFF; }\n  EdgeInducedEmbedding(size_t n) : Embedding<ElementTy>(n) {}\n  ~EdgeInducedEmbedding() {}\n  void set_qpid(unsigned i) { qp_id = i; }    // set the quick pattern id\n  unsigned get_qpid() const { return qp_id; } // get the quick pattern id\nprivate:\n  unsigned qp_id; // quick pattern id\n};\n\ntemplate <typename ElementTy>\nstd::ostream& operator<<(std::ostream& strm,\n                         const EdgeInducedEmbedding<ElementTy>& emb) {\n  if (emb.empty()) {\n    strm << \"(empty)\";\n    return strm;\n  }\n  strm << \"(\";\n  for (unsigned index = 0; index < emb.size() - 1; ++index)\n    std::cout << emb.get_element(index) << \", \";\n  std::cout << emb.get_element(emb.size() - 1);\n  strm << \")\";\n  return strm;\n}\n\ntypedef EdgeInducedEmbedding<LabeledElement> EdgeEmbedding;\n"
  },
  {
    "path": "libpangolin/include/pangolin/edge_type.h",
    "content": "#pragma once\n#include \"pangolin/types.h\"\n\nstruct Edge {\n  VertexId src;\n  VertexId dst;\n#ifdef USE_DOMAIN\n  unsigned src_domain;\n  unsigned dst_domain;\n  Edge(VertexId _src, VertexId _dst, unsigned _src_domain, unsigned _dst_domain)\n      : src(_src), dst(_dst), src_domain(_src_domain), dst_domain(_dst_domain) {\n  }\n#endif\n  Edge(VertexId _src, VertexId _dst) : src(_src), dst(_dst) {}\n  Edge() : src(0), dst(0) {}\n  ~Edge() {}\n  std::string toString() {\n    return \"(\" + std::to_string(src) + \", \" + std::to_string(dst) + \")\";\n  }\n  std::string to_string() const {\n    std::stringstream ss;\n    ss << \"e(\" << src << \",\" << dst << \")\";\n    return ss.str();\n  }\n  void swap() {\n    if (src > dst) {\n      VertexId tmp = src;\n      src          = dst;\n      dst          = tmp;\n#ifdef USE_DOMAIN\n      unsigned domain = src_domain;\n      src_domain      = dst_domain;\n      dst_domain      = domain;\n#endif\n    }\n  }\n};\n\nclass EdgeComparator {\npublic:\n  int operator()(const Edge& oneEdge, const Edge& otherEdge) {\n    if (oneEdge.src == otherEdge.src) {\n      return oneEdge.dst > otherEdge.dst;\n    } else {\n      return oneEdge.src > otherEdge.src;\n    }\n  }\n};\n\ntypedef std::pair<VertexId, VertexId> OrderedEdge;\ntypedef std::priority_queue<Edge, std::vector<Edge>, EdgeComparator> EdgeHeap;\n"
  },
  {
    "path": "libpangolin/include/pangolin/element.h",
    "content": "// defines the embedding element classes\n// LabeledElement: vertex_id, history_info, vertex_label, edge_label. Used for\n// FSM. StructuralElement: vertex_id, history_info. Used for Motifs.\n// SimpleElement: vertex_id. Used for KCL and TC.\n#ifndef ELEMENT_HPP_\n#define ELEMENT_HPP_\n/**\n * Code from on below link. Modified under Galois.\n *\n * https://github.com/rstream-system/RStream/\n *\n * Copyright (c) 2018, Kai Wang and the respective contributors\n * All rights reserved.\n * Reused/revised under 3-BSD\n */\n\n#include \"pangolin/gtypes.h\"\n\n// This is the data structure used in RStream.\n// Each element contains 8 bytes, first 4 bytes is vertex id,\n// second 4 bytes contains edge label(1byte) + vertex label(1byte) + history\n// info(1byte). History info is used to record subgraph structure.\nstruct LabeledElement {\nprotected:\n  VertexId vertex_id;\n  BYTE key_index;\n  BYTE edge_label;\n  BYTE vertex_label;\n  BYTE history_info;\n\npublic:\n  LabeledElement() {}\n  LabeledElement(VertexId _vertex_id)\n      : vertex_id(_vertex_id), key_index(0), edge_label(0), vertex_label(0),\n        history_info(0) {}\n  LabeledElement(VertexId _vertex_id, BYTE _history)\n      : vertex_id(_vertex_id), key_index(0), edge_label(0), vertex_label(0),\n        history_info(_history) {}\n  LabeledElement(VertexId _vertex_id, BYTE _edge_label, BYTE _vertex_label)\n      : vertex_id(_vertex_id), key_index(0), edge_label(_edge_label),\n        vertex_label(_vertex_label), history_info(0) {}\n  LabeledElement(VertexId _vertex_id, BYTE _edge_label, BYTE _vertex_label,\n                 BYTE _history)\n      : vertex_id(_vertex_id), key_index(0), edge_label(_edge_label),\n        vertex_label(_vertex_label), history_info(_history) {}\n  LabeledElement(VertexId _vertex_id, BYTE _key_index, BYTE _edge_label,\n                 BYTE _vertex_label, BYTE _history)\n      : vertex_id(_vertex_id), key_index(_key_index), edge_label(_edge_label),\n        
vertex_label(_vertex_label), history_info(_history) {}\n  ~LabeledElement() = default;\n  inline void set_vertex_id(VertexId new_id) { vertex_id = new_id; }\n  inline void set_history_info(BYTE his) { history_info = his; }\n  inline void set_vertex_label(BYTE lab) { vertex_label = lab; }\n  inline int cmp(const LabeledElement& other) const {\n    // compare vertex id\n    if (vertex_id < other.vertex_id)\n      return -1;\n    if (vertex_id > other.vertex_id)\n      return 1;\n    // compare history info\n    if (history_info < other.history_info)\n      return -1;\n    if (history_info > other.history_info)\n      return 1;\n    // compare vertex label\n    if (vertex_label < other.vertex_label)\n      return -1;\n    if (vertex_label > other.vertex_label)\n      return 1;\n    // compare edge label\n    if (edge_label < other.edge_label)\n      return -1;\n    if (edge_label > other.edge_label)\n      return 1;\n    // compare index\n    if (key_index < other.key_index)\n      return -1;\n    if (key_index > other.key_index)\n      return 1;\n    return 0;\n  }\n  VertexId get_vid() const { return vertex_id; }\n  BYTE get_key() const { return key_index; }\n  BYTE get_elabel() const { return edge_label; }\n  BYTE get_vlabel() const { return vertex_label; }\n  BYTE get_his() const { return history_info; }\n  bool has_history() { return true; }\n  friend std::ostream& operator<<(std::ostream& strm,\n                                  const LabeledElement& element) {\n    strm << \"[\" << element.get_vid()\n         << \", \" //<< (int)element.get_key() << \", \" <<\n                 //(int)element.get_elabel() << \", \"\n         << (int)element.get_vlabel() << \", \" << (int)element.get_his() << \"]\";\n    return strm;\n  }\n};\n\nstruct StructuralElement {\nprotected:\n  VertexId vertex_id;\n  BYTE history_info;\n\npublic:\n  StructuralElement() {}\n  StructuralElement(VertexId _vertex_id)\n      : vertex_id(_vertex_id), history_info(0) {}\n  
StructuralElement(VertexId _vertex_id, BYTE _history)\n      : vertex_id(_vertex_id), history_info(_history) {}\n  StructuralElement(VertexId _vertex_id, BYTE, BYTE, BYTE _history)\n      : vertex_id(_vertex_id), history_info(_history) {}\n  StructuralElement(VertexId _vertex_id, BYTE, BYTE, BYTE, BYTE _history)\n      : vertex_id(_vertex_id), history_info(_history) {}\n  ~StructuralElement() = default;\n  inline void set_vertex_id(VertexId new_id) { vertex_id = new_id; }\n  inline void set_history_info(BYTE his) { history_info = his; }\n  inline void set_vertex_label(BYTE) {}\n  inline int cmp(const StructuralElement& other) const {\n    // compare vertex id\n    if (vertex_id < other.vertex_id)\n      return -1;\n    if (vertex_id > other.vertex_id)\n      return 1;\n    // compare history info\n    if (history_info < other.history_info)\n      return -1;\n    if (history_info > other.history_info)\n      return 1;\n    return 0;\n  }\n  VertexId get_vid() const { return vertex_id; }\n  BYTE get_his() const { return history_info; }\n  BYTE get_vlabel() const { return 0; }\n  BYTE get_key() const { return 0; }\n  bool has_history() { return true; }\n  friend std::ostream& operator<<(std::ostream& strm,\n                                  const StructuralElement& element) {\n    strm << \"[\" << element.get_vid() << \", \" << (int)element.get_his() << \"]\";\n    return strm;\n  }\n};\n\n// typedef unsigned SimpleElement;\nstruct SimpleElement {\nprotected:\n  VertexId vertex_id;\n\npublic:\n  SimpleElement() : vertex_id(0) {}\n  SimpleElement(VertexId _vertex_id) : vertex_id(_vertex_id) {}\n  SimpleElement(VertexId _vertex_id, BYTE, BYTE, BYTE)\n      : vertex_id(_vertex_id) {}\n  SimpleElement(VertexId _vertex_id, BYTE, BYTE, BYTE, BYTE)\n      : vertex_id(_vertex_id) {}\n  ~SimpleElement() = default;\n  inline void set_vertex_id(VertexId new_id) { vertex_id = new_id; }\n  inline void set_history_info(BYTE) {}\n  inline void set_vertex_label(BYTE) {}\n  VertexId 
get_vid() const { return vertex_id; }\n  BYTE get_his() const { return 0; }\n  BYTE get_key() const { return 0; }\n  BYTE get_vlabel() const { return 0; }\n  bool has_history() { return false; }\n  inline int cmp(const SimpleElement& other) const {\n    if (vertex_id < other.get_vid())\n      return -1;\n    if (vertex_id > other.get_vid())\n      return 1;\n    return 0;\n  }\n  friend bool operator==(const SimpleElement& e1, const SimpleElement& e2) {\n    return e1.get_vid() == e2.get_vid();\n  }\n  friend std::ostream& operator<<(std::ostream& strm,\n                                  const SimpleElement& element) {\n    strm << \"[\" << element.get_vid() << \"]\";\n    return strm;\n  }\n};\n\n#endif\n"
  },
  {
    "path": "libpangolin/include/pangolin/embedding.h",
    "content": "#ifndef EMBEDDING_HPP_\n#define EMBEDDING_HPP_\n\n// bliss headers\n//#include \"bliss/defs.hh\"\n//#include \"bliss/utils.hh\"\n//#include \"bliss/bignum.hh\"\n\n#include \"pangolin/element.h\"\n\ntemplate <typename ElementTy>\nclass Embedding {\n  // using iterator = typename std::vector<ElementTy>::iterator;\n  using iterator = typename galois::gstl::Vector<ElementTy>::iterator;\n\npublic:\n  Embedding() {}\n  Embedding(size_t n) { elements.resize(n); }\n  Embedding(const Embedding& emb) { elements = emb.elements; }\n  ~Embedding() { elements.clear(); }\n  VertexId get_vertex(unsigned i) const { return elements[i].get_vid(); }\n  BYTE get_history(unsigned i) const { return elements[i].get_his(); }\n  BYTE get_label(unsigned i) const { return elements[i].get_vlabel(); }\n  BYTE get_key(unsigned i) const { return elements[i].get_key(); }\n  bool empty() const { return elements.empty(); }\n  iterator begin() { return elements.begin(); }\n  iterator end() { return elements.end(); }\n  iterator insert(iterator pos, const ElementTy& value) {\n    return elements.insert(pos, value);\n  }\n  void push_back(ElementTy ele) { elements.push_back(ele); }\n  void pop_back() { elements.pop_back(); }\n  ElementTy& back() { return elements.back(); }\n  const ElementTy& back() const { return elements.back(); }\n  size_t size() const { return elements.size(); }\n  void resize(size_t n) { elements.resize(n); }\n  ElementTy* data() { return elements.data(); }\n  const ElementTy* data() const { return elements.data(); }\n  ElementTy get_element(unsigned i) const { return elements[i]; }\n  void set_element(unsigned i, ElementTy& ele) { elements[i] = ele; }\n  void set_vertex(unsigned i, VertexId vid) { elements[i].set_vertex_id(vid); }\n  // std::vector<ElementTy> get_elements() const { return elements; }\n  galois::gstl::Vector<ElementTy> get_elements() const { return elements; }\n  void clean() { elements.clear(); }\n\nprotected:\n  // std::vector<ElementTy> 
elements;\n  galois::gstl::Vector<ElementTy> elements;\n};\n\n#endif // EMBEDDING_HPP_\n"
  },
  {
    "path": "libpangolin/include/pangolin/embedding_queue.h",
    "content": "#include <iostream>\n#include \"galois/Bag.h\"\n\n// Embedding queue: AoS structure\n// print out the embeddings in the task queue\ntemplate <typename EmbeddingTy>\nclass EmbeddingQueue : public galois::InsertBag<EmbeddingTy> {\npublic:\n  void printout_embeddings(int level, bool verbose = false) {\n    int num_embeddings = std::distance(this->begin(), this->end());\n    std::cout << \"Number of embeddings in level \" << level << \": \"\n              << num_embeddings << std::endl;\n    if (verbose)\n      for (auto emb : *this)\n        std::cout << emb << \"\\n\";\n  }\n  void clean() {\n    for (auto emb : *this)\n      emb.clean();\n    this->clear();\n  }\n};\n"
  },
  {
    "path": "libpangolin/include/pangolin/equivalence.h",
    "content": "#ifndef EQUIVALENCE_HPP_\n#define EQUIVALENCE_HPP_\n/**\n * Code from on below link. Modified under Galois.\n *\n * https://github.com/qcri/Arabesque\n *\n * Copyright (c) 2015 Qatar Computing Research Institute\n * All rights reserved.\n * Reused/revised under 3-BSD\n */\n#include \"pangolin/types.h\"\n\nclass VertexPositionEquivalences {\n  friend std::ostream& operator<<(std::ostream& strm,\n                                  const VertexPositionEquivalences& equ);\n\npublic:\n  VertexPositionEquivalences() { numVertices = 0; }\n  ~VertexPositionEquivalences() {}\n  void clear() {\n    for (unsigned i = 0; i < equivalences.size(); ++i)\n      equivalences[i].clear();\n  }\n  void set_size(unsigned n) {\n    if (numVertices != n) {\n      equivalences.resize(n);\n      numVertices = n;\n    }\n  }\n  void add_equivalence(unsigned pos1, unsigned pos2) {\n    equivalences[pos1].insert(pos2);\n  }\n  UintSet get_equivalent_set(unsigned pos) const { return equivalences[pos]; }\n  void propagate_equivalences() {\n    for (unsigned i = 0; i < numVertices; ++i) {\n      UintSet currentEquivalences = equivalences[i];\n      for (auto equivalentPosition : currentEquivalences) {\n        if (equivalentPosition == i)\n          continue;\n        // equivalences[equivalentPosition];\n      }\n    }\n  }\n  unsigned get_size() const { return numVertices; }\n  bool empty() const { return numVertices == 0; }\n\nprivate:\n  UintSets equivalences;\n  unsigned numVertices;\n};\n#endif // EQUIVALENCE_HPP_\n"
  },
  {
    "path": "libpangolin/include/pangolin/gtypes.h",
    "content": "#pragma once\n// Galois supported types\n#include \"pangolin/types.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/substrate/SimpleLock.h\"\n\n#ifndef LARGE_SIZE\ntypedef galois::gstl::Vector<BYTE> ByteList;\ntypedef galois::gstl::Vector<unsigned> UintList;\ntypedef galois::gstl::Vector<Ulong> UlongList;\ntypedef galois::gstl::Vector<VertexId> VertexList;\n#endif\n\ntypedef galois::gstl::Set<VertexId> VertexSet;\ntypedef galois::substrate::PerThreadStorage<UintList> Lists;\ntypedef galois::substrate::PerThreadStorage<unsigned> Counts;\n\ntypedef galois::GAccumulator<unsigned> UintAccu;\ntypedef galois::GAccumulator<uint64_t> UlongAccu;\ntypedef galois::substrate::PerThreadStorage<UintMap> LocalUintMap;\n\n// typedef galois::gstl::Map<unsigned, unsigned> FreqMap;\n// typedef galois::gstl::UnorderedMap<unsigned, bool> DomainMap;\n\n// use Galois memory allocator for domain support\ntypedef galois::gstl::Set<int> IntSet;\ntypedef galois::gstl::Vector<IntSet> IntSets;\n// typedef std::set<int> IntSet;\n// typedef std::vector<IntSet> IntSets;\n\nclass PangolinGraph\n    : public galois::graphs::LC_CSR_Graph<uint32_t, void>::with_numa_alloc<\n          true>::type ::with_no_lockable<true>::type {\npublic:\n  galois::gstl::Vector<uint32_t> degrees;\n  void degree_counting() {\n    degrees.resize(numNodes);\n    galois::do_all(\n        galois::iterate(begin(), end()),\n        [&](auto v) {\n          degrees[v] = std::distance(this->edge_begin(v), this->edge_end(v));\n        },\n        galois::loopname(\"DegreeCounting\"));\n  }\n  uint32_t get_degree(uint32_t n) { return degrees[n]; }\n};\n\ntypedef PangolinGraph::GraphNode GNode;\n"
  },
  {
    "path": "libpangolin/include/pangolin/mgraph.h",
    "content": "#pragma once\n#include <set>\n#include <string>\n#include <vector>\n#include <sstream>\n#include <fstream>\n#include <iostream>\n#include <algorithm>\n#include \"pangolin/types.h\"\n\nstruct MEdge {\n  IndexT src;\n  IndexT dst;\n  ValueT elabel;\n  MEdge() : src(0), dst(0), elabel(0) {}\n  MEdge(IndexT from, IndexT to, ValueT el) : src(from), dst(to), elabel(el) {}\n  std::string to_string() const {\n    std::stringstream ss;\n    ss << \"e(\" << src << \",\" << dst << \",\" << elabel << \")\";\n    return ss.str();\n  }\n};\ntypedef std::vector<MEdge> MEdgeList;\n\nclass MGraph {\npublic:\n  // MEdgeList el;\n  MGraph() : need_dag(false), symmetrize_(false), directed_(false) {}\n  MGraph(bool dag) : need_dag(dag), symmetrize_(false), directed_(false) {}\n  void clean() {\n    el.clear();\n    delete[] rowptr_;\n    delete[] colidx_;\n    delete[] weight_;\n    degrees.clear();\n    labels_.clear();\n    vertices.clear();\n  }\n  IndexT* out_rowptr() const { return rowptr_; }\n  IndexT* out_colidx() const { return colidx_; }\n  ValueT* labels() { return labels_.data(); }\n  ValueT get_label(IndexT n) { return labels_[n]; }\n  IndexT get_offset(IndexT n) { return rowptr_[n]; }\n  IndexT get_dest(IndexT n) { return colidx_[n]; }\n  ValueT get_weight(IndexT n) { return weight_[n]; }\n  unsigned get_max_degree() { return max_degree; }\n  unsigned out_degree(IndexT n) const { return rowptr_[n + 1] - rowptr_[n]; }\n  bool directed() const { return directed_; }\n  size_t num_vertices() const { return num_vertices_; }\n  size_t num_edges() const { return num_edges_; }\n\n  void read_txt(const char* filename, bool symmetrize = true) {\n    std::ifstream is;\n    is.open(filename, std::ios::in);\n    char line[1024];\n    std::vector<std::string> result;\n    std::set<std::pair<IndexT, IndexT>> edge_set;\n    // clear();\n    while (true) {\n      unsigned pos = is.tellg();\n      if (!is.getline(line, 1024))\n        break;\n      result.clear();\n      
split(line, result);\n      if (result.empty()) {\n      } else if (result[0] == \"t\") {\n        if (!labels_.empty()) { // use as delimiter\n          is.seekg(pos, std::ios_base::beg);\n          break;\n        } else {\n        }\n      } else if (result[0] == \"v\" && result.size() >= 3) {\n        unsigned id = atoi(result[1].c_str());\n        labels_.resize(id + 1);\n        labels_[id] = atoi(result[2].c_str());\n      } else if (result[0] == \"e\" && result.size() >= 4) {\n        IndexT src    = atoi(result[1].c_str());\n        IndexT dst    = atoi(result[2].c_str());\n        ValueT elabel = atoi(result[3].c_str());\n        assert(labels_.size() > src && labels_.size() > dst);\n        if (src == dst)\n          continue; // remove self-loop\n        if (edge_set.find(std::pair<IndexT, IndexT>(src, dst)) ==\n            edge_set.end()) {\n          edge_set.insert(std::pair<IndexT, IndexT>(src, dst));\n          el.push_back(MEdge(src, dst, elabel));\n          if (symmetrize) {\n            edge_set.insert(std::pair<IndexT, IndexT>(dst, src));\n            el.push_back(MEdge(dst, src, elabel));\n          }\n        }\n      }\n    }\n    is.close();\n    num_vertices_   = labels_.size();\n    auto num_labels = count_unique_labels();\n    std::cout << \"Number of unique vertex label values: \" << num_labels\n              << std::endl;\n    num_edges_ = el.size();\n    if (!directed_)\n      symmetrize_ = false; // no need to symmetrize undirected graph\n    MakeGraphFromEL();\n  }\n  void read_adj(const char* filename) {\n    FILE* fd = fopen(filename, \"r\");\n    assert(fd != NULL);\n    char buf[2048];\n    unsigned size = 0, maxsize = 0;\n    while (fgets(buf, 2048, fd) != NULL) {\n      auto len = strlen(buf);\n      size += len;\n      if (buf[len - 1] == '\\n') {\n        maxsize = std::max(size, maxsize);\n        size    = 0;\n      }\n    }\n    fclose(fd);\n\n    std::ifstream is;\n    is.open(filename, std::ios::in);\n    // char 
line[1024];\n    char* line = new char[maxsize + 1];\n    std::vector<std::string> result;\n    while (is.getline(line, maxsize + 1)) {\n      result.clear();\n      split(line, result);\n      IndexT src = atoi(result[0].c_str());\n      labels_.resize(src + 1);\n      labels_[src]  = atoi(result[1].c_str());\n      ValueT elabel = 0;\n      std::set<std::pair<IndexT, ValueT>> neighbors;\n      for (size_t i = 2; i < result.size(); i++) {\n        IndexT dst = atoi(result[i].c_str());\n        if (src == dst)\n          continue; // remove self-loop\n        // elabel = atoi(result[i].c_str());\n        neighbors.insert(\n            std::pair<IndexT, ValueT>(dst, elabel)); // remove redundant edge\n      }\n      for (auto it = neighbors.begin(); it != neighbors.end(); ++it)\n        el.push_back(MEdge(src, it->first, it->second));\n    }\n    is.close();\n    num_vertices_   = labels_.size();\n    auto num_labels = count_unique_labels();\n    std::cout << \"Number of unique vertex label values: \" << num_labels\n              << std::endl;\n    num_edges_ = el.size();\n    if (!directed_)\n      symmetrize_ = false; // no need to symmetrize undirected graph\n    MakeGraphFromEL();\n  }\n  void read_mtx(const char* filename, bool symmetrize = false) {\n    std::ifstream in;\n    in.open(filename, std::ios::in);\n    std::string start, object, format, field, symmetry, line;\n    in >> start >> object >> format >> field >> symmetry >> std::ws;\n    if (start != \"%%MatrixMarket\") {\n      std::cout << \".mtx file did not start with %%MatrixMarket\" << std::endl;\n      std::exit(-21);\n    }\n    if ((object != \"matrix\") || (format != \"coordinate\")) {\n      std::cout << \"only allow matrix coordinate format for .mtx\" << std::endl;\n      std::exit(-22);\n    }\n    if (field == \"complex\") {\n      std::cout << \"do not support complex weights for .mtx\" << std::endl;\n      std::exit(-23);\n    }\n    bool read_weights;\n    if (field == \"pattern\") {\n   
   read_weights = false;\n    } else if ((field == \"real\") || (field == \"double\") ||\n               (field == \"integer\")) {\n      read_weights = true;\n    } else {\n      std::cout << \"unrecognized field type for .mtx\" << std::endl;\n      std::exit(-24);\n    }\n    bool undirected;\n    if (symmetry == \"symmetric\") {\n      undirected = true;\n    } else if ((symmetry == \"general\") || (symmetry == \"skew-symmetric\")) {\n      undirected = false;\n    } else {\n      std::cout << \"unsupported symmetry type for .mtx\" << std::endl;\n      std::exit(-25);\n    }\n    while (true) {\n      char c = in.peek();\n      if (c == '%') {\n        in.ignore(200, '\\n');\n      } else {\n        break;\n      }\n    }\n    size_t m, n, nonzeros;\n    in >> m >> n >> nonzeros >> std::ws;\n    if (m != n) {\n      std::cout << m << \" \" << n << \" \" << nonzeros << std::endl;\n      std::cout << \"matrix must be square for .mtx\" << std::endl;\n      std::exit(-26);\n    }\n    while (std::getline(in, line)) {\n      std::istringstream edge_stream(line);\n      IndexT u;\n      edge_stream >> u;\n      if (read_weights) {\n        IndexT v;\n        edge_stream >> v;\n        el.push_back(MEdge(u - 1, v - 1, 1));\n        if (symmetrize)\n          el.push_back(MEdge(v - 1, u - 1, 1));\n      } else {\n        IndexT v;\n        edge_stream >> v;\n        el.push_back(MEdge(u - 1, v - 1, 1));\n        if (symmetrize)\n          el.push_back(MEdge(v - 1, u - 1, 1));\n      }\n    }\n    in.close();\n    labels_.resize(m);\n    directed_ = !undirected;\n    if (undirected)\n      symmetrize_ = false; // no need to symmetrize undirected graph\n    for (size_t i = 0; i < m; i++) {\n      labels_[i] = rand() % 10 + 1;\n    }\n    num_vertices_ = m;\n    num_edges_    = el.size();\n    MakeGraphFromEL();\n  }\n  void read_gr(PangolinGraph& g) {\n    num_vertices_ = g.size();\n    for (auto it = g.begin(); it != g.end(); it++) {\n      GNode src = *it;\n      for 
(auto e : g.edges(src)) {\n        GNode dst = g.getEdgeDst(e);\n        el.push_back(MEdge(src, dst, 1));\n      }\n    }\n    assert(el.size() == g.sizeEdges());\n    num_edges_ = el.size();\n    labels_.resize(num_vertices_);\n    for (size_t i = 0; i < num_vertices_; i++) {\n      labels_[i] = g.getData(i);\n    }\n    MakeGraphFromEL();\n  }\n  void print_graph() {\n    if (directed_)\n      std::cout << \"directed graph\\n\";\n    else\n      std::cout << \"undirected graph\\n\";\n    for (size_t n = 0; n < num_vertices_; n++) {\n      IndexT row_begin = rowptr_[n];\n      IndexT row_end   = rowptr_[n + 1];\n      std::cout << \"vertex \" << n << \": label = \" << labels_[n]\n                << \" edgelist = [ \";\n      for (IndexT offset = row_begin; offset < row_end; offset++) {\n        IndexT dst = colidx_[offset];\n        std::cout << dst << \" \";\n      }\n      std::cout << \"]\" << std::endl;\n    }\n  }\n\nprivate:\n  MEdgeList el;\n  bool need_dag;\n  bool symmetrize_; // whether to symmetrize a directed graph\n  bool directed_;\n  size_t num_vertices_;\n  size_t num_edges_;\n  IndexT* rowptr_;\n  IndexT* colidx_;\n  ValueT* weight_;\n  unsigned max_degree;\n  std::vector<IndexT> degrees;\n  std::vector<ValueT> labels_;\n  std::vector<std::vector<MEdge>> vertices;\n\n  unsigned count_unique_labels() {\n    std::set<ValueT> s;\n    unsigned res = 0;\n    for (size_t i = 0; i < labels_.size(); i++) {\n      if (s.find(labels_[i]) == s.end()) {\n        s.insert(labels_[i]);\n        res++;\n      }\n    }\n    return res;\n  }\n  void CountDegrees(const MEdgeList& el) {\n    degrees.resize(num_vertices_);\n    std::fill(degrees.begin(), degrees.end(), 0);\n    for (auto it = el.begin(); it < el.end(); it++) {\n      MEdge e = *it;\n      degrees[e.src]++;\n      if (symmetrize_)\n        degrees[e.dst]++;\n    }\n  }\n  void MakeCSR(bool transpose) {\n    degrees.resize(num_vertices_);\n    std::fill(degrees.begin(), degrees.end(), 0);\n    for 
(size_t i = 0; i < num_vertices_; i++)\n      degrees[i] = vertices[i].size();\n    max_degree = *(std::max_element(degrees.begin(), degrees.end()));\n\n    std::vector<IndexT> offsets(degrees.size() + 1);\n    IndexT total = 0;\n    for (size_t n = 0; n < degrees.size(); n++) {\n      offsets[n] = total;\n      total += degrees[n];\n    }\n    offsets[degrees.size()] = total;\n\n    assert(num_edges_ == offsets[num_vertices_]);\n    weight_ = new ValueT[num_edges_];\n    colidx_ = new IndexT[num_edges_];\n    rowptr_ = new IndexT[num_vertices_ + 1];\n    for (size_t i = 0; i < num_vertices_ + 1; i++)\n      rowptr_[i] = offsets[i];\n    for (size_t i = 0; i < num_vertices_; i++) {\n      for (auto it = vertices[i].begin(); it < vertices[i].end(); it++) {\n        MEdge e = *it;\n        assert(i == e.src);\n        if (symmetrize_ || (!symmetrize_ && !transpose)) {\n          weight_[offsets[e.src]]   = e.elabel;\n          colidx_[offsets[e.src]++] = e.dst;\n        }\n        if (symmetrize_ || (!symmetrize_ && transpose)) {\n          weight_[offsets[e.dst]]   = e.elabel;\n          colidx_[offsets[e.dst]++] = e.src;\n        }\n      }\n    }\n  }\n  static bool compare_id(MEdge a, MEdge b) { return (a.dst < b.dst); }\n  void SquishGraph(bool remove_selfloops  = true,\n                   bool remove_redundents = true) {\n    std::vector<MEdge> neighbors;\n    for (size_t i = 0; i < num_vertices_; i++)\n      vertices.push_back(neighbors);\n    // assert(num_edges_ == el.size());\n    for (size_t i = 0; i < num_edges_; i++)\n      vertices[el[i].src].push_back(el[i]);\n    el.clear();\n    printf(\"Sorting the neighbor lists...\");\n    for (size_t i = 0; i < num_vertices_; i++)\n      std::sort(vertices[i].begin(), vertices[i].end(), compare_id);\n    printf(\" Done\\n\");\n    // remove self loops\n    int num_selfloops = 0;\n    if (remove_selfloops) {\n      printf(\"Removing self loops...\");\n      for (size_t i = 0; i < num_vertices_; i++) {\n        for 
(unsigned j = 0; j < vertices[i].size(); j++) {\n          if (i == vertices[i][j].dst) {\n            vertices[i].erase(vertices[i].begin() + j);\n            num_selfloops++;\n            j--;\n          }\n        }\n      }\n      printf(\" %d selfloops are removed\\n\", num_selfloops);\n      num_edges_ -= num_selfloops;\n    }\n    // remove redundent\n    int num_redundents = 0;\n    if (remove_redundents) {\n      printf(\"Removing redundent edges...\");\n      for (size_t i = 0; i < num_vertices_; i++) {\n        for (unsigned j = 1; j < vertices[i].size(); j++) {\n          if (vertices[i][j].dst == vertices[i][j - 1].dst) {\n            vertices[i].erase(vertices[i].begin() + j);\n            num_redundents++;\n            j--;\n          }\n        }\n      }\n      printf(\" %d redundent edges are removed\\n\", num_redundents);\n      num_edges_ -= num_redundents;\n    }\n    if (need_dag) {\n      int num_dag = 0;\n      std::cout << \"Constructing DAG...\";\n      degrees.resize(num_vertices_);\n      for (size_t i = 0; i < num_vertices_; i++)\n        degrees[i] = vertices[i].size();\n      for (size_t i = 0; i < num_vertices_; i++) {\n        for (unsigned j = 0; j < vertices[i].size(); j++) {\n          IndexT to = vertices[i][j].dst;\n          if (degrees[to] < degrees[i] ||\n              (degrees[to] == degrees[i] && to < i)) {\n            vertices[i].erase(vertices[i].begin() + j);\n            num_dag++;\n            j--;\n          }\n        }\n      }\n      printf(\" %d dag edges are removed\\n\", num_dag);\n      num_edges_ -= num_dag;\n    }\n  }\n  void MakeGraphFromEL() {\n    SquishGraph();\n    MakeCSR(false);\n  }\n  inline void split(const std::string& str, std::vector<std::string>& tokens,\n                    const std::string& delimiters = \" \") {\n    std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);\n    std::string::size_type pos     = str.find_first_of(delimiters, lastPos);\n    while 
(std::string::npos != pos || std::string::npos != lastPos) {\n      tokens.push_back(str.substr(lastPos, pos - lastPos));\n      lastPos = str.find_first_not_of(delimiters, pos);\n      pos     = str.find_first_of(delimiters, lastPos);\n    }\n  }\n};\n"
  },
  {
    "path": "libpangolin/include/pangolin/miner.h",
    "content": "#ifndef MINER_HPP_\n#define MINER_HPP_\n#include \"pangolin/scan.h\"\n#include \"pangolin/util.h\"\n#include \"pangolin/embedding_queue.h\"\n#include \"bliss/uintseqhash.hh\"\n#define CHUNK_SIZE 1\n\ntemplate <typename ElementTy, typename EmbeddingTy, bool enable_dag>\nclass Miner {\n  typedef EmbeddingQueue<EmbeddingTy> EmbeddingQueueTy;\n\npublic:\n  Miner(unsigned max_sz, int nt) : max_size(max_sz), num_threads(nt) {\n    // std::cout << \"max_size = \" << max_sz << std::endl;\n    // std::cout << \"num_threads = \" << nt << std::endl;\n  }\n  virtual ~Miner() {}\n  inline void insert(EmbeddingQueueTy& queue, bool debug = false);\n  inline unsigned intersect(unsigned a, unsigned b) {\n    return intersect_merge(a, b);\n  }\n  inline unsigned intersect_dag(unsigned a, unsigned b) {\n    return intersect_dag_merge(a, b);\n  }\n  // unsigned read_graph(std::string filename);\n  unsigned read_graph(std::string filetype, std::string filename) {\n    max_degree = util::read_graph(graph, filetype, filename, enable_dag);\n    graph.degree_counting();\n    degrees = graph.degrees.data();\n    // std::cout << \"Input graph: num_vertices \" << graph.size() << \" num_edges\n    // \"\n    //          << graph.sizeEdges() << \"\\n\";\n    // util::print_graph(graph);\n    // convert_to_gbbs(filename);\n    return max_degree;\n  }\n  void convert_to_gbbs(std::string filename) {\n    printf(\"writing gbbs file\\n\");\n    std::ofstream outfile;\n    outfile.open(filename + \".gbbs\");\n    outfile << \"AdjacencyGraph\"\n            << \"\\n\";\n    auto m   = graph.size();\n    auto nnz = graph.sizeEdges();\n    outfile << m << \"\\n\";\n    outfile << nnz << \"\\n\";\n    size_t offset = 0;\n    for (size_t i = 0; i < m; i++) {\n      outfile << offset << \"\\n\";\n      offset += graph.get_degree(i);\n    }\n    for (size_t i = 0; i < m; i++) {\n      for (auto e : graph.edges(i)) {\n        auto v = graph.getEdgeDst(e);\n        outfile << v << \"\\n\";\n    
  }\n    }\n    outfile.close();\n    exit(0);\n  }\n  unsigned read_pattern(std::string filename, std::string filetype = \"gr\",\n                        bool symmetric = false) {\n    unsigned max_deg = util::read_graph(pattern, filetype, filename, false);\n    pattern.degree_counting();\n    auto nv = pattern.size();\n    auto ne = pattern.sizeEdges();\n    std::cout << \"Pattern graph: num_vertices \" << nv << \" num_edges \" << ne\n              << \"\\n\";\n    if (symmetric) {\n      if (nv == 4) {\n        if (ne == 12) {\n          std::cout << \"Input pattern: 4-clique, please use kcl\\n\";\n          exit(1);\n        } else if (ne == 10) {\n          std::cout << \"Input pattern: diamond\\n\";\n          return 4;\n        } else if (ne == 8) {\n          if (max_deg == 3) {\n            std::cout << \"Input pattern: tailed-triangle\\n\";\n            return 3;\n          } else {\n            std::cout << \"Input pattern: 4-cycle\\n\";\n            return 2;\n          }\n        } else if (ne == 6) {\n          if (max_deg == 3) {\n            std::cout << \"Input pattern: 3-star\\n\";\n            return 1;\n          } else {\n            std::cout << \"Input pattern: 4-path\\n\";\n            return 0;\n          }\n        } else {\n          std::cout << \"Error: the number of edges is invalid\\n\";\n          exit(1);\n        }\n      } else if (nv == 5) {\n        std::cout << \"5-motif currently not supported\\n\";\n        exit(1);\n      } else {\n        std::cout << \"pattern size currently not supported\\n\";\n        exit(1);\n      }\n    } else {\n      if (nv == 4) {\n        if (ne == 6) {\n          std::cout << \"Input pattern: 4-clique, please use kcl\\n\";\n          exit(1);\n        } else if (ne == 5) {\n          std::cout << \"Input pattern: diamond\\n\";\n          return 4;\n        } else if (ne == 4) {\n          if (max_deg == 2) {\n            std::cout << \"Input pattern: tailed-triangle\\n\";\n            return 
3;\n          } else {\n            // assert(max_deg == 1);\n            std::cout << \"Input pattern: 4-cycle\\n\";\n            return 2;\n          }\n        } else if (ne == 3) {\n          if (max_deg == 3) {\n            std::cout << \"Input pattern: 3-star\\n\";\n            return 1;\n          } else {\n            // assert(max_deg == 2);\n            std::cout << \"Input pattern: 4-path\\n\";\n            return 0;\n          }\n        } else {\n          std::cout << \"Error: the unmber of edges is invalid\\n\";\n          exit(1);\n        }\n      } else if (nv == 5) {\n      } else {\n        std::cout << \"pattern size currently not supported\\n\";\n        exit(1);\n      }\n    }\n    return 0;\n  }\n\nprotected:\n  PangolinGraph graph;\n  PangolinGraph pattern;\n  unsigned max_size;\n  int num_threads;\n  unsigned max_degree;\n  uint32_t* degrees;\n\n  inline bool is_automorphism_dag(unsigned n, const EmbeddingTy& emb,\n                                  unsigned idx, VertexId dst) {\n    // if (dst <= emb.get_vertex(0)) return true;\n    for (unsigned i = 0; i < n; ++i)\n      if (dst == emb.get_vertex(i))\n        return true;\n    for (unsigned i = 0; i < idx; ++i)\n      if (is_connected_dag(dst, emb.get_vertex(i)))\n        return true;\n    // for (unsigned i = idx+1; i < n; ++i) if (dst < emb.get_vertex(i)) return\n    // true;\n    return false;\n  }\n  inline bool is_vertexInduced_automorphism(unsigned n, const EmbeddingTy& emb,\n                                            unsigned idx, VertexId dst) {\n    // unsigned n = emb.size();\n    // the new vertex id should be larger than the first vertex id\n    if (dst <= emb.get_vertex(0))\n      return true;\n    // the new vertex should not already exist in the embedding\n    for (unsigned i = 1; i < n; ++i)\n      if (dst == emb.get_vertex(i))\n        return true;\n    // the new vertex should not already be extended by any previous vertex in\n    // the embedding\n    for (unsigned i 
= 0; i < idx; ++i)\n      if (is_connected(emb.get_vertex(i), dst))\n        return true;\n    // the new vertex id should be larger than any vertex id after its source\n    // vertex in the embedding\n    for (unsigned i = idx + 1; i < n; ++i)\n      if (dst < emb.get_vertex(i))\n        return true;\n    return false;\n  }\n  unsigned get_degree(PangolinGraph* g, VertexId vid) {\n    return std::distance(g->edge_begin(vid), g->edge_end(vid));\n  }\n  inline unsigned intersect_merge(unsigned src, unsigned dst) {\n    unsigned count = 0;\n    for (auto e : graph.edges(dst)) {\n      GNode dst_dst = graph.getEdgeDst(e);\n      for (auto e1 : graph.edges(src)) {\n        GNode to = graph.getEdgeDst(e1);\n        if (dst_dst == to) {\n          count += 1;\n          break;\n        }\n        if (to > dst_dst)\n          break;\n      }\n    }\n    return count;\n  }\n  inline unsigned intersect_dag_merge(unsigned p, unsigned q) {\n    unsigned count = 0;\n    auto p_start   = graph.edge_begin(p);\n    auto p_end     = graph.edge_end(p);\n    auto q_start   = graph.edge_begin(q);\n    auto q_end     = graph.edge_end(q);\n    auto p_it      = p_start;\n    auto q_it      = q_start;\n    int a;\n    int b;\n    while (p_it < p_end && q_it < q_end) {\n      a     = graph.getEdgeDst(p_it);\n      b     = graph.getEdgeDst(q_it);\n      int d = a - b;\n      if (d <= 0)\n        p_it++;\n      if (d >= 0)\n        q_it++;\n      if (d == 0)\n        count++;\n    }\n    return count;\n  }\n  inline unsigned intersect_search(unsigned a, unsigned b) {\n    if (degrees[a] == 0 || degrees[b] == 0)\n      return 0;\n    unsigned count  = 0;\n    unsigned lookup = a;\n    unsigned search = b;\n    if (degrees[a] > degrees[b]) {\n      lookup = b;\n      search = a;\n    }\n    auto begin = graph.edge_begin(search);\n    auto end   = graph.edge_end(search);\n    for (auto e : graph.edges(lookup)) {\n      GNode key = graph.getEdgeDst(e);\n      if (binary_search(key, begin, 
end))\n        count++;\n    }\n    return count;\n  }\n  inline bool is_all_connected_except(unsigned dst, unsigned pos,\n                                      const EmbeddingTy& emb) {\n    unsigned n         = emb.size();\n    bool all_connected = true;\n    for (unsigned i = 0; i < n; ++i) {\n      if (i == pos)\n        continue;\n      unsigned from = emb.get_vertex(i);\n      if (!is_connected(from, dst)) {\n        all_connected = false;\n        break;\n      }\n    }\n    return all_connected;\n  }\n  inline bool is_all_connected_except_dag(unsigned dst, unsigned pos,\n                                          const EmbeddingTy& emb) {\n    unsigned n         = emb.size();\n    bool all_connected = true;\n    for (unsigned i = 0; i < n; ++i) {\n      if (i == pos)\n        continue;\n      unsigned from = emb.get_vertex(i);\n      if (!is_connected_dag(dst, from)) {\n        all_connected = false;\n        break;\n      }\n    }\n    return all_connected;\n  }\n  inline bool is_all_connected(unsigned dst, const EmbeddingTy& emb,\n                               unsigned end, unsigned start = 0) {\n    assert(start >= 0 && end > 0);\n    bool all_connected = true;\n    for (unsigned i = start; i < end; ++i) {\n      unsigned from = emb.get_vertex(i);\n      if (!is_connected(from, dst)) {\n        all_connected = false;\n        break;\n      }\n    }\n    return all_connected;\n  }\n  inline bool is_all_connected_dag(unsigned dst, const EmbeddingTy& emb,\n                                   unsigned end, unsigned start = 0) {\n    assert(start >= 0 && end > 0);\n    bool all_connected = true;\n    for (unsigned i = start; i < end; ++i) {\n      unsigned from = emb.get_vertex(i);\n      if (!is_connected_dag(dst, from)) {\n        all_connected = false;\n        break;\n      }\n    }\n    return all_connected;\n  }\n  inline bool is_all_connected_dag(unsigned dst,\n                                   const std::vector<VertexId>& emb,\n                        
           unsigned end, unsigned start = 0) {\n    assert(start >= 0 && end > 0);\n    bool all_connected = true;\n    for (unsigned i = start; i < end; ++i) {\n      unsigned from = emb[i];\n      if (!is_connected_dag(dst, from)) {\n        all_connected = false;\n        break;\n      }\n    }\n    return all_connected;\n  }\n  // check if vertex a is connected to vertex b in a undirected graph\n  inline bool is_connected(unsigned a, unsigned b) {\n    if (degrees[a] == 0 || degrees[b] == 0)\n      return false;\n    unsigned key    = a;\n    unsigned search = b;\n    if (degrees[a] < degrees[b]) {\n      key    = b;\n      search = a;\n    }\n    auto begin = graph.edge_begin(search);\n    auto end   = graph.edge_end(search);\n    // return serial_search(key, begin, end);\n    return binary_search(key, begin, end);\n  }\n  inline int is_connected_dag(unsigned key, unsigned search) {\n    if (degrees[search] == 0)\n      return false;\n    auto begin = graph.edge_begin(search);\n    auto end   = graph.edge_end(search);\n    // return serial_search(key, begin, end);\n    return binary_search(key, begin, end);\n  }\n  inline bool serial_search(unsigned key, PangolinGraph::edge_iterator begin,\n                            PangolinGraph::edge_iterator end) {\n    for (auto offset = begin; offset != end; ++offset) {\n      unsigned d = graph.getEdgeDst(offset);\n      if (d == key)\n        return true;\n      if (d > key)\n        return false;\n    }\n    return false;\n  }\n  inline bool binary_search(unsigned key, PangolinGraph::edge_iterator begin,\n                            PangolinGraph::edge_iterator end) {\n    auto l = begin;\n    auto r = end - 1;\n    while (r >= l) {\n      auto mid       = l + (r - l) / 2;\n      unsigned value = graph.getEdgeDst(mid);\n      if (value == key)\n        return true;\n      if (value < key)\n        l = mid + 1;\n      else\n        r = mid - 1;\n    }\n    return false;\n  }\n  inline int binary_search(unsigned key, 
PangolinGraph::edge_iterator begin,\n                           int length) {\n    if (length < 1)\n      return -1;\n    int l = 0;\n    int r = length - 1;\n    while (r >= l) {\n      int mid        = l + (r - l) / 2;\n      unsigned value = graph.getEdgeDst(begin + mid);\n      if (value == key)\n        return mid;\n      if (value < key)\n        l = mid + 1;\n      else\n        r = mid - 1;\n    }\n    return -1;\n  }\n  inline void gen_adj_matrix(unsigned n, const std::vector<bool>& connected,\n                             Matrix& a) {\n    unsigned l = 0;\n    for (unsigned i = 1; i < n; i++)\n      for (unsigned j = 0; j < i; j++)\n        if (connected[l++])\n          a[i][j] = a[j][i] = 1;\n  }\n  // calculate the trace of a given n*n matrix\n  inline MatType trace(unsigned n, Matrix matrix) {\n    MatType tr = 0;\n    for (unsigned i = 0; i < n; i++) {\n      tr += matrix[i][i];\n    }\n    return tr;\n  }\n  // matrix mutiplication, both a and b are n*n matrices\n  inline Matrix product(unsigned n, const Matrix& a, const Matrix& b) {\n    Matrix c(n, std::vector<MatType>(n));\n    for (unsigned i = 0; i < n; ++i) {\n      for (unsigned j = 0; j < n; ++j) {\n        c[i][j] = 0;\n        for (unsigned k = 0; k < n; ++k) {\n          c[i][j] += a[i][k] * b[k][j];\n        }\n      }\n    }\n    return c;\n  }\n  // calculate the characteristic polynomial of a n*n matrix A\n  inline void char_polynomial(unsigned n, Matrix& A, std::vector<MatType>& c) {\n    // n is the size (num_vertices) of a graph\n    // A is the adjacency matrix (n*n) of the graph\n    Matrix C;\n    C = A;\n    for (unsigned i = 1; i <= n; i++) {\n      if (i > 1) {\n        for (unsigned j = 0; j < n; j++)\n          C[j][j] += c[n - i + 1];\n        C = product(n, A, C);\n      }\n      c[n - i] -= trace(n, C) / i;\n    }\n  }\n  inline void get_connectivity(unsigned n, unsigned idx, VertexId dst,\n                               const EmbeddingTy& emb,\n                          
     std::vector<bool>& connected) {\n    connected.push_back(true); // 0 and 1 are connected\n    for (unsigned i = 2; i < n; i++)\n      for (unsigned j = 0; j < i; j++)\n        if (is_connected(emb.get_vertex(i), emb.get_vertex(j)))\n          connected.push_back(true);\n        else\n          connected.push_back(false);\n    for (unsigned j = 0; j < n; j++) {\n      if (j == idx)\n        connected.push_back(true);\n      else if (is_connected(emb.get_vertex(j), dst))\n        connected.push_back(true);\n      else\n        connected.push_back(false);\n    }\n  }\n  // eigenvalue based approach to find the pattern id for a given embedding\n  inline unsigned find_motif_pattern_id_eigen(unsigned n, unsigned idx,\n                                              VertexId dst,\n                                              const EmbeddingTy& emb) {\n    std::vector<bool> connected;\n    get_connectivity(n, idx, dst, emb, connected);\n    Matrix A(n + 1, std::vector<MatType>(n + 1, 0));\n    gen_adj_matrix(n + 1, connected, A);\n    std::vector<MatType> c(n + 1, 0);\n    char_polynomial(n + 1, A, c);\n    bliss::UintSeqHash h;\n    for (unsigned i = 0; i < n + 1; ++i)\n      h.update((unsigned)c[i]);\n    return h.get_value();\n  }\n\n  // unsigned orientation(PangolinGraph &og, PangolinGraph &g);\n};\n\n#endif // MINER_HPP_\n"
  },
  {
    "path": "libpangolin/include/pangolin/ptypes.h",
    "content": "#pragma once\n#include \"pangolin/types.h\"\n#include \"pangolin/edge_embedding.h\"\n#include \"pangolin/quick_pattern.h\"\n#include \"pangolin/canonical_graph.h\"\n\ntypedef QuickPattern<EdgeInducedEmbedding<StructuralElement>, StructuralElement>\n    StrQPattern; // structural quick pattern\ntypedef CanonicalGraph<EdgeInducedEmbedding<StructuralElement>,\n                       StructuralElement>\n    StrCPattern; // structural canonical pattern\ntypedef std::unordered_map<StrQPattern, Frequency>\n    StrQpMapFreq; // mapping structural quick pattern to its frequency\ntypedef std::unordered_map<StrCPattern, Frequency>\n    StrCgMapFreq; // mapping structural canonical pattern to its frequency\ntypedef galois::substrate::PerThreadStorage<StrQpMapFreq> LocalStrQpMapFreq;\ntypedef galois::substrate::PerThreadStorage<StrCgMapFreq> LocalStrCgMapFreq;\n/*\nclass Status {\nprotected:\n    std::vector<uint8_t> visited;\npublic:\n    Status() {}\n    ~Status() {}\n    void init(unsigned size) {\n        visited.resize(size);\n        reset();\n    }\n    void reset() {\n        std::fill(visited.begin(), visited.end(), 0);\n    }\n    void set(VertexId pos, uint8_t value) { visited[pos] = value; }\n    uint8_t get(VertexId pos) { return visited[pos]; }\n};\ntypedef galois::substrate::PerThreadStorage<Status> StatusMT; // multi-threaded\n*/\n"
  },
  {
    "path": "libpangolin/include/pangolin/quick_pattern.h",
    "content": "#ifndef QUICK_PATTERN_HPP_\n#define QUICK_PATTERN_HPP_\n/**\n * Code from on below link. Modified under Galois.\n *\n * https://github.com/rstream-system/RStream/\n *\n * Copyright (c) 2018, Kai Wang and the respective contributors\n * All rights reserved.\n * Reused/revised under 3-BSD\n */\n\n#include \"pangolin/embedding.h\"\n#include \"pangolin/equivalence.h\"\n#include \"bliss/uintseqhash.hh\"\n\ntemplate <typename EmbTy, typename EleTy>\nclass QuickPattern;\ntemplate <typename EmbTy, typename EleTy>\nstd::ostream& operator<<(std::ostream& strm,\n                         const QuickPattern<EmbTy, EleTy>& qp);\n\ntemplate <typename EmbTy, typename EleTy>\nclass QuickPattern {\n  friend std::ostream& operator<<<>(std::ostream& strm,\n                                    const QuickPattern<EmbTy, EleTy>& qp);\n\npublic:\n  QuickPattern() {}\n  QuickPattern(unsigned subgraph_size);\n  QuickPattern(const EmbTy& emb);\n  QuickPattern(EmbTy& emb, bool need_permute);\n  QuickPattern(unsigned n, std::vector<bool> connected);\n  ~QuickPattern() {}\n  void get_equivalences(VertexPositionEquivalences& equ) {\n    equ.set_size(size);\n    for (unsigned i = 0; i < size; ++i)\n      equ.add_equivalence(i, i);\n    findAutomorphisms(equ);\n  }\n  // operator for map\n  bool operator==(const QuickPattern& other) const {\n    // compare edges\n    assert(size == other.size);\n    for (unsigned i = 0; i < size; ++i) {\n      const EleTy& t1 = elements[i];\n      const EleTy& t2 = other.elements[i];\n      int cmp_element = t1.cmp(t2);\n      if (cmp_element != 0) {\n        return false;\n      }\n    }\n    return true;\n  }\n  operator size_t() const {\n    size_t a = 0;\n    for (unsigned i = 0; i < size; ++i) {\n      auto element = elements[i];\n      a += element.get_vid();\n    }\n    return a;\n  }\n  inline unsigned get_hash() const { return hash_value; }\n  inline void set_hash() {\n    bliss::UintSeqHash h;\n    h.update(size);\n    // hash vertex 
labels and edges\n    for (unsigned i = 0; i < size; ++i) {\n      auto element = elements[i];\n      h.update(element.get_vid());\n      if (std::is_same<EleTy, LabeledElement>::value)\n        h.update(element.get_vlabel());\n      if (element.has_history())\n        h.update(element.get_his());\n    }\n    hash_value = h.get_value();\n    // return h.get_value();\n  }\n  EleTy& at(unsigned index) const { return elements[index]; }\n  inline unsigned get_size() const { return size; }\n  inline void clean() { delete[] elements; }\n  inline unsigned get_id() const { return hash_value; }\n  inline unsigned get_cgid() const { return cg_id; }\n  void set_cgid(unsigned i) { cg_id = i; }\n\nprivate:\n  unsigned size;\n  EleTy* elements;\n  unsigned hash_value; // quick pattern ID\n  unsigned\n      cg_id; // ID of the canonical pattern that this quick pattern belongs to\n  void findAutomorphisms(VertexPositionEquivalences& eq_sets);\n};\n\nnamespace std {\ntemplate <typename EmbTy, typename EleTy>\nstruct hash<QuickPattern<EmbTy, EleTy>> {\n  std::size_t operator()(const QuickPattern<EmbTy, EleTy>& qp) const {\n    return std::hash<int>()(qp.get_hash());\n  }\n};\n} // namespace std\n#endif // QUICK_PATTERN_HPP_\n"
  },
  {
    "path": "libpangolin/include/pangolin/res_man.h",
    "content": "#ifndef RESOURCE_MANAGER_HPP_\n#define RESOURCE_MANAGER_HPP_\n/**\n * Code from on below link. Modified under Galois.\n *\n * https://github.com/rstream-system/RStream/\n *\n * Copyright (c) 2018, Kai Wang and the respective contributors\n * All rights reserved.\n * Reused/revised under 3-BSD\n */\n\n#include <sys/time.h>\n#include <sys/resource.h>\n#include <iomanip>\n\nclass ResourceManager {\npublic:\n  ResourceManager() {}\n  ~ResourceManager() {}\n  // peak memory usage\n  std::string get_peak_memory() {\n    double kbm;\n    struct rusage CurUsage;\n    getrusage(RUSAGE_SELF, &CurUsage);\n    kbm        = (double)CurUsage.ru_maxrss;\n    double mbm = kbm / 1024.0;\n    double gbm = mbm / 1024.0;\n    return \"Peak memory: \" + to_string_with_precision(mbm, 3) + \" MB; \" +\n           to_string_with_precision(gbm, 3) + \" GB\";\n  }\n\nprivate:\n  template <typename T = double>\n  std::string to_string_with_precision(const T a_value, const int& n) {\n    std::ostringstream out;\n    out << std::fixed;\n    out << std::setprecision(n) << a_value;\n    return out.str();\n  }\n};\n#endif\n"
  },
  {
    "path": "libpangolin/include/pangolin/scan.h",
    "content": "#pragma once\n#include <vector>\n#include \"pangolin/gtypes.h\"\n\ninline std::vector<IndexT> PrefixSum(const std::vector<IndexT>& vec) {\n  std::vector<IndexT> sums(vec.size() + 1);\n  IndexT total = 0;\n  for (size_t n = 0; n < vec.size(); n++) {\n    sums[n] = total;\n    total += vec[n];\n  }\n  sums[vec.size()] = total;\n  return sums;\n}\n\n#ifdef LARGE_SIZE\ntemplate <typename InTy = unsigned, typename OutTy = unsigned>\ninline std::vector<OutTy> prefix_sum(const std::vector<InTy>& in) {\n  std::vector<OutTy> sums(in.size() + 1);\n  OutTy total = 0;\n  for (size_t n = 0; n < in.size(); n++) {\n    sums[n] = total;\n    total += (OutTy)in[n];\n  }\n  sums[in.size()] = total;\n  return sums;\n}\n\ntemplate <typename InTy = unsigned, typename OutTy = unsigned>\ninline std::vector<OutTy> parallel_prefix_sum(const std::vector<InTy>& in) {\n  const size_t block_size = 1 << 20;\n  const size_t num_blocks = (in.size() + block_size - 1) / block_size;\n  std::vector<OutTy> local_sums(num_blocks);\n  // count how many bits are set on each thread\n  galois::do_all(\n      galois::iterate((size_t)0, num_blocks), [&](const size_t& block) {\n        OutTy lsum       = 0;\n        size_t block_end = std::min((block + 1) * block_size, in.size());\n        for (size_t i = block * block_size; i < block_end; i++)\n          lsum += in[i];\n        local_sums[block] = lsum;\n      });\n  std::vector<OutTy> bulk_prefix(num_blocks + 1);\n  OutTy total = 0;\n  for (size_t block = 0; block < num_blocks; block++) {\n    bulk_prefix[block] = total;\n    total += local_sums[block];\n  }\n  bulk_prefix[num_blocks] = total;\n  std::vector<OutTy> prefix(in.size() + 1);\n  galois::do_all(\n      galois::iterate((size_t)0, num_blocks), [&](const size_t& block) {\n        OutTy local_total = bulk_prefix[block];\n        size_t block_end  = std::min((block + 1) * block_size, in.size());\n        for (size_t i = block * block_size; i < block_end; i++) {\n          prefix[i] = 
local_total;\n          local_total += in[i];\n        }\n      });\n  prefix[in.size()] = bulk_prefix[num_blocks];\n  return prefix;\n}\n\n#else\ntemplate <typename InTy = unsigned, typename OutTy = unsigned>\ninline galois::gstl::Vector<OutTy>\nparallel_prefix_sum(const galois::gstl::Vector<InTy>& in) {\n  galois::gstl::Vector<OutTy> sums(in.size() + 1);\n  OutTy total = 0;\n  for (size_t n = 0; n < in.size(); n++) {\n    sums[n] = total;\n    total += (OutTy)in[n];\n  }\n  sums[in.size()] = total;\n  return sums;\n}\n#endif\n"
  },
  {
    "path": "libpangolin/include/pangolin/types.h",
    "content": "#ifndef TYPES_H\n#define TYPES_H\n// common types\n#include <map>\n#include <set>\n#include <queue>\n#include <vector>\n#include <cstring>\n#include <cassert>\n#include <cstdint>\n#include <cstdlib>\n#include <sstream>\n#include <fstream>\n#include <iomanip>\n#include <iostream>\n#include <stdint.h>\n#include <string.h>\n#include <algorithm>\n#include <functional>\n#include <unordered_map>\n#include <unordered_set>\n#define LARGE_SIZE // for large graphs such as soc-Livejournal1 and com-Orkut\n\ntypedef float Weight;\ntypedef uint64_t Ulong;\ntypedef uint32_t ValueT;\ntypedef uint32_t VertexId;\ntypedef uint64_t EdgeId;\ntypedef uint8_t BYTE;\n#ifdef LARGE_SIZE\ntypedef uint64_t IndexT;\ntypedef uint64_t IndexTy;\n#else\ntypedef uint32_t IndexT;\ntypedef uint32_t IndexTy;\n#endif\n\ntypedef std::set<uint32_t> UintSet;\ntypedef std::vector<UintSet> UintSets;\n\ntypedef std::vector<BYTE> ByteList;\ntypedef std::vector<uint32_t> UintList;\ntypedef std::vector<Ulong> UlongList;\ntypedef std::vector<VertexId> VertexList;\ntypedef std::vector<UintList> IndexLists;\ntypedef std::vector<ByteList> ByteLists;\ntypedef std::vector<VertexList> VertexLists;\ntypedef std::vector<bool> BoolVec;\n\n// We provide two types of 'support': frequency and domain support.\n// Frequency is used for counting, e.g. motif counting.\n// Domain support, a.k.a, the minimum image-based support, is used for FSM. It\n// has the anti-monotonic property.\ntypedef float MatType;\ntypedef unsigned Frequency;\ntypedef std::vector<std::vector<MatType>> Matrix;\ntypedef std::unordered_map<unsigned, unsigned> UintMap;\ntypedef std::pair<unsigned, unsigned> InitPattern;\ntypedef std::unordered_map<unsigned, unsigned> FreqMap;\ntypedef std::unordered_map<unsigned, bool> DomainMap;\ntypedef std::map<unsigned, std::map<unsigned, unsigned>> Map2D;\n\n#endif\n"
  },
  {
    "path": "libpangolin/include/pangolin/util.h",
    "content": "#ifndef UTIL_H\n#define UTIL_H\n\n#include \"pangolin/scan.h\"\n#include \"pangolin/mgraph.h\"\n#include \"pangolin/res_man.h\"\n\nnamespace util {\n\nvoid print_graph(PangolinGraph& graph) {\n  for (GNode n : graph) {\n    std::cout << \"vertex \" << n << \": label = \" << graph.getData(n)\n              << \": degree = \" << graph.get_degree(n) << \" edgelist = [ \";\n    for (auto e : graph.edges(n))\n      std::cout << graph.getEdgeDst(e) << \" \";\n    std::cout << \"]\" << std::endl;\n  }\n}\n\nvoid genGraph(MGraph& mg, PangolinGraph& g) {\n  g.allocateFrom(mg.num_vertices(), mg.num_edges());\n  g.constructNodes();\n  for (size_t i = 0; i < mg.num_vertices(); i++) {\n    g.getData(i)   = mg.get_label(i);\n    auto row_begin = mg.get_offset(i);\n    auto row_end   = mg.get_offset(i + 1);\n    g.fixEndEdge(i, row_end);\n    for (auto offset = row_begin; offset < row_end; offset++) {\n      g.constructEdge(offset, mg.get_dest(offset), 0);\n    }\n  }\n}\n// relabel vertices by descending degree order (do not apply to weighted graphs)\nvoid DegreeRanking(PangolinGraph& og, PangolinGraph& g) {\n  std::cout << \" Relabeling vertices by descending degree order\\n\";\n  std::vector<IndexT> old_degrees(og.size(), 0);\n  galois::do_all(\n      galois::iterate(og.begin(), og.end()),\n      [&](const auto& src) {\n        old_degrees[src] = std::distance(og.edge_begin(src), og.edge_end(src));\n      },\n      galois::loopname(\"getOldDegrees\"));\n\n  size_t num_vertices = og.size();\n  typedef std::pair<unsigned, IndexT> degree_node_p;\n  std::vector<degree_node_p> degree_id_pairs(num_vertices);\n  for (IndexT n = 0; n < num_vertices; n++)\n    degree_id_pairs[n] = std::make_pair(old_degrees[n], n);\n  std::sort(degree_id_pairs.begin(), degree_id_pairs.end(),\n            std::greater<degree_node_p>());\n\n  std::vector<IndexT> degrees(num_vertices, 0);\n  std::vector<IndexT> new_ids(num_vertices);\n  for (IndexT n = 0; n < num_vertices; n++) {\n    
degrees[n]                         = degree_id_pairs[n].first;\n    new_ids[degree_id_pairs[n].second] = n;\n  }\n  std::vector<IndexT> offsets = PrefixSum(degrees);\n\n  g.allocateFrom(og.size(), og.sizeEdges());\n  g.constructNodes();\n  galois::do_all(\n      galois::iterate(og.begin(), og.end()),\n      [&](const auto& src) {\n        auto row_begin = offsets[src];\n        g.fixEndEdge(src, row_begin + degrees[src]);\n        IndexT offset = 0;\n        for (auto e : og.edges(src)) {\n          auto dst = og.getEdgeDst(e);\n          g.constructEdge(row_begin + offset, new_ids[dst], 0);\n          offset++;\n        }\n        assert(offset == degrees[src]);\n      },\n      galois::loopname(\"ConstructNewGraph\"));\n  g.sortAllEdgesByDst();\n}\n\nunsigned orientation(PangolinGraph& og, PangolinGraph& g) {\n  galois::StatTimer Tdag(\"DAG\");\n  Tdag.start();\n  // std::cout << \"Orientation enabled, using DAG\\n\";\n  // std::cout << \"Assume the input graph is clean and symmetric (.csgr)\\n\";\n  // std::cout << \"Before: num_vertices \" << og.size() << \" num_edges \"\n  //          << og.sizeEdges() << \"\\n\";\n  std::vector<IndexT> degrees(og.size(), 0);\n\n  galois::do_all(\n      galois::iterate(og.begin(), og.end()),\n      [&](const auto& src) {\n        degrees[src] = std::distance(og.edge_begin(src), og.edge_end(src));\n      },\n      galois::loopname(\"getOldDegrees\"));\n\n  unsigned max_degree = *(std::max_element(degrees.begin(), degrees.end()));\n  std::vector<IndexT> new_degrees(og.size(), 0);\n\n  galois::do_all(\n      galois::iterate(og.begin(), og.end()),\n      [&](const auto& src) {\n        for (auto e : og.edges(src)) {\n          auto dst = og.getEdgeDst(e);\n          if (degrees[dst] > degrees[src] ||\n              (degrees[dst] == degrees[src] && dst > src)) {\n            new_degrees[src]++;\n          }\n        }\n      },\n      galois::loopname(\"getNewDegrees\"));\n\n  std::vector<IndexT> offsets = PrefixSum(new_degrees);\n 
 assert(offsets[og.size()] == og.sizeEdges() / 2);\n\n  g.allocateFrom(og.size(), og.sizeEdges() / 2);\n  g.constructNodes();\n\n  galois::do_all(\n      galois::iterate(og.begin(), og.end()),\n      [&](const auto& src) {\n        g.getData(src) = 0;\n        auto row_begin = offsets[src];\n        g.fixEndEdge(src, row_begin + new_degrees[src]);\n        IndexT offset = 0;\n        for (auto e : og.edges(src)) {\n          auto dst = og.getEdgeDst(e);\n          if (degrees[dst] > degrees[src] ||\n              (degrees[dst] == degrees[src] && dst > src)) {\n            g.constructEdge(row_begin + offset, dst, 0);\n            offset++;\n          }\n        }\n        assert(offset == new_degrees[src]);\n      },\n      galois::loopname(\"ConstructNewGraph\"));\n\n  g.sortAllEdgesByDst();\n  Tdag.stop();\n  return max_degree;\n}\n\n// relabel is needed when we use DAG as input graph, and it is disabled when we\n// use symmetrized graph\nunsigned read_graph(PangolinGraph& graph, std::string filetype,\n                    std::string filename, bool need_dag = false) {\n  MGraph mgraph(need_dag);\n  unsigned max_degree = 0;\n  if (filetype == \"txt\") {\n    // printf(\"Reading .lg file: %s\\n\", filename.c_str());\n    mgraph.read_txt(filename.c_str());\n    genGraph(mgraph, graph);\n  } else if (filetype == \"adj\") {\n    // printf(\"Reading .adj file: %s\\n\", filename.c_str());\n    mgraph.read_adj(filename.c_str());\n    genGraph(mgraph, graph);\n  } else if (filetype == \"mtx\") {\n    // printf(\"Reading .mtx file: %s\\n\", filename.c_str());\n    mgraph.read_mtx(filename.c_str(), true); // symmetrize\n    genGraph(mgraph, graph);\n  } else if (filetype == \"gr\") {\n    // printf(\"Reading .gr file: %s\\n\", filename.c_str());\n    if (need_dag) {\n      PangolinGraph g_temp;\n      galois::graphs::readGraph(g_temp, filename);\n      max_degree = orientation(g_temp, graph);\n    } else {\n      galois::graphs::readGraph(graph, filename);\n      
galois::do_all(\n          galois::iterate(graph.begin(), graph.end()),\n          [&](const auto& vid) {\n            graph.getData(vid) = 1;\n            // for (auto e : graph.edges(n)) graph.getEdgeData(e) = 1;\n          },\n          galois::loopname(\"assignVertexLabels\"));\n      std::vector<unsigned> degrees(graph.size());\n      galois::do_all(\n          galois::iterate(graph.begin(), graph.end()),\n          [&](const auto& vid) {\n            degrees[vid] =\n                std::distance(graph.edge_begin(vid), graph.edge_end(vid));\n          },\n          galois::loopname(\"computeMaxDegree\"));\n      max_degree = *(std::max_element(degrees.begin(), degrees.end()));\n    }\n  } else {\n    printf(\"Unknown file format\\n\");\n    exit(1);\n  }\n  // print_graph(graph);\n  galois::gPrint(\"Input graph: num_vertices \", graph.size(), \" num_edges \",\n                 graph.sizeEdges(), \"\\n\");\n  if (filetype != \"gr\") {\n    max_degree = mgraph.get_max_degree();\n    mgraph.clean();\n  }\n  // printf(\"max degree = %u\\n\", max_degree);\n  return max_degree;\n}\n\n} // namespace util\n#endif\n"
  },
  {
    "path": "libpangolin/include/pangolin/vertex_embedding.h",
    "content": "#pragma once\n#include \"pangolin/embedding.h\"\n#include \"bliss/uintseqhash.hh\"\n\n// Vertex-induced embedding with hash value\nclass VertexInducedEmbedding : public Embedding<SimpleElement> {\n  friend std::ostream& operator<<(std::ostream& strm,\n                                  const VertexInducedEmbedding& emb);\n\npublic:\n  VertexInducedEmbedding() : Embedding() { hash_value = 0; }\n  VertexInducedEmbedding(size_t n) : Embedding(n) { hash_value = 0; }\n  VertexInducedEmbedding(const VertexInducedEmbedding& emb) : Embedding() {\n    elements   = emb.get_elements();\n    hash_value = emb.get_pid();\n  }\n  ~VertexInducedEmbedding() {}\n  SimpleElement operator[](size_t i) const { return elements[i]; }\n  VertexInducedEmbedding& operator=(const VertexInducedEmbedding& other) {\n    if (this == &other)\n      return *this;\n    elements   = other.get_elements();\n    hash_value = other.get_pid();\n    return *this;\n  }\n  inline unsigned get_pid() const { return hash_value; } // get the pattern id\n  inline void set_pid(unsigned i) { hash_value = i; }    // set the pattern id\n  inline unsigned get_hash() const {\n    bliss::UintSeqHash h;\n    for (unsigned i = 0; i < size(); ++i)\n      h.update(elements[i].get_vid());\n    return h.get_value();\n  }\n  friend bool operator==(const VertexInducedEmbedding& e1,\n                         const VertexInducedEmbedding& e2) {\n    return e1.elements == e2.elements;\n  }\n\nprotected:\n  unsigned hash_value;\n};\n\nnamespace std {\ntemplate <>\nstruct hash<VertexInducedEmbedding> {\n  std::size_t operator()(const VertexInducedEmbedding& emb) const {\n    return std::hash<int>()(emb.get_hash());\n  }\n};\n} // namespace std\n\ntypedef VertexInducedEmbedding VertexEmbedding;\n"
  },
  {
    "path": "libpangolin/src/BfsMining/embedding_list.cpp",
    "content": "#include \"pangolin/BfsMining/embedding_list.h\"\n\ntemplate <typename ElementType, typename EmbeddingType>\nvoid EmbeddingList<ElementType, EmbeddingType>::init(PangolinGraph& graph,\n                                                     unsigned max_size,\n                                                     bool is_dag) {\n  last_level = 1;\n  max_level  = max_size;\n  vid_lists.resize(max_level);\n  idx_lists.resize(max_level);\n  size_t num_emb = graph.sizeEdges();\n  if (!is_dag)\n    num_emb = num_emb / 2;\n  vid_lists[1].resize(num_emb);\n  idx_lists[1].resize(num_emb);\n  if (std::is_same<ElementType, LabeledElement>::value) {\n    his_lists.resize(max_level);\n    his_lists[1].resize(num_emb);\n    galois::do_all(\n        galois::iterate((size_t)0, num_emb),\n        [&](const size_t& pos) { his_lists[1][pos] = 0; },\n        galois::chunk_size<64>(), galois::steal(),\n        galois::loopname(\"Init-his\"));\n  }\n  if (is_dag) {\n    galois::do_all(\n        galois::iterate(graph.begin(), graph.end()),\n        [&](const GNode& src) {\n          for (auto e : graph.edges(src)) {\n            auto dst         = graph.getEdgeDst(e);\n            vid_lists[1][*e] = dst;\n            idx_lists[1][*e] = src;\n          }\n        },\n        galois::chunk_size<64>(), galois::steal(),\n        galois::loopname(\"Init-vid\"));\n  } else {\n    size_t num_vertices = graph.size();\n    UintList num_init_emb(num_vertices);\n    galois::do_all(\n        galois::iterate(graph.begin(), graph.end()),\n        [&](const GNode& src) {\n          num_init_emb[src] = 0;\n          for (auto e : graph.edges(src)) {\n            auto dst = graph.getEdgeDst(e);\n            if (src < dst)\n              num_init_emb[src]++;\n          }\n        },\n        galois::chunk_size<64>(), galois::steal(),\n        galois::loopname(\"Init-vid-alloc\"));\n    UintList indices(num_vertices + 1);\n    unsigned total = 0;\n    for (size_t n = 0; n < num_vertices; n++) 
{\n      indices[n] = total;\n      total += num_init_emb[n];\n    }\n    indices[num_vertices] = total;\n    galois::do_all(\n        galois::iterate(graph.begin(), graph.end()),\n        [&](const GNode& src) {\n          auto start = indices[src];\n          for (auto e : graph.edges(src)) {\n            GNode dst = graph.getEdgeDst(e);\n            if (src < dst) { // TODO: this may be incorrect for FSM: may cause\n                             // the 4-FSM bug\n              vid_lists[1][start] = dst;\n              idx_lists[1][start] = src;\n              start++;\n            }\n          }\n        },\n        galois::chunk_size<64>(), galois::steal(),\n        galois::loopname(\"Init-vid-insert\"));\n  }\n}\n\ntemplate class EmbeddingList<SimpleElement, BaseEmbedding>;   // TC and KCL\ntemplate class EmbeddingList<SimpleElement, VertexEmbedding>; // Motif\ntemplate class EmbeddingList<LabeledElement,\n                             EdgeInducedEmbedding<LabeledElement>>; // FSM\n"
  },
  {
    "path": "libpangolin/src/base_embedding.cpp",
    "content": "#include \"pangolin/base_embedding.h\"\n\nstd::ostream& operator<<(std::ostream& strm, const BaseEmbedding& emb) {\n  if (emb.empty()) {\n    strm << \"(empty)\";\n    return strm;\n  }\n  strm << \"(\";\n  for (unsigned index = 0; index < emb.size() - 1; ++index)\n    std::cout << emb.get_vertex(index) << \", \";\n  std::cout << emb.get_vertex(emb.size() - 1);\n  strm << \")\";\n  return strm;\n}\n"
  },
  {
    "path": "libpangolin/src/equivalence.cpp",
    "content": "#include \"pangolin/equivalence.h\"\n\nstd::ostream& operator<<(std::ostream& strm,\n                         const VertexPositionEquivalences& equ) {\n  if (equ.get_size() == 0) {\n    strm << \"(empty)\";\n    return strm;\n  }\n  strm << \"VertexPositionEquivalences{equivalences=[\";\n  for (unsigned i = 0; i < equ.get_size(); ++i) {\n    strm << \"[\";\n    for (auto ele : equ.get_equivalent_set(i)) {\n      strm << ele << \", \";\n    }\n    strm << \"], \";\n  }\n  strm << \"]; size=\" << equ.get_size() << \"}\\n\";\n  return strm;\n}\n"
  },
  {
    "path": "libpangolin/src/quick_pattern.cpp",
    "content": "#include \"pangolin/quick_pattern.h\"\n#include \"pangolin/vertex_embedding.h\"\n#include \"pangolin/edge_embedding.h\"\n\ntemplate <typename EmbTy, typename EleTy>\nQuickPattern<EmbTy, EleTy>::QuickPattern(unsigned subgraph_size) {\n  hash_value = 0;\n  cg_id      = 0;\n  size       = subgraph_size / sizeof(EleTy);\n  elements   = new EleTy[size];\n}\n\ntemplate <typename EmbTy, typename EleTy>\nQuickPattern<EmbTy, EleTy>::QuickPattern(const EmbTy& emb) {\n  cg_id          = 0;\n  size           = emb.size();\n  unsigned bytes = size * sizeof(EleTy);\n  elements       = new EleTy[size];\n  std::memcpy(elements, emb.data(), bytes);\n  VertexId new_id = 1;\n  std::unordered_map<VertexId, VertexId> map;\n  for (unsigned i = 0; i < size; i++) {\n    auto& element   = elements[i];\n    VertexId old_id = element.get_vid();\n    auto iterator   = map.find(old_id);\n    if (iterator == map.end()) {\n      element.set_vertex_id(new_id);\n      map[old_id] = new_id++;\n    } else\n      element.set_vertex_id(iterator->second);\n  }\n  set_hash();\n}\n\ntemplate <typename EmbTy, typename EleTy>\nQuickPattern<EmbTy, EleTy>::QuickPattern(EmbTy& emb, bool) {\n  cg_id          = 0;\n  size           = emb.size();\n  unsigned bytes = size * sizeof(EleTy);\n  elements       = new EleTy[size];\n  std::memcpy(elements, emb.data(), bytes);\n  VertexId new_id = 1;\n  if (std::is_same<EleTy, LabeledElement>::value) {\n    if (size == 3) {\n      BYTE l1 = emb.get_label(1);\n      BYTE l2 = emb.get_label(2);\n      BYTE h2 = emb.get_history(2);\n      elements[0].set_vertex_id(1);\n      elements[1].set_vertex_id(2);\n      elements[2].set_vertex_id(3);\n      if (h2 == 0) {\n        if (l1 < l2) {\n          elements[1].set_vertex_label(l2);\n          elements[2].set_vertex_label(l1);\n          VertexId v1 = emb.get_vertex(1);\n          VertexId v2 = emb.get_vertex(2);\n          emb.set_vertex(1, v2);\n          emb.set_vertex(2, v1);\n        }\n      } else {\n    
    assert(h2 == 1);\n        elements[0].set_vertex_label(l1);\n        elements[2].set_history_info(0);\n        BYTE l0     = emb.get_label(0);\n        VertexId v0 = emb.get_vertex(0);\n        VertexId v1 = emb.get_vertex(1);\n        VertexId v2 = emb.get_vertex(2);\n        if (l0 < l2) {\n          elements[1].set_vertex_label(l2);\n          elements[2].set_vertex_label(l0);\n          emb.set_vertex(1, v2);\n          emb.set_vertex(2, v0);\n        } else {\n          elements[1].set_vertex_label(l0);\n          emb.set_vertex(1, v0);\n        }\n        emb.set_vertex(0, v1);\n      }\n    } else { // size > 3\n      std::unordered_map<VertexId, VertexId> map;\n      for (unsigned i = 0; i < size; i++) {\n        auto& element   = elements[i];\n        VertexId old_id = element.get_vid();\n        auto iterator   = map.find(old_id);\n        if (iterator == map.end()) {\n          element.set_vertex_id(new_id);\n          map[old_id] = new_id++;\n        } else\n          element.set_vertex_id(iterator->second);\n      }\n    }\n  } else { // non-label\n    std::unordered_map<VertexId, VertexId> map;\n    for (unsigned i = 0; i < size; i++) {\n      auto& element   = elements[i];\n      VertexId old_id = element.get_vid();\n      auto iterator   = map.find(old_id);\n      if (iterator == map.end()) {\n        element.set_vertex_id(new_id);\n        map[old_id] = new_id++;\n      } else\n        element.set_vertex_id(iterator->second);\n    }\n  }\n  set_hash();\n}\n\ntemplate <typename EmbTy, typename EleTy>\nQuickPattern<EmbTy, EleTy>::QuickPattern(unsigned n,\n                                         std::vector<bool> connected) {\n  cg_id = 0;\n  size  = std::count(connected.begin(), connected.end(), true) +\n         1; // number of edges + 1\n  elements = new EleTy[size];\n  std::vector<unsigned> pos(n, 0);\n  pos[1] = 1;\n  pos[2] = 2;\n  elements[0].set_vertex_id(1);\n  elements[0].set_history_info(0);\n  elements[1].set_vertex_id(2);\n  
elements[1].set_history_info(0);\n  int count = 2;\n  int l     = 1;\n  for (unsigned i = 2; i < n; i++) {\n    if (i < n - 2)\n      pos[i + 1] = pos[i];\n    for (unsigned j = 0; j < i; j++) {\n      if (connected[l++]) {\n        if (i < n - 2)\n          pos[i + 1]++;\n        elements[count].set_vertex_id(i + 1);\n        elements[count++].set_history_info(pos[j]);\n      }\n    }\n  }\n  set_hash();\n}\n\ntemplate <typename EmbTy, typename EleTy>\nvoid QuickPattern<EmbTy, EleTy>::findAutomorphisms(\n    VertexPositionEquivalences& eq_sets) {\n  if (size == 2) { // single-edge\n    if (at(0).get_vlabel() == at(1).get_vlabel()) {\n      eq_sets.add_equivalence(0, 1);\n      eq_sets.add_equivalence(1, 0);\n    }\n  } else if (size == 3) { // two-edge chain\n    if (at(2).get_his() == 0) {\n      if (at(1).get_vlabel() == at(2).get_vlabel()) {\n        eq_sets.add_equivalence(1, 2);\n        eq_sets.add_equivalence(2, 1);\n      }\n    } else if (at(2).get_his() == 1) {\n      if (at(0).get_vlabel() == at(2).get_vlabel()) {\n        eq_sets.add_equivalence(0, 2);\n        eq_sets.add_equivalence(2, 0);\n      }\n    } else\n      std::cout << \"Error\\n\";\n  } else if (size == 4) { // three-edge chain or star\n    if (at(2).get_his() == 0) {\n      if (at(3).get_his() == 0) {\n        if (at(1).get_vlabel() == at(2).get_vlabel()) {\n          eq_sets.add_equivalence(1, 2);\n          eq_sets.add_equivalence(2, 1);\n        }\n        if (at(1).get_vlabel() == at(3).get_vlabel()) {\n          eq_sets.add_equivalence(1, 3);\n          eq_sets.add_equivalence(3, 1);\n        }\n        if (at(2).get_vlabel() == at(3).get_vlabel()) {\n          eq_sets.add_equivalence(2, 3);\n          eq_sets.add_equivalence(3, 2);\n        }\n      } else if (at(3).get_his() == 1) {\n        if (at(2).get_vlabel() == at(3).get_vlabel()) {\n          eq_sets.add_equivalence(2, 3);\n          eq_sets.add_equivalence(3, 2);\n        }\n        if (at(0).get_vlabel() == 
at(1).get_vlabel()) {\n          eq_sets.add_equivalence(0, 1);\n          eq_sets.add_equivalence(1, 0);\n        }\n      } else if (at(3).get_his() == 2) {\n        if (at(1).get_vlabel() == at(3).get_vlabel()) {\n          eq_sets.add_equivalence(1, 3);\n          eq_sets.add_equivalence(3, 1);\n        }\n        if (at(0).get_vlabel() == at(2).get_vlabel()) {\n          eq_sets.add_equivalence(0, 2);\n          eq_sets.add_equivalence(2, 0);\n        }\n      } else\n        std::cout << \"Error\\n\";\n    } else if (at(2).get_his() == 1) {\n      if (at(3).get_his() == 0) {\n        if (at(2).get_vlabel() == at(3).get_vlabel()) {\n          eq_sets.add_equivalence(2, 3);\n          eq_sets.add_equivalence(3, 2);\n        }\n        if (at(0).get_vlabel() == at(1).get_vlabel()) {\n          eq_sets.add_equivalence(0, 1);\n          eq_sets.add_equivalence(1, 0);\n        }\n      } else if (at(3).get_his() == 1) {\n        if (at(0).get_vlabel() == at(2).get_vlabel()) {\n          eq_sets.add_equivalence(0, 2);\n          eq_sets.add_equivalence(2, 0);\n        }\n        if (at(0).get_vlabel() == at(3).get_vlabel()) {\n          eq_sets.add_equivalence(0, 3);\n          eq_sets.add_equivalence(3, 0);\n        }\n        if (at(2).get_vlabel() == at(3).get_vlabel()) {\n          eq_sets.add_equivalence(2, 3);\n          eq_sets.add_equivalence(3, 2);\n        }\n      } else if (at(3).get_his() == 2) {\n        if (at(0).get_vlabel() == at(3).get_vlabel()) {\n          eq_sets.add_equivalence(0, 3);\n          eq_sets.add_equivalence(3, 0);\n        }\n        if (at(1).get_vlabel() == at(2).get_vlabel()) {\n          eq_sets.add_equivalence(1, 2);\n          eq_sets.add_equivalence(2, 1);\n        }\n      } else\n        std::cout << \"Error\\n\";\n    } else\n      std::cout << \"Error\\n\";\n  } else { // four-edge and beyond\n    std::cout << \"Currently not supported\\n\";\n  }\n}\n\ntemplate <typename EmbTy, typename EleTy>\nstd::ostream& 
operator<<(std::ostream& strm,\n                         const QuickPattern<EmbTy, EleTy>& qp) {\n  if (qp.get_size() == 0) {\n    strm << \"(empty)\";\n    return strm;\n  }\n  strm << \"(\";\n  for (unsigned index = 0; index < qp.get_size() - 1; ++index)\n    strm << qp.elements[index] << \", \";\n  strm << qp.elements[qp.get_size() - 1];\n  strm << \")\";\n  return strm;\n}\n\ntemplate class QuickPattern<VertexEmbedding, SimpleElement>; // Motif\ntemplate class QuickPattern<EdgeInducedEmbedding<StructuralElement>,\n                            StructuralElement>; // Motif\ntemplate class QuickPattern<EdgeInducedEmbedding<LabeledElement>,\n                            LabeledElement>; // FSM\n"
  },
  {
    "path": "libpangolin/src/vertex_embedding.cpp",
    "content": "#include \"pangolin/vertex_embedding.h\"\n\nstd::ostream& operator<<(std::ostream& strm,\n                         const VertexInducedEmbedding& emb) {\n  if (emb.empty()) {\n    strm << \"(empty)\";\n    return strm;\n  }\n  std::cout << \"(\";\n  for (unsigned index = 0; index < emb.size() - 1; ++index)\n    std::cout << emb.get_vertex(index) << \", \";\n  std::cout << emb.get_vertex(emb.size() - 1);\n  std::cout << \") --> \" << emb.get_pid();\n  return strm;\n}\n"
  },
  {
    "path": "libpygalois/CMakeLists.txt",
    "content": "add_library(pygalois INTERFACE)\n\ntarget_include_directories(pygalois INTERFACE\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  $<INSTALL_INTERFACE:include>\n)\n\ninstall(TARGETS pygalois\n  EXPORT GaloisTargets\n  LIBRARY\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT shlib\n  ARCHIVE\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT lib\n  INCLUDES DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n)\n"
  },
  {
    "path": "libpygalois/include/galois/Constants.h",
    "content": "#include <iostream>\nnamespace galois {\nconstexpr uint32_t CHUNK_SIZE_64 = 64;\n\nclass UpdateRequestIndexer {\npublic:\n  uint32_t shift;\n\n  UpdateRequestIndexer(uint32_t _shift) : shift(_shift) {}\n  template <typename R>\n  unsigned int operator()(const R& req) const {\n    unsigned int t = req.dist >> shift;\n    return t;\n  }\n};\n\ntemplate <typename GNode, typename Dist>\nstruct UpdateRequest {\n  GNode src;\n  Dist dist;\n  UpdateRequest(const GNode& N, Dist W) : src(N), dist(W) {}\n  UpdateRequest() : src(), dist(0) {}\n\n  friend bool operator<(const UpdateRequest& left, const UpdateRequest& right) {\n    return left.dist == right.dist ? left.src < right.src\n                                   : left.dist < right.dist;\n  }\n};\n\nstruct ReqPushWrap {\n  template <typename C, typename GNode, typename Dist>\n  void operator()(C& cont, const GNode& n, const Dist& dist) const {\n    cont.push(UpdateRequest<GNode, Dist>(n, dist));\n  }\n};\n\n} // namespace galois\n"
  },
  {
    "path": "libsupport/CMakeLists.txt",
    "content": "add_library(galois_support STATIC)\nadd_library(Galois::support ALIAS galois_support)\nset_target_properties(galois_support PROPERTIES EXPORT_NAME support)\nadd_dependencies(lib galois_support)\n\nset(sources\n        src/GetEnv.cpp\n        src/Logging.cpp\n)\n\ntarget_sources(galois_support PRIVATE ${sources})\n\ntarget_include_directories(galois_support PUBLIC\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  $<INSTALL_INTERFACE:include>\n)\n\nfind_package(fmt REQUIRED)\nif (fmt_VERSION VERSION_LESS 4)\n  message(FATAL_ERROR \"fmt must be version 4 or higher. Found ${fmt_VERSION}.\")\nendif()\ntarget_link_libraries(galois_support fmt::fmt)\n\nadd_subdirectory(test)\n\ninstall(\n  DIRECTORY include/\n  DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n  COMPONENT dev\n  FILES_MATCHING PATTERN \"*.h\"\n)\n\ninstall(\n  TARGETS galois_support\n  EXPORT GaloisTargets\n  LIBRARY\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT shlib\n  ARCHIVE\n    DESTINATION \"${CMAKE_INSTALL_LIBDIR}\"\n    COMPONENT lib\n  INCLUDES DESTINATION \"${CMAKE_INSTALL_INCLUDEDIR}\"\n)\n"
  },
  {
    "path": "libsupport/include/galois/GetEnv.h",
    "content": "#ifndef GALOIS_LIBSUPPORT_GALOIS_GET_ENV_H_\n#define GALOIS_LIBSUPPORT_GALOIS_GET_ENV_H_\n\n#include <string>\n\nnamespace galois {\n\n/// Return true if the environment variable is set.\n///\n/// This function simply tests for the presence of an environment variable; in\n/// contrast, bool GetEnv(std::string, bool&) checks if the value of the\n/// environment variable matches common truthy and falsey values.\nbool GetEnv(const std::string& var_name);\n\n/// Return true if environment variable is set, and extract its value into\n/// ret_val parameter.\n///\n/// \\param var_name name of the variable\n/// \\param[out] ret_val where to store the value of environment variable\n/// \\return true if environment variable set and value was successfully parsed;\n///   false otherwise\nbool GetEnv(const std::string& var_name, bool* ret);\nbool GetEnv(const std::string& var_name, int* ret);\nbool GetEnv(const std::string& var_name, double* ret);\nbool GetEnv(const std::string& var_name, std::string* ret);\n\n} // end namespace galois\n\n#endif\n"
  },
  {
    "path": "libsupport/include/galois/Logging.h",
    "content": "#ifndef GALOIS_LIBSUPPORT_GALOIS_LOGGING_H_\n#define GALOIS_LIBSUPPORT_GALOIS_LOGGING_H_\n\n#include <fmt/format.h>\n#include <fmt/ostream.h>\n\n#include <sstream>\n#include <string>\n#include <system_error>\n\n// Small patch to work with libfmt 4.0, which is the version in Ubuntu 18.04.\n#ifndef FMT_STRING\n#define FMT_STRING(...) __VA_ARGS__\n#endif\n\n#if FMT_VERSION >= 60000\n/// Introduce std::error_code to the fmt library. Otherwise, they will be\n/// printed using ostream<< formatting (i.e., as an int).\ntemplate <>\nstruct fmt::formatter<std::error_code> : formatter<string_view> {\n\n  template <typename FormatterContext>\n  auto format(std::error_code c, FormatterContext& ctx) {\n    return formatter<string_view>::format(c.message(), ctx);\n  }\n};\n#endif\n\nnamespace galois {\n\nenum class LogLevel {\n  Debug   = 0,\n  Verbose = 1,\n  // Info = 2,  currently unused\n  Warning = 3,\n  Error   = 4,\n};\n\nnamespace internal {\n\nvoid LogString(LogLevel level, const std::string& s);\n\n}\n\n/// Log at a specific LogLevel.\n///\n/// \\tparam F         string-like type\n/// \\param fmt_string a C++20-style fmt string (e.g., \"hello {}\")\n/// \\param args       arguments to fmt interpolation\ntemplate <typename F, typename... Args>\nvoid Log(LogLevel level, F fmt_string, Args&&... args) {\n  std::string s = fmt::format(fmt_string, std::forward<Args>(args)...);\n  internal::LogString(level, s);\n}\n\n/// Log at a specific LogLevel with source code information.\n///\n/// \\tparam F         string-like type\n/// \\param file_name  file name\n/// \\param line_no    line number\n/// \\param fmt_string a C++20-style fmt string (e.g., \"hello {}\")\n/// \\param args       arguments to fmt interpolation\ntemplate <typename F, typename... Args>\nvoid LogLine(LogLevel level, const char* file_name, int line_no, F fmt_string,\n             Args&&... 
args) {\n  std::string s         = fmt::format(fmt_string, std::forward<Args>(args)...);\n  std::string with_line = fmt::format(\"{}:{}: {}\", file_name, line_no, s);\n  internal::LogString(level, with_line);\n}\n\n} // end namespace galois\n\n#define GALOIS_LOG_FATAL(fmt_string, ...)                                      \\\n  do {                                                                         \\\n    ::galois::LogLine(::galois::LogLevel::Error, __FILE__, __LINE__,           \\\n                      FMT_STRING(fmt_string), ##__VA_ARGS__);                  \\\n    ::std::abort();                                                            \\\n  } while (0)\n#define GALOIS_LOG_ERROR(fmt_string, ...)                                      \\\n  do {                                                                         \\\n    ::galois::LogLine(::galois::LogLevel::Error, __FILE__, __LINE__,           \\\n                      FMT_STRING(fmt_string), ##__VA_ARGS__);                  \\\n  } while (0)\n#define GALOIS_LOG_WARN(fmt_string, ...)                                       \\\n  do {                                                                         \\\n    ::galois::LogLine(::galois::LogLevel::Warning, __FILE__, __LINE__,         \\\n                      FMT_STRING(fmt_string), ##__VA_ARGS__);                  \\\n  } while (0)\n#define GALOIS_LOG_VERBOSE(fmt_string, ...)                                    \\\n  do {                                                                         \\\n    ::galois::LogLine(::galois::LogLevel::Verbose, __FILE__, __LINE__,         \\\n                      FMT_STRING(fmt_string), ##__VA_ARGS__);                  \\\n  } while (0)\n\n#ifndef NDEBUG\n#define GALOIS_LOG_DEBUG(fmt_string, ...)                                      
\\\n  do {                                                                         \\\n    ::galois::LogLine(::galois::LogLevel::Debug, __FILE__, __LINE__,           \\\n                      FMT_STRING(fmt_string), ##__VA_ARGS__);                  \\\n  } while (0)\n#else\n#define GALOIS_LOG_DEBUG(...)\n#endif\n\n#define GALOIS_LOG_ASSERT(cond)                                                \\\n  do {                                                                         \\\n    if (!(cond)) {                                                             \\\n      ::galois::LogLine(::galois::LogLevel::Error, __FILE__, __LINE__,         \\\n                        \"assertion not true: {}\", #cond);                      \\\n      ::std::abort();                                                          \\\n    }                                                                          \\\n  } while (0)\n\n#endif\n"
  },
  {
    "path": "libsupport/src/GetEnv.cpp",
    "content": "#include \"galois/GetEnv.h\"\n\n#include <cstdlib>\n#include <stdexcept>\n\nnamespace {\n\nbool Convert(const std::string& var_val, bool* ret) {\n  // TODO(ddn): strip whitespace, case-insensitive?\n  if (var_val == \"True\" || var_val == \"1\" || var_val == \"true\") {\n    *ret = true;\n    return true;\n  }\n\n  if (var_val == \"False\" || var_val == \"0\" || var_val == \"false\") {\n    *ret = false;\n    return true;\n  }\n\n  return false;\n}\n\nbool Convert(const std::string& var_val, int* ret) {\n  try {\n    *ret = std::stoi(var_val);\n  } catch (std::invalid_argument&) {\n    return false;\n  } catch (std::out_of_range&) {\n    return false;\n  }\n  return true;\n}\n\nbool Convert(const std::string& var_val, double* ret) {\n  try {\n    *ret = std::stod(var_val);\n  } catch (std::invalid_argument&) {\n    return false;\n  } catch (std::out_of_range&) {\n    return false;\n  }\n  return true;\n}\n\nbool Convert(const std::string& var_val, std::string* ret) {\n  *ret = var_val;\n  return true;\n}\n\ntemplate <typename T>\nbool GenericGetEnv(const std::string& var_name, T* ret) {\n  char* var_val = std::getenv(var_name.c_str());\n  if (!var_val) {\n    return false;\n  }\n  return Convert(var_val, ret);\n}\n\n} // namespace\n\nbool galois::GetEnv(const std::string& var_name, bool* ret) {\n  return GenericGetEnv(var_name, ret);\n}\n\nbool galois::GetEnv(const std::string& var_name, int* ret) {\n  return GenericGetEnv(var_name, ret);\n}\n\nbool galois::GetEnv(const std::string& var_name, std::string* ret) {\n  return GenericGetEnv(var_name, ret);\n}\n\nbool galois::GetEnv(const std::string& var_name, double* ret) {\n  return GenericGetEnv(var_name, ret);\n}\n\nbool galois::GetEnv(const std::string& var_name) {\n  return std::getenv(var_name.c_str()) != nullptr;\n}\n"
  },
  {
    "path": "libsupport/src/Logging.cpp",
    "content": "#include \"galois/Logging.h\"\n\n#include <iostream>\n#include <mutex>\n\n#include \"galois/GetEnv.h\"\n\nnamespace {\n\nvoid PrintString(bool error, bool flush, const std::string& prefix,\n                 const std::string& s) {\n  static std::mutex lock;\n  std::lock_guard<std::mutex> lg(lock);\n\n  std::ostream& o = error ? std::cerr : std::cout;\n  if (!prefix.empty()) {\n    o << prefix << \": \";\n  }\n  o << s << \"\\n\";\n  if (flush) {\n    o.flush();\n  }\n}\n\n} // end unnamed namespace\n\nvoid galois::internal::LogString(galois::LogLevel level, const std::string& s) {\n  switch (level) {\n  case LogLevel::Debug:\n    return PrintString(true, false, \"DEBUG\", s);\n  case LogLevel::Verbose:\n    if (galois::GetEnv(\"GALOIS_LOG_VERBOSE\")) {\n      return PrintString(true, false, \"VERBOSE\", s);\n    }\n    return;\n  case LogLevel::Warning:\n    return PrintString(true, false, \"WARNING\", s);\n  case LogLevel::Error:\n    return PrintString(true, false, \"ERROR\", s);\n  default:\n    std::abort();\n  }\n}\n"
  },
  {
    "path": "libsupport/test/CMakeLists.txt",
    "content": "function(add_test_unit name)\n  set(test_name unit-${name})\n\n  add_executable(${test_name} ${name}.cpp)\n  target_link_libraries(${test_name} galois_support)\n\n  set(command_line \"$<TARGET_FILE:${test_name}>\")\n\n  add_test(NAME ${test_name} COMMAND ${command_line})\n\n  # Allow parallel tests\n  set_tests_properties(${test_name}\n    PROPERTIES\n      ENVIRONMENT GALOIS_DO_NOT_BIND_THREADS=1\n      LABELS quick\n    )\nendfunction()\n\nadd_test_unit(getenv)\nadd_test_unit(logging)\n"
  },
  {
    "path": "libsupport/test/getenv.cpp",
    "content": "#include \"galois/GetEnv.h\"\n#include \"galois/Logging.h\"\n\nint main() {\n  GALOIS_LOG_ASSERT(galois::GetEnv(\"PATH\"));\n\n  std::string s;\n  GALOIS_LOG_ASSERT(galois::GetEnv(\"PATH\", &s));\n\n  int i{};\n  GALOIS_LOG_ASSERT(!galois::GetEnv(\"PATH\", &i));\n\n  double d{};\n  GALOIS_LOG_ASSERT(!galois::GetEnv(\"PATH\", &d));\n\n  bool b{};\n  GALOIS_LOG_ASSERT(!galois::GetEnv(\"PATH\", &b));\n\n  return 0;\n}\n"
  },
  {
    "path": "libsupport/test/logging.cpp",
    "content": "#include \"galois/Logging.h\"\n\n#include <system_error>\n\nint main() {\n  GALOIS_LOG_ERROR(\"string\");\n  GALOIS_LOG_ERROR(\"format string: {}\", 42);\n  GALOIS_LOG_ERROR(\"format string: {:d}\", 42);\n  // The following correctly fails with a compile time error\n  // GALOIS_LOG_ERROR(\"basic format string {:s}\", 42);\n  GALOIS_LOG_WARN(\"format number: {:.2f}\", 2.0 / 3.0);\n  GALOIS_LOG_WARN(\"format error code: {}\",\n                  std::make_error_code(std::errc::invalid_argument));\n  GALOIS_LOG_VERBOSE(\n      \"will be printed when environment variable GALOIS_LOG_VERBOSE=1\");\n  GALOIS_LOG_DEBUG(\"this will only be printed in debug builds\");\n  GALOIS_LOG_ASSERT(1 == 1);\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/CMakeLists.txt",
    "content": "function(add_test_scale type app)\n  set(options NOT_QUICK)\n  set(one_value_args)\n  set(multi_value_args REQUIRES COMMAND_PREFIX)\n  cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n\n  set(threads)\n  set(thr \"${GALOIS_NUM_TEST_THREADS}\")\n  while (${thr} GREATER 1)\n    list(APPEND threads ${thr})\n    math(EXPR thr \"${thr} / 2\")\n  endwhile()\n  list(APPEND threads \"1\")\n\n  foreach (thr ${threads})\n    set(name run-${type}-${app}-${thr})\n    add_test(NAME ${name} COMMAND ${app} ${X_UNPARSED_ARGUMENTS} -t ${thr})\n    if (NOT ${X_NOT_QUICK})\n      # Allow parallel tests\n      set_tests_properties(${name}\n        PROPERTIES ENVIRONMENT GALOIS_DO_NOT_BIND_THREADS=1 LABELS quick)\n    endif()\n  endforeach()\nendfunction(add_test_scale)\n\nfunction(app_analy_gpu name target_name)\n  set(options NO_GPU)\n  set(one_value_args)\n  set(multi_value_args)\n  cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n  string(CONCAT target_name ${target_name} \"-gpu\")\n  add_executable(${target_name} ${name}.cu support.cu)\n  install(TARGETS ${target_name} DESTINATION \"${CMAKE_INSTALL_BINDIR}\" EXCLUDE_FROM_ALL)\n  if(GALOIS_ENABLE_GPU AND NOT ${X_NO_GPU})\n    target_compile_options(${target_name} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-w>)\n    target_link_libraries(${target_name} Galois::gpu)\n    set_property(TARGET ${target_name} PROPERTY CUDA_STANDARD 14)\n  endif()\nendfunction()\n\nfunction(add_test_gpu app input output baseoutputext)\n  set(options NOT_QUICK)\n  set(one_value_args)\n  set(multi_value_args REQUIRES COMMAND_PREFIX)\n  set(RESULT_CHECKER ${PROJECT_SOURCE_DIR}/scripts/result_checker.py)\n  set(suffix \"-${app}-${input}\")\n  cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n\n  string(REPLACE \"_\" \";\" app_list ${app})\n  list(GET app_list 0 app_id)\n  set(baseoutput 
${BASEOUTPUT}/${input}.${baseoutputext})\n\n  set(name run-${app}-${input})\n  if (EXISTS ${baseoutput})\n     add_test(NAME ${name} COMMAND ${app}-gpu ${X_UNPARSED_ARGUMENTS})\n     add_test(verify${suffix} python ${RESULT_CHECKER} -t=0.01 -sort=1 -delete=1 ${baseoutput} ${output})\n  else()\n     add_test(NAME ${name} COMMAND ${app}-gpu ${X_UNPARSED_ARGUMENTS})\n  endif()\nendfunction(add_test_gpu)\n\n\nif(GALOIS_ENABLE_DIST)\n  add_subdirectory(libdistbench)\n\n  if(GALOIS_ENABLE_GPU)\n    # turn on cuda for distbench as well\n    target_compile_definitions(distbench PRIVATE GALOIS_ENABLE_GPU=1)\n\n    # for debugging\n    add_definitions(-DGALOIS_CUDA_CHECK_ERROR)\n    if(CMAKE_BUILD_TYPE MATCHES \"Debug\")\n      add_compile_options(\"$<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>\")\n    endif()\n  endif()\n\n  function(app_dist name target_name)\n    set(options NO_GPU)\n    set(one_value_args)\n    set(multi_value_args)\n    cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n    string(CONCAT target_name ${target_name} \"-dist\")\n\n    FILE(GLOB CPPSOURCES ${name}*.cpp)\n    add_executable(${target_name} ${CPPSOURCES})\n    add_dependencies(apps ${target_name})\n    target_link_libraries(${target_name} Galois::shmem LLVMSupport)\n    install(TARGETS ${target_name} DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\n    target_link_libraries(${target_name} distbench)\n    if(GALOIS_PER_ROUND_STATS)\n      target_compile_definitions(${target_name} PRIVATE GALOIS_PER_ROUND_STATS=1)\n    endif()\n    if(GALOIS_COMM_STATS)\n      target_compile_definitions(${target_name} PRIVATE GALOIS_COMM_STATS=1)\n    endif()\n    if(GALOIS_USE_BARE_MPI)\n      target_compile_definitions(${target_name} PRIVATE GALOIS_USE_BARE_MPI=1)\n    endif()\n\n    if(GALOIS_ENABLE_GPU AND NOT ${X_NO_GPU})\n      target_compile_definitions(${target_name} PRIVATE GALOIS_ENABLE_GPU=1)\n      target_link_libraries(${target_name} 
${target_name}_cuda)\n\n      FILE(GLOB CUSOURCES ${name}*.cu)\n      add_library(${target_name}_cuda ${CUSOURCES})\n      target_compile_options(${target_name}_cuda PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-w>)\n      target_link_libraries(${target_name}_cuda Galois::gpu)\n      set_property(TARGET ${target_name}_cuda PROPERTY CUDA_STANDARD 14)\n    endif()\n  endfunction()\n\n  set(RESULT_CHECKER ${PROJECT_SOURCE_DIR}/scripts/result_checker.py)\n  cmake_host_system_information(RESULT HOSTNAME QUERY HOSTNAME)\n  file(MAKE_DIRECTORY \"${CMAKE_BINARY_DIR}/output\")\n\n  function(add_test_dist_and_verify app input type part N np)\n    set(options GPU NOT_QUICK)\n    set(one_value_args)\n    set(multi_value_args)\n    cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n\n    math(EXPR t \"(${N} / ${np})\")\n    string(REPLACE \"_\" \";\" app_list ${app})\n    list(GET app_list 0 app_id)\n    set(output ${BASEOUTPUT}/${input}.${app_id})\n\n    set(suffix \"-${app}-${type}-${input}-${part}-${np}\")\n    if (EXISTS ${output})\n      add_test(run${suffix} mpiexec --bind-to none -n ${np} ./${app} ${X_UNPARSED_ARGUMENTS} -t=${t} -partition=${part} -output -outputLocation=${CMAKE_BINARY_DIR}/output)\n      add_test(verify${suffix} python ${RESULT_CHECKER} -t=0.01 -sort=1 -delete=1 ${output} ${CMAKE_BINARY_DIR}/output/*)\n    else()\n      add_test(run-${app}-${type}-${input}-${part}-${np} mpiexec --bind-to none -n ${np} ./${app} ${X_UNPARSED_ARGUMENTS} -t=${t} -partition=${part})\n    endif()\n\n    if (${X_GPU})\n      set_tests_properties(run${suffix} PROPERTIES RUN_SERIAL true)\n    endif()\n\n    if (NOT ${X_NOT_QUICK})\n      set_tests_properties(run${suffix}\n        PROPERTIES ENVIRONMENT GALOIS_DO_NOT_BIND_THREADS=1 LABELS quick)\n    endif()\n  endfunction()\n\n  function(add_test_dist_for_partitions app input type num_threads num_gpus part)\n    # cut threads in system in half first\n    if (${num_threads} GREATER 1)\n      
math(EXPR num_threads \"${num_threads} / 2\")\n    endif()\n\n    # spawn at most 8 processes/use at most 8 threads during testing\n    if (${num_threads} GREATER 8)\n      set(num_threads 8)\n    endif()\n\n    set(partitions ${num_threads})\n    set(thr ${num_threads})\n    while (${thr} GREATER 1)\n      math(EXPR thr \"${thr} / 2\")\n      list(APPEND partitions ${thr})\n    endwhile()\n    list(REVERSE partitions)\n\n    foreach(np ${partitions})\n      if (np GREATER 1)\n        add_test_dist_and_verify(${app} ${input} ${type}-cpu ${part} ${num_threads} 1 ${ARGN})\n      endif()\n      add_test_dist_and_verify(${app} ${input} ${type}-cpu ${part} ${num_threads} ${np} ${ARGN})\n    endforeach()\n\n    if (NOT GALOIS_ENABLE_GPU)\n      return()\n    endif()\n\n    if (num_gpus LESS_EQUAL 0)\n      return()\n    endif()\n\n    if(num_gpus GREATER_EQUAL num_threads)\n      message(FATAL_ERROR \"number of test gpus (${num_gpus}) should be less than number of test threads (${num_threads})\")\n    endif()\n\n    set(PSET \"-pset=\")\n    foreach(np RANGE 1 ${num_gpus})\n      set(PSET \"${PSET}g\")\n      add_test_dist_and_verify(${app} ${input} ${type}-gpu ${part} ${num_threads} ${np} GPU ${ARGN} -num_nodes=1 ${PSET})\n    endforeach(np)\n    set(PSET \"${PSET}c\")\n    math(EXPR np \"(${G} + 1)\")\n    add_test_dist_and_verify(${app} ${input} ${type}-cpugpu ${part} ${num_threads} ${np} GPU ${ARGN} -num_nodes=1 ${PSET} -scalegpu=3)\n  endfunction()\n\n  function(add_test_dist app input)\n    set(options NO_GPU NO_ASYNC)\n    set(one_value_args)\n    set(multi_value_args)\n    cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n\n    set(num_gpus ${GALOIS_NUM_TEST_GPUS})\n    if (${X_NO_GPU})\n      set(num_gpus 0)\n    endif()\n\n    set(num_threads ${GALOIS_NUM_TEST_THREADS})\n\n    foreach (part oec iec cvc cvc-iec hovc hivc)\n      if (NOT ${X_NO_ASYNC})\n        add_test_dist_for_partitions(${app} ${input} sync 
${num_threads} ${num_gpus} ${part} ${X_UNPARSED_ARGUMENTS} -exec=Sync)\n        add_test_dist_for_partitions(${app} ${input} async ${num_threads} ${num_gpus} ${part} ${X_UNPARSED_ARGUMENTS} -exec=Async)\n      else()\n        add_test_dist_for_partitions(${app} ${input} sync ${num_threads} ${num_gpus} ${part} ${X_UNPARSED_ARGUMENTS})\n      endif()\n    endforeach()\n  endfunction()\nendif()\n\nadd_subdirectory(liblonestar)\n\nadd_subdirectory(tutorial_examples)\n\nadd_subdirectory(analytics)\nadd_subdirectory(eda)\nadd_subdirectory(mining)\nadd_subdirectory(scientific)\n"
  },
  {
    "path": "lonestar/analytics/CMakeLists.txt",
    "content": "add_subdirectory(cpu)\n\nif(GALOIS_ENABLE_DIST)\n  add_subdirectory(distributed)\nendif()\n\nif(GALOIS_ENABLE_GPU)\n  add_subdirectory(gpu)\nendif()\n"
  },
  {
    "path": "lonestar/analytics/cpu/CMakeLists.txt",
    "content": "add_subdirectory(betweennesscentrality)\nadd_subdirectory(bfs)\nadd_subdirectory(bipart)\nadd_subdirectory(spanningtree)\nadd_subdirectory(clustering)\nadd_subdirectory(connected-components)\nadd_subdirectory(gmetis)\nadd_subdirectory(independentset)\nadd_subdirectory(k-core)\nadd_subdirectory(k-truss)\nadd_subdirectory(matching)\nadd_subdirectory(matrixcompletion)\nadd_subdirectory(pagerank)\nadd_subdirectory(pointstoanalysis)\nadd_subdirectory(preflowpush)\nadd_subdirectory(sssp)\nadd_subdirectory(triangle-counting)\n"
  },
  {
    "path": "lonestar/analytics/cpu/betweennesscentrality/AsyncStructs.h",
    "content": "#ifndef GALOIS_BC_ASYNC\n#define GALOIS_BC_ASYNC\n\n#include \"BCNode.h\"\n#include \"BCEdge.h\"\n#include \"galois/Bag.h\"\n#include \"galois/graphs/BufferedGraph.h\"\n#include \"galois/graphs/LC_CSR_CSC_Graph.h\"\n#include <iomanip>\n\n// WARNING: optimal chunk size may differ depending on input graph\nconstexpr static const unsigned ASYNC_CHUNK_SIZE = 64U;\nusing NodeType = BCNode<BC_USE_MARKING, BC_CONCURRENT>;\nusing AsyncGraph =\n    galois::graphs::LC_CSR_CSC_Graph<NodeType, BCEdge, false, true>;\n\n// Work items for the forward phase\nstruct ForwardPhaseWorkItem {\n  uint32_t nodeID;\n  uint32_t distance;\n  ForwardPhaseWorkItem() : nodeID(infinity), distance(infinity){};\n  ForwardPhaseWorkItem(uint32_t _n, uint32_t _d) : nodeID(_n), distance(_d){};\n};\n\n// grabs distance from a forward phase work item\nstruct FPWorkItemIndexer {\n  uint32_t operator()(const ForwardPhaseWorkItem& it) const {\n    return it.distance;\n  }\n};\n\n// obim worklist type declaration\nnamespace gwl = galois::worklists;\nusing PSchunk = gwl::PerSocketChunkFIFO<ASYNC_CHUNK_SIZE>;\nusing OBIM    = gwl::OrderedByIntegerMetric<FPWorkItemIndexer, PSchunk>;\n\ntemplate <typename T, bool enable>\nstruct Counter : public T {\n  std::string name;\n\n  Counter(std::string s) : name(std::move(s)) {}\n\n  ~Counter() {\n    galois::runtime::reportStat_Single(\"(NULL)\", name, this->reduce());\n  }\n};\n\ntemplate <typename T>\nstruct Counter<T, false> {\n  Counter(std::string) {}\n\n  template <typename... Args>\n  void update(Args...) 
{}\n};\n\nstruct BetweenessCentralityAsync {\n  AsyncGraph& graph;\n\n  BetweenessCentralityAsync(AsyncGraph& _graph) : graph(_graph) {}\n\n  using SumCounter =\n      Counter<galois::GAccumulator<unsigned long>, BC_COUNT_ACTIONS>;\n  SumCounter spfuCount{\"SP&FU\"};\n  SumCounter updateSigmaP1Count{\"UpdateSigmaBefore\"};\n  SumCounter updateSigmaP2Count{\"RealUS\"};\n  SumCounter firstUpdateCount{\"First Update\"};\n  SumCounter correctNodeP1Count{\"CorrectNodeBefore\"};\n  SumCounter correctNodeP2Count{\"Real CN\"};\n  SumCounter noActionCount{\"NoAction\"};\n\n  using MaxCounter =\n      Counter<galois::GReduceMax<unsigned long>, BC_COUNT_ACTIONS>;\n  MaxCounter largestNodeDist{\"Largest node distance\"};\n\n  using LeafCounter =\n      Counter<galois::GAccumulator<unsigned long>, BC_COUNT_LEAVES>;\n\n  void correctNode(uint32_t dstID, BCEdge&) {\n    NodeType& dstData = graph.getData(dstID);\n\n    // loop through in edges\n    for (auto e : graph.in_edges(dstID)) {\n      BCEdge& inEdgeData = graph.getInEdgeData(e);\n\n      uint32_t srcID = graph.getInEdgeDst(e);\n      if (srcID == dstID)\n        continue;\n\n      NodeType& srcData = graph.getData(srcID);\n\n      // lock in right order\n      if (srcID < dstID) {\n        srcData.lock();\n        dstData.lock();\n      } else {\n        dstData.lock();\n        srcData.lock();\n      }\n\n      const unsigned edgeLevel = inEdgeData.level;\n\n      // Correct Node\n      if (srcData.distance >= dstData.distance) {\n        correctNodeP1Count.update(1);\n        dstData.unlock();\n\n        if (edgeLevel != infinity) {\n          inEdgeData.level = infinity;\n          if (edgeLevel == srcData.distance) {\n            correctNodeP2Count.update(1);\n            srcData.nsuccs--;\n          }\n        }\n        srcData.unlock();\n      } else {\n        srcData.unlock();\n        dstData.unlock();\n      }\n    }\n  }\n\n  template <typename CTXType>\n  void spAndFU(uint32_t srcID, uint32_t dstID, BCEdge& 
ed, CTXType& ctx) {\n    spfuCount.update(1);\n\n    NodeType& srcData = graph.getData(srcID);\n    NodeType& dstData = graph.getData(dstID);\n\n    // make dst a successor of src, src predecessor of dst\n    srcData.nsuccs++;\n    const ShortPathType srcSigma = srcData.sigma;\n    assert(srcSigma > 0);\n    NodeType::predTY& dstPreds = dstData.preds;\n    bool dstPredsNotEmpty      = !dstPreds.empty();\n    dstPreds.clear();\n    dstPreds.push_back(srcID);\n    dstData.distance = srcData.distance + 1;\n\n    largestNodeDist.update(dstData.distance);\n\n    dstData.nsuccs = 0;        // SP\n    dstData.sigma  = srcSigma; // FU\n    ed.val         = srcSigma;\n    ed.level       = srcData.distance;\n    srcData.unlock();\n    if (!dstData.isAlreadyIn())\n      ctx.push(ForwardPhaseWorkItem(dstID, dstData.distance));\n    dstData.unlock();\n    if (dstPredsNotEmpty) {\n      correctNode(dstID, ed);\n    }\n  }\n\n  template <typename CTXType>\n  void updateSigma(uint32_t srcID, uint32_t dstID, BCEdge& ed, CTXType& ctx) {\n    updateSigmaP1Count.update(1);\n\n    NodeType& srcData = graph.getData(srcID);\n    NodeType& dstData = graph.getData(dstID);\n\n    const ShortPathType srcSigma = srcData.sigma;\n    const ShortPathType eval     = ed.val;\n    const ShortPathType diff     = srcSigma - eval;\n\n    srcData.unlock();\n    // greater than 0.0001 instead of 0 due to floating point imprecision\n    if (diff > 0.0001) {\n      updateSigmaP2Count.update(1);\n      ed.val = srcSigma;\n\n      // ShortPathType old = dstData.sigma;\n      dstData.sigma += diff;\n\n      // if (old >= dstData.sigma) {\n      //  galois::gDebug(\"Overflow detected; capping at max uint64_t\");\n      //  dstData.sigma = std::numeric_limits<uint64_t>::max();\n      //}\n\n      int nbsuccs = dstData.nsuccs;\n\n      if (nbsuccs > 0) {\n        if (!dstData.isAlreadyIn())\n          ctx.push(ForwardPhaseWorkItem(dstID, dstData.distance));\n      }\n      dstData.unlock();\n    } else {\n      
dstData.unlock();\n    }\n  }\n\n  template <typename CTXType>\n  void firstUpdate(uint32_t srcID, uint32_t dstID, BCEdge& ed, CTXType& ctx) {\n    firstUpdateCount.update(1);\n\n    NodeType& srcData = graph.getData(srcID);\n    srcData.nsuccs++;\n    const ShortPathType srcSigma = srcData.sigma;\n\n    NodeType& dstData = graph.getData(dstID);\n    dstData.preds.push_back(srcID);\n\n    const ShortPathType dstSigma = dstData.sigma;\n\n    // ShortPathType old = dstData.sigma;\n    dstData.sigma = dstSigma + srcSigma;\n    // if (old >= dstData.sigma) {\n    //  galois::gDebug(\"Overflow detected; capping at max uint64_t\");\n    //  dstData.sigma = std::numeric_limits<uint64_t>::max();\n    //}\n\n    ed.val   = srcSigma;\n    ed.level = srcData.distance;\n    srcData.unlock();\n    int nbsuccs = dstData.nsuccs;\n    if (nbsuccs > 0) {\n      if (!dstData.isAlreadyIn())\n        ctx.push(ForwardPhaseWorkItem(dstID, dstData.distance));\n    }\n    dstData.unlock();\n  }\n\n  void dagConstruction(galois::InsertBag<ForwardPhaseWorkItem>& wl) {\n    galois::for_each(\n        galois::iterate(wl),\n        [&](ForwardPhaseWorkItem& wi, auto& ctx) {\n          uint32_t srcID    = wi.nodeID;\n          NodeType& srcData = graph.getData(srcID);\n          srcData.markOut();\n\n          // loop through all edges\n          for (auto e : graph.edges(srcID)) {\n            BCEdge& edgeData  = graph.getEdgeData(e);\n            uint32_t dstID    = graph.getEdgeDst(e);\n            NodeType& dstData = graph.getData(dstID);\n\n            if (srcID == dstID)\n              continue; // ignore self loops\n\n            // lock in set order to prevent deadlock (lower id\n            // first)\n            // TODO run even in serial version; find way to not\n            // need to run\n            if (srcID < dstID) {\n              srcData.lock();\n              dstData.lock();\n            } else {\n              dstData.lock();\n              srcData.lock();\n            
}\n\n            const int elevel = edgeData.level;\n            const int ADist  = srcData.distance;\n            const int BDist  = dstData.distance;\n\n            if (BDist - ADist > 1) {\n              // Shortest Path + First Update (and Correct Node)\n              this->spAndFU(srcID, dstID, edgeData, ctx);\n            } else if (elevel == ADist && BDist == ADist + 1) {\n              // Update Sigma\n              this->updateSigma(srcID, dstID, edgeData, ctx);\n            } else if (BDist == ADist + 1 && elevel != ADist) {\n              // First Update not combined with Shortest Path\n              this->firstUpdate(srcID, dstID, edgeData, ctx);\n            } else { // No Action\n              noActionCount.update(1);\n              srcData.unlock();\n              dstData.unlock();\n            }\n          }\n        },\n        galois::wl<OBIM>(FPWorkItemIndexer()),\n        galois::disable_conflict_detection(), galois::loopname(\"ForwardPhase\"));\n  }\n\n  void dependencyBackProp(galois::InsertBag<uint32_t>& wl) {\n    galois::for_each(\n        galois::iterate(wl),\n        [&](uint32_t srcID, auto& ctx) {\n          NodeType& srcData = graph.getData(srcID);\n          srcData.lock();\n\n          if (srcData.nsuccs == 0) {\n            const double srcDelta = srcData.delta;\n            srcData.bc += srcDelta;\n\n            srcData.unlock();\n\n            NodeType::predTY& srcPreds = srcData.preds;\n\n            // loop through src's predecessors\n            for (unsigned i = 0; i < srcPreds.size(); i++) {\n              uint32_t predID    = srcPreds[i];\n              NodeType& predData = graph.getData(predID);\n\n              assert(srcData.sigma >= 1);\n              const double term =\n                  (double)predData.sigma * (1.0 + srcDelta) / srcData.sigma;\n              // if (std::isnan(term)) {\n              //  galois::gPrint(predData.sigma, \" \", srcDelta, \"\n              //  \", srcData.sigma, \"\\n\");\n              
//}\n              predData.lock();\n              predData.delta += term;\n              const unsigned prevPdNsuccs = predData.nsuccs;\n              predData.nsuccs--;\n\n              if (prevPdNsuccs == 1) {\n                predData.unlock();\n                ctx.push(predID);\n              } else {\n                predData.unlock();\n              }\n            }\n\n            // reset data in preparation for next source\n            srcData.reset();\n            for (auto e : graph.edges(srcID)) {\n              graph.getEdgeData(e).reset();\n            }\n          } else {\n            srcData.unlock();\n          }\n        },\n        galois::disable_conflict_detection(),\n        galois::loopname(\"BackwardPhase\"));\n  }\n\n  void findLeaves(galois::InsertBag<uint32_t>& fringeWL, unsigned nnodes) {\n    LeafCounter leafCount{\"leaf nodes in DAG\"};\n    galois::do_all(\n        galois::iterate(0u, nnodes),\n        [&](auto i) {\n          NodeType& n = graph.getData(i);\n\n          if (n.nsuccs == 0 && n.distance < infinity) {\n            leafCount.update(1);\n            fringeWL.push(i);\n          }\n        },\n        galois::loopname(\"LeafFind\"));\n  }\n};\n\nvoid AsyncSanity(AsyncGraph& graph) {\n  galois::GReduceMax<float> accumMax;\n  galois::GReduceMin<float> accumMin;\n  galois::GAccumulator<float> accumSum;\n  accumMax.reset();\n  accumMin.reset();\n  accumSum.reset();\n\n  // get max, min, sum of BC values using accumulators and reducers\n  galois::do_all(\n      galois::iterate(graph),\n      [&](unsigned n) {\n        auto& nodeData = graph.getData(n);\n        accumMax.update(nodeData.bc);\n        accumMin.update(nodeData.bc);\n        accumSum += nodeData.bc;\n      },\n      galois::no_stats(), galois::loopname(\"AsyncSanity\"));\n\n  galois::gPrint(\"Max BC is \", accumMax.reduce(), \"\\n\");\n  galois::gPrint(\"Min BC is \", accumMin.reduce(), \"\\n\");\n  galois::gPrint(\"BC sum is \", accumSum.reduce(), 
\"\\n\");\n}\n////////////////////////////////////////////////////////////////////////////////\n\n//! runs asynchronous BC\nvoid doAsyncBC() {\n  if (BC_CONCURRENT) {\n    galois::gInfo(\"Running in concurrent mode with \", numThreads, \" threads\");\n  } else {\n    galois::gInfo(\"Running in serial mode\");\n  }\n\n  galois::gInfo(\"Constructing async BC graph\");\n  // create bidirectional graph\n  AsyncGraph bcGraph;\n\n  galois::StatTimer graphConstructTimer(\"GRAPH_CONSTRUCT\");\n  graphConstructTimer.start();\n\n  galois::graphs::FileGraph fileReader;\n  fileReader.fromFile(inputFile);\n  bcGraph.allocateFrom(fileReader.size(), fileReader.sizeEdges());\n  bcGraph.constructNodes();\n\n  galois::do_all(galois::iterate(fileReader), [&](uint32_t i) {\n    auto b = fileReader.edge_begin(i);\n    auto e = fileReader.edge_end(i);\n\n    bcGraph.fixEndEdge(i, *e);\n\n    while (b < e) {\n      bcGraph.constructEdge(*b, fileReader.getEdgeDst(*b));\n      b++;\n    }\n  });\n  bcGraph.constructIncomingEdges();\n\n  graphConstructTimer.stop();\n\n  BetweenessCentralityAsync bcExecutor(bcGraph);\n\n  unsigned nnodes = bcGraph.size();\n  uint64_t nedges = bcGraph.sizeEdges();\n  galois::gInfo(\"Num nodes is \", nnodes, \", num edges is \", nedges);\n  galois::gInfo(\"Using OBIM chunk size: \", ASYNC_CHUNK_SIZE);\n  galois::gInfo(\"Note that optimal chunk size may differ depending on input \"\n                \"graph\");\n  galois::runtime::reportStat_Single(\"BCAsync\", \"ChunkSize\", ASYNC_CHUNK_SIZE);\n\n  galois::reportPageAlloc(\"MemAllocPre\");\n  galois::gInfo(\"Going to pre-allocate pages\");\n  galois::preAlloc(\n      std::min(static_cast<uint64_t>(\n                   std::min(galois::getActiveThreads(), 100U) *\n                   std::max((nnodes / 4500000), unsigned{5}) *\n                   std::max((nedges / 30000000), uint64_t{5}) * 2.5),\n               uint64_t{1500}) +\n      5);\n  galois::gInfo(\"Pre-allocation complete\");\n  
galois::reportPageAlloc(\"MemAllocMid\");\n\n  // reset everything in preparation for run\n  galois::do_all(galois::iterate(0u, nnodes),\n                 [&](auto i) { bcGraph.getData(i).reset(); });\n  galois::do_all(galois::iterate(UINT64_C(0), nedges),\n                 [&](auto i) { bcGraph.getEdgeData(i).reset(); });\n\n  // reading in list of sources to operate on if provided\n  std::ifstream sourceFile;\n  std::vector<uint64_t> sourceVector;\n  if (sourcesToUse != \"\") {\n    sourceFile.open(sourcesToUse);\n    std::vector<uint64_t> t(std::istream_iterator<uint64_t>{sourceFile},\n                            std::istream_iterator<uint64_t>{});\n    sourceVector = t;\n    sourceFile.close();\n  }\n\n  if (numOfSources == 0) {\n    numOfSources = nnodes;\n  }\n\n  // if user does specifes a certain number of out sources (i.e. only sources\n  // with outgoing edges), we need to loop over the entire node set to look for\n  // good sources to use\n  uint32_t goodSource = 0;\n  if (iterLimit != 0) {\n    numOfSources = nnodes;\n  }\n\n  // only use at most the number of sources in the passed in source file (if\n  // such a file was actually pass in)\n  if (sourceVector.size() != 0) {\n    if (numOfSources > sourceVector.size()) {\n      numOfSources = sourceVector.size();\n    }\n  }\n\n  galois::InsertBag<ForwardPhaseWorkItem> forwardPhaseWL;\n  galois::InsertBag<uint32_t> backwardPhaseWL;\n\n  galois::gInfo(\"Beginning execution\");\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  for (uint32_t i = 0; i < numOfSources; ++i) {\n    uint32_t sourceToUse = i;\n    if (sourceVector.size() != 0) {\n      sourceToUse = sourceVector[i];\n    }\n\n    // ignore nodes with no neighbors\n    if (!std::distance(bcGraph.edge_begin(sourceToUse),\n                       bcGraph.edge_end(sourceToUse))) {\n      galois::gDebug(sourceToUse, \" has no outgoing edges\");\n      continue;\n    }\n\n    forwardPhaseWL.push_back(ForwardPhaseWorkItem(sourceToUse, 
0));\n    NodeType& active = bcGraph.getData(sourceToUse);\n    active.initAsSource();\n    galois::gDebug(\"Source is \", sourceToUse);\n\n    bcExecutor.dagConstruction(forwardPhaseWL);\n    forwardPhaseWL.clear();\n\n    bcExecutor.findLeaves(backwardPhaseWL, nnodes);\n\n    double backupSrcBC = active.bc;\n    bcExecutor.dependencyBackProp(backwardPhaseWL);\n\n    active.bc = backupSrcBC; // current source BC should not get updated\n\n    backwardPhaseWL.clear();\n\n    // break out once number of sources user specified to do (if any) has been\n    // reached\n    goodSource++;\n    if (iterLimit != 0 && goodSource >= iterLimit)\n      break;\n  }\n  execTime.stop();\n\n  galois::gInfo(\"Number of sources with outgoing edges was \", goodSource);\n\n  galois::reportPageAlloc(\"MemAllocPost\");\n\n  // sanity\n  AsyncSanity(bcGraph);\n\n  // prints out first 10 node BC values\n  if (!skipVerify) {\n    int count = 0;\n    for (unsigned i = 0; i < nnodes && count < 10; ++i, ++count) {\n      galois::gPrint(count, \": \", std::setiosflags(std::ios::fixed),\n                     std::setprecision(6), bcGraph.getData(i).bc, \"\\n\");\n    }\n  }\n\n  if (output) {\n    std::cerr << \"Writting out bc values...\\n\";\n    std::stringstream outfname;\n    outfname << \"certificate\"\n             << \"_\" << numThreads << \".txt\";\n    std::string fname = outfname.str();\n    std::ofstream outfile(fname.c_str());\n    for (unsigned i = 0; i < nnodes; ++i) {\n      outfile << i << \" \" << std::setiosflags(std::ios::fixed)\n              << std::setprecision(9) << bcGraph.getData(i).bc << \"\\n\";\n    }\n    outfile.close();\n  }\n}\n#endif\n"
  },
  {
    "path": "lonestar/analytics/cpu/betweennesscentrality/BCEdge.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef _ED_H_\n#define _ED_H_\n\n#include \"BCNode.h\"\n#include \"control.h\"\n\nstruct BCEdge {\n  using NodeType = BCNode<BC_USE_MARKING, BC_CONCURRENT>;\n  NodeType* src;\n  NodeType* dst;\n  ShortPathType val;\n  unsigned level;\n\n  BCEdge(NodeType* _src, NodeType* _dst)\n      : src(_src), dst(_dst), val(0), level(infinity) {}\n  BCEdge() : src(0), dst(0), val(0), level(infinity) {}\n\n  BCEdge& operator=(BCEdge const& from) {\n    if (this != &from) {\n      src   = from.src;\n      dst   = from.dst;\n      val   = from.val;\n      level = from.level;\n    }\n    return *this;\n  }\n\n  inline void reset() {\n    if (level != infinity) {\n      level = infinity;\n    }\n  }\n\n  void checkClear(int j) {\n    if (level != infinity) {\n      galois::gError(j, \" PROBLEM WITH LEVEL OF \", toString());\n    }\n   
 if (val != 0) {\n      galois::gError(j, \" PROBLEM WITH VAL OF \", toString());\n    }\n  }\n\n  /**\n   * TODO actually implement this if needed\n   */\n  // char isAlreadyIn() {\n  //  return 0;\n  //}\n\n  std::string toString() const {\n    std::ostringstream s;\n    s << val << \" \" << level;\n    return s.str();\n  }\n};\n#endif\n"
  },
  {
    "path": "lonestar/analytics/cpu/betweennesscentrality/BCNode.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef _BCNODE_H_\n#define _BCNODE_H_\n\n#include \"control.h\"\n\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/gstl.h\"\n\n#include <vector>\n#include <string>\n#include <sstream>\n#include <algorithm>\n#include <limits>\n\ntemplate <bool UseMarking = false, bool Concurrent = true>\nstruct BCNode {\n  using LockType =\n      typename std::conditional<Concurrent, galois::substrate::SimpleLock,\n                                char>::type;\n  LockType spinLock;\n\n  using predTY = galois::gstl::Vector<uint32_t>;\n  predTY preds;\n\n  unsigned distance;\n  unsigned nsuccs;\n\n  ShortPathType sigma;\n  double delta;\n  double bc;\n  int mark;\n\n  BCNode()\n      : spinLock(), preds(), distance(infinity), nsuccs(0), sigma(0), delta(0),\n        bc(0), mark(0) {}\n\n  /**\n   * @param a Node to check if 
predecessor of this node\n   * @returns true if node a is in predecessors of this node\n   */\n  bool predsContain(const BCNode* a) const {\n    typename predTY::const_iterator it = preds.end();\n    return (std::find(preds.begin(), preds.end(), a) != it);\n  }\n\n  template <bool C = Concurrent, typename std::enable_if<C>::type* = nullptr>\n  void lock() {\n    spinLock.lock();\n  }\n\n  template <bool C = Concurrent, typename std::enable_if<C>::type* = nullptr>\n  bool try_lock() {\n    return spinLock.try_lock();\n  }\n\n  template <bool C = Concurrent, typename std::enable_if<C>::type* = nullptr>\n  void unlock() {\n    spinLock.unlock();\n  }\n\n  // below are no-ops for when concurrent is false\n  template <bool C = Concurrent, typename std::enable_if<!C>::type* = nullptr>\n  void lock() {\n    // no-op\n  }\n\n  template <bool C = Concurrent, typename std::enable_if<!C>::type* = nullptr>\n  bool try_lock() {\n    return true;\n  }\n\n  template <bool C = Concurrent, typename std::enable_if<!C>::type* = nullptr>\n  void unlock() {\n    // no-op\n  }\n\n  /**\n   * Return node as string.\n   *\n   * WARNING NON SCALABLE FUNCTION\n   */\n  std::string toString() const {\n    std::ostringstream s;\n\n    s << \" distance: \" << distance << \" sigma: \" << sigma << \" bc: \" << bc\n      << \" nsuccs: \" << nsuccs << \" npreds: \" << preds.size();\n\n    return s.str();\n  }\n\n  /**\n   * Reset everything but the BC value\n   */\n  void reset() {\n    preds.clear();\n    distance = infinity;\n    nsuccs   = 0;\n    sigma    = 0;\n    delta    = 0;\n    mark     = 0;\n  }\n\n  /**\n   * Sanity check to make sure node is reset\n   */\n  void checkClear() const {\n    if (!preds.empty() || nsuccs != 0 || sigma != 0 || delta != 0)\n      galois::gWarn(\"Problem, node not clear\");\n\n    assert(preds.empty());\n    assert(distance == infinity);\n    assert(nsuccs == 0 && sigma == 0 && delta == 0);\n  }\n\n  /**\n   * Initialize this node as the source\n   */\n  void 
initAsSource() {\n    distance = 0;\n    sigma    = 1;\n  }\n\n  /**\n   * Mark this as 0.\n   */\n  template <bool M = UseMarking, typename std::enable_if<M>::type* = nullptr>\n  void markOut() {\n    if (Concurrent) {\n      __sync_fetch_and_and(&mark, 0);\n    } else {\n      mark = 0;\n    }\n  }\n\n  template <bool M = UseMarking, typename std::enable_if<!M>::type* = nullptr>\n  void markOut() {\n    // no-op\n  }\n\n  /**\n   * @returns true if mark is set to 1\n   */\n  template <bool M = UseMarking, typename std::enable_if<M>::type* = nullptr>\n  int isAlreadyIn() {\n    if (Concurrent) {\n      return __sync_fetch_and_or(&mark, 1);\n    } else {\n      int retval = mark;\n      mark       = 1;\n      return retval;\n    }\n  }\n\n  /**\n   * @returns 0\n   */\n  template <bool M = UseMarking, typename std::enable_if<!M>::type* = nullptr>\n  int isAlreadyIn() {\n    return 0;\n  }\n};\n#endif // _BCNODE_H_\n"
  },
  {
    "path": "lonestar/analytics/cpu/betweennesscentrality/BetweennessCentrality.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2020, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Lonestar/BoilerPlate.h\"\n#include \"llvm/Support/CommandLine.h\"\n#include \"Lonestar/Utils.h\"\n\n////////////////////////////////////////////////////////////////////////////////\n\nconstexpr static const char* const REGION_NAME = \"BC\";\n\nenum Algo { Level = 0, Async, Outer, AutoAlgo };\n\nconst char* const ALGO_NAMES[] = {\"Level\", \"Async\", \"Outer\", \"Auto\"};\n\nconst uint32_t infinity = std::numeric_limits<uint32_t>::max() / 4;\n\n////////////////////////////////////////////////////////////////////////////////\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\n\nstatic cll::opt<std::string> sourcesToUse(\"sourcesToUse\",\n                                          cll::desc(\"Whitespace separated list \"\n    
                                                \"of sources in a file to \"\n                                                    \"use in BC\"),\n                                          cll::init(\"\"));\n\nstatic cll::opt<unsigned int>\n    numOfSources(\"numOfSources\",\n                 cll::desc(\"Number of sources to compute BC on (default all)\"),\n                 cll::init(0));\n\nstatic llvm::cl::opt<unsigned int>\n    iterLimit(\"numOfOutSources\",\n              llvm::cl::desc(\"Number of sources WITH EDGES \"\n                             \" to compute BC on (default is all); does \"\n                             \"not work with Level BC\"),\n              llvm::cl::init(0));\n\nstatic cll::opt<bool>\n    singleSourceBC(\"singleSource\",\n                   cll::desc(\"Level: Use for single source BC (default off)\"),\n                   cll::init(false));\n\nstatic cll::opt<uint64_t>\n    startSource(\"startNode\",\n                cll::desc(\"Level/Outer: Starting source node used for \"\n                          \"betweeness-centrality (default 0); works with \"\n                          \"singleSource flag only\"),\n                cll::init(0));\n\nstatic cll::opt<bool>\n    output(\"output\", cll::desc(\"Output BC (Level/Async) (default: false)\"),\n           cll::init(false));\n\nstatic cll::opt<Algo> algo(\n    \"algo\", cll::desc(\"Choose an algorithm (default value AutoAlgo):\"),\n    cll::values(clEnumVal(Level, \"Level\"), clEnumVal(Async, \"Async\"),\n                clEnumVal(Outer, \"Outer\"),\n                clEnumVal(AutoAlgo,\n                          \"Auto: choose among the algorithms automatically\")),\n    cll::init(AutoAlgo));\n\n////////////////////////////////////////////////////////////////////////////////\n\nstatic const char* name = \"Betweenness Centrality\";\nstatic const char* desc = \"Computes betwenness centrality in an unweighted \"\n                          
\"graph\";\n\n////////////////////////////////////////////////////////////////////////////////\n\n// include implementations for other BCs; here so that it has access to command\n// line arguments above at global scope\n// @todo not the best coding practice; passing cl in via argument might be\n// better\n\n#include \"LevelStructs.h\"\n#include \"AsyncStructs.h\"\n#include \"OuterStructs.h\"\n\n////////////////////////////////////////////////////////////////////////////////\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, nullptr, &inputFile);\n\n  galois::StatTimer autoAlgoTimer(\"AutoAlgo_0\");\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (algo == AutoAlgo) {\n    galois::graphs::FileGraph degreeGraph;\n    degreeGraph.fromFile(inputFile);\n    degreeGraph.initNodeDegrees();\n    autoAlgoTimer.start();\n    if (isApproximateDegreeDistributionPowerLaw(degreeGraph)) {\n      algo = Async;\n    } else {\n      algo = Level;\n    }\n    autoAlgoTimer.stop();\n    galois::gInfo(\"Choosing \", ALGO_NAMES[algo], \" algorithm\");\n  }\n\n  switch (algo) {\n  case Level:\n    // see LevelStructs.h\n    galois::gInfo(\"Running level BC\");\n    doLevelBC();\n    break;\n  case Async:\n    // see AsyncStructs.h\n    galois::gInfo(\"Running async BC\");\n    doAsyncBC();\n    break;\n  case Outer:\n    // see OuterStructs.h\n    galois::gInfo(\"Running outer BC\");\n    doOuterBC();\n    break;\n  default:\n    GALOIS_DIE(\"Unknown BC algorithm type\");\n  }\n\n  totalTime.stop();\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/betweennesscentrality/CMakeLists.txt",
    "content": "add_executable(betweennesscentrality-cpu BetweennessCentrality.cpp)\nadd_dependencies(apps betweennesscentrality-cpu)\ntarget_link_libraries(betweennesscentrality-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS betweennesscentrality-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small-level betweennesscentrality-cpu -algo=Level -numOfSources=4 \"${BASEINPUT}/scalefree/rmat15.gr\")\nadd_test_scale(small-async betweennesscentrality-cpu -algo=Async -numOfSources=4 \"${BASEINPUT}/scalefree/rmat15.gr\")\nadd_test_scale(small-outer betweennesscentrality-cpu -algo=Outer -numOfSources=4 \"${BASEINPUT}/scalefree/rmat15.gr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/betweennesscentrality/LevelStructs.h",
    "content": "#ifndef GALOIS_BC_LEVEL\n#define GALOIS_BC_LEVEL\n\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/gstl.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/graphs/LCGraph.h\"\n\n#include <limits>\n#include <fstream>\n\n////////////////////////////////////////////////////////////////////////////////\n\nstatic uint64_t levelCurrentSrcNode = 0;\n// type of the num shortest paths variable\nusing LevelShortPathType = double;\n\n// NOTE: types assume that these values will not reach uint64_t: it may\n// need to be changed for very large graphs\nstruct LevelNodeData {\n  uint32_t currentDistance;\n  std::atomic<LevelShortPathType> numShortestPaths;\n  float dependency;\n  float bc;\n};\n\nusing LevelGraph =\n    galois::graphs::LC_CSR_Graph<LevelNodeData, void>::with_no_lockable<\n        true>::type::with_numa_alloc<true>::type;\nusing LevelGNode        = LevelGraph::GraphNode;\nusing LevelWorklistType = galois::InsertBag<LevelGNode, 4096>;\n\nconstexpr static const unsigned LEVEL_CHUNK_SIZE = 256u;\n\n/******************************************************************************/\n/* Functions for running the algorithm */\n/******************************************************************************/\n/**\n * Initialize node fields all to 0\n * @param graph LevelGraph to initialize\n */\nvoid LevelInitializeGraph(LevelGraph& graph) {\n  galois::do_all(\n      galois::iterate(graph),\n      [&](LevelGNode n) {\n        LevelNodeData& nodeData   = graph.getData(n);\n        nodeData.currentDistance  = 0;\n        nodeData.numShortestPaths = 0;\n        nodeData.dependency       = 0;\n        nodeData.bc               = 0;\n      },\n      galois::no_stats(), galois::loopname(\"InitializeGraph\"));\n}\n\n/**\n * Resets data associated to start a new SSSP with a new source.\n *\n * @param graph LevelGraph to reset iteration data\n */\nvoid LevelInitializeIteration(LevelGraph& graph) {\n  galois::do_all(\n      galois::iterate(graph),\n      
[&](LevelGNode n) {\n        LevelNodeData& nodeData = graph.getData(n);\n        bool isSource           = (n == levelCurrentSrcNode);\n        // source nodes have distance 0 and initialize short paths to 1, else\n        // distance is infinity with 0 short paths\n        if (!isSource) {\n          nodeData.currentDistance  = infinity;\n          nodeData.numShortestPaths = 0;\n        } else {\n          nodeData.currentDistance  = 0;\n          nodeData.numShortestPaths = 1;\n        }\n        // dependency reset for new source\n        nodeData.dependency = 0;\n      },\n      galois::no_stats(), galois::loopname(\"InitializeIteration\"));\n};\n\n/**\n * Forward phase: SSSP to determine DAG and get shortest path counts.\n *\n * Worklist-based push. Save worklists on a stack for reuse in backward\n * Brandes dependency propagation.\n */\ngalois::gstl::Vector<LevelWorklistType> LevelSSSP(LevelGraph& graph) {\n  galois::gstl::Vector<LevelWorklistType> stackOfWorklists;\n  uint32_t currentLevel = 0;\n\n  // construct first level worklist which consists only of source\n  stackOfWorklists.emplace_back();\n  stackOfWorklists[0].emplace(levelCurrentSrcNode);\n\n  // loop as long as current level's worklist is non-empty\n  while (!stackOfWorklists[currentLevel].empty()) {\n    // create worklist for next level\n    stackOfWorklists.emplace_back();\n    uint32_t nextLevel = currentLevel + 1;\n\n    galois::do_all(\n        galois::iterate(stackOfWorklists[currentLevel]),\n        [&](LevelGNode n) {\n          LevelNodeData& curData = graph.getData(n);\n          GALOIS_ASSERT(curData.currentDistance == currentLevel);\n\n          for (auto e : graph.edges(n)) {\n            LevelGNode dest         = graph.getEdgeDst(e);\n            LevelNodeData& destData = graph.getData(dest);\n\n            if (destData.currentDistance == infinity) {\n              uint32_t oldVal = __sync_val_compare_and_swap(\n                  &(destData.currentDistance), infinity, 
nextLevel);\n              // only 1 thread should add to worklist\n              if (oldVal == infinity) {\n                stackOfWorklists[nextLevel].emplace(dest);\n              }\n\n              galois::atomicAdd(destData.numShortestPaths,\n                                curData.numShortestPaths.load());\n            } else if (destData.currentDistance == nextLevel) {\n              galois::atomicAdd(destData.numShortestPaths,\n                                curData.numShortestPaths.load());\n            }\n          }\n        },\n        galois::steal(), galois::chunk_size<LEVEL_CHUNK_SIZE>(),\n        galois::no_stats(), galois::loopname(\"SSSP\"));\n\n    // move on to next level\n    currentLevel++;\n  }\n  return stackOfWorklists;\n}\n\n/**\n * Backward phase: use worklist of nodes at each level to back-propagate\n * dependency values.\n *\n * @param graph LevelGraph to do backward Brandes dependency prop on\n */\nvoid LevelBackwardBrandes(\n    LevelGraph& graph,\n    galois::gstl::Vector<LevelWorklistType>& stackOfWorklists) {\n  // minus 3 because last one is empty, one after is leaf nodes, and one\n  // to correct indexing to 0 index\n  if (stackOfWorklists.size() >= 3) {\n    uint32_t currentLevel = stackOfWorklists.size() - 3;\n\n    // last level is ignored since it's just the source\n    while (currentLevel > 0) {\n      LevelWorklistType& currentWorklist = stackOfWorklists[currentLevel];\n      uint32_t succLevel                 = currentLevel + 1;\n\n      galois::do_all(\n          galois::iterate(currentWorklist),\n          [&](LevelGNode n) {\n            LevelNodeData& curData = graph.getData(n);\n            GALOIS_ASSERT(curData.currentDistance == currentLevel);\n\n            for (auto e : graph.edges(n)) {\n              LevelGNode dest         = graph.getEdgeDst(e);\n              LevelNodeData& destData = graph.getData(dest);\n\n              if (destData.currentDistance == succLevel) {\n                // grab dependency, add to 
self\n                float contrib = ((float)1 + destData.dependency) /\n                                destData.numShortestPaths;\n                curData.dependency = curData.dependency + contrib;\n              }\n            }\n\n            // multiply at end to get final dependency value\n            curData.dependency *= curData.numShortestPaths;\n            // accumulate dependency into bc\n            curData.bc += curData.dependency;\n          },\n          galois::steal(), galois::chunk_size<LEVEL_CHUNK_SIZE>(),\n          galois::no_stats(), galois::loopname(\"Brandes\"));\n\n      // move on to next level lower\n      currentLevel--;\n    }\n  }\n}\n\n/******************************************************************************/\n/* Sanity check */\n/******************************************************************************/\n\n/**\n * Get some sanity numbers (max, min, sum of BC)\n *\n * @param graph LevelGraph to sanity check\n */\nvoid LevelSanity(LevelGraph& graph) {\n  galois::GReduceMax<float> accumMax;\n  galois::GReduceMin<float> accumMin;\n  galois::GAccumulator<float> accumSum;\n  accumMax.reset();\n  accumMin.reset();\n  accumSum.reset();\n\n  // get max, min, sum of BC values using accumulators and reducers\n  galois::do_all(\n      galois::iterate(graph),\n      [&](LevelGNode n) {\n        LevelNodeData& nodeData = graph.getData(n);\n        accumMax.update(nodeData.bc);\n        accumMin.update(nodeData.bc);\n        accumSum += nodeData.bc;\n      },\n      galois::no_stats(), galois::loopname(\"LevelSanity\"));\n\n  galois::gPrint(\"Max BC is \", accumMax.reduce(), \"\\n\");\n  galois::gPrint(\"Min BC is \", accumMin.reduce(), \"\\n\");\n  galois::gPrint(\"BC sum is \", accumSum.reduce(), \"\\n\");\n}\n\n/******************************************************************************/\n/* Running */\n/******************************************************************************/\n\nvoid doLevelBC() {\n  // reading in list of 
sources to operate on if provided\n  std::ifstream sourceFile;\n  std::vector<uint64_t> sourceVector;\n\n  // some initial stat reporting\n  galois::gInfo(\"Worklist chunk size of \", LEVEL_CHUNK_SIZE,\n                \": best size may depend on input.\");\n  galois::runtime::reportStat_Single(REGION_NAME, \"ChunkSize\",\n                                     LEVEL_CHUNK_SIZE);\n  galois::reportPageAlloc(\"MemAllocPre\");\n\n  // LevelGraph construction\n  galois::StatTimer graphConstructTimer(\"TimerConstructGraph\", \"BFS\");\n  graphConstructTimer.start();\n  LevelGraph graph;\n  galois::graphs::readGraph(graph, inputFile);\n  graphConstructTimer.stop();\n  galois::gInfo(\"Graph construction complete\");\n\n  // preallocate pages in memory so allocation doesn't occur during compute\n  galois::StatTimer preallocTime(\"PreAllocTime\", REGION_NAME);\n  preallocTime.start();\n  galois::preAlloc(\n      std::max(size_t{galois::getActiveThreads()} * (graph.size() / 2000000),\n               std::max(10U, galois::getActiveThreads()) * size_t{10}));\n  preallocTime.stop();\n  galois::reportPageAlloc(\"MemAllocMid\");\n\n  // If particular set of sources was specified, use them\n  if (sourcesToUse != \"\") {\n    sourceFile.open(sourcesToUse);\n    std::vector<uint64_t> t(std::istream_iterator<uint64_t>{sourceFile},\n                            std::istream_iterator<uint64_t>{});\n    sourceVector = t;\n    sourceFile.close();\n  }\n\n  // determine how many sources to loop over based on command line args\n  uint64_t loop_end = 1;\n  bool sSources     = false;\n  if (!singleSourceBC) {\n    if (!numOfSources) {\n      loop_end = graph.size();\n    } else {\n      loop_end = numOfSources;\n    }\n\n    // if provided a file of sources to work with, use that\n    if (sourceVector.size() != 0) {\n      if (loop_end > sourceVector.size()) {\n        loop_end = sourceVector.size();\n      }\n      sSources = true;\n    }\n  }\n\n  // graph initialization, then main loop\n  
LevelInitializeGraph(graph);\n\n  galois::gInfo(\"Beginning main computation\");\n  galois::StatTimer execTime(\"Timer_0\");\n\n  // loop over all specified sources for SSSP/Brandes calculation\n  for (uint64_t i = 0; i < loop_end; i++) {\n    if (singleSourceBC) {\n      // only 1 source; specified start source in command line\n      assert(loop_end == 1);\n      galois::gDebug(\"This is single source node BC\");\n      levelCurrentSrcNode = startSource;\n    } else if (sSources) {\n      levelCurrentSrcNode = sourceVector[i];\n    } else {\n      // all sources\n      levelCurrentSrcNode = i;\n    }\n\n    // here begins main computation\n    execTime.start();\n    LevelInitializeIteration(graph);\n    // worklist; last one will be empty\n    galois::gstl::Vector<LevelWorklistType> worklists = LevelSSSP(graph);\n    LevelBackwardBrandes(graph, worklists);\n    execTime.stop();\n  }\n\n  galois::reportPageAlloc(\"MemAllocPost\");\n\n  // sanity checking numbers\n  LevelSanity(graph);\n\n  // Verify, i.e. print out graph data for examination\n  // @todo print to file instead of stdout\n  if (output) {\n    char* v_out = (char*)malloc(40);\n    for (auto ii = graph.begin(); ii != graph.end(); ++ii) {\n      // outputs betweenness centrality\n      sprintf(v_out, \"%u %.9f\\n\", (*ii), graph.getData(*ii).bc);\n      galois::gPrint(v_out);\n    }\n    free(v_out);\n  }\n}\n#endif\n"
  },
  {
    "path": "lonestar/analytics/cpu/betweennesscentrality/OuterStructs.h",
    "content": "#ifndef GALOIS_BC_OUTER\n#define GALOIS_BC_OUTER\n\n#include \"galois/Galois.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include <boost/iterator/filter_iterator.hpp>\n\n#include <iomanip>\n#include <fstream>\n\nusing OuterGraph = galois::graphs::LC_CSR_Graph<void, void>::with_no_lockable<\n    true>::type ::with_numa_alloc<true>::type;\nusing OuterGNode = OuterGraph::GraphNode;\n\n////////////////////////////////////////////////////////////////////////////////\n\nclass BCOuter {\n  OuterGraph* G;\n  int NumNodes;\n\n  galois::substrate::PerThreadStorage<double*> CB; // betweeness measure\n  galois::substrate::PerThreadStorage<double*> perThreadSigma;\n  galois::substrate::PerThreadStorage<int*> perThreadD;\n  galois::substrate::PerThreadStorage<double*> perThreadDelta;\n  galois::substrate::PerThreadStorage<galois::gdeque<OuterGNode>*>\n      perThreadSucc;\n\npublic:\n  /**\n   * Constructor initializes thread local storage.\n   */\n  BCOuter(OuterGraph& g) : G(&g), NumNodes(g.size()) { InitializeLocal(); }\n\n  /**\n   * Constructor destroys thread local storage.\n   */\n  ~BCOuter(void) { DeleteLocal(); }\n\n  //! 
Function that does BC for a single source; called by a thread\n  void doBC(const OuterGNode curSource) {\n    galois::gdeque<OuterGNode> SQ;\n\n    double* sigma                    = *perThreadSigma.getLocal();\n    int* d                           = *perThreadD.getLocal();\n    double* delta                    = *perThreadDelta.getLocal();\n    galois::gdeque<OuterGNode>* succ = *perThreadSucc.getLocal();\n\n    sigma[curSource] = 1;\n    d[curSource]     = 1;\n\n    SQ.push_back(curSource);\n\n    // Do bfs while computing number of shortest paths (saved into sigma)\n    // and successors of nodes;\n    // Note this bfs makes it so source has distance of 1 instead of 0\n    for (auto qq = SQ.begin(), eq = SQ.end(); qq != eq; ++qq) {\n      int src = *qq;\n\n      for (auto edge : G->edges(src, galois::MethodFlag::UNPROTECTED)) {\n        int dest = G->getEdgeDst(edge);\n\n        if (!d[dest]) {\n          SQ.push_back(dest);\n          d[dest] = d[src] + 1;\n        }\n\n        if (d[dest] == d[src] + 1) {\n          sigma[dest] = sigma[dest] + sigma[src];\n          succ[src].push_back(dest);\n        }\n      }\n    }\n\n    // Back-propagate the dependency values (delta) along the BFS DAG\n    // ignore the source (hence SQ.size > 1 and not SQ.empty)\n    while (SQ.size() > 1) {\n      int leaf = SQ.back();\n      SQ.pop_back();\n\n      double sigma_leaf = sigma[leaf]; // has finalized short path value\n      double delta_leaf = delta[leaf];\n      auto& succ_list   = succ[leaf];\n\n      for (auto succ = succ_list.begin(), succ_end = succ_list.end();\n           succ != succ_end; ++succ) {\n        delta_leaf += (sigma_leaf / sigma[*succ]) * (1.0 + delta[*succ]);\n      }\n      delta[leaf] = delta_leaf;\n    }\n\n    // save result of this source's BC, reset all local values for next\n    // source\n    double* Vec = *CB.getLocal();\n    for (int i = 0; i < NumNodes; ++i) {\n      Vec[i] += delta[i];\n      delta[i] = 0;\n      sigma[i] = 0;\n      d[i]    
 = 0;\n      succ[i].clear();\n    }\n  }\n\n  /**\n   * Runs betweeness-centrality proper. Instead of a vector of sources,\n   * it will operate on the first numSources sources.\n   *\n   * @param numSources Num sources to get BC contribution for\n   */\n  void runAll(unsigned numSources) {\n    // Each thread works on an individual source node\n    galois::do_all(\n        galois::iterate(0u, numSources),\n        [&](const OuterGNode& curSource) { doBC(curSource); }, galois::steal(),\n        galois::loopname(\"Main\"));\n  }\n\n  /**\n   * Runs betweeness-centrality proper.\n   *\n   * @tparam Cont type of the data structure that holds the nodes to treat\n   * as a source during betweeness-centrality.\n   *\n   * @param v Data structure that holds nodes to treat as a source during\n   * betweeness-centrality\n   */\n  template <typename Cont>\n  void run(const Cont& v) {\n    // Each thread works on an individual source node\n    galois::do_all(\n        galois::iterate(v),\n        [&](const OuterGNode& curSource) { doBC(curSource); }, galois::steal(),\n        galois::loopname(\"Main\"));\n  }\n\n  /**\n   * Verification for reference torus graph inputs.\n   * All nodes should have the same betweenness value up to\n   * some tolerance.\n   */\n  void verify() {\n    double sampleBC = 0.0;\n    bool firstTime  = true;\n    for (int i = 0; i < NumNodes; ++i) {\n      double bc = (*CB.getRemote(0))[i];\n\n      for (unsigned j = 1; j < galois::getActiveThreads(); ++j)\n        bc += (*CB.getRemote(j))[i];\n\n      if (firstTime) {\n        sampleBC = bc;\n        galois::gInfo(\"BC: \", sampleBC);\n        firstTime = false;\n      } else {\n        // check if over some tolerance value\n        if ((bc - sampleBC) > 0.0001) {\n          galois::gInfo(\"If torus graph, verification failed \",\n                        (bc - sampleBC));\n          return;\n        }\n      }\n    }\n  }\n\n  /**\n   * Print betweeness-centrality measures.\n   *\n   * @param begin 
first node to print BC measure of\n   * @param end iterator after last node to print\n   * @param out stream to output to\n   * @param precision precision of the floating points outputted by the function\n   */\n  void printBCValues(size_t begin, size_t end, std::ostream& out,\n                     int precision = 6) {\n    for (; begin != end; ++begin) {\n      double bc = (*CB.getRemote(0))[begin];\n\n      for (unsigned j = 1; j < galois::getActiveThreads(); ++j)\n        bc += (*CB.getRemote(j))[begin];\n\n      out << begin << \" \" << std::setiosflags(std::ios::fixed)\n          << std::setprecision(precision) << bc << \"\\n\";\n    }\n  }\n\n  /**\n   * Print all betweeness centrality values in the graph.\n   */\n  void printBCcertificate() {\n    std::stringstream foutname;\n    foutname << \"outer_certificate_\" << galois::getActiveThreads();\n\n    std::ofstream outf(foutname.str().c_str());\n    galois::gInfo(\"Writing certificate...\");\n\n    printBCValues(0, NumNodes, outf, 9);\n\n    outf.close();\n  }\n\n  //! 
sanity check of BC values\n  void outerSanity(OuterGraph& graph) {\n    galois::GReduceMax<float> accumMax;\n    galois::GReduceMin<float> accumMin;\n    galois::GAccumulator<float> accumSum;\n    accumMax.reset();\n    accumMin.reset();\n    accumSum.reset();\n\n    // get max, min, sum of BC values using accumulators and reducers\n    galois::do_all(\n        galois::iterate(graph),\n        [&](LevelGNode n) {\n          double bc = (*CB.getRemote(0))[n];\n\n          for (unsigned j = 1; j < galois::getActiveThreads(); ++j)\n            bc += (*CB.getRemote(j))[n];\n\n          accumMax.update(bc);\n          accumMin.update(bc);\n          accumSum += bc;\n        },\n        galois::no_stats(), galois::loopname(\"OuterSanity\"));\n\n    galois::gPrint(\"Max BC is \", accumMax.reduce(), \"\\n\");\n    galois::gPrint(\"Min BC is \", accumMin.reduce(), \"\\n\");\n    galois::gPrint(\"BC sum is \", accumSum.reduce(), \"\\n\");\n  }\n\nprivate:\n  /**\n   * Initialize an array at some provided address.\n   *\n   * @param addr Address to initialize array at\n   */\n  template <typename T>\n  void initArray(T** addr) {\n    *addr = new T[NumNodes]();\n  }\n\n  /**\n   * Destroy an array at some provided address.\n   *\n   * @param addr Address to destroy array at\n   */\n  template <typename T>\n  void deleteArray(T** addr) {\n    delete[] * addr;\n  }\n\n  /**\n   * Initialize local thread storage.\n   */\n  void InitializeLocal(void) {\n    galois::on_each([this](unsigned, unsigned) {\n      this->initArray(CB.getLocal());\n      this->initArray(perThreadSigma.getLocal());\n      this->initArray(perThreadD.getLocal());\n      this->initArray(perThreadDelta.getLocal());\n      this->initArray(perThreadSucc.getLocal());\n    });\n  }\n\n  /**\n   * Destroy local thread storage.\n   */\n  void DeleteLocal(void) {\n    galois::on_each([this](unsigned, unsigned) {\n      this->deleteArray(CB.getLocal());\n      this->deleteArray(perThreadSigma.getLocal());\n      
this->deleteArray(perThreadD.getLocal());\n      this->deleteArray(perThreadDelta.getLocal());\n      this->deleteArray(perThreadSucc.getLocal());\n    });\n  }\n};\n\n/**\n * Functor that indicates if a node contains outgoing edges\n */\nstruct HasOut {\n  OuterGraph* graph;\n  HasOut(OuterGraph* g) : graph(g) {}\n\n  bool operator()(const OuterGNode& n) const {\n    return graph->edge_begin(n) != graph->edge_end(n);\n  }\n};\n\n////////////////////////////////////////////////////////////////////////////////\n\nvoid doOuterBC() {\n  OuterGraph g;\n  galois::graphs::readGraph(g, inputFile);\n\n  BCOuter bcOuter(g);\n\n  size_t NumNodes = g.size();\n\n  // preallocate pages for use in algorithm\n  galois::reportPageAlloc(\"MeminfoPre\");\n  galois::preAlloc(galois::getActiveThreads() * NumNodes / 1650);\n  galois::reportPageAlloc(\"MeminfoMid\");\n\n  // vector of sources to process; initialized if doing outSources\n  std::vector<OuterGNode> v;\n  // preprocessing: find the nodes with out edges we will process and skip\n  // over nodes with no out edges; only done if numOfSources isn't specified\n  if (numOfSources == 0) {\n    // find first node with out edges\n    boost::filter_iterator<HasOut, OuterGraph::iterator> begin =\n        boost::make_filter_iterator(HasOut(&g), g.begin(), g.end());\n    boost::filter_iterator<HasOut, OuterGraph::iterator> end =\n        boost::make_filter_iterator(HasOut(&g), g.end(), g.end());\n    // adjustedEnd = last node we will process based on how many iterations\n    // (i.e. sources) we want to do\n    boost::filter_iterator<HasOut, OuterGraph::iterator> adjustedEnd =\n        iterLimit ? 
galois::safe_advance(begin, end, (int)iterLimit) : end;\n\n    size_t iterations = std::distance(begin, adjustedEnd);\n    galois::gPrint(\"Num Nodes: \", NumNodes, \" Start Node: \", startSource,\n                   \" Iterations: \", iterations, \"\\n\");\n    // vector of nodes we want to process\n    v.insert(v.end(), begin, adjustedEnd);\n  }\n\n  // execute algorithm\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  // either run a contiguous chunk of sources from beginning or run using\n  // sources with outgoing edges only\n  if (numOfSources > 0) {\n    bcOuter.runAll(numOfSources);\n  } else {\n    bcOuter.run(v);\n  }\n  execTime.stop();\n\n  bcOuter.printBCValues(0, std::min(10UL, NumNodes), std::cout, 6);\n  bcOuter.outerSanity(g);\n  if (output)\n    bcOuter.printBCcertificate();\n\n  if (!skipVerify)\n    bcOuter.verify();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n}\n#endif\n"
  },
  {
    "path": "lonestar/analytics/cpu/betweennesscentrality/README.md",
    "content": "All the variants of BC discussed below can be run from the same executable.\n\nBUILD\n================================================================================\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/betweennesscentrality; make -j`\n\nBetweenness Centrality (Level)\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nRuns Betweenness Centrality where all threads will work on a single betweenness\ncentrality source at one time. Does a forward SSSP phase to find shortest path\ncounts and then does a backward propagation step to calculate BC contributions\nfor the source being operated on.\n\nThis application takes in Galois .gr graphs.\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run all sources, use the following:\n`./betweennesscentrality-cpu <input-graph> -algo=Level -t=<num-threads>`\n\nTo run with a specific number of sources N (starting from the beginning), use\nthe following:\n`./betweennesscentrality-cpu <input-graph> -algo=Level -t=<num-threads> -numOfSources=N`\n\n\nAsynchronous Brandes Betweenness Centrality\n================================================================================\n\nDESCRIPTION\n----------------------------------------\n\nRuns an asynchronous version of Brandes's Betweenness Centrality as formulated\nthrough the operator formulation of algorithms. 
It is a two-phase algorithm:\nthe first phase determines the shortest path DAG and counts the number of\nshortest paths through a given node for a particular source, and the second\nphase back-propagates dependency values for the calculation of betweenness\ncentrality.\n\ncontrol.h has some variables that may alter how the algorithm runs and what kind\nof data it collects.\n\nPass in a regular .gr graph.\n\nFor more details on the algorithm, see the paper here:\nhttps://dl.acm.org/citation.cfm?id=2442521\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run all sources, use the following:\n`./betweennesscentrality-cpu <input-graph> -algo=Async -t=<num-threads>`\n\nTo run with a specific number of sources N (starting from the beginning), use\nthe following:\n`./betweennesscentrality-cpu <input-graph> -algo=Async -t=<num-threads> -numOfSources=N`\n\nTo run with a specific number of sources N (starting from the beginning) **with\noutgoing edges**, use the following:\n`./betweennesscentrality-cpu <input-graph> -algo=Async -t=<num-threads> -numOfOutSources=N`\n\nTo run with a specific set of sources, put the sources in a file with\nthe source ids separated by newlines and use the following:\n`./betweennesscentrality-cpu <input-graph> -algo=Async -t=<num-threads> -sourcesToUse=<path-to-file>`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nGood scaling and performance are very dependent on the chunk size parameter\nfor the worklist. It must be changed through the source code as it is\na compile time variable used in templates. 
The best chunk size is input\ndependent.\n\nGood scaling also comes from using the Galois power-of-two allocator\nfor memory allocations in parallel regions.\n\nFinally, it may be useful to toggle BC_USE_MARKING in control.h: if on, it will\ncheck to see if a node is in a worklist before adding it (preventing duplicates).\nDepending on the input graph, performance may improve with this setting on.\n\nBetweenness Centrality (Outer)\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nRuns Betweenness Centrality where the unit of parallelism is a betweenness\ncentrality source. Each thread will work on the betweenness centrality\ncomputation of its own individual source and find the BC contributions of that\nsource to the rest of the graph.\n\nThis application takes in Galois .gr graphs.\n\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run all sources, use the following:\n`./betweennesscentrality-cpu <input-graph> -algo=Outer -t=<num-threads>`\n\nTo run starting from a particular source, use the following:\n`./betweennesscentrality-cpu <input-graph> -algo=Outer -t=<num-threads> -startNode=<node to begin>`\n\nTo run only on N nodes (that have outgoing edges), use the following:\n`./betweennesscentrality-cpu <input-graph> -algo=Outer -t=<num-threads> -limit=N`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nIf each source's BC calculation takes roughly the same amount of time, then\nload balancing should be good. Otherwise, there may be load imbalance among\nthreads.\n\nALGORITHM CHOICE\n=================================================================================\n\nAsync performs best for high-diameter graphs such as road networks. 
Level performs\nbest when the diameter of the graph is not large due to the level-by-level\nnature of its computation.\n"
  },
  {
    "path": "lonestar/analytics/cpu/betweennesscentrality/control.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef _BC_ASYNC_CONTROL_H_\n#define _BC_ASYNC_CONTROL_H_\n\nusing ShortPathType = double;\n\n#define BC_COUNT_ACTIONS 0\n#define BC_COUNT_LEAVES 0\n\n// on if concurrent BC is to be used\n#define BC_CONCURRENT 1\n// on if markings on nodes are to be used (helps with no duplicates in worklist)\n#define BC_USE_MARKING 1\n\n#endif // end BC_ASYNC_CONTROL\n"
  },
  {
    "path": "lonestar/analytics/cpu/bfs/CMakeLists.txt",
    "content": "add_executable(bfs-cpu bfs.cpp)\nadd_dependencies(apps bfs-cpu)\ntarget_link_libraries(bfs-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS bfs-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small1 bfs-cpu \"${BASEINPUT}/reference/structured/rome99.gr\")\nadd_test_scale(small2 bfs-cpu \"${BASEINPUT}/scalefree/rmat10.gr\")\n\nadd_executable(bfs-directionopt-cpu bfsDirectionOpt.cpp)\nadd_dependencies(apps bfs-directionopt-cpu)\ntarget_link_libraries(bfs-directionopt-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS bfs-directionopt-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small1 bfs-directionopt-cpu \"${BASEINPUT}/reference/structured/rome99.gr\")\nadd_test_scale(small2 bfs-directionopt-cpu \"${BASEINPUT}/scalefree/rmat10.gr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/bfs/README.md",
    "content": "Breadth First Search\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program performs breadth-first search on an input graph, starting from a\nsource node (specified by -startNode option). \n\nAsync algorithm maintains a concurrent FIFO of active nodes and uses a\nfor_each loop (a single parallel phase) to go over them. New active nodes are\nadded to the concurrent FIFO\n\nSync algorithm iterates over active nodes in rounds, each round, it uses a\ndo_all loop to iterate over currently active nodes to generate the next set of\nactive nodes. \n\nSync2p further divides each round into two parallel do_all loops\n\nEach algorithm has a variant that implements edge tiling, e.g. SyncTile, which\ndivides the edges of high-degree nodes into multiple work items for better\nload balancing. \n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/cpu/bfs; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./bfs-cpu <path-to-graph> -exec PARALLEL -algo SyncTile -t 40`\n- `$ ./bfs-cpu <path-to-graph> -exec SERIAL -algo SyncTile -t 40`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\n* In our experience, the Sync/SyncTile algorithms give the best performance.\n* The Async/AsyncTile algorithms typically perform better than Sync on\n  high-diameter graphs, such as road networks.\n* All algorithms rely on CHUNK_SIZE for load balancing, which needs to be\n  tuned for the machine and input graph. \n* Tile variants of algorithms provide better load balancing and performance\n  for graphs with high-degree nodes. Tile size is controlled via the\n  EDGE_TILE_SIZE constant, which needs to be tuned. \n"
  },
  {
    "path": "lonestar/analytics/cpu/bfs/bfs.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/gstl.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include \"Lonestar/BFS_SSSP.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <iostream>\n#include <deque>\n#include <type_traits>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Breadth-first Search\";\n\nstatic const char* desc =\n    \"Computes the shortest path from a source node to all nodes in a directed \"\n    \"graph using a modified Bellman-Ford algorithm\";\n\nstatic const char* url = \"breadth_first_search\";\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<unsigned int>\n    
startNode(\"startNode\",\n              cll::desc(\"Node to start search from (default value 0)\"),\n              cll::init(0));\nstatic cll::opt<unsigned int>\n    reportNode(\"reportNode\",\n               cll::desc(\"Node to report distance to (default value 1)\"),\n               cll::init(1));\n\n// static cll::opt<unsigned int> stepShiftw(\"delta\",\n// cll::desc(\"Shift value for the deltastep\"),\n// cll::init(10));\n\nenum Exec { SERIAL, PARALLEL };\n\nenum Algo { AsyncTile = 0, Async, SyncTile, Sync };\n\nconst char* const ALGO_NAMES[] = {\"AsyncTile\", \"Async\", \"SyncTile\", \"Sync\"};\n\nstatic cll::opt<Exec> execution(\n    \"exec\",\n    cll::desc(\"Choose SERIAL or PARALLEL execution (default value PARALLEL):\"),\n    cll::values(clEnumVal(SERIAL, \"SERIAL\"), clEnumVal(PARALLEL, \"PARALLEL\")),\n    cll::init(PARALLEL));\n\nstatic cll::opt<Algo> algo(\n    \"algo\", cll::desc(\"Choose an algorithm (default value SyncTile):\"),\n    cll::values(clEnumVal(AsyncTile, \"AsyncTile\"), clEnumVal(Async, \"Async\"),\n                clEnumVal(SyncTile, \"SyncTile\"), clEnumVal(Sync, \"Sync\")),\n    cll::init(SyncTile));\n\nusing Graph =\n    galois::graphs::LC_CSR_Graph<unsigned, void>::with_no_lockable<true>::type;\n//::with_numa_alloc<true>::type;\n\nusing GNode = Graph::GraphNode;\n\nconstexpr static const bool TRACK_WORK          = false;\nconstexpr static const unsigned CHUNK_SIZE      = 256U;\nconstexpr static const ptrdiff_t EDGE_TILE_SIZE = 256;\n\nusing BFS = BFS_SSSP<Graph, unsigned int, false, EDGE_TILE_SIZE>;\n\nusing UpdateRequest       = BFS::UpdateRequest;\nusing Dist                = BFS::Dist;\nusing SrcEdgeTile         = BFS::SrcEdgeTile;\nusing SrcEdgeTileMaker    = BFS::SrcEdgeTileMaker;\nusing SrcEdgeTilePushWrap = BFS::SrcEdgeTilePushWrap;\nusing ReqPushWrap         = BFS::ReqPushWrap;\nusing OutEdgeRangeFn      = BFS::OutEdgeRangeFn;\nusing TileRangeFn         = BFS::TileRangeFn;\n\nstruct EdgeTile {\n  Graph::edge_iterator beg;\n 
 Graph::edge_iterator end;\n};\n\nstruct EdgeTileMaker {\n  EdgeTile operator()(Graph::edge_iterator beg,\n                      Graph::edge_iterator end) const {\n    return EdgeTile{beg, end};\n  }\n};\n\nstruct NodePushWrap {\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n, const char* const) const {\n    (*this)(cont, n);\n  }\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n) const {\n    cont.push(n);\n  }\n};\n\nstruct EdgeTilePushWrap {\n  Graph& graph;\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n, const char* const) const {\n    BFS::pushEdgeTilesParallel(cont, graph, n, EdgeTileMaker{});\n  }\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n) const {\n    BFS::pushEdgeTiles(cont, graph, n, EdgeTileMaker{});\n  }\n};\n\nstruct OneTilePushWrap {\n  Graph& graph;\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n, const char* const) const {\n    (*this)(cont, n);\n  }\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n) const {\n    EdgeTile t{graph.edge_begin(n, galois::MethodFlag::UNPROTECTED),\n               graph.edge_end(n, galois::MethodFlag::UNPROTECTED)};\n\n    cont.push(t);\n  }\n};\n\ntemplate <bool CONCURRENT, typename T, typename P, typename R>\nvoid asyncAlgo(Graph& graph, GNode source, const P& pushWrap,\n               const R& edgeRange) {\n\n  namespace gwl = galois::worklists;\n  // typedef PerSocketChunkFIFO<CHUNK_SIZE> dFIFO;\n  using FIFO = gwl::PerSocketChunkFIFO<CHUNK_SIZE>;\n  using BSWL = gwl::BulkSynchronous<gwl::PerSocketChunkLIFO<CHUNK_SIZE>>;\n  using WL   = FIFO;\n\n  using Loop =\n      typename std::conditional<CONCURRENT, galois::ForEach,\n                                galois::WhileQ<galois::SerFIFO<T>>>::type;\n\n  GALOIS_GCC7_IGNORE_UNUSED_BUT_SET\n  constexpr bool useCAS = CONCURRENT && !std::is_same<WL, BSWL>::value;\n  GALOIS_END_GCC7_IGNORE_UNUSED_BUT_SET\n\n  Loop loop;\n\n  
galois::GAccumulator<size_t> BadWork;\n  galois::GAccumulator<size_t> WLEmptyWork;\n\n  graph.getData(source) = 0;\n  galois::InsertBag<T> initBag;\n\n  if (CONCURRENT) {\n    pushWrap(initBag, source, 1, \"parallel\");\n  } else {\n    pushWrap(initBag, source, 1);\n  }\n\n  loop(\n      galois::iterate(initBag),\n      [&](const T& item, auto& ctx) {\n        constexpr galois::MethodFlag flag = galois::MethodFlag::UNPROTECTED;\n\n        const auto& sdist = graph.getData(item.src, flag);\n\n        if (TRACK_WORK) {\n          if (item.dist != sdist) {\n            WLEmptyWork += 1;\n            return;\n          }\n        }\n\n        const auto newDist = item.dist;\n\n        for (auto ii : edgeRange(item)) {\n          GNode dst   = graph.getEdgeDst(ii);\n          auto& ddata = graph.getData(dst, flag);\n\n          while (true) {\n\n            Dist oldDist = ddata;\n\n            if (oldDist <= newDist) {\n              break;\n            }\n\n            if (!useCAS ||\n                __sync_bool_compare_and_swap(&ddata, oldDist, newDist)) {\n\n              if (!useCAS) {\n                ddata = newDist;\n              }\n\n              if (TRACK_WORK) {\n                if (oldDist != BFS::DIST_INFINITY) {\n                  BadWork += 1;\n                }\n              }\n\n              pushWrap(ctx, dst, newDist + 1);\n              break;\n            }\n          }\n        }\n      },\n      galois::wl<WL>(), galois::loopname(\"runBFS\"),\n      galois::disable_conflict_detection());\n\n  if (TRACK_WORK) {\n    galois::runtime::reportStat_Single(\"BFS\", \"BadWork\", BadWork.reduce());\n    galois::runtime::reportStat_Single(\"BFS\", \"EmptyWork\",\n                                       WLEmptyWork.reduce());\n  }\n}\n\ntemplate <bool CONCURRENT, typename T, typename P, typename R>\nvoid syncAlgo(Graph& graph, GNode source, const P& pushWrap,\n              const R& edgeRange) {\n\n  using Cont = typename std::conditional<CONCURRENT, 
galois::InsertBag<T>,\n                                         galois::SerStack<T>>::type;\n  using Loop = typename std::conditional<CONCURRENT, galois::DoAll,\n                                         galois::StdForEach>::type;\n\n  constexpr galois::MethodFlag flag = galois::MethodFlag::UNPROTECTED;\n\n  Loop loop;\n\n  auto curr = std::make_unique<Cont>();\n  auto next = std::make_unique<Cont>();\n\n  Dist nextLevel              = 0U;\n  graph.getData(source, flag) = 0U;\n\n  if (CONCURRENT) {\n    pushWrap(*next, source, \"parallel\");\n  } else {\n    pushWrap(*next, source);\n  }\n\n  assert(!next->empty());\n\n  while (!next->empty()) {\n\n    std::swap(curr, next);\n    next->clear();\n    ++nextLevel;\n\n    loop(\n        galois::iterate(*curr),\n        [&](const T& item) {\n          for (auto e : edgeRange(item)) {\n            auto dst      = graph.getEdgeDst(e);\n            auto& dstData = graph.getData(dst, flag);\n\n            if (dstData == BFS::DIST_INFINITY) {\n              dstData = nextLevel;\n              pushWrap(*next, dst);\n            }\n          }\n        },\n        galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n        galois::loopname(\"Sync\"));\n  }\n}\n\ntemplate <bool CONCURRENT>\nvoid runAlgo(Graph& graph, const GNode& source) {\n\n  switch (algo) {\n  case AsyncTile:\n    asyncAlgo<CONCURRENT, SrcEdgeTile>(\n        graph, source, SrcEdgeTilePushWrap{graph}, TileRangeFn());\n    break;\n  case Async:\n    asyncAlgo<CONCURRENT, UpdateRequest>(graph, source, ReqPushWrap(),\n                                         OutEdgeRangeFn{graph});\n    break;\n  case SyncTile:\n    syncAlgo<CONCURRENT, EdgeTile>(graph, source, EdgeTilePushWrap{graph},\n                                   TileRangeFn());\n    break;\n  case Sync:\n    syncAlgo<CONCURRENT, GNode>(graph, source, NodePushWrap(),\n                                OutEdgeRangeFn{graph});\n    break;\n  default:\n    std::cerr << \"ERROR: unkown algo type\\n\";\n  
}\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  Graph graph;\n  GNode source;\n  GNode report;\n\n  std::cout << \"Reading from file: \" << inputFile << \"\\n\";\n  galois::graphs::readGraph(graph, inputFile);\n  std::cout << \"Read \" << graph.size() << \" nodes, \" << graph.sizeEdges()\n            << \" edges\\n\";\n\n  if (startNode >= graph.size() || reportNode >= graph.size()) {\n    std::cerr << \"failed to set report: \" << reportNode\n              << \" or failed to set source: \" << startNode << \"\\n\";\n    abort();\n  }\n\n  auto it = graph.begin();\n  std::advance(it, startNode.getValue());\n  source = *it;\n  it     = graph.begin();\n  std::advance(it, reportNode.getValue());\n  report = *it;\n\n  size_t approxNodeData = 4 * (graph.size() + graph.sizeEdges());\n  galois::preAlloc(8 * numThreads +\n                   approxNodeData / galois::runtime::pagePoolSize());\n\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  galois::do_all(galois::iterate(graph),\n                 [&graph](GNode n) { graph.getData(n) = BFS::DIST_INFINITY; });\n  graph.getData(source) = 0;\n\n  std::cout << \"Running \" << ALGO_NAMES[algo] << \" algorithm with \"\n            << (bool(execution) ? 
\"PARALLEL\" : \"SERIAL\") << \" execution\\n\";\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n\n  if (execution == SERIAL) {\n    runAlgo<false>(graph, source);\n  } else if (execution == PARALLEL) {\n    runAlgo<true>(graph, source);\n  } else {\n    std::cerr << \"ERROR: unknown type of execution passed to -exec\\n\";\n  }\n\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  std::cout << \"Node \" << reportNode << \" has distance \"\n            << graph.getData(report) << \"\\n\";\n\n  // Sanity checking code\n  galois::GReduceMax<uint64_t> maxDistance;\n  galois::GAccumulator<uint64_t> distanceSum;\n  galois::GAccumulator<uint32_t> visitedNode;\n  maxDistance.reset();\n  distanceSum.reset();\n  visitedNode.reset();\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](uint64_t i) {\n        uint32_t myDistance = graph.getData(i);\n\n        if (myDistance != BFS::DIST_INFINITY) {\n          maxDistance.update(myDistance);\n          distanceSum += myDistance;\n          visitedNode += 1;\n        }\n      },\n      galois::loopname(\"Sanity check\"), galois::no_stats());\n\n  // report sanity stats\n  uint64_t rMaxDistance = maxDistance.reduce();\n  uint64_t rDistanceSum = distanceSum.reduce();\n  uint64_t rVisitedNode = visitedNode.reduce();\n  galois::gInfo(\"# visited nodes is \", rVisitedNode);\n  galois::gInfo(\"Max distance is \", rMaxDistance);\n  galois::gInfo(\"Sum of visited distances is \", rDistanceSum);\n\n  if (!skipVerify) {\n    if (BFS::verify(graph, source)) {\n      std::cout << \"Verification successful.\\n\";\n    } else {\n      GALOIS_DIE(\"verification failed\");\n    }\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/bfs/bfsDirectionOpt.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/DynamicBitset.h\"\n#include \"galois/gstl.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"galois/graphs/LC_CSR_CSC_Graph.h\"\n#include \"galois/runtime/Profile.h\"\n#include \"Lonestar/BFS_SSSP.h\"\n#include \"Lonestar/Utils.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <stdio.h>\n#include <sys/time.h>\n#include <sys/resource.h>\n#include <iostream>\n#include <deque>\n#include <type_traits>\n#include <cstdlib>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Breadth-first Search\";\n\nstatic const char* desc =\n    \"Computes the shortest path from a source node to 
all nodes in a directed \"\n    \"graph using a modified Bellman-Ford algorithm\";\n\nstatic const char* url = \"breadth_first_search\";\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<uint64_t>\n    startNode(\"startNode\",\n              cll::desc(\"Node to start search from (default value 0)\"),\n              cll::init(0));\nstatic cll::opt<unsigned int>\n    reportNode(\"reportNode\",\n               cll::desc(\"Node to report distance to (default value 1)\"),\n               cll::init(1));\nstatic cll::opt<unsigned int>\n    numRuns(\"numRuns\", cll::desc(\"Number of runs (default value 1)\"),\n            cll::init(1));\n\nstatic cll::opt<int>\n    alpha(\"alpha\",\n          cll::desc(\"alpha value to change direction in direction-optimization \"\n                    \"(default value 15)\"),\n          cll::init(15));\nstatic cll::opt<int>\n    beta(\"beta\",\n         cll::desc(\"beta value to change direction in direction-optimization \"\n                   \"(default value 18)\"),\n         cll::init(18));\n\nstatic cll::opt<unsigned int>\n    preAlloc(\"preAlloc\",\n             cll::desc(\"Number of pages to preAlloc (default value 400)\"),\n             cll::init(400));\n\nstatic cll::opt<unsigned int>\n    numPrint(\"numPrint\",\n             cll::desc(\"Print parents for the numPrint number of nodes for \"\n                       \"verification if verification is on (default value 10)\"),\n             cll::init(10));\n\nenum Exec { SERIAL, PARALLEL };\n\nenum Algo { SyncDO = 0, Async, AutoAlgo };\n\nconst char* const ALGO_NAMES[] = {\"SyncDO\", \"Async\", \"Auto\"};\n\nstatic cll::opt<Exec> execution(\n    \"exec\",\n    cll::desc(\"Choose SERIAL or PARALLEL execution (default value PARALLEL):\"),\n    cll::values(clEnumVal(SERIAL, \"SERIAL\"), clEnumVal(PARALLEL, \"PARALLEL\")),\n    cll::init(PARALLEL));\n\nstatic cll::opt<Algo>\n    algo(\"algo\", cll::desc(\"Choose an 
algorithm (default value Auto):\"),\n         cll::values(\n             clEnumVal(SyncDO, \"SyncDO\"), clEnumVal(Async, \"Async\"),\n             clEnumVal(AutoAlgo,\n                       \"Auto: choose between SyncDO and Async automatically\")),\n         cll::init(AutoAlgo));\n\nusing Graph =\n    // galois::graphs::LC_CSR_CSC_Graph<unsigned, void, false, true, true>;\n    galois::graphs::LC_CSR_CSC_Graph<unsigned, void, false, true, true>;\n// galois::graphs::LC_CSR_CSC_Graph<unsigned,\n// void>::with_no_lockable<true>::type::with_numa_alloc<true>::type;\nusing GNode = Graph::GraphNode;\n\nconstexpr static const unsigned CHUNK_SIZE      = 256u;\nconstexpr static const ptrdiff_t EDGE_TILE_SIZE = 256;\n\nusing BFS            = BFS_SSSP<Graph, unsigned int, false, EDGE_TILE_SIZE>;\nusing UpdateRequest  = BFS::UpdateRequest;\nusing Dist           = BFS::Dist;\nusing OutEdgeRangeFn = BFS::OutEdgeRangeFn;\n\nstruct EdgeTile {\n  Graph::edge_iterator beg;\n  Graph::edge_iterator end;\n};\n\nstruct EdgeTileMaker {\n  EdgeTile operator()(Graph::edge_iterator beg,\n                      Graph::edge_iterator end) const {\n    return EdgeTile{beg, end};\n  }\n};\n\nstruct NodePushWrap {\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n, const char* const) const {\n    (*this)(cont, n);\n  }\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n) const {\n    cont.push(n);\n  }\n};\n\nstruct EdgeTilePushWrap {\n  Graph& graph;\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n, const char* const) const {\n    BFS::pushEdgeTilesParallel(cont, graph, n, EdgeTileMaker{});\n  }\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n) const {\n    BFS::pushEdgeTiles(cont, graph, n, EdgeTileMaker{});\n  }\n};\n\nstruct OneTilePushWrap {\n  Graph& graph;\n\n  template <typename C>\n  void operator()(C& cont, const GNode& n, const char* const) const {\n    (*this)(cont, n);\n  }\n\n  template <typename C>\n  void 
operator()(C& cont, const GNode& n) const {\n    EdgeTile t{graph.edge_begin(n, galois::MethodFlag::UNPROTECTED),\n               graph.edge_end(n, galois::MethodFlag::UNPROTECTED)};\n\n    cont.push(t);\n  }\n};\n\ntemplate <typename WL>\nvoid WlToBitset(WL& wl, galois::DynamicBitSet& bitset) {\n  galois::do_all(\n      galois::iterate(wl), [&](const GNode& src) { bitset.set(src); },\n      galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n      galois::loopname(\"WlToBitset\"));\n}\n\ntemplate <typename WL>\nvoid BitsetToWl(const Graph& graph, const galois::DynamicBitSet& bitset,\n                WL& wl) {\n  wl.clear();\n  galois::do_all(\n      galois::iterate(graph),\n      [&](const GNode& src) {\n        if (bitset.test(src))\n          // pushWrap(wl, src);\n          wl.push(src);\n      },\n      galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n      galois::loopname(\"BitsetToWl\"));\n}\n\ntemplate <bool CONCURRENT, typename T, typename P, typename R>\nvoid syncDOAlgo(Graph& graph, GNode source, const P& pushWrap,\n                const R& GALOIS_UNUSED(edgeRange), const uint32_t runID) {\n\n  using Cont = typename std::conditional<CONCURRENT, galois::InsertBag<T>,\n                                         galois::SerStack<T>>::type;\n  using Loop = typename std::conditional<CONCURRENT, galois::DoAll,\n                                         galois::StdForEach>::type;\n\n  constexpr galois::MethodFlag flag = galois::MethodFlag::UNPROTECTED;\n  galois::GAccumulator<uint32_t> work_items;\n\n  Loop loop;\n\n  galois::DynamicBitSet front_bitset, next_bitset;\n  front_bitset.resize(graph.size());\n  next_bitset.resize(graph.size());\n\n  front_bitset.reset();\n  next_bitset.reset();\n\n  Cont* curr = new Cont();\n  Cont* next = new Cont();\n\n  Dist nextLevel              = 0u;\n  graph.getData(source, flag) = 0u;\n\n  if (CONCURRENT) {\n    pushWrap(*next, source, \"parallel\");\n  } else {\n    pushWrap(*next, source);\n  }\n  // adding source to the 
worklist\n  work_items += 1;\n  // next_bitset.set(source);\n\n  int64_t edges_to_check = graph.sizeEdges();\n  int64_t scout_count =\n      std::distance(graph.edge_begin(source), graph.edge_end(source));\n  galois::gPrint(\"source: \", source, \" has OutDegree:\", scout_count, \"\\n\");\n  assert(!next->empty());\n\n  uint64_t old_workItemNum = 0;\n  uint64_t numNodes        = graph.size();\n  // uint32_t c_pull = 0, c_push = 0;\n  galois::GAccumulator<uint64_t> writes_pull, writes_push;\n  writes_push.reset();\n  writes_pull.reset();\n  // std::vector<uint32_t> pull_levels;\n  // pull_levels.reserve(10);\n\n  while (!next->empty()) {\n\n    std::swap(curr, next);\n    next->clear();\n    if (scout_count > edges_to_check / alpha) {\n\n      WlToBitset(*curr, front_bitset);\n      do {\n        // c_pull++;\n        // pull_levels.push_back(nextLevel);\n\n        ++nextLevel;\n        old_workItemNum = work_items.reduce();\n        work_items.reset();\n\n        // PULL from in-edges\n        loop(\n            galois::iterate(graph),\n            [&](const T& dst) {\n              auto& ddata = graph.getData(dst, flag);\n              if (ddata == BFS::DIST_INFINITY) {\n                for (auto e : graph.in_edges(dst)) {\n                  auto src = graph.getInEdgeDst(e);\n\n                  if (front_bitset.test(src)) {\n                    /*\n                     * Currently assigning parents on the bfs path.\n                     * Assign nextLevel (uncomment below)\n                     */\n                    // ddata = nextLevel;\n                    ddata = src;\n                    next_bitset.set(dst);\n                    work_items += 1;\n                    break;\n                  }\n                }\n              }\n            },\n            galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n            galois::loopname(\n                (std::string(\"Sync-pull_\") + std::to_string(runID)).c_str()));\n\n        std::swap(front_bitset, 
next_bitset);\n        next_bitset.reset();\n      } while (work_items.reduce() >= old_workItemNum ||\n               (work_items.reduce() > numNodes / beta));\n\n      BitsetToWl(graph, front_bitset, *next);\n      scout_count = 1;\n    } else {\n      // c_push++;\n      ++nextLevel;\n      edges_to_check -= scout_count;\n      work_items.reset();\n      // PUSH to out-edges\n      loop(\n          galois::iterate(*curr),\n          [&](const T& src) {\n            for (auto e : graph.edges(src)) {\n              auto dst    = graph.getEdgeDst(e);\n              auto& ddata = graph.getData(dst, flag);\n\n              if (ddata == BFS::DIST_INFINITY) {\n                Dist oldDist = ddata;\n                /*\n                 * Currently assigning parents on the bfs path.\n                 * Assign nextLevel (uncomment below)\n                 */\n                // if(__sync_bool_compare_and_swap(&ddata, oldDist, nextLevel))\n                // {\n                if (__sync_bool_compare_and_swap(&ddata, oldDist, src)) {\n                  next->push(dst);\n                  work_items += (graph.edge_end(dst) - graph.edge_begin(dst));\n                }\n              }\n            }\n          },\n          galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n          galois::loopname(\n              (std::string(\"Sync-push_\") + std::to_string(runID)).c_str()));\n\n      scout_count = work_items.reduce();\n    }\n  }\n\n  delete curr;\n  delete next;\n}\n\ntemplate <bool CONCURRENT, typename T, typename P, typename R>\nvoid asyncAlgo(Graph& graph, GNode source, const P& pushWrap,\n               const R& GALOIS_UNUSED(edgeRange)) {\n\n  namespace gwl = galois::worklists;\n  // typedef PerSocketChunkFIFO<CHUNK_SIZE> dFIFO;\n  using FIFO = gwl::PerSocketChunkFIFO<CHUNK_SIZE>;\n  using WL   = FIFO;\n\n  using Loop =\n      typename std::conditional<CONCURRENT, galois::ForEach,\n                                galois::WhileQ<galois::SerFIFO<T>>>::type;\n\n  
Loop loop;\n\n  galois::GAccumulator<size_t> BadWork;\n  galois::GAccumulator<size_t> WLEmptyWork;\n\n  graph.getData(source) = 0;\n  galois::InsertBag<T> initBag;\n\n  if (CONCURRENT) {\n    pushWrap(initBag, source, \"parallel\");\n  } else {\n    pushWrap(initBag, source);\n  }\n\n  loop(\n      galois::iterate(initBag),\n      [&](const GNode& src, auto& ctx) {\n        constexpr galois::MethodFlag flag = galois::MethodFlag::UNPROTECTED;\n\n        for (auto ii : graph.edges(src)) {\n          GNode dst   = graph.getEdgeDst(ii);\n          auto& ddata = graph.getData(dst, flag);\n\n          if (ddata == BFS::DIST_INFINITY) {\n            Dist oldDist = ddata;\n            if (__sync_bool_compare_and_swap(&ddata, oldDist, src)) {\n              ctx.push(dst);\n            }\n          }\n        }\n      },\n      galois::wl<WL>(), galois::loopname(\"runBFS\"),\n      galois::disable_conflict_detection());\n}\n\ntemplate <bool CONCURRENT>\nvoid runAlgo(Graph& graph, const GNode& source, const uint32_t runID) {\n  switch (algo) {\n  case SyncDO:\n    syncDOAlgo<CONCURRENT, GNode>(graph, source, NodePushWrap(),\n                                  OutEdgeRangeFn{graph}, runID);\n    break;\n  case Async:\n    asyncAlgo<CONCURRENT, GNode>(graph, source, NodePushWrap(),\n                                 OutEdgeRangeFn{graph});\n    break;\n\n  default:\n    std::cerr << \"ERROR: unkown algo type\\n\";\n  }\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  Graph graph;\n  GNode source;\n  GNode report;\n\n  galois::StatTimer StatTimer_graphConstuct(\"TimerConstructGraph\", \"BFS\");\n  StatTimer_graphConstuct.start();\n  graph.readAndConstructBiGraphFromGRFile(inputFile);\n  StatTimer_graphConstuct.stop();\n  std::cout << \"Read \" << graph.size() << \" nodes, \" << graph.sizeEdges()\n            << \" edges\\n\";\n\n 
 if (startNode >= graph.size() || reportNode >= graph.size()) {\n    std::cerr << \"failed to set report: \" << reportNode\n              << \" or failed to set source: \" << startNode << \"\\n\";\n    assert(0);\n    abort();\n  }\n\n  auto it = graph.begin();\n  std::advance(it, startNode.getValue());\n  source = *it;\n  it     = graph.begin();\n  std::advance(it, reportNode.getValue());\n  report = *it;\n\n  galois::preAlloc(preAlloc);\n  galois::gPrint(\"Fixed preAlloc done : \", preAlloc, \"\\n\");\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  galois::do_all(galois::iterate(graph),\n                 [&graph](GNode n) { graph.getData(n) = BFS::DIST_INFINITY; });\n\n  graph.getData(source) = 0;\n\n  std::cout << \"Running \" << ALGO_NAMES[algo] << \" algorithm with \"\n            << (bool(execution) ? \"PARALLEL\" : \"SERIAL\") << \" execution\\n\";\n\n  std::cout\n      << \"WARNING: This bfs version uses bi-directional CSR graph \"\n      << \"and assigns parent instead of the shortest distance from source\\n\";\n  if (algo == Async) {\n    std::cout << \"WARNING: Async bfs does not use direction optimization. 
\"\n              << \"It uses Galois for_each for asynchronous execution which is \"\n                 \"advantageous \"\n              << \"for large diameter graphs such as road networks\\n\";\n  }\n\n  std::cout << \" Execution started\\n\";\n\n  galois::StatTimer autoAlgoTimer(\"AutoAlgo_0\");\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n\n  if (algo == AutoAlgo) {\n    autoAlgoTimer.start();\n    if (isApproximateDegreeDistributionPowerLaw(graph)) {\n      algo = SyncDO;\n    } else {\n      algo = Async;\n    }\n    autoAlgoTimer.stop();\n    galois::gInfo(\"Choosing \", ALGO_NAMES[algo], \" algorithm\");\n  }\n\n  for (unsigned int run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"BFS::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_Run\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), \"BFS\");\n    StatTimer_main.start();\n\n    if (execution == SERIAL) {\n      runAlgo<false>(graph, source, run);\n    } else if (execution == PARALLEL) {\n      galois::runtime::profileVtune(\n          [&]() { runAlgo<true>(graph, source, run); }, \"runAlgo\");\n    } else {\n      std::cerr << \"ERROR: unknown type of execution passed to -exec\\n\";\n      std::abort();\n    }\n\n    StatTimer_main.stop();\n\n    if ((run + 1) != numRuns) {\n      for (unsigned int i = 0; i < 1; ++i) {\n        galois::do_all(galois::iterate(graph), [&graph](GNode n) {\n          graph.getData(n) = BFS::DIST_INFINITY;\n        });\n      }\n    }\n  }\n\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  std::cout << \"Node \" << reportNode << \" has parent \" << graph.getData(report)\n            << \"\\n\";\n\n  if (!skipVerify) {\n    for (GNode n = 0; n < numPrint; n++) {\n      galois::gPrint(\"parent[\", n, \"] : \", graph.getData(n), \"\\n\");\n    }\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/bipart/CMakeLists.txt",
    "content": "add_executable(bipart-cpu bipart.cpp Coarsening.cpp Metric.cpp Partitioning.cpp Refine.cpp)\nadd_dependencies(apps bipart-cpu)\ntarget_link_libraries(bipart-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS bipart-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small1 bipart-cpu -hMetisGraph \"${BASEINPUT}/partitioning/ibm01.hgr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/bipart/Coarsening.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"bipart.h\"\n#include \"galois/Galois.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/runtime/Profile.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/gstl.h\"\n\n#include <iostream>\n#include <unordered_set>\n#include <unordered_map>\n\n// constexpr static const unsigned CHUNK_SIZE = 512U;\n\nint TOTALW;\nint LIMIT;\nbool FLAG = false;\nnamespace {\n\nint hash(unsigned val) {\n  unsigned long int seed = val * 1103515245 + 12345;\n  return ((unsigned)(seed / 65536) % 32768);\n}\n\nvoid parallelRand(std::shared_ptr<MetisGraph> graph, int) {\n\n  GGraph* fineGGraph = graph->getFinerGraph()->getGraph();\n\n  galois::StatTimer T_RAND(\"RAND\");\n  T_RAND.start();\n\n  galois::do_all(\n      galois::iterate((uint64_t)0, fineGGraph->hedges),\n    
  [&fineGGraph](uint64_t item) {\n        unsigned netnum = fineGGraph->getData(item, flag_no_lock).netnum;\n        netnum          = hash(netnum);\n        fineGGraph->getData(item, flag_no_lock).netrand = netnum;\n      },\n      galois::steal(),\n      // galois::chunk_size<CHUNK_SIZE>());\n      galois::loopname(\"rand\"));\n  T_RAND.stop();\n\n  // std::cout <<\"hedges: \" << fineGGraph->hedges << std::endl;\n\n  galois::StatTimer T_INDEX(\"INDEX\");\n  T_INDEX.start();\n  galois::do_all(\n      galois::iterate((uint64_t)0, fineGGraph->hedges),\n      [&fineGGraph](uint64_t item) {\n        unsigned netnum = fineGGraph->getData(item, flag_no_lock).index;\n        netnum          = hash(1);\n        fineGGraph->getData(item, flag_no_lock).index = netnum;\n      },\n      galois::steal(),\n      // galois::chunk_size<CHUNK_SIZE>());\n      galois::loopname(\"rand_index\"));\n  T_INDEX.stop();\n\n  // std::cout <<\"rand: \" << T_RAND.get() << std::endl;\n  // std::cout << \"rand_index: \" << T_INDEX.get() << std::endl;\n}\n\nusing MatchingPolicy = void(GNode, GGraph*);\n\nvoid PLD_f(GNode node, GGraph* fineGGraph) {\n  int ss =\n      std::distance(fineGGraph->edge_begin(node), fineGGraph->edge_end(node));\n  fineGGraph->getData(node).netval = -ss;\n}\nvoid RAND_f(GNode node, GGraph* fineGGraph) {\n  unsigned id                       = fineGGraph->getData(node).netrand;\n  fineGGraph->getData(node).netval  = -id;\n  fineGGraph->getData(node).netrand = -fineGGraph->getData(node).netnum;\n}\nvoid PP_f(GNode node, GGraph* fineGGraph) {\n  int ss =\n      std::distance(fineGGraph->edge_begin(node), fineGGraph->edge_end(node));\n  fineGGraph->getData(node).netval = ss;\n}\nvoid WD_f(GNode node, GGraph* fineGGraph) {\n  int w = 0;\n  for (auto n : fineGGraph->edges(node)) {\n    auto nn = fineGGraph->getEdgeDst(n);\n    w += fineGGraph->getData(nn).getWeight();\n  }\n  fineGGraph->getData(node).netval = -w;\n}\nvoid MWD_f(GNode node, GGraph* fineGGraph) {\n  int w = 
0;\n  for (auto n : fineGGraph->edges(node)) {\n    auto nn = fineGGraph->getEdgeDst(n);\n    w += fineGGraph->getData(nn).getWeight();\n  }\n  fineGGraph->getData(node).netval = w;\n}\nvoid RI_f(GNode node, GGraph* fineGGraph) {\n  int ss =\n      std::distance(fineGGraph->edge_begin(node), fineGGraph->edge_end(node));\n  fineGGraph->getData(node).netval = ss;\n}\nvoid MRI_f(GNode node, GGraph* fineGGraph) {\n  int ss =\n      std::distance(fineGGraph->edge_begin(node), fineGGraph->edge_end(node));\n  fineGGraph->getData(node).netval = ss;\n}\nvoid DEG_f(GNode node, GGraph* fineGGraph) {\n  int w = 0;\n  int ss =\n      std::distance(fineGGraph->edge_begin(node), fineGGraph->edge_end(node));\n  fineGGraph->getData(node).netval = ss;\n  for (auto n : fineGGraph->edges(node)) {\n    auto nn = fineGGraph->getEdgeDst(n);\n    w += fineGGraph->getData(nn).getWeight();\n  }\n  fineGGraph->getData(node).netval = -(w / ss);\n}\nvoid MDEG_f(GNode node, GGraph* fineGGraph) {\n  int w = 0;\n  int ss =\n      std::distance(fineGGraph->edge_begin(node), fineGGraph->edge_end(node));\n  fineGGraph->getData(node).netval = ss;\n  for (auto n : fineGGraph->edges(node)) {\n    auto nn = fineGGraph->getEdgeDst(n);\n    w += fineGGraph->getData(nn).getWeight();\n  }\n  fineGGraph->getData(node).netval = w / ss;\n}\n\ntemplate <MatchingPolicy matcher>\nvoid parallelPrioRand(std::shared_ptr<MetisGraph> graph, int iter) {\n\n  GGraph* fineGGraph = graph->getFinerGraph()->getGraph();\n  parallelRand(graph, iter);\n\n  galois::do_all(\n      galois::iterate(size_t{0}, fineGGraph->hedges),\n      [&](GNode item) {\n        matcher(item, fineGGraph);\n        for (auto c : fineGGraph->edges(item)) {\n          auto dst = fineGGraph->getEdgeDst(c);\n          galois::atomicMin(fineGGraph->getData(dst).netval,\n                            fineGGraph->getData(item).netval.load());\n        }\n      },\n      galois::steal(), galois::loopname(\"atomicMin\"));\n  galois::do_all(\n      
galois::iterate(size_t{0}, fineGGraph->hedges),\n      [&](GNode item) {\n        for (auto c : fineGGraph->edges(item)) {\n          auto dst = fineGGraph->getEdgeDst(c);\n          if (fineGGraph->getData(dst).netval ==\n              fineGGraph->getData(item).netval)\n            galois::atomicMin(fineGGraph->getData(dst).netrand,\n                              fineGGraph->getData(item).netrand.load());\n        }\n      },\n      galois::steal(), galois::loopname(\"secondMin2\"));\n  galois::do_all(\n      galois::iterate(size_t{0}, fineGGraph->hedges),\n      [&](GNode item) {\n        for (auto c : fineGGraph->edges(item)) {\n          auto dst = fineGGraph->getEdgeDst(c);\n          if (fineGGraph->getData(dst).netrand ==\n              fineGGraph->getData(item).netrand)\n            galois::atomicMin(fineGGraph->getData(dst).netnum,\n                              fineGGraph->getData(item).netnum.load());\n        }\n      },\n      galois::steal(), galois::loopname(\"secondMin\"));\n}\n\n// hyper edge matching\ntemplate <MatchingPolicy matcher>\nvoid parallelHMatchAndCreateNodes(std::shared_ptr<MetisGraph> graph, int iter,\n                                  GNodeBag& bag, std::vector<bool>& hedges,\n                                  galois::LargeArray<unsigned>& weight) {\n  parallelPrioRand<matcher>(graph, iter);\n  GGraph* fineGGraph = graph->getFinerGraph()->getGraph();\n  assert(fineGGraph != graph->getGraph());\n  typedef std::vector<GNode> VecTy;\n  typedef galois::substrate::PerThreadStorage<VecTy> ThreadLocalData;\n  ThreadLocalData edgesThreadLocal;\n  std::string name = \"phaseI\";\n\n  galois::GAccumulator<unsigned> hedge;\n\n  galois::InsertBag<GNode> hedge_bag;\n\n  galois::do_all(\n      galois::iterate(size_t{0}, fineGGraph->hedges),\n      [&](GNode item) {\n        bool flag       = false;\n        unsigned nodeid = INT_MAX;\n        auto& edges     = *edgesThreadLocal.getLocal();\n        edges.clear();\n        int w = 0;\n        for 
(auto c : fineGGraph->edges(item)) {\n          auto dst   = fineGGraph->getEdgeDst(c);\n          auto& data = fineGGraph->getData(dst);\n          if (data.isMatched()) {\n            flag = true;\n            continue;\n          }\n          if (data.netnum == fineGGraph->getData(item).netnum) {\n            if (w + fineGGraph->getData(dst).getWeight() > LIMIT)\n              break;\n            edges.push_back(dst);\n            w += fineGGraph->getData(dst).getWeight();\n            nodeid = std::min(nodeid, dst);\n          } else {\n            flag = true;\n          }\n        }\n\n        if (!edges.empty()) {\n          if (flag && edges.size() == 1)\n            return;\n          fineGGraph->getData(item).setMatched();\n          if (flag)\n            hedge_bag.push(item);\n\n          bag.push(nodeid);\n          unsigned ww = 0;\n          for (auto pp : edges) {\n            ww += fineGGraph->getData(pp).getWeight();\n            fineGGraph->getData(pp).setMatched();\n            fineGGraph->getData(pp).setParent(nodeid);\n            fineGGraph->getData(pp).netnum = fineGGraph->getData(item).netnum;\n            // fineGGraph->getData(pp).netnum =\n            // fineGGraph->getData(item).netnum.load();\n          }\n          weight[nodeid - fineGGraph->hedges] = ww;\n        }\n      },\n      galois::loopname(\"phaseI\"));\n\n  for (auto item : hedge_bag)\n    hedges[item] = true;\n}\n\nvoid moreCoarse(std::shared_ptr<MetisGraph> graph,\n                galois::LargeArray<unsigned>& weight) {\n\n  GGraph* fineGGraph = graph->getFinerGraph()->getGraph();\n  typedef std::vector<GNode> VecTy;\n  GNodeBag bag;\n  typedef galois::substrate::PerThreadStorage<VecTy> ThreadLocalData;\n  ThreadLocalData edgesThreadLocal;\n  galois::do_all(\n      galois::iterate(size_t{0}, fineGGraph->hedges),\n      [&](GNode item) {\n        if (fineGGraph->getData(item).isMatched())\n          return;\n        for (auto c : fineGGraph->edges(item)) {\n          auto 
dst = fineGGraph->getEdgeDst(c);\n          if (fineGGraph->getData(dst).isMatched())\n            fineGGraph->getData(dst).netval = INT_MIN;\n        }\n      },\n      galois::steal(), galois::loopname(\"atomicMin2\"));\n  galois::do_all(\n      galois::iterate(size_t{0}, fineGGraph->hedges),\n      [&](GNode item) {\n        if (fineGGraph->getData(item).isMatched())\n          return;\n        auto& cells = *edgesThreadLocal.getLocal();\n        cells.clear();\n        int best = INT_MAX;\n        GNode b  = 0;\n        for (auto edge : fineGGraph->edges(item)) {\n          auto e     = fineGGraph->getEdgeDst(edge);\n          auto& data = fineGGraph->getData(e);\n          if (!fineGGraph->getData(e).isMatched()) {\n            if (data.netnum == fineGGraph->getData(item).netnum) {\n              cells.push_back(e);\n            }\n          } else if (fineGGraph->getData(e).netval == INT_MIN) {\n            if (fineGGraph->getData(e).getWeight() < best) {\n              best = fineGGraph->getData(e).getWeight();\n              b    = e;\n            } else if (fineGGraph->getData(e).getWeight() == best) {\n              if (e < b)\n                b = e;\n            }\n          }\n        }\n        if (cells.size() > 0) {\n          if (best < INT_MAX) {\n            auto nn = fineGGraph->getData(b).getParent();\n            for (auto e : cells) {\n              bag.push(e);\n              fineGGraph->getData(e).setMatched();\n              fineGGraph->getData(e).setParent(nn);\n              fineGGraph->getData(e).netnum = fineGGraph->getData(b).netnum;\n              // fineGGraph->getData(e).netnum =\n              // fineGGraph->getData(b).netnum.load();\n            }\n          }\n        }\n      },\n      galois::steal(), galois::loopname(\"moreCoarse\"));\n  for (auto c : bag) {\n    auto nn = fineGGraph->getData(c).getParent();\n    int ww  = weight[nn - fineGGraph->hedges];\n    ww += fineGGraph->getData(c).getWeight();\n    weight[nn - 
fineGGraph->hedges] = ww;\n  }\n}\n\n// Coarsening phaseII\nvoid coarsePhaseII(std::shared_ptr<MetisGraph> graph, std::vector<bool>& hedges,\n                   galois::LargeArray<unsigned>& weight) {\n\n  GGraph* fineGGraph = graph->getFinerGraph()->getGraph();\n  typedef std::set<int> SecTy;\n  typedef std::vector<GNode> VecTy;\n  typedef galois::substrate::PerThreadStorage<SecTy> ThreadLocalData;\n  ThreadLocalData edgesThreadLocal;\n  typedef galois::substrate::PerThreadStorage<VecTy> ThreadLocalDataV;\n  ThreadLocalDataV edgesThreadLocalV;\n  std::string name = \"CoarseningPhaseII\";\n  galois::GAccumulator<int> hhedges;\n  galois::GAccumulator<int> hnode;\n  moreCoarse(graph, weight);\n\n  galois::InsertBag<GNode> hedge_bag;\n\n  galois::do_all(\n      galois::iterate(size_t{0}, fineGGraph->hedges),\n      [&](GNode item) {\n        if (fineGGraph->getData(item).isMatched())\n          return;\n        unsigned ids;\n        int count = 0;\n        for (auto c : fineGGraph->edges(item)) {\n          auto dst   = fineGGraph->getEdgeDst(c);\n          auto& data = fineGGraph->getData(dst);\n          if (data.isMatched()) {\n            if (count == 0) {\n              ids = data.getParent();\n              count++;\n            } else if (ids != data.getParent()) {\n              count++;\n              break;\n            }\n          } else {\n            count = 0;\n            break;\n          }\n        }\n        if (count == 1) {\n          fineGGraph->getData(item).setMatched();\n\n        } else {\n          //\tauto& vec = *edgesThreadLocalV.getLocal();\n          // vec.push_back(item);\n          hedge_bag.push(item);\n          fineGGraph->getData(item).setMatched();\n        }\n      },\n      galois::steal(), galois::loopname(\"count # Hyperedges\"));\n\n  for (auto item : hedge_bag)\n    hedges[item] = true;\n}\n\n// find nodes that are not incident to any hyperedge\nvoid findLoneNodes(GGraph& graph) {\n\n  galois::do_all(\n      
galois::iterate((uint64_t)graph.hedges, graph.size()),\n      [&](GNode n) { graph.getData(n).notAlone = false; }, galois::steal(),\n      galois::loopname(\"initialize not alone variables\"));\n\n  galois::do_all(\n      galois::iterate((uint64_t)0, graph.hedges),\n      [&](GNode h) {\n        for (auto n : graph.edges(h))\n          graph.getData(graph.getEdgeDst(n)).notAlone = true;\n      },\n      galois::steal(), galois::loopname(\"set not alone variables\"));\n}\n\n// create coarsened graphs\nvoid parallelCreateEdges(std::shared_ptr<MetisGraph> graph, GNodeBag& bag,\n                         std::vector<bool>& hedges,\n                         galois::LargeArray<unsigned>& weight) {\n\n  GGraph* fineGGraph   = graph->getFinerGraph()->getGraph();\n  GGraph* coarseGGraph = graph->getGraph();\n  assert(fineGGraph != coarseGGraph);\n  galois::GAccumulator<unsigned> hg;\n  galois::do_all(\n      galois::iterate(size_t{0}, fineGGraph->hedges),\n      [&](GNode n) {\n        if (hedges[n])\n          hg += 1;\n      },\n      galois::steal(), galois::loopname(\"number of hyperedges loop\"));\n\n  // find lone nodes\n  findLoneNodes(*fineGGraph);\n\n  galois::do_all(\n      galois::iterate(fineGGraph->hedges, fineGGraph->size()),\n      [&](GNode ii) {\n        if (!fineGGraph->getData(ii)\n                 .isMatched()) { // && fineGGraph->getData(ii).notAlone) {\n          bag.push(ii);\n          fineGGraph->getData(ii).setMatched();\n          fineGGraph->getData(ii).setParent(ii);\n          fineGGraph->getData(ii).netnum  = INT_MAX;\n          weight[ii - fineGGraph->hedges] = fineGGraph->getData(ii).getWeight();\n        }\n      },\n      galois::steal(), galois::loopname(\"noedgebag match\"));\n\n  galois::StatTimer T_BAG(\"BAG\");\n  T_BAG.start();\n  std::vector<bool> inNodeBag(1000, false);\n  std::vector<unsigned> nodeid(1000, INT_MAX);\n\n  for (GNode ii = fineGGraph->hedges; ii < fineGGraph->size(); ii++) {\n\n    if 
(!fineGGraph->getData(ii).isMatched() &&\n        !fineGGraph->getData(ii).notAlone) {\n      int index        = ii % 1000;\n      inNodeBag[index] = true;\n      if (ii < nodeid[index])\n        nodeid[index] = ii;\n    }\n  }\n\n  for (int i = 0; i < 1000; i++) {\n\n    if (inNodeBag[i]) {\n      bag.push(nodeid[i]);\n      weight[nodeid[i] - fineGGraph->hedges] = 0;\n    }\n  }\n\n  for (GNode ii = fineGGraph->hedges; ii < fineGGraph->size(); ii++) {\n\n    if (!fineGGraph->getData(ii).isMatched() &&\n        !fineGGraph->getData(ii).notAlone) {\n      int index = ii % 1000;\n      fineGGraph->getData(ii).setMatched();\n      fineGGraph->getData(ii).setParent(nodeid[index]);\n      fineGGraph->getData(ii).netnum = INT_MAX;\n\n      weight[nodeid[index] - fineGGraph->hedges] +=\n          fineGGraph->getData(ii).getWeight();\n    }\n  }\n  T_BAG.stop();\n\n  // std::cout <<\"bag time: \"<< T_BAG.get() << std::endl;\n  unsigned hnum   = hg.reduce();\n  unsigned nodes  = std::distance(bag.begin(), bag.end()); // + numnodes;\n  unsigned newval = hnum;\n\n  std::vector<unsigned> idmap(fineGGraph->hnodes);\n  std::vector<unsigned> newrand(nodes);\n  std::vector<unsigned> newWeight(nodes);\n  galois::StatTimer Tloop(\"for loop\");\n  Tloop.start();\n  std::vector<unsigned> v;\n\n  galois::LargeArray<bool> inBag;\n\n  inBag.allocateBlocked(fineGGraph->size());\n  for (GNode n = fineGGraph->hedges; n < fineGGraph->size(); n++)\n    inBag[n] = false;\n\n  for (auto n : bag)\n    inBag[n] = true;\n\n  for (GNode n = fineGGraph->hedges; n < fineGGraph->size(); n++)\n    if (inBag[n])\n      v.push_back(n);\n\n  for (auto n : v) {\n    newrand[newval - hnum]        = n;\n    idmap[n - fineGGraph->hedges] = newval++;\n    newWeight[idmap[n - fineGGraph->hedges] - hnum] =\n        weight[n - fineGGraph->hedges];\n  }\n\n  // for (GNode n = fineGGraph->hedges; n < fineGGraph->size(); n++) {\n  galois::do_all(\n      galois::iterate(fineGGraph->hedges, fineGGraph->size()),\n     
 [&](GNode n) {\n        unsigned id = fineGGraph->getData(n).getParent();\n        fineGGraph->getData(n).setParent(idmap[id - fineGGraph->hedges]);\n      },\n      galois::steal(), galois::loopname(\"first loop\"));\n  Tloop.stop();\n  uint32_t num_nodes_next = nodes + hnum;\n  uint64_t num_edges_next;\n  galois::gstl::Vector<galois::PODResizeableArray<uint32_t>> edges_id(\n      num_nodes_next);\n  std::vector<std::vector<EdgeTy>> edges_data(num_nodes_next);\n  std::vector<unsigned> old_id(hnum);\n\n  unsigned h_id = 0;\n\n  for (GNode n = 0; n < fineGGraph->hedges; n++) {\n    if (hedges[n]) {\n      old_id[h_id]                  = fineGGraph->getData(n).netnum;\n      fineGGraph->getData(n).nodeid = h_id++;\n    }\n  }\n\n  galois::do_all(\n      galois::iterate(size_t{0}, fineGGraph->hedges),\n      [&](GNode n) {\n        if (!hedges[n])\n          return;\n        // auto data   = fineGGraph->getData(n, flag_no_lock);\n        unsigned id = fineGGraph->getData(n).nodeid;\n\n        for (auto ii : fineGGraph->edges(n)) {\n          GNode dst = fineGGraph->getEdgeDst(ii);\n          //  auto dst_data = fineGGraph->getData(dst, flag_no_lock);\n          // unsigned pid  = dst_data.getParent();\n          unsigned pid = fineGGraph->getData(dst).getParent();\n\n          auto f = std::find(edges_id[id].begin(), edges_id[id].end(), pid);\n          if (f == edges_id[id].end()) {\n\n            edges_id[id].push_back(pid);\n          }\n        } // End edge loop\n      },\n      galois::steal(), galois::loopname(\"BuildGrah: Find edges\"));\n\n  std::vector<uint64_t> prefix_edges(num_nodes_next);\n  galois::GAccumulator<uint64_t> num_edges_acc;\n  galois::do_all(\n      galois::iterate(uint32_t{0}, num_nodes_next),\n      [&](uint32_t c) {\n        prefix_edges[c] = edges_id[c].size();\n        num_edges_acc += prefix_edges[c];\n      },\n      galois::steal(), galois::loopname(\"BuildGrah: Prefix sum\"));\n\n  num_edges_next = num_edges_acc.reduce();\n  for 
(uint32_t c = 1; c < num_nodes_next; ++c) {\n    prefix_edges[c] += prefix_edges[c - 1];\n  }\n\n  coarseGGraph->constructFrom(num_nodes_next, num_edges_next, prefix_edges,\n                              edges_id, edges_data);\n  coarseGGraph->hedges = hnum;\n  coarseGGraph->hnodes = nodes;\n  galois::do_all(\n      galois::iterate(*coarseGGraph),\n      [&](GNode ii) {\n        if (ii < hnum) {\n          coarseGGraph->getData(ii).netval = INT_MAX;\n          coarseGGraph->getData(ii).netnum = old_id[ii];\n        } else {\n          coarseGGraph->getData(ii).netval  = INT_MAX;\n          coarseGGraph->getData(ii).netnum  = INT_MAX;\n          coarseGGraph->getData(ii).netrand = INT_MAX;\n          coarseGGraph->getData(ii).nodeid  = ii;\n          coarseGGraph->getData(ii).setWeight(\n              newWeight[ii - coarseGGraph->hedges]);\n        }\n      },\n      galois::steal(), galois::loopname(\"noedgebag match\"));\n\n  inBag.destroy();\n  inBag.deallocate();\n}\n\nvoid findMatching(std::shared_ptr<MetisGraph> coarseMetisGraph,\n                  scheduleMode sch, int iter) {\n  std::shared_ptr<MetisGraph> fineMetisGraph =\n      coarseMetisGraph->getFinerGraph();\n  GNodeBag nodes;\n  int sz = coarseMetisGraph->getFinerGraph()->getGraph()->hedges;\n  std::vector<bool> hedges(sz, false);\n  galois::LargeArray<unsigned> weight;\n  weight.allocateBlocked(fineMetisGraph->getGraph()->hnodes);\n\n  switch (sch) {\n  case PLD:\n    parallelHMatchAndCreateNodes<PLD_f>(coarseMetisGraph, iter, nodes, hedges,\n                                        weight);\n    break;\n  case RAND:\n    parallelHMatchAndCreateNodes<RAND_f>(coarseMetisGraph, iter, nodes, hedges,\n                                         weight);\n    break;\n  case PP:\n    parallelHMatchAndCreateNodes<PP_f>(coarseMetisGraph, iter, nodes, hedges,\n                                       weight);\n    break;\n  case WD:\n    parallelHMatchAndCreateNodes<WD_f>(coarseMetisGraph, iter, nodes, hedges,\n    
                                   weight);\n    break;\n  case RI:\n    parallelHMatchAndCreateNodes<RI_f>(coarseMetisGraph, iter, nodes, hedges,\n                                       weight);\n    break;\n  case MRI:\n    parallelHMatchAndCreateNodes<MRI_f>(coarseMetisGraph, iter, nodes, hedges,\n                                        weight);\n    break;\n  case MWD:\n    parallelHMatchAndCreateNodes<MWD_f>(coarseMetisGraph, iter, nodes, hedges,\n                                        weight);\n    break;\n  case DEG:\n    parallelHMatchAndCreateNodes<DEG_f>(coarseMetisGraph, iter, nodes, hedges,\n                                        weight);\n    break;\n  case MDEG:\n    parallelHMatchAndCreateNodes<MDEG_f>(coarseMetisGraph, iter, nodes, hedges,\n                                         weight);\n    break;\n  default:\n    abort();\n  }\n  coarsePhaseII(coarseMetisGraph, hedges, weight);\n  parallelCreateEdges(coarseMetisGraph, nodes, hedges, weight);\n\n  weight.destroy();\n  weight.deallocate();\n}\n\nstd::shared_ptr<MetisGraph>\ncoarsenOnce(std::shared_ptr<MetisGraph> fineMetisGraph, scheduleMode sch,\n            int iter) {\n  std::shared_ptr<MetisGraph> coarseMetisGraph =\n      std::make_shared<MetisGraph>(fineMetisGraph);\n  findMatching(coarseMetisGraph, sch, iter);\n  return coarseMetisGraph;\n}\n\n} // namespace\n\nstd::shared_ptr<MetisGraph> coarsen(std::shared_ptr<MetisGraph> fineMetisGraph,\n                                    unsigned coarsenTo, scheduleMode sch) {\n\n  std::shared_ptr<MetisGraph> coarseGraph = fineMetisGraph;\n  unsigned size                           = fineMetisGraph->getGraph()->hnodes;\n  unsigned hedgeSize                      = 0;\n  const float ratio                       = 55.0 / 45.0;\n  const float tol                         = std::max(ratio, 1 - ratio) - 1;\n  const int hi                            = (1 + tol) * size / (2 + tol);\n  LIMIT                                   = hi / 4;\n\n  unsigned Size    = 
size;\n  unsigned iterNum = 0;\n  unsigned newSize = size;\n  while (Size > coarsenTo) {\n    if (iterNum > coarsenTo)\n      break;\n    if (Size - newSize <= 0 && iterNum > 2)\n      break;\n    newSize     = coarseGraph->getGraph()->hnodes;\n    coarseGraph = coarsenOnce(coarseGraph, sch, iterNum);\n    Size        = coarseGraph->getGraph()->hnodes;\n    hedgeSize   = coarseGraph->getGraph()->hedges;\n    // std::cout << \"SIZE IS \" << coarseGraph->getGraph()->hnodes << \" and net\n    // is \"\n    //          << hedgeSize << \"\\n\";\n    if (hedgeSize < 1000)\n      break;\n\n    ++iterNum;\n  }\n  return coarseGraph;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/bipart/Metric.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"bipart.h\"\n\n#include <iomanip>\n#include <iostream>\n#include <numeric>\n\nstruct onlineStat {\n  unsigned num;\n  unsigned val;\n  double valSQ;\n  unsigned mmin;\n  unsigned mmax;\n\n  onlineStat()\n      : num(0), val(0), valSQ(0), mmin(std::numeric_limits<unsigned>::max()),\n        mmax(0) {}\n\n  void add(unsigned v) {\n    ++num;\n    val += v;\n    valSQ += (double)v * (double)v;\n    mmin = std::min(v, mmin);\n    mmax = std::max(v, mmax);\n  }\n\n  double mean() { return (double)val / (double)num; }\n\n  double variance() {\n    double t = valSQ / (double)num;\n    double m = mean();\n    return t - m * m;\n  }\n\n  unsigned count() { return num; }\n  unsigned total() { return val; }\n  unsigned min() { return mmin; }\n  unsigned max() { return mmax; }\n};\n\nunsigned graphStat(GGraph& graph) 
{\n  onlineStat e;\n  for (auto ii : graph) {\n    unsigned val = std::distance(graph.edge_begin(ii), graph.edge_end(ii));\n    e.add(val);\n  }\n  std::cout << \"Nodes \" << e.count() << \" Edges(total, var, min, max) \"\n            << e.total() << \" \" << e.variance() << \" \" << e.min() << \" \"\n            << e.max();\n  return e.count();\n}\n\nstd::vector<unsigned> edgeCut(GGraph& g, unsigned nparts) {\n  std::vector<unsigned> cuts(nparts);\n\n  // find boundary nodes with positive gain\n  for (auto nn : g) {\n    unsigned gPart = g.getData(nn).getPart();\n    for (auto ii : g.edges(nn)) {\n      auto& m = g.getData(g.getEdgeDst(ii));\n      if (m.getPart() != gPart) {\n        cuts.at(gPart) += g.getEdgeData(ii);\n      }\n    }\n  }\n  return cuts;\n}\n\nunsigned computeCut(GGraph& g) {\n  unsigned cuts = 0;\n  for (auto nn : g) {\n    unsigned gPart = g.getData(nn).getPart();\n    for (auto ii : g.edges(nn)) {\n      auto& m = g.getData(g.getEdgeDst(ii));\n      if (m.getPart() != gPart)\n        cuts += g.getEdgeData(ii);\n    }\n  }\n  return cuts / 2;\n}\n\nvoid printCuts(const char* str, MetisGraph* g, unsigned numPartitions) {\n  std::vector<unsigned> ec = edgeCut(*g->getGraph(), numPartitions);\n  std::cout << str << \" Edge Cuts:\\n\";\n  for (unsigned x = 0; x < ec.size(); ++x)\n    std::cout << (x == 0 ? \"\" : \" \") << ec[x];\n  std::cout << \"\\n\";\n  std::cout << str << \" Average Edge Cut: \"\n            << (std::accumulate(ec.begin(), ec.end(), 0) / ec.size()) << \"\\n\";\n  std::cout << str\n            << \" Minimum Edge Cut: \" << *std::min_element(ec.begin(), ec.end())\n            << \"\\n\";\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/bipart/Partitioning.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include \"bipart.h\"\n#include <set>\n#include \"galois/Galois.h\"\n#include \"galois/AtomicHelpers.h\"\n#include <map>\n#include <set>\n#include <cstdlib>\n#include <iostream>\n#include <stack>\n#include <climits>\n#include <array>\n\nnamespace {\n// final\n__attribute__((unused)) int cut(GGraph& g) {\n\n  GNodeBag bag;\n  galois::do_all(\n      galois::iterate(g),\n      [&](GNode n) {\n        if (g.hedges <= n)\n          return;\n        for (auto cell : g.edges(n)) {\n          auto c   = g.getEdgeDst(cell);\n          int part = g.getData(c).getPart();\n          for (auto x : g.edges(n)) {\n            auto cc   = g.getEdgeDst(x);\n            int partc = g.getData(cc).getPart();\n            if (partc != part) {\n              bag.push(n);\n          
    return;\n            }\n          }\n        }\n      },\n      galois::loopname(\"cutsize\"));\n  return std::distance(bag.begin(), bag.end());\n}\n\nvoid initGain(GGraph& g) {\n  galois::do_all(\n      galois::iterate(g),\n      [&](GNode n) {\n        if (n < g.hedges)\n          return;\n        g.getData(n).FS.store(0);\n        g.getData(n).TE.store(0);\n      },\n      galois::loopname(\"firstinit\"));\n\n  typedef std::map<GNode, int> mapTy;\n  typedef galois::substrate::PerThreadStorage<mapTy> ThreadLocalData;\n  ThreadLocalData edgesThreadLocal;\n  galois::do_all(\n      galois::iterate(g),\n      [&](GNode n) {\n        if (g.hedges <= n)\n          return;\n        int p1 = 0;\n        int p2 = 0;\n        for (auto x : g.edges(n)) {\n          auto cc  = g.getEdgeDst(x);\n          int part = g.getData(cc).getPart();\n          if (part == 0)\n            p1++;\n          else\n            p2++;\n          if (p1 > 1 && p2 > 1)\n            break;\n        }\n        if (!(p1 > 1 && p2 > 1) && (p1 + p2 > 1)) {\n          for (auto x : g.edges(n)) {\n            auto cc  = g.getEdgeDst(x);\n            int part = g.getData(cc).getPart();\n            int nodep;\n            if (part == 0)\n              nodep = p1;\n            else\n              nodep = p2;\n            if (nodep == 1) {\n              galois::atomicAdd(g.getData(cc).FS, 1);\n            }\n            if (nodep == (p1 + p2)) {\n              galois::atomicAdd(g.getData(cc).TE, 1);\n            }\n          }\n        }\n      },\n      galois::steal(), galois::loopname(\"initGainsPart\"));\n}\n\n} // namespace\n\n// Final\nvoid partition(std::shared_ptr<MetisGraph> mcg, unsigned K) {\n  GGraph* g = mcg->getGraph();\n  galois::GAccumulator<unsigned int> accum;\n  int waccum;\n  galois::GAccumulator<unsigned int> accumZ;\n  GNodeBag nodelist;\n  galois::do_all(\n      galois::iterate(g->hedges, g->size()),\n      [&](GNode item) {\n        accum += g->getData(item).getWeight();\n   
     g->getData(item, galois::MethodFlag::UNPROTECTED).initRefine(1, true);\n        g->getData(item, galois::MethodFlag::UNPROTECTED).initPartition();\n      },\n      galois::loopname(\"initPart\"));\n\n  galois::do_all(\n      galois::iterate(size_t{0}, g->hedges),\n      [&](GNode item) {\n        for (auto c : g->edges(item)) {\n          auto n = g->getEdgeDst(c);\n          g->getData(n).setPart(0);\n        }\n      },\n      galois::loopname(\"initones\"));\n  GNodeBag nodelistoz;\n  galois::do_all(\n      galois::iterate(g->hedges, g->size()),\n      [&](GNode item) {\n        if (g->getData(item).getPart() == 0) {\n          accumZ += g->getData(item).getWeight();\n          nodelist.push(item);\n        } else\n          nodelistoz.push(item);\n      },\n      galois::loopname(\"initones\"));\n  unsigned newSize = accum.reduce();\n  waccum           = accum.reduce() - accumZ.reduce();\n  // unsigned targetWeight = accum.reduce() / 2;\n  unsigned kvalue        = (K + 1) / 2;\n  unsigned targetWeight0 = (accum.reduce() * kvalue) / K;\n  unsigned targetWeight1 = accum.reduce() - targetWeight0;\n\n  if (static_cast<long>(accumZ.reduce()) > waccum) {\n    int gain = waccum;\n    // initGain(*g);\n    while (1) {\n      initGain(*g);\n      std::vector<GNode> nodeListz;\n      GNodeBag nodelistz;\n      galois::do_all(\n          galois::iterate(nodelist),\n          [&](GNode node) {\n            unsigned pp = g->getData(node).getPart();\n            if (pp == 0) {\n              nodelistz.push(node);\n            }\n          },\n          galois::loopname(\"while\"));\n\n      for (auto c : nodelistz)\n        nodeListz.push_back(c);\n      std::sort(\n          nodeListz.begin(), nodeListz.end(), [&g](GNode& lpw, GNode& rpw) {\n            if (fabs((float)((g->getData(lpw).getGain()) *\n                             (1.0f / g->getData(lpw).getWeight())) -\n                     (float)((g->getData(rpw).getGain()) *\n                             (1.0f / 
g->getData(rpw).getWeight()))) < 0.00001f)\n              return (float)g->getData(lpw).nodeid <\n                     (float)g->getData(rpw).nodeid;\n            return (float)((g->getData(lpw).getGain()) *\n                           (1.0f / g->getData(lpw).getWeight())) >\n                   (float)((g->getData(rpw).getGain()) *\n                           (1.0f / g->getData(rpw).getWeight()));\n          });\n      int i = 0;\n      for (auto zz : nodeListz) {\n        // auto zz = *nodeListz.begin();\n        g->getData(zz).setPart(1);\n        gain += g->getData(zz).getWeight();\n        // std::cout<<\" weight \"<<g->getData(zz).getWeight()<<\"\\n\";\n\n        i++;\n        if (gain >= static_cast<long>(targetWeight1))\n          break;\n        if (i > sqrt(newSize))\n          break;\n      }\n\n      if (gain >= static_cast<long>(targetWeight1))\n        break;\n      // updateGain(*g,zz);\n    }\n\n  } else {\n\n    int gain = accumZ.reduce();\n    // std::cout<<\"gain is \"<<gain<<\"\\n\";\n    // initGain(*g);\n    while (1) {\n      initGain(*g);\n      std::vector<GNode> nodeListz;\n      GNodeBag nodelistz;\n      galois::do_all(\n          galois::iterate(nodelistoz),\n          [&](GNode node) {\n            // for (auto node : nodelist) {\n            unsigned pp = g->getData(node).getPart();\n            if (pp == 1) {\n              nodelistz.push(node);\n            }\n          },\n          galois::loopname(\"while\"));\n      for (auto c : nodelistz)\n        nodeListz.push_back(c);\n\n      std::sort(\n          nodeListz.begin(), nodeListz.end(), [&g](GNode& lpw, GNode& rpw) {\n            if (fabs((float)((g->getData(lpw).getGain()) *\n                             (1.0f / g->getData(lpw).getWeight())) -\n                     (float)((g->getData(rpw).getGain()) *\n                             (1.0f / g->getData(rpw).getWeight()))) < 0.00001f)\n              return (float)g->getData(lpw).nodeid <\n                     
(float)g->getData(rpw).nodeid;\n            return (float)((g->getData(lpw).getGain()) *\n                           (1.0f / g->getData(lpw).getWeight())) >\n                   (float)((g->getData(rpw).getGain()) *\n                           (1.0f / g->getData(rpw).getWeight()));\n          });\n\n      int i = 0;\n      for (auto zz : nodeListz) {\n        // auto zz = *nodeListz.begin();\n        g->getData(zz).setPart(0);\n        gain += g->getData(zz).getWeight();\n\n        i++;\n        if (gain >= static_cast<long>(targetWeight0))\n          break;\n        if (i > sqrt(newSize))\n          break;\n      }\n\n      if (gain >= static_cast<long>(targetWeight0))\n        break;\n\n      // updateGain(*g,zz);\n    }\n  }\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/bipart/README.md",
    "content": "Hypergraph Partitioning Decomposition\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nPartitioning a hypergraph into <b>k</b> sets. A hypergraph is a generalization of a\ngraph in which edges can connect more than two nodes. Hypergraph partitioning has \napplications in VLSI, data mining, bioinformatics, etc. \n\nThe hypergraph is represented as a bipartite graph where one set of nodes represents\nhyperedges and the other set represents nodes. There is an edge between a node and a hyperedge\nif the node is in that hyperedge.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in **HMetis** input graphs (.hgr files).\nYou must specify the -hMetisGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/bipart/; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run on a machine with a k value of 4, use the following:\n`./bipart-cpu <input-graph> <number-of-coarsening-levels> <number-of-refinement-levels> -<scheduling-policy> -t=<num-threads> -hMetisGraph`\n"
  },
  {
    "path": "lonestar/analytics/cpu/bipart/Refine.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"bipart.h\"\n#include \"galois/AtomicHelpers.h\"\n#include <set>\n#include <iostream>\n#include <fstream>\n\nnamespace {\n\nvoid projectPart(std::shared_ptr<MetisGraph> Graph) {\n  GGraph* fineGraph   = Graph->getFinerGraph()->getGraph();\n  GGraph* coarseGraph = Graph->getGraph();\n  galois::do_all(\n      galois::iterate(fineGraph->hedges, fineGraph->size()),\n      [&](GNode n) {\n        auto parent   = fineGraph->getData(n).getParent();\n        auto& cn      = coarseGraph->getData(parent);\n        unsigned part = cn.getPart();\n        fineGraph->getData(n).setPart(part);\n      },\n      galois::loopname(\"project\"));\n}\n\nvoid initGains(GGraph& g, int) {\n  std::string name    = \"initgain\";\n  std::string 
fetsref = \"FETSREF_\"; // + std::to_string(pass);\n\n  galois::do_all(\n      galois::iterate(g.hedges, g.size()),\n      [&](GNode n) {\n        g.getData(n).FS.store(0);\n        g.getData(n).TE.store(0);\n      },\n      galois::loopname(name.c_str()));\n  galois::InsertBag<std::pair<GNode, GGraph::edge_iterator>> bag;\n  typedef std::map<GNode, int> mapTy;\n  typedef galois::substrate::PerThreadStorage<mapTy> ThreadLocalData;\n  ThreadLocalData edgesThreadLocal;\n  galois::do_all(\n      galois::iterate(size_t{0}, g.hedges),\n      [&](GNode n) {\n        int p1 = 0;\n        int p2 = 0;\n        for (auto x : g.edges(n)) {\n          auto cc  = g.getEdgeDst(x);\n          int part = g.getData(cc).getPart();\n          if (part == 0)\n            p1++;\n          else\n            p2++;\n          if (p1 > 1 && p2 > 1)\n            break;\n        }\n        if (!(p1 > 1 && p2 > 1) && (p1 + p2 > 1)) {\n          for (auto x : g.edges(n)) {\n            auto cc  = g.getEdgeDst(x);\n            int part = g.getData(cc).getPart();\n            int nodep;\n            if (part == 0)\n              nodep = p1;\n            else\n              nodep = p2;\n            if (nodep == 1) {\n              galois::atomicAdd(g.getData(cc).FS, 1);\n            }\n            if (nodep == (p1 + p2)) {\n              galois::atomicAdd(g.getData(cc).TE, 1);\n            }\n          }\n        }\n      },\n      galois::steal(), galois::loopname(\"initGains\"));\n}\n\nvoid unlock(GGraph& g) {\n  galois::do_all(\n      galois::iterate(g.hedges, g.size()),\n      [&](GNode n) { g.getData(n).counter = 0; }, galois::loopname(\"unlock\"));\n}\n\n__attribute__((unused)) void unlocked(GGraph& g) {\n  galois::do_all(\n      galois::iterate(g.hedges, g.size()),\n      [&](GNode n) { g.getData(n).setLocked(false); },\n      galois::loopname(\"unlocked\"));\n}\n// refine\nvoid parallel_refine_KF(GGraph& g, float, unsigned refineTo) {\n\n  // std::cout<<\"in parallel balance\\n\";\n  
typedef galois::gstl::Vector<unsigned> VecTy;\n  typedef galois::substrate::PerThreadStorage<VecTy> ThreadLocalData;\n  ThreadLocalData edgesThreadLocal;\n  std::string name = \"findZandO\";\n\n  // typedef galois::worklists::PerSocketChunkFIFO<8> Chunk;\n\n  galois::GAccumulator<unsigned int> accum;\n  galois::GAccumulator<unsigned int> nodeSize;\n  galois::do_all(\n      galois::iterate(g.hedges, g.size()),\n      [&](GNode n) {\n        nodeSize += g.getData(n).getWeight();\n        if (g.getData(n).getPart() > 0)\n          accum += g.getData(n).getWeight();\n      },\n      galois::loopname(\"make balance\"));\n  unsigned pass = 0;\n  // std::cout<<\"cut parallel \"<<calculate_cutsize(g)<<\"\\n\";\n  // initGain(g);\n  while (pass < refineTo) {\n    // T.start();\n    initGains(g, refineTo);\n    // T.stop();\n    // std::cout<<\"init gain time \"<<T.get()<<\" for round \"<<pass<<\"\\n\";\n    GNodeBag nodelistz;\n    GNodeBag nodelisto;\n    unsigned zeroW = 0;\n    unsigned oneW  = 0;\n    galois::do_all(\n        galois::iterate(g.hedges, g.size()),\n        [&](GNode n) {\n          if (g.getData(n).FS == 0 && g.getData(n).TE == 0)\n            return;\n          int gain = g.getData(n).getGain();\n          if (gain < 0) {\n            return;\n          }\n          unsigned pp = g.getData(n).getPart();\n          if (pp == 0) {\n            nodelistz.push(n);\n          } else {\n            nodelisto.push(n);\n          }\n        },\n        galois::loopname(\"findZandO\"));\n    zeroW = std::distance(nodelistz.begin(), nodelistz.end());\n    oneW  = std::distance(nodelisto.begin(), nodelisto.end());\n    GNodeBag bb;\n    std::vector<GNode> bbagz;\n    std::vector<GNode> bbago;\n    for (auto n : nodelistz)\n      bbagz.push_back(n);\n    for (auto n : nodelisto)\n      bbago.push_back(n);\n    std::sort(bbagz.begin(), bbagz.end(), [&g](GNode& lpw, GNode& rpw) {\n      if (g.getData(lpw).getGain() == g.getData(rpw).getGain())\n        return 
g.getData(lpw).nodeid < g.getData(rpw).nodeid;\n      return g.getData(lpw).getGain() > g.getData(rpw).getGain();\n    });\n    std::sort(bbago.begin(), bbago.end(), [&g](GNode& lpw, GNode& rpw) {\n      if (g.getData(lpw).getGain() == g.getData(rpw).getGain())\n        return g.getData(lpw).nodeid < g.getData(rpw).nodeid;\n      return g.getData(lpw).getGain() > g.getData(rpw).getGain();\n    });\n    if (zeroW <= oneW) {\n      for (unsigned i = 0; i < zeroW; i++) {\n        bb.push(bbago[i]);\n        bb.push(bbagz[i]);\n        //    if (i >= sqrt(Size)) break;\n      }\n      galois::do_all(\n          galois::iterate(bb),\n          [&](GNode n) {\n            if (g.getData(n).getPart() == 0)\n              g.getData(n).setPart(1);\n            else\n              g.getData(n).setPart(0);\n            g.getData(n).counter++;\n          },\n          galois::loopname(\"swap\"));\n    } else {\n      for (unsigned i = 0; i < oneW; i++) {\n        bb.push(bbago[i]);\n        bb.push(bbagz[i]);\n        //     if (i >= sqrt(Size)) break;\n      }\n      galois::do_all(\n          galois::iterate(bb),\n          [&](GNode n) {\n            if (g.getData(n).getPart() == 0)\n              g.getData(n).setPart(1);\n            else\n              g.getData(n).setPart(0);\n            g.getData(n).counter++;\n          },\n          galois::loopname(\"swap\"));\n    }\n    pass++;\n  }\n  unlock(g);\n}\n\nvoid parallel_make_balance(GGraph& g, float tol, int p) {\n\n  unsigned Size = g.hnodes;\n\n  galois::GAccumulator<unsigned int> accum;\n  galois::GAccumulator<unsigned int> nodeSize;\n  galois::do_all(\n      galois::iterate(g.hedges, g.size()),\n      [&](GNode n) {\n        nodeSize += g.getData(n).getWeight();\n        if (g.getData(n).getPart() > 0)\n          accum += g.getData(n).getWeight();\n      },\n      galois::loopname(\"make balance\"));\n\n  const int hi = (1 + tol) * nodeSize.reduce() / (2 + tol);\n  const int lo = nodeSize.reduce() - hi;\n  int bal  
    = accum.reduce();\n\n  while (1) {\n    if (bal >= lo && bal <= hi)\n      break;\n    initGains(g, p);\n\n    // creating buckets\n    std::array<std::vector<GNode>, 101> nodeListz;\n    std::array<std::vector<GNode>, 101> nodeListo;\n\n    std::array<GNodeBag, 101> nodelistz;\n    std::array<GNodeBag, 101> nodelisto;\n\n    // bucket for nodes with gan by weight ratio <= -9.0f\n    std::vector<GNode> nodeListzNegGain;\n    std::vector<GNode> nodeListoNegGain;\n\n    GNodeBag nodelistzNegGain;\n    GNodeBag nodelistoNegGain;\n\n    if (bal < lo) {\n\n      // placing each node in an appropriate bucket using the gain by weight\n      // ratio\n      galois::do_all(\n          galois::iterate(g.hedges, g.size()),\n          [&](GNode n) {\n            float gain = ((float)g.getData(n).getGain()) /\n                         ((float)g.getData(n).getWeight());\n            unsigned pp = g.getData(n).getPart();\n            if (pp == 0) {\n              // nodes with gain >= 1.0f are in one bucket\n              if (gain >= 1.0f) {\n                nodelistz[0].push(n);\n              } else if (gain >= 0.0f) {\n                int d   = gain * 10.0f;\n                int idx = 10 - d;\n                nodelistz[idx].push(n);\n              } else if (gain > -9.0f) {\n                int d   = gain * 10.0f - 1;\n                int idx = 10 - d;\n                nodelistz[idx].push(n);\n              } else { // NODES with gain by weight ratio <= -9.0f are in one\n                       // bucket\n                nodelistzNegGain.push(n);\n              }\n            }\n          },\n          galois::steal());\n\n      // sorting each bucket in parallel\n      galois::do_all(\n          galois::iterate(nodelistz),\n          [&](GNodeBag& b) {\n            if (b.begin() == b.end())\n              return;\n\n            GNode n    = *b.begin();\n            float gain = ((float)g.getData(n).getGain()) /\n                         ((float)g.getData(n).getWeight());\n 
           int idx;\n            if (gain >= 1.0f)\n              idx = 0;\n            else if (gain >= 0.0f) {\n              int d = gain * 10.0f;\n              idx   = 10 - d;\n            } else {\n              int d = gain * 10.0f - 1;\n              idx   = 10 - d;\n            }\n            for (auto x : b) {\n              nodeListz[idx].push_back(x);\n            }\n\n            std::sort(nodeListz[idx].begin(), nodeListz[idx].end(),\n                      [&g](GNode& lpw, GNode& rpw) {\n                        if (fabs((float)((g.getData(lpw).getGain()) *\n                                         (1.0f / g.getData(lpw).getWeight())) -\n                                 (float)((g.getData(rpw).getGain()) *\n                                         (1.0f / g.getData(rpw).getWeight()))) <\n                            0.00001f)\n                          return (float)g.getData(lpw).nodeid <\n                                 (float)g.getData(rpw).nodeid;\n                        return (float)((g.getData(lpw).getGain()) *\n                                       (1.0f / g.getData(lpw).getWeight())) >\n                               (float)((g.getData(rpw).getGain()) *\n                                       (1.0f / g.getData(rpw).getWeight()));\n                      });\n          },\n          galois::steal());\n\n      int i = 0;\n      int j = 0;\n\n      // now moving nodes from partition 0 to 1\n      while (j <= 100) {\n        if (nodeListz[j].size() == 0) {\n          j++;\n          continue;\n        }\n\n        for (auto zz : nodeListz[j]) {\n          g.getData(zz).setPart(1);\n          bal += g.getData(zz).getWeight();\n          if (bal >= lo)\n            break;\n          i++;\n          if (i > sqrt(Size))\n            break;\n        }\n        if (bal >= lo)\n          break;\n        if (i > sqrt(Size))\n          break;\n        j++;\n      }\n\n      if (bal >= lo)\n        break;\n      if (i > sqrt(Size))\n        continue;\n\n   
   // moving nodes from nodeListzNegGain\n      //\n      if (nodelistzNegGain.begin() == nodelistzNegGain.end())\n        continue;\n\n      for (auto x : nodelistzNegGain)\n        nodeListzNegGain.push_back(x);\n\n      std::sort(nodeListzNegGain.begin(), nodeListzNegGain.end(),\n                [&g](GNode& lpw, GNode& rpw) {\n                  if (fabs((float)((g.getData(lpw).getGain()) *\n                                   (1.0f / g.getData(lpw).getWeight())) -\n                           (float)((g.getData(rpw).getGain()) *\n                                   (1.0f / g.getData(rpw).getWeight()))) <\n                      0.00001f)\n                    return (float)g.getData(lpw).nodeid <\n                           (float)g.getData(rpw).nodeid;\n                  return (float)((g.getData(lpw).getGain()) *\n                                 (1.0f / g.getData(lpw).getWeight())) >\n                         (float)((g.getData(rpw).getGain()) *\n                                 (1.0f / g.getData(rpw).getWeight()));\n                });\n\n      for (auto zz : nodeListzNegGain) {\n        g.getData(zz).setPart(1);\n        bal += g.getData(zz).getWeight();\n        if (bal >= lo)\n          break;\n        i++;\n        if (i > sqrt(Size))\n          break;\n      }\n\n      if (bal >= lo)\n        break;\n\n    } // end if\n\n    else {\n\n      // placing each node in an appropriate bucket using the gain by weight\n      // ratio\n      galois::do_all(galois::iterate(g.hedges, g.size()), [&](GNode n) {\n        float gain =\n            ((float)g.getData(n).getGain()) / ((float)g.getData(n).getWeight());\n        unsigned pp = g.getData(n).getPart();\n        if (pp == 1) {\n          // nodes with gain >= 1.0f are in one bucket\n          if (gain >= 1.0f) {\n            nodelisto[0].push(n);\n          } else if (gain >= 0.0f) {\n            int d   = gain * 10.0f;\n            int idx = 10 - d;\n            nodelisto[idx].push(n);\n          } else if (gain > 
-9.0f) {\n            int d   = gain * 10.0f - 1;\n            int idx = 10 - d;\n            nodelisto[idx].push(n);\n          } else { // NODES with gain by weight ratio <= -9.0f are in one bucket\n            nodelistoNegGain.push(n);\n          }\n        }\n      });\n\n      // sorting each bucket in parallel\n      galois::do_all(galois::iterate(nodelisto), [&](GNodeBag& b) {\n        if (b.begin() == b.end())\n          return;\n\n        GNode n = *b.begin();\n        float gain =\n            ((float)g.getData(n).getGain()) / ((float)g.getData(n).getWeight());\n        int idx;\n        if (gain >= 1.0f)\n          idx = 0;\n        else if (gain >= 0.0f) {\n          int d = gain * 10.0f;\n          idx   = 10 - d;\n        } else {\n          int d = gain * 10.0f - 1;\n          idx   = 10 - d;\n        }\n        for (auto x : b) {\n          nodeListo[idx].push_back(x);\n        }\n\n        std::sort(nodeListo[idx].begin(), nodeListo[idx].end(),\n                  [&g](GNode& lpw, GNode& rpw) {\n                    if (fabs((float)((g.getData(lpw).getGain()) *\n                                     (1.0f / g.getData(lpw).getWeight())) -\n                             (float)((g.getData(rpw).getGain()) *\n                                     (1.0f / g.getData(rpw).getWeight()))) <\n                        0.00001f)\n                      return (float)g.getData(lpw).nodeid <\n                             (float)g.getData(rpw).nodeid;\n                    return (float)((g.getData(lpw).getGain()) *\n                                   (1.0f / g.getData(lpw).getWeight())) >\n                           (float)((g.getData(rpw).getGain()) *\n                                   (1.0f / g.getData(rpw).getWeight()));\n                  });\n      });\n\n      int i = 0;\n      int j = 0;\n\n      // now moving nodes from partition 1 to 0\n      while (j <= 100) {\n        if (nodeListo[j].size() == 0) {\n          j++;\n          continue;\n        }\n\n        
for (auto zz : nodeListo[j]) {\n          g.getData(zz).setPart(0);\n          bal -= g.getData(zz).getWeight();\n          if (bal <= hi)\n            break;\n          i++;\n          if (i > sqrt(Size))\n            break;\n        }\n        if (bal <= hi)\n          break;\n        if (i > sqrt(Size))\n          break;\n        j++;\n      }\n\n      if (bal <= hi)\n        break;\n      if (i > sqrt(Size))\n        continue;\n\n      // moving nodes from nodeListoNegGain\n      //\n      if (nodelistoNegGain.begin() == nodelistoNegGain.end())\n        continue;\n\n      for (auto x : nodelistoNegGain)\n        nodeListoNegGain.push_back(x);\n\n      std::sort(nodeListoNegGain.begin(), nodeListoNegGain.end(),\n                [&g](GNode& lpw, GNode& rpw) {\n                  if (fabs((float)((g.getData(lpw).getGain()) *\n                                   (1.0f / g.getData(lpw).getWeight())) -\n                           (float)((g.getData(rpw).getGain()) *\n                                   (1.0f / g.getData(rpw).getWeight()))) <\n                      0.00001f)\n                    return (float)g.getData(lpw).nodeid <\n                           (float)g.getData(rpw).nodeid;\n                  return (float)((g.getData(lpw).getGain()) *\n                                 (1.0f / g.getData(lpw).getWeight())) >\n                         (float)((g.getData(rpw).getGain()) *\n                                 (1.0f / g.getData(rpw).getWeight()));\n                });\n\n      for (auto zz : nodeListoNegGain) {\n        g.getData(zz).setPart(0);\n        bal -= g.getData(zz).getWeight();\n        if (bal <= hi)\n          break;\n        i++;\n        if (i > sqrt(Size))\n          break;\n      }\n\n      if (bal <= hi)\n        break;\n    } // end else\n\n  } // end while\n}\n\n} // namespace\n\nbool isPT(int n) {\n  if (n == 0)\n    return false;\n\n  return (ceil(log2(n)) == floor(log2(n)));\n}\n\nvoid refine(std::shared_ptr<MetisGraph> coarseGraph, unsigned 
K,\n            double imbalance) {\n  float ratio = 0.0f;\n  float tol   = 0.0f;\n  bool flag   = isPT(K);\n  if (flag) {\n    ratio = (50.0f + (double)imbalance) / (50.0f - (double)imbalance);\n    tol   = std::max(ratio, 1 - ratio) - 1;\n  } else {\n    ratio = ((float)((K + 1) / 2)) / ((float)(K / 2)); // change if needed\n    tol   = std::max(ratio, 1 - ratio) - 1;\n  }\n  do {\n    std::shared_ptr<MetisGraph> fineGraph = coarseGraph->getFinerGraph();\n    auto gg                               = coarseGraph->getGraph();\n\n    parallel_refine_KF(*gg, tol, 2);\n    parallel_make_balance(*gg, tol, 2);\n    bool do_pro = true;\n    if (fineGraph.get() && do_pro) {\n      projectPart(coarseGraph);\n    }\n  } while ((coarseGraph = coarseGraph->getFinerGraph()));\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/bipart/bipart.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"bipart.h\"\n#include \"galois/graphs/ReadGraph.h\"\n#include \"galois/Timer.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/LargeArray.h\"\n\n#include <vector>\n#include <set>\n#include <map>\n#include <iostream>\n#include <string.h>\n#include <stdlib.h>\n#include <numeric>\n#include <algorithm>\n#include <cmath>\n#include <fstream>\n#include <iostream>\n#include <array>\n#include <unordered_set>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"BIPART\";\nstatic const char* desc =\n    \"Partitions a hypergraph into K parts and minimizing the graph cut\";\nstatic const char* url = \"BiPart\";\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<scheduleMode> 
schedulingMode(\n    cll::desc(\"Choose a inital scheduling mode:\"),\n    cll::values(clEnumVal(PLD, \"PLD\"), clEnumVal(PP, \"PP\"), clEnumVal(WD, \"WD\"),\n                clEnumVal(RI, \"RI\"), clEnumVal(MRI, \"MRI\"),\n                clEnumVal(MDEG, \"MDEG\"), clEnumVal(DEG, \"DEG\"),\n                clEnumVal(MWD, \"MWD\"), clEnumVal(HIS, \"HIS\"),\n                clEnumVal(RAND, \"random\")),\n    cll::init(RAND));\n\nstatic cll::opt<bool>\n    mtxInput(\"mtxinput\",\n             cll::desc(\"Use text mtx files instead of binary galois gr files\"),\n             cll::init(false));\nstatic cll::opt<bool> weighted(\"weighted\", cll::desc(\"weighted\"),\n                               cll::init(false));\nstatic cll::opt<bool>\n    verbose(\"verbose\",\n            cll::desc(\"verbose output (debugging mode, takes extra time)\"),\n            cll::init(false));\nstatic cll::opt<std::string> outfile(\"outputFile\",\n                                     cll::desc(\"output partition file name\"));\nstatic cll::opt<std::string>\n    orderedfile(\"ordered\", cll::desc(\"output ordered graph file name\"));\nstatic cll::opt<std::string>\n    permutationfile(\"permutation\", cll::desc(\"output permutation file name\"));\nstatic cll::opt<unsigned> csize(cll::Positional,\n                                cll::desc(\"<size of coarsest graph>\"),\n                                cll::init(25));\n\nstatic cll::opt<unsigned> refiter(cll::Positional,\n                                  cll::desc(\"<number of iterations in ref>\"),\n                                  cll::init(2));\nstatic cll::opt<unsigned> numPartitions(cll::Positional,\n                                        cll::desc(\"<number of partitions>\"),\n                                        cll::init(2));\nstatic cll::opt<double> imbalance(\n    \"balance\",\n    cll::desc(\"Percentage deviated from mean partition size (default 5)\"),\n    cll::init(5.0));\n\n//! 
Flag that forces user to be aware that they should be passing in a\n//! hMetis graph.\nstatic cll::opt<bool>\n    hMetisGraph(\"hMetisGraph\",\n                cll::desc(\"Specify that the input graph is a hMetis\"),\n                cll::init(false));\n\nstatic cll::opt<bool>\n    output(\"output\", cll::desc(\"Specify if partitions need to be written\"),\n           cll::init(false));\ndouble Ctime = 0.0f;\ndouble Ptime = 0.0f;\ndouble Rtime = 0.0f;\n/**\n * Partitioning\n */\nvoid Partition(std::shared_ptr<MetisGraph> metisGraph, unsigned coarsenTo,\n               unsigned K) {\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n\n  galois::StatTimer T(\"CoarsenSEP\");\n  T.start();\n  std::shared_ptr<MetisGraph> mcg =\n      coarsen(metisGraph, coarsenTo, schedulingMode);\n  T.stop();\n\n  galois::StatTimer T2(\"PartitionSEP\");\n  T2.start();\n  partition(mcg, K);\n  T2.stop();\n\n  galois::StatTimer T3(\"Refine\");\n  T3.start();\n  refine(mcg, K, imbalance);\n  T3.stop();\n  Ctime += (T.get() / 1000.0f);\n  Ptime += (T2.get() / 1000.0f);\n  Rtime += (T3.get() / 1000.0f);\n\n  execTime.stop();\n}\n\nint computingCut(GGraph& g) {\n\n  GNodeBag bag;\n  galois::GAccumulator<unsigned> edgecut;\n  galois::do_all(\n      galois::iterate((size_t)0, g.hedges),\n      [&](GNode n) {\n        std::set<unsigned> nump;\n        for (auto cell : g.edges(n)) {\n          auto c   = g.getEdgeDst(cell);\n          int part = g.getData(c).getPart();\n          nump.insert(part);\n        }\n        edgecut += (nump.size() - 1);\n      },\n      galois::loopname(\"cutsize\"));\n  return edgecut.reduce();\n}\n\nint computingBalance(GGraph& g) {\n  int max = 0;\n  std::vector<int> parts(numPartitions, 0);\n  for (size_t c = g.hedges; c < g.size(); c++) {\n    unsigned pp = g.getData(c).getPart();\n    parts[pp]++;\n  }\n  for (unsigned i = 0; i < numPartitions; i++) {\n    if (parts[i] > max)\n      max = parts[i];\n  }\n  return max;\n}\n// 
printGraphBeg(*graph)\n\ntypedef galois::graphs::FileGraph FG;\ntypedef FG::GraphNode FN;\ntemplate <typename GNode, typename Weights>\nstruct order_by_degree {\n  GGraph& graph;\n  Weights& weights;\n  order_by_degree(GGraph& g, Weights& w) : graph(g), weights(w) {}\n  bool operator()(const GNode& a, const GNode& b) {\n    uint64_t wa = weights[a];\n    uint64_t wb = weights[b];\n    int pa      = graph.getData(a, galois::MethodFlag::UNPROTECTED).getPart();\n    int pb      = graph.getData(b, galois::MethodFlag::UNPROTECTED).getPart();\n    if (pa != pb) {\n      return pa < pb;\n    }\n    return wa < wb;\n  }\n};\n\ntypedef galois::substrate::PerThreadStorage<std::map<GNode, uint64_t>>\n    PerThreadDegInfo;\n\nstd::map<uint64_t, uint64_t>\ncellToNet(std::map<uint64_t, std::vector<uint64_t>> netToCell) {\n  std::map<uint64_t, uint64_t> celltonet;\n  for (auto n : netToCell) {\n    for (auto c : n.second) {\n      celltonet[c]++;\n    }\n  }\n  return celltonet;\n}\n\nint hash(unsigned val) {\n  unsigned long int seed = val * 1103515245 + 12345;\n  return ((unsigned)(seed / 65536) % 32768);\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (!hMetisGraph) {\n    GALOIS_DIE(\"This application requires a hMetis graph input;\"\n               \" please use the -hMetisGraph flag \"\n               \" to indicate the input is a hMetisGraph graph.\");\n  }\n\n  std::shared_ptr<MetisGraph> metisGraph(new MetisGraph());\n  GGraph& graph = *metisGraph->getGraph();\n  std::ifstream f(inputFile.c_str());\n  std::string line;\n  std::getline(f, line);\n  std::stringstream ss(line);\n  uint32_t i1;\n  uint64_t i2;\n  ss >> i1 >> i2;\n  uint32_t hedges = i1;\n  uint64_t nodes  = i2;\n  std::cout << \"hedges: \" << hedges << \"\\n\";\n  std::cout << \"nodes: \" << nodes << \"\\n\\n\";\n\n  galois::StatTimer 
T(\"buildingG\");\n  T.start();\n\n  galois::gstl::Vector<galois::PODResizeableArray<uint32_t>> edges_id(hedges +\n                                                                      nodes);\n  std::vector<std::vector<EdgeTy>> edges_data(hedges + nodes);\n  std::vector<uint64_t> prefix_edges(nodes + hedges);\n  uint32_t cnt   = 0;\n  uint32_t edges = 0;\n  while (std::getline(f, line)) {\n    if (cnt >= hedges) {\n      printf(\"ERROR: too many lines in input file\\n\");\n      exit(-1);\n    }\n    std::stringstream ss(line);\n    int val;\n    while (ss >> val) {\n      if ((val < 1) || (val > static_cast<long>(nodes))) {\n        printf(\"ERROR: node value %d out of bounds\\n\", val);\n        exit(-1);\n      }\n      unsigned newval = hedges + (val - 1);\n      edges_id[cnt].push_back(newval);\n      edges++;\n    }\n    cnt++;\n  }\n  f.close();\n  graph.hedges = hedges;\n  graph.hnodes = nodes;\n  std::cout << \"number of hedges \" << hedges << \"\\n\";\n  uint32_t sizes = hedges + nodes;\n\n  galois::do_all(galois::iterate(uint32_t{0}, sizes),\n                 [&](uint32_t c) { prefix_edges[c] = edges_id[c].size(); });\n\n  for (uint64_t c = 1; c < nodes + hedges; ++c) {\n    prefix_edges[c] += prefix_edges[c - 1];\n  }\n\n  graph.constructFrom(nodes + hedges, edges, prefix_edges, edges_id,\n                      edges_data);\n  galois::do_all(\n      galois::iterate(graph),\n      [&](GNode n) {\n        if (n < hedges)\n          graph.getData(n).netnum = n + 1;\n        else\n          graph.getData(n).netnum = INT_MAX;\n        graph.getData(n).netrand = INT_MAX;\n        graph.getData(n).netval  = INT_MAX;\n        graph.getData(n).nodeid  = n + 1;\n      },\n      galois::steal(), galois::loopname(\"build initial graph\"));\n  T.stop();\n  std::cout << \"time to build a graph \" << T.get() << \"\\n\";\n  graphStat(graph);\n  std::cout << \"\\n\";\n  galois::preAlloc(galois::runtime::numPagePoolAllocTotal() * 10);\n  
galois::reportPageAlloc(\"MeminfoPre\");\n  galois::do_all(\n      galois::iterate(graph.hedges, graph.size()),\n      [&](GNode item) {\n        // accum += g->getData(item).getWeight();\n        graph.getData(item, galois::MethodFlag::UNPROTECTED)\n            .initRefine(0, true);\n        graph.getData(item, galois::MethodFlag::UNPROTECTED).initPartition();\n      },\n      galois::steal(), galois::loopname(\"initPart\"));\n\n  Partition(metisGraph, csize, numPartitions);\n\n  const int k = numPartitions;\n  // calculating number of iterations/levels required\n  int num = log2(k);\n\n  int kValue[k];\n  for (int i = 0; i < k; i++)\n    kValue[i] = 0;\n\n  kValue[0]           = (k + 1) / 2;\n  kValue[(k + 1) / 2] = k / 2;\n\n  galois::do_all(\n      galois::iterate((uint64_t)graph.hedges, graph.size()),\n      [&](GNode n) {\n        unsigned pp = graph.getData(n).getPart();\n        if (pp == 1) {\n          graph.getData(n).setPart((k + 1) / 2);\n        }\n      },\n      galois::steal(), galois::loopname(\"set part (original graph)\"));\n\n  // running it level by level\n\n  // toProcess contains nodes to be executed in a given level\n  std::set<int> toProcess;\n  std::set<int> toProcessNew;\n  toProcess.insert(0);\n  toProcess.insert((k + 1) / 2);\n\n  std::vector<std::vector<GNode>> nodesvec(k);\n  // std::array<std::vector<GNode>, 100> hedgesvec;\n\n  for (int level = 0; level < num; level++) {\n\n    for (int i = 0; i < k; i++)\n      nodesvec[i].clear();\n\n    // distributing nodes in relevant vectors according to their current\n    // partition assignment\n    for (GNode n = graph.hedges; n < graph.size(); n++) {\n      unsigned pp = graph.getData(n).getPart();\n      nodesvec[pp].push_back(n);\n    }\n\n    std::vector<std::vector<GNode>> hedgevec(k);\n\n    // distribute hyperedges according to their current partition\n    galois::do_all(\n        galois::iterate((uint64_t)0, graph.hedges),\n        [&](GNode h) {\n          auto edge = 
*(graph.edges(h).begin());\n          auto dst  = graph.getEdgeDst(edge);\n          auto ii   = graph.getData(dst).getPart();\n\n          bool flag = true;\n\n          for (auto n : graph.edges(h)) {\n            auto part = graph.getData(graph.getEdgeDst(n)).getPart();\n\n            if (part != ii) {\n              flag = false;\n              break;\n            }\n          }\n\n          if (flag)\n            graph.getData(h).setPart(ii);\n          else\n            graph.getData(h).setPart(100000);\n        },\n        galois::steal(), galois::loopname(\"distribute hedges\"));\n\n    for (GNode h = 0; h < graph.hedges; h++) {\n      unsigned part = graph.getData(h).getPart();\n      if (part != 100000)\n        hedgevec[part].push_back(h);\n    }\n\n    // calling Partition for each partition number\n    for (unsigned i : toProcess) {\n      if (kValue[i] > 1) {\n        std::shared_ptr<MetisGraph> metisG;\n        GGraph& gr = *metisG->getGraph();\n\n        unsigned ed = 0;\n\n        for (auto h : hedgevec[i])\n          graph.getData(h).index = ed++;\n\n        unsigned id = ed;\n        for (auto n : nodesvec[i]) {\n          graph.getData(n).index = id++;\n        }\n\n        unsigned totalnodes = id;\n        galois::gstl::Vector<galois::PODResizeableArray<uint32_t>> edges_ids(\n            totalnodes);\n        std::vector<std::vector<EdgeTy>> edge_data(totalnodes);\n        std::vector<uint64_t> pre_edges(totalnodes);\n        unsigned edges = 0;\n\n        galois::do_all(\n            galois::iterate(hedgevec[i]),\n            [&](GNode h) {\n              for (auto v : graph.edges(h)) {\n                auto vv = graph.getEdgeDst(v);\n\n                uint32_t newid = graph.getData(h).index;\n                unsigned nm    = graph.getData(vv).index;\n                edges_ids[newid].push_back(nm);\n              }\n            },\n            galois::steal(), galois::loopname(\"populate edge ids\"));\n\n        uint64_t num_edges_acc = 0;\n  
      // galois::do_all(\n        //  galois::iterate(uint32_t{0}, totalnodes),\n        for (uint32_t c = 0; c < totalnodes; c++) {\n          pre_edges[c] = edges_ids[c].size();\n          num_edges_acc += pre_edges[c];\n        }\n        // galois::steal(), galois::loopname(\"set pre edges\"));\n\n        edges = num_edges_acc;\n\n        for (uint64_t c = 1; c < totalnodes; ++c) {\n          pre_edges[c] += pre_edges[c - 1];\n        }\n        gr.constructFrom(totalnodes, edges, pre_edges, edges_ids, edge_data);\n\n        gr.hedges = ed;\n        gr.hnodes = id - ed;\n\n        galois::do_all(\n            galois::iterate(gr),\n            [&](GNode n) {\n              if (n < gr.hedges)\n                gr.getData(n).netnum = n + 1;\n              else\n                gr.getData(n).netnum = INT_MAX;\n              gr.getData(n).netrand = INT_MAX;\n              gr.getData(n).netval  = INT_MAX;\n              gr.getData(n).nodeid  = n + 1;\n            },\n            galois::steal(), galois::loopname(\"build graph: recursion level\"));\n\n        Partition(metisG, csize, kValue[i]);\n\n        std::shared_ptr<MetisGraph> mcg = metisG;\n\n        int tmp                   = kValue[i];\n        kValue[i]                 = (tmp + 1) / 2;\n        kValue[i + (tmp + 1) / 2] = (tmp) / 2;\n        toProcessNew.insert(i);\n        toProcessNew.insert(i + (tmp + 1) / 2);\n\n        galois::do_all(\n            galois::iterate(nodesvec[i]),\n            [&](GNode v) {\n              GNode n     = graph.getData(v).index;\n              unsigned pp = gr.getData(n).getPart();\n              if (pp == 0) {\n                graph.getData(v).setPart(i);\n              } else if (pp == 1) {\n                graph.getData(v).setPart(i + (tmp + 1) / 2);\n              }\n            },\n            galois::steal(),\n            galois::loopname(\"set part: inside recursive call\"));\n\n      } // end if\n    }   // end for\n\n    toProcess = toProcessNew;\n    
toProcessNew.clear();\n  } // end while\n  std::cout << \"Coarsening time(s):,\" << Ctime << \"\\n\";\n  std::cout << \"Partitiong time(s):,\" << Ptime << \"\\n\";\n  std::cout << \"Refinement time(s):,\" << Rtime << \"\\n\";\n  std::cout << \"\\n\";\n  std::cout << \"Edge Cut,\" << computingCut(graph) << \"\\n\\n\";\n\n  galois::runtime::reportStat_Single(\"BiPart\", \"Edge Cut\", computingCut(graph));\n  // galois::runtime::reportStat_Single(\"BiPart\", \"zero-one\",\n  //                                   computingBalance(graph));\n\n  totalTime.stop();\n  if (output) {\n\n    std::vector<std::vector<uint64_t>> parts(numPartitions);\n\n    for (GNode n = graph.hedges; n < graph.size(); n++) {\n      unsigned p = graph.getData(n).getPart();\n      parts[p].push_back(n - graph.hedges + 1);\n    }\n\n    std::ofstream outputFile(outfile.c_str());\n\n    for (unsigned i = 0; i < numPartitions; i++) {\n      outputFile << i + 1 << \" \";\n      for (auto v : parts[i])\n        outputFile << v << \" \";\n      outputFile << \"\\n\";\n    }\n    outputFile.close();\n  }\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/bipart/bipart.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef BIPART_H_\n#define BIPART_H_\n\n#include \"galois/graphs/LC_CSR_Graph.h\"\n#include \"galois/AtomicWrapper.h\"\n\nclass MetisNode;\ntypedef uint32_t EdgeTy;\n\nstruct GGraph\n    : public galois::graphs::LC_CSR_Graph<MetisNode, EdgeTy>::with_no_lockable<\n          true>::type::with_numa_alloc<true>::type {\n  // false>::type::with_numa_alloc<true>::type {\n  size_t hedges;\n  size_t hnodes;\n};\n\nusing GNode    = GGraph::GraphNode;\nusing GNodeBag = galois::InsertBag<GNode>;\n\nconstexpr galois::MethodFlag flag_no_lock = galois::MethodFlag::UNPROTECTED;\n// algorithms\nenum scheduleMode { PLD, WD, RI, PP, MRI, MWD, DEG, MDEG, HIS, RAND };\n\nenum coarseModeII { HMETISII, PAIRII };\nenum pairScheduleModeII { FIRSTII, MAXWII, ECII };\n// Nodes in the metis graph\nclass MetisNode {\n\n  struct coarsenData {\n   
 int matched : 1;\n    int failedmatch : 1;\n    GNode parent;\n  };\n  struct refineData {\n    unsigned partition;\n    unsigned oldPartition;\n    bool maybeBoundary;\n  };\n  struct partitionData {\n    bool locked;\n  };\n\n  partitionData pd;\n\n  void initCoarsen() {\n    data.cd.matched     = false;\n    data.cd.failedmatch = false;\n    data.cd.parent      = 0;\n    netval              = 0;\n  }\n\npublic:\n  // bool flag;\n  unsigned counter;\n  int nodeid;\n  galois::CopyableAtomic<int> FS;\n  galois::CopyableAtomic<int> TE;\n  galois::CopyableAtomic<int> netnum;\n  galois::CopyableAtomic<int> netrand;\n  galois::CopyableAtomic<int> netval;\n  galois::CopyableAtomic<int> degree;\n  /*std::atomic<int> FS;\n    std::atomic<int> TE;\n    std::atomic<int> netnum;\n    std::atomic<int> netrand;\n    std::atomic<int> netval;\n    std::atomic<int> degree;\n*/ uint32_t index;\n  bool notAlone;\n\n  void initPartition() { pd.locked = false; }\n\n  // int num;\n  explicit MetisNode(int weight) : _weight(weight) {\n    initCoarsen();\n    initPartition();\n    counter           = 0;\n    data.rd.partition = 0;\n  }\n\n  MetisNode(unsigned weight, GNode child0, GNode child1 = 0) : _weight(weight) {\n    initCoarsen();\n    initPartition();\n    children[0]       = child0;\n    children[1]       = child1;\n    counter           = 0;\n    data.rd.partition = 0;\n  }\n\n  MetisNode() : _weight(1) {\n    initCoarsen();\n    initPartition();\n    counter           = 0;\n    data.rd.partition = 0;\n    data.cd.matched   = false;\n  }\n\n  // call to switch data to refining\n  void initRefine(unsigned part = 0, bool bound = false) {\n    refineData rd = {part, part, bound};\n    data.rd       = rd;\n    counter       = 0;\n  }\n\n  int getWeight() const { return _weight; }\n  void setWeight(int weight) { _weight = weight; }\n\n  void setParent(GNode p) { data.cd.parent = p; }\n  GNode getParent() const {\n    assert(data.cd.parent);\n    return data.cd.parent;\n  }\n  int 
getGain() { return FS - (TE + counter); }\n\n  void setMatched() { data.cd.matched = true; }\n  void notMatched() { data.cd.matched = false; }\n  bool isMatched() const { return data.cd.matched; }\n\n  void setFailedMatch() { data.cd.failedmatch = true; }\n  bool isFailedMatch() const { return data.cd.failedmatch; }\n\n  GNode getChild(unsigned x) const { return children[x]; }\n  void setChild(GNode c) { children.push_back(c); }\n  unsigned numChildren() const { return children.size(); }\n\n  unsigned getPart() const { return data.rd.partition; }\n  void setPart(unsigned val) { data.rd.partition = val; }\n\n  int getOldPart() const { return data.rd.oldPartition; }\n  void OldPartCpyNew() { data.rd.oldPartition = data.rd.partition; }\n\n  bool getmaybeBoundary() const { return data.rd.maybeBoundary; }\n  void setmaybeBoundary(bool val) { data.rd.maybeBoundary = val; }\n\n  void setLocked(bool locked) { pd.locked = locked; }\n  bool isLocked() { return pd.locked; }\n\nprivate:\n  union {\n    coarsenData cd;\n    refineData rd;\n  } data;\n\n  std::vector<GNode> children;\n  unsigned _weight;\n};\n\n// Structure to keep track of graph hirarchy\nclass MetisGraph : public std::enable_shared_from_this<MetisGraph> {\n  MetisGraph* coarser;\n  std::shared_ptr<MetisGraph> finer;\n\n  GGraph graph;\n\npublic:\n  MetisGraph() : coarser(0) {}\n\n  explicit MetisGraph(std::shared_ptr<MetisGraph> finerGraph)\n      : coarser(0), finer(finerGraph) {\n    finer->coarser = this;\n  }\n\n  const GGraph* getGraph() const { return &graph; }\n  GGraph* getGraph() { return &graph; }\n  std::shared_ptr<MetisGraph> getFinerGraph() const { return finer; }\n  MetisGraph* getCoarserGraph() const { return coarser; }\n\n  // unsigned getNumNodes() { return std::distance(graph.cellList().begin(),\n  // graph.cellList().end()); }\n\n  unsigned getTotalWeight() {\n    std::shared_ptr<MetisGraph> f = shared_from_this();\n    while (f->finer)\n      f = f->finer;\n    // return 
std::distance(f->graph.cellList().begin(),\n    // f->graph.cellList().end());\n    return 0;\n  }\n};\n\n// Metrics\nunsigned graphStat(GGraph& graph);\n// Coarsening\nstd::shared_ptr<MetisGraph> coarsen(std::shared_ptr<MetisGraph> fineMetisGraph,\n                                    unsigned coarsenTo, scheduleMode sMode);\n\n// Partitioning\nvoid partition(std::shared_ptr<MetisGraph>, unsigned);\n// Refinement\nvoid refine(std::shared_ptr<MetisGraph> coarseGraph, unsigned K,\n            double imbalance);\n\n#endif\n"
  },
  {
    "path": "lonestar/analytics/cpu/clustering/CMakeLists.txt",
    "content": "add_executable(louvain-clustering-cpu louvainClustering.cpp)\nadd_dependencies(apps louvain-clustering-cpu)\ntarget_link_libraries(louvain-clustering-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS louvain-clustering-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small1 louvain-clustering-cpu -symmetricGraph \"${BASEINPUT}/scalefree/symmetric/rmat10.sgr\")\n\nadd_executable(leiden-clustering-cpu leidenClustering.cpp)\nadd_dependencies(apps leiden-clustering-cpu)\ntarget_link_libraries(leiden-clustering-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS leiden-clustering-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small1 leiden-clustering-cpu -symmetricGraph \"${BASEINPUT}/scalefree/symmetric/rmat10.sgr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/clustering/README.md",
    "content": "Clustering\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis directory contains hierarchical community detection algorithms, that\nrecursively merge the communities into a single node and perform clustering on the \ncoarsened graph until nodes stop changing communities.\n\nThe two algorithms are following:\n\n* Louvain Clustering: This algorithm uses the modularity function to find\n  well-connected communities by maximizing the modularity score, which\n  quantifies the quality of node assignments to the communities based on the\n  density of connections.\n* Leiden Clustering: This is a variant of the Louvain clustering algorithm with\n  the modified coarsening phase that allows nodes to switch communities even\n  after coarsening. This is shown to improve clustering quality with little\n  extra computation.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric Galois .gr graphs.\nYou must specify the -symmetricGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/clustering; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n-`$ ./louvain-clustering-cpu <path-to-graph> -t 40 -c_threshold=0.01 -threshold=0.000001 -max_iter 1000 -algo=Foreach  -resolution=0.001 -symmetricGraph`\n\n-`$ ./leiden-clustering-cpu <path-to-graph> -t 40 -c_threshold=0.01 -threshold=0.000001 -max_iter 1000 -algo=Foreach  -resolution=0.001 -symmetricGraph`\n"
  },
  {
    "path": "lonestar/analytics/cpu/clustering/clustering.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef CLUSTERING_H\n#define CLUSTERING_H\n\n#include \"galois/Galois.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/LargeArray.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <random>\n#include <fstream>\n\nnamespace cll = llvm::cl;\nstatic cll::opt<bool>\n    enable_VF(\"enable_VF\",\n              cll::desc(\"Flag to enable vertex following optimization.\"),\n              cll::init(false));\n\nstatic cll::opt<double> c_threshold(\"c_threshold\",\n                                    cll::desc(\"Threshold for modularity gain\"),\n                                    cll::init(0.01));\n\nstatic cll::opt<double>\n    threshold(\"threshold\", cll::desc(\"Total threshold for modularity gain\"),\n              cll::init(0.01));\n\nstatic cll::opt<uint32_t>\n    max_iter(\"max_iter\", 
cll::desc(\"Maximum number of iterations to execute\"),\n             cll::init(10));\n\nstatic cll::opt<bool>\n    output_CID(\"output_CID\", cll::desc(\"Flag to enable cluster ID printing.\"),\n               cll::init(false));\n\nstatic cll::opt<std::string>\n    output_CID_filename(\"output_CID_filename\",\n                        cll::desc(\"File name to output cluster IDs.\"),\n                        cll::init(\"output_CID_filename\"));\n\nstatic cll::opt<double>\n    resolution(\"resolution\", cll::desc(\"Resolution for CPM quality function.\"),\n               cll::init(1.0));\n\nstatic cll::opt<double>\n    randomness(\"randomness\",\n               cll::desc(\"Randomness factor for refining clusters in Leiden.\"),\n               cll::init(0.01));\n\nstatic cll::opt<uint32_t>\n    min_graph_size(\"min_graph_size\", cll::desc(\"Minimum coarsened graph size\"),\n                   cll::init(100));\n\n/*\n * Typedefs\n */\nconstexpr static const uint64_t INF_VAL =\n    std::numeric_limits<uint64_t>::max() / 2 - 1;\nconstexpr static const uint64_t UNASSIGNED =\n    std::numeric_limits<uint64_t>::max();\nconstexpr static const double DOUBLE_MAX =\n    std::numeric_limits<double>::max() / 4;\n\nconstexpr galois::MethodFlag flag_no_lock    = galois::MethodFlag::UNPROTECTED;\nconstexpr galois::MethodFlag flag_read_lock  = galois::MethodFlag::READ;\nconstexpr galois::MethodFlag flag_write_lock = galois::MethodFlag::WRITE;\n\ntypedef galois::LargeArray<uint64_t> largeArray;\ntypedef float EdgeTy;\n// typedef uint32_t EdgeTy;\ntypedef galois::LargeArray<EdgeTy> largeArrayEdgeTy;\n\ntemplate <typename GraphTy>\nvoid printGraphCharateristics(GraphTy& graph) {\n\n  galois::gPrint(\"/******************************************/\\n\");\n  galois::gPrint(\"/************ Graph Properties ************/\\n\");\n  galois::gPrint(\"/******************************************/\\n\");\n  galois::gPrint(\"Number of Nodes: \", graph.size(), \"\\n\");\n  galois::gPrint(\"Number of 
Edges: \", graph.sizeEdges(), \"\\n\");\n}\n\n/**\n * Algorithm to find the best cluster for the node\n * to move to among its neighbors.\n */\ntemplate <typename GraphTy>\nvoid findNeighboringClusters(GraphTy& graph, typename GraphTy::GraphNode& n,\n                             std::map<uint64_t, uint64_t>& cluster_local_map,\n                             std::vector<EdgeTy>& counter,\n                             EdgeTy& self_loop_wt) {\n  using GNode = typename GraphTy::GraphNode;\n  for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n    graph.getData(graph.getEdgeDst(ii), flag_write_lock);\n  }\n\n  uint64_t num_unique_clusters = 0;\n  /**\n   * Add the node's current cluster to be considered\n   * for movement as well\n   */\n  cluster_local_map[graph.getData(n).curr_comm_ass] =\n      0;                // Add n's current cluster\n  counter.push_back(0); // Initialize the counter to zero (no edges incident\n                        // yet)\n  num_unique_clusters++;\n\n  // Assuming we have grabbed lock on all the neighbors\n  for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n    GNode dst = graph.getEdgeDst(ii);\n    auto edge_wt =\n        graph.getEdgeData(ii, flag_no_lock); // Self loop weights is recorded\n    if (dst == n) {\n      self_loop_wt += edge_wt; // Self loop weights is recorded\n    }\n    auto stored_already = cluster_local_map.find(\n        graph.getData(dst).curr_comm_ass); // Check if it already exists\n    if (stored_already != cluster_local_map.end()) {\n      counter[stored_already->second] += edge_wt;\n    } else {\n      cluster_local_map[graph.getData(dst).curr_comm_ass] = num_unique_clusters;\n      counter.push_back(edge_wt);\n      num_unique_clusters++;\n    }\n  } // End edge loop\n  return;\n}\ntemplate <typename GraphTy>\nuint64_t vertexFollowing(GraphTy& graph) {\n  using GNode = typename GraphTy::GraphNode;\n  // Initialize each node to its own cluster\n  
galois::do_all(galois::iterate(graph),\n                 [&graph](GNode n) { graph.getData(n).curr_comm_ass = n; });\n\n  // Remove isolated and degree-one nodes\n  galois::GAccumulator<uint64_t> isolatedNodes;\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    auto& n_data = graph.getData(n);\n    uint64_t degree =\n        std::distance(graph.edge_begin(n, galois::MethodFlag::UNPROTECTED),\n                      graph.edge_end(n, galois::MethodFlag::UNPROTECTED));\n    if (degree == 0) {\n      isolatedNodes += 1;\n      n_data.curr_comm_ass = UNASSIGNED;\n    } else {\n      if (degree == 1) {\n        // Check if the destination has degree greater than one\n        auto dst = graph.getEdgeDst(\n            graph.edge_end(n, galois::MethodFlag::UNPROTECTED));\n        uint64_t dst_degree = std::distance(\n            graph.edge_begin(dst, galois::MethodFlag::UNPROTECTED),\n            graph.edge_end(dst, galois::MethodFlag::UNPROTECTED));\n        if ((dst_degree > 1 || (n > dst))) {\n          isolatedNodes += 1;\n          n_data.curr_comm_ass = graph.getData(dst).curr_comm_ass;\n        }\n      }\n    }\n  });\n  // The number of isolated nodes that can be removed\n  return isolatedNodes.reduce();\n}\n\ntemplate <typename GraphTy, typename CommArrayTy>\nvoid sumVertexDegreeWeight(GraphTy& graph, CommArrayTy& c_info) {\n  using GNode = typename GraphTy::GraphNode;\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    EdgeTy total_weight = 0;\n    auto& n_data        = graph.getData(n);\n    for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n      total_weight += graph.getEdgeData(ii, flag_no_lock);\n    }\n    n_data.degree_wt    = total_weight;\n    c_info[n].degree_wt = total_weight;\n    c_info[n].size      = 1;\n  });\n}\ntemplate <typename GraphTy, typename CommArrayTy>\nvoid sumVertexDegreeWeightWithNodeWeight(GraphTy& graph, CommArrayTy& c_info) {\n  using GNode = typename GraphTy::GraphNode;\n  
galois::do_all(galois::iterate(graph), [&](GNode n) {\n    EdgeTy total_weight = 0;\n    auto& n_data        = graph.getData(n);\n    for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n      total_weight += graph.getEdgeData(ii, flag_no_lock);\n    }\n    n_data.degree_wt    = total_weight;\n    c_info[n].degree_wt = total_weight;\n    c_info[n].size      = 1;\n    c_info[n].node_wt.store(n_data.node_wt);\n  });\n}\n\ntemplate <typename GraphTy, typename CommArrayTy>\nvoid sumClusterWeight(GraphTy& graph, CommArrayTy& c_info) {\n  using GNode = typename GraphTy::GraphNode;\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    EdgeTy total_weight = 0;\n    auto& n_data        = graph.getData(n);\n    for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n      total_weight += graph.getEdgeData(ii, flag_no_lock);\n    }\n    n_data.degree_wt    = total_weight;\n    c_info[n].degree_wt = 0;\n  });\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    auto& n_data = graph.getData(n);\n    if (n_data.curr_comm_ass != UNASSIGNED)\n      galois::atomicAdd(c_info[n_data.curr_comm_ass].degree_wt,\n                        n_data.degree_wt);\n  });\n}\n\ntemplate <typename GraphTy>\ndouble calConstantForSecondTerm(GraphTy& graph) {\n  using GNode = typename GraphTy::GraphNode;\n  /**\n   * Using double to avoid overflow\n   */\n  galois::GAccumulator<double> local_weight;\n  galois::do_all(galois::iterate(graph), [&graph, &local_weight](GNode n) {\n    local_weight += graph.getData(n).degree_wt;\n  });\n  /* This is twice since graph is symmetric */\n  double total_edge_weight_twice = local_weight.reduce();\n  return 1 / total_edge_weight_twice;\n}\n\ntemplate <typename GraphTy, typename CommArrayTy>\nuint64_t maxCPMQuality(std::map<uint64_t, uint64_t>& cluster_local_map,\n                       std::vector<EdgeTy>& counter, EdgeTy self_loop_wt,\n                       CommArrayTy& c_info, uint64_t node_wt, uint64_t sc) 
{\n\n  uint64_t max_index = sc; // Assign the initial value as self community\n  double cur_gain    = 0;\n  double max_gain    = 0;\n  double eix         = counter[0] - self_loop_wt;\n  double eiy         = 0;\n  double size_x      = (double)(c_info[sc].node_wt - node_wt);\n  double size_y      = 0;\n\n  auto stored_already = cluster_local_map.begin();\n  do {\n    if (sc != stored_already->first) {\n\n      eiy =\n          counter[stored_already->second]; // Total edges incident on cluster y\n      size_y = c_info[stored_already->first].node_wt;\n\n      cur_gain = 2.0f * (double)(eiy - eix) -\n                 resolution * node_wt * (double)(size_y - size_x);\n      if ((cur_gain > max_gain) || ((cur_gain == max_gain) && (cur_gain != 0) &&\n                                    (stored_already->first < max_index))) {\n        max_gain  = cur_gain;\n        max_index = stored_already->first;\n      }\n    }\n    stored_already++; // Explore next cluster\n  } while (stored_already != cluster_local_map.end());\n\n  if ((c_info[max_index].size == 1 && c_info[sc].size == 1 && max_index > sc)) {\n    max_index = sc;\n  }\n  assert(max_gain >= 0);\n  return max_index;\n}\n\ntemplate <typename GraphTy, typename CommArrayTy>\ndouble calCPMQuality(GraphTy& graph, CommArrayTy& c_info, double& e_xx,\n                     double& a2_x, double& constant_for_second_term) {\n\n  using GNode = typename GraphTy::GraphNode;\n  /* Variables needed for Modularity calculation */\n  double mod = -1;\n\n  std::cout << \"graph size: \" << graph.size() << \"\\n\";\n  largeArrayEdgeTy cluster_wt_internal;\n\n  /*** Initialization ***/\n  cluster_wt_internal.allocateBlocked(graph.size());\n\n  /* Calculate the overall modularity */\n  galois::GAccumulator<double> acc_e_xx;\n  galois::GAccumulator<double> acc_a2_x;\n\n  galois::do_all(galois::iterate(graph),\n                 [&](GNode n) { cluster_wt_internal[n] = 0; });\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    auto 
n_data = graph.getData(n);\n    for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n      if (graph.getData(graph.getEdgeDst(ii)).curr_comm_ass ==\n          n_data.curr_comm_ass) {\n        cluster_wt_internal[n] += graph.getEdgeData(ii);\n      }\n    }\n  });\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    acc_e_xx += cluster_wt_internal[n];\n    acc_a2_x +=\n        (double)(c_info[n].node_wt) * ((double)(c_info[n].node_wt - 1) * 0.5f);\n    // acc_a2_x += (double) (c_info[n].node_wt) * ((double) (c_info[n].node_wt)\n    // * resolution);\n  });\n\n  e_xx = acc_e_xx.reduce();\n  a2_x = acc_a2_x.reduce();\n  mod  = (e_xx - a2_x) * (double)constant_for_second_term;\n\n  return mod;\n}\n\ntemplate <typename CommArrayTy>\nuint64_t maxModularity(std::map<uint64_t, uint64_t>& cluster_local_map,\n                       std::vector<EdgeTy>& counter, EdgeTy self_loop_wt,\n                       CommArrayTy& c_info, EdgeTy degree_wt, uint64_t sc,\n                       double constant) {\n\n  uint64_t max_index = sc; // Assign the intial value as self community\n  double cur_gain    = 0;\n  double max_gain    = 0;\n  double eix         = counter[0] - self_loop_wt;\n  double ax          = c_info[sc].degree_wt - degree_wt;\n  double eiy         = 0;\n  double ay          = 0;\n\n  auto stored_already = cluster_local_map.begin();\n  do {\n    if (sc != stored_already->first) {\n      ay = c_info[stored_already->first].degree_wt; // Degree wt of cluster y\n      eiy =\n          counter[stored_already->second]; // Total edges incident on cluster y\n      cur_gain = 2 * constant * (eiy - eix) +\n                 2 * degree_wt * ((ax - ay) * constant * constant);\n\n      if ((cur_gain > max_gain) || ((cur_gain == max_gain) && (cur_gain != 0) &&\n                                    (stored_already->first < max_index))) {\n        max_gain  = cur_gain;\n        max_index = stored_already->first;\n      }\n    }\n    stored_already++; // 
Explore next cluster\n  } while (stored_already != cluster_local_map.end());\n\n  if ((c_info[max_index].size == 1 && c_info[sc].size == 1 && max_index > sc)) {\n    max_index = sc;\n  }\n\n  assert(max_gain >= 0);\n  return max_index;\n}\n\ntemplate <typename CommArrayTy>\nuint64_t\nmaxModularityWithoutSwaps(std::map<uint64_t, uint64_t>& cluster_local_map,\n                          std::vector<EdgeTy>& counter, uint64_t self_loop_wt,\n                          CommArrayTy& c_info, EdgeTy degree_wt, uint64_t sc,\n                          double constant) {\n\n  uint64_t max_index = sc; // Assign the intial value as self community\n  double cur_gain    = 0;\n  double max_gain    = 0;\n  double eix         = counter[0] - self_loop_wt;\n  double ax          = c_info[sc].degree_wt - degree_wt;\n  double eiy         = 0;\n  double ay          = 0;\n\n  auto stored_already = cluster_local_map.begin();\n  do {\n    if (sc != stored_already->first) {\n      ay = c_info[stored_already->first].degree_wt; // Degree wt of cluster y\n\n      if (ay < (ax + degree_wt)) {\n        stored_already++;\n        continue;\n      } else if (ay == (ax + degree_wt) && stored_already->first > sc) {\n        stored_already++;\n        continue;\n      }\n\n      eiy =\n          counter[stored_already->second]; // Total edges incident on cluster y\n      cur_gain = 2 * constant * (eiy - eix) +\n                 2 * degree_wt * ((ax - ay) * constant * constant);\n\n      if ((cur_gain > max_gain) || ((cur_gain == max_gain) && (cur_gain != 0) &&\n                                    (stored_already->first < max_index))) {\n        max_gain  = cur_gain;\n        max_index = stored_already->first;\n      }\n    }\n    stored_already++; // Explore next cluster\n  } while (stored_already != cluster_local_map.end());\n\n  if ((c_info[max_index].size == 1 && c_info[sc].size == 1 && max_index > sc)) {\n    max_index = sc;\n  }\n\n  assert(max_gain >= 0);\n  return max_index;\n}\n\ntemplate 
<typename GraphTy, typename CommArrayTy>\ndouble calModularityDelay(GraphTy& graph, CommArrayTy& c_info,\n                          CommArrayTy& c_update, double& e_xx, double& a2_x,\n                          double& constant_for_second_term,\n                          std::vector<uint64_t>& local_target) {\n  using GNode = typename GraphTy::GraphNode;\n  /* Variables needed for Modularity calculation */\n  double mod = -1;\n\n  largeArrayEdgeTy cluster_wt_internal;\n\n  /*** Initialization ***/\n  cluster_wt_internal.allocateBlocked(graph.size());\n\n  /* Calculate the overall modularity */\n  galois::GAccumulator<double> acc_e_xx;\n  galois::GAccumulator<double> acc_a2_x;\n\n  galois::do_all(galois::iterate(graph),\n                 [&](GNode n) { cluster_wt_internal[n] = 0; });\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n      if (local_target[graph.getEdgeDst(ii)] == local_target[n]) {\n        cluster_wt_internal[n] += graph.getEdgeData(ii);\n      }\n    }\n  });\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    acc_e_xx += cluster_wt_internal[n];\n    acc_a2_x += (double)(c_info[n].degree_wt + c_update[n].degree_wt) *\n                ((double)(c_info[n].degree_wt + c_update[n].degree_wt) *\n                 (double)constant_for_second_term);\n  });\n\n  e_xx = acc_e_xx.reduce();\n  a2_x = acc_a2_x.reduce();\n\n  mod = e_xx * (double)constant_for_second_term -\n        a2_x * (double)constant_for_second_term;\n  return mod;\n}\n\ntemplate <typename GraphTy, typename CommArrayTy>\ndouble calModularity(GraphTy& graph, CommArrayTy& c_info, double& e_xx,\n                     double& a2_x, double& constant_for_second_term) {\n  using GNode = typename GraphTy::GraphNode;\n  /* Variables needed for Modularity calculation */\n  double mod = -1;\n\n  largeArrayEdgeTy cluster_wt_internal;\n\n  /*** Initialization ***/\n  
cluster_wt_internal.allocateBlocked(graph.size());\n\n  /* Calculate the overall modularity */\n  galois::GAccumulator<double> acc_e_xx;\n  galois::GAccumulator<double> acc_a2_x;\n\n  galois::do_all(galois::iterate(graph),\n                 [&](GNode n) { cluster_wt_internal[n] = 0; });\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    auto n_data = graph.getData(n);\n    for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n      if (graph.getData(graph.getEdgeDst(ii)).curr_comm_ass ==\n          n_data.curr_comm_ass) {\n        cluster_wt_internal[n] += graph.getEdgeData(ii);\n      }\n    }\n  });\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    acc_e_xx += cluster_wt_internal[n];\n    acc_a2_x +=\n        (double)(c_info[n].degree_wt) *\n        ((double)(c_info[n].degree_wt) * (double)constant_for_second_term);\n  });\n\n  e_xx = acc_e_xx.reduce();\n  a2_x = acc_a2_x.reduce();\n\n  mod = e_xx * (double)constant_for_second_term -\n        a2_x * (double)constant_for_second_term;\n  return mod;\n}\n\n/*\n * To compute the final modularity using prev cluster\n * assignments.\n */\ntemplate <typename GraphTy, typename CommArrayTy>\ndouble calModularityFinal(GraphTy& graph) {\n  using GNode     = typename GraphTy::GraphNode;\n  using CommArray = CommArrayTy;\n\n  CommArray c_info;   // Community info\n  CommArray c_update; // Used for updating community\n\n  /* Variables needed for Modularity calculation */\n  double constant_for_second_term;\n  double mod = -1;\n\n  largeArrayEdgeTy cluster_wt_internal;\n\n  /*** Initialization ***/\n  c_info.allocateBlocked(graph.size());\n  c_update.allocateBlocked(graph.size());\n  cluster_wt_internal.allocateBlocked(graph.size());\n\n  /* Calculate the weighted degree sum for each vertex */\n  sumClusterWeight(graph, c_info);\n\n  /* Compute the total weight (2m) and 1/2m terms */\n  constant_for_second_term = calConstantForSecondTerm(graph);\n\n  /* Calculate the overall modularity 
*/\n  double e_xx = 0;\n  galois::GAccumulator<double> acc_e_xx;\n  double a2_x = 0;\n  galois::GAccumulator<double> acc_a2_x;\n\n  galois::do_all(galois::iterate(graph),\n                 [&](GNode n) { cluster_wt_internal[n] = 0; });\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    auto n_data = graph.getData(n);\n    for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n      if (graph.getData(graph.getEdgeDst(ii)).curr_comm_ass ==\n          n_data.curr_comm_ass) {\n        // if(graph.getData(graph.getEdgeDst(ii)).prev_comm_ass ==\n        // n_data.prev_comm_ass) {\n        cluster_wt_internal[n] += graph.getEdgeData(ii);\n      }\n    }\n  });\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    acc_e_xx += cluster_wt_internal[n];\n    acc_a2_x +=\n        (double)(c_info[n].degree_wt) *\n        ((double)(c_info[n].degree_wt) * (double)constant_for_second_term);\n  });\n\n  e_xx = acc_e_xx.reduce();\n  a2_x = acc_a2_x.reduce();\n\n  mod = e_xx * (double)constant_for_second_term -\n        a2_x * (double)constant_for_second_term;\n  return mod;\n}\ntemplate <typename GraphTy>\nuint64_t renumberClustersContiguously(GraphTy& graph) {\n  using GNode = typename GraphTy::GraphNode;\n  std::map<uint64_t, uint64_t> cluster_local_map;\n  uint64_t num_unique_clusters = 0;\n\n  for (GNode n = 0; n < graph.size(); ++n) {\n    auto& n_data = graph.getData(n, flag_no_lock);\n    if (n_data.curr_comm_ass != UNASSIGNED) {\n      assert(n_data.curr_comm_ass < graph.size());\n      auto stored_already = cluster_local_map.find(n_data.curr_comm_ass);\n      if (stored_already != cluster_local_map.end()) {\n        n_data.curr_comm_ass = stored_already->second;\n      } else {\n        cluster_local_map[n_data.curr_comm_ass] = num_unique_clusters;\n        n_data.curr_comm_ass                    = num_unique_clusters;\n        num_unique_clusters++;\n      }\n    }\n  }\n  return num_unique_clusters;\n}\n\ntemplate <typename 
GraphTy>\nuint64_t renumberClustersContiguouslySubcomm(GraphTy& graph) {\n\n  using GNode = typename GraphTy::GraphNode;\n  std::map<uint64_t, uint64_t> cluster_local_map;\n  uint64_t num_unique_clusters = 0;\n\n  for (GNode n = 0; n < graph.size(); ++n) {\n    auto& n_data = graph.getData(n, flag_no_lock);\n    assert(n_data.curr_subcomm_ass != UNASSIGNED);\n    assert(n_data.curr_subcomm_ass < graph.size());\n    auto stored_already = cluster_local_map.find(n_data.curr_subcomm_ass);\n    if (stored_already != cluster_local_map.end()) {\n      n_data.curr_subcomm_ass = stored_already->second;\n    } else {\n      cluster_local_map[n_data.curr_subcomm_ass] = num_unique_clusters;\n      n_data.curr_subcomm_ass                    = num_unique_clusters;\n      num_unique_clusters++;\n    }\n  }\n\n  return num_unique_clusters;\n}\n\ntemplate <typename GraphTy>\nuint64_t renumberClustersContiguouslyArray(largeArray& arr) {\n  using GNode = typename GraphTy::GraphNode;\n  std::map<uint64_t, uint64_t> cluster_local_map;\n  uint64_t num_unique_clusters = 0;\n\n  for (GNode n = 0; n < arr.size(); ++n) {\n    if (arr[n] != UNASSIGNED) {\n      assert(arr[n] < arr.size());\n      auto stored_already = cluster_local_map.find(arr[n]);\n      if (stored_already != cluster_local_map.end()) {\n        arr[n] = stored_already->second;\n      } else {\n        cluster_local_map[arr[n]] = num_unique_clusters;\n        arr[n]                    = num_unique_clusters;\n        num_unique_clusters++;\n      }\n    }\n  }\n  return num_unique_clusters;\n}\n\ntemplate <typename GraphTy>\nvoid printGraph(GraphTy& graph) {\n  using GNode = typename GraphTy::GraphNode;\n  for (GNode n = 0; n < graph.size(); ++n) {\n    for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n      galois::gPrint(n, \" --> \", graph.getEdgeDst(ii), \" , \",\n                     graph.getEdgeData(ii), \"\\n\");\n    }\n  }\n}\n\ntemplate <typename GraphTy>\nvoid printNodeClusterId(GraphTy& 
graph, std::string output_CID_filename) {\n  using GNode = typename GraphTy::GraphNode;\n  std::ofstream outputFile(output_CID_filename, std::ofstream::out);\n  for (GNode n = 0; n < graph.size(); ++n) {\n    outputFile << n << \"  \" << graph.getData(n).curr_comm_ass << \"\\n\";\n    // outputFile << graph.getData(n).curr_comm_ass << \"\\n\";\n  }\n}\n\ntemplate <typename GraphTy, typename CommArrayTy>\nvoid checkModularity(GraphTy& graph, largeArray& clusters_orig) {\n  using GNode = typename GraphTy::GraphNode;\n  galois::gPrint(\"checkModularity\\n\");\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    graph.getData(n, flag_no_lock).curr_comm_ass = clusters_orig[n];\n  });\n\n  uint64_t num_unique_clusters = renumberClustersContiguously(graph);\n  galois::gPrint(\"Number of unique clusters (renumber): \", num_unique_clusters,\n                 \"\\n\");\n  auto mod = calModularityFinal<GraphTy, CommArrayTy>(graph);\n  galois::gPrint(\"FINAL MOD: \", mod, \"\\n\");\n}\n\n/***********************************************\n ********** Leiden Routines ********************\n **********************************************/\nuint64_t generateRandonNumber(uint64_t min, uint64_t max) {\n  std::random_device dev;\n  std::mt19937 rng(dev());\n  std::uniform_int_distribution<std::mt19937::result_type> dist6(\n      min, max); // distribution in range [min, max]\n  return dist6(rng);\n}\n\nuint64_t generateRandonNumberDouble(double min, double max) {\n  std::random_device\n      rd; // Will be used to obtain a seed for the random number engine\n  std::mt19937 gen(rd()); // Standard mersenne_twister_engine seeded with rd()\n  std::uniform_real_distribution<> dis(min,\n                                       max); // distribution in range [min, max]\n  return dis(gen);\n}\n\ntemplate <typename CommArrayTy>\ndouble diffCPMQuality(uint64_t curr_subcomm, uint64_t candidate_subcomm,\n                      std::map<uint64_t, uint64_t>& cluster_local_map,\n               
       std::vector<EdgeTy>& counter, CommArrayTy& subcomm_info,\n                      EdgeTy self_loop_wt) {\n\n  uint64_t size_x = subcomm_info[curr_subcomm].node_wt;\n  uint64_t size_y = subcomm_info[candidate_subcomm].node_wt;\n\n  double diff =\n      (double)(counter[cluster_local_map[candidate_subcomm]] -\n               counter[cluster_local_map[curr_subcomm]] + self_loop_wt) +\n      resolution * 0.5f *\n          (double)((size_x * (size_x - 1) + size_y * (size_y - 1)) -\n                   ((size_x - 1) * (size_x - 2) + size_y * (size_y + 1)));\n\n  return diff;\n}\n\ntemplate <typename GraphTy, typename CommArrayTy>\nuint64_t getRandomSubcommunity(GraphTy& graph, uint64_t n,\n                               CommArrayTy& subcomm_info,\n                               uint64_t total_degree_wt,\n                               double constant_for_second_term) {\n  using GNode           = typename GraphTy::GraphNode;\n  uint64_t curr_subcomm = graph.getData(n).curr_subcomm_ass;\n\n  std::map<uint64_t, uint64_t>\n      cluster_local_map; // Map each neighbor's subcommunity to local number:\n                         // Subcommunity --> Index\n  std::vector<EdgeTy> counter; // Number of edges to each unique subcommunity\n  uint64_t num_unique_clusters = 1;\n\n  cluster_local_map[curr_subcomm] = 0; // Add n's current subcommunity\n  counter.push_back(0); // Initialize the counter to zero (no edges incident\n                        // yet)\n\n  EdgeTy self_loop_wt = 0;\n\n  for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n    GNode dst = graph.getEdgeDst(ii);\n    EdgeTy edge_wt =\n        graph.getEdgeData(ii, flag_no_lock); // Self loop weights is recorded\n\n    if (dst == n) {\n      self_loop_wt += edge_wt; // Self loop weights is recorded\n    }\n    auto stored_already = cluster_local_map.find(\n        graph.getData(dst).curr_subcomm_ass); // Check if it already exists\n    if (stored_already != cluster_local_map.end()) {\n      
counter[stored_already->second] += edge_wt;\n    } else {\n      cluster_local_map[graph.getData(dst).curr_subcomm_ass] =\n          num_unique_clusters;\n      counter.push_back(edge_wt);\n      num_unique_clusters++;\n    }\n  } // End edge loop\n\n  std::map<uint64_t, uint64_t> new_cluster_local_map;\n  std::vector<EdgeTy> new_counter;\n  num_unique_clusters = 0;\n  EdgeTy total        = 0;\n\n  for (auto pair : cluster_local_map) {\n    auto subcomm = pair.first;\n    if (curr_subcomm == subcomm)\n      continue;\n    uint64_t subcomm_degree_wt = subcomm_info[subcomm].degree_wt;\n\n    // check if subcommunity is well connected\n    if (subcomm_info[subcomm].internal_edge_wt <\n        constant_for_second_term * (double)subcomm_degree_wt *\n            ((double)total_degree_wt - (double)subcomm_degree_wt))\n      continue;\n    if (diffCPMQuality(curr_subcomm, subcomm, cluster_local_map, counter,\n                       subcomm_info, self_loop_wt) > 0) {\n      new_cluster_local_map[subcomm] = num_unique_clusters;\n      EdgeTy count                   = counter[cluster_local_map[subcomm]];\n      new_counter.push_back(count);\n      total += count;\n    }\n  }\n\n  // Pick max community size\n  uint64_t rand_idx = 1; // getRandomInt(0,total-1);\n\n  uint64_t idx = 0;\n  for (auto pair : new_cluster_local_map) {\n    if (new_counter[idx] > rand_idx)\n      return pair.first;\n    rand_idx = rand_idx - new_counter[idx];\n    idx++;\n  }\n\n  return UNASSIGNED;\n}\n\ntemplate <typename GraphTy, typename CommTy>\nuint64_t getRandomSubcommunity2(GraphTy& graph, typename GraphTy::GraphNode n,\n                                CommTy& subcomm_info, uint64_t total_degree_wt,\n                                uint64_t comm_id,\n                                double constant_for_second_term) {\n  using GNode  = typename GraphTy::GraphNode;\n  auto& n_data = graph.getData(n);\n  /*\n   * Remove the currently selected node from its current cluster.\n   * This causes the 
cluster to be empty.\n   */\n  subcomm_info[n_data.curr_subcomm_ass].node_wt          = 0;\n  subcomm_info[n_data.curr_subcomm_ass].internal_edge_wt = 0;\n\n  /*\n   * Map each neighbor's subcommunity to local number: Subcommunity --> Index\n   */\n  std::map<uint64_t, uint64_t> cluster_local_map;\n\n  /*\n   * Edges weight to each unique subcommunity\n   */\n  std::vector<EdgeTy> counter;\n  std::vector<uint64_t> neighboring_cluster_ids;\n\n  /*\n   * Identify the neighboring clusters of the currently selected\n   * node, that is, the clusters with which the currently\n   * selected node is connected. The old cluster of the currently\n   * selected node is also included in the set of neighboring\n   * clusters. In this way, it is always possible that the\n   * currently selected node will be moved back to its old\n   * cluster.\n   */\n  cluster_local_map[n_data.curr_subcomm_ass] = 0; // Add n's current\n                                                  // subcommunity\n  counter.push_back(0); // Initialize the counter to zero (no edges incident\n                        // yet)\n  neighboring_cluster_ids.push_back(n_data.curr_subcomm_ass);\n  uint64_t num_unique_clusters = 1;\n\n  EdgeTy self_loop_wt = 0;\n\n  for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n    GNode dst = graph.getEdgeDst(ii);\n    EdgeTy edge_wt =\n        graph.getEdgeData(ii, flag_no_lock); // Self loop weights is recorded\n    if (graph.getData(dst).curr_comm_ass == comm_id) {\n      if (dst == n) {\n        self_loop_wt += edge_wt; // Self loop weights is recorded\n      }\n      auto stored_already = cluster_local_map.find(\n          graph.getData(dst).curr_subcomm_ass); // Check if it already exists\n      if (stored_already != cluster_local_map.end()) {\n        counter[stored_already->second] += edge_wt;\n      } else {\n        cluster_local_map[graph.getData(dst).curr_subcomm_ass] =\n            num_unique_clusters;\n        counter.push_back(edge_wt);\n        
neighboring_cluster_ids.push_back(graph.getData(dst).curr_subcomm_ass);\n        num_unique_clusters++;\n      }\n    }\n  } // End edge loop\n\n  uint64_t best_cluster                            = n_data.curr_subcomm_ass;\n  double max_quality_value_increment               = 0;\n  double total_transformed_quality_value_increment = 0;\n  double quality_value_increment                   = 0;\n  std::vector<double> cum_transformed_quality_value_increment_per_cluster(\n      num_unique_clusters);\n  for (auto pair : cluster_local_map) {\n    auto subcomm = pair.first;\n    if (n_data.curr_subcomm_ass == subcomm)\n      continue;\n\n    uint64_t subcomm_node_wt   = subcomm_info[subcomm].node_wt;\n    uint64_t subcomm_degree_wt = subcomm_info[subcomm].degree_wt;\n\n    // check if subcommunity is well connected\n    if (subcomm_info[subcomm].internal_edge_wt >=\n        constant_for_second_term * (double)subcomm_degree_wt *\n            ((double)total_degree_wt - (double)subcomm_degree_wt)) {\n\n      quality_value_increment =\n          counter[pair.second] - n_data.node_wt * subcomm_node_wt * resolution;\n\n      if (quality_value_increment > max_quality_value_increment) {\n        best_cluster                = subcomm;\n        max_quality_value_increment = quality_value_increment;\n      }\n\n      if (quality_value_increment >= 0)\n        total_transformed_quality_value_increment +=\n            std::exp(quality_value_increment / randomness);\n    }\n    cum_transformed_quality_value_increment_per_cluster[pair.second] =\n        total_transformed_quality_value_increment;\n    counter[pair.second] = 0;\n  }\n\n  /*\n   * Determine the neighboring cluster to which the currently\n   * selected node will be moved.\n   */\n  int64_t min_idx, max_idx, mid_idx;\n  uint64_t chosen_cluster;\n  double r;\n  if (total_transformed_quality_value_increment < DOUBLE_MAX) {\n    r = total_transformed_quality_value_increment *\n        generateRandonNumberDouble(0.0, 1.0);\n    
min_idx = -1;\n    max_idx = num_unique_clusters + 1;\n    while (min_idx < max_idx - 1) {\n      mid_idx = (min_idx + max_idx) / 2;\n      if (cum_transformed_quality_value_increment_per_cluster[mid_idx] >= r)\n        max_idx = mid_idx;\n      else\n        min_idx = mid_idx;\n    }\n    chosen_cluster = neighboring_cluster_ids[max_idx];\n  } else {\n    chosen_cluster = best_cluster;\n  }\n  return chosen_cluster;\n}\n/**\n * Finds a clustering of the nodes in a network using the local merging\n * algorithm.\n *\n * <p>\n * The local merging algorithm starts from a singleton partition. It\n * performs a single iteration over the nodes in a network. Each node\n * belonging to a singleton cluster is considered for merging with another\n * cluster. This cluster is chosen randomly from all clusters that do not\n * result in a decrease in the quality function. The larger the increase in\n * the quality function, the more likely a cluster is to be chosen. The\n * strength of this effect is determined by the randomness parameter. The\n * higher the value of the randomness parameter, the stronger the\n * randomness in the choice of a cluster. The lower the value of the\n * randomness parameter, the more likely the cluster resulting in the\n * largest increase in the quality function is to be chosen. 
A node is\n * merged with a cluster only if both are sufficiently well connected to\n * the rest of the network.\n * </p>\n *\n * @param\n *\n * @return : Number of unique subcommunities formed\n * DO NOT parallelize as it is called within Galois parallel loops\n *\n */\ntemplate <typename GraphTy, typename CommTy>\nvoid mergeNodesSubset(GraphTy& graph,\n                      std::vector<typename GraphTy::GraphNode>& cluster_nodes,\n                      uint64_t comm_id, uint64_t total_degree_wt,\n                      CommTy& subcomm_info, double constant_for_second_term) {\n\n  using GNode = typename GraphTy::GraphNode;\n\n  // select set R\n  std::vector<GNode> cluster_nodes_to_move;\n  for (uint64_t i = 0; i < cluster_nodes.size(); ++i) {\n    GNode n      = cluster_nodes[i];\n    auto& n_data = graph.getData(n);\n    /*\n     * Initialize with singleton sub-communities\n     */\n    EdgeTy nodeEdgeWeightWithinCluster = 0;\n    for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n      GNode dst      = graph.getEdgeDst(ii);\n      EdgeTy edge_wt = graph.getEdgeData(ii, flag_no_lock);\n      /*\n       * Must include the edge weight of all neighbors excluding self loops\n       * belonging to the community comm_id\n       */\n      if (dst != n && graph.getData(dst).curr_comm_ass == comm_id) {\n        nodeEdgeWeightWithinCluster += edge_wt;\n      }\n    }\n\n    uint64_t node_wt   = n_data.node_wt;\n    uint64_t degree_wt = n_data.degree_wt;\n    /*\n     * Additionally, only nodes that are well connected with\n     * the rest of the network are considered for moving.\n     * (externalEdgeWeightPerCluster[j] >= clusterWeights[j] * (totalNodeWeight\n     * - clusterWeights[j]) * resolution\n     */\n    if (nodeEdgeWeightWithinCluster >=\n        constant_for_second_term * (double)degree_wt *\n            ((double)total_degree_wt - (double)degree_wt))\n      cluster_nodes_to_move.push_back(n);\n\n    subcomm_info[n].node_wt          = 
node_wt;\n    subcomm_info[n].internal_edge_wt = nodeEdgeWeightWithinCluster;\n    subcomm_info[n].size             = 1;\n    subcomm_info[n].degree_wt        = degree_wt;\n  }\n\n  for (GNode n : cluster_nodes_to_move) {\n    auto& n_data = graph.getData(n);\n    /*\n     * Only consider singleton communities\n     */\n    if (subcomm_info[n_data.curr_subcomm_ass].size == 1) {\n      uint64_t new_subcomm_ass =\n          getRandomSubcommunity2(graph, n, subcomm_info, total_degree_wt,\n                                 comm_id, constant_for_second_term);\n\n      if ((int64_t)new_subcomm_ass != -1 &&\n          new_subcomm_ass != graph.getData(n).curr_subcomm_ass) {\n        n_data.curr_subcomm_ass = new_subcomm_ass;\n\n        /*\n         * Move the currently selected node to its new cluster and\n         * update the clustering statistics.\n         */\n        galois::atomicAdd(subcomm_info[new_subcomm_ass].node_wt,\n                          n_data.node_wt);\n        galois::atomicAdd(subcomm_info[new_subcomm_ass].size, (uint64_t)1);\n        galois::atomicAdd(subcomm_info[new_subcomm_ass].degree_wt,\n                          n_data.degree_wt);\n\n        for (auto ii = graph.edge_begin(n); ii != graph.edge_end(n); ++ii) {\n          GNode dst    = graph.getEdgeDst(ii);\n          auto edge_wt = graph.getEdgeData(ii, flag_no_lock);\n          if (dst != n && graph.getData(dst).curr_comm_ass == comm_id) {\n            if (graph.getData(dst).curr_subcomm_ass == new_subcomm_ass) {\n              subcomm_info[new_subcomm_ass].internal_edge_wt -= edge_wt;\n            } else {\n              subcomm_info[new_subcomm_ass].internal_edge_wt += edge_wt;\n            }\n          }\n        }\n      }\n    }\n  }\n}\n\n/*\n * Refine the clustering by iterating over the clusters and by\n * trying to split up each cluster into multiple clusters.\n */\ntemplate <typename GraphTy, typename CommArrayTy>\nvoid refinePartition(GraphTy& graph, double constant_for_second_term) 
{\n\n  using GNode     = typename GraphTy::GraphNode;\n  using CommArray = CommArrayTy;\n\n  galois::gPrint(\"Refining\\n\");\n\n  // set singleton subcommunities\n  galois::do_all(\n      galois::iterate(graph),\n      [&](GNode n) { graph.getData(n).curr_subcomm_ass = n; }, galois::steal());\n\n  // populate nodes into communities\n  std::vector<std::vector<GNode>> cluster_bags(2 * graph.size() + 1);\n  CommArray comm_info;\n\n  comm_info.allocateBlocked(2 * graph.size() + 1);\n\n  galois::do_all(\n      galois::iterate((uint32_t)0, (uint32_t)(2 * graph.size() + 1)),\n      [&](uint32_t n) {\n        comm_info[n].node_wt   = (uint64_t)0;\n        comm_info[n].degree_wt = (uint64_t)0;\n      },\n      galois::steal());\n\n  for (GNode n : graph) {\n    auto& n_data = graph.getData(n, flag_no_lock);\n    if (n_data.curr_comm_ass != UNASSIGNED)\n      cluster_bags[n_data.curr_comm_ass].push_back(n);\n\n    galois::atomicAdd(comm_info[n_data.curr_comm_ass].node_wt, n_data.node_wt);\n    galois::atomicAdd(comm_info[n_data.curr_comm_ass].degree_wt,\n                      n_data.degree_wt);\n  }\n\n  CommArray subcomm_info;\n\n  subcomm_info.allocateBlocked(graph.size() + 1);\n\n  // call mergeNodesSubset for each community in parallel\n  galois::do_all(galois::iterate((uint64_t)0, (uint64_t)graph.size()),\n                 [&](uint64_t c) {\n                   /*\n                    * Only nodes belonging to singleton clusters can be moved to\n                    * a different cluster. 
This guarantees that clusters will\n                    * never be split up.\n                    */\n                   comm_info[c].num_subcomm = 0;\n                   if (cluster_bags[c].size() > 1) {\n                     // comm_info[c].num_subcomm =\n                     mergeNodesSubset<GraphTy, CommArray>(\n                         graph, cluster_bags[c], c, comm_info[c].degree_wt,\n                         subcomm_info, constant_for_second_term);\n                   } else {\n                     comm_info[c].num_subcomm = 0;\n                   }\n                 });\n}\n\n/*\n *\n * Graph construction routines to make\n * coarser graphs.\n *\n */\ntemplate <typename GraphTy>\nvoid buildNextLevelGraph(GraphTy& graph, GraphTy& graph_next,\n                         uint64_t num_unique_clusters) {\n  using GNode = typename GraphTy::GraphNode;\n  std::cerr << \"Inside buildNextLevelGraph\\n\";\n\n  galois::StatTimer TimerGraphBuild(\"Timer_Graph_build\");\n  TimerGraphBuild.start();\n  uint32_t num_nodes_next = num_unique_clusters;\n  uint64_t num_edges_next = 0; // Unknown right now\n\n  std::vector<std::vector<GNode>> cluster_bags(num_unique_clusters);\n  // Comment: Serial separation is better than do_all due to contention\n  for (GNode n = 0; n < graph.size(); ++n) {\n    auto n_data = graph.getData(n, flag_no_lock);\n    if (n_data.curr_comm_ass != UNASSIGNED)\n      cluster_bags[n_data.curr_comm_ass].push_back(n);\n  }\n\n  std::vector<std::vector<uint32_t>> edges_id(num_unique_clusters);\n  std::vector<std::vector<EdgeTy>> edges_data(num_unique_clusters);\n\n  /* First pass to find the number of edges */\n  galois::do_all(\n      galois::iterate((uint64_t)0, num_unique_clusters),\n      [&](uint64_t c) {\n        std::map<uint64_t, uint64_t> cluster_local_map;\n        uint64_t num_unique_clusters = 0;\n        for (auto cb_ii = cluster_bags[c].begin();\n             cb_ii != cluster_bags[c].end(); ++cb_ii) {\n\n          assert(graph.getData(*cb_ii, 
flag_no_lock).curr_comm_ass ==\n                 c); // All nodes in this bag must have same cluster id\n\n          for (auto ii = graph.edge_begin(*cb_ii); ii != graph.edge_end(*cb_ii);\n               ++ii) {\n            GNode dst     = graph.getEdgeDst(ii);\n            auto dst_data = graph.getData(dst, flag_no_lock);\n            assert(dst_data.curr_comm_ass != UNASSIGNED);\n            auto stored_already = cluster_local_map.find(\n                dst_data.curr_comm_ass); // Check if it already exists\n            if (stored_already != cluster_local_map.end()) {\n              edges_data[c][stored_already->second] += graph.getEdgeData(ii);\n            } else {\n              cluster_local_map[dst_data.curr_comm_ass] = num_unique_clusters;\n              edges_id[c].push_back(dst_data.curr_comm_ass);\n              edges_data[c].push_back(graph.getEdgeData(ii));\n              num_unique_clusters++;\n            }\n          } // End edge loop\n        }\n      },\n      galois::steal(), galois::loopname(\"BuildGrah: Find edges\"));\n\n  /* Serial loop to reduce all the edge counts */\n  std::vector<uint64_t> prefix_edges_count(num_unique_clusters);\n  galois::GAccumulator<uint64_t> num_edges_acc;\n  galois::do_all(galois::iterate((uint32_t)0, num_nodes_next), [&](uint32_t c) {\n    prefix_edges_count[c] = edges_id[c].size();\n    num_edges_acc += prefix_edges_count[c];\n  });\n\n  num_edges_next = num_edges_acc.reduce();\n  for (uint32_t c = 1; c < num_nodes_next; ++c) {\n    prefix_edges_count[c] += prefix_edges_count[c - 1];\n  }\n\n  assert(prefix_edges_count[num_unique_clusters - 1] == num_edges_next);\n  galois::gPrint(\"#nodes : \", num_nodes_next, \", #edges : \", num_edges_next,\n                 \"\\n\");\n  std::cerr << \"Graph construction started\"\n            << \"\\n\";\n  galois::StatTimer TimerConstructFrom(\"Timer_Construct_From\");\n  TimerConstructFrom.start();\n  graph_next.constructFrom(num_nodes_next, num_edges_next, 
prefix_edges_count,\n                           edges_id, edges_data);\n  TimerConstructFrom.stop();\n\n  TimerGraphBuild.stop();\n  galois::gPrint(\"Graph construction done\\n\");\n}\n\ntemplate <typename GraphTy>\nvoid buildNextLevelGraphSubComm(GraphTy& graph, GraphTy& graph_next,\n                                uint64_t num_unique_clusters,\n                                std::vector<uint64_t>& original_comm_ass,\n                                std::vector<uint64_t>& cluster_node_wt) {\n  using GNode = typename GraphTy::GraphNode;\n\n  galois::StatTimer TimerGraphBuild(\"Timer_Graph_build\");\n  TimerGraphBuild.start();\n  uint32_t num_nodes_next = num_unique_clusters;\n  uint64_t num_edges_next = 0; // Unknown right now\n\n  std::vector<std::vector<GNode>> cluster_bags(num_unique_clusters);\n  // Comment: Serial separation is better than do_all due to contention\n  for (GNode n = 0; n < graph.size(); ++n) {\n    auto n_data = graph.getData(n, flag_no_lock);\n    original_comm_ass[n_data.curr_subcomm_ass] =\n        graph.getData(n_data.curr_comm_ass).curr_subcomm_ass;\n    assert(n_data.curr_comm_ass != UNASSIGNED);\n    cluster_bags[n_data.curr_subcomm_ass].push_back(n);\n    cluster_node_wt[n_data.curr_subcomm_ass] += n_data.node_wt;\n  }\n\n  std::vector<std::vector<uint32_t>> edges_id(num_unique_clusters);\n  std::vector<std::vector<EdgeTy>> edges_data(num_unique_clusters);\n\n  /* First pass to find the number of edges */\n  galois::do_all(\n      galois::iterate((uint64_t)0, num_unique_clusters),\n      [&](uint64_t c) {\n        std::map<uint64_t, uint64_t> cluster_local_map;\n        uint64_t num_unique_clusters = 0;\n        for (auto cb_ii = cluster_bags[c].begin();\n             cb_ii != cluster_bags[c].end(); ++cb_ii) {\n\n          assert(graph.getData(*cb_ii, flag_no_lock).curr_subcomm_ass ==\n                 c); // All nodes in this bag must have same cluster id\n\n          for (auto ii = graph.edge_begin(*cb_ii); ii != 
graph.edge_end(*cb_ii);\n               ++ii) {\n            GNode dst     = graph.getEdgeDst(ii);\n            auto dst_data = graph.getData(dst, flag_no_lock);\n            assert(dst_data.curr_subcomm_ass != UNASSIGNED);\n            auto stored_already = cluster_local_map.find(\n                dst_data.curr_subcomm_ass); // Check if it already exists\n            if (stored_already != cluster_local_map.end()) {\n              edges_data[c][stored_already->second] += graph.getEdgeData(ii);\n            } else {\n              cluster_local_map[dst_data.curr_subcomm_ass] =\n                  num_unique_clusters;\n              edges_id[c].push_back(dst_data.curr_subcomm_ass);\n              edges_data[c].push_back(graph.getEdgeData(ii));\n              num_unique_clusters++;\n            }\n          } // End edge loop\n        }\n      },\n      galois::steal(), galois::loopname(\"BuildGrah: Find edges\"));\n\n  /* Serial loop to reduce all the edge counts */\n  std::vector<uint64_t> prefix_edges_count(num_unique_clusters);\n  galois::GAccumulator<uint64_t> num_edges_acc;\n  galois::do_all(galois::iterate((uint32_t)0, num_nodes_next), [&](uint32_t c) {\n    prefix_edges_count[c] = edges_id[c].size();\n    num_edges_acc += prefix_edges_count[c];\n  });\n\n  num_edges_next = num_edges_acc.reduce();\n  for (uint32_t c = 1; c < num_nodes_next; ++c) {\n    prefix_edges_count[c] += prefix_edges_count[c - 1];\n  }\n\n  assert(prefix_edges_count[num_unique_clusters - 1] == num_edges_next);\n  galois::gPrint(\"#nodes : \", num_nodes_next, \", #edges : \", num_edges_next,\n                 \"\\n\");\n  galois::gPrint(\"#prefix last : \", prefix_edges_count[num_unique_clusters - 1],\n                 \"\\n\");\n\n  std::cerr << \"Graph construction started\"\n            << \"\\n\";\n\n  galois::StatTimer TimerConstructFrom(\"Timer_Construct_From\");\n  TimerConstructFrom.start();\n  graph_next.constructFrom(num_nodes_next, num_edges_next, prefix_edges_count,\n            
               edges_id, edges_data);\n  TimerConstructFrom.stop();\n\n  std::cout << \" c1:\" << calConstantForSecondTerm(graph) << \"\\n\";\n  TimerGraphBuild.stop();\n  galois::gPrint(\"Graph construction done\\n\");\n}\n\n#endif // CLUSTERING_H\n"
  },
  {
    "path": "lonestar/analytics/cpu/clustering/leidenClustering.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/gstl.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <iostream>\n#include <fstream>\n#include <deque>\n#include <type_traits>\n\n#include \"Lonestar/BoilerPlate.h\"\n#include \"clustering.h\"\n#include \"galois/DynamicBitset.h\"\n\nstatic const char* name = \"Louvain Clustering\";\n\nstatic const char* desc = \"Cluster nodes of the graph using Louvain Clustering\";\n\nstatic const char* url = \"louvain_clustering\";\n\nenum Algo { foreach };\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<Algo> algo(\n    
\"algo\", cll::desc(\"Choose an algorithm:\"),\n    cll::values(clEnumValN(Algo::foreach, \"Foreach\",\n                           \"Using galois for_each for conflict mitigation\")),\n    cll::init(Algo::foreach));\n\n// Maintain community information\nstruct Comm {\n  std::atomic<uint64_t> size;\n  std::atomic<EdgeTy> degree_wt;\n  std::atomic<uint64_t> node_wt;\n  EdgeTy internal_edge_wt;\n  uint64_t num_subcomm;\n};\n\ntypedef galois::LargeArray<Comm> CommArray;\n// Graph Node information\nstruct Node {\n  uint64_t prev_comm_ass;\n  uint64_t curr_comm_ass;\n  EdgeTy degree_wt;\n  int64_t colorId;\n  /** Only required for Leiden **/\n  uint64_t curr_subcomm_ass;\n  uint64_t node_wt;\n};\n\nusing Graph = galois::graphs::LC_CSR_Graph<Node, EdgeTy>::with_no_lockable<\n    false>::type::with_numa_alloc<true>::type;\nusing GNode = Graph::GraphNode;\n\ndouble algoLeidenWithLocking(Graph& graph, double lower, double threshold,\n                             uint32_t& iter) {\n\n  galois::StatTimer TimerClusteringTotal(\"Timer_Clustering_Total\");\n  TimerClusteringTotal.start();\n\n  galois::gPrint(\"Inside algoLeidenWithLocking\\n\");\n\n  CommArray c_info;   // Community info\n  CommArray c_update; // Used for updating community\n\n  /* Variables needed for Modularity calculation */\n  double constant_for_second_term;\n  double prev_mod      = lower;\n  double curr_mod      = -1;\n  double threshold_mod = threshold;\n  uint32_t num_iter    = iter;\n\n  /*** Initialization ***/\n  c_info.allocateBlocked(graph.size());\n  c_update.allocateBlocked(graph.size());\n\n  /* Calculate the weighted degree sum for each vertex */\n  sumVertexDegreeWeightWithNodeWeight(graph, c_info);\n\n  /* Compute the total weight (2m) and 1/2m terms */\n  constant_for_second_term = calConstantForSecondTerm(graph);\n\n  if (iter > 1) {\n    galois::do_all(galois::iterate(graph), [&](GNode n) {\n      c_info[n].size      = 0;\n      c_info[n].degree_wt = 0;\n      c_info[n].node_wt   = 0;\n    
});\n\n    galois::do_all(galois::iterate(graph), [&](GNode n) {\n      auto& n_data = graph.getData(n);\n      galois::atomicAdd(c_info[n_data.curr_comm_ass].size, uint64_t{1});\n      galois::atomicAdd(c_info[n_data.curr_comm_ass].node_wt, n_data.node_wt);\n      galois::atomicAdd(c_info[n_data.curr_comm_ass].degree_wt,\n                        n_data.degree_wt);\n    });\n  }\n\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n  galois::gPrint(\"Itr      Explore_xx            A_x2          Prev-Prev-Mod   \"\n                 \"      Prev-Mod           Curr-Mod\\n\");\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n\n  galois::StatTimer TimerClusteringWhile(\"Timer_Clustering_While\");\n  TimerClusteringWhile.start();\n  while (true) {\n    num_iter++;\n    galois::do_all(galois::iterate(graph), [&](GNode n) {\n      c_update[n].degree_wt = 0;\n      c_update[n].size      = 0;\n      c_update[n].node_wt   = 0;\n    });\n\n    galois::for_each(\n        galois::iterate(graph),\n        [&](GNode n, auto&) {\n          auto& n_data    = graph.getData(n, flag_write_lock);\n          uint64_t degree = std::distance(graph.edge_begin(n, flag_write_lock),\n                                          graph.edge_end(n, flag_write_lock));\n\n          uint64_t local_target = UNASSIGNED;\n          std::map<uint64_t, uint64_t>\n              cluster_local_map; // Map each neighbor's cluster to local number:\n                                 // Community --> Index\n          std::vector<EdgeTy> counter; // Number of edges to each unique cluster\n          EdgeTy self_loop_wt = 0;\n\n          if (degree > 0) {\n            findNeighboringClusters(graph, n, cluster_local_map, counter,\n                                    self_loop_wt);\n            local_target =\n   
             maxModularity(cluster_local_map, counter, self_loop_wt, c_info,\n                              n_data.degree_wt, n_data.curr_comm_ass,\n                              constant_for_second_term);\n            // local_target = maxCPMQuality<Graph, CommArray>(cluster_local_map,\n            // counter, self_loop_wt, c_info, n_data.node_wt,\n            // n_data.curr_comm_ass);\n          } else {\n            local_target = UNASSIGNED;\n          }\n\n          /* Update cluster info */\n          if (local_target != n_data.curr_comm_ass &&\n              local_target != UNASSIGNED) {\n\n            galois::atomicAdd(c_info[local_target].degree_wt, n_data.degree_wt);\n            galois::atomicAdd(c_info[local_target].size, uint64_t{1});\n            galois::atomicAdd(c_info[local_target].node_wt, n_data.node_wt);\n\n            galois::atomicSubtract(c_info[n_data.curr_comm_ass].degree_wt,\n                                   n_data.degree_wt);\n            galois::atomicSubtract(c_info[n_data.curr_comm_ass].size,\n                                   uint64_t{1});\n            galois::atomicSubtract(c_info[n_data.curr_comm_ass].node_wt,\n                                   n_data.node_wt);\n\n            /* Set the new cluster id */\n            n_data.curr_comm_ass = local_target;\n          }\n        },\n        galois::loopname(\"leiden algo: Phase 1\"), galois::no_pushes());\n\n    /* Calculate the overall modularity */\n    double e_xx = 0;\n    double a2_x = 0;\n\n    // curr_mod = calCPMQuality(graph, c_info, e_xx, a2_x,\n    // constant_for_second_term);\n    curr_mod =\n        calModularity(graph, c_info, e_xx, a2_x, constant_for_second_term);\n\n    galois::gPrint(num_iter, \"        \", e_xx, \"        \", a2_x, \"        \",\n                   lower, \"      \", prev_mod, \"       \", curr_mod, \"\\n\");\n\n    if ((curr_mod - prev_mod) < threshold_mod) {\n      galois::gPrint(\"Modularity gain: \", (curr_mod - prev_mod), \" < \",\n           
          threshold_mod, \" \\n\");\n      prev_mod = curr_mod;\n      break;\n    }\n    prev_mod = curr_mod;\n  } // End while\n  TimerClusteringWhile.stop();\n\n  iter = num_iter;\n\n  c_info.destroy();\n  c_info.deallocate();\n\n  c_update.destroy();\n  c_update.deallocate();\n\n  TimerClusteringTotal.stop();\n  return prev_mod;\n}\n\nvoid runMultiPhaseLouvainAlgorithm(Graph& graph, uint64_t min_graph_size,\n                                   double c_threshold,\n                                   largeArray& clusters_orig) {\n\n  galois::gPrint(\"Inside runMultiPhaseLouvainAlgorithm\\n\");\n  double prev_mod = -1; // Previous modularity\n  double curr_mod = -1; // Current modularity\n  uint32_t phase  = 0;\n\n  Graph* graph_curr = &graph;\n  Graph graph_next;\n  uint32_t iter           = 0;\n  uint64_t num_nodes_orig = clusters_orig.size();\n  /**\n   * Assign cluster id from previous iteration\n   */\n  galois::do_all(galois::iterate(*graph_curr), [&](GNode n) {\n    graph_curr->getData(n).curr_comm_ass    = n;\n    graph_curr->getData(n).curr_subcomm_ass = n;\n    graph_curr->getData(n).node_wt          = 1;\n  });\n  for (GNode i = 0; i < graph.size(); ++i) {\n    if (graph.getData(i).node_wt > 1)\n      galois::gPrint(\"-->node wt : \", graph.getData(i).node_wt, \"\\n\");\n  }\n  while (true) {\n    iter++;\n    phase++;\n    galois::gPrint(\"Starting Phase : \", phase, \"\\n\");\n    galois::gPrint(\"Graph size : \", (*graph_curr).size(), \"\\n\");\n\n    if ((*graph_curr).size() > min_graph_size) {\n      switch (algo) {\n      case foreach:\n        curr_mod =\n            algoLeidenWithLocking(*graph_curr, curr_mod, c_threshold, iter);\n        break;\n      default:\n        std::abort();\n      }\n    }\n\n    if (iter < max_iter && (curr_mod - prev_mod) > threshold) {\n      double constant_for_second_term = calConstantForSecondTerm(graph);\n      refinePartition<Graph, CommArray>(*graph_curr, constant_for_second_term);\n\n      uint64_t 
num_unique_subclusters =\n          renumberClustersContiguouslySubcomm(*graph_curr);\n      galois::gPrint(\"Number of unique sub cluster (Refine) : \",\n                     num_unique_subclusters, \"\\n\");\n      std::vector<uint64_t> original_comm_ass(graph_curr->size());\n      std::vector<uint64_t> cluster_node_wt(num_unique_subclusters, 0);\n\n      if (phase == 1) {\n        galois::do_all(\n            galois::iterate(uint64_t{0}, num_nodes_orig), [&](GNode n) {\n              clusters_orig[n] = (*graph_curr).getData(n).curr_subcomm_ass;\n            });\n      } else {\n        galois::do_all(\n            galois::iterate(uint64_t{0}, num_nodes_orig),\n            [&](GNode n) {\n              assert(clusters_orig[n] < (*graph_curr).size());\n              clusters_orig[n] =\n                  (*graph_curr).getData(clusters_orig[n]).curr_subcomm_ass;\n            },\n            galois::steal());\n      }\n      buildNextLevelGraphSubComm(*graph_curr, graph_next,\n                                 num_unique_subclusters, original_comm_ass,\n                                 cluster_node_wt);\n      prev_mod   = curr_mod;\n      graph_curr = &graph_next;\n      /**\n       * Assign cluster id from previous iteration\n       */\n      galois::do_all(galois::iterate(*graph_curr), [&](GNode n) {\n        auto& n_data            = graph_curr->getData(n);\n        n_data.curr_comm_ass    = original_comm_ass[n];\n        n_data.curr_subcomm_ass = original_comm_ass[n];\n        n_data.node_wt          = cluster_node_wt[n];\n      });\n\n      cluster_node_wt.clear();\n      printGraphCharateristics(*graph_curr);\n    } else {\n      break;\n    }\n  }\n  galois::gPrint(\"Phases : \", phase, \"Iter : \", iter, \"\\n\");\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric graph input;\"\n               \" 
please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  Graph graph;\n  Graph graph_next;\n  Graph* graph_curr;\n\n  std::cout << \"Reading from file: \" << inputFile << \"\\n\";\n  std::cout << \"[WARNING:] Make sure \" << inputFile\n            << \" is symmetric graph without duplicate edges\\n\";\n  galois::graphs::readGraph(graph, inputFile);\n  std::cout << \"Read \" << graph.size() << \" nodes, \" << graph.sizeEdges()\n            << \" edges\\n\";\n\n  graph_curr = &graph;\n\n  /*\n   * To keep track of communities for nodes in the original graph.\n   * Community will be set to -1 for isolated nodes\n   */\n  largeArray clusters_orig;\n  clusters_orig.allocateBlocked(graph_curr->size());\n\n  /*\n   * Vertex following optimization\n   */\n  if (enable_VF) {\n    uint64_t num_nodes_to_fix =\n        vertexFollowing(graph); // Find nodes that follow other nodes\n    galois::gPrint(\"Isolated nodes : \", num_nodes_to_fix, \"\\n\");\n\n    uint64_t num_unique_clusters = renumberClustersContiguously(*graph_curr);\n    galois::gPrint(\n        \"Number of unique clusters (renumber): \", num_unique_clusters, \"\\n\");\n    /*\n     *Initialize node cluster id.\n     */\n    galois::do_all(galois::iterate(*graph_curr), [&](GNode n) {\n      clusters_orig[n] = graph.getData(n, flag_no_lock).curr_comm_ass;\n    });\n\n    /*\n     * Build new graph to remove the isolated nodes\n     */\n    buildNextLevelGraph(*graph_curr, graph_next, num_unique_clusters);\n    graph_curr = &graph_next;\n    printGraphCharateristics(*graph_curr);\n  } else {\n\n    /*\n     *Initialize node cluster id.\n     */\n    galois::do_all(galois::iterate(*graph_curr),\n                   [&](GNode n) { clusters_orig[n] = UNASSIGNED; });\n\n    printGraphCharateristics(*graph_curr);\n  }\n\n  uint64_t min_graph_size = 10;\n  galois::StatTimer 
execTime(\"Timer_0\");\n  execTime.start();\n  runMultiPhaseLouvainAlgorithm(*graph_curr, min_graph_size, c_threshold,\n                                clusters_orig);\n  execTime.stop();\n\n  /*\n   * Sanity check: Check modularity at the end\n   */\n  checkModularity<Graph, CommArray>(graph, clusters_orig);\n  if (output_CID) {\n    printNodeClusterId(graph, output_CID_filename);\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/clustering/louvainClustering.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"clustering.h\"\n#include \"galois/Galois.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/DynamicBitset.h\"\n#include \"galois/gstl.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <iostream>\n#include <fstream>\n#include <deque>\n#include <type_traits>\n\nstatic const char* name = \"Louvain Clustering\";\n\nstatic const char* desc = \"Cluster nodes of the graph using Louvain Clustering\";\n\nstatic const char* url = \"louvain_clustering\";\n\nenum Algo { coloring, foreach, delay, doall };\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic 
cll::opt<Algo> algo(\n    \"algo\", cll::desc(\"Choose an algorithm:\"),\n    cll::values(clEnumValN(Algo::coloring, \"Coloring\",\n                           \"Using colors to mitigate conflicts\"),\n                clEnumValN(Algo::foreach, \"Foreach\",\n                           \"Using galois for_each for conflict mitigation\"),\n                clEnumValN(Algo::delay, \"Delay\",\n                           \"Using galois for_each for conflict mitigation but \"\n                           \"delay the updation\"),\n                clEnumValN(Algo::doall, \"Doall\",\n                           \"Using galois for_each for conflict mitigation\")),\n    cll::init(Algo::foreach));\n\n// Maintain community information\nstruct Comm {\n  std::atomic<uint64_t> size;\n  std::atomic<EdgeTy> degree_wt;\n  EdgeTy internal_edge_wt;\n};\n\ntypedef galois::LargeArray<Comm> CommArray;\n\n// Graph Node information\nstruct Node {\n  uint64_t prev_comm_ass;\n  uint64_t curr_comm_ass;\n  EdgeTy degree_wt;\n  int64_t colorId;\n};\n\nusing Graph = galois::graphs::LC_CSR_Graph<Node, EdgeTy>::with_no_lockable<\n    false>::type::with_numa_alloc<true>::type;\nusing GNode = Graph::GraphNode;\n\ndouble algoLouvainWithLocking(Graph& graph, double lower, double threshold,\n                              uint32_t& iter) {\n  galois::StatTimer TimerClusteringTotal(\"Timer_Clustering_Total\");\n  TimerClusteringTotal.start();\n\n  galois::gPrint(\"Inside algoLouvainWithLocking\\n\");\n\n  CommArray c_info;   // Community info\n  CommArray c_update; // Used for updating community\n\n  /* Variables needed for Modularity calculation */\n  double constant_for_second_term;\n  double prev_mod      = lower;\n  double curr_mod      = -1;\n  double threshold_mod = threshold;\n  uint32_t num_iter    = iter;\n\n  /*** Initialization ***/\n  c_info.allocateBlocked(graph.size());\n  c_update.allocateBlocked(graph.size());\n\n  /* Initialization each node to its own cluster */\n  
galois::do_all(galois::iterate(graph), [&graph](GNode n) {\n    graph.getData(n).curr_comm_ass = n;\n    graph.getData(n).prev_comm_ass = n;\n  });\n\n  galois::gPrint(\"Init Done\\n\");\n  /* Calculate the weighted degree sum for each vertex */\n  sumVertexDegreeWeight(graph, c_info);\n  galois::gPrint(\"c_info[0] : \", c_info[0].degree_wt.load(), \"\\n\");\n\n  /* Compute the total weight (2m) and 1/2m terms */\n  constant_for_second_term = calConstantForSecondTerm(graph);\n  galois::gPrint(\"constant_for_second_term : \", constant_for_second_term, \"\\n\");\n\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n  galois::gPrint(\"Itr      Explore_xx            A_x2          Prev-Prev-Mod   \"\n                 \"      Prev-Mod           Curr-Mod\\n\");\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n\n  galois::StatTimer TimerClusteringWhile(\"Timer_Clustering_While\");\n  TimerClusteringWhile.start();\n  while (true) {\n    num_iter++;\n\n    galois::do_all(galois::iterate(graph), [&](GNode n) {\n      c_update[n].degree_wt = 0;\n      c_update[n].size      = 0;\n    });\n\n    galois::for_each(\n        galois::iterate(graph),\n        [&](GNode n, auto&) {\n          auto& n_data    = graph.getData(n, flag_write_lock);\n          uint64_t degree = std::distance(graph.edge_begin(n, flag_write_lock),\n                                          graph.edge_end(n, flag_write_lock));\n          uint64_t local_target = UNASSIGNED;\n          std::map<uint64_t, uint64_t>\n              cluster_local_map; // Map each neighbor's cluster to local number:\n                                 // Community --> Index\n          std::vector<EdgeTy> counter; // Number of edges to each unique cluster\n          EdgeTy self_loop_wt = 0;\n          if (degree > 0) {\n\n       
     findNeighboringClusters(graph, n, cluster_local_map, counter,\n                                    self_loop_wt);\n            // Find the max gain in modularity\n            local_target =\n                maxModularity(cluster_local_map, counter, self_loop_wt, c_info,\n                              n_data.degree_wt, n_data.curr_comm_ass,\n                              constant_for_second_term);\n\n          } else {\n            local_target = UNASSIGNED;\n          }\n\n          /* Update cluster info */\n          if (local_target != n_data.curr_comm_ass &&\n              local_target != UNASSIGNED) {\n\n            galois::atomicAdd(c_info[local_target].degree_wt, n_data.degree_wt);\n            galois::atomicAdd(c_info[local_target].size, (uint64_t)1);\n            galois::atomicSubtract(c_info[n_data.curr_comm_ass].degree_wt,\n                                   n_data.degree_wt);\n            galois::atomicSubtract(c_info[n_data.curr_comm_ass].size,\n                                   (uint64_t)1);\n\n            /* Set the new cluster id */\n            n_data.curr_comm_ass = local_target;\n          }\n        },\n        galois::loopname(\"louvain algo: Phase 1\"), galois::no_pushes());\n\n    /* Calculate the overall modularity */\n    double e_xx = 0;\n    double a2_x = 0;\n\n    curr_mod =\n        calModularity(graph, c_info, e_xx, a2_x, constant_for_second_term);\n\n    galois::gPrint(num_iter, \"        \", e_xx, \"        \", a2_x, \"        \",\n                   lower, \"      \", prev_mod, \"       \", curr_mod, \"\\n\");\n\n    if ((curr_mod - prev_mod) < threshold_mod) {\n      galois::gPrint(\"Modularity gain: \", (curr_mod - prev_mod), \" < \",\n                     threshold_mod, \" \\n\");\n      prev_mod = curr_mod;\n      break;\n    }\n\n    prev_mod = curr_mod;\n\n  } // End while\n  TimerClusteringWhile.stop();\n\n  iter = num_iter;\n\n  c_info.destroy();\n  c_info.deallocate();\n\n  c_update.destroy();\n  
c_update.deallocate();\n\n  TimerClusteringTotal.stop();\n  return prev_mod;\n}\n\ndouble algoLouvainWithoutLockingDoAll(Graph& graph, double lower,\n                                      double threshold, uint32_t& iter) {\n\n  galois::StatTimer TimerClusteringTotal(\"Timer_Clustering_Total\");\n  TimerClusteringTotal.start();\n\n  galois::gPrint(\"Inside algoLouvainWithLocking\\n\");\n\n  CommArray c_info;   // Community info\n  CommArray c_update; // Used for updating community\n\n  /* Variables needed for Modularity calculation */\n  double constant_for_second_term;\n  double prev_mod      = lower;\n  double curr_mod      = -1;\n  double threshold_mod = threshold;\n  uint32_t num_iter    = iter;\n\n  /*** Initialization ***/\n  c_info.allocateBlocked(graph.size());\n  c_update.allocateBlocked(graph.size());\n\n  /* Initialization each node to its own cluster */\n  galois::do_all(galois::iterate(graph), [&graph](GNode n) {\n    graph.getData(n).curr_comm_ass = n;\n    graph.getData(n).prev_comm_ass = n;\n    graph.getData(n).colorId       = -1;\n  });\n\n  galois::gPrint(\"Init Done\\n\");\n  /* Calculate the weighted degree sum for each vertex */\n  sumVertexDegreeWeight(graph, c_info);\n  galois::gPrint(\"c_info[0] : \", c_info[0].degree_wt.load(), \"\\n\");\n\n  /* Compute the total weight (2m) and 1/2m terms */\n  constant_for_second_term = calConstantForSecondTerm(graph);\n  galois::gPrint(\"constant_for_second_term : \", constant_for_second_term, \"\\n\");\n\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n  galois::gPrint(\"Itr      Explore_xx            A_x2          Prev-Prev-Mod   \"\n                 \"      Prev-Mod           Curr-Mod\\n\");\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n\n  galois::StatTimer 
TimerClusteringWhile(\"Timer_Clustering_While\");\n  TimerClusteringWhile.start();\n  while (true) {\n    num_iter++;\n\n    galois::do_all(galois::iterate(graph), [&](GNode n) {\n      c_update[n].degree_wt = 0;\n      c_update[n].size      = 0;\n    });\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](GNode n) {\n          auto& n_data    = graph.getData(n, flag_write_lock);\n          uint64_t degree = std::distance(graph.edge_begin(n, flag_no_lock),\n                                          graph.edge_end(n, flag_no_lock));\n          uint64_t local_target = UNASSIGNED;\n          std::map<uint64_t, uint64_t>\n              cluster_local_map; // Map each neighbor's cluster to local number:\n                                 // Community --> Index\n          std::vector<EdgeTy> counter; // Number of edges to each unique cluster\n          EdgeTy self_loop_wt = 0;\n\n          if (degree > 0) {\n            findNeighboringClusters(graph, n, cluster_local_map, counter,\n                                    self_loop_wt);\n            // Find the max gain in modularity\n            local_target = maxModularityWithoutSwaps(\n                cluster_local_map, counter, self_loop_wt, c_info,\n                n_data.degree_wt, n_data.curr_comm_ass,\n                constant_for_second_term);\n\n          } else {\n            local_target = UNASSIGNED;\n          }\n\n          /* Update cluster info */\n          if (local_target != n_data.curr_comm_ass &&\n              local_target != UNASSIGNED) {\n\n            galois::atomicAdd(c_info[local_target].degree_wt, n_data.degree_wt);\n            galois::atomicAdd(c_info[local_target].size, (uint64_t)1);\n            galois::atomicSubtract(c_info[n_data.curr_comm_ass].degree_wt,\n                                   n_data.degree_wt);\n            galois::atomicSubtract(c_info[n_data.curr_comm_ass].size,\n                                   (uint64_t)1);\n\n            /* Set the new cluster id */\n     
       n_data.curr_comm_ass = local_target;\n          }\n        },\n        galois::loopname(\"louvain algo: Phase 1\"));\n\n    /* Calculate the overall modularity */\n    double e_xx = 0;\n    double a2_x = 0;\n\n    curr_mod =\n        calModularity(graph, c_info, e_xx, a2_x, constant_for_second_term);\n\n    galois::gPrint(num_iter, \"        \", e_xx, \"        \", a2_x, \"        \",\n                   lower, \"      \", prev_mod, \"       \", curr_mod, \"\\n\");\n\n    if ((curr_mod - prev_mod) < threshold_mod) {\n      galois::gPrint(\"Modularity gain: \", (curr_mod - prev_mod), \" < \",\n                     threshold_mod, \" \\n\");\n      prev_mod = curr_mod;\n      break;\n    }\n\n    prev_mod = curr_mod;\n\n  } // End while\n  TimerClusteringWhile.stop();\n\n  iter = num_iter;\n\n  c_info.destroy();\n  c_info.deallocate();\n\n  c_update.destroy();\n  c_update.deallocate();\n\n  TimerClusteringTotal.stop();\n  return prev_mod;\n}\n\ndouble algoLouvainWithLockingDelayUpdate(Graph& graph, double lower,\n                                         double threshold, uint32_t& iter) {\n  galois::gPrint(\"Inside algoLouvainWithLockingDelay\\n\");\n\n  galois::StatTimer TimerClusteringTotal(\"Timer_Clustering_Total\");\n  TimerClusteringTotal.start();\n\n  CommArray c_info;   // Community info\n  CommArray c_update; // Used for updating community\n\n  /* Variables needed for Modularity calculation */\n  double constant_for_second_term;\n  double prev_mod      = -1; // lower;\n  double curr_mod      = -1;\n  double threshold_mod = threshold;\n  uint32_t num_iter    = iter;\n\n  /*** Initialization ***/\n  c_info.allocateBlocked(graph.size());\n  c_update.allocateBlocked(graph.size());\n\n  /* Initialization each node to its own cluster */\n  galois::do_all(galois::iterate(graph), [&graph](GNode n) {\n    graph.getData(n).curr_comm_ass = n;\n    graph.getData(n).prev_comm_ass = n;\n    graph.getData(n).colorId       = -1;\n  });\n\n  galois::gPrint(\"Init 
Done\\n\");\n  /* Calculate the weighted degree sum for each vertex */\n  sumVertexDegreeWeight(graph, c_info);\n  galois::gPrint(\"c_info[5] : \", c_info[0].degree_wt.load(), \"\\n\");\n\n  /* Compute the total weight (2m) and 1/2m terms */\n  constant_for_second_term = calConstantForSecondTerm(graph);\n  galois::gPrint(\"constant_for_second_term : \", constant_for_second_term, \"\\n\");\n\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n  galois::gPrint(\"Itr      Explore_xx            A_x2          Prev-Prev-Mod   \"\n                 \"      Prev-Mod           Curr-Mod\\n\");\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n\n  galois::StatTimer TimerClusteringWhile(\"Timer_Clustering_While\");\n  TimerClusteringWhile.start();\n  while (true) {\n    num_iter++;\n\n    galois::do_all(galois::iterate(graph), [&](GNode n) {\n      c_update[n].degree_wt = 0;\n      c_update[n].size      = 0;\n    });\n\n    std::vector<uint64_t> local_target(graph.size(), UNASSIGNED);\n    galois::GAccumulator<uint32_t> syncRound;\n    galois::do_all(\n        galois::iterate(graph),\n        [&](GNode n) {\n          auto& n_data    = graph.getData(n, flag_write_lock);\n          uint64_t degree = std::distance(graph.edge_begin(n, flag_no_lock),\n                                          graph.edge_end(n, flag_no_lock));\n          std::map<uint64_t, uint64_t>\n              cluster_local_map; // Map each neighbor's cluster to local number:\n                                 // Community --> Index\n          std::vector<EdgeTy> counter; // Number of edges to each unique cluster\n          EdgeTy self_loop_wt = 0;\n\n          if (degree > 0) {\n            findNeighboringClusters(graph, n, cluster_local_map, counter,\n                                    self_loop_wt);\n   
         // Find the max gain in modularity\n            local_target[n] =\n                maxModularity(cluster_local_map, counter, self_loop_wt, c_info,\n                              n_data.degree_wt, n_data.curr_comm_ass,\n                              constant_for_second_term);\n          } else {\n            local_target[n] = UNASSIGNED;\n          }\n\n          /* Update cluster info */\n          if (local_target[n] != n_data.curr_comm_ass &&\n              local_target[n] != UNASSIGNED) {\n\n            galois::atomicAdd(c_update[local_target[n]].degree_wt,\n                              n_data.degree_wt);\n            galois::atomicAdd(c_update[local_target[n]].size, (uint64_t)1);\n            galois::atomicSubtract(c_update[n_data.curr_comm_ass].degree_wt,\n                                   n_data.degree_wt);\n            galois::atomicSubtract(c_update[n_data.curr_comm_ass].size,\n                                   (uint64_t)1);\n          }\n        },\n        galois::loopname(\"louvain algo: Phase 1\"));\n\n    /* Calculate the overall modularity */\n    double e_xx = 0;\n    double a2_x = 0;\n    curr_mod    = calModularityDelay(graph, c_info, c_update, e_xx, a2_x,\n                                  constant_for_second_term, local_target);\n    galois::gPrint(num_iter, \"        \", e_xx, \"        \", a2_x, \"        \",\n                   lower, \"      \", prev_mod, \"       \", curr_mod, \"\\n\");\n\n    if ((curr_mod - prev_mod) < threshold_mod) {\n      galois::gPrint(\"Modularity gain: \", (curr_mod - prev_mod), \" < \",\n                     threshold_mod, \" \\n\");\n      prev_mod = curr_mod;\n      break;\n    }\n\n    prev_mod = curr_mod;\n    if (prev_mod < lower)\n      prev_mod = lower;\n\n    galois::do_all(galois::iterate(graph), [&](GNode n) {\n      auto& n_data         = graph.getData(n, flag_no_lock);\n      n_data.prev_comm_ass = n_data.curr_comm_ass;\n      n_data.curr_comm_ass = local_target[n];\n      
galois::atomicAdd(c_info[n].size, c_update[n].size.load());\n      galois::atomicAdd(c_info[n].degree_wt, c_update[n].degree_wt.load());\n\n      c_update[n].size      = 0;\n      c_update[n].degree_wt = 0;\n    });\n\n  } // End while\n  TimerClusteringWhile.stop();\n\n  iter = num_iter;\n\n  c_info.destroy();\n  c_info.deallocate();\n\n  c_update.destroy();\n  c_update.deallocate();\n\n  TimerClusteringTotal.stop();\n  return prev_mod;\n}\n\nuint64_t coloringDistanceOne(Graph& graph) {\n  galois::for_each(\n      galois::iterate(graph),\n      [&](GNode n, auto&) {\n        auto& n_data = graph.getData(n, flag_write_lock);\n\n        /* Grab lock on neighbours: Cautious operator */\n        for (auto ii = graph.edge_begin(n, flag_write_lock);\n             ii != graph.edge_end(n, flag_write_lock); ++ii) {\n          graph.getData(graph.getEdgeDst(ii),\n                        flag_write_lock); // TODO: Can we use read lock?\n        }\n\n        int64_t max_color = -1;\n        int64_t my_color  = 0;\n        int64_t degree = std::distance(graph.edge_begin(n), graph.edge_end(n));\n        if (degree > 0) {\n          std::vector<bool> isColorSet;\n          isColorSet.resize(degree, false);\n          for (auto ii = graph.edge_begin(n, flag_write_lock);\n               ii != graph.edge_end(n, flag_write_lock); ++ii) {\n            auto dst = graph.getEdgeDst(ii);\n            if (dst == n)\n              continue;\n\n            auto& dst_data = graph.getData(\n                dst, flag_write_lock); // TODO: Can we use read lock?\n            if (dst_data.colorId >= 0) {\n              if (dst_data.colorId >= degree)\n                isColorSet.resize(dst_data.colorId);\n\n              isColorSet[dst_data.colorId] = true;\n              if ((dst_data.colorId > max_color)) {\n                max_color = dst_data.colorId;\n              }\n            }\n          }\n\n          if (max_color >= 0) {\n            /* Assign color */\n            for (; my_color <= 
max_color; my_color++) {\n              if (isColorSet[my_color] == false) {\n                break;\n              }\n            }\n\n            if (my_color == max_color)\n              my_color++;\n          }\n        }\n        n_data.colorId = my_color;\n      },\n      galois::loopname(\"Coloring loop\"));\n\n  galois::gPrint(\"Checking for conflicts\\n\");\n  /* Check for conflicts */\n  galois::GAccumulator<uint64_t> conflicts;\n  galois::do_all(\n      galois::iterate(graph),\n      [&](GNode n) {\n        auto& n_data = graph.getData(n, flag_no_lock);\n        for (auto ii = graph.edge_begin(n, flag_write_lock);\n             ii != graph.edge_end(n, flag_write_lock); ++ii) {\n          auto dst       = graph.getEdgeDst(ii);\n          auto& dst_data = graph.getData(dst, flag_no_lock);\n          if (dst_data.colorId == n_data.colorId)\n            conflicts += 1;\n        }\n      },\n      galois::loopname(\"Coloring conflicts\"));\n  galois::gPrint(\"WARNING: Conflicts found : \", conflicts.reduce(), \"\\n\");\n\n  int64_t num_colors = 0;\n  for (GNode n = 0; n < graph.size(); ++n) {\n    int64_t color = graph.getData(n, flag_no_lock).colorId;\n    if (color > num_colors)\n      num_colors = color;\n  }\n\n  return num_colors;\n}\n\ndouble algoLouvainWithColoring(Graph& graph, double lower, double threshold,\n                               uint32_t& iter) {\n\n  galois::StatTimer TimerClusteringTotal(\"Timer_Clustering_Total\");\n  TimerClusteringTotal.start();\n\n  galois::gPrint(\"Inside algoLouvainWithColoring\\n\");\n\n  CommArray c_info;   // Community info\n  CommArray c_update; // Used for updating community\n\n  /* Variables needed for Modularity calculation */\n  double constant_for_second_term;\n  double prev_mod      = lower;\n  double curr_mod      = -1;\n  double threshold_mod = threshold;\n  uint32_t num_iter    = iter;\n\n  /*** Initialization ***/\n  c_info.allocateBlocked(graph.size());\n  c_update.allocateBlocked(graph.size());\n\n  
/* Initialization each node to its own cluster */\n  galois::do_all(galois::iterate(graph), [&graph](GNode n) {\n    graph.getData(n).curr_comm_ass = n;\n    graph.getData(n).prev_comm_ass = n;\n    graph.getData(n).colorId       = -1;\n  });\n\n  galois::gPrint(\"Coloring\\n\");\n  galois::StatTimer TimerColoring(\"Timer_Cloring\");\n  TimerColoring.start();\n  int64_t num_colors = coloringDistanceOne(graph);\n  TimerColoring.stop();\n\n  /* Calculate the weighted degree sum for each vertex */\n  sumVertexDegreeWeight(graph, c_info);\n  galois::gPrint(\"c_info[5] : \", c_info[0].degree_wt.load(), \"\\n\");\n\n  /* Compute the total weight (2m) and 1/2m terms */\n  constant_for_second_term = calConstantForSecondTerm(graph);\n  galois::gPrint(\"constant_for_second_term : \", constant_for_second_term, \"\\n\");\n\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n  galois::gPrint(\n      \"Itr      Explore_xx            A_x2           Prev-Mod           \"\n      \"Curr-Mod         Time-1(s)       Time-2(s)        T/Itr(s)\\n\");\n  galois::gPrint(\"=============================================================\"\n                 \"===========================================\\n\");\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    c_update[n].degree_wt = 0;\n    c_update[n].size      = 0;\n  });\n\n  galois::StatTimer TimerClusteringWhile(\"Timer_Clustering_While\");\n  TimerClusteringWhile.start();\n  while (true) {\n    num_iter++;\n\n    for (int64_t c = 0; c < num_colors; ++c) {\n      // galois::gPrint(\"Color : \", c, \"\\n\");\n      galois::do_all(\n          galois::iterate(graph),\n          [&](GNode n) {\n            auto& n_data = graph.getData(n, flag_write_lock);\n            if (n_data.colorId == c) {\n              uint64_t degree = std::distance(graph.edge_begin(n, flag_no_lock),\n                                              
graph.edge_end(n, flag_no_lock));\n              uint64_t local_target = UNASSIGNED;\n              std::map<uint64_t, uint64_t>\n                  cluster_local_map; // Map each neighbor's cluster to local\n                                     // number: Community --> Index\n              std::vector<EdgeTy>\n                  counter; // Number of edges to each unique cluster\n              EdgeTy self_loop_wt = 0;\n\n              if (degree > 0) {\n                findNeighboringClusters(graph, n, cluster_local_map, counter,\n                                        self_loop_wt);\n                // Find the max gain in modularity\n                local_target = maxModularity(\n                    cluster_local_map, counter, self_loop_wt, c_info,\n                    n_data.degree_wt, n_data.curr_comm_ass,\n                    constant_for_second_term);\n              } else {\n                local_target = UNASSIGNED;\n              }\n              /* Update cluster info */\n              if (local_target != n_data.curr_comm_ass &&\n                  local_target != UNASSIGNED) {\n                galois::atomicAdd(c_update[local_target].degree_wt,\n                                  n_data.degree_wt);\n                galois::atomicAdd(c_update[local_target].size, (uint64_t)1);\n                galois::atomicSubtract(c_update[n_data.curr_comm_ass].degree_wt,\n                                       n_data.degree_wt);\n                galois::atomicSubtract(c_update[n_data.curr_comm_ass].size,\n                                       (uint64_t)1);\n                /* Set the new cluster id */\n                n_data.curr_comm_ass = local_target;\n              }\n            }\n          },\n          galois::loopname(\"louvain algo: Phase 1\"));\n\n      galois::do_all(galois::iterate(graph), [&](GNode n) {\n        galois::atomicAdd(c_info[n].size, c_update[n].size.load());\n        galois::atomicAdd(c_info[n].degree_wt, c_update[n].degree_wt.load());\n        
c_update[n].size      = 0;\n        c_update[n].degree_wt = 0;\n      });\n    }\n\n    /* Calculate the overall modularity */\n    double e_xx = 0;\n    double a2_x = 0;\n    curr_mod =\n        calModularity(graph, c_info, e_xx, a2_x, constant_for_second_term);\n\n    galois::gPrint(num_iter, \"        \", e_xx, \"        \", a2_x, \"        \",\n                   prev_mod, \"       \", curr_mod, \"\\n\");\n\n    if ((curr_mod - prev_mod) < threshold_mod) {\n      galois::gPrint(\"Modularity gain: \", (curr_mod - prev_mod), \" < \",\n                     threshold_mod, \" \\n\");\n      prev_mod = curr_mod;\n      break;\n    }\n\n    prev_mod = curr_mod;\n\n  } // End while\n  TimerClusteringWhile.stop();\n\n  iter = num_iter;\n\n  c_info.destroy();\n  c_info.deallocate();\n\n  c_update.destroy();\n  c_update.deallocate();\n\n  TimerClusteringTotal.stop();\n  return prev_mod;\n}\n\nvoid runMultiPhaseLouvainAlgorithm(Graph& graph, uint32_t min_graph_size,\n                                   double c_threshold,\n                                   largeArray& clusters_orig) {\n\n  galois::gPrint(\"Inside runMultiPhaseLouvainAlgorithm\\n\");\n  double prev_mod = -1; // Previous modularity\n  double curr_mod = -1; // Current modularity\n  uint32_t phase  = 0;\n\n  Graph* graph_curr = &graph;\n  Graph graph_next;\n  uint32_t iter           = 0;\n  uint64_t num_nodes_orig = clusters_orig.size();\n  while (true) {\n    iter++;\n    phase++;\n    galois::gPrint(\"Starting Phase : \", phase, \"\\n\");\n    galois::gPrint(\"Graph size : \", (*graph_curr).size(), \"\\n\");\n\n    if ((*graph_curr).size() > min_graph_size) {\n\n      switch (algo) {\n      case coloring:\n        curr_mod =\n            algoLouvainWithColoring(*graph_curr, curr_mod, c_threshold, iter);\n        break;\n      case foreach:\n        curr_mod =\n            algoLouvainWithLocking(*graph_curr, curr_mod, c_threshold, iter);\n        break;\n      case doall:\n        curr_mod = 
algoLouvainWithoutLockingDoAll(*graph_curr, curr_mod,\n                                                  c_threshold, iter);\n        break;\n      case delay:\n        curr_mod = algoLouvainWithLockingDelayUpdate(*graph_curr, curr_mod,\n                                                     c_threshold, iter);\n        break;\n      default:\n        std::abort();\n      }\n    }\n\n    uint64_t num_unique_clusters = renumberClustersContiguously(*graph_curr);\n    galois::gPrint(\n        \"Number of unique clusters (renumber): \", num_unique_clusters, \"\\n\");\n\n    galois::gPrint(\"Prev_mod main: \", prev_mod, \"\\n\");\n    if (iter < max_iter && (curr_mod - prev_mod) > threshold) {\n      if (!enable_VF && phase == 1) {\n        assert(num_nodes_orig == (*graph_curr).size());\n        galois::do_all(galois::iterate(*graph_curr), [&](GNode n) {\n          clusters_orig[n] =\n              (*graph_curr).getData(n, flag_no_lock).curr_comm_ass;\n        });\n      } else {\n        galois::do_all(\n            galois::iterate((uint64_t)0, num_nodes_orig), [&](GNode n) {\n              if (clusters_orig[n] != UNASSIGNED) {\n                assert(clusters_orig[n] < graph_curr->size());\n                clusters_orig[n] = (*graph_curr)\n                                       .getData(clusters_orig[n], flag_no_lock)\n                                       .curr_comm_ass;\n              }\n            });\n      }\n      buildNextLevelGraph(*graph_curr, graph_next, num_unique_clusters);\n      prev_mod   = curr_mod;\n      graph_curr = &graph_next;\n      printGraphCharateristics(*graph_curr);\n    } else {\n      break;\n    }\n  }\n  galois::gPrint(\"Phases : \", phase, \"\\n\");\n  galois::gPrint(\"Iter : \", iter, \"\\n\");\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric graph input;\"\n               \" 
please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  Graph graph;\n  Graph graph_next;\n  Graph* graph_curr;\n\n  std::cout << \"Reading from file: \" << inputFile << \"\\n\";\n  std::cout << \"[WARNING:] Make sure \" << inputFile\n            << \" is symmetric graph without duplicate edges\\n\";\n  galois::graphs::readGraph(graph, inputFile);\n  std::cout << \"Read \" << graph.size() << \" nodes, \" << graph.sizeEdges()\n            << \" edges\\n\";\n\n  graph_curr = &graph;\n\n  /*\n   * To keep track of communities for nodes in the original graph.\n   * Community will be set to -1 for isolated nodes\n   */\n  largeArray clusters_orig;\n  clusters_orig.allocateBlocked(graph_curr->size());\n\n  /*\n   * Vertex following optimization\n   */\n  if (enable_VF) {\n    uint64_t num_nodes_to_fix =\n        vertexFollowing(graph); // Find nodes that follow other nodes\n    galois::gPrint(\"Isolated nodes : \", num_nodes_to_fix, \"\\n\");\n\n    uint64_t num_unique_clusters = renumberClustersContiguously(*graph_curr);\n    galois::gPrint(\n        \"Number of unique clusters (renumber): \", num_unique_clusters, \"\\n\");\n    /*\n     *Initialize node cluster id.\n     */\n    galois::do_all(galois::iterate(*graph_curr), [&](GNode n) {\n      clusters_orig[n] = graph.getData(n, flag_no_lock).curr_comm_ass;\n    });\n\n    // Build new graph to remove the isolated nodes\n    buildNextLevelGraph(*graph_curr, graph_next, num_unique_clusters);\n    graph_curr = &graph_next;\n    printGraphCharateristics(*graph_curr);\n  } else {\n\n    /*\n     *Initialize node cluster id.\n     */\n    galois::do_all(galois::iterate(*graph_curr),\n                   [&](GNode n) { clusters_orig[n] = -1; });\n\n    printGraphCharateristics(*graph_curr);\n  }\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  
runMultiPhaseLouvainAlgorithm(*graph_curr, min_graph_size, c_threshold,\n                                clusters_orig);\n  execTime.stop();\n\n  /*\n   * Sanity check: Check modularity at the end\n   */\n  checkModularity<Graph, CommArray>(graph, clusters_orig);\n  if (output_CID) {\n    printNodeClusterId(graph, output_CID_filename);\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/connected-components/CMakeLists.txt",
    "content": "add_executable(connected-components-cpu ConnectedComponents.cpp)\nadd_dependencies(apps connected-components-cpu)\ntarget_link_libraries(connected-components-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS connected-components-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small connected-components-cpu \"${BASEINPUT}/scalefree/symmetric/rmat10.sgr\" \"-symmetricGraph\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/connected-components/ConnectedComponents.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/Bag.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/UnionFind.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/OCGraph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"galois/runtime/Profile.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <utility>\n#include <vector>\n#include <algorithm>\n#include <iostream>\n\n#include <ostream>\n#include <fstream>\n\nconst char* name = \"Connected Components\";\nconst char* desc = \"Computes the connected components of a graph\";\n\nnamespace cll = llvm::cl;\n\nenum Algo {\n  serial,\n  labelProp,\n  synchronous,\n  async,\n  edgeasync,\n  
blockedasync,\n  edgetiledasync,\n  afforest,\n  edgeafforest,\n  edgetiledafforest,\n};\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<Algo> algo(\n    \"algo\", cll::desc(\"Choose an algorithm:\"),\n    cll::values(\n        clEnumValN(Algo::async, \"Async\", \"Asynchronous\"),\n        clEnumValN(Algo::edgeasync, \"EdgeAsync\", \"Edge-Asynchronous\"),\n        clEnumValN(Algo::edgetiledasync, \"EdgetiledAsync\",\n                   \"EdgeTiled-Asynchronous (default)\"),\n        clEnumValN(Algo::blockedasync, \"BlockedAsync\", \"Blocked asynchronous\"),\n        clEnumValN(Algo::labelProp, \"LabelProp\",\n                   \"Using label propagation algorithm\"),\n        clEnumValN(Algo::serial, \"Serial\", \"Serial\"),\n        clEnumValN(Algo::synchronous, \"Sync\", \"Synchronous\"),\n        clEnumValN(Algo::afforest, \"Afforest\", \"Using Afforest sampling\"),\n        clEnumValN(Algo::edgeafforest, \"EdgeAfforest\",\n                   \"Using Afforest sampling, Edge-wise\"),\n        clEnumValN(Algo::edgetiledafforest, \"EdgetiledAfforest\",\n                   \"Using Afforest sampling, EdgeTiled\")\n\n            ),\n    cll::init(Algo::edgetiledasync));\n\nstatic cll::opt<std::string>\n    largestComponentFilename(\"outputLargestComponent\",\n                             cll::desc(\"[output graph file]\"), cll::init(\"\"));\nstatic cll::opt<std::string>\n    permutationFilename(\"outputNodePermutation\",\n                        cll::desc(\"[output node permutation file]\"),\n                        cll::init(\"\"));\n#ifndef NDEBUG\nenum OutputEdgeType { void_, int32_, int64_ };\nstatic cll::opt<unsigned int>\n    memoryLimit(\"memoryLimit\",\n                cll::desc(\"Memory limit for out-of-core algorithms (in MB)\"),\n                cll::init(~0U));\nstatic cll::opt<OutputEdgeType> writeEdgeType(\n    \"edgeType\", cll::desc(\"Input/Output edge type:\"),\n    
cll::values(\n        clEnumValN(OutputEdgeType::void_, \"void\", \"no edge values\"),\n        clEnumValN(OutputEdgeType::int32_, \"int32\", \"32 bit edge values\"),\n        clEnumValN(OutputEdgeType::int64_, \"int64\", \"64 bit edge values\")),\n    cll::init(OutputEdgeType::void_));\n#endif\n\n// TODO (bozhi) LLVM commandline library now supports option categorization.\n// Categorize params when libllvm is updated to make -help beautiful!\n// static cll::OptionCategory ParamCat(\"Algorithm-Specific Parameters\",\n//                                       \"Only used for specific algorithms.\");\nstatic cll::opt<uint32_t>\n    EDGE_TILE_SIZE(\"edgeTileSize\",\n                   cll::desc(\"(For Edgetiled algos) Size of edge tiles \"\n                             \"(default 512)\"),\n                   // cll::cat(ParamCat),\n                   cll::init(512)); // 512 -> 64\nstatic const int CHUNK_SIZE = 1;\n//! parameter for the Vertex Neighbor Sampling step of Afforest algorithm\nstatic cll::opt<uint32_t> NEIGHBOR_SAMPLES(\n    \"vns\",\n    cll::desc(\"(For Afforest and its variants) number of edges \"\n              \"per vertice to process initially for exposing \"\n              \"partial connectivity (default 2)\"),\n    // cll::cat(ParamCat),\n    cll::init(2));\n//! 
parameter for the Large Component Skipping step of Afforest algorithm\nstatic cll::opt<uint32_t> COMPONENT_SAMPLES(\n    \"lcs\",\n    cll::desc(\"(For Afforest and its variants) number of times \"\n              \"randomly sampling over vertices to approximately \"\n              \"capture the largest intermediate component \"\n              \"(default 1024)\"),\n    // cll::cat(ParamCat),\n    cll::init(1024));\n\nstruct Node : public galois::UnionFindNode<Node> {\n  using component_type = Node*;\n\n  Node() : galois::UnionFindNode<Node>(const_cast<Node*>(this)) {}\n  Node(const Node& o) : galois::UnionFindNode<Node>(o.m_component) {}\n\n  Node& operator=(const Node& o) {\n    Node c(o);\n    std::swap(c, *this);\n    return *this;\n  }\n\n  component_type component() { return this->get(); }\n  bool isRepComp(unsigned int) { return false; }\n};\n\nconst unsigned int LABEL_INF = std::numeric_limits<unsigned int>::max();\n\n/**\n * Serial connected components algorithm. Just use union-find.\n */\nstruct SerialAlgo {\n  using Graph =\n      galois::graphs::LC_CSR_Graph<Node, void>::with_no_lockable<true>::type;\n  using GNode = Graph::GraphNode;\n\n  template <typename G>\n  void readGraph(G& graph) {\n    galois::graphs::readGraph(graph, inputFile);\n  }\n\n  void operator()(Graph& graph) {\n    for (const GNode& src : graph) {\n      Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n      for (auto ii : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n        GNode dst   = graph.getEdgeDst(ii);\n        Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n        sdata.merge(&ddata);\n      }\n    }\n\n    for (const GNode& src : graph) {\n      Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n      sdata.compress();\n    }\n  }\n};\n\nstruct LabelPropAlgo {\n\n  struct LNode {\n    using component_type = unsigned int;\n    std::atomic<unsigned int> comp_current;\n    unsigned int comp_old;\n\n    
component_type component() { return comp_current; }\n    bool isRep() { return false; }\n    bool isRepComp(unsigned int x) { return x == comp_current; }\n  };\n\n  using Graph =\n      galois::graphs::LC_CSR_Graph<LNode, void>::with_no_lockable<true>::type;\n  using GNode          = Graph::GraphNode;\n  using component_type = LNode::component_type;\n\n  template <typename G>\n  void readGraph(G& graph) {\n    galois::graphs::readGraph(graph, inputFile);\n  }\n\n  void operator()(Graph& graph) {\n    galois::GReduceLogicalOr changed;\n    do {\n      changed.reset();\n      galois::do_all(\n          galois::iterate(graph),\n          [&](const GNode& src) {\n            LNode& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n            if (sdata.comp_old > sdata.comp_current) {\n              sdata.comp_old = sdata.comp_current;\n\n              changed.update(true);\n\n              for (auto e : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n                GNode dst              = graph.getEdgeDst(e);\n                auto& ddata            = graph.getData(dst);\n                unsigned int label_new = sdata.comp_current;\n                galois::atomicMin(ddata.comp_current, label_new);\n              }\n            }\n          },\n          galois::disable_conflict_detection(), galois::steal(),\n          galois::loopname(\"LabelPropAlgo\"));\n    } while (changed.reduce());\n  }\n};\n\n/**\n * Synchronous connected components algorithm.  Initially all nodes are in\n * their own component. Then, we merge endpoints of edges to form the spanning\n * tree. Merging is done in two phases to simplify concurrent updates: (1)\n * find components and (2) union components.  
Since the merge phase does not\n * do any finds, we only process a fraction of edges at a time; otherwise,\n * the union phase may unnecessarily merge two endpoints in the same\n * component.\n */\nstruct SynchronousAlgo {\n  using Graph =\n      galois::graphs::LC_CSR_Graph<Node, void>::with_no_lockable<true>::type;\n  using GNode = Graph::GraphNode;\n\n  template <typename G>\n  void readGraph(G& graph) {\n    galois::graphs::readGraph(graph, inputFile);\n  }\n\n  struct Edge {\n    GNode src;\n    Node* ddata;\n    int count;\n    Edge(GNode src, Node* ddata, int count)\n        : src(src), ddata(ddata), count(count) {}\n  };\n\n  void operator()(Graph& graph) {\n    size_t rounds = 0;\n    galois::GAccumulator<size_t> emptyMerges;\n\n    galois::InsertBag<Edge> wls[2];\n    galois::InsertBag<Edge>* next;\n    galois::InsertBag<Edge>* cur;\n\n    cur  = &wls[0];\n    next = &wls[1];\n\n    galois::do_all(galois::iterate(graph), [&](const GNode& src) {\n      for (auto ii : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n        GNode dst = graph.getEdgeDst(ii);\n        if (src >= dst)\n          continue;\n        Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n        cur->push(Edge(src, &ddata, 0));\n        break;\n      }\n    });\n\n    while (!cur->empty()) {\n      galois::do_all(\n          galois::iterate(*cur),\n          [&](const Edge& edge) {\n            Node& sdata =\n                graph.getData(edge.src, galois::MethodFlag::UNPROTECTED);\n            if (!sdata.merge(edge.ddata))\n              emptyMerges += 1;\n          },\n          galois::loopname(\"Merge\"));\n\n      galois::do_all(\n          galois::iterate(*cur),\n          [&](const Edge& edge) {\n            GNode src   = edge.src;\n            Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n            Node* scomponent = sdata.findAndCompress();\n            Graph::edge_iterator ii =\n                graph.edge_begin(src, 
galois::MethodFlag::UNPROTECTED);\n            Graph::edge_iterator ei =\n                graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n            int count = edge.count + 1;\n            std::advance(ii, count);\n            for (; ii != ei; ++ii, ++count) {\n              GNode dst = graph.getEdgeDst(ii);\n              if (src >= dst)\n                continue;\n              Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n              Node* dcomponent = ddata.findAndCompress();\n              if (scomponent != dcomponent) {\n                next->push(Edge(src, dcomponent, count));\n                break;\n              }\n            }\n          },\n          galois::loopname(\"Find\"));\n\n      cur->clear();\n      std::swap(cur, next);\n      rounds += 1;\n    }\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n        galois::steal(), galois::loopname(\"Compress\"));\n\n    galois::runtime::reportStat_Single(\"CC-Sync\", \"rounds\", rounds);\n    galois::runtime::reportStat_Single(\"CC-Sync\", \"emptyMerges\",\n                                       emptyMerges.reduce());\n  }\n};\n\n/**\n * Like synchronous algorithm, but if we restrict path compression (as done in\n * @link{UnionFindNode}), we can perform unions and finds concurrently.\n */\nstruct AsyncAlgo {\n  using Graph =\n      galois::graphs::LC_CSR_Graph<Node, void>::with_no_lockable<true>::type;\n  using GNode = Graph::GraphNode;\n\n  template <typename G>\n  void readGraph(G& graph) {\n    galois::graphs::readGraph(graph, inputFile);\n  }\n\n  void operator()(Graph& graph) {\n    galois::GAccumulator<size_t> emptyMerges;\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n\n          for (auto 
ii : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n            GNode dst   = graph.getEdgeDst(ii);\n            Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n\n            if (src >= dst)\n              continue;\n\n            if (!sdata.merge(&ddata))\n              emptyMerges += 1;\n          }\n        },\n        galois::loopname(\"CC-Async\"));\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n        galois::steal(), galois::loopname(\"CC-Async-Compress\"));\n\n    galois::runtime::reportStat_Single(\"CC-Async\", \"emptyMerges\",\n                                       emptyMerges.reduce());\n  }\n};\n\nstruct EdgeAsyncAlgo {\n  using Graph =\n      galois::graphs::LC_CSR_Graph<Node, void>::with_no_lockable<true>::type;\n  using GNode = Graph::GraphNode;\n  using Edge  = std::pair<GNode, typename Graph::edge_iterator>;\n\n  template <typename G>\n  void readGraph(G& graph) {\n    galois::graphs::readGraph(graph, inputFile);\n  }\n\n  void operator()(Graph& graph) {\n    galois::GAccumulator<size_t> emptyMerges;\n\n    galois::InsertBag<Edge> works;\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          for (auto ii : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n            if (src < graph.getEdgeDst(ii)) {\n              works.push_back(std::make_pair(src, ii));\n            }\n          }\n        },\n        galois::loopname(\"CC-EdgeAsyncInit\"), galois::steal());\n\n    galois::do_all(\n        galois::iterate(works),\n        [&](Edge& e) {\n          Node& sdata = graph.getData(e.first, galois::MethodFlag::UNPROTECTED);\n          GNode dst   = graph.getEdgeDst(e.second);\n          Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n\n          if (e.first > dst)\n            // continue;\n       
     ;\n          else if (!sdata.merge(&ddata)) {\n            emptyMerges += 1;\n          }\n        },\n        galois::loopname(\"CC-EdgeAsync\"), galois::steal());\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n        galois::steal(), galois::loopname(\"CC-Async-Compress\"));\n\n    galois::runtime::reportStat_Single(\"CC-Async\", \"emptyMerges\",\n                                       emptyMerges.reduce());\n  }\n};\n\n/**\n * Improve performance of async algorithm by following machine topology.\n */\nstruct BlockedAsyncAlgo {\n  using Graph =\n      galois::graphs::LC_CSR_Graph<Node, void>::with_no_lockable<true>::type;\n  using GNode = Graph::GraphNode;\n\n  struct WorkItem {\n    GNode src;\n    Graph::edge_iterator start;\n  };\n\n  template <typename G>\n  void readGraph(G& graph) {\n    galois::graphs::readGraph(graph, inputFile);\n  }\n\n  //! 
Add the next edge between components to the worklist\n  template <bool MakeContinuation, int Limit, typename Pusher>\n  static void process(Graph& graph, const GNode& src,\n                      const Graph::edge_iterator& start, Pusher& pusher) {\n\n    Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n    int count   = 1;\n    for (Graph::edge_iterator\n             ii = start,\n             ei = graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n         ii != ei; ++ii, ++count) {\n      GNode dst   = graph.getEdgeDst(ii);\n      Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n\n      if (src >= dst)\n        continue;\n\n      if (sdata.merge(&ddata)) {\n        if (Limit == 0 || count != Limit)\n          continue;\n      }\n\n      if (MakeContinuation || (Limit != 0 && count == Limit)) {\n        WorkItem item = {src, ii + 1};\n        pusher.push(item);\n        break;\n      }\n    }\n  }\n\n  void operator()(Graph& graph) {\n    galois::InsertBag<WorkItem> items;\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          Graph::edge_iterator start =\n              graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n          if (galois::substrate::ThreadPool::getSocket() == 0) {\n            process<true, 0>(graph, src, start, items);\n          } else {\n            process<true, 1>(graph, src, start, items);\n          }\n        },\n        galois::loopname(\"Initialize\"));\n\n    galois::for_each(\n        galois::iterate(items),\n        [&](const WorkItem& item, auto& ctx) {\n          process<true, 0>(graph, item.src, item.start, ctx);\n        },\n        galois::loopname(\"Merge\"),\n        galois::wl<galois::worklists::PerSocketChunkFIFO<128>>());\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n     
   galois::steal(), galois::loopname(\"CC-Async-Compress\"));\n  }\n};\n\nstruct EdgeTiledAsyncAlgo {\n  using Graph =\n      galois::graphs::LC_CSR_Graph<Node, void>::with_no_lockable<true>::type;\n  using GNode = Graph::GraphNode;\n\n  template <typename G>\n  void readGraph(G& graph) {\n    galois::graphs::readGraph(graph, inputFile);\n  }\n\n  struct EdgeTile {\n    // Node* sData;\n    GNode src;\n    Graph::edge_iterator beg;\n    Graph::edge_iterator end;\n  };\n\n  /*struct EdgeTileMaker {\n      EdgeTile operator() (Node* sdata, Graph::edge_iterator beg,\n  Graph::edge_iterator end) const{ return EdgeTile{sdata, beg, end};\n      }\n  };*/\n\n  void operator()(Graph& graph) {\n    galois::GAccumulator<size_t> emptyMerges;\n\n    galois::InsertBag<EdgeTile> works;\n\n    std::cout << \"INFO: Using edge tile size of \" << EDGE_TILE_SIZE\n              << \" and chunk size of \" << CHUNK_SIZE << \"\\n\";\n    std::cout << \"WARNING: Performance varies considerably due to parameter.\\n\";\n    std::cout\n        << \"WARNING: Do not expect the default to be good for your graph.\\n\";\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          // Node& sdata=graph.getData(src,\n          // galois::MethodFlag::UNPROTECTED);\n          auto beg = graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n          const auto end = graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n\n          assert(beg <= end);\n          if ((end - beg) > EDGE_TILE_SIZE) {\n            for (; beg + EDGE_TILE_SIZE < end;) {\n              auto ne = beg + EDGE_TILE_SIZE;\n              assert(ne < end);\n              works.push_back(EdgeTile{src, beg, ne});\n              beg = ne;\n            }\n          }\n\n          if ((end - beg) > 0) {\n            works.push_back(EdgeTile{src, beg, end});\n          }\n        },\n        galois::loopname(\"CC-EdgeTiledAsyncInit\"), galois::steal());\n\n    galois::do_all(\n        
galois::iterate(works),\n        [&](const EdgeTile& tile) {\n          // Node& sdata = *(tile.sData);\n          GNode src   = tile.src;\n          Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n\n          for (auto ii = tile.beg; ii != tile.end; ++ii) {\n            GNode dst = graph.getEdgeDst(ii);\n            if (src >= dst)\n              continue;\n\n            Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n            if (!sdata.merge(&ddata))\n              emptyMerges += 1;\n          }\n        },\n        galois::loopname(\"CC-edgetiledAsync\"), galois::steal(),\n        galois::chunk_size<CHUNK_SIZE>() // 16 -> 1\n    );\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n        galois::steal(), galois::loopname(\"CC-Async-Compress\"));\n\n    galois::runtime::reportStat_Single(\"CC-edgeTiledAsync\", \"emptyMerges\",\n                                       emptyMerges.reduce());\n  }\n};\n\ntemplate <typename component_type, typename Graph>\ncomponent_type approxLargestComponent(Graph& graph) {\n  using map_type = std::unordered_map<\n      component_type, int, std::hash<component_type>,\n      std::equal_to<component_type>,\n      galois::gstl::Pow2Alloc<std::pair<const component_type, int>>>;\n  using pair_type = std::pair<component_type, int>;\n\n  map_type comp_freq(COMPONENT_SAMPLES);\n  std::random_device rd;\n  std::mt19937 rng(rd());\n  std::uniform_int_distribution<uint32_t> dist(0, graph.size() - 1);\n  for (uint32_t i = 0; i < COMPONENT_SAMPLES; i++) {\n    auto& ndata = graph.getData(dist(rng), galois::MethodFlag::UNPROTECTED);\n    comp_freq[ndata.component()]++;\n  }\n\n  assert(!comp_freq.empty());\n  auto most_frequent =\n      std::max_element(comp_freq.begin(), comp_freq.end(),\n                       [](const pair_type& a, const 
pair_type& b) {\n                         return a.second < b.second;\n                       });\n\n  galois::gDebug(\n      \"Approximate largest intermediate component: \", most_frequent->first,\n      \" (hit rate \", 100.0 * (most_frequent->second) / COMPONENT_SAMPLES, \"%)\");\n\n  return most_frequent->first;\n}\n\n/**\n * CC w/ Afforest sampling.\n *\n * [1] M. Sutton, T. Ben-Nun and A. Barak, \"Optimizing Parallel Graph\n * Connectivity Computation via Subgraph Sampling,\" 2018 IEEE International\n * Parallel and Distributed Processing Symposium (IPDPS), Vancouver, BC, 2018,\n * pp. 12-21.\n */\nstruct AfforestAlgo {\n  struct NodeData : public galois::UnionFindNode<NodeData> {\n    using component_type = NodeData*;\n\n    NodeData() : galois::UnionFindNode<NodeData>(const_cast<NodeData*>(this)) {}\n    NodeData(const NodeData& o)\n        : galois::UnionFindNode<NodeData>(o.m_component) {}\n\n    component_type component() { return this->get(); }\n    bool isRepComp(unsigned int) { return false; } // verify\n\n  public:\n    void link(NodeData* b) {\n      NodeData* a = m_component.load(std::memory_order_relaxed);\n      b           = b->m_component.load(std::memory_order_relaxed);\n      while (a != b) {\n        if (a < b)\n          std::swap(a, b);\n        // Now a > b\n        NodeData* ac = a->m_component.load(std::memory_order_relaxed);\n        if ((ac == a && a->m_component.compare_exchange_strong(a, b)) ||\n            (b == ac))\n          break;\n        a = (a->m_component.load(std::memory_order_relaxed))\n                ->m_component.load(std::memory_order_relaxed);\n        b = b->m_component.load(std::memory_order_relaxed);\n      }\n    }\n  };\n  using Graph =\n      galois::graphs::LC_CSR_Graph<NodeData,\n                                   void>::with_no_lockable<true>::type;\n  using GNode          = Graph::GraphNode;\n  using component_type = NodeData::component_type;\n\n  template <typename G>\n  void readGraph(G& graph) {\n    
galois::graphs::readGraph(graph, inputFile);\n  }\n\n  void operator()(Graph& graph) {\n    // (bozhi) should NOT go through single direction in sampling step: nodes\n    // with edges less than NEIGHBOR_SAMPLES will fail\n    for (uint32_t r = 0; r < NEIGHBOR_SAMPLES; ++r) {\n      galois::do_all(\n          galois::iterate(graph),\n          [&](const GNode& src) {\n            Graph::edge_iterator ii =\n                graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n            Graph::edge_iterator ei =\n                graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n            for (std::advance(ii, r); ii < ei; ii++) {\n              GNode dst = graph.getEdgeDst(ii);\n              NodeData& sdata =\n                  graph.getData(src, galois::MethodFlag::UNPROTECTED);\n              NodeData& ddata =\n                  graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n              sdata.link(&ddata);\n              break;\n            }\n          },\n          galois::steal(), galois::loopname(\"Afforest-VNS-Link\"));\n\n      galois::do_all(\n          galois::iterate(graph),\n          [&](const GNode& src) {\n            NodeData& sdata =\n                graph.getData(src, galois::MethodFlag::UNPROTECTED);\n            sdata.compress();\n          },\n          galois::steal(), galois::loopname(\"Afforest-VNS-Compress\"));\n    }\n\n    galois::StatTimer StatTimer_Sampling(\"Afforest-LCS-Sampling\");\n    StatTimer_Sampling.start();\n    const component_type c = approxLargestComponent<component_type>(graph);\n    StatTimer_Sampling.stop();\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          NodeData& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          if (sdata.component() == c)\n            return;\n          Graph::edge_iterator ii =\n              graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n          Graph::edge_iterator ei =\n              
graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n          for (std::advance(ii, NEIGHBOR_SAMPLES.getValue()); ii < ei; ++ii) {\n            GNode dst = graph.getEdgeDst(ii);\n            NodeData& ddata =\n                graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n            sdata.link(&ddata);\n          }\n        },\n        galois::steal(), galois::loopname(\"Afforest-LCS-Link\"));\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          NodeData& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n        galois::steal(), galois::loopname(\"Afforest-LCS-Compress\"));\n  }\n};\n\n/**\n * Edge CC w/ Afforest sampling\n */\nstruct EdgeAfforestAlgo {\n  struct NodeData : public galois::UnionFindNode<NodeData> {\n    using component_type = NodeData*;\n\n    NodeData() : galois::UnionFindNode<NodeData>(const_cast<NodeData*>(this)) {}\n    NodeData(const NodeData& o)\n        : galois::UnionFindNode<NodeData>(o.m_component) {}\n\n    component_type component() { return this->get(); }\n    bool isRepComp(unsigned int) { return false; } // verify\n\n  public:\n    NodeData* hook_min(NodeData* b, NodeData* c = 0) {\n      NodeData* a = m_component.load(std::memory_order_relaxed);\n      b           = b->m_component.load(std::memory_order_relaxed);\n      while (a != b) {\n        if (a < b)\n          std::swap(a, b);\n        // Now a > b\n        NodeData* ac = a->m_component.load(std::memory_order_relaxed);\n        if (ac == a && a->m_component.compare_exchange_strong(a, b)) {\n          if (b == c)\n            return a; //! 
return victim\n          return 0;\n        }\n        if (b == ac) {\n          return 0;\n        }\n        a = (a->m_component.load(std::memory_order_relaxed))\n                ->m_component.load(std::memory_order_relaxed);\n        b = b->m_component.load(std::memory_order_relaxed);\n      }\n      return 0;\n    }\n  };\n  using Graph =\n      galois::graphs::LC_CSR_Graph<NodeData,\n                                   void>::with_no_lockable<true>::type;\n  using GNode          = Graph::GraphNode;\n  using component_type = NodeData::component_type;\n\n  using Edge = std::pair<GNode, GNode>;\n\n  template <typename G>\n  void readGraph(G& graph) {\n    galois::graphs::readGraph(graph, inputFile);\n  }\n\n  void operator()(Graph& graph) {\n    // (bozhi) should NOT go through single direction in sampling step: nodes\n    // with edges less than NEIGHBOR_SAMPLES will fail\n    for (uint32_t r = 0; r < NEIGHBOR_SAMPLES; ++r) {\n      galois::do_all(\n          galois::iterate(graph),\n          [&](const GNode& src) {\n            Graph::edge_iterator ii =\n                graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n            Graph::edge_iterator ei =\n                graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n            std::advance(ii, r);\n            if (ii < ei) {\n              GNode dst = graph.getEdgeDst(ii);\n              NodeData& sdata =\n                  graph.getData(src, galois::MethodFlag::UNPROTECTED);\n              NodeData& ddata =\n                  graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n              sdata.hook_min(&ddata);\n            }\n          },\n          galois::steal(), galois::loopname(\"EdgeAfforest-VNS-Link\"));\n    }\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          NodeData& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n        galois::steal(), 
galois::loopname(\"EdgeAfforest-VNS-Compress\"));\n\n    galois::StatTimer StatTimer_Sampling(\"EdgeAfforest-LCS-Sampling\");\n    StatTimer_Sampling.start();\n    const component_type c = approxLargestComponent<component_type>(graph);\n    StatTimer_Sampling.stop();\n    const component_type c0 =\n        &(graph.getData(0, galois::MethodFlag::UNPROTECTED));\n\n    galois::InsertBag<Edge> works;\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          NodeData& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          if (sdata.component() == c)\n            return;\n          auto beg = graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n          const auto end = graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n\n          for (std::advance(beg, NEIGHBOR_SAMPLES.getValue()); beg < end;\n               beg++) {\n            GNode dst = graph.getEdgeDst(beg);\n            NodeData& ddata =\n                graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n            if (src < dst || c == ddata.component()) {\n              works.push_back(std::make_pair(src, dst));\n            }\n          }\n        },\n        galois::loopname(\"EdgeAfforest-LCS-Assembling\"), galois::steal());\n\n    galois::for_each(\n        galois::iterate(works),\n        [&](const Edge& e, auto& ctx) {\n          NodeData& sdata =\n              graph.getData(e.first, galois::MethodFlag::UNPROTECTED);\n          if (sdata.component() == c)\n            return;\n          NodeData& ddata =\n              graph.getData(e.second, galois::MethodFlag::UNPROTECTED);\n          component_type victim = sdata.hook_min(&ddata, c);\n          if (victim) {\n            GNode src = victim - c0; // TODO (bozhi) tricky!\n            for (auto ii : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n              GNode dst = graph.getEdgeDst(ii);\n              ctx.push_back(std::make_pair(dst, src));\n            }\n          
}\n        },\n        galois::disable_conflict_detection(),\n        galois::loopname(\"EdgeAfforest-LCS-Link\"));\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          NodeData& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n        galois::steal(), galois::loopname(\"EdgeAfforest-LCS-Compress\"));\n  }\n};\n\n/**\n * Edgetiled CC w/ Afforest sampling\n */\nstruct EdgeTiledAfforestAlgo {\n  struct NodeData : public galois::UnionFindNode<NodeData> {\n    using component_type = NodeData*;\n\n    NodeData() : galois::UnionFindNode<NodeData>(const_cast<NodeData*>(this)) {}\n    NodeData(const NodeData& o)\n        : galois::UnionFindNode<NodeData>(o.m_component) {}\n\n    component_type component() { return this->get(); }\n    bool isRepComp(unsigned int) { return false; } // verify\n\n  public:\n    void link(NodeData* b) {\n      NodeData* a = m_component.load(std::memory_order_relaxed);\n      b           = b->m_component.load(std::memory_order_relaxed);\n      while (a != b) {\n        if (a < b)\n          std::swap(a, b);\n        // Now a > b\n        NodeData* ac = a->m_component.load(std::memory_order_relaxed);\n        if ((ac == a && a->m_component.compare_exchange_strong(a, b)) ||\n            (b == ac))\n          break;\n        a = (a->m_component.load(std::memory_order_relaxed))\n                ->m_component.load(std::memory_order_relaxed);\n        b = b->m_component.load(std::memory_order_relaxed);\n      }\n    }\n  };\n  using Graph =\n      galois::graphs::LC_CSR_Graph<NodeData,\n                                   void>::with_no_lockable<true>::type;\n  using GNode          = Graph::GraphNode;\n  using component_type = NodeData::component_type;\n\n  struct EdgeTile {\n    GNode src;\n    Graph::edge_iterator beg;\n    Graph::edge_iterator end;\n  };\n\n  template <typename G>\n  void readGraph(G& graph) {\n    galois::graphs::readGraph(graph, 
inputFile);\n  }\n\n  void operator()(Graph& graph) {\n    // (bozhi) should NOT go through single direction in sampling step: nodes\n    // with edges less than NEIGHBOR_SAMPLES will fail\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          auto ii = graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n          const auto end = graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n          for (uint32_t r = 0; r < NEIGHBOR_SAMPLES && ii < end; ++r, ++ii) {\n            GNode dst = graph.getEdgeDst(ii);\n            NodeData& sdata =\n                graph.getData(src, galois::MethodFlag::UNPROTECTED);\n            NodeData& ddata =\n                graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n            sdata.link(&ddata);\n          }\n        },\n        galois::steal(), galois::loopname(\"EdgetiledAfforest-VNS-Link\"));\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          NodeData& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n        galois::steal(), galois::loopname(\"EdgetiledAfforest-VNS-Compress\"));\n\n    galois::StatTimer StatTimer_Sampling(\"EdgetiledAfforest-LCS-Sampling\");\n    StatTimer_Sampling.start();\n    const component_type c = approxLargestComponent<component_type>(graph);\n    StatTimer_Sampling.stop();\n\n    galois::InsertBag<EdgeTile> works;\n    std::cout << \"INFO: Using edge tile size of \" << EDGE_TILE_SIZE\n              << \" and chunk size of \" << CHUNK_SIZE << \"\\n\";\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          NodeData& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          if (sdata.component() == c)\n            return;\n          auto beg = graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n          const auto end = graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n\n          
for (std::advance(beg, NEIGHBOR_SAMPLES.getValue());\n               beg + EDGE_TILE_SIZE < end;) {\n            auto ne = beg + EDGE_TILE_SIZE;\n            assert(ne < end);\n            works.push_back(EdgeTile{src, beg, ne});\n            beg = ne;\n          }\n\n          if ((end - beg) > 0) {\n            works.push_back(EdgeTile{src, beg, end});\n          }\n        },\n        galois::loopname(\"EdgetiledAfforest-LCS-Tiling\"), galois::steal());\n\n    galois::do_all(\n        galois::iterate(works),\n        [&](const EdgeTile& tile) {\n          NodeData& sdata =\n              graph.getData(tile.src, galois::MethodFlag::UNPROTECTED);\n          if (sdata.component() == c)\n            return;\n          for (auto ii = tile.beg; ii < tile.end; ++ii) {\n            GNode dst = graph.getEdgeDst(ii);\n            NodeData& ddata =\n                graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n            sdata.link(&ddata);\n          }\n        },\n        galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n        galois::loopname(\"EdgetiledAfforest-LCS-Link\"));\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          NodeData& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          sdata.compress();\n        },\n        galois::steal(), galois::loopname(\"EdgetiledAfforest-LCS-Compress\"));\n  }\n};\n\ntemplate <typename Graph>\nbool verify(\n    Graph&,\n    typename std::enable_if<galois::graphs::is_segmented<Graph>::value>::type* =\n        0) {\n  return true;\n}\n\ntemplate <typename Graph>\nbool verify(Graph& graph,\n            typename std::enable_if<\n                !galois::graphs::is_segmented<Graph>::value>::type* = 0) {\n\n  using GNode = typename Graph::GraphNode;\n\n  auto is_bad = [&graph](const GNode& n) {\n    auto& me = graph.getData(n);\n    for (auto ii : graph.edges(n)) {\n      GNode dst  = graph.getEdgeDst(ii);\n      auto& data = graph.getData(dst);\n      if 
(data.component() != me.component()) {\n        std::cerr << std::dec << \"not in same component: \" << (unsigned int)n\n                  << \" (\" << me.component() << \")\"\n                  << \" and \" << (unsigned int)dst << \" (\" << data.component()\n                  << \")\"\n                  << \"\\n\";\n        return true;\n      }\n    }\n    return false;\n  };\n\n  return galois::ParallelSTL::find_if(graph.begin(), graph.end(), is_bad) ==\n         graph.end();\n}\n\ntemplate <typename Algo, typename Graph>\ntypename Graph::node_data_type::component_type findLargest(Graph& graph) {\n\n  using GNode          = typename Graph::GraphNode;\n  using component_type = typename Graph::node_data_type::component_type;\n\n  using Map = galois::gstl::Map<component_type, int>;\n\n  auto reduce = [](Map& lhs, Map&& rhs) -> Map& {\n    Map v{std::move(rhs)};\n\n    for (auto& kv : v) {\n      if (lhs.count(kv.first) == 0) {\n        lhs[kv.first] = 0;\n      }\n      lhs[kv.first] += kv.second;\n    }\n\n    return lhs;\n  };\n\n  auto mapIdentity = []() { return Map(); };\n\n  auto accumMap = galois::make_reducible(reduce, mapIdentity);\n\n  galois::GAccumulator<size_t> accumReps;\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](const GNode& x) {\n        auto& n = graph.getData(x, galois::MethodFlag::UNPROTECTED);\n\n        if (std::is_same<Algo, LabelPropAlgo>::value) {\n          if (n.isRepComp((unsigned int)x)) {\n            accumReps += 1;\n            return;\n          }\n        } else {\n          if (n.isRep()) {\n            accumReps += 1;\n            return;\n          }\n        }\n\n        // Don't add reps to table to avoid adding components of size\n        // 1\n        accumMap.update(Map{std::make_pair(n.component(), 1)});\n      },\n      galois::loopname(\"CountLargest\"));\n\n  Map& map    = accumMap.reduce();\n  size_t reps = accumReps.reduce();\n\n  using ComponentSizePair = std::pair<component_type, int>;\n\n  auto 
sizeMax = [](const ComponentSizePair& a, const ComponentSizePair& b) {\n    if (a.second > b.second) {\n      return a;\n    }\n    return b;\n  };\n\n  auto identity = []() { return ComponentSizePair{}; };\n\n  auto maxComp = galois::make_reducible(sizeMax, identity);\n\n  galois::do_all(galois::iterate(map),\n                 [&](const ComponentSizePair& x) { maxComp.update(x); });\n\n  ComponentSizePair largest = maxComp.reduce();\n\n  // Compensate for dropping representative node of components\n  double ratio       = graph.size() - reps + map.size();\n  size_t largestSize = largest.second + 1;\n  if (ratio) {\n    ratio = largestSize / ratio;\n  }\n\n  std::cout << \"Total components: \" << reps << \"\\n\";\n  std::cout << \"Number of non-trivial components: \" << map.size()\n            << \" (largest size: \" << largestSize << \" [\" << ratio << \"])\\n\";\n\n  return largest.first;\n}\n\ntemplate <typename Graph>\nvoid initialize(Graph&) {}\n\ntemplate <>\nvoid initialize<LabelPropAlgo::Graph>(typename LabelPropAlgo::Graph& graph) {\n  unsigned int id = 0;\n\n  for (typename LabelPropAlgo::Graph::iterator ii = graph.begin(),\n                                               ei = graph.end();\n       ii != ei; ++ii, ++id) {\n    graph.getData(*ii).comp_current = id;\n    graph.getData(*ii).comp_old     = LABEL_INF;\n  }\n}\n\ntemplate <typename Algo>\nvoid run() {\n  using Graph = typename Algo::Graph;\n\n  Algo algo;\n  Graph graph;\n\n  algo.readGraph(graph);\n  std::cout << \"Read \" << graph.size() << \" nodes\\n\";\n\n  initialize(graph);\n\n  galois::preAlloc(numThreads +\n                   (3 * graph.size() * sizeof(typename Graph::node_data_type)) /\n                       galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  algo(graph);\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  if (!skipVerify || largestComponentFilename != 
\"\" ||\n      permutationFilename != \"\") {\n    findLargest<Algo, Graph>(graph);\n    if (!verify(graph)) {\n      GALOIS_DIE(\"verification failed\");\n    }\n  }\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, nullptr, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  switch (algo) {\n  case Algo::async:\n    run<AsyncAlgo>();\n    break;\n  case Algo::edgeasync:\n    run<EdgeAsyncAlgo>();\n    break;\n  case Algo::edgetiledasync:\n    run<EdgeTiledAsyncAlgo>();\n    break;\n  case Algo::blockedasync:\n    run<BlockedAsyncAlgo>();\n    break;\n  case Algo::labelProp:\n    run<LabelPropAlgo>();\n    break;\n  case Algo::serial:\n    run<SerialAlgo>();\n    break;\n  case Algo::synchronous:\n    run<SynchronousAlgo>();\n    break;\n  case Algo::afforest:\n    run<AfforestAlgo>();\n    break;\n  case Algo::edgeafforest:\n    run<EdgeAfforestAlgo>();\n    break;\n  case Algo::edgetiledafforest:\n    run<EdgeTiledAfforestAlgo>();\n    break;\n\n  default:\n    std::cerr << \"Unknown algorithm\\n\";\n    abort();\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/connected-components/README.md",
    "content": "Weakly Connected components\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nFind all connected components of an undirected (symmetric) graph. Set the same\nlabel to nodes which belong to the same component.\nTwo major categories of algorithm: Pointer Jumping and Label Propagation.\n\nPointer Jumping is based on union-find: each node's label is a pointer\nto its representative. We merge endpoints of edges to form the spanning tree.\nMerging is done in two phases to simplify concurrent updates: (1) find components,\nupdate the pointer to reduce the depth of the tree and (2) union components,\nupdate nodes connected by edges.\n\nIn Label Propagation, each node is marked with a unique label, and vertex\nlabels are propagated through neighboring vertices until all the vertices in\nthe same component are labelled with the same unique ID.\n\n  - Serial: Serial pointer-jumping implementation.\n  - Synchronous: Bulk synchronous data-driven implementation.\n    Alternates execution between two worklists.\n  - Async: Asynchronous topology-driven implementation. Work unit is a node.\n  - BlockedAsync: Asynchronous topology-driven implementation with NUMA-aware\n    optimization. Work unit is a node.\n  - EdgeAsync: Asynchronous topology-driven. Work unit is an edge.\n  - EdgetiledAsync (default): Asynchronous topology-driven.\n    Work unit is an edge tile.\n  - LabelProp: Label propagation implementation.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric Galois .gr graphs.\nYou must specify the -symmetricGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake in the BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/cpu/connected-components; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run the default algorithm (edgetiledasync), use the following:\n-`$ ./connected-components-cpu <input-graph (symmetric)> -t=<num-threads> -symmetricGraph`\n\nTo run a specific algorithm, use the following:\n-`$ ./connected-components-cpu <input-graph (symmetric)> -t=<num-threads> -algo=<algorithm> -symmetricGraph`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\nDefault algorithm 'edgetiledasync' works best on rmat25, r4-2e26, roadUSA graphs\namong all algorithms. Two parameters 'EDGE_TILE_SIZE' and 'CHUNK_SIZE'\n(granularity of work stealing) are crucial to performance and have to be tuned on\ndifferent platforms. They are set to be 512 and 1 respectively by default.\nLabel propagation is the best if the input graph is randomized,\ni.e. node IDs are randomized and the highest-degree node is not node 0.\n"
  },
  {
    "path": "lonestar/analytics/cpu/gmetis/CMakeLists.txt",
    "content": "add_executable(gmetis-cpu Coarsening.cpp GMetis.cpp Metric.cpp Partitioning.cpp Refine.cpp)\nadd_dependencies(apps gmetis-cpu)\ntarget_link_libraries(gmetis-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS gmetis-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\n# Disable failing test (issue #116).\nadd_test_scale(small1 gmetis-cpu \"${BASEINPUT}/reference/structured/rome99.gr\" \"-numPartitions=4\" NOT_QUICK)\nadd_test_scale(small2 gmetis-cpu \"${BASEINPUT}/scalefree/rmat10.gr\" \"-numPartitions=256\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/gmetis/Coarsening.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Metis.h\"\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"galois/gstl.h\"\n\n#include <iostream>\n\nnamespace {\n\nusing MatchingPolicy    = GNode(GNode, GGraph*);\nusing MatchingSubPolicy = std::pair<GNode, int>(GNode, GGraph*, bool tag);\n\nstd::pair<GNode, int> HEMmatch(GNode node, GGraph* graph, bool) {\n  GNode retval = node; // match self if nothing else\n  int maxwgt   = std::numeric_limits<int>::min();\n  //    nume += std::distance(graph->edge_begin(node), graph->edge_end(node));\n  for (auto jj : graph->edges(node, galois::MethodFlag::UNPROTECTED)) {\n    //      ++checked;\n    GNode neighbor = graph->getEdgeDst(jj);\n    MetisNode& neighMNode =\n        graph->getData(neighbor, galois::MethodFlag::UNPROTECTED);\n    int 
edgeData = graph->getEdgeData(jj, galois::MethodFlag::UNPROTECTED);\n    if (!neighMNode.isMatched() && neighbor != node && maxwgt < edgeData) {\n      maxwgt = edgeData;\n      retval = neighbor;\n    }\n  }\n  return std::make_pair(retval, maxwgt);\n}\nGNode HEMmatch(GNode node, GGraph* graph) {\n  return HEMmatch(node, graph, true).first;\n}\n\nGNode RMmatch(GNode node, GGraph* graph) {\n  for (auto jj : graph->edges(node, galois::MethodFlag::UNPROTECTED)) {\n    GNode neighbor = graph->getEdgeDst(jj);\n    if (!graph->getData(neighbor, galois::MethodFlag::UNPROTECTED)\n             .isMatched() &&\n        neighbor != node)\n      return neighbor;\n  }\n  return node;\n  // Don't actually do random, just choose first\n}\n// std::pair<GNode, int> RMmatch(GNode node, GGraph* graph, bool tag) {\n//  return std::make_pair(RMmatch(node, graph), 0);\n//}\n\ntemplate <MatchingSubPolicy matcher>\nGNode TwoHopMatcher(GNode node, GGraph* graph) {\n  std::pair<GNode, int> retval(node, std::numeric_limits<int>::min());\n  for (auto jj : graph->edges(node, galois::MethodFlag::UNPROTECTED)) {\n    GNode neighbor             = graph->getEdgeDst(jj);\n    std::pair<GNode, int> tval = matcher(neighbor, graph, true);\n    if (tval.first != node && tval.first != neighbor &&\n        tval.second > retval.second)\n      retval = tval;\n  }\n  return retval.first;\n}\n\ntypedef galois::GAccumulator<unsigned> Pcounter;\n\n/*\n *This function is responsible for matching.\n 1. There are two types of matching. Random and Heavy Edge matching\n 2. Random matching picks any random node above a threshold and matches the\n nodes. RM.h\n 3. Heavy Edge Matching matches the vertex which is connected by the heaviest\n edge. HEM.h\n 4. This function can also create the multinode, i.e. the node which is created\n on combining two matched nodes.\n 5. 
You can enable/disable 4th by changing variantMetis::mergeMatching\n*/\ntemplate <MatchingPolicy matcher, typename WL>\nvoid parallelMatchAndCreateNodes(MetisGraph* graph, Pcounter& pc,\n                                 GNodeBag& noEdgeBag, bool selfMatch) {\n  GGraph* fineGGraph   = graph->getFinerGraph()->getGraph();\n  GGraph* coarseGGraph = graph->getGraph();\n  assert(fineGGraph != coarseGGraph);\n\n  galois::for_each(\n      galois::iterate(*fineGGraph),\n      [&](GNode item, galois::UserContext<GNode>&) {\n        if (fineGGraph->getData(item).isMatched())\n          return;\n\n        if (fineGGraph->edge_begin(item, galois::MethodFlag::UNPROTECTED) ==\n            fineGGraph->edge_end(item, galois::MethodFlag::UNPROTECTED)) {\n          noEdgeBag.push(item);\n          return;\n        }\n\n        GNode ret;\n        do {\n          ret = matcher(item, fineGGraph);\n          // lock ret, since we found it lock-free it may be matched, so try\n          // again\n        } while (fineGGraph->getData(ret).isMatched());\n\n        // at this point both ret and item (and failed matches) are locked.\n        // We do not leave the above loop until we both have the lock on\n        // the node and check the matched status of the locked node.  
the\n        // lock before (final) read ensures that we will see any write to\n        // matched\n\n        unsigned numEdges = std::distance(\n            fineGGraph->edge_begin(item, galois::MethodFlag::UNPROTECTED),\n            fineGGraph->edge_end(item, galois::MethodFlag::UNPROTECTED));\n        // assert(numEdges == std::distance(fineGGraph->edge_begin(item),\n        // fineGGraph->edge_end(item)));\n\n        GNode N;\n        if (ret != item) {\n          // match found\n          numEdges += std::distance(\n              fineGGraph->edge_begin(ret, galois::MethodFlag::UNPROTECTED),\n              fineGGraph->edge_end(ret, galois::MethodFlag::UNPROTECTED));\n          // Cautious point\n          N = coarseGGraph->createNode(numEdges,\n                                       fineGGraph->getData(item).getWeight() +\n                                           fineGGraph->getData(ret).getWeight(),\n                                       item, ret);\n          fineGGraph->getData(item).setMatched();\n          fineGGraph->getData(ret).setMatched();\n          fineGGraph->getData(item).setParent(N);\n          fineGGraph->getData(ret).setParent(N);\n        } else {\n          // assertAllMatched(item, fineGGraph);\n          // Cautious point\n          // no match\n          if (selfMatch) {\n            pc.update(1U);\n            N = coarseGGraph->createNode(\n                numEdges, fineGGraph->getData(item).getWeight(), item);\n            fineGGraph->getData(item).setMatched();\n            fineGGraph->getData(item).setParent(N);\n          }\n        }\n      },\n      galois::wl<WL>(), galois::no_pushes(), galois::loopname(\"match\"));\n}\n\n/*\n * This function is responsible for doing a union find of the edges\n * between matched nodes and populate the edges in the coarser graph\n * node.\n */\nvoid createCoarseEdges(MetisGraph* graph) {\n  GGraph* coarseGGraph = graph->getGraph();\n  GGraph* fineGGraph   = graph->getFinerGraph()->getGraph();\n  
assert(fineGGraph != coarseGGraph);\n\n  typedef galois::gstl::Vector<std::pair<GNode, unsigned>> VecTy;\n  typedef galois::substrate::PerThreadStorage<VecTy> ThreadLocalData;\n  ThreadLocalData edgesThreadLocal;\n\n  galois::do_all(\n      galois::iterate(*coarseGGraph),\n      [&](GNode node) {\n        //    std::cout << 'p';\n        // fineGGraph is read only in this loop, so skip locks\n        MetisNode& nodeData =\n            coarseGGraph->getData(node, galois::MethodFlag::UNPROTECTED);\n\n        auto& edges = *edgesThreadLocal.getLocal();\n        edges.clear();\n        for (unsigned x = 0; x < nodeData.numChildren(); ++x) {\n          for (auto ii : fineGGraph->edges(nodeData.getChild(x),\n                                           galois::MethodFlag::UNPROTECTED)) {\n            GNode dst = fineGGraph->getEdgeDst(ii);\n            GNode p = fineGGraph->getData(dst, galois::MethodFlag::UNPROTECTED)\n                          .getParent();\n            edges.emplace_back(p, fineGGraph->getEdgeData(\n                                      ii, galois::MethodFlag::UNPROTECTED));\n          }\n        }\n\n        // slightly faster not ordering by edge weight\n        // std::sort(edges.begin(), edges.end(), [] (const std::pair<GNode,\n        // unsigned>& lhs, const std::pair<GNode, unsigned>& rhs) { return\n        // lhs.first < rhs.first; } );\n\n        // insert edges\n        for (auto pp = edges.begin(), ep = edges.end(); pp != ep;) {\n          GNode dst    = pp->first;\n          unsigned sum = pp->second;\n          ++pp;\n          if (node != dst) { // no self edges\n            while (pp != ep && pp->first == dst) {\n              sum += pp->second;\n              ++pp;\n            }\n            coarseGGraph->addMultiEdge(node, dst,\n                                       galois::MethodFlag::UNPROTECTED, sum);\n          }\n        }\n        //    assert(e);\n        // nodeData.setNumEdges(e);\n      },\n      galois::steal(), 
galois::loopname(\"popedge\"));\n}\n\nstruct HighDegreeIndexer {\n  static GGraph* indexgraph;\n  unsigned int operator()(const GNode& val) const {\n    return indexgraph->getData(val, galois::MethodFlag::UNPROTECTED)\n                   .isFailedMatch()\n               ? std::numeric_limits<unsigned int>::max()\n               : (std::numeric_limits<unsigned int>::max() -\n                  ((std::distance(indexgraph->edge_begin(\n                                      val, galois::MethodFlag::UNPROTECTED),\n                                  indexgraph->edge_end(\n                                      val, galois::MethodFlag::UNPROTECTED))) >>\n                   2));\n  }\n};\nGGraph* HighDegreeIndexer::indexgraph = 0;\n\nstruct LowDegreeIndexer {\n  unsigned int operator()(const GNode& val) const {\n    unsigned x = std::distance(HighDegreeIndexer::indexgraph->edge_begin(\n                                   val, galois::MethodFlag::UNPROTECTED),\n                               HighDegreeIndexer::indexgraph->edge_end(\n                                   val, galois::MethodFlag::UNPROTECTED));\n    return x; // >> 2;\n    // int targetlevel = 0;\n    // while (x >>= 1) ++targetlevel;\n    // return targetlevel;\n  }\n};\n\nstruct WeightIndexer {\n  int operator()(const GNode& val) const {\n    return HighDegreeIndexer::indexgraph\n        ->getData(val, galois::MethodFlag::UNPROTECTED)\n        .getWeight();\n  }\n};\n\n/*unsigned minRuns(unsigned coarsenTo, unsigned size) {\n  unsigned num = 0;\n  while (coarsenTo < size) {\n    ++num;\n    size /= 2;\n  }\n  return num;\n}*/\n\nunsigned fixupLoners(GNodeBag& b, GGraph* coarseGGraph, GGraph* fineGGraph) {\n  unsigned count = 0;\n  auto ii = b.begin(), ee = b.end();\n  while (ii != ee) {\n    auto i2 = ii;\n    ++i2;\n    if (i2 != ee) {\n      GNode N =\n          coarseGGraph->createNode(0,\n                                   fineGGraph->getData(*ii).getWeight() +\n                                       
fineGGraph->getData(*i2).getWeight(),\n                                   *ii, *i2);\n      fineGGraph->getData(*ii).setMatched();\n      fineGGraph->getData(*i2).setMatched();\n      fineGGraph->getData(*ii).setParent(N);\n      fineGGraph->getData(*i2).setParent(N);\n      ++ii;\n      ++count;\n    } else {\n      GNode N = coarseGGraph->createNode(\n          0, fineGGraph->getData(*ii).getWeight(), *ii);\n      fineGGraph->getData(*ii).setMatched();\n      fineGGraph->getData(*ii).setParent(N);\n    }\n    ++ii;\n  }\n  return count;\n}\n\nunsigned findMatching(MetisGraph* coarseMetisGraph, bool useRM, bool use2Hop,\n                      bool verbose) {\n  MetisGraph* fineMetisGraph = coarseMetisGraph->getFinerGraph();\n\n  /*\n   * Different worklist versions tried, PerSocketChunkFIFO 256 works best with\n   * LC_MORPH_graph. Another good type would be Lazy Iter.\n   */\n  // typedef galois::worklists::ChunkLIFO<64, GNode> WL;\n  // typedef\n  // galois::worklists::LazyIter<decltype(fineGGraph->local_begin()),false> WL;\n\n  GNodeBag bagOfLoners;\n  Pcounter pc;\n\n  bool useOBIM = true;\n\n  typedef galois::worklists::StableIterator<true> WL;\n  if (useRM) {\n    parallelMatchAndCreateNodes<RMmatch, WL>(coarseMetisGraph, pc, bagOfLoners,\n                                             !use2Hop);\n  } else {\n    // FIXME: use obim for SHEM matching\n    typedef galois::worklists::PerSocketChunkLIFO<32> Chunk;\n    // typedef galois::worklists::OrderedByIntegerMetric<WeightIndexer, Chunk>\n    // pW;\n    typedef galois::worklists::OrderedByIntegerMetric<LowDegreeIndexer, Chunk>\n        pLD;\n    // typedef galois::worklists::OrderedByIntegerMetric<HighDegreeIndexer,\n    // Chunk> pHD;\n\n    HighDegreeIndexer::indexgraph = fineMetisGraph->getGraph();\n    if (useOBIM)\n      parallelMatchAndCreateNodes<HEMmatch, pLD>(coarseMetisGraph, pc,\n                                                 bagOfLoners, !use2Hop);\n    else\n      
parallelMatchAndCreateNodes<HEMmatch, WL>(coarseMetisGraph, pc,\n                                                bagOfLoners, !use2Hop);\n  }\n  unsigned c = fixupLoners(bagOfLoners, coarseMetisGraph->getGraph(),\n                           fineMetisGraph->getGraph());\n  if (verbose && c)\n    std::cout << \"\\n\\tLone Matches \" << c;\n  if (use2Hop) {\n    typedef galois::worklists::PerSocketChunkLIFO<32> Chunk;\n    // typedef galois::worklists::OrderedByIntegerMetric<WeightIndexer, Chunk>\n    // pW;\n    typedef galois::worklists::OrderedByIntegerMetric<LowDegreeIndexer, Chunk>\n        pLD;\n    // typedef galois::worklists::OrderedByIntegerMetric<HighDegreeIndexer,\n    // Chunk> pHD;\n\n    HighDegreeIndexer::indexgraph = fineMetisGraph->getGraph();\n    Pcounter pc2;\n    if (useOBIM)\n      parallelMatchAndCreateNodes<TwoHopMatcher<HEMmatch>, pLD>(\n          coarseMetisGraph, pc2, bagOfLoners, true);\n    else\n      parallelMatchAndCreateNodes<TwoHopMatcher<HEMmatch>, WL>(\n          coarseMetisGraph, pc2, bagOfLoners, true);\n    return pc2.reduce();\n  }\n  return pc.reduce();\n}\n\nMetisGraph* coarsenOnce(MetisGraph* fineMetisGraph, unsigned& rem, bool useRM,\n                        bool with2Hop, bool verbose) {\n  MetisGraph* coarseMetisGraph = new MetisGraph(fineMetisGraph);\n  galois::Timer t, t2;\n  if (verbose)\n    t.start();\n  rem = findMatching(coarseMetisGraph, useRM, with2Hop, verbose);\n  if (verbose) {\n    t.stop();\n    std::cout << \"\\n\\tTime Matching \" << t.get() << \"\\n\";\n    t2.start();\n  }\n  createCoarseEdges(coarseMetisGraph);\n  if (verbose) {\n    t2.stop();\n    std::cout << \"\\tTime Creating \" << t2.get() << \"\\n\";\n  }\n  return coarseMetisGraph;\n}\n\n} // namespace\n\nMetisGraph* coarsen(MetisGraph* fineMetisGraph, unsigned coarsenTo,\n                    bool verbose) {\n  MetisGraph* coarseGraph = fineMetisGraph;\n  unsigned size           = std::distance(fineMetisGraph->getGraph()->begin(),\n              
                  fineMetisGraph->getGraph()->end());\n  unsigned iterNum        = 0;\n  bool with2Hop           = false;\n  unsigned stat           = 0;\n  while (true) { // overflow\n    if (verbose) {\n      std::cout << \"Coarsening \" << iterNum << \"\\t\";\n      stat = graphStat(*coarseGraph->getGraph());\n    }\n    unsigned rem     = 0;\n    coarseGraph      = coarsenOnce(coarseGraph, rem, false, with2Hop, verbose);\n    unsigned newSize = size / 2 + rem / 2;\n    if (verbose) {\n      std::cout << \"\\tTO\\t\";\n      unsigned stat2 = graphStat(*coarseGraph->getGraph());\n      std::cout << \"\\n\\tRatio \" << (double)stat2 / (double)stat << \" REM \"\n                << rem << \" new size \" << newSize << \"\\n\";\n    }\n\n    if (size * 3 < newSize * 4) {\n      with2Hop = true;\n      if (verbose)\n        std::cout << \"** Enabling 2 hop matching\\n\";\n    } else {\n      with2Hop = false;\n    }\n\n    size = newSize;\n    if (newSize * 4 < coarsenTo) { // be more exact near the end\n      size = std::distance(coarseGraph->getGraph()->begin(),\n                           coarseGraph->getGraph()->end());\n      if (size < coarsenTo)\n        break;\n    }\n    ++iterNum;\n  }\n\n  return coarseGraph;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/gmetis/GMetis.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include <vector>\n#include <set>\n#include <map>\n#include <iostream>\n#include <string.h>\n#include <stdlib.h>\n#include <numeric>\n#include <algorithm>\n#include <cmath>\n#include <fstream>\n\n#include \"Metis.h\"\n#include \"galois/graphs/ReadGraph.h\"\n#include \"galois/Timer.h\"\n//#include \"GraphReader.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/LargeArray.h\"\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"GMetis\";\nstatic const char* desc =\n    \"Partitions a graph into K parts and minimizing the graph cut\";\nstatic const char* url = \"gMetis\";\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<InitialPartMode> partMode(\n    cll::desc(\"Choose a inital part 
mode:\"),\n    cll::values(clEnumVal(GGP, \"GGP\"), clEnumVal(GGGP, \"GGGP (default)\"),\n                clEnumVal(MGGGP, \"MGGGP\")),\n    cll::init(GGGP));\nstatic cll::opt<refinementMode> refineMode(\n    cll::desc(\"Choose a refinement mode:\"),\n    cll::values(clEnumVal(BKL, \"BKL\"), clEnumVal(BKL2, \"BKL2 (default)\"),\n                clEnumVal(ROBO, \"ROBO\"), clEnumVal(GRACLUS, \"GRACLUS\")),\n    cll::init(BKL2));\n\nstatic cll::opt<bool>\n    mtxInput(\"mtxinput\",\n             cll::desc(\"Use text mtx files instead of binary galois gr files\"),\n             cll::init(false));\nstatic cll::opt<bool> weighted(\"weighted\", cll::desc(\"weighted\"),\n                               cll::init(false));\nstatic cll::opt<bool>\n    verbose(\"verbose\",\n            cll::desc(\"verbose output (debugging mode, takes extra time)\"),\n            cll::init(false));\nstatic cll::opt<std::string> outfile(\"output\",\n                                     cll::desc(\"output partition file name\"));\nstatic cll::opt<std::string>\n    orderedfile(\"ordered\", cll::desc(\"output ordered graph file name\"));\nstatic cll::opt<std::string>\n    permutationfile(\"permutation\", cll::desc(\"output permutation file name\"));\nstatic cll::opt<int> numPartitions(\"numPartitions\",\n                                   cll::desc(\"<Number of partitions>\"),\n                                   cll::Required);\nstatic cll::opt<double> imbalance(\n    \"balance\",\n    cll::desc(\"Fraction deviated from mean partition size (default 0.01)\"),\n    cll::init(0.01));\n\n// const double COARSEN_FRACTION = 0.9;\n\n/**\n * KMetis Algorithm\n */\nvoid Partition(MetisGraph* metisGraph, unsigned nparts) {\n  unsigned fineMetisGraphWeight = metisGraph->getTotalWeight();\n  unsigned meanWeight = ((double)fineMetisGraphWeight) / (double)nparts;\n  unsigned coarsenTo  = 20 * nparts;\n\n  if (verbose)\n    std::cout << \"Starting coarsening: \\n\";\n  galois::StatTimer T(\"Coarsen\");\n  
T.start();\n  auto mcg =\n      std::unique_ptr<MetisGraph>(coarsen(metisGraph, coarsenTo, verbose));\n  T.stop();\n  if (verbose)\n    std::cout << \"Time coarsen: \" << T.get() << \"\\n\";\n\n  galois::StatTimer T2(\"Partition\");\n  T2.start();\n  std::vector<partInfo> parts;\n  parts = partition(mcg.get(), fineMetisGraphWeight, nparts, partMode);\n  T2.stop();\n\n  if (verbose)\n    std::cout << \"Init edge cut : \" << computeCut(*mcg->getGraph()) << \"\\n\\n\";\n\n  std::vector<partInfo> initParts = parts;\n  if (verbose)\n    std::cout << \"Time clustering:  \" << T2.get() << '\\n';\n\n  if (verbose) {\n    switch (refineMode) {\n    case BKL2:\n      std::cout << \"Sorting refinnement with BKL2\\n\";\n      break;\n    case BKL:\n      std::cout << \"Sorting refinnement with BKL\\n\";\n      break;\n    case ROBO:\n      std::cout << \"Sorting refinnement with ROBO\\n\";\n      break;\n    case GRACLUS:\n      std::cout << \"Sorting refinnement with GRACLUS\\n\";\n      break;\n    default:\n      abort();\n    }\n  }\n\n  galois::StatTimer T3(\"Refine\");\n  T3.start();\n  refine(mcg.get(), parts, meanWeight - (unsigned)(meanWeight * imbalance),\n         meanWeight + (unsigned)(meanWeight * imbalance), refineMode, verbose);\n  T3.stop();\n  if (verbose)\n    std::cout << \"Time refinement: \" << T3.get() << \"\\n\";\n\n  std::cout << \"Initial dist\\n\";\n  printPartStats(initParts);\n  std::cout << \"\\n\";\n\n  std::cout << \"Refined dist\\n\";\n  printPartStats(parts);\n  std::cout << \"\\n\";\n}\n\ntypedef galois::graphs::FileGraph FG;\ntypedef FG::GraphNode FN;\ntemplate <typename GNode, typename Weights>\nstruct order_by_degree {\n  GGraph& graph;\n  Weights& weights;\n  order_by_degree(GGraph& g, Weights& w) : graph(g), weights(w) {}\n  bool operator()(const GNode& a, const GNode& b) {\n    uint64_t wa = weights[a];\n    uint64_t wb = weights[b];\n    int pa      = graph.getData(a, galois::MethodFlag::UNPROTECTED).getPart();\n    int pb      = 
graph.getData(b, galois::MethodFlag::UNPROTECTED).getPart();\n    if (pa != pb) {\n      return pa < pb;\n    }\n    return wa < wb;\n  }\n};\n\ntypedef galois::substrate::PerThreadStorage<std::map<GNode, uint64_t>>\n    PerThreadDegInfo;\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  srand(-1);\n  MetisGraph metisGraph;\n  GGraph& graph = *metisGraph.getGraph();\n\n  galois::graphs::readGraph(graph, inputFile);\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](GNode node) {\n        for (auto jj : graph.edges(node)) {\n          graph.getEdgeData(jj) = 1;\n          // weight+=1;\n        }\n      },\n      galois::loopname(\"initMorphGraph\"));\n\n  graphStat(graph);\n  std::cout << \"\\n\";\n\n  galois::preAlloc(galois::runtime::numPagePoolAllocTotal() * 5);\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  Partition(&metisGraph, numPartitions);\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  std::cout << \"Total edge cut: \" << computeCut(graph) << \"\\n\";\n\n  if (outfile != \"\") {\n    MetisGraph* coarseGraph = &metisGraph;\n    while (coarseGraph->getCoarserGraph())\n      coarseGraph = coarseGraph->getCoarserGraph();\n    std::ofstream outFile(outfile.c_str());\n    for (auto it = graph.begin(), ie = graph.end(); it != ie; it++) {\n      unsigned gPart = graph.getData(*it).getPart();\n      outFile << gPart << '\\n';\n    }\n  }\n\n  if (orderedfile != \"\" || permutationfile != \"\") {\n    galois::graphs::FileGraph g;\n    g.fromFile(inputFile);\n    typedef galois::LargeArray<GNode> Permutation;\n    Permutation perm;\n    perm.create(g.size());\n    std::copy(graph.begin(), graph.end(), perm.begin());\n    PerThreadDegInfo threadDegInfo;\n    std::vector<int> parts(numPartitions);\n    for (unsigned 
int i = 0; i < parts.size(); i++) {\n      parts[i] = i;\n    }\n\n    using WL = galois::worklists::PerSocketChunkFIFO<16>;\n\n    galois::for_each(\n        galois::iterate(parts),\n        [&](int part, auto& lwl) {\n          constexpr auto flag = galois::MethodFlag::UNPROTECTED;\n          typedef std::vector<\n              std::pair<unsigned, GNode>,\n              galois::PerIterAllocTy::rebind<std::pair<unsigned, GNode>>::other>\n              GD;\n          // copy and translate all edges\n          GD orderedNodes(GD::allocator_type(lwl.getPerIterAlloc()));\n          for (auto n : graph) {\n            auto& nd = graph.getData(n, flag);\n            if (static_cast<int>(nd.getPart()) == part) {\n              int edges = std::distance(graph.edge_begin(n, flag),\n                                        graph.edge_end(n, flag));\n              orderedNodes.push_back(std::make_pair(edges, n));\n            }\n          }\n          std::sort(orderedNodes.begin(), orderedNodes.end());\n          int index = 0;\n          std::map<GNode, uint64_t>& threadMap(*threadDegInfo.getLocal());\n          for (auto p : orderedNodes) {\n            GNode n = p.second;\n            threadMap[n] += index;\n            for (auto eb : graph.edges(n, flag)) {\n              GNode neigh = graph.getEdgeDst(eb);\n              auto& nd    = graph.getData(neigh, flag);\n              if (static_cast<int>(nd.getPart()) == part) {\n                threadMap[neigh] += index;\n              }\n            }\n            index++;\n          }\n        },\n        galois::wl<WL>(), galois::per_iter_alloc(),\n        galois::loopname(\"Order Graph\"));\n\n    std::map<GNode, uint64_t> globalMap;\n    for (unsigned int i = 0; i < threadDegInfo.size(); i++) {\n      std::map<GNode, uint64_t>& localMap(*threadDegInfo.getRemote(i));\n      for (auto mb = localMap.begin(), me = localMap.end(); mb != me; mb++) {\n        globalMap[mb->first] = mb->second;\n      }\n    }\n    
order_by_degree<GNode, std::map<GNode, uint64_t>> fn(graph, globalMap);\n    std::map<GNode, int> nodeIdMap;\n    int id = 0;\n    for (auto nb = graph.begin(), ne = graph.end(); nb != ne; nb++) {\n      nodeIdMap[*nb] = id;\n      id++;\n    }\n    // compute inverse\n    std::stable_sort(perm.begin(), perm.end(), fn);\n    galois::LargeArray<uint64_t> perm2;\n    perm2.create(g.size());\n    // compute permutation\n    id = 0;\n    for (auto pb = perm.begin(), pe = perm.end(); pb != pe; pb++) {\n      int prevId    = nodeIdMap[*pb];\n      perm2[prevId] = id;\n      id++;\n    }\n    galois::graphs::FileGraph out;\n    galois::graphs::permute<int>(g, perm2, out);\n    if (orderedfile != \"\")\n      out.toFile(orderedfile);\n    if (permutationfile != \"\") {\n      std::ofstream file(permutationfile.c_str());\n      galois::LargeArray<uint64_t> transpose;\n      transpose.create(g.size());\n      uint64_t id = 0;\n      for (auto& ii : perm2) {\n        transpose[ii] = id++;\n      }\n      for (auto& ii : transpose) {\n        file << ii << \"\\n\";\n      }\n    }\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/gmetis/GraphReader.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GRAPHREADER_H_\n#define GRAPHREADER_H_\n#include <fstream>\n#include <vector>\nusing namespace std;\n\ntypedef galois::graphs::LC_CSR_Graph<int, unsigned int> InputGraph;\ntypedef galois::graphs::LC_CSR_Graph<int, unsigned int>::GraphNode InputGNode;\n\nwhile (true) {\n  int index = strtol(items, &remaining, 10) - 1;\n  if (index < 0)\n    break;\n  items    = remaining;\n  GNode n2 = nodes[index];\n  if (n1 == n2) {\n    continue;\n  }\n  graph->addEdge(n1, n2, galois::MethodFlag::WRITE, 1);\n  graph->getData(n1).setEdgeWeight(graph->getData(n1).getEdgeWeight() + 1);\n  graph->getData(n1).setNumEdges(graph->getData(n1).getNumEdges() + 1);\n  countEdges++;\n}\n}\n\nparallelMakeNodes(GGraph* g, vector<GNode>& gn, InputGraph* in,\n                  galois::GAccumulator<int>& numNodes)\n    : graph(g), 
inputGraph(in), gnodes(gn), pnumNodes(numNodes) {}\nvoid operator()(InputGNode node, galois::UserContext<InputGNode>& ctx) {\n  int id     = inputGraph->getData(node);\n  GNode item = graph->createNode(100, 1); // FIXME: edge num\n  //    graph->addNode(item);\n  gnodes[id] = item;\n  pnumNodes += 1;\n}\n}\n;\n\nstruct parallelMakeEdges {\n  GGraph* graph;\n  InputGraph* inputGraph;\n  vector<GNode>& gnodes;\n  bool weighted;\n  bool directed;\n  galois::GAccumulator<int>& pnumEdges;\n\n  parallelMakeEdges(GGraph* g, vector<GNode>& gn, InputGraph* in,\n                    galois::GAccumulator<int>& numE, bool w = false,\n                    bool dir = true)\n      : graph(g), inputGraph(in), gnodes(gn), pnumEdges(numE) {\n    weighted = w;\n    directed = dir;\n  }\n\n  void operator()(InputGNode inNode, galois::UserContext<InputGNode>& ctx) {\n    int nodeId          = inputGraph->getData(inNode);\n    GNode node          = gnodes[nodeId];\n    MetisNode& nodeData = graph->getData(node);\n    for (InputGraph::edge_iterator jj   = inputGraph->edge_begin(inNode),\n                                   eejj = inputGraph->edge_end(inNode);\n         jj != eejj; ++jj) {\n      InputGNode inNeighbor = inputGraph->getEdgeDst(jj);\n      if (inNode == inNeighbor)\n        continue;\n      int neighId = inputGraph->getData(inNeighbor);\n      int weight  = 1;\n      if (weighted) {\n        weight = inputGraph->getEdgeData(jj);\n      }\n      graph->addEdge(node, gnodes[neighId], galois::MethodFlag::WRITE, weight);\n      nodeData.setNumEdges(nodeData.getNumEdges() + 1);\n      nodeData.setEdgeWeight(nodeData.getEdgeWeight() + weight);\n      /*if(!directed){\n        graph->getEdgeData(graph->addEdge(node, gnodes[neighId])) = weight;//\n        nodeData.incNumEdges();\n        nodeData.addEdgeWeight(weight);\n        }else{\n        graph->getEdgeData(graph->addEdge(node, gnodes[neighId])) = weight;\n        graph->getEdgeData(graph->addEdge(gnodes[neighId], node)) = 
weight;\n        }*/\n      pnumEdges += 1;\n    }\n  }\n};\n\nvoid readGraph(MetisGraph* metisGraph, const char* filename,\n               bool weighted = false, bool directed = true) {\n  InputGraph inputGraph;\n  galois::graphs::readGraph(inputGraph, filename);\n  cout << \"start to transfer data to GGraph\\n\";\n  int id = 0;\n  for (InputGraph::iterator ii = inputGraph.begin(), ee = inputGraph.end();\n       ii != ee; ++ii) {\n    InputGNode node          = *ii;\n    inputGraph.getData(node) = id++;\n  }\n\n  GGraph* graph = metisGraph->getGraph();\n  vector<GNode> gnodes(inputGraph.size());\n  id = 0;\n  /*for(uint64_t i=0;i<inputGraph.size();i++){\n    GNode node = graph->createNode(MetisNode(id, 1));\n    graph->addNode(node);\n    gnodes[id++] = node;\n    }*/\n\n  typedef galois::worklists::PerSocketChunkFIFO<256> WL;\n  galois::GAccumulator<int> pnumNodes;\n  galois::GAccumulator<int> pnumEdges;\n\n  galois::Timer t;\n  t.start();\n  galois::for_each<WL>(inputGraph.begin(), inputGraph.end(),\n                       parallelMakeNodes(graph, gnodes, &inputGraph, pnumNodes),\n                       \"NodesLoad\");\n  t.stop();\n  cout << t.get() << \" ms\\n\";\n  t.start();\n  galois::for_each<WL>(\n      inputGraph.begin(), inputGraph.end(),\n      parallelMakeEdges(graph, gnodes, &inputGraph, pnumEdges, weighted, true),\n      \"EdgesLoad\");\n  t.stop();\n  cout << t.get() << \" ms\\n\";\n\n  int numNodes = pnumNodes.reduce();\n  int numEdges = pnumEdges.reduce();\n\n  cout << \"Done Reading Graph \";\n  cout << \"numNodes: \" << numNodes << \"|numEdges: \" << numEdges / 2 << \"\\n\";\n}\n\n#endif /* GRAPHREADER_H_ */\n"
  },
  {
    "path": "lonestar/analytics/cpu/gmetis/Metis.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef METIS_H_\n#define METIS_H_\n\n#include \"galois/graphs/LC_Morph_Graph.h\"\n\nclass MetisNode;\nusing GGraph   = galois::graphs::LC_Morph_Graph<MetisNode, int>;\nusing GNode    = GGraph::GraphNode;\nusing GNodeBag = galois::InsertBag<GNode>;\n\n// algorithms\nenum InitialPartMode { GGP, GGGP, MGGGP };\nenum refinementMode { BKL, BKL2, ROBO, GRACLUS };\n// Nodes in the metis graph\nclass MetisNode {\n\n  struct coarsenData {\n    int matched : 1;\n    int failedmatch : 1;\n    GNode parent;\n  };\n  struct refineData {\n    unsigned partition;\n    unsigned oldPartition;\n    bool maybeBoundary;\n  };\n  struct partitionData {\n    bool locked;\n  };\n\n  partitionData pd;\n\n  void initCoarsen() {\n    data.cd.matched     = false;\n    data.cd.failedmatch = false;\n    data.cd.parent      = NULL;\n  
}\n\npublic:\n  void initPartition() { pd.locked = false; }\n  // int num;\n  explicit MetisNode(int weight) : _weight(weight) {\n    initCoarsen();\n    initPartition();\n  }\n\n  MetisNode(unsigned weight, GNode child0, GNode child1 = NULL)\n      : _weight(weight) {\n    initCoarsen();\n    initPartition();\n    children[0] = child0;\n    children[1] = child1;\n  }\n\n  MetisNode() : _weight(1) {\n    initCoarsen();\n    initPartition();\n  }\n\n  // call to switch data to refining\n  void initRefine(unsigned part = 0, bool bound = false) {\n    refineData rd = {part, part, bound};\n    data.rd       = rd;\n  }\n\n  int getWeight() const { return _weight; }\n  void setWeight(int weight) { _weight = weight; }\n\n  void setParent(GNode p) { data.cd.parent = p; }\n  GNode getParent() const {\n    assert(data.cd.parent);\n    return data.cd.parent;\n  }\n\n  void setMatched() { data.cd.matched = true; }\n  bool isMatched() const { return data.cd.matched; }\n\n  void setFailedMatch() { data.cd.failedmatch = true; }\n  bool isFailedMatch() const { return data.cd.failedmatch; }\n\n  GNode getChild(unsigned x) const { return children[x]; }\n  unsigned numChildren() const { return children[1] ? 
2 : 1; }\n\n  unsigned getPart() const { return data.rd.partition; }\n  void setPart(unsigned val) { data.rd.partition = val; }\n\n  int getOldPart() const { return data.rd.oldPartition; }\n  void OldPartCpyNew() { data.rd.oldPartition = data.rd.partition; }\n\n  bool getmaybeBoundary() const { return data.rd.maybeBoundary; }\n  void setmaybeBoundary(bool val) { data.rd.maybeBoundary = val; }\n\n  void setLocked(bool locked) { pd.locked = locked; }\n  bool isLocked() { return pd.locked; }\n\nprivate:\n  union {\n    coarsenData cd;\n    refineData rd;\n  } data;\n\n  GNode children[2];\n  unsigned _weight;\n};\n\n// Structure to keep track of graph hirarchy\nclass MetisGraph {\n  MetisGraph* coarser;\n  MetisGraph* finer;\n\n  GGraph graph;\n\npublic:\n  MetisGraph() : coarser(0), finer(0) {}\n\n  explicit MetisGraph(MetisGraph* finerGraph) : coarser(0), finer(finerGraph) {\n    finer->coarser = this;\n  }\n\n  const GGraph* getGraph() const { return &graph; }\n  GGraph* getGraph() { return &graph; }\n  MetisGraph* getFinerGraph() const { return finer; }\n  MetisGraph* getCoarserGraph() const { return coarser; }\n\n  unsigned getNumNodes() { return std::distance(graph.begin(), graph.end()); }\n\n  unsigned getTotalWeight() {\n    MetisGraph* f = this;\n    while (f->finer)\n      f = f->finer;\n    return std::distance(f->graph.begin(), f->graph.end());\n  }\n};\n\n// Structure to store working partition information\nstruct partInfo {\n  unsigned partNum;\n  unsigned partMask;\n  unsigned partWeight;\n\n  explicit partInfo(unsigned mw) : partNum(0), partMask(1), partWeight(mw) {}\n\n  partInfo() : partNum(~0), partMask(~0), partWeight(~0) {}\n\n  partInfo(unsigned pn, unsigned pm, unsigned pw)\n      : partNum(pn), partMask(pm), partWeight(pw) {}\n\n  unsigned splitID() const { return partNum | partMask; }\n\n  std::pair<unsigned, unsigned> splitRatio(unsigned numParts) {\n    unsigned L = 0, R = 0;\n    unsigned LM = partMask - 1; // 00100 -> 00011\n    for 
(unsigned x = 0; x < numParts; ++x)\n      if ((x & LM) == partNum) {\n        if (x & partMask)\n          ++R;\n        else\n          ++L;\n      }\n    return std::make_pair(L, R);\n  }\n\n  partInfo split() {\n    partInfo np(splitID(), partMask << 1, 0);\n    partMask <<= 1;\n    return np;\n  }\n};\n\nstd::ostream& operator<<(std::ostream& os, const partInfo& p);\n\n// Metrics\nvoid printPartStats(std::vector<partInfo>&);\nunsigned graphStat(GGraph& graph);\nstd::vector<unsigned> edgeCut(GGraph& g, unsigned nparts);\nvoid printCuts(const char* str, MetisGraph* g, unsigned numPartitions);\nunsigned computeCut(GGraph& g);\n\n// Coarsening\nMetisGraph* coarsen(MetisGraph* fineMetisGraph, unsigned coarsenTo,\n                    bool verbose);\n\n// Partitioning\nstd::vector<partInfo> partition(MetisGraph* coarseMetisGraph,\n                                unsigned fineMetisGraphWeight,\n                                unsigned numPartitions,\n                                InitialPartMode partMode);\nstd::vector<partInfo> BisectAll(MetisGraph* mcg, unsigned numPartitions,\n                                unsigned maxSize);\n// Refinement\nvoid refine(MetisGraph* coarseGraph, std::vector<partInfo>& parts,\n            unsigned minSize, unsigned maxSize, refinementMode refM,\n            bool verbose);\n// void refinePart(GGraph& g, std::vector<partInfo>& parts, unsigned maxSize);\n// Balancing\nvoid balance(MetisGraph* Graph, std::vector<partInfo>& parts, unsigned maxSize);\n\n#endif\n"
  },
  {
    "path": "lonestar/analytics/cpu/gmetis/Metric.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Metis.h\"\n\n#include <iomanip>\n#include <iostream>\n#include <numeric>\n\nstruct onlineStat {\n  unsigned num;\n  unsigned val;\n  double valSQ;\n  unsigned mmin;\n  unsigned mmax;\n\n  onlineStat()\n      : num(0), val(0), valSQ(0), mmin(std::numeric_limits<unsigned>::max()),\n        mmax(0) {}\n\n  void add(unsigned v) {\n    ++num;\n    val += v;\n    valSQ += (double)v * (double)v;\n    mmin = std::min(v, mmin);\n    mmax = std::max(v, mmax);\n  }\n\n  double mean() { return (double)val / (double)num; }\n\n  double variance() {\n    double t = valSQ / (double)num;\n    double m = mean();\n    return t - m * m;\n  }\n\n  unsigned count() { return num; }\n  unsigned total() { return val; }\n  unsigned min() { return mmin; }\n  unsigned max() { return mmax; }\n};\n\nunsigned graphStat(GGraph& graph) {\n 
 onlineStat e;\n  for (auto ii = graph.begin(), ee = graph.end(); ii != ee; ++ii) {\n    unsigned val = std::distance(graph.edge_begin(*ii), graph.edge_end(*ii));\n    e.add(val);\n  }\n  std::cout << \"Nodes \" << e.count() << \" Edges(total, var, min, max) \"\n            << e.total() << \" \" << e.variance() << \" \" << e.min() << \" \"\n            << e.max();\n  return e.count();\n}\n\nstd::vector<unsigned> edgeCut(GGraph& g, unsigned nparts) {\n  std::vector<unsigned> cuts(nparts);\n\n  // find boundary nodes with positive gain\n  for (auto nn : g) {\n    unsigned gPart = g.getData(nn).getPart();\n    for (auto ii : g.edges(nn)) {\n      auto& m = g.getData(g.getEdgeDst(ii));\n      if (m.getPart() != gPart) {\n        cuts.at(gPart) += g.getEdgeData(ii);\n      }\n    }\n  }\n  return cuts;\n}\n\nunsigned computeCut(GGraph& g) {\n  unsigned cuts = 0;\n  for (auto nn : g) {\n    unsigned gPart = g.getData(nn).getPart();\n    for (auto ii : g.edges(nn)) {\n      auto& m = g.getData(g.getEdgeDst(ii));\n      if (m.getPart() != gPart)\n        cuts += g.getEdgeData(ii);\n    }\n  }\n  return cuts / 2;\n}\n\nvoid printPartStats(std::vector<partInfo>& parts) {\n  onlineStat e;\n  assert(!parts.empty());\n  for (unsigned x = 0; x < parts.size(); ++x) {\n    e.add(parts[x].partWeight);\n  }\n  std::cout << \"target \" << e.total() / e.count() << \" var \" << e.variance()\n            << \" min \" << e.min() << \" max \" << e.max() << \"\\n\";\n}\n\nstd::ostream& operator<<(std::ostream& os, const partInfo& p) {\n  os << \"Num \" << std::setw(3) << p.partNum << \"\\tmask \" << std::setw(5)\n     << std::hex << p.partMask << std::dec << \"\\tweight \" << p.partWeight;\n  return os;\n}\n\nvoid printCuts(const char* str, MetisGraph* g, unsigned numPartitions) {\n  std::vector<unsigned> ec = edgeCut(*g->getGraph(), numPartitions);\n  std::cout << str << \" Edge Cuts:\\n\";\n  for (unsigned x = 0; x < ec.size(); ++x)\n    std::cout << (x == 0 ? 
\"\" : \" \") << ec[x];\n  std::cout << \"\\n\";\n  std::cout << str << \" Average Edge Cut: \"\n            << (std::accumulate(ec.begin(), ec.end(), 0) / ec.size()) << \"\\n\";\n  std::cout << str\n            << \" Minimum Edge Cut: \" << *std::min_element(ec.begin(), ec.end())\n            << \"\\n\";\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/gmetis/Partitioning.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include \"Metis.h\"\n#include <set>\n#include \"galois/Galois.h\"\n#include <map>\n#include <set>\n#include <cstdlib>\n#include <iostream>\n#include <stack>\n#include <climits>\nconst bool multiSeed = true;\n\nnamespace {\n\n// gain of moving n from it's current part to new part\nint gain_limited(GGraph& g, GNode n, unsigned newpart,\n                 galois::MethodFlag flag) {\n  int retval     = 0;\n  unsigned nPart = g.getData(n, flag).getPart();\n  for (auto ii : g.edges(n, flag)) {\n    GNode neigh = g.getEdgeDst(ii);\n    auto nData  = g.getData(neigh, flag);\n    if (nData.getPart() == nPart)\n      retval -= g.getEdgeData(ii, flag);\n    else if (nData.getPart() == newpart)\n      retval += g.getEdgeData(ii, flag);\n  }\n  return retval;\n}\n\nGNode 
findSeed(GGraph& g, unsigned partNum, int partWeight,\n               galois::MethodFlag flag) {\n  // pick a seed\n\n  int rWeight = (int)(drand48() * (partWeight));\n  GNode seed  = *g.begin();\n  /*std::vector<std::pair<int,GNode> > nodeEd;\n  for (auto ii = g.begin(), ee = g.end(); ii != ee; ++ii) {\n    if (g.getData(*ii, flag).getPart() == partNum) {\n      seed = *ii;\n      nodeEd.push_back(std::make_pair(std::distance(g.edge_begin(*ii),g.edge_end(*ii)),*ii));\n    }\n  }\n  std::sort(nodeEd.begin(),nodeEd.end());\n  return nodeEd[nodeEd.size()-1].second;\n  */\n  for (auto ii = g.begin(), ee = g.end(); ii != ee; ++ii) {\n    if (g.getData(*ii, flag).getPart() == partNum) {\n      seed = *ii;\n      rWeight -= g.getData(*ii, flag).getWeight();\n      if (rWeight < 0)\n        return *ii;\n    }\n  }\n\n  return seed;\n}\n\nusing BisectPolicy = partInfo(GGraph& g, partInfo& oldPart,\n                              std::pair<unsigned, unsigned> ratio,\n                              std::vector<GNode>* b, int oldWeight);\n\npartInfo bisect_GGP(GGraph& g, partInfo& oldPart,\n                    std::pair<unsigned, unsigned> ratio,\n                    std::vector<GNode>* b = NULL, int = 0) {\n  partInfo newPart = oldPart.split();\n  std::deque<GNode> boundary;\n  unsigned& newWeight = newPart.partWeight = 0;\n  unsigned targetWeight =\n      oldPart.partWeight * ratio.second / (ratio.first + ratio.second);\n\n  auto flag = galois::MethodFlag::UNPROTECTED;\n\n  do {\n    boundary.push_back(findSeed(g, oldPart.partNum, oldPart.partWeight, flag));\n    // grow partition\n    while (newWeight < targetWeight && !boundary.empty()) {\n      GNode n = boundary.front();\n      boundary.pop_front();\n      if (g.getData(n, flag).getPart() == newPart.partNum)\n        continue;\n      newWeight += g.getData(n, flag).getWeight();\n      g.getData(n, flag).setPart(newPart.partNum);\n      if (b)\n        b->push_back(n);\n      for (auto ii : g.edges(n, flag))\n        if 
(g.getData(g.getEdgeDst(ii), flag).getPart() == oldPart.partNum)\n          boundary.push_back(g.getEdgeDst(ii));\n    }\n  } while (newWeight < targetWeight && multiSeed);\n\n  oldPart.partWeight -= newWeight;\n  return newPart;\n}\n\npartInfo bisect_GGGP(GGraph& g, partInfo& oldPart,\n                     std::pair<unsigned, unsigned> ratio,\n                     std::vector<GNode>* b = NULL, int oldWeight = 0) {\n  partInfo newPart = oldPart.split();\n  std::map<GNode, int> gains;\n  std::map<int, std::set<GNode>> boundary;\n\n  unsigned& newWeight = newPart.partWeight = oldWeight;\n  unsigned targetWeight =\n      oldPart.partWeight * ratio.second / (ratio.first + ratio.second);\n  // pick a seed\n\n  auto flag = galois::MethodFlag::UNPROTECTED;\n\n  do {\n    // boundary[0].insert(findSeed(g, oldPart.partNum, oldPart.partWeight,\n    // flag));\n    GNode bNode = findSeed(g, oldPart.partNum, oldPart.partWeight, flag);\n    boundary[gain_limited(g, bNode, newPart.partNum, flag)].insert(bNode);\n\n    // grow partition\n    while (newWeight < targetWeight && !boundary.empty()) {\n      auto bi = boundary.rbegin();\n      GNode n = *bi->second.begin();\n      bi->second.erase(bi->second.begin());\n      if (bi->second.empty())\n        boundary.erase(bi->first);\n      if (g.getData(n, flag).getPart() == newPart.partNum)\n        continue;\n      newWeight += g.getData(n, flag).getWeight();\n      g.getData(n, flag).setPart(newPart.partNum);\n      if (b)\n        b->push_back(n);\n      for (auto ii : g.edges(n, flag)) {\n        GNode dst = g.getEdgeDst(ii);\n        auto gi   = gains.find(dst);\n        if (gi != gains.end()) { // update\n          boundary[gi->second].erase(dst);\n          if (boundary[gi->second].empty())\n            boundary.erase(gi->second);\n          gains.erase(dst);\n        }\n        if (g.getData(dst, flag).getPart() == oldPart.partNum) {\n          int newgain = gains[dst] =\n              gain_limited(g, dst, newPart.partNum, 
flag);\n          boundary[newgain].insert(dst);\n        }\n      }\n    }\n  } while (newWeight < targetWeight && multiSeed);\n\n  oldPart.partWeight -= newWeight;\n  return newPart;\n}\n\nint computeEdgeCut(GGraph& g) {\n  int cuts = 0;\n  for (auto nn : g) {\n    unsigned gPart = g.getData(nn).getPart();\n    for (auto ii : g.edges(nn)) {\n      auto& m = g.getData(g.getEdgeDst(ii));\n      if (m.getPart() != gPart) {\n        cuts += g.getEdgeData(ii);\n      }\n    }\n  }\n\n  return cuts / 2;\n}\n\n/*int node_gain(GGraph &graph, GNode node) {\n  auto nData = graph.getData(node,galois::MethodFlag::UNPROTECTED);\n  int gain = 0;\n  for (auto ei : graph.edges(node)) {\n    auto neigh = graph.getEdgeDst(ei);\n    int ew = graph.getEdgeData(ei);\n    auto neighData = graph.getData(neigh,galois::MethodFlag::UNPROTECTED);\n    if (nData.getPart() != neighData.getPart()) {\n      gain += ew;\n    } else {\n      gain -= ew;\n    }\n  }\n  return gain;\n}*/\n\ntypedef std::pair<int, std::pair<GNode, GNode>> PartMatch;\ntypedef galois::substrate::PerThreadStorage<PartMatch> PerThreadPartInfo;\nvoid KLMatch(GGraph& graph, std::vector<GNode>& boundary,\n             PerThreadPartInfo& threadInfo, int oldPartNum, int newPartNum) {\n\n  auto isPartOk = [&](int partNum) -> bool {\n    return (partNum == oldPartNum || partNum == newPartNum);\n  };\n  auto isNodeOk = [&](MetisNode& node) -> bool {\n    return !node.isLocked() && isPartOk(node.getPart());\n  };\n\n  galois::for_each(\n      galois::iterate(boundary),\n      [&](GNode node, auto&) {\n        auto flag            = galois::MethodFlag::UNPROTECTED;\n        PartMatch* localInfo = threadInfo.getLocal();\n        int gain             = localInfo->first;\n        auto& srcData        = graph.getData(node, flag);\n        int srcGain          = 0;\n        if (!isNodeOk(srcData)) {\n          return;\n        }\n\n        for (auto ei : graph.edges(node, flag)) {\n          int ew      = graph.getEdgeData(ei, 
flag);\n          GNode n     = graph.getEdgeDst(ei);\n          auto& nData = graph.getData(n, flag);\n          if (!isNodeOk(nData)) {\n            continue;\n          }\n          if (nData.getPart() == srcData.getPart()) {\n            srcGain -= ew;\n          } else {\n            srcGain += ew;\n          }\n        }\n        for (auto ei : graph.edges(node, flag)) {\n          GNode n       = graph.getEdgeDst(ei);\n          auto nData    = graph.getData(n, flag);\n          int nw        = graph.getEdgeData(ei, flag);\n          int neighGain = 0;\n          if (!isNodeOk(nData) || nData.getPart() == srcData.getPart()) {\n            continue;\n          }\n          for (auto nei : graph.edges(n, flag)) {\n            int ew      = graph.getEdgeData(nei, flag);\n            GNode nn    = graph.getEdgeDst(nei);\n            auto nnData = graph.getData(nn, flag);\n            if (!isNodeOk(nnData)) {\n              continue;\n            }\n            if (nnData.getPart() == nData.getPart()) {\n              neighGain -= ew;\n            } else {\n              neighGain += ew;\n            }\n          }\n          int totalGain = srcGain + neighGain - 2 * nw;\n          if (totalGain > gain) {\n            gain                     = totalGain;\n            localInfo->first         = gain;\n            localInfo->second.first  = node;\n            localInfo->second.second = n;\n          }\n        }\n      },\n      galois::loopname(\"KLMatch\"),\n      galois::wl<galois::worklists::ChunkLIFO<32>>());\n};\n\nvoid refine_kl(GGraph& graph, std::vector<GNode>& boundary, int oldPartNum,\n               int newPartNum, std::vector<partInfo>& parts) {\n  std::vector<GNode> swappedNodes;\n  std::vector<PartMatch> foundNodes;\n  // int iter = 0;\n  do {\n    std::vector<PartMatch> matches;\n    for (unsigned int j = 0; j < boundary.size(); j++) {\n      PerThreadPartInfo iterationInfo;\n      for (unsigned int i = 0; i < iterationInfo.size(); i++) {\n        
iterationInfo.getRemote(i)->first         = INT_MIN;\n        iterationInfo.getRemote(i)->second.first  = NULL;\n        iterationInfo.getRemote(i)->second.second = NULL;\n      }\n      KLMatch(graph, boundary, iterationInfo, oldPartNum, newPartNum);\n      PartMatch bestMatch;\n      bestMatch.first = INT_MIN;\n      for (unsigned int i = 0; i < iterationInfo.size(); i++) {\n        PartMatch match = *iterationInfo.getRemote(i);\n        if (match.first > bestMatch.first) {\n          bestMatch = match;\n        }\n      }\n      if (bestMatch.second.first == NULL || bestMatch.second.second == NULL)\n        break;\n      auto& m1 = graph.getData(bestMatch.second.first);\n      auto& m2 = graph.getData(bestMatch.second.second);\n      m1.setLocked(true);\n      m2.setLocked(true);\n      matches.push_back(bestMatch);\n      foundNodes.push_back(bestMatch);\n    }\n    if (matches.size() == 0) {\n      break;\n    }\n    int g_max = 0;\n    int temp  = 0;\n    int index = -1;\n    for (unsigned int k = 0; k < matches.size(); k++) {\n      g_max += matches[k].first;\n      if (g_max > temp) {\n        temp  = g_max;\n        index = k;\n      }\n    }\n    g_max = temp;\n\n    if (g_max <= 0 || index < 0)\n      break;\n\n    for (int i = 0; i <= index; i++) {\n      PartMatch match = matches[i];\n      GNode n1        = match.second.first;\n      GNode n2        = match.second.second;\n      auto& n1Data    = graph.getData(n1);\n      auto& n2Data    = graph.getData(n2);\n      int p1          = n1Data.getPart();\n      int p2          = n2Data.getPart();\n      parts[p1].partWeight += (n2Data.getWeight() - n1Data.getWeight());\n      parts[p2].partWeight += (n1Data.getWeight() - n2Data.getWeight());\n      n1Data.setPart(p2);\n      n2Data.setPart(p1);\n      swappedNodes.push_back(n1);\n      swappedNodes.push_back(n2);\n    }\n    for (unsigned int i = index + 1; i < matches.size(); i++) {\n      auto& m1 = graph.getData(matches[i].second.first);\n      auto& 
m2 = graph.getData(matches[i].second.second);\n      m1.setLocked(false);\n      m2.setLocked(false);\n    }\n  } while (true);\n  for (PartMatch match : foundNodes) {\n    graph.getData(match.second.first).setLocked(false);\n    graph.getData(match.second.second).setLocked(false);\n  }\n}\n\ntemplate <BisectPolicy bisect>\nvoid serialBisect(MetisGraph* mg, unsigned int, unsigned int nparts,\n                  std::vector<partInfo>& parts) {\n  GGraph* graph = mg->getGraph();\n  std::stack<partInfo*> workList;\n  workList.push(&parts[0]);\n  while (!workList.empty()) {\n    partInfo* item = workList.top();\n    workList.pop();\n    if (item->splitID() >= nparts) // when to stop\n      continue;\n    std::pair<unsigned, unsigned> ratio = item->splitRatio(nparts);\n    std::vector<GNode> newNodes;\n    // int iter = 0;\n    partInfo newPart;\n    newPart.partWeight = 0;\n    newPart = bisect(*graph, *item, ratio, &newNodes, newPart.partWeight);\n    parts[newPart.partNum] = newPart;\n    refine_kl(*graph, newNodes, item->partNum, newPart.partNum, parts);\n    newPart.partWeight = parts[newPart.partNum].partWeight;\n    item->partWeight   = parts[item->partNum].partWeight;\n    // unsigned targetWeight = item->partWeight * ratio.second / (ratio.first +\n    // ratio.second);\n    item->partWeight = parts[item->partNum].partWeight;\n    workList.push(&(parts[newPart.partNum]));\n    workList.push(item);\n  }\n}\n\ntemplate <BisectPolicy bisect>\nvoid parallelBisect(MetisGraph* mg, unsigned int, unsigned int nparts,\n                    std::vector<partInfo>& parts) {\n  GGraph* graph = mg->getGraph();\n  galois::for_each(\n      galois::iterate({&parts[0]}),\n      [&](partInfo* item, auto& cnx) {\n        if (item->splitID() >= nparts) // when to stop\n          return;\n        std::pair<unsigned, unsigned> ratio = item->splitRatio(nparts);\n        // std::cout << \"Splitting \" << item->partNum << \":\" <<\n        // item->partMask << \" L \" << ratio.first << \" 
R \" <<\n        // ratio.second << \"\\n\";\n        partInfo newPart = bisect(*graph, *item, ratio, NULL, 0);\n        // std::cout << \"Result \" << item->partNum << \" \" <<\n        // newPart.partNum << \"\\n\";\n        parts[newPart.partNum] = newPart;\n        cnx.push(&parts[newPart.partNum]);\n        cnx.push(item);\n      },\n      galois::loopname(\"parallelBisect\"),\n      galois::wl<galois::worklists::ChunkLIFO<1>>());\n}\n\n} // namespace\n\nstd::vector<partInfo> partition(MetisGraph* mcg, unsigned fineMetisGraphWeight,\n                                unsigned numPartitions,\n                                InitialPartMode partMode) {\n  std::vector<partInfo> parts(numPartitions);\n  assert(fineMetisGraphWeight == mcg->getTotalWeight());\n  parts[0] = partInfo(fineMetisGraphWeight);\n\n  galois::do_all(\n      galois::iterate(*mcg->getGraph()),\n      [g = mcg->getGraph()](GNode item) {\n        g->getData(item, galois::MethodFlag::UNPROTECTED).initRefine(0, true);\n        g->getData(item, galois::MethodFlag::UNPROTECTED).initPartition();\n      },\n      galois::loopname(\"initPart\"));\n\n  bool serialPartition = false;\n  if (serialPartition) {\n    switch (partMode) {\n    case GGP:\n      std::cout << \"\\nSorting initial partitioning using GGP:\\n\";\n      serialBisect<bisect_GGP>(mcg, fineMetisGraphWeight, numPartitions, parts);\n      break;\n    case GGGP:\n      std::cout << \"\\nSorting initial partitioning using GGGP:\\n\";\n      serialBisect<bisect_GGGP>(mcg, fineMetisGraphWeight, numPartitions,\n                                parts);\n      break;\n    default:\n      abort();\n    }\n  } else {\n    switch (partMode) {\n    case GGP:\n      std::cout << \"\\nSorting initial partitioning using GGP:\\n\";\n      parallelBisect<bisect_GGP>(mcg, fineMetisGraphWeight, numPartitions,\n                                 parts);\n      break;\n    case GGGP:\n      std::cout << \"\\nSorting initial partitioning using GGGP:\\n\";\n      
parallelBisect<bisect_GGGP>(mcg, fineMetisGraphWeight, numPartitions,\n                                  parts);\n      break;\n    default:\n      abort();\n    }\n  }\n  // XXX(ddn): Leave commented out until we have balance() defined.\n#if 0\n  if (!multiSeed) {\n    unsigned maxWeight = 1.01 * mcg->getTotalWeight() / numPartitions;\n    balance(mcg, parts, maxWeight);\n  }\n#endif\n  static_assert(multiSeed, \"not yet implemented\");\n  return parts;\n}\n\nnamespace {\nint edgeCount(GGraph& g) {\n  int count = 0;\n  for (auto nn : g)\n    for (auto ii : g.edges(nn))\n      count += g.getEdgeData(ii);\n  return count / 2;\n}\n} // namespace\n\nstd::vector<partInfo> BisectAll(MetisGraph* mcg, unsigned numPartitions,\n                                unsigned int) {\n  std::cout << \"\\nSorting initial partitioning using MGGGP:\\n\";\n  auto flag = galois::MethodFlag::UNPROTECTED;\n  GGraph& g = *mcg->getGraph();\n\n  int bestCut = edgeCount(g);\n  std::map<GNode, int> bestParts;\n  std::vector<partInfo> bestPartInfos(numPartitions);\n\n  for (int nbTry = 0; nbTry < 20; nbTry++) {\n    std::vector<partInfo> partInfos(numPartitions);\n    std::vector<std::map<int, std::set<GNode>>> boundary(numPartitions);\n    std::map<int, std::set<int>> partitions;\n    for (auto ii : g)\n      g.getData(ii).setPart(numPartitions + 1);\n    auto seedIter = g.begin();\n    int k         = 0;\n    // find one seed for each partition and do initialization\n    for (unsigned int i = 0; i < numPartitions; i++) {\n      int seed      = (int)(drand48() * (mcg->getNumNodes())) + 1;\n      bool goodseed = true;\n      while (seed--)\n        if (++seedIter == g.end())\n          seedIter = g.begin();\n      GNode n = *seedIter;\n\n      for (unsigned int j = 0; j < i && k < 50; j++) {\n        goodseed = goodseed && (*boundary[j][0].begin() != n);\n        for (auto ii : g.edges(n, flag))\n          goodseed = goodseed && (*boundary[j][0].begin() != g.getEdgeDst(ii));\n      }\n      if 
(!goodseed) {\n        k++;\n        i--;\n        continue;\n      }\n      partInfos[i] = partInfo(i, 0, 0);\n      boundary[i][0].insert(n);\n      partitions[0].insert(i);\n    }\n    auto beg = g.begin();\n    while (!partitions.empty()) {\n      // find the next partition to improove\n      auto bb       = partitions.begin();\n      int partToMod = *bb->second.begin();\n      bb->second.erase(bb->second.begin());\n      if (bb->second.empty())\n        partitions.erase(bb->first);\n\n      // find the node to add to the partition\n      GNode n = *g.begin();\n      do {\n        if (boundary[partToMod].empty())\n          break;\n        auto bi = boundary[partToMod].rbegin();\n        n       = *bi->second.begin();\n        bi->second.erase(bi->second.begin());\n        if (bi->second.empty())\n          boundary[partToMod].erase(bi->first);\n      } while (g.getData(n, flag).getPart() < numPartitions &&\n               !boundary[partToMod].empty());\n\n      if (g.getData(n, flag).getPart() < numPartitions &&\n          boundary[partToMod].empty()) {\n        GGraph::iterator ii = beg, ee = g.end();\n        for (; ii != ee; ii++)\n          if (g.getData(*ii).getPart() == numPartitions + 1)\n            break;\n        if (ii == ee)\n          break;\n        else\n          n = *(beg = ii);\n      }\n\n      // add the node\n      partInfos[partToMod].partWeight += g.getData(n, flag).getWeight();\n      partitions[partInfos[partToMod].partWeight].insert(partToMod);\n      g.getData(n, flag).setPart(partToMod);\n      for (auto ii : g.edges(n, flag)) {\n        GNode dst   = g.getEdgeDst(ii);\n        int newgain = gain_limited(g, dst, partToMod, flag);\n        boundary[partToMod][newgain].insert(dst);\n      }\n    }\n    // decides if this partition is the nez best one\n    int newCut = computeEdgeCut(g);\n    if (newCut < bestCut) {\n      bestCut = newCut;\n      for (GGraph::iterator ii = g.begin(), ee = g.end(); ii != ee; ii++)\n        
bestParts[*ii] = g.getData(*ii, flag).getPart();\n      for (unsigned int i = 0; i < numPartitions; i++)\n        bestPartInfos[i] = partInfos[i];\n    }\n  }\n\n  for (GGraph::iterator ii = g.begin(), ee = g.end(); ii != ee; ii++)\n    g.getData(*ii, flag).setPart(bestParts[*ii]);\n\n  return bestPartInfos;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/gmetis/README.md",
    "content": "GMETIS\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program partitions a given graph using the METIS algorithm:\n\nGeorge Karypis and Vipin Kumar. Multilevel k-way Partitioning Scheme for \nIrregular Graphs. J. Parallel Distributed Computing. 1998.\n\nGeorge Karypis and Vipin Kumar. A fast and high quality multilevel scheme \nfor partitioning irregular graphs. International Conference on Parallel \nProcessing. 1995.\n\nThe algorithm first coarsens the graph, partitions it, and then refines \nthe partitioning.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/gmetis; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./gmetis-cpu <path-to-graph> <number-of-partitions>`\n- `$ ./gmetis-cpu <path-to-graph> <number-of-partitions> -t 20 -GGP`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* In our experience, the default GGGP and BKL2 algorithms for initial partitioning \n  and refining, respectively, give the best performance.\n\n* The performance of all algorithms depends on an optimal choice of the compile \n  time constant, CHUNK_SIZE, the granularity of stolen work when work stealing is \n  enabled (via galois::steal()). The optimal value of the constant might depend on \n  the architecture, so you might want to evaluate the performance over a range of \n  values (say [16-4096]).\n"
  },
  {
    "path": "lonestar/analytics/cpu/gmetis/Refine.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"Metis.h\"\n#include <set>\n#include <iostream>\n\nnamespace {\n\nbool isBoundary(GGraph& g, GNode n) {\n  unsigned int nPart = g.getData(n).getPart();\n  for (auto ii : g.edges(n))\n    if (g.getData(g.getEdgeDst(ii)).getPart() != nPart)\n      return true;\n  return false;\n}\n\n// This is only used on the terminal graph (find graph)\nvoid findBoundary(GNodeBag& bag, GGraph& cg) {\n\n  galois::do_all(\n      galois::iterate(cg),\n      [&](GNode n) {\n        auto& cn = cg.getData(n, galois::MethodFlag::UNPROTECTED);\n        if (cn.getmaybeBoundary())\n          cn.setmaybeBoundary(isBoundary(cg, n));\n        if (cn.getmaybeBoundary())\n          bag.push(n);\n      },\n      
galois::loopname(\"findBoundary\"));\n}\n\n// this is used on the coarse graph to project to the fine graph\nvoid findBoundaryAndProject(GNodeBag& bag, GGraph& cg, GGraph& fg) {\n  galois::do_all(\n      galois::iterate(cg),\n      [&](GNode n) {\n        auto& cn = cg.getData(n, galois::MethodFlag::UNPROTECTED);\n        if (cn.getmaybeBoundary())\n          cn.setmaybeBoundary(isBoundary(cg, n));\n\n        // project part and maybe boundary\n        // unsigned part = cn.getPart();\n        for (unsigned x = 0; x < cn.numChildren(); ++x) {\n          fg.getData(cn.getChild(x), galois::MethodFlag::UNPROTECTED)\n              .initRefine(cn.getPart(), cn.getmaybeBoundary());\n        }\n        if (cn.getmaybeBoundary())\n          bag.push(n);\n      },\n      galois::loopname(\"findBoundaryAndProject\"));\n}\n\ntemplate <bool balance>\nvoid refine_BKL2(unsigned minSize, unsigned maxSize, GGraph& cg, GGraph* fg,\n                 std::vector<partInfo>& parts) {\n\n  auto gainIndexer = [&cg](GNode n) -> int {\n    int retval              = 0;\n    galois::MethodFlag flag = galois::MethodFlag::UNPROTECTED;\n    unsigned int nPart      = cg.getData(n, flag).getPart();\n    for (auto ii = cg.edge_begin(n, flag), ee = cg.edge_end(n); ii != ee;\n         ++ii) {\n      GNode neigh = cg.getEdgeDst(ii);\n      if (cg.getData(neigh, flag).getPart() == nPart)\n        retval -= cg.getEdgeData(ii, flag);\n      else\n        retval += cg.getEdgeData(ii, flag);\n    }\n    return -retval / 16;\n  };\n\n  typedef galois::worklists::PerSocketChunkFIFO<8> Chunk;\n  typedef galois::worklists::OrderedByIntegerMetric<decltype(gainIndexer),\n                                                    Chunk, 10>\n      pG;\n\n  GNodeBag boundary;\n\n  if (fg)\n    findBoundaryAndProject(boundary, cg, *fg);\n  else\n    findBoundary(boundary, cg);\n\n  //! 
[Example Per-Thread-Storage Declaration]\n  typedef galois::gstl::Vector<unsigned> VecTy;\n  typedef galois::substrate::PerThreadStorage<VecTy> ThreadLocalData;\n  ThreadLocalData edgesThreadLocal;\n  //! [Example Per-Thread-Storage Declaration]\n\n  //! [Example Per-Thread-Storage Usage]\n  // Find the partition n is most connected to\n  auto pickPartitionEC = [&](GNode n, auto&) -> unsigned {\n    auto& edges = *edgesThreadLocal.getLocal();\n    edges.clear();\n    edges.resize(parts.size(), 0);\n    unsigned P = cg.getData(n).getPart();\n    for (auto ii : cg.edges(n)) {\n      GNode neigh = cg.getEdgeDst(ii);\n      auto& nd    = cg.getData(neigh);\n      if (parts[nd.getPart()].partWeight < maxSize || nd.getPart() == P)\n        edges[nd.getPart()] += cg.getEdgeData(ii);\n    }\n    return std::distance(edges.begin(),\n                         std::max_element(edges.begin(), edges.end()));\n  };\n  //! [Example Per-Thread-Storage Usage]\n\n  // Find the smallest partition n is connected to\n  auto pickPartitionMP = [&](GNode n, auto&) -> unsigned {\n    unsigned P  = cg.getData(n).getPart();\n    unsigned W  = parts[P].partWeight;\n    auto& edges = *edgesThreadLocal.getLocal();\n    edges.clear();\n    edges.resize(parts.size(), ~0);\n    edges[P] = W;\n    W        = (double)W * 0.9;\n    for (auto ii : cg.edges(n)) {\n      GNode neigh = cg.getEdgeDst(ii);\n      auto& nd    = cg.getData(neigh);\n      if (parts[nd.getPart()].partWeight < W)\n        edges[nd.getPart()] = parts[nd.getPart()].partWeight;\n    }\n    return std::distance(edges.begin(),\n                         std::min_element(edges.begin(), edges.end()));\n  };\n\n  galois::for_each(\n      galois::iterate(boundary),\n      [&](GNode n, auto& cnx) {\n        auto& nd         = cg.getData(n);\n        unsigned curpart = nd.getPart();\n        unsigned newpart =\n            balance ? 
pickPartitionMP(n, cnx) : pickPartitionEC(n, cnx);\n        if (parts[curpart].partWeight < minSize)\n          return;\n        if (curpart != newpart) {\n          nd.setPart(newpart);\n          __sync_fetch_and_sub(&parts[curpart].partWeight, nd.getWeight());\n          __sync_fetch_and_add(&parts[newpart].partWeight, nd.getWeight());\n          for (auto ii : cg.edges(n)) {\n            GNode neigh = cg.getEdgeDst(ii);\n            auto& ned   = cg.getData(neigh);\n            if (ned.getPart() != newpart && !ned.getmaybeBoundary()) {\n              ned.setmaybeBoundary(true);\n              if (fg)\n                for (unsigned x = 0; x < ned.numChildren(); ++x)\n                  fg->getData(ned.getChild(x), galois::MethodFlag::UNPROTECTED)\n                      .setmaybeBoundary(true);\n            }\n            // if (ned.getPart() != newpart)\n            // cnx.push(neigh);\n          }\n          if (fg)\n            for (unsigned x = 0; x < nd.numChildren(); ++x)\n              fg->getData(nd.getChild(x), galois::MethodFlag::UNPROTECTED)\n                  .setPart(newpart);\n        }\n      },\n      galois::loopname(\"refine\"), galois::wl<pG>(gainIndexer));\n}\n\nvoid projectPart(MetisGraph* Graph, std::vector<partInfo>&) {\n  GGraph* fineGraph   = Graph->getFinerGraph()->getGraph();\n  GGraph* coarseGraph = Graph->getGraph();\n\n  galois::do_all(\n      galois::iterate(*coarseGraph),\n      [&](GNode n) {\n        auto& cn      = coarseGraph->getData(n);\n        unsigned part = cn.getPart();\n        for (unsigned x = 0; x < cn.numChildren(); ++x) {\n          fineGraph->getData(cn.getChild(x)).setPart(part);\n        }\n      },\n      galois::loopname(\"project\"));\n}\n\nint gain(GGraph& g, GNode n) {\n  int retval         = 0;\n  unsigned int nPart = g.getData(n).getPart();\n  for (auto ii : g.edges(n)) {\n    GNode neigh = g.getEdgeDst(ii);\n    if (g.getData(neigh).getPart() == nPart)\n      retval -= g.getEdgeData(ii);\n    else\n      
retval += g.getEdgeData(ii);\n  }\n  return retval;\n}\n\nvoid parallelBoundary(GNodeBag& bag, GGraph& graph) {\n  galois::do_all(\n      galois::iterate(graph),\n      [&](GNode n) {\n        if (gain(graph, n) > 0)\n          bag.push(n);\n      },\n      galois::loopname(\"Get-Boundary\"));\n}\n\nvoid refineOneByOne(GGraph& g, std::vector<partInfo>& parts) {\n  std::vector<GNode> boundary;\n  unsigned int meanWeight = 0;\n  for (unsigned int i = 0; i < parts.size(); i++)\n    meanWeight += parts[i].partWeight;\n  meanWeight /= parts.size();\n\n  GNodeBag boundaryBag;\n  parallelBoundary(boundaryBag, g);\n\n  for (auto ii = boundaryBag.begin(), ie = boundaryBag.end(); ii != ie; ii++) {\n    GNode n        = (*ii);\n    unsigned nPart = g.getData(n).getPart();\n    int part[parts.size()];\n    for (unsigned int i = 0; i < parts.size(); i++)\n      part[i] = 0;\n    for (auto ii : g.edges(n)) {\n      GNode neigh = g.getEdgeDst(ii);\n      part[g.getData(neigh).getPart()] += g.getEdgeData(ii);\n    }\n    int t          = part[nPart];\n    unsigned int p = nPart;\n    for (unsigned int i = 0; i < parts.size(); i++)\n      if (i != nPart && part[i] > t &&\n          parts[nPart].partWeight > parts[i].partWeight * (98) / (100) &&\n          parts[nPart].partWeight > meanWeight * 98 / 100) {\n        t = part[i];\n        p = i;\n      }\n    if (p != nPart) {\n      g.getData(n).setPart(p);\n      parts[p].partWeight += g.getData(n).getWeight();\n      parts[nPart].partWeight -= g.getData(n).getWeight();\n    }\n  }\n}\n\nvoid refine_BKL(GGraph& g, std::vector<partInfo>& parts) {\n  std::set<GNode> boundary;\n\n  // find boundary nodes with positive gain\n  GNodeBag boundaryBag;\n  parallelBoundary(boundaryBag, g);\n\n  for (auto ii = boundaryBag.begin(), ie = boundaryBag.end(); ii != ie; ii++) {\n    boundary.insert(*ii);\n  }\n\n  // refine by swapping with a neighbor high-gain node\n  while (!boundary.empty()) {\n    GNode n = *boundary.begin();\n    
boundary.erase(boundary.begin());\n    unsigned nPart = g.getData(n).getPart();\n    for (auto ii : g.edges(n)) {\n      GNode neigh        = g.getEdgeDst(ii);\n      unsigned neighPart = g.getData(neigh).getPart();\n      if (neighPart != nPart && boundary.count(neigh) && gain(g, n) > 0 &&\n          gain(g, neigh) > 0) {\n        unsigned nWeight     = g.getData(n).getWeight();\n        unsigned neighWeight = g.getData(neigh).getWeight();\n        // swap\n        g.getData(n).setPart(neighPart);\n        g.getData(neigh).setPart(nPart);\n        // update partinfo\n        parts[neighPart].partWeight += nWeight;\n        parts[neighPart].partWeight -= neighWeight;\n        parts[nPart].partWeight += neighWeight;\n        parts[nPart].partWeight -= nWeight;\n        // remove nodes\n        boundary.erase(neigh);\n        break;\n      }\n    }\n  }\n}\n\n/*double ratiocut(int nbClust, int* degree, int* card)\n{\n  double res=0;\n  for (int i=0; i<nbClust;i++)\n    res += (double)(degree[i])/(double)(card[i]);\n\n  return res;\n}*/\n\nvoid GraclusRefining(GGraph* graph, int nbParti, int nbIter) {\n  nbIter = std::min(15, nbIter);\n  std::vector<double> Dist(nbParti);\n  std::vector<int> card(nbParti);\n  std::vector<int> degreeIn(nbParti);\n\n  using Accum = galois::GAccumulator<size_t>;\n  std::vector<Accum> cardAccum(nbParti);\n  std::vector<Accum> degreeInAccum(nbParti);\n\n  for (int j = 0; j < nbIter; j++) {\n\n    GGraph& g = *graph;\n    galois::do_all(\n        galois::iterate(g),\n        [&](GNode n) {\n          unsigned int clust =\n              g.getData(n, galois::MethodFlag::UNPROTECTED).getPart();\n          int degreet = 0;\n\n          g.getData(n, galois::MethodFlag::UNPROTECTED).OldPartCpyNew();\n\n          for (auto ii : g.edges(n, galois::MethodFlag::UNPROTECTED))\n            if (g.getData(g.getEdgeDst(ii), galois::MethodFlag::UNPROTECTED)\n                    .getPart() == clust)\n              degreet +=\n                  
(int)g.getEdgeData(ii, galois::MethodFlag::UNPROTECTED);\n\n          cardAccum[clust] +=\n              g.getData(n, galois::MethodFlag::UNPROTECTED).getWeight();\n          degreeInAccum[clust] += degreet;\n        },\n        galois::loopname(\"compute dists\"));\n\n    for (int i = 0; i < nbParti; i++) {\n      card[i] = cardAccum[i].reduce();\n      cardAccum[i].reset();\n\n      degreeIn[i] = degreeInAccum[i].reduce();\n      degreeInAccum[i].reset();\n\n      Dist[i] = (card[i] != 0) ? (double)(degreeIn[i] + card[i]) /\n                                     ((double)card[i] * card[i])\n                               : 0;\n    }\n\n    galois::do_all(\n        galois::iterate(g),\n        [&](GNode n) {\n          double dmin   = std::numeric_limits<double>::min();\n          int partition = -1;\n          galois::gstl::Map<int, int> degreein;\n          degreein[g.getData(n, galois::MethodFlag::UNPROTECTED)\n                       .getOldPart()] += 1;\n          for (auto ii : g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n            int nclust =\n                g.getData(g.getEdgeDst(ii), galois::MethodFlag::UNPROTECTED)\n                    .getOldPart();\n            degreein[nclust] +=\n                (int)g.getEdgeData(ii, galois::MethodFlag::UNPROTECTED);\n          }\n\n          for (auto clust = degreein.begin(), ee = degreein.end(); clust != ee;\n               ++clust) {\n            // the distance between the cluster clust and the noden is :\n            double d = Dist[clust->first] - (2.0 * (double)clust->second /\n                                             (double)card[clust->first]);\n            if (d < dmin || partition == -1) {\n              dmin      = d;\n              partition = clust->first;\n            }\n          }\n          g.getData(n, galois::MethodFlag::UNPROTECTED).setPart(partition);\n        },\n        galois::loopname(\"make moves\"));\n  }\n}\n\n} // namespace\n\nvoid refine(MetisGraph* coarseGraph, 
std::vector<partInfo>& parts,\n            unsigned minSize, unsigned maxSize, refinementMode refM,\n            bool verbose) {\n  MetisGraph* tGraph = coarseGraph;\n  int nbIter         = 1;\n  if (refM == GRACLUS) {\n    while ((tGraph = tGraph->getFinerGraph()))\n      nbIter *= 2;\n    nbIter /= 4;\n  }\n  do {\n    MetisGraph* fineGraph = coarseGraph->getFinerGraph();\n    bool doProject        = true;\n    if (verbose) {\n      std::cout << \"Cut \" << computeCut(*coarseGraph->getGraph())\n                << \" Weights \";\n      printPartStats(parts);\n      std::cout << \"\\n\";\n    }\n    // refine nparts times\n    switch (refM) {\n    case BKL2:\n      refine_BKL2<false>(minSize, maxSize, *coarseGraph->getGraph(),\n                         fineGraph ? fineGraph->getGraph() : nullptr, parts);\n      doProject = false;\n      break;\n    case BKL:\n      refine_BKL(*coarseGraph->getGraph(), parts);\n      break;\n    case ROBO:\n      refineOneByOne(*coarseGraph->getGraph(), parts);\n      break;\n    case GRACLUS:\n      GraclusRefining(coarseGraph->getGraph(), parts.size(), nbIter);\n      nbIter = (nbIter + 1) / 2;\n      break;\n    default:\n      abort();\n    }\n    // project up\n    if (fineGraph && doProject) {\n      projectPart(coarseGraph, parts);\n    }\n  } while ((coarseGraph = coarseGraph->getFinerGraph()));\n}\n\n/*\nvoid balance(MetisGraph* coarseGraph, std::vector<partInfo>& parts, unsigned\nmeanSize) { MetisGraph* fineGraph = coarseGraph->getFinerGraph();\n    refine_BKL2<true>(meanSize, *coarseGraph->getGraph(), fineGraph ?\nfineGraph->getGraph() : nullptr, parts);\n}\n*/\n"
  },
  {
    "path": "lonestar/analytics/cpu/independentset/CMakeLists.txt",
    "content": "add_executable(maximal-independentset-cpu IndependentSet.cpp)\nadd_dependencies(apps maximal-independentset-cpu)\ntarget_link_libraries(maximal-independentset-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS maximal-independentset-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small maximal-independentset-cpu \"${BASEINPUT}/scalefree/symmetric/rmat10.sgr\" \"-symmetricGraph\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/independentset/IndependentSet.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Bag.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/runtime/Profile.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <utility>\n#include <vector>\n#include <algorithm>\n#include <iostream>\n#include <type_traits>\n#include <random>\n#include <math.h>\n\nconst char* name = \"Maximal Independent Set\";\nconst char* desc =\n    \"Computes a maximal independent set (not maximum) of nodes in a graph\";\nconst char* url = \"independent_set\";\n\nenum Algo { serial, pull, nondet, detBase, prio, edgetiledprio };\n\nnamespace cll = llvm::cl;\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input 
file>\"), cll::Required);\n\nstatic cll::opt<Algo> algo(\n    \"algo\", cll::desc(\"Choose an algorithm:\"),\n    cll::values(\n        clEnumVal(serial, \"Serial\"),\n        clEnumVal(pull,\n                  \"Pull-based (node 0 is initially in the independent set)\"),\n        clEnumVal(nondet, \"Non-deterministic, use bulk synchronous worklist\"),\n        clEnumVal(detBase, \"use deterministic worklist\"),\n        clEnumVal(\n            prio,\n            \"prio algo based on Martin's GPU ECL-MIS algorithm (default)\"),\n        clEnumVal(\n            edgetiledprio,\n            \"edge-tiled prio algo based on Martin's GPU ECL-MIS algorithm\")),\n    cll::init(prio));\n\nenum MatchFlag : char { UNMATCHED, OTHER_MATCHED, MATCHED };\n\nstruct Node {\n  MatchFlag flag;\n  Node() : flag(UNMATCHED) {}\n};\n\nstruct prioNode {\n  unsigned char flag; // 1 bit matched,6 bits prio, 1 bit undecided\n  prioNode() : flag((unsigned char){0x01}) {}\n};\n\nstruct SerialAlgo {\n  using Graph = galois::graphs::LC_CSR_Graph<Node, void>::with_numa_alloc<\n      true>::type ::with_no_lockable<true>::type;\n  using GNode = Graph::GraphNode;\n\n  void operator()(Graph& graph) {\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      if (findUnmatched(graph, *ii))\n        match(graph, *ii);\n    }\n  }\n\n  bool findUnmatched(Graph& graph, GNode src) {\n    Node& me = graph.getData(src);\n    if (me.flag != UNMATCHED)\n      return false;\n\n    for (auto ii : graph.edges(src)) {\n      GNode dst  = graph.getEdgeDst(ii);\n      Node& data = graph.getData(dst);\n      if (data.flag == MATCHED)\n        return false;\n    }\n    return true;\n  }\n\n  void match(Graph& graph, GNode src) {\n    Node& me = graph.getData(src);\n    for (auto ii : graph.edges(src)) {\n      GNode dst  = graph.getEdgeDst(ii);\n      Node& data = graph.getData(dst);\n      data.flag  = OTHER_MATCHED;\n    }\n    me.flag = MATCHED;\n  }\n};\n\ntemplate <Algo 
algo>\nstruct DefaultAlgo {\n\n  using Graph = typename galois::graphs::LC_CSR_Graph<\n      Node, void>::template with_numa_alloc<true>::type;\n\n  using GNode = typename Graph::GraphNode;\n\n  struct LocalState {\n    bool mod;\n    explicit LocalState() : mod(false) {}\n  };\n\n  template <galois::MethodFlag Flag>\n  bool build(Graph& graph, GNode src) {\n    Node& me = graph.getData(src, Flag);\n    if (me.flag != UNMATCHED)\n      return false;\n\n    for (auto ii : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n      GNode dst  = graph.getEdgeDst(ii);\n      Node& data = graph.getData(dst, Flag);\n      if (data.flag == MATCHED)\n        return false;\n    }\n    return true;\n  }\n\n  void modify(Graph& graph, GNode src) {\n    Node& me = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n    for (auto ii : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n      GNode dst  = graph.getEdgeDst(ii);\n      Node& data = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n      data.flag  = OTHER_MATCHED;\n    }\n    me.flag = MATCHED;\n  }\n\n  template <typename C>\n  void processNode(Graph& graph, const GNode& src, C& ctx) {\n    bool mod;\n    mod = build<galois::MethodFlag::WRITE>(graph, src);\n    graph.getData(src, galois::MethodFlag::WRITE);\n    ctx.cautiousPoint(); // Failsafe point\n\n    if (mod) {\n      modify(graph, src);\n    }\n  }\n\n  template <typename WL, typename... Args>\n  void run(Graph& graph, Args&&... 
args) {\n\n    auto detID = [](const GNode& x) { return x; };\n\n    galois::for_each(\n        galois::iterate(graph),\n        [&, this](const GNode& src, auto& ctx) {\n          this->processNode(graph, src, ctx);\n        },\n        galois::no_pushes(), galois::wl<WL>(), galois::loopname(\"DefaultAlgo\"),\n        galois::det_id<decltype(detID)>(detID),\n        galois::local_state<LocalState>(), std::forward<Args>(args)...);\n  }\n\n  void operator()(Graph& graph) {\n    using DWL = galois::worklists::Deterministic<>;\n\n    using BSWL = galois::worklists::BulkSynchronous<\n        typename galois::worklists::PerSocketChunkFIFO<64>>;\n\n    switch (algo) {\n    case nondet:\n      run<BSWL>(graph);\n      break;\n    case detBase:\n      run<DWL>(graph);\n      break;\n    default:\n      std::cerr << \"Unknown algorithm\" << algo << \"\\n\";\n      abort();\n    }\n  }\n};\n\nstruct PullAlgo {\n\n  using Graph = galois::graphs::LC_CSR_Graph<Node, void>::with_numa_alloc<\n      true>::type ::with_no_lockable<true>::type;\n\n  using GNode = Graph::GraphNode;\n  using Bag   = galois::InsertBag<GNode>;\n\n  using Counter = galois::GAccumulator<size_t>;\n\n  template <typename R>\n  void pull(const R& range, Graph& graph, Bag& matched, Bag& otherMatched,\n            Bag& next, Counter& numProcessed) {\n\n    galois::do_all(\n        range,\n        [&](const GNode& src) {\n          numProcessed += 1;\n          Node& n = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          if (n.flag == OTHER_MATCHED)\n            return;\n\n          MatchFlag f = MATCHED;\n          for (auto edge :\n               graph.out_edges(src, galois::MethodFlag::UNPROTECTED)) {\n            GNode dst = graph.getEdgeDst(edge);\n            if (dst >= src) {\n              continue;\n            }\n\n            Node& other = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n            if (other.flag == MATCHED) {\n              f = OTHER_MATCHED;\n              
break;\n            } else if (other.flag == UNMATCHED) {\n              f = UNMATCHED;\n            }\n          }\n\n          if (f == UNMATCHED) {\n            next.push_back(src);\n          } else if (f == MATCHED) {\n            matched.push_back(src);\n          } else {\n            otherMatched.push_back(src);\n          }\n        },\n        galois::loopname(\"pull\"));\n  }\n\n  template <MatchFlag F>\n  void take(Bag& bag, Graph& graph, Counter& numTaken) {\n\n    galois::do_all(\n        galois::iterate(bag),\n        [&](const GNode& src) {\n          Node& n = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          numTaken += 1;\n          n.flag = F;\n        },\n        galois::loopname(\"take\"));\n  }\n\n  void operator()(Graph& graph) {\n    size_t rounds = 0;\n    Counter numProcessed;\n    Counter numTaken;\n\n    Bag bags[2];\n    Bag* cur  = &bags[0];\n    Bag* next = &bags[1];\n    Bag matched;\n    Bag otherMatched;\n    uint64_t size  = graph.size();\n    uint64_t delta = graph.size() / 25;\n\n    Graph::iterator ii = graph.begin();\n    Graph::iterator ei = graph.begin();\n\n    while (size > 0) {\n      numProcessed.reset();\n\n      if (!cur->empty()) {\n        pull(galois::iterate(*cur), graph, matched, otherMatched, *next,\n             numProcessed);\n      }\n\n      size_t numCur = numProcessed.reduce();\n      std::advance(ei, std::min(size, delta) - numCur);\n\n      if (ii != ei) {\n        pull(galois::iterate(ii, ei), graph, matched, otherMatched, *next,\n             numProcessed);\n      }\n\n      ii = ei;\n\n      numTaken.reset();\n\n      take<MATCHED>(matched, graph, numTaken);\n      take<OTHER_MATCHED>(otherMatched, graph, numTaken);\n\n      cur->clear();\n      matched.clear();\n      otherMatched.clear();\n      std::swap(cur, next);\n      rounds += 1;\n      assert(size >= numTaken.reduce());\n      size -= numTaken.reduce();\n    }\n\n    
galois::runtime::reportStat_Single(\"IndependentSet-PullAlgo\", \"rounds\",\n                                       rounds);\n  }\n};\n\nstruct PrioAlgo {\n  using Graph = galois::graphs::LC_CSR_Graph<prioNode, void>::with_numa_alloc<\n      true>::type ::with_no_lockable<true>::type;\n  using GNode = Graph::GraphNode;\n\n  unsigned int hash(unsigned int val) const {\n    val = ((val >> 16) ^ val) * 0x45d9f3b;\n    val = ((val >> 16) ^ val) * 0x45d9f3b;\n    return (val >> 16) ^ val;\n  }\n\n  void operator()(Graph& graph) {\n    galois::GAccumulator<size_t> rounds;\n    galois::GAccumulator<float> nedges;\n    galois::GReduceLogicalOr unmatched;\n    galois::substrate::PerThreadStorage<std::mt19937*> generator;\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          nedges += std::distance(\n              graph.edge_begin(src, galois::MethodFlag::UNPROTECTED),\n              graph.edge_end(src, galois::MethodFlag::UNPROTECTED));\n        },\n        galois::loopname(\"cal_degree\"), galois::steal());\n\n    float nedges_tmp = nedges.reduce();\n    float avg_degree = nedges_tmp / (float)graph.size();\n    unsigned char in = ~1;\n    float scale_avg  = ((in / 2) - 1) * avg_degree;\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          prioNode& nodedata =\n              graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          float degree = (float)std::distance(\n              graph.edge_begin(src, galois::MethodFlag::UNPROTECTED),\n              graph.edge_end(src, galois::MethodFlag::UNPROTECTED));\n          float x = degree - hash(src) * 0.00000000023283064365386962890625f;\n          int res = round(scale_avg / (avg_degree + x));\n          unsigned char val = (res + res) | 1;\n          nodedata.flag     = val;\n        },\n        galois::loopname(\"init-prio\"), galois::steal());\n\n    do {\n      unmatched.reset();\n      galois::do_all(\n          
galois::iterate(graph),\n          [&](const GNode& src) {\n            prioNode& nodedata =\n                graph.getData(src, galois::MethodFlag::UNPROTECTED);\n\n            if (!(nodedata.flag & (unsigned char)1))\n              return;\n\n            for (auto edge :\n                 graph.out_edges(src, galois::MethodFlag::UNPROTECTED)) {\n              GNode dst = graph.getEdgeDst(edge);\n\n              prioNode& other =\n                  graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n\n              if (other.flag == (unsigned char)0xfe) { // matched, highest prio\n                nodedata.flag = (unsigned char)0x00;\n                unmatched.update(true);\n                return;\n              }\n\n              if (nodedata.flag > other.flag)\n                continue;\n              else if (nodedata.flag == other.flag) {\n                if (src > dst)\n                  continue;\n                else if (src == dst) {\n                  nodedata.flag = (unsigned char)0x00; // other_matched\n                  return;\n                } else {\n                  unmatched.update(true);\n                  return;\n                }\n              } else {\n                unmatched.update(true);\n                return;\n              }\n            }\n            nodedata.flag = (unsigned char)0xfe; // matched, highest prio\n          },\n          galois::loopname(\"execute\"), galois::steal());\n\n      rounds += 1;\n    } while (unmatched.reduce());\n\n    galois::runtime::reportStat_Single(\"IndependentSet-prioAlgo\", \"rounds\",\n                                       rounds.reduce());\n  }\n};\n\nstruct EdgeTiledPrioAlgo {\n  using Graph = galois::graphs::LC_CSR_Graph<prioNode, void>::with_numa_alloc<\n      true>::type ::with_no_lockable<true>::type;\n  using GNode = Graph::GraphNode;\n\n  struct EdgeTile {\n    GNode src;\n    Graph::edge_iterator beg;\n    Graph::edge_iterator end;\n    bool flag;\n  };\n\n  unsigned int 
hash(unsigned int val) const {\n    val = ((val >> 16) ^ val) * 0x45d9f3b;\n    val = ((val >> 16) ^ val) * 0x45d9f3b;\n    return (val >> 16) ^ val;\n  }\n\n  void operator()(Graph& graph) {\n    galois::GAccumulator<size_t> rounds;\n    galois::GAccumulator<float> nedges;\n    galois::GReduceLogicalOr unmatched;\n    galois::substrate::PerThreadStorage<std::mt19937*> generator;\n    galois::InsertBag<EdgeTile> works;\n    const int EDGE_TILE_SIZE = 64;\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          nedges += std::distance(\n              graph.edge_begin(src, galois::MethodFlag::UNPROTECTED),\n              graph.edge_end(src, galois::MethodFlag::UNPROTECTED));\n        },\n        galois::loopname(\"cal_degree\"), galois::steal());\n\n    float nedges_tmp = nedges.reduce();\n    float avg_degree = nedges_tmp / (float)graph.size();\n    unsigned char in = ~1;\n    float scale_avg  = ((in / 2) - 1) * avg_degree;\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          prioNode& nodedata =\n              graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          auto beg = graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n          const auto end = graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n\n          float degree = (float)std::distance(beg, end);\n          float x = degree - hash(src) * 0.00000000023283064365386962890625f;\n          int res = round(scale_avg / (avg_degree + x));\n          unsigned char val = (res + res) | 0x03;\n\n          nodedata.flag = val;\n          assert(beg <= end);\n          if ((end - beg) > EDGE_TILE_SIZE) {\n            for (; beg + EDGE_TILE_SIZE < end;) {\n              auto ne = beg + EDGE_TILE_SIZE;\n              assert(ne < end);\n              works.push_back(EdgeTile{src, beg, ne, false});\n              beg = ne;\n            }\n          }\n          if ((end - beg) > 0) {\n            
works.push_back(EdgeTile{src, beg, end, false});\n          }\n        },\n        galois::loopname(\"init-prio\"), galois::steal());\n\n    do {\n      unmatched.reset();\n      galois::do_all(\n          galois::iterate(works),\n          [&](EdgeTile& tile) {\n            GNode src = tile.src;\n\n            prioNode& nodedata =\n                graph.getData(src, galois::MethodFlag::UNPROTECTED);\n\n            if ((nodedata.flag & (unsigned char){1})) { // is undecided\n\n              for (auto edge = tile.beg; edge != tile.end; ++edge) {\n                GNode dst = graph.getEdgeDst(edge);\n\n                prioNode& other =\n                    graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n\n                if (other.flag ==\n                    (unsigned char){0xfe}) { // permanent matched, highest prio\n                  nodedata.flag = (unsigned char){0x00};\n                  return;\n                }\n\n                if (nodedata.flag > other.flag)\n                  continue;\n                else if (nodedata.flag == other.flag) {\n                  if (src > dst)\n                    continue;\n                  else if (src == dst) {\n                    nodedata.flag = (unsigned char){0x00}; // other_matched\n                    tile.flag     = false;\n                    return;\n                  } else {\n                    tile.flag = false;\n                    unmatched.update(true);\n                    return;\n                  }\n                } else {\n                  tile.flag = false;\n                  unmatched.update(true);\n                  return;\n                }\n              }\n              tile.flag = true; // temporary-matched\n            }\n          },\n          galois::loopname(\"execute\"), galois::steal());\n\n      galois::do_all(\n          galois::iterate(works),\n          [&](EdgeTile& tile) {\n            auto src = tile.src;\n            prioNode& nodedata =\n                
graph.getData(src, galois::MethodFlag::UNPROTECTED);\n\n            if ((nodedata.flag & (unsigned char){1}) &&\n                tile.flag == false) { // undecided and temporary no\n              nodedata.flag &=\n                  (unsigned char){0xfd}; // 0x1111 1101, not temporary yes\n            }\n          },\n          galois::loopname(\"match_reduce\"), galois::steal());\n\n      galois::do_all(\n          galois::iterate(graph),\n          [&](const GNode& src) {\n            prioNode& nodedata =\n                graph.getData(src, galois::MethodFlag::UNPROTECTED);\n            if ((nodedata.flag & (unsigned char){0x01}) != 0) { // undecided\n              if ((nodedata.flag & (unsigned char){0x02}) !=\n                  0) { // temporary yes\n                nodedata.flag =\n                    (unsigned char){0xfe}; // 0x1111 1110, permanent yes\n                for (auto edge :\n                     graph.out_edges(src, galois::MethodFlag::UNPROTECTED)) {\n                  GNode dst = graph.getEdgeDst(edge);\n\n                  prioNode& other =\n                      graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n                  other.flag =\n                      (unsigned char){0x00}; // OTHER_MATCHED, permanent no\n                }\n              } else\n                nodedata.flag |=\n                    (unsigned char){0x03}; // 0x0000 0011, temp yes, undecided\n            }\n          },\n          galois::loopname(\"match_update\"), galois::steal());\n\n      rounds += 1;\n    } while (unmatched.reduce());\n\n    galois::runtime::reportStat_Single(\"IndependentSet-prioAlgo\", \"rounds\",\n                                       rounds.reduce());\n  }\n};\n\ntemplate <typename Graph>\nstruct is_bad {\n  using GNode = typename Graph::GraphNode;\n  using Node  = typename Graph::node_data_type;\n  Graph& graph;\n\n  is_bad(Graph& g) : graph(g) {}\n\n  bool operator()(GNode n) const {\n    Node& me = graph.getData(n);\n    if (me.flag 
== MATCHED) {\n      for (auto ii : graph.edges(n)) {\n        GNode dst  = graph.getEdgeDst(ii);\n        Node& data = graph.getData(dst);\n        if (dst != n && data.flag == MATCHED) {\n          std::cerr << \"double match\\n\";\n          return true;\n        }\n      }\n    } else if (me.flag == UNMATCHED) {\n      bool ok = false;\n      for (auto ii : graph.edges(n)) {\n        GNode dst  = graph.getEdgeDst(ii);\n        Node& data = graph.getData(dst);\n        if (data.flag != UNMATCHED) {\n          ok = true;\n        }\n      }\n      if (!ok) {\n        std::cerr << \"not maximal\\n\";\n        return true;\n      }\n    }\n    return false;\n  }\n};\n\ntemplate <typename Graph>\nstruct is_matched {\n  Graph& graph;\n  using GNode = typename Graph::GraphNode;\n\n  is_matched(Graph& g) : graph(g) {}\n\n  bool operator()(const GNode& n) const {\n    return graph.getData(n).flag == MATCHED;\n  }\n};\n\ntemplate <typename Graph, typename Algo>\nbool verify(Graph& graph, Algo&) {\n  using GNode    = typename Graph::GraphNode;\n  using prioNode = typename Graph::node_data_type;\n\n  if (std::is_same<Algo, PrioAlgo>::value ||\n      std::is_same<Algo, EdgeTiledPrioAlgo>::value) {\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          prioNode& nodedata =\n              graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          if (nodedata.flag == (unsigned char){0xfe}) {\n            nodedata.flag = MATCHED;\n          } else if (nodedata.flag == (unsigned char){0x00}) {\n            nodedata.flag = OTHER_MATCHED;\n          } else\n            std::cout << \"error in verify_change! 
Some nodes are not decided.\"\n                      << \"\\n\";\n        },\n        galois::loopname(\"verify_change\"));\n  }\n\n  return galois::ParallelSTL::find_if(graph.begin(), graph.end(),\n                                      is_bad<Graph>(graph)) == graph.end();\n}\n\ntemplate <typename Algo>\nvoid run() {\n  using Graph = typename Algo::Graph;\n  using GNode = typename Graph::GraphNode;\n\n  Algo algo;\n  Graph graph;\n  galois::graphs::readGraph(graph, inputFile);\n\n  // galois::preAlloc(numThreads + (graph.size() * sizeof(Node) * numThreads /\n  // 8) / galois::runtime::MM::hugePageSize); Tighter upper bound\n  if (std::is_same<Algo, DefaultAlgo<nondet>>::value) {\n    galois::preAlloc(numThreads +\n                     16 * graph.size() / galois::runtime::pagePoolSize());\n  } else {\n    galois::preAlloc(numThreads + 64 * (sizeof(GNode) + sizeof(Node)) *\n                                      graph.size() /\n                                      galois::runtime::pagePoolSize());\n  }\n\n  galois::reportPageAlloc(\"MeminfoPre\");\n  galois::StatTimer execTime(\"Timer_0\");\n\n  execTime.start();\n  algo(graph);\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  if (!skipVerify && !verify(graph, algo)) {\n    std::cerr << \"verification failed\\n\";\n    assert(0 && \"verification failed\");\n    abort();\n  }\n\n  std::cout << \"Cardinality of maximal independent set: \"\n            << galois::ParallelSTL::count_if(graph.begin(), graph.end(),\n                                             is_matched<Graph>(graph))\n            << \"\\n\";\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"independent set requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input 
is a symmetric graph\");\n  }\n\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  switch (algo) {\n  case serial:\n    run<SerialAlgo>();\n    break;\n  case nondet:\n    run<DefaultAlgo<nondet>>();\n    break;\n  case detBase:\n    run<DefaultAlgo<detBase>>();\n    break;\n  case pull:\n    run<PullAlgo>();\n    break;\n  case prio:\n    run<PrioAlgo>();\n    break;\n  case edgetiledprio:\n    run<EdgeTiledPrioAlgo>();\n    break;\n  default:\n    std::cerr << \"Unknown algorithm\" << algo << \"\\n\";\n    abort();\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/independentset/README.md",
    "content": "Maximal Independent Set\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nFind the Maximal Independent Set (not maximum) of ndoes in an undirected \n(symmetric) graph. \n\nFor convenience, we used IN to represent a node in the independent set, OUT to \nrepresent not in the independent set, UNDECIDED represent undecided.\n\n- serial: serial greedy version.\n- pull: pull-based greedy version. Node 0 is initially marked IN.\n- detBase: greedy version, using Galois deterministic worklist.\n- nondet: greedy version, using Galois bulk synchronous worklist.\n- prio(default): based on Martin Butcher's GPU ECL-MIS algorithm. For more information,\n  please look at http://cs.txstate.edu/~burtscher/research/ECL-MIS/.\n- edgetiledprio: edge-tiled version of prio.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric Galois .gr graphs.\nYou must specify the -symmetricGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/independentset/; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run default algorithm (prio), use the following:\n-`$ ./maximal-independentset-cpu <input-graph (symmetric)> -t=<num-threads> -symmetricGraph`\n\nTo run a specific algorithm, use the following:\n-`$ ./maximal-independentset-cpu <input-graph (symmetric)> -t=<num-threads> -algo=<algorithm> -symmetricGraph`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\nIn 'prio', when a node has high priority than all of its neighbors, it is marked \nas IN. 
For its neighbors, you can choose either 1) update its neighbors to OUT in \nsame round (Push), or 2) next round its neighbors check if they have an IN neighbor, \nand update themselves to OUT (Pull).\nFirst method works better on none-power-law graphs. Second method works better \non power-law graphs. \n"
  },
  {
    "path": "lonestar/analytics/cpu/k-core/CMakeLists.txt",
    "content": "add_executable(k-core-cpu kcore.cpp)\nadd_dependencies(apps k-core-cpu)\ntarget_link_libraries(k-core-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS k-core-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_scale(small k-core-cpu --kcore=4 -symmetricGraph \"${BASEINPUT}/scalefree/symmetric/rmat10.sgr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/k-core/README.md",
    "content": "K-Core Decomposition\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nFinds the <b>k-core</b> in a graph. A k-core of a graph G is defined as a maxiaml\nconnected subgraph in which all vertices have degree at least k.\n\nThis is a parallel worklist push-style implementation. The initial worklist consists\nof nodes that have degree less than k. These nodes will decrement the degree\nof their neighbors, and the first time a neighbor's degree falls under the\nspecified k value, it will be added onto the worklist so it can decrement\nits neighbors as it is considered removed from the graph.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric Galois .gr graphs.\nYou must specify the -symmetricGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/k-core/; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run on machine with a k value of 4, use the following:\n`./k-core-cpu <symmetric-input-graph> -t=<num-threads> -kcore=4 -symmetricGraph`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nWorklist chunk size (specified as a constant in the source code) may affect\nperformance based on the input provided to k-core.\n\nThere is preallocation of pages before the main computation begins: if the\nstatistics reported at the end of computation indicate that pages\nwere allocated during computation (i.e., MemAllocMid is less than MemAllocPost),\nyou may need to change how many pages are preallocated before computation.\n"
  },
  {
    "path": "lonestar/analytics/cpu/k-core/kcore.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause\n * BSD License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/gstl.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\nconstexpr static const char* const REGION_NAME = \"k-core\";\nconstexpr static const char* const name        = \"k-core\";\nconstexpr static const char* const desc        = \"Finds the k-core of a graph, \"\n                                          \"defined as the subgraph where\"\n                                          \" all vertices have degree at \"\n                                          \"least k.\";\n\n/*******************************************************************************\n * Declaration of command line arguments\n 
******************************************************************************/\nnamespace cll = llvm::cl;\n\nenum Algo { Async = 0, Sync };\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\n\n//! Choose algorithm: worklist vs. sync.\nstatic cll::opt<Algo> algo(\"algo\",\n                           cll::desc(\"Choose an algorithm (default Sync):\"),\n                           cll::values(clEnumVal(Async, \"Asynchronous\"),\n                                       clEnumVal(Sync, \"Synchronous\")),\n                           cll::init(Sync));\n\n//! Required k specification for k-core.\nstatic cll::opt<unsigned int> k_core_num(\"kcore\", cll::desc(\"k-core value\"),\n                                         cll::Required);\n\n/*******************************************************************************\n * Graph structure declarations + other inits\n ******************************************************************************/\n\n//! Node deadness can be derived from current degree and k value, so no field\n//! necessary.\nstruct NodeData {\n  std::atomic<uint32_t> currentDegree;\n};\n\n//! Typedef for graph used, CSR graph (edge-type is void).\nusing Graph =\n    galois::graphs::LC_CSR_Graph<NodeData, void>::with_no_lockable<true>::type;\n//! Typedef for node type in the CSR graph.\nusing GNode = Graph::GraphNode;\n\n//! Chunksize for for_each worklist: best chunksize will depend on input.\nconstexpr static const unsigned CHUNK_SIZE = 64u;\n\n/*******************************************************************************\n * Functions for running the algorithm\n ******************************************************************************/\n\n/**\n * Initialize degree fields in graph with current degree. 
Since symmetric,\n * out edge count is equivalent to in-edge count.\n *\n * @param graph Graph to initialize degrees in\n */\nvoid degreeCounting(Graph& graph) {\n  galois::do_all(\n      galois::iterate(graph.begin(), graph.end()),\n      [&](GNode curNode) {\n        NodeData& curData = graph.getData(curNode);\n        curData.currentDegree.store(\n            std::distance(graph.edge_begin(curNode), graph.edge_end(curNode)));\n      },\n      galois::loopname(\"DegreeCounting\"), galois::no_stats());\n};\n\n/**\n * Setup initial worklist of dead nodes.\n *\n * @param graph Graph to operate on\n * @param initialWorklist Empty worklist to be filled with dead nodes.\n */\nvoid setupInitialWorklist(Graph& graph,\n                          galois::InsertBag<GNode>& initialWorklist) {\n  galois::do_all(\n      galois::iterate(graph.begin(), graph.end()),\n      [&](GNode curNode) {\n        NodeData& curData = graph.getData(curNode);\n        if (curData.currentDegree < k_core_num) {\n          //! Dead node, add to initialWorklist for processing later.\n          initialWorklist.emplace(curNode);\n        }\n      },\n      galois::loopname(\"InitialWorklistSetup\"), galois::no_stats());\n}\n\n/**\n * Starting with initial dead nodes as current worklist; decrement degree;\n * add to next worklist; switch next with current and repeat until worklist\n * is empty (i.e. no more dead nodes).\n *\n * @param graph Graph to operate on\n */\nvoid syncCascadeKCore(Graph& graph) {\n  galois::InsertBag<GNode>* current = new galois::InsertBag<GNode>;\n  galois::InsertBag<GNode>* next    = new galois::InsertBag<GNode>;\n\n  //! Setup worklist.\n  setupInitialWorklist(graph, *next);\n\n  while (!next->empty()) {\n    //! Make \"next\" into current.\n    std::swap(current, next);\n    next->clear();\n\n    galois::do_all(\n        galois::iterate(*current),\n        [&](GNode deadNode) {\n          //! 
Decrement degree of all neighbors.\n          for (auto e : graph.edges(deadNode)) {\n            GNode dest         = graph.getEdgeDst(e);\n            NodeData& destData = graph.getData(dest);\n            uint32_t oldDegree =\n                galois::atomicSubtract(destData.currentDegree, 1u);\n\n            if (oldDegree == k_core_num) {\n              //! This thread was responsible for putting degree of destination\n              //! below threshold; add to worklist.\n              next->emplace(dest);\n            }\n          }\n        },\n        galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n        galois::loopname(\"SyncCascadeDeadNodes\"));\n  }\n\n  delete current;\n  delete next;\n}\n\n/**\n * Starting with initial dead nodes, decrement degree and add to worklist\n * as they drop below 'k' threshold until worklist is empty (i.e. no more dead\n * nodes).\n *\n * @param graph Graph to operate on\n * @param initialWorklist Worklist containing initial dead nodes\n */\nvoid asyncCascadeKCore(Graph& graph,\n                       galois::InsertBag<GNode>& initialWorklist) {\n  galois::for_each(\n      galois::iterate(initialWorklist),\n      [&](GNode deadNode, auto& ctx) {\n        //! Decrement degree of all neighbors.\n        for (auto e : graph.edges(deadNode)) {\n          GNode dest         = graph.getEdgeDst(e);\n          NodeData& destData = graph.getData(dest);\n          uint32_t oldDegree =\n              galois::atomicSubtract(destData.currentDegree, 1u);\n\n          if (oldDegree == k_core_num) {\n            //! This thread was responsible for putting degree of destination\n            //! 
below threshold: add to worklist.\n            ctx.push(dest);\n          }\n        }\n      },\n      galois::disable_conflict_detection(), galois::chunk_size<CHUNK_SIZE>(),\n      galois::loopname(\"AsyncCascadeDeadNodes\"));\n}\n\n/*******************************************************************************\n * Sanity check operators\n ******************************************************************************/\n\n/**\n * Print number of nodes that are still alive.\n *\n * @param graph Graph to get alive count of\n */\nvoid kCoreSanity(Graph& graph) {\n  galois::GAccumulator<uint32_t> aliveNodes;\n  aliveNodes.reset();\n\n  galois::do_all(\n      galois::iterate(graph.begin(), graph.end()),\n      [&](GNode curNode) {\n        NodeData& curData = graph.getData(curNode);\n        if (curData.currentDegree >= k_core_num) {\n          aliveNodes += 1;\n        }\n      },\n      galois::loopname(\"KCoreSanityCheck\"), galois::no_stats());\n\n  galois::gPrint(\"Number of nodes in the \", k_core_num, \"-core is \",\n                 aliveNodes.reduce(), \"\\n\");\n}\n\n/*******************************************************************************\n * Main method for running\n ******************************************************************************/\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, nullptr, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  //! 
Some initial stat reporting.\n  galois::gInfo(\"Worklist chunk size of \", CHUNK_SIZE,\n                \": best size may depend\"\n                \" on input.\");\n  galois::runtime::reportStat_Single(REGION_NAME, \"ChunkSize\", CHUNK_SIZE);\n  galois::reportPageAlloc(\"MemAllocPre\");\n\n  //! Read graph from disk.\n  galois::StatTimer graphReadingTimer(\"GraphConstructTime\", REGION_NAME);\n  graphReadingTimer.start();\n  Graph graph;\n  galois::graphs::readGraph(graph, inputFile);\n  graphReadingTimer.stop();\n\n  //! Preallocate pages in memory so allocation doesn't occur during compute.\n  galois::StatTimer preallocTime(\"PreAllocTime\", REGION_NAME);\n  preallocTime.start();\n  galois::preAlloc(\n      std::max(size_t{galois::getActiveThreads()} * (graph.size() / 1000000),\n               std::max(10U, galois::getActiveThreads()) * size_t{10}));\n  preallocTime.stop();\n  galois::reportPageAlloc(\"MemAllocMid\");\n\n  //! Intialization of degrees.\n  degreeCounting(graph);\n\n  //! Begins main computation.\n  galois::StatTimer execTime(\"Timer_0\");\n\n  execTime.start();\n\n  if (algo == Async) {\n    galois::gInfo(\"Running asynchronous k-core with k-core number \",\n                  k_core_num);\n    //! Worklist setup of initial dead ndoes.\n    galois::InsertBag<GNode> initialWorklist;\n    setupInitialWorklist(graph, initialWorklist);\n    //! Actual work; propagate deadness by decrementing degrees and adding dead\n    //! nodes to worklist.\n    asyncCascadeKCore(graph, initialWorklist);\n  } else if (algo == Sync) {\n    galois::gInfo(\"Running synchronous k-core with k-core number \", k_core_num);\n    //! Synchronous k-core.\n    syncCascadeKCore(graph);\n  } else {\n    GALOIS_DIE(\"invalid specification of k-core algorithm\");\n  }\n\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MemAllocPost\");\n\n  //! Sanity check.\n  if (!skipVerify) {\n    kCoreSanity(graph);\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/k-truss/CMakeLists.txt",
    "content": "add_executable(k-truss-cpu K-Truss.cpp)\nadd_dependencies(apps k-truss-cpu)\ntarget_link_libraries(k-truss-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS k-truss-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_executable(verify-k-truss Verify.cpp)\nadd_dependencies(apps verify-k-truss)\ntarget_link_libraries(verify-k-truss PRIVATE Galois::shmem lonestar)\ninstall(TARGETS verify-k-truss DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small k-truss-cpu -trussNum=4 -symmetricGraph \"${BASEINPUT}/scalefree/symmetric/rmat10.sgr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/k-truss/K-Truss.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause\n * BSD License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"galois/runtime/Statistics.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <iostream>\n#include <deque>\n#include <algorithm>\n#include <fstream>\n#include <memory>\n\nenum Algo {\n  bspJacobi,\n  bsp,\n  bspCoreThenTruss,\n};\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Maximal k-trusses\";\nstatic const char* desc =\n    \"Computes the maximal k-trusses for a given undirected graph\";\nstatic const char* url = \"k_truss\";\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic 
cll::opt<unsigned int>\n    trussNum(\"trussNum\", cll::desc(\"report trussNum-trusses\"), cll::Required);\n\nstatic cll::opt<std::string>\n    outName(\"o\", cll::desc(\"output file for the edgelist of resulting truss\"));\n\nstatic cll::opt<Algo> algo(\n    \"algo\", cll::desc(\"Choose an algorithm:\"),\n    cll::values(\n        clEnumValN(Algo::bspJacobi, \"bspJacobi\",\n                   \"Bulk-synchronous parallel with separated edge removal\"),\n        clEnumValN(Algo::bsp, \"bsp\", \"Bulk-synchronous parallel (default)\"),\n        clEnumValN(Algo::bspCoreThenTruss, \"bspCoreThenTruss\",\n                   \"Compute k-1 core and then k-truss\")),\n    cll::init(Algo::bsp));\n\n//! Set LSB of an edge weight to indicate the removal of the edge.\nusing Graph =\n    galois::graphs::LC_CSR_Graph<void, uint32_t>::template with_numa_alloc<\n        true>::type::template with_no_lockable<true>::type;\n\nusing GNode   = Graph::GraphNode;\nusing Edge    = std::pair<GNode, GNode>;\nusing EdgeVec = galois::InsertBag<Edge>;\nusing NodeVec = galois::InsertBag<GNode>;\n\ntemplate <typename T>\nusing PerIterAlloc = typename galois::PerIterAllocTy::rebind<T>::other;\n\nstatic const uint32_t valid   = 0x0;\nstatic const uint32_t removed = 0x1;\n\n#if 0 ///< Deprecated codes.\n///< TODO We can restore the asynchronous ktruss.\n\n/**\n * Get the common negihbors between the node src and the dst.\n *\n * @param g\n * @param src the target src node.\n * @param dst the target dst node.\n * @param a\n *\n * @return dequeue containing the common neighbors between the src and the dst.\n */\nstd::deque<GNode, PerIterAlloc<GNode>>\ngetValidCommonNeighbors(Graph& g, GNode src, GNode dst,\n                        galois::PerIterAllocTy& a,\n                        galois::MethodFlag flag = galois::MethodFlag::WRITE) {\n  auto srcI = g.edge_begin(src, flag), srcE = g.edge_end(src, flag),\n       dstI = g.edge_begin(dst, flag), dstE = g.edge_end(dst, flag);\n  std::deque<GNode, 
PerIterAlloc<GNode>> commonNeighbors(a);\n\n  while (true) {\n    //! Find the first valid edge.\n    while (srcI != srcE && (g.getEdgeData(srcI) & removed)) {\n      ++srcI;\n    }\n\n    while (dstI != dstE && (g.getEdgeData(dstI) & removed)) {\n      ++dstI;\n    }\n\n    //! Check for intersection.\n    auto sN = g.getEdgeDst(srcI), dN = g.getEdgeDst(dstI);\n\n    if (srcI == srcE || dstI == dstE) {\n      break;\n    }\n\n    if (sN < dN) {\n      ++srcI;\n    } else if (dN < sN) {\n      ++dstI;\n    } else {\n      commonNeighbors.push_back(sN);\n      ++srcI;\n      ++dstI;\n    }\n  }\n  return commonNeighbors;\n}\n\n/**\n * AsyncTrussTxAlgo:\n * 1. Compute support for all edges and pick out unsupported ones.\n * 2. Remove unsupported edges, decrease the support for affected edges and pick\n *    out those becomeing unsupported.\n * 3. Repeat 2. until no more unsupported edges are found.\n *\n * edges update in default Galois sync model, i.e. transactional semantics.\n */\nstruct AsyncTrussTxAlgo {\n  std::string name() { return \"asyncTx\"; }\n\n  struct PickUnsupportedEdges {\n    Graph& g;\n    unsigned int j;\n    EdgeVec& r;\n\n    PickUnsupportedEdges(Graph& g, unsigned int j, EdgeVec& r)\n        : g(g), j(j), r(r) {}\n\n    void operator()(Edge e, galois::UserContext<Edge>& ctx) {\n      auto src = e.first, dst = e.second;\n      std::deque<GNode, PerIterAlloc<GNode>> commonNeighbors =\n          getValidCommonNeighbors(g, src, dst, ctx.getPerIterAlloc(),\n                                  galois::MethodFlag::UNPROTECTED);\n      auto numValidCommonNeighbors = commonNeighbors.size();\n\n      g.getEdgeData(g.findEdgeSortedByDst(src, dst)) =\n                                          (numValidCommonNeighbors << 1);\n      g.getEdgeData(g.findEdgeSortedByDst(dst, src)) =\n                                          (numValidCommonNeighbors << 1);\n      if (numValidCommonNeighbors < j) {\n        r.push_back(e);\n      }\n    }\n  };\n\n  struct 
PropagateEdgeRemoval {\n    Graph& g;\n    unsigned int j;\n\n    PropagateEdgeRemoval(Graph& g, unsigned int j) : g(g), j(j) {}\n\n    void removeUnsupportedEdge(GNode src, GNode dst,\n                               galois::UserContext<Edge>& ctx) {\n      auto& oeData = g.getEdgeData(g.findEdgeSortedByDst(src, dst));\n      auto& ieData = g.getEdgeData(g.findEdgeSortedByDst(dst, src));\n\n      auto newSupport = (oeData >> 1) - 1;\n      oeData          = (newSupport << 1);\n      ieData          = (newSupport << 1);\n      if (newSupport < j) {\n        ctx.push(std::make_pair(src, dst));\n      }\n    }\n\n    void operator()(Edge e, galois::UserContext<Edge>& ctx) {\n      auto src = e.first, dst = e.second;\n\n      //! Lock src's neighbors.\n      auto& oeData = g.getEdgeData(g.findEdgeSortedByDst(src, dst));\n      //! Lock src's neighbors' neighbors for back edges from them to src's\n      //! neighbors.\n      for (auto ei : g.edges(src)) {\n        g.edges(g.getEdgeDst(ei));\n      }\n\n      //! Lock dst's neighbors.\n      auto& ieData = g.getEdgeData(g.findEdgeSortedByDst(dst, src));\n      //! Lock dst's neighbors' neighbors for back edge from them to dst's\n      //! neighbors.\n      for (auto ei : g.edges(dst)) {\n        g.edges(g.getEdgeDst(ei));\n      }\n\n      //! Avoid repeated processing.\n      if (oeData & removed) {\n        return;\n      }\n\n      //! Mark as removed.\n      oeData = removed;\n      ieData = removed;\n\n      //! Propagate edge removal.\n      std::deque<GNode, PerIterAlloc<GNode>> commonNeighbors =\n          getValidCommonNeighbors(g, src, dst, ctx.getPerIterAlloc());\n      for (auto n : commonNeighbors) {\n        removeUnsupportedEdge(((n < src) ? n : src), ((n < src) ? src : n),\n                              ctx);\n        removeUnsupportedEdge(((n < dst) ? n : dst), ((n < dst) ? 
dst : n),\n                              ctx);\n      }\n    }\n  };\n\n  void operator()(Graph& g, unsigned int k) {\n    if (k - 2 == 0) {\n      return;\n    }\n\n    EdgeVec work, unsupported;\n\n    //! Symmetry breaking:\n    //! Consider only edges (i, j) where i < j.\n    galois::do_all(galois::iterate(g),\n                   [&g, &work](GNode n) {\n                     for (auto e :\n                          g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n                       auto dst = g.getEdgeDst(e);\n                       if (dst > n) {\n                         work.push_back(std::make_pair(n, dst));\n                       }\n                     }\n                   },\n                   galois::steal());\n\n    galois::for_each(galois::iterate(work),\n                     PickUnsupportedEdges{g, k - 2, unsupported},\n                     galois::loopname(\"PickUnsupportedEdges\"),\n                     galois::disable_conflict_detection(), galois::no_pushes(),\n                     galois::per_iter_alloc());\n\n    galois::for_each(galois::iterate(unsupported),\n                     PropagateEdgeRemoval{g, k - 2},\n                     galois::loopname(\"PropagateEdgeRemoval\"),\n                     galois::per_iter_alloc());\n  } ///< End operator().\n};  ///< End AsyncTrussTxAlgo.\n\n#endif\n\n/**\n * Initialize edge data to valid.\n */\ntemplate <typename Graph>\nvoid initialize(Graph& g) {\n  g.sortAllEdgesByDst();\n\n  //! 
Initializa all edges to valid.\n  galois::do_all(\n      galois::iterate(g),\n      [&g](typename Graph::GraphNode N) {\n        for (auto e : g.edges(N, galois::MethodFlag::UNPROTECTED)) {\n          g.getEdgeData(e) = valid;\n        }\n      },\n      galois::steal());\n}\n\n/**\n * Dump ktruss for each node to a file.\n */\ntemplate <typename Graph>\nvoid reportKTruss(Graph& g) {\n  if (outName.empty()) {\n    return;\n  }\n\n  std::ofstream of(outName);\n  if (!of.is_open()) {\n    std::cerr << \"Cannot open \" << outName << \" for output.\\n\";\n    return;\n  }\n\n  for (auto n : g) {\n    for (auto e : g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n      auto dst = g.getEdgeDst(e);\n      if (n < dst && (g.getEdgeData(e) & 0x1) != removed) {\n        of << n << \" \" << dst << \" \" << g.getEdgeData(e) << \"\\n\";\n      }\n    }\n  }\n\n  of.close();\n}\n\n/**\n * Check if the number of valid edges is more than or equal to j.\n * If it is, then the node n still could be processed.\n * Otherwise, the node n will be ignored in the following steps.\n *\n * @param g\n * @param n the target node n to be tested\n * @param j the target number of triangels\n *\n * @return true if the target node n has the number of degrees\n *         more than or equal to j\n */\nbool isValidDegreeNoLessThanJ(Graph& g, GNode n, unsigned int j) {\n  size_t numValid = 0;\n  for (auto e : g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n    if (!(g.getEdgeData(e) & removed)) {\n      numValid += 1;\n      if (numValid >= j) {\n        return true;\n      }\n    }\n  }\n  return numValid >= j;\n}\n\n/**\n * Measure the number of intersected edges between the src and the dst nodes.\n *\n * @param g\n * @param src the source node\n * @param dst the destination node\n * @param j the number of the target triangles\n *\n * @return true if the src and the dst are included in more than j triangles\n */\nbool isSupportNoLessThanJ(Graph& g, GNode src, GNode dst, unsigned int j) {\n  size_t 
numValidEqual = 0;\n  auto srcI            = g.edge_begin(src, galois::MethodFlag::UNPROTECTED),\n       srcE            = g.edge_end(src, galois::MethodFlag::UNPROTECTED),\n       dstI            = g.edge_begin(dst, galois::MethodFlag::UNPROTECTED),\n       dstE            = g.edge_end(dst, galois::MethodFlag::UNPROTECTED);\n\n  while (true) {\n    //! Find the first valid edge.\n    while (srcI != srcE && (g.getEdgeData(srcI) & removed)) {\n      ++srcI;\n    }\n    while (dstI != dstE && (g.getEdgeData(dstI) & removed)) {\n      ++dstI;\n    }\n\n    if (srcI == srcE || dstI == dstE) {\n      return numValidEqual >= j;\n    }\n\n    //! Check for intersection.\n    auto sN = g.getEdgeDst(srcI), dN = g.getEdgeDst(dstI);\n    if (sN < dN) {\n      ++srcI;\n    } else if (dN < sN) {\n      ++dstI;\n    } else {\n      numValidEqual += 1;\n      if (numValidEqual >= j) {\n        return true;\n      }\n      ++srcI;\n      ++dstI;\n    }\n  }\n\n  return numValidEqual >= j;\n}\n\n/**\n * BSPTrussJacobiAlgo:\n * 1. Scan for unsupported edges.\n * 2. If no unsupported edges are found, done.\n * 3. Remove unsupported edges in a separated loop.\n *    TODO why would it be processed in a separted loop?\n * 4. Go back to 1.\n */\nstruct BSPTrussJacobiAlgo {\n  std::string name() { return \"bsp\"; }\n\n  struct PickUnsupportedEdges {\n    Graph& g;\n    unsigned int j;\n    EdgeVec& r; ///< unsupported\n    EdgeVec& s; ///< next\n\n    PickUnsupportedEdges(Graph& g, unsigned int j, EdgeVec& r, EdgeVec& s)\n        : g(g), j(j), r(r), s(s) {}\n\n    void operator()(Edge e) {\n      EdgeVec& w = isSupportNoLessThanJ(g, e.first, e.second, j) ? s : r;\n      w.push_back(e);\n    }\n  };\n\n  void operator()(Graph& g, unsigned int k) {\n    if (k - 2 == 0) {\n      return;\n    }\n\n    EdgeVec unsupported, work[2];\n    EdgeVec *cur = &work[0], *next = &work[1];\n\n    //! Symmetry breaking:\n    //! 
Consider only edges (i, j) where i < j.\n    galois::do_all(\n        galois::iterate(g),\n        [&](GNode n) {\n          for (auto e : g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n            auto dst = g.getEdgeDst(e);\n            if (dst > n) {\n              cur->push_back(std::make_pair(n, dst));\n            }\n          }\n        },\n        galois::steal());\n\n    while (true) {\n      galois::do_all(galois::iterate(*cur),\n                     PickUnsupportedEdges{g, k - 2, unsupported, *next},\n                     galois::steal());\n\n      if (std::distance(unsupported.begin(), unsupported.end()) == 0) {\n        break;\n      }\n\n      //! Mark unsupported edges as removed.\n      galois::do_all(\n          galois::iterate(unsupported),\n          [&](Edge e) {\n            g.getEdgeData(g.findEdgeSortedByDst(e.first, e.second)) = removed;\n            g.getEdgeData(g.findEdgeSortedByDst(e.second, e.first)) = removed;\n          },\n          galois::steal());\n\n      unsupported.clear();\n      cur->clear();\n      std::swap(cur, next);\n    }\n  } ///< End operator()\n};  ///< End struct BSPTrussJacobiAlgo\n\n/**\n * BSPTrussAlgo:\n * 1. Keep supported edges and remove unsupported edges.\n * 2. If all edges are kept, done.\n * 3. 
Go back to 3.\n */\nstruct BSPTrussAlgo {\n  std::string name() { return \"bsp\"; }\n\n  struct KeepSupportedEdges {\n    Graph& g;\n    unsigned int j;\n    EdgeVec& s;\n\n    KeepSupportedEdges(Graph& g, unsigned int j, EdgeVec& s)\n        : g(g), j(j), s(s) {}\n\n    void operator()(Edge e) {\n      if (isSupportNoLessThanJ(g, e.first, e.second, j)) {\n        s.push_back(e);\n      } else {\n        g.getEdgeData(g.findEdgeSortedByDst(e.first, e.second)) = removed;\n        g.getEdgeData(g.findEdgeSortedByDst(e.second, e.first)) = removed;\n      }\n    }\n  };\n\n  void operator()(Graph& g, unsigned int k) {\n    if (k - 2 == 0) {\n      return;\n    }\n\n    EdgeVec work[2];\n    EdgeVec *cur = &work[0], *next = &work[1];\n    size_t curSize, nextSize;\n\n    //! Symmetry breaking:\n    //! Consider only edges (i, j) where i < j.\n    galois::do_all(\n        galois::iterate(g),\n        [&g, cur](GNode n) {\n          for (auto e : g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n            auto dst = g.getEdgeDst(e);\n            if (dst > n) {\n              cur->push_back(std::make_pair(n, dst));\n            }\n          }\n        },\n        galois::steal());\n    curSize = std::distance(cur->begin(), cur->end());\n\n    //! Remove unsupported edges until no more edges can be removed.\n    while (true) {\n      galois::do_all(galois::iterate(*cur), KeepSupportedEdges{g, k - 2, *next},\n                     galois::steal());\n      nextSize = std::distance(next->begin(), next->end());\n\n      if (curSize == nextSize) {\n        //! Every edge in *cur is kept, done\n        break;\n      }\n\n      cur->clear();\n      curSize = nextSize;\n      std::swap(cur, next);\n    }\n  } ///< End operator()\n};  ///< End struct BSPTrussAlgo\n\n/**\n * BSPCoreAlgo:\n * 1. Keep nodes w/ degree >= k and remove all edges for nodes whose degree < k.\n * 2. If all nodes are kept, done.\n * 3. 
Go back to 1.\n */\nstruct BSPCoreAlgo {\n  std::string name() { return \"bspCore\"; }\n\n  struct KeepValidNodes {\n    Graph& g;\n    unsigned int j;\n    NodeVec& s;\n\n    KeepValidNodes(Graph& g, unsigned int j, NodeVec& s) : g(g), j(j), s(s) {}\n\n    void operator()(GNode n) {\n      if (isValidDegreeNoLessThanJ(g, n, j)) {\n        s.push_back(n);\n      } else {\n        for (auto e : g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n          auto dst                                     = g.getEdgeDst(e);\n          g.getEdgeData(g.findEdgeSortedByDst(n, dst)) = removed;\n          g.getEdgeData(g.findEdgeSortedByDst(dst, n)) = removed;\n        }\n      }\n    }\n  };\n\n  void operator()(Graph& g, unsigned int k) {\n    NodeVec work[2];\n    NodeVec *cur = &work[0], *next = &work[1];\n    size_t curSize = g.size(), nextSize;\n\n    galois::do_all(galois::iterate(g), KeepValidNodes{g, k, *next},\n                   galois::steal());\n    nextSize = std::distance(next->begin(), next->end());\n\n    while (curSize != nextSize) {\n      cur->clear();\n      curSize = nextSize;\n      std::swap(cur, next);\n\n      galois::do_all(galois::iterate(*cur), KeepValidNodes{g, k, *next},\n                     galois::steal());\n      nextSize = std::distance(next->begin(), next->end());\n    }\n  }\n}; ///< End BSPCoreAlgo.\n\n/**\n * BSPCoreThenTrussAlgo:\n * 1. Reduce the graph to k-1 core\n * 2. 
Compute k-truss from k-1 core\n */\nstruct BSPCoreThenTrussAlgo {\n  std::string name() { return \"bspCoreThenTruss\"; }\n\n  void operator()(Graph& g, unsigned int k) {\n    if (k - 2 == 0) {\n      return;\n    }\n\n    galois::StatTimer TCore(\"Reduce_to_(k-1)-core\");\n    TCore.start();\n\n    BSPCoreAlgo bspCore;\n    bspCore(g, k - 1);\n\n    TCore.stop();\n\n    galois::StatTimer TTruss(\"Reduce_to_k-truss\");\n    TTruss.start();\n\n    BSPTrussAlgo bspTrussIm;\n    bspTrussIm(g, k);\n\n    TTruss.stop();\n  } ///< End operator().\n};  ///< End struct BSPCoreThenTrussAlgo.\n\ntemplate <typename Algo>\nvoid run() {\n  Graph graph;\n  Algo algo;\n\n  std::cout << \"Reading from file: \" << inputFile << \"\\n\";\n  galois::graphs::readGraph(graph, inputFile, true);\n  std::cout << \"Read \" << graph.size() << \" nodes, \" << graph.sizeEdges()\n            << \" edges\\n\";\n  std::cout << \"Running \" << algo.name() << \" algorithm for maximal \"\n            << trussNum << \"-truss\\n\";\n\n  size_t approxEdgeData = 4 * (graph.size() + graph.sizeEdges());\n  galois::preAlloc(numThreads +\n                   4 * (approxEdgeData) / galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  initialize(graph);\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  algo(graph, trussNum);\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n  reportKTruss(graph);\n\n  uint64_t numEdges = 0;\n\n  for (auto n : graph) {\n    for (auto e : graph.edges(n, galois::MethodFlag::UNPROTECTED)) {\n      auto dst = graph.getEdgeDst(e);\n      if (n < dst && (graph.getEdgeData(e) & 0x1) != removed) {\n        numEdges++;\n      }\n    }\n  }\n\n  galois::gInfo(\"Number of edges left in truss is \", numEdges);\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if 
(!symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  if (2 > trussNum) {\n    std::cerr << \"trussNum >= 2\\n\";\n    return -1;\n  }\n\n  switch (algo) {\n  case bspJacobi:\n    run<BSPTrussJacobiAlgo>();\n    break;\n  case bsp:\n    run<BSPTrussAlgo>();\n    break;\n  case bspCoreThenTruss:\n    run<BSPCoreThenTrussAlgo>();\n    break;\n  default:\n    std::cerr << \"Unknown algorithm\\n\";\n    abort();\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/k-truss/README.md",
    "content": "K-Truss\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program finds the k-truss for some k value in a given undirect graph.\nA k-truss is the subgraph of a graph in which every edge in the subgraph\nis a part of at least k - 2 triangles.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric Galois .gr graphs.\nYou must specify the -symmetricGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/k-truss; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\nFind the 5 truss using 40 threads and the BSP algorithm.\n\n-`$ ./k-truss-cpu <path-symmetric-clean-graph> -algo bsp -trussNum=5 -t 40 -symmetricGraph`\n\nThe following outputs the edges of a 10 truss to a file using bspJacobi (edge\nremoval is separated).\n\n-`$ ./k-truss-cpu <path-symmetric-clean-graph> -algo bspJacobi -t 40 -trussNum=10 -o=10truss.out -symmetricGraph`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* The BSP variant (the default, -bsp) generally performs better in our experience.\n"
  },
  {
    "path": "lonestar/analytics/cpu/k-truss/Verify.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Timer.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"llvm/Support/CommandLine.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <iostream>\n#include <unordered_set>\n#include <algorithm>\n#include <fstream>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"verify_ktruss\";\nstatic const char* desc = \"Verify for maximal k-truss\";\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input graph>\"), cll::Required);\nstatic cll::opt<std::string> trussFile(\"trussFile\",\n                                       cll::desc(\"edgelist for the trusses\"),\n                                       
cll::Required);\nstatic cll::opt<unsigned int>\n    trussNum(\"trussNum\", cll::desc(\"verify for maximal trussNum-trusses\"),\n             cll::Required);\nstatic cll::opt<unsigned int>\n    ktrussNodes(\"trussNodes\", cll::desc(\"truss nodes for verification\"),\n                cll::init(0));\nstatic cll::opt<unsigned int>\n    ktrussEdges(\"trussEdges\", cll::desc(\"truss edges for verification\"),\n                cll::init(0)); // must be undirected edge count, i.e. counting\n                               // (n1, n2) and (n2, n1) as 1 edge\n\nstatic const uint32_t valid   = 0x0;\nstatic const uint32_t removed = 0x1;\n\n// edge weight: (# triangles supported << 1) | removal\n//   set LSB of an edge weight to indicate the removal of the edge.\n//   << 1 to track # triangles an edge supports,\n//   >> 1 when computing edge supports\ntypedef galois::graphs::LC_CSR_Graph<void, uint32_t>::template with_numa_alloc<\n    true>::type ::template with_no_lockable<true>::type Graph;\ntypedef Graph::GraphNode GNode;\n\ntypedef std::pair<GNode, GNode> Edge;\ntypedef galois::InsertBag<Edge> EdgeVec;\n\nvoid initialize(Graph& g) {\n  g.sortAllEdgesByDst();\n\n  // initializa all edges to removed\n  galois::do_all(\n      galois::iterate(g),\n      [&g](typename Graph::GraphNode N) {\n        for (auto e : g.edges(N, galois::MethodFlag::UNPROTECTED)) {\n          g.getEdgeData(e) = removed;\n        }\n      },\n      galois::steal());\n}\n\n// TODO: can we read in edges in parallel?\nvoid readTruss(Graph& g) {\n  std::ifstream edgelist(trussFile);\n  if (!edgelist.is_open()) {\n    std::string errMsg = \"Failed to open \" + trussFile;\n    GALOIS_DIE(errMsg);\n  }\n\n  unsigned int n1, n2;\n  unsigned int edges = 0;\n  std::unordered_set<unsigned int> nodes;\n  while (edgelist >> n1 >> n2) {\n    auto e = g.findEdgeSortedByDst(n1, n2);\n    if (valid == g.getEdgeData(e)) {\n      std::cout << \"ignoring duplicate edge\" << n1 << \", \" << n2 << \"\\n\";\n      continue;\n  
  }\n    g.getEdgeData(e) = valid;\n\n    e = g.findEdgeSortedByDst(n2, n1);\n    if (valid == g.getEdgeData(e)) {\n      std::cout << \"duplicate edge (rev) \" << n2 << \", \" << n1 << \"\\n\";\n      continue;\n    }\n    g.getEdgeData(e) = valid;\n\n    edges++;\n    nodes.insert(n1);\n    nodes.insert(n2);\n  }\n\n  std::cout << \"read \" << nodes.size() << \" unique nodes\\n\";\n  std::cout << \"read \" << edges << \" unique edges\\n\";\n\n  if (ktrussEdges && edges != ktrussEdges) {\n    std::cerr << \"edges read not equal to -trussEdges=\" << ktrussEdges << \"\\n\";\n    GALOIS_DIE(\"verification error\");\n  }\n\n  if (ktrussNodes && nodes.size() != ktrussNodes) {\n    std::cerr << \"nodes read not equal to -trussNodes=\" << ktrussNodes << \"\\n\";\n    GALOIS_DIE(\"verification error\");\n  }\n}\n\nvoid printGraph(Graph& g) {\n  for (auto n : g) {\n    std::cout << \"node \" << n << \"\\n\";\n    for (auto e : g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n      auto d = g.getEdgeDst(e);\n      if (d >= n)\n        continue;\n      std::cout << \"  edge to \" << d\n                << ((g.getEdgeData(e) & removed) ? 
\" removed\" : \"\") << \"\\n\";\n    }\n  }\n}\n\nstd::pair<size_t, size_t> countValidNodesAndEdges(Graph& g) {\n  galois::GAccumulator<size_t> numNodes, numEdges;\n\n  galois::do_all(\n      galois::iterate(g),\n      [&g, &numNodes, &numEdges](GNode n) {\n        size_t numN = 0;\n        for (auto e : g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n          if (!(g.getEdgeData(e) & removed)) {\n            if (g.getEdgeDst(e) > n) {\n              numEdges += 1;\n            }\n            numN = 1;\n          }\n        }\n        numNodes += numN;\n      },\n      galois::steal());\n\n  return std::make_pair(numNodes.reduce(), numEdges.reduce());\n}\n\nbool isSupportNoLessThanJ(Graph& g, GNode src, GNode dst, unsigned int j) {\n  size_t numValidEqual = 0;\n  auto srcI            = g.edge_begin(src, galois::MethodFlag::UNPROTECTED),\n       srcE            = g.edge_end(src, galois::MethodFlag::UNPROTECTED),\n       dstI            = g.edge_begin(dst, galois::MethodFlag::UNPROTECTED),\n       dstE            = g.edge_end(dst, galois::MethodFlag::UNPROTECTED);\n\n  while (true) {\n    // find the first valid edge\n    while (srcI != srcE && (g.getEdgeData(srcI) & removed)) {\n      ++srcI;\n    }\n    while (dstI != dstE && (g.getEdgeData(dstI) & removed)) {\n      ++dstI;\n    }\n\n    if (srcI == srcE || dstI == dstE) {\n      return numValidEqual >= j;\n    }\n\n    // check for intersection\n    auto sN = g.getEdgeDst(srcI), dN = g.getEdgeDst(dstI);\n    if (sN < dN) {\n      ++srcI;\n    } else if (dN < sN) {\n      ++dstI;\n    } else {\n      numValidEqual += 1;\n      if (numValidEqual >= j) {\n        return true;\n      }\n      ++srcI;\n      ++dstI;\n    }\n  }\n  return numValidEqual >= j;\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, nullptr, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"This 
application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  if (2 > trussNum) {\n    std::cerr << \"trussNum >= 2\\n\";\n    return -1;\n  }\n\n  std::cout << \"Verifying maximal \" << trussNum << \"-truss\\n\";\n  std::cout << \"Truss is computed for \" << inputFile << \" and stored in \"\n            << trussFile << \"\\n\";\n\n  Graph g;\n  EdgeVec work, shouldBeInvalid, shouldBeValid;\n\n  galois::graphs::readGraph(g, inputFile, true);\n  std::cout << \"Read \" << g.size() << \" nodes\\n\";\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n\n  initialize(g);\n  readTruss(g);\n  //  printGraph(g);\n\n  auto validNum = countValidNodesAndEdges(g);\n  std::cout << validNum.first << \" valid nodes\\n\";\n  std::cout << validNum.second << \" valid edges\\n\";\n\n  // every valid node should have at least trussNum-1 valid neighbors\n  // so # valid edges >= smallest # directed edges among valid nodes\n  assert((validNum.first * (trussNum - 1)) <= validNum.second * 2);\n\n  // symmetry breaking:\n  // consider only edges (i, j) where i < j\n  galois::do_all(\n      galois::iterate(g),\n      [&g, &work](GNode n) {\n        for (auto e : g.edges(n, galois::MethodFlag::UNPROTECTED)) {\n          auto dst = g.getEdgeDst(e);\n          if (dst > n) {\n            work.push_back(std::make_pair(n, dst));\n          }\n        }\n      },\n      galois::steal());\n\n  // pick out the following:\n  // 1. valid edges whose support < trussNum-2\n  // 2. 
removed edges whose support >= trussNum-2\n  galois::do_all(\n      galois::iterate(work),\n      [&g, &shouldBeInvalid, &shouldBeValid](Edge e) {\n        bool isSupportEnough =\n            isSupportNoLessThanJ(g, e.first, e.second, trussNum - 2);\n        bool isRemoved =\n            g.getEdgeData(g.findEdgeSortedByDst(e.first, e.second)) & 0x1;\n        if (!isRemoved && !isSupportEnough) {\n          shouldBeInvalid.push_back(e);\n        } else if (isRemoved && isSupportEnough) {\n          shouldBeValid.push_back(e);\n        }\n      },\n      galois::steal());\n\n  auto numShouldBeInvalid =\n      std::distance(shouldBeInvalid.begin(), shouldBeInvalid.end());\n  auto numShouldBeValid =\n      std::distance(shouldBeValid.begin(), shouldBeValid.end());\n  if (!numShouldBeInvalid && !numShouldBeValid) {\n    std::cout << \"Verification succeeded\\n\";\n  } else {\n    for (auto e : shouldBeInvalid) {\n      std::cerr << \"(\" << e.first << \", \" << e.second\n                << \") should be invalid\\n\";\n    }\n    for (auto e : shouldBeValid) {\n      std::cerr << \"(\" << e.first << \", \" << e.second << \") should be valid\\n\";\n    }\n    std::cerr << \"Verification failed!\\n\";\n    return 1;\n  }\n\n  execTime.start();\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/k-truss/bmktest2.py",
    "content": "import bmk2\nfrom bmkprops import graph_bmk, PERF_RE, get_ktruss_checker\nimport os\n\nclass KtrussGaloisBase(graph_bmk):\n    bmk = \"ktruss\"\n    algo = None\n\n    def filter_inputs(self, inputs):\n        def finput(x):\n            if not \"symmetric\" in x.props.flags: return False\n            if x.props.format == 'bin/galois': return True\n\n            return False\n\n        return filter(finput, inputs)\n\n    def get_run_spec(self, bmkinput):\n        x = bmk2.RunSpec(self, bmkinput)\n\n        k, ec = get_ktruss_checker(bmkinput, self.config['k'])\n        t = int(self.config['t'])\n\n        x.set_binary(self.props._cwd, 'k-truss')\n        x.set_arg(bmkinput.props.file, bmk2.AT_INPUT_FILE)\n        assert self.algo is not None\n        x.set_arg('-algo=%s' % (self.algo,), bmk2.AT_OPAQUE)\n        x.set_arg('-trussNum=%d' % (k,), bmk2.AT_OPAQUE)\n        x.set_arg(\"-t=%d\" % (t,), bmk2.AT_OPAQUE)\n        x.set_arg('-o=@output', bmk2.AT_TEMPORARY_OUTPUT)\n        x.set_checker(bmk2.ExternalChecker(ec))\n\n        x.set_perf(bmk2.PerfRE(r\"^\\(NULL\\),.*, Time,0,0,(?P<time_ms>[0-9]+)$\"))\n        return x\n\nclass KtrussGaloisBSP(KtrussGaloisBase):\n    variant = \"galois+bsp\"\n    algo = \"bsp\"\n\nclass KtrussGaloisBSPIm(KtrussGaloisBase):\n    variant = \"galois+bspIm\"\n    algo = \"bspIm\"\n\nclass KtrussGaloisBSPCoreThenTruss(KtrussGaloisBase):\n    variant = \"galois+bspCoreThenTruss\"\n    algo = \"bspCoreThenTruss\"\n\nclass KtrussGaloisAsync(KtrussGaloisBase):\n    variant = \"galois+async\"\n    algo = \"async\"\n\n        \nBINARIES = [KtrussGaloisBSP(),\n            KtrussGaloisBSPIm(),\n            KtrussGaloisBSPCoreThenTruss(),\n            KtrussGaloisAsync(),]\n"
  },
  {
    "path": "lonestar/analytics/cpu/matching/CMakeLists.txt",
    "content": "add_executable(maximum-cardinality-matching-cpu bipartite-mcm.cpp)\nadd_dependencies(apps maximum-cardinality-matching-cpu)\ntarget_link_libraries(maximum-cardinality-matching-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS maximum-cardinality-matching-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_scale(small1 maximum-cardinality-matching-cpu -symmetricGraph -inputType generated -n 100 -numEdges 1000 -numGroups 10 -seed 0)\nadd_test_scale(small2 maximum-cardinality-matching-cpu -symmetricGraph -inputType generated -n 100 -numEdges 10000 -numGroups 100 -seed 0)\n"
  },
  {
    "path": "lonestar/analytics/cpu/matching/README.md",
    "content": "Maximum Cardinality Matching\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program finds the maximum cardinality bipartite matching in a bipartite graph.\nIt uses the Alt-Blum-Melhorn-Paul Algorithm described at\nhttps://web.eecs.umich.edu/~pettie/matching/Alt-Blum-Mehlhorn-Paul-bipartite-matching-dense-graphs.pdf\nThis algoritm is also described in:\nK. Mehlhorn and S. Naeher. LEDA: A Platform for Combinatorial and Geometric Computing. Cambridge University Press, 1999\n\nAfter all the augmenting paths of a given length are found, the algorithm\nfinishes using the Ford-Fulkerson algorithm for matching.\n\nBy default, a randomly generated input is used, though input can be taken from\na file instead. In general, the parallelism available to this algorithm is\nheavily dependent on the characteristics of the input.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric Galois .gr graphs.\nYou must specify the -symmetricGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/matching && make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n - `./maximum-cardinality-matching-cpu -symmetricGraph -abmpAlgo -inputType=generated -numEdges=100000000 -numGroups=10000 -seed=0 -n=1000000 -t=40`\n - `./maximum-cardinality-matching-cpu -symmetricGraph -abmpAlgo -inputType=generated -numEdges=1000000000 -numGroups=2000000 -seed=0 -n=10000000 -t=40`\n"
  },
  {
    "path": "lonestar/analytics/cpu/matching/bipartite-mcm.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// TODO(ddn): Needs a graph implementation that supports reversing edges more\n// efficiently\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"llvm/Support/CommandLine.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <algorithm>\n#include <iostream>\n#include <random>\n#include <string>\n#include <vector>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Maximum cardinality matching in bipartite graphs\";\nstatic const char* desc =\n    \"Computes maximum cardinality matching in bipartite graphs. \"\n    \"A matching of G is a subset of edges that do not share an endpoint. 
\"\n    \"The maximum cardinality matching is the matching with the most number of \"\n    \"edges.\";\nstatic const char* url = \"bipartite_mcm\";\n\nenum MatchingAlgo { pfpAlgo, ffAlgo, abmpAlgo };\n\nenum ExecutionType { serial, parallel };\n\nenum InputType { generated, fromFile };\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Optional);\nstatic cll::opt<MatchingAlgo>\n    algo(cll::desc(\"Choose an algorithm:\"),\n         cll::values(clEnumVal(pfpAlgo, \"Preflow-push\"),\n                     clEnumVal(ffAlgo, \"Ford-Fulkerson augmenting paths\"),\n                     clEnumVal(abmpAlgo, \"Alt-Blum-Mehlhorn-Paul\")),\n         cll::init(abmpAlgo));\nstatic cll::opt<ExecutionType> executionType(\n    cll::desc(\"Choose execution type:\"),\n    cll::values(clEnumVal(serial, \"Serial\"), clEnumVal(parallel, \"Parallel\")),\n    cll::init(parallel));\nstatic cll::opt<InputType>\n    inputType(\"inputType\", cll::desc(\"Input type:\"),\n              cll::values(clEnumVal(generated, \"Generated\"),\n                          clEnumVal(fromFile, \"From file\")),\n              cll::init(fromFile));\nstatic cll::opt<int>\n    N(\"n\", cll::desc(\"Size of each set of nodes in generated input\"),\n      cll::init(100));\nstatic cll::opt<int> numEdges(\"numEdges\",\n                              cll::desc(\"Number of edges in generated input\"),\n                              cll::init(1000));\nstatic cll::opt<int> numGroups(\"numGroups\",\n                               cll::desc(\"Number of groups in generated input\"),\n                               cll::init(10));\nstatic cll::opt<int> seed(\"seed\", cll::desc(\"Random seed for generated input\"),\n                          cll::init(0));\nstatic cll::opt<bool> runIteratively(\n    \"runIteratively\",\n    cll::desc(\"After finding matching, removed matched edges and repeat\"),\n    cll::init(false));\n\n// TODO(ddn): switch to this graph for FF and ABMP algos 
when we fix reading\n// graphs\ntemplate <typename NodeTy, typename EdgeTy>\nstruct BipartiteGraph : public galois::graphs::LC_Morph_Graph<NodeTy, EdgeTy> {\n  typedef galois::graphs::LC_Morph_Graph<NodeTy, EdgeTy> Super;\n  typedef std::vector<typename Super::GraphNode> NodeList;\n\n  NodeList A;\n  NodeList B;\n};\n\ntemplate <typename NodeTy, typename EdgeTy>\nstruct MFBipartiteGraph\n    : public galois::graphs::MorphGraph<NodeTy, EdgeTy, true> {\n  typedef galois::graphs::MorphGraph<NodeTy, EdgeTy, true> Super;\n  typedef std::vector<typename Super::GraphNode> NodeList;\n\n  NodeList A;\n  NodeList B;\n};\n\n//******************************** Common ************************\n\ntemplate <typename G, template <typename, bool> class Algo>\nstruct Exists {\n  bool operator()(G&, const typename G::edge_iterator&) { return true; }\n};\n\ntemplate <typename G>\nstruct GraphTypes {\n  typedef typename G::GraphNode GraphNode;\n  typedef std::pair<GraphNode, GraphNode> Edge;\n  typedef std::vector<Edge> Matching;\n};\n\nstruct BaseNode {\n  size_t id;\n  int degree;\n  bool covered;\n  bool free;\n  bool reachable; // for preparing node cover\n  BaseNode(size_t i = -1)\n      : id(i), degree(0), covered(false), free(true), reachable(false) {}\n  void reset() {\n    degree    = 0;\n    covered   = false;\n    free      = true;\n    reachable = false;\n  }\n};\n\ntemplate <typename G>\nstruct MarkReachable {\n  typedef typename G::GraphNode GraphNode;\n  typedef typename G::edge_iterator edge_iterator;\n\n  void operator()(G& g, const GraphNode& root) {\n    std::deque<GraphNode> queue;\n    queue.push_back(root);\n\n    while (!queue.empty()) {\n      GraphNode cur = queue.front();\n      queue.pop_front();\n      if (g.getData(cur).reachable)\n        continue;\n      g.getData(cur).reachable = true;\n      for (auto ii : g.edges(cur)) {\n        GraphNode dst = g.getEdgeDst(ii);\n        queue.push_back(dst);\n      }\n    }\n  }\n};\n\ntemplate <typename G, template 
<typename, bool> class Algo>\nstruct PrepareForVerifier {\n  typedef typename GraphTypes<G>::Edge Edge;\n  typedef typename GraphTypes<G>::Matching Matching;\n  typedef typename G::GraphNode GraphNode;\n  typedef typename G::NodeList NodeList;\n  typedef typename G::node_data_type node_data_type;\n  typedef typename G::edge_iterator edge_iterator;\n\n  void operator()(G& g, Matching* matching) {\n    Exists<G, Algo> exists;\n\n    for (auto src : g.B) {\n      for (auto ii : g.edges(src)) {\n        GraphNode dst = g.getEdgeDst(ii);\n        if (exists(g, ii)) {\n          matching->push_back(Edge(src, dst));\n        }\n      }\n    }\n\n    for (typename NodeList::iterator ii = g.A.begin(), ei = g.A.end(); ii != ei;\n         ++ii) {\n      if (g.getData(*ii).free)\n        MarkReachable<G>()(g, *ii);\n    }\n\n    for (typename Matching::iterator ii = matching->begin(),\n                                     ei = matching->end();\n         ii != ei; ++ii) {\n      if (g.getData(ii->first).reachable) {\n        // Reachable from a free node in A\n        g.getData(ii->first).covered = true;\n      } else {\n        g.getData(ii->second).covered = true;\n      }\n    }\n  }\n};\n\n//********************** FF Algorithm **************************\n\nstruct FFNode : public BaseNode {\n  int pred;\n  bool reached;\n  FFNode(size_t i = -1) : BaseNode(i), pred(-1), reached(false) {}\n  void reset() {\n    BaseNode::reset();\n    reached = false;\n    pred    = -1;\n  }\n};\n\n//! 
Switch between concurrent and serial instances\ntemplate <typename T1, typename T2, bool B>\nstruct InstanceWrapper;\ntemplate <typename T1, typename T2>\nstruct InstanceWrapper<T1, T2, true> {\n  T1& m_t1;\n  T2& m_t2;\n  typedef T2 Type;\n  InstanceWrapper(T1& t1, T2& t2) : m_t1(t1), m_t2(t2) {}\n  T2& get() { return m_t2; }\n};\ntemplate <typename T1, typename T2>\nstruct InstanceWrapper<T1, T2, false> {\n  T1& m_t1;\n  T2& m_t2;\n  typedef T1 Type;\n  InstanceWrapper(T1& t1, T2& t2) : m_t1(t1), m_t2(t2) {}\n  T1& get() { return m_t1; }\n};\n\n//! Switch between concurrent and serial types\ntemplate <typename T1, typename T2, bool B>\nstruct TypeWrapper;\ntemplate <typename T1, typename T2>\nstruct TypeWrapper<T1, T2, true> {\n  typedef T2 Type;\n};\ntemplate <typename T1, typename T2>\nstruct TypeWrapper<T1, T2, false> {\n  typedef T1 Type;\n};\n\n//! Matching algorithm of Ford and Fulkerson\ntemplate <typename G, bool Concurrent>\nstruct MatchingFF {\n  typedef typename G::GraphNode GraphNode;\n  typedef typename G::NodeList NodeList;\n  typedef typename G::node_data_type node_data_type;\n  typedef typename G::edge_iterator edge_iterator;\n  typedef typename GraphTypes<G>::Edge Edge;\n\n  typedef std::vector<Edge> SerialRevs;\n  typedef std::vector<GraphNode> SerialReached;\n\n  typedef std::vector<Edge,\n                      typename galois::PerIterAllocTy::rebind<Edge>::other>\n      ParallelRevs;\n  typedef std::vector<GraphNode,\n                      typename galois::PerIterAllocTy::rebind<GraphNode>::other>\n      ParallelReached;\n\n  typedef InstanceWrapper<SerialRevs, ParallelRevs, Concurrent> RevsWrapper;\n  typedef InstanceWrapper<SerialReached, ParallelReached, Concurrent>\n      ReachedWrapper;\n\n  typedef std::deque<GraphNode,\n                     typename galois::PerIterAllocTy::rebind<GraphNode>::other>\n      Queue;\n  typedef std::vector<GraphNode,\n                      typename galois::PerIterAllocTy::rebind<GraphNode>::other>\n      
Preds;\n\n  static const galois::MethodFlag flag =\n      Concurrent ? galois::MethodFlag::WRITE : galois::MethodFlag::UNPROTECTED;\n\n  static const bool canRunIteratively = true;\n\n  std::string name() {\n    return std::string(Concurrent ? \"Concurrent\" : \"Serial\") +\n           \" Ford-Fulkerson\";\n  }\n\n  template <typename C>\n  bool findAugmentingPath(G& g, const GraphNode& root, C& ctx,\n                          typename RevsWrapper::Type& revs,\n                          typename ReachedWrapper::Type& reached) {\n    Queue queue(ctx.getPerIterAlloc());\n    Preds preds(ctx.getPerIterAlloc());\n\n    // Order matters between (1) and (2)\n    g.getData(root, flag).reached = true; // (1)\n    reached.push_back(root);              // (2)\n\n    queue.push_back(root);\n\n    while (!queue.empty()) {\n      GraphNode src = queue.front();\n      queue.pop_front();\n\n      for (auto ii : g.edges(src, flag)) {\n        GraphNode dst        = g.getEdgeDst(ii);\n        node_data_type& ddst = g.getData(dst, galois::MethodFlag::UNPROTECTED);\n        if (ddst.reached)\n          continue;\n\n        ddst.reached = true;\n        reached.push_back(dst);\n\n        ddst.pred = preds.size();\n        preds.push_back(src);\n\n        if (ddst.free) {\n          // Fail-safe point modulo ``reached'' which is handled separately\n          ddst.free     = false;\n          GraphNode cur = dst;\n          while (cur != root) {\n            GraphNode pred =\n                preds[g.getData(cur, galois::MethodFlag::UNPROTECTED).pred];\n            revs.push_back(Edge(pred, cur));\n            cur = pred;\n          }\n          return true;\n        } else {\n          assert(std::distance(g.edge_begin(dst), g.edge_end(dst)) == 1);\n          for (auto jj : g.edges(dst, flag)) {\n            GraphNode cur = g.getEdgeDst(jj);\n\n            g.getData(cur, galois::MethodFlag::UNPROTECTED).pred = preds.size();\n            preds.push_back(dst);\n\n            
g.getData(cur, galois::MethodFlag::UNPROTECTED).reached = true;\n            reached.push_back(cur);\n\n            queue.push_back(cur);\n          }\n        }\n      }\n    }\n    return false;\n  }\n\n  //! Makes sure that ``reached'' to properly reset even if we get aborted\n  struct ReachedCleanup {\n    G& g;\n    typename ReachedWrapper::Type& reached;\n\n    ReachedCleanup(G& g, typename ReachedWrapper::Type& r) : g(g), reached(r) {}\n\n    ~ReachedCleanup() { cleanup(); }\n\n    virtual void release() { cleanup(); }\n\n    void cleanup() {\n      // In non-concurrent case, we can continue reusing reached\n      if (Concurrent)\n        clear();\n    }\n\n    void clear() {\n      for (typename ReachedWrapper::Type::iterator ii = reached.begin(),\n                                                   ei = reached.end();\n           ii != ei; ++ii) {\n        assert(g.getData(*ii, galois::MethodFlag::UNPROTECTED).reached);\n        g.getData(*ii, galois::MethodFlag::UNPROTECTED).reached = false;\n      }\n      reached.clear();\n    }\n  };\n\n  template <typename C>\n  void propagate(G& g, const GraphNode& src, C& ctx,\n                 typename RevsWrapper::Type& revs,\n                 typename ReachedWrapper::Type& reached) {\n\n    ReachedCleanup cleanup(g, reached);\n\n    if (findAugmentingPath(g, src, ctx, revs, reached)) {\n      g.getData(src, galois::MethodFlag::UNPROTECTED).free = false;\n\n      // Reverse edges in augmenting path\n      for (typename RevsWrapper::Type::iterator jj = revs.begin(),\n                                                ej = revs.end();\n           jj != ej; ++jj) {\n        auto edge =\n            g.findEdge(jj->first, jj->second, galois::MethodFlag::UNPROTECTED);\n        assert(edge != g.edge_end(jj->first));\n        g.removeEdge(jj->first, edge, galois::MethodFlag::UNPROTECTED);\n        g.addEdge(jj->second, jj->first, galois::MethodFlag::UNPROTECTED);\n      }\n      revs.clear();\n\n      cleanup.clear();\n    
}\n  }\n\n  void operator()(G& g) {\n    SerialRevs revs;\n    SerialReached reached;\n\n    galois::setActiveThreads(Concurrent ? numThreads : 1);\n\n    galois::for_each(\n        galois::iterate(g.A),\n        [&, this](const GraphNode& node, auto& ctx) {\n          if (!g.getData(node, flag).free)\n            return;\n\n          ParallelRevs parallelRevs(ctx.getPerIterAlloc());\n          ParallelReached parallelReached(ctx.getPerIterAlloc());\n\n          this->propagate(g, node, ctx, RevsWrapper(revs, parallelRevs).get(),\n                          ReachedWrapper(reached, parallelReached).get());\n        },\n        galois::loopname(\"MatchingFF\"), galois::per_iter_alloc(),\n        galois::wl<galois::worklists::PerSocketChunkFIFO<32>>());\n  }\n};\n\n//********************** ABMP Algorithm **************************\n\nstruct ABMPNode : public FFNode {\n  unsigned layer;\n  int next;\n  ABMPNode(size_t i = -1) : FFNode(i), layer(0), next(0) {}\n  void reset() {\n    FFNode::reset();\n    layer = 0;\n    next  = 0;\n  }\n};\n\n//! Matching algorithm of Alt, Blum, Mehlhorn and Paul\ntemplate <typename G, bool Concurrent>\nstruct MatchingABMP {\n  typedef typename G::NodeList NodeList;\n  typedef typename G::GraphNode GraphNode;\n  typedef typename G::edge_iterator edge_iterator;\n  typedef typename G::node_data_type node_data_type;\n  typedef typename GraphTypes<G>::Edge Edge;\n  typedef std::vector<Edge,\n                      typename galois::PerIterAllocTy::rebind<Edge>::other>\n      Revs;\n  typedef std::pair<GraphNode, unsigned> WorkItem;\n\n  static const galois::MethodFlag flag =\n      Concurrent ? galois::MethodFlag::WRITE : galois::MethodFlag::UNPROTECTED;\n\n  static const bool canRunIteratively = true;\n\n  std::string name() {\n    return std::string(Concurrent ? 
\"Concurrent\" : \"Serial\") +\n           \" Alt-Blum-Mehlhorn-Paul\";\n  }\n\n  bool nextEdge(G& g, const GraphNode& src, GraphNode& next) {\n    node_data_type& dsrc = g.getData(src, galois::MethodFlag::UNPROTECTED);\n    unsigned l           = dsrc.layer - 1;\n\n    // Start search where we last left off\n    edge_iterator ii = g.edge_begin(src, flag);\n    edge_iterator ei = g.edge_end(src, flag);\n    assert(dsrc.next <= std::distance(ii, ei));\n    std::advance(ii, dsrc.next);\n    for (; ii != ei &&\n           g.getData(g.getEdgeDst(ii), galois::MethodFlag::UNPROTECTED).layer !=\n               l;\n         ++ii, ++dsrc.next) {\n      ;\n    }\n\n    if (ii == ei) {\n      return false;\n    } else {\n      next = g.getEdgeDst(ii);\n      return true;\n    }\n  }\n\n  //! Returns true if we've added a new element\n  // TODO: better name here\n  template <typename C>\n  bool propagate(G& g, const GraphNode& root, C& ctx) {\n    Revs revs(ctx.getPerIterAlloc());\n\n    GraphNode cur = root;\n\n    g.getData(root, flag);\n\n    while (true) {\n      GraphNode next;\n      if (g.getData(cur, galois::MethodFlag::UNPROTECTED).free &&\n          g.getData(cur, galois::MethodFlag::UNPROTECTED).layer == 0) {\n        assert(g.getData(root, galois::MethodFlag::UNPROTECTED).free);\n        // (1) Breakthrough\n        g.getData(cur, galois::MethodFlag::UNPROTECTED).free =\n            g.getData(root, galois::MethodFlag::UNPROTECTED).free = false;\n\n        // Reverse edges in augmenting path\n        for (typename Revs::iterator ii = revs.begin(), ei = revs.end();\n             ii != ei; ++ii) {\n          auto edge = g.findEdge(ii->first, ii->second,\n                                 galois::MethodFlag::UNPROTECTED);\n          assert(edge != g.edge_end(ii->first));\n          g.removeEdge(ii->first, edge, galois::MethodFlag::UNPROTECTED);\n          g.addEdge(ii->second, ii->first, galois::MethodFlag::UNPROTECTED);\n        }\n        // revs.clear();\n        if 
(revs.size() > 1024) {\n          std::cout << \"WARNING: allocating large amounts in parallel: \"\n                    << revs.size() << \"elements\\n\";\n        }\n        return false;\n      } else if (nextEdge(g, cur, next)) {\n        // (2) Advance\n        revs.push_back(Edge(cur, next));\n        cur = next;\n      } else {\n        // (3) Retreat\n        unsigned& layer = g.getData(cur, galois::MethodFlag::UNPROTECTED).layer;\n        layer += 2;\n        g.getData(cur, galois::MethodFlag::UNPROTECTED).next = 0;\n        if (revs.empty()) {\n          ctx.push(std::make_pair(cur, layer));\n          return true;\n        }\n        cur = revs.back().first;\n        revs.pop_back();\n      }\n    }\n  }\n\n  void operator()(G& g) {\n    galois::StatTimer t(\"serial\");\n    t.start();\n    std::vector<WorkItem> initial;\n    for (typename NodeList::iterator ii = g.A.begin(), ei = g.A.end(); ii != ei;\n         ++ii) {\n      g.getData(*ii).layer = 1;\n      if (g.getData(*ii).free)\n        initial.push_back(std::make_pair(*ii, 1));\n    }\n    t.stop();\n\n    unsigned maxLayer =\n        (unsigned)(0.1 * sqrt(std::distance(g.begin(), g.end())));\n    // size_t size = initial.size();\n    galois::setActiveThreads(Concurrent ? 
numThreads : 1);\n\n    using namespace galois::worklists;\n\n    auto indexer = [](const WorkItem& n) { return n.second; };\n\n    typedef PerSocketChunkFIFO<1024> PSchunk;\n    typedef OrderedByIntegerMetric<decltype(indexer), PSchunk> OBIM;\n\n    galois::for_each(\n        galois::iterate(initial),\n        [&, this](const WorkItem& item, auto& ctx) {\n          unsigned curLayer = item.second;\n          if (curLayer > maxLayer) {\n            // std::cout << \"Reached max layer: \" << curLayer <<\n            // \"\\n\";\n            ctx.breakLoop();\n            return;\n          }\n          // if (size <= 50 * curLayer) {\n          //  std::cout << \"Reached min size: \" << size << \"\\n\";\n          //  ctx.breakLoop();\n          //}\n          if (!this->propagate(g, item.first, ctx)) {\n            //__sync_fetch_and_add(&size, -1);\n          }\n        },\n        galois::per_iter_alloc(), galois::parallel_break(),\n        galois::loopname(\"MatchingABMP\"), galois::wl<OBIM>(indexer));\n\n    t.start();\n    MatchingFF<G, false> algo;\n    // std::cout << \"Switching to \" << algo.name() << \"\\n\";\n    algo(g);\n    t.stop();\n  }\n};\n\n// *************************** MaxFlow Algorithm *******************************\nstruct MFNode : public BaseNode {\n  size_t excess;\n  unsigned height;\n  int current;\n  MFNode(size_t i = -1) : BaseNode(i), excess(0), height(1), current(0) {}\n  void reset() {\n    BaseNode::reset();\n    excess  = 0;\n    height  = 1;\n    current = 0;\n  }\n};\n\nstruct MFEdge {\n  int cap;\n  MFEdge() : cap(1) {}\n  MFEdge(int c) : cap(c) {}\n};\n\n//! 
Matching via reduction to maxflow\ntemplate <typename G, bool Concurrent>\nstruct MatchingMF {\n  typedef typename G::NodeList NodeList;\n  typedef typename G::GraphNode GraphNode;\n  typedef typename G::edge_iterator edge_iterator;\n  typedef typename G::iterator iterator;\n  typedef typename G::node_data_type node_data_type;\n  typedef typename G::edge_data_type edge_data_type;\n  static const galois::MethodFlag flag =\n      Concurrent ? galois::MethodFlag::WRITE : galois::MethodFlag::UNPROTECTED;\n  static const bool canRunIteratively = false;\n\n  /**\n   * Beta parameter the original Goldberg algorithm to control when global\n   * relabeling occurs. For comparison purposes, we keep them the same as\n   * before, but it is possible to achieve much better performance by adjusting\n   * the global relabel frequency.\n   */\n  static const int BETA = 12;\n  /**\n   * Alpha parameter the original Goldberg algorithm to control when global\n   * relabeling occurs. For comparison purposes, we keep them the same as\n   * before, but it is possible to achieve much better performance by adjusting\n   * the global relabel frequency.\n   */\n  static const int ALPHA = 6;\n\n  std::string name() {\n    return std::string(Concurrent ? \"Concurrent\" : \"Serial\") + \" Max Flow\";\n  }\n\n  void reduceCapacity(edge_data_type& edge1, edge_data_type& edge2,\n                      int amount) {\n    edge1.cap -= amount;\n    edge2.cap += amount;\n  }\n\n  template <typename C>\n  bool discharge(G& g, const GraphNode& src, C& ctx, const GraphNode& source,\n                 const GraphNode& sink, unsigned numNodes) {\n    node_data_type& node = g.getData(src, flag);\n    // unsigned prevHeight = node.height;\n    bool relabeled = false;\n\n    if (node.excess == 0) {\n      return false;\n    }\n\n    while (true) {\n      galois::MethodFlag f = relabeled ? 
galois::MethodFlag::UNPROTECTED : flag;\n      bool finished        = false;\n      int current          = -1;\n\n      for (auto ii : g.edges(src, f)) {\n        ++current;\n        GraphNode dst        = g.getEdgeDst(ii);\n        edge_data_type& edge = g.getEdgeData(ii);\n        if (edge.cap == 0 || current < node.current)\n          continue;\n\n        node_data_type& dnode = g.getData(dst, galois::MethodFlag::UNPROTECTED);\n        if (node.height - 1 != dnode.height)\n          continue;\n\n        // Push flow\n        int amount = std::min(static_cast<int>(node.excess), edge.cap);\n        reduceCapacity(edge,\n                       g.getEdgeData(g.findEdge(\n                           dst, src, galois::MethodFlag::UNPROTECTED)),\n                       amount);\n\n        // Only add once\n        if (dst != sink && dst != source && dnode.excess == 0)\n          ctx.push(dst);\n\n        node.excess -= amount;\n        dnode.excess += amount;\n\n        if (node.excess == 0) {\n          finished     = true;\n          node.current = current;\n          break;\n        }\n      }\n\n      if (finished)\n        break;\n\n      relabel(g, src, numNodes);\n      relabeled = true;\n\n      // prevHeight = node.height;\n    }\n\n    return relabeled;\n  }\n\n  void relabel(G& g, const GraphNode& src, unsigned int) {\n    unsigned minHeight = std::numeric_limits<unsigned>::max();\n    int minEdge        = 0; // TODO: not sure of initial value\n\n    int current = -1;\n    for (auto ii : g.edges(src, galois::MethodFlag::UNPROTECTED)) {\n      ++current;\n      GraphNode dst = g.getEdgeDst(ii);\n      int cap       = g.getEdgeData(ii).cap;\n      if (cap > 0) {\n        node_data_type& dnode = g.getData(dst, galois::MethodFlag::UNPROTECTED);\n        if (dnode.height < minHeight) {\n          minHeight = dnode.height;\n          minEdge   = current;\n        }\n      }\n    }\n\n    assert(minHeight != std::numeric_limits<unsigned>::max());\n    
++minHeight;\n\n    node_data_type& node = g.getData(src, galois::MethodFlag::UNPROTECTED);\n    node.height          = minHeight;\n    node.current         = minEdge;\n  }\n\n  void globalRelabel(G& g, const GraphNode& source, const GraphNode& sink,\n                     unsigned numNodes, std::vector<GraphNode>& incoming) {\n\n    for (iterator ii = g.begin(), ei = g.end(); ii != ei; ++ii) {\n      GraphNode src        = *ii;\n      node_data_type& node = g.getData(src, galois::MethodFlag::UNPROTECTED);\n      node.height          = numNodes;\n      node.current         = 0;\n      if (src == sink)\n        node.height = 0;\n    }\n\n    constexpr bool useCAS = false;\n\n    galois::StatTimer T(\"BfsTime\");\n    T.start();\n    galois::for_each(\n        galois::iterate({sink}),\n        [&](const GraphNode& src, auto& ctx) {\n          for (auto ii :\n               g.edges(src, useCAS ? galois::MethodFlag::UNPROTECTED : flag)) {\n            GraphNode dst = g.getEdgeDst(ii);\n            if (g.getEdgeData(\n                     g.findEdge(dst, src, galois::MethodFlag::UNPROTECTED))\n                    .cap > 0) {\n              node_data_type& node =\n                  g.getData(dst, galois::MethodFlag::UNPROTECTED);\n              unsigned newHeight =\n                  g.getData(src, galois::MethodFlag::UNPROTECTED).height + 1;\n              if (useCAS) {\n                unsigned oldHeight = 0;\n                while (newHeight < (oldHeight = node.height)) {\n                  if (__sync_bool_compare_and_swap(&node.height, oldHeight,\n                                                   newHeight)) {\n                    ctx.push(dst);\n                    break;\n                  }\n                }\n              } else {\n                if (newHeight < node.height) {\n                  node.height = newHeight;\n                  ctx.push(dst);\n                }\n              }\n            }\n          }\n        },\n        
galois::wl<galois::worklists::PerSocketChunkFIFO<32>>());\n    T.stop();\n\n    for (iterator ii = g.begin(), ei = g.end(); ii != ei; ++ii) {\n      GraphNode src        = *ii;\n      node_data_type& node = g.getData(src, galois::MethodFlag::UNPROTECTED);\n      if (src == sink || src == source)\n        continue;\n      if (node.excess > 0)\n        incoming.push_back(src);\n    }\n  }\n\n  void initializePreflow(G& g, const GraphNode& source,\n                         std::vector<GraphNode>& initial) {\n    for (auto ii : g.edges(source)) {\n      GraphNode dst        = g.getEdgeDst(ii);\n      edge_data_type& edge = g.getEdgeData(ii);\n      int cap              = edge.cap;\n      if (cap > 0)\n        initial.push_back(dst);\n      reduceCapacity(edge, g.getEdgeData(g.findEdge(dst, source)), cap);\n      g.getData(dst).excess += cap;\n    }\n  }\n\n  //! Adds reverse edges\n  void initializeGraph(G& g, GraphNode& source, GraphNode& sink,\n                       unsigned& numNodes, unsigned& globalRelabelInterval) {\n    size_t numEdges = 0;\n\n    numNodes                 = std::distance(g.begin(), g.end());\n    source                   = g.createNode(node_data_type(numNodes++));\n    sink                     = g.createNode(node_data_type(numNodes++));\n    g.getData(source).height = numNodes;\n    g.addNode(source);\n    g.addNode(sink);\n\n    // Add reverse edge\n    for (auto src : g.A) {\n      for (auto ii : g.edges(src)) {\n        GraphNode dst = g.getEdgeDst(ii);\n        g.getEdgeData(g.addMultiEdge(dst, src, galois::MethodFlag::WRITE)) =\n            edge_data_type(0);\n        ++numEdges;\n      }\n    }\n\n    // Add edge from source to each node in A\n    for (typename NodeList::iterator src = g.A.begin(), esrc = g.A.end();\n         src != esrc; ++src) {\n      g.getEdgeData(g.addMultiEdge(source, *src, galois::MethodFlag::WRITE)) =\n          edge_data_type();\n      g.getEdgeData(g.addMultiEdge(*src, source, galois::MethodFlag::WRITE)) =\n     
     edge_data_type(0);\n      ++numEdges;\n    }\n\n    // Add edge to sink from each node in B\n    for (typename NodeList::iterator src = g.B.begin(), esrc = g.B.end();\n         src != esrc; ++src) {\n      g.getEdgeData(g.addMultiEdge(*src, sink, galois::MethodFlag::WRITE)) =\n          edge_data_type();\n      g.getEdgeData(g.addMultiEdge(sink, *src, galois::MethodFlag::WRITE)) =\n          edge_data_type(0);\n      ++numEdges;\n    }\n\n    globalRelabelInterval = numNodes * ALPHA + numEdges;\n  }\n\n  //! Extract matching from saturated edges\n  void extractMatching(G& g) {\n    for (auto src : g.A) {\n      for (auto ii : g.edges(src)) {\n        GraphNode dst = g.getEdgeDst(ii);\n        if (g.getEdgeData(ii).cap == 0) {\n          g.getData(src).free = g.getData(dst).free = false;\n        }\n      }\n    }\n  }\n\n  void operator()(G& g) {\n    galois::StatTimer t(\"serial\");\n\n    t.start();\n    GraphNode source;\n    GraphNode sink;\n    unsigned numNodes;\n    unsigned globalRelabelInterval;\n    initializeGraph(g, source, sink, numNodes, globalRelabelInterval);\n\n    std::vector<GraphNode> initial;\n    initializePreflow(g, source, initial);\n    t.stop();\n\n    bool shouldGlobalRelabel = false;\n    unsigned counter         = 0;\n    galois::setActiveThreads(Concurrent ? 
numThreads : 1);\n\n    while (!initial.empty()) {\n      galois::for_each(\n          galois::iterate(initial),\n          [&, this](const GraphNode& src, auto& ctx) {\n            int increment = 1;\n            if (this->discharge(g, src, ctx, source, sink, numNodes)) {\n              increment += BETA;\n            }\n\n            counter += increment;\n            if (globalRelabelInterval && counter >= globalRelabelInterval) {\n              shouldGlobalRelabel = true;\n              ctx.breakLoop();\n              return;\n            }\n          },\n          galois::loopname(\"MatchingMF\"), galois::parallel_break(),\n          galois::wl<galois::worklists::PerSocketChunkFIFO<32>>());\n\n      if (!shouldGlobalRelabel)\n        break;\n\n      t.start();\n      std::cout << \"Starting global relabel, current excess at sink \"\n                << g.getData(sink).excess << \"\\n\";\n      initial.clear();\n      globalRelabel(g, source, sink, numNodes, initial);\n      shouldGlobalRelabel = false;\n      t.stop();\n    }\n\n    t.start();\n    std::cout << \"Final excess at sink \" << g.getData(sink).excess << \"\\n\";\n    g.removeNode(sink);\n    g.removeNode(source);\n    extractMatching(g);\n    t.stop();\n  }\n};\n\ntemplate <typename G>\nstruct Exists<G, MatchingMF> {\n  typedef typename G::edge_iterator edge_iterator;\n\n  bool operator()(G& g, const edge_iterator& ii) {\n    // assert(g.getEdgeData(src, dst).cap + g.getEdgeData(dst, src).cap == 1);\n    // assert(g.getEdgeData(src, dst).cap != g.getEdgeData(dst, src).cap);\n    return g.getEdgeData(ii).cap == 1;\n  }\n};\n\n// ******************* Verification ***************************\n\ntemplate <typename G>\nstruct Verifier {\n  typedef typename G::GraphNode GraphNode;\n  typedef typename G::node_data_type node_data_type;\n  typedef typename G::edge_iterator edge_iterator;\n  typedef typename G::NodeList NodeList;\n  typedef typename GraphTypes<G>::Matching Matching;\n\n  bool 
hasCoveredNeighbors(G& g, const GraphNode& src) {\n    for (auto ii : g.edges(src)) {\n      GraphNode dst = g.getEdgeDst(ii);\n      if (!g.getData(dst).covered)\n        return false;\n    }\n    return true;\n  }\n\n  void check(G& g, typename NodeList::iterator ii,\n             typename NodeList::iterator ei, size_t& count, bool& retval) {\n    for (; ii != ei; ++ii) {\n      node_data_type& dii = g.getData(*ii);\n      if (dii.degree > 1) {\n        std::cerr << \"Error: not a matching, node \" << dii.id << \" incident to \"\n                  << dii.degree << \" edges\\n\";\n        retval = false;\n      }\n\n      if (dii.covered) {\n        count++;\n      }\n\n      if (dii.covered || hasCoveredNeighbors(g, *ii)) {\n        // Good\n      } else {\n        std::cerr << \"Error: not a node cover, node \" << dii.id\n                  << \" with degree \" << dii.degree\n                  << \" not covered nor incident to covered node\\n\";\n        retval = false;\n      }\n    }\n  }\n\n  bool operator()(G& g, const Matching& matching) {\n    for (typename Matching::const_iterator ii = matching.begin(),\n                                           ei = matching.end();\n         ii != ei; ++ii) {\n      g.getData(ii->first).degree++;\n      g.getData(ii->second).degree++;\n    }\n\n    bool retval  = true;\n    size_t count = 0;\n    check(g, g.A.begin(), g.A.end(), count, retval);\n    check(g, g.B.begin(), g.B.end(), count, retval);\n\n    if (count != matching.size()) {\n      std::cerr << \"Error: matching is different than node cover \"\n                << matching.size() << \" vs \" << count << \"\\n\";\n      retval = false;\n    }\n\n    return retval;\n  }\n};\n\n/**\n * Generate a random bipartite graph as used in LEDA evaluation and\n * refererenced in [CGM+97]. Nodes are divided into numGroups groups of size\n * numA/numGroups each. 
Each node in A has degree d = numEdges/numA and the\n * edges out of a node in group i of A go to random nodes in groups i+1 and\n * i-1  of B. If numGroups == 0, just randomly assign nodes of A to nodes of\n * B.\n */\ntemplate <typename G>\nvoid generateRandomInput(int numA, int numB, int numEdges, int numGroups,\n                         int seed, G& g) {\n  typedef typename G::edge_data_type edge_data_type;\n\n  std::cout << \"numGroups: \" << numGroups << \" seed: \" << seed << \"\\n\";\n\n  galois::graphs::FileGraphWriter p;\n  p.setNumNodes(numA + numB);\n  p.setNumEdges<edge_data_type>(numEdges);\n\n  for (int phase = 0; phase < 2; ++phase) {\n    if (phase == 0)\n      p.phase1();\n    else\n      p.phase2();\n\n    std::mt19937 gen(seed);\n    std::uniform_int_distribution<> dist(0, 1);\n\n    assert(numA > 0 && numB > 0);\n\n    int d = numEdges / numA;\n    if (numGroups > numA)\n      numGroups = numA;\n    if (numGroups > numB)\n      numGroups = numB;\n\n    int count = 0;\n    if (numGroups > 0) {\n      int aSize = numA / numGroups;\n      int bSize = numB / numGroups;\n\n      for (int ii = 0; ii < numA; ++ii, ++count) {\n        int group = count / aSize;\n        if (group == numGroups)\n          break;\n        int base1 = group == 0 ? (numGroups - 1) * bSize : (group - 1) * bSize;\n        int base2 = group == numGroups - 1 ? 0 : (group + 1) * bSize;\n        for (int i = 0; i < d; ++i) {\n          int b   = dist(gen) < 0.5 ? 
base1 : base2;\n          int off = (int)(dist(gen) * (bSize - 1));\n          if (phase == 0)\n            p.incrementDegree(ii);\n          else\n            p.addNeighbor(ii, b + off + numA);\n        }\n      }\n    }\n\n    int r = numEdges - count * d;\n    while (r--) {\n      int ind_a = (int)(dist(gen) * (numA - 1));\n      int ind_b = (int)(dist(gen) * (numB - 1));\n      if (phase == 0)\n        p.incrementDegree(ind_a);\n      else\n        p.addNeighbor(ind_a, ind_b + numA);\n    }\n  }\n\n  // Leave edge data uninitialized\n  p.finish<edge_data_type>();\n  galois::graphs::readGraph(g, p);\n}\n\n/**\n * Read bipartite graph from file.\n *\n * Assumes\n *  (1) nodes in set A have edges while nodes in set B don't\n *  (2) nodes in set A are the first numA nodes (followed by nodes in set B)\n */\ntemplate <typename G>\nvoid readInput(const std::string& filename, G& g) {\n  galois::graphs::readGraph(g, filename);\n}\n\ntemplate <template <typename, bool> class Algo, typename G>\nsize_t countMatching(G& g) {\n  Exists<G, Algo> exists;\n  size_t count = 0;\n  for (auto n : g.B) {\n    for (auto edge : g.out_edges(n)) {\n      if (exists(g, edge)) {\n        count += 1;\n      }\n    }\n  }\n  return count;\n}\n\ntemplate <template <typename, bool> class Algo, typename G>\nvoid removeMatchedEdges(G& g) {\n  Exists<G, Algo> exists;\n  for (auto n : g.B) {\n    assert(std::distance(g.edge_begin(n), g.edge_end(n)) <= 1);\n    for (auto edge : g.out_edges(n)) {\n      if (exists(g, edge)) {\n        g.removeEdge(n, edge);\n        break;\n      }\n    }\n  }\n}\n\ntemplate <template <typename, bool> class Algo, typename G, bool Concurrent>\nvoid start(int N, int numEdges, int numGroups) {\n  typedef Algo<G, Concurrent> A;\n\n  A algo;\n  G g;\n\n  if (runIteratively && !algo.canRunIteratively)\n    GALOIS_DIE(\"algo does not support iterative execution\");\n\n  switch (inputType) {\n  case generated:\n    generateRandomInput(N, N, numEdges, numGroups, seed, g);\n 
   break;\n  case fromFile:\n    readInput(inputFile, g);\n    break;\n  default:\n    GALOIS_DIE(\"unknown input type\");\n  }\n\n  size_t id = 0;\n  for (auto n : g) {\n    g.getData(n).id = id++;\n    if (g.edge_begin(n) != g.edge_end(n))\n      g.A.push_back(n);\n    else\n      g.B.push_back(n);\n  }\n\n  std::cout << \"numA: \" << g.A.size() << \" numB: \" << g.B.size() << \"\\n\";\n\n  std::cout << \"Starting \" << algo.name() << \"\\n\";\n\n  galois::StatTimer execTime(\"Timer_0\");\n\n  while (true) {\n    execTime.start();\n    algo(g);\n    execTime.stop();\n\n    if (!skipVerify) {\n      typename GraphTypes<G>::Matching matching;\n      PrepareForVerifier<G, Algo>()(g, &matching);\n      if (!Verifier<G>()(g, matching)) {\n        GALOIS_DIE(\"verification failed\");\n      } else {\n        std::cout << \"Verification successful.\\n\";\n      }\n    }\n\n    size_t matchingSize = countMatching<Algo>(g);\n    std::cout << \"Matching of cardinality: \" << matchingSize << \"\\n\";\n\n    if (!runIteratively || matchingSize == 0)\n      break;\n\n    removeMatchedEdges<Algo>(g);\n    for (auto n : g)\n      g.getData(n).reset();\n  }\n}\n\ntemplate <bool Concurrent>\nvoid start() {\n  switch (algo) {\n  case abmpAlgo:\n    start<MatchingABMP, MFBipartiteGraph<ABMPNode, void>, Concurrent>(\n        N, numEdges, numGroups);\n    break;\n  case pfpAlgo:\n    start<MatchingMF, MFBipartiteGraph<MFNode, MFEdge>, Concurrent>(N, numEdges,\n                                                                    numGroups);\n    break;\n  case ffAlgo:\n    start<MatchingFF, MFBipartiteGraph<FFNode, void>, Concurrent>(N, numEdges,\n                                                                  numGroups);\n    break;\n  default:\n    GALOIS_DIE(\"unknown algo\");\n  }\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  
if (!symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  switch (executionType) {\n  case serial:\n    start<false>();\n    break;\n  case parallel:\n    start<true>();\n    break;\n  default:\n    GALOIS_DIE(\"unknown execution type\");\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/matrixcompletion/CMakeLists.txt",
    "content": "add_executable(matrixcompletion-cpu matrixCompletion.cpp)\nadd_dependencies(apps matrixcompletion-cpu)\ntarget_link_libraries(matrixcompletion-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS matrixcompletion-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\n\nif(CMAKE_COMPILER_IS_GNUCC)\n  target_compile_options(matrixcompletion-cpu PRIVATE -ffast-math)\nendif()\n\nfind_package(Eigen3 CONFIG)\nif(Eigen_FOUND)\n  target_link_libraries(matrixcompletion-cpu Eigen3::Eigen)\n  target_compile_definitions(matrixcompletion-cpu PRIVATE -DHAS_EIGEN -DEIGEN_DONT_PARALLELIZE)\nendif()\n\nif (Eigen_FOUND)\n  add_test_scale(small-sync matrixcompletion-cpu -algo=syncALS -lambda=0.001 -learningRate=0.01 -learningRateFunction=intel -tolerance=0.01 -useSameLatentVector -useDetInit \"${BASEINPUT}/weighted/bipartite/Epinions_dataset.gr\")\n\n  add_test_scale(small-simple matrixcompletion-cpu -algo=simpleALS -lambda=0.001 -learningRate=0.01 -learningRateFunction=intel -tolerance=0.01 -useSameLatentVector -useDetInit \"${BASEINPUT}/weighted/bipartite/Epinions_dataset.gr\")\nendif()\n\nadd_test_scale(small-edge matrixcompletion-cpu -algo=sgdBlockEdge -lambda=0.001 -learningRate=0.01 -learningRateFunction=intel -tolerance=0.01 -useSameLatentVector -useDetInit \"${BASEINPUT}/weighted/bipartite/Epinions_dataset.gr\")\n\nadd_test_scale(small-jump matrixcompletion-cpu -algo=sgdBlockJump -lambda=0.001 -learningRate=0.01 -learningRateFunction=intel -tolerance=0.01 -useSameLatentVector -useDetInit \"${BASEINPUT}/weighted/bipartite/Epinions_dataset.gr\")\n\nadd_test_scale(small-byitems matrixcompletion-cpu -algo=sgdByItems -lambda=0.001 -learningRate=0.01 -learningRateFunction=intel -tolerance=0.01 -useSameLatentVector -useDetInit \"${BASEINPUT}/weighted/bipartite/Epinions_dataset.gr\")\n\nadd_test_scale(small-byedges matrixcompletion-cpu -algo=sgdByEdges -lambda=0.001 -learningRate=0.01 -learningRateFunction=intel -tolerance=0.01 
-useSameLatentVector -useDetInit \"${BASEINPUT}/weighted/bipartite/Epinions_dataset.gr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/matrixcompletion/README.md",
    "content": "Matrix Completion\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nThis program performs the matrix completion using different stochastic gradient\ndescent (SGD) and alternating least squares (ALS) algorithms on a bipartite graph.\nWe have implemeted 4 SGD based algorithms and 2 ALS based algorithms.\n\nSGD algorithms:\n  1. sgdByItems\n  2. sgdByEdges\n  3. sgdBlockEdge\n  4. sgdBlockJump\n\nALS algorithms:\n  1. SimpleALS\n  2. SyncALS\n\nINPUT\n--------------------------------------------------------------------------------\n\nAll versions expect a bipartite graph in gr format.\nNOTE: The bipartite must have all the nodes with out-going edges in the beginning,\nfollowed by all the nodes without any out-going edges. For example, a bipartite\ngraph with out-going edges from users to movies, where each edge is a rating\ngiven by a user for a movie, the graph layout must have all the user nodes \ntogether in the beginning followed by all the movie nodes.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/cpu/matrixcompletion; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n`$./matrixcompletion-cpu <path-to-graph> -algo=sgdBlockJump  -lambda=0.001 -learningRate=0.01 -learningRateFunction=intel -tolerance=0.0001 -t 40 -updatesPerEdge=1 -maxUpdates=20`\n\nTo list all the options including the names of the algorithms (-algo):\n`$./matrixcompletion-cpu --help`\n\nIn our experience, out of all the SGD algorithms on the Netflix graph\n(#nodes: 497959, #edges: 99072112), sgdBlockEdge gives the best performance \nand out of ALS algorithms SyncALS performs the best.\n\nTUNING PERFORMANCE\n--------------------------------------------------------------------------------\n\nPerformance of different algorithmic variants is input dependent. \nThe values for '-lambda', '-learningRateFunction', and '-learningRate' need \nto be tuned for each input graph. If the root mean square error (RMSE) is 'nan', try \ndifferent values for 'lambda', 'learningRateFunction', and 'learningRate'.\n"
  },
  {
    "path": "lonestar/analytics/cpu/matrixcompletion/bipartite-gen.py",
    "content": "import random\nimport optparse\nimport collections\nimport sys\n\ndef main(num_users, num_movies, num_edges, options):\n\trandom.seed(1000)\n\tnum_nodes = num_users + num_movies\n\tadj = collections.defaultdict(set)\n\t\n\t#print('p sp %d %d' % (num_nodes, num_edges))\n\n\tuser_set = set(xrange(1, num_users+1))\n\n\tdef randUser():\n\t\tx = random.randint(num_movies+1, num_movies + num_users)\n\t\treturn x\t\n\tdef randMovie():\n\t\tx =random.randint(1, num_movies)\n\t\treturn x\t\n\tdef randRating():\n\t\treturn random.randint(1, 5)\n\tdef addEdge(src, dst, w):\n\t\tif dst in adj[src]:\n\t\t\treturn False\n\t\tprint('a %d %d %d' % (src, dst, w))\n\t\tadj[src].add(dst)\n\t\treturn True\n\t\n\tedges_emitted = num_movies\n\tfor movie in xrange(1, num_movies+1):\n\t\tuser = randUser()\n\t\taddEdge(movie, user, randRating())\n\t\tuser_set.discard(user)\n\t\n\tedges_emitted = edges_emitted + len(user_set)\n\tfor user in user_set:\n\t\twhile not addEdge(randMovie(), num_movies + user, randRating()):\n\t\t\tpass\n\n\tfor i in xrange(num_edges - edges_emitted):\n\t\twhile not addEdge(randMovie(), randUser(), randRating()):\n\t\t\tpass\n\nif __name__ == '__main__':\n\tusage = 'usage: %prog <num users> <num movies> <num edges>'\n\tparser = optparse.OptionParser(usage=usage)\n\t(options, args) = parser.parse_args()\n\tif len(args) != 3:\n\t\tparser.error('missing arguments')\n\tmain(int(args[0]), int(args[1]), int(args[2]), options)\n"
  },
  {
    "path": "lonestar/analytics/cpu/matrixcompletion/commandLineParam.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"llvm/Support/CommandLine.h\"\n/**\n * Common commandline parameters to for matrix completion algorithms\n */\nnamespace cll = llvm::cl;\nstatic cll::opt<std::string>\n    inputFilename(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\n\n// (Purdue, Neflix): 0.012, (Purdue, Yahoo Music): 0.00075, (Purdue, HugeWiki):\n// 0.001 Intel: 0.001 Bottou: 0.1\nstatic cll::opt<float> learningRate(\"learningRate\",\n                                    cll::desc(\"learning rate parameter [alpha] \"\n                                              \"for Bold, Bottou, Intel and \"\n                                              \"Purdue step size function\"),\n                                    cll::init(0.012));\n\n// (Purdue, Netflix): 0.015, (Purdue, Yahoo Music): 0.01,\n// (Purdue, HugeWiki): 0.0, Intel: 
0.9\nstatic cll::opt<float> decayRate(\"decayRate\",\n                                 cll::desc(\"decay rate parameter [beta] for \"\n                                           \"Intel and Purdue step size \"\n                                           \"function\"),\n                                 cll::init(0.015));\n// (Purdue, Netflix): 0.05, (Purdue, Yahoo Music): 1.0, (Purdue, HugeWiki): 0.01\n// Intel: 0.001\nstatic cll::opt<float> lambda(\"lambda\",\n                              cll::desc(\"regularization parameter [lambda]\"),\n                              cll::init(0.05));\n\nstatic cll::opt<unsigned> usersPerBlock(\"usersPerBlock\",\n                                        cll::desc(\"users per block\"),\n                                        cll::init(2048));\nstatic cll::opt<unsigned> itemsPerBlock(\"itemsPerBlock\",\n                                        cll::desc(\"items per block\"),\n                                        cll::init(350));\nstatic cll::opt<float>\n    tolerance(\"tolerance\", cll::desc(\"convergence tolerance\"), cll::init(0.01));\n\nstatic cll::opt<bool> useSameLatentVector(\"useSameLatentVector\",\n                                          cll::desc(\"initialize all nodes to \"\n                                                    \"use same latent vector\"),\n                                          cll::init(false));\n\n// Regarding algorithm termination\nstatic cll::opt<unsigned> maxUpdates(\"maxUpdates\",\n                                     cll::desc(\"Max number of times to update \"\n                                               \"latent vectors (default 100)\"),\n                                     cll::init(100));\n"
  },
  {
    "path": "lonestar/analytics/cpu/matrixcompletion/matrixCompletion.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"matrixCompletion.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"galois/graphs/Graph.h\"\n#include \"galois/runtime/TiledExecutor.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <cmath>\n#include <fstream>\n#include <iostream>\n#include <ostream>\n\n#ifdef HAS_EIGEN\n#include <Eigen/Sparse>\n#include <Eigen/Dense>\n#endif\n\n#ifdef _OPENMP\n#include <omp.h>\n#endif\n\nstatic const char* const name = \"Matrix Completion\";\nstatic const char* const desc =\n    \"Computes Matrix Decomposition using Stochastic \"\n    \"Gradient Descent or Alternating Least Squares\";\n\nenum Algo {\n  syncALS,\n  simpleALS,\n  sgdByItems,\n  sgdByEdges,\n  sgdBlockEdge,\n  sgdBlockJump,\n};\n\nenum Step { bold, bottou, intel, inverse, purdue };\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, 
cll::desc(\"<input file>\"), cll::Required);\n\n/*\n * Commandline options for different Algorithms\n */\nstatic cll::opt<Algo>\n    algo(\"algo\", cll::desc(\"Choose an algorithm:\"),\n         cll::values(\n             clEnumValN(Algo::syncALS, \"syncALS\", \"Alternating least squares\"),\n             clEnumValN(Algo::simpleALS, \"simpleALS\",\n                        \"Simple alternating least squares\"),\n             clEnumValN(Algo::sgdBlockEdge, \"sgdBlockEdge\",\n                        \"SGD Edge blocking (default)\"),\n             clEnumValN(Algo::sgdBlockJump, \"sgdBlockJump\",\n                        \"SGD using Block jumping \"),\n             clEnumValN(Algo::sgdByItems, \"sgdByItems\", \"Simple SGD on Items\"),\n             clEnumValN(Algo::sgdByEdges, \"sgdByEdges\", \"Simple SGD on edges\")),\n         cll::init(Algo::sgdBlockEdge));\n/*\n * Commandline options for different learning functions\n */\nstatic cll::opt<Step> learningRateFunction(\n    \"learningRateFunction\", cll::desc(\"Choose learning rate function:\"),\n    cll::values(clEnumValN(Step::intel, \"intel\", \"Intel\"),\n                clEnumValN(Step::purdue, \"purdue\", \"Purdue\"),\n                clEnumValN(Step::bottou, \"bottou\", \"Bottou\"),\n                clEnumValN(Step::bold, \"bold\", \"Bold (default)\"),\n                clEnumValN(Step::inverse, \"inverse\", \"Inverse\")),\n    cll::init(Step::bold));\n\nstatic cll::opt<int> cutoff(\"cutoff\");\n\n#ifdef HAS_EIGEN\nstatic const unsigned ALS_CHUNK_SIZE = 4;\n#endif\n\nsize_t NUM_ITEM_NODES = 0;\n\nstruct PurdueStepFunction : public StepFunction {\n  virtual std::string name() const { return \"Purdue\"; }\n  virtual LatentValue stepSize(int round) const {\n    return learningRate * 1.5 / (1.0 + decayRate * pow(round + 1, 1.5));\n  }\n};\n\nstruct IntelStepFunction : public StepFunction {\n  virtual std::string name() const { return \"Intel\"; }\n  virtual LatentValue stepSize(int round) const {\n    return 
learningRate * pow(decayRate, round);\n  }\n};\n\nstruct BottouStepFunction : public StepFunction {\n  virtual std::string name() const { return \"Bottou\"; }\n  virtual LatentValue stepSize(int round) const {\n    return learningRate / (1.0 + learningRate * lambda * round);\n  }\n};\n\nstruct InverseStepFunction : public StepFunction {\n  virtual std::string name() const { return \"Inverse\"; }\n  virtual LatentValue stepSize(int round) const { return 1.0 / (round + 1); }\n};\n\nstruct BoldStepFunction : public StepFunction {\n  virtual std::string name() const { return \"Bold\"; }\n  virtual bool isBold() const { return true; }\n  virtual LatentValue stepSize(int) const { return 0.0; }\n};\n\ntemplate <typename Graph>\ndouble sumSquaredError(Graph& g) {\n  typedef typename Graph::GraphNode GNode;\n  // computing Root Mean Square Error\n  // Assuming only item nodes have edges\n  galois::GAccumulator<double> error;\n\n  galois::do_all(\n      galois::iterate(g.begin(), g.begin() + NUM_ITEM_NODES), [&](GNode n) {\n        for (auto ii = g.edge_begin(n), ei = g.edge_end(n); ii != ei; ++ii) {\n          GNode dst = g.getEdgeDst(ii);\n          LatentValue e =\n              predictionError(g.getData(n).latentVector,\n                              g.getData(dst).latentVector, g.getEdgeData(ii));\n          error += (e * e);\n        }\n      });\n  return error.reduce();\n}\n\ntemplate <typename Graph>\nsize_t countEdges(Graph& g) {\n  typedef typename Graph::GraphNode GNode;\n  galois::GAccumulator<size_t> edges;\n  galois::runtime::Fixed2DGraphTiledExecutor<Graph> executor(g);\n  std::cout << \"NUM_ITEM_NODES : \" << NUM_ITEM_NODES << \"\\n\";\n  executor.execute(\n      g.begin(), g.begin() + NUM_ITEM_NODES, g.begin() + NUM_ITEM_NODES,\n      g.end(), itemsPerBlock, usersPerBlock,\n      [&](GNode, GNode, typename Graph::edge_iterator) { edges += 1; },\n      false); // false = no locks\n  return edges.reduce();\n}\n\ntemplate <typename Graph>\nvoid verify(Graph& 
g, const std::string& prefix) {\n  std::cout << countEdges(g) << \" : \" << g.sizeEdges() << \"\\n\";\n  if (countEdges(g) != g.sizeEdges()) {\n    GALOIS_DIE(\"edge list of input graph probably not sorted\");\n  }\n\n  double error = sumSquaredError(g);\n  double rmse  = std::sqrt(error / g.sizeEdges());\n\n  std::cout << prefix << \"RMSE: \" << rmse << \"\\n\";\n}\n\ntemplate <typename T, unsigned Size>\nstruct ExplicitFiniteChecker {};\n\ntemplate <typename T>\nstruct ExplicitFiniteChecker<T, 4U> {\n  static_assert(std::numeric_limits<T>::is_iec559, \"Need IEEE floating point\");\n  bool isFinite(T v) {\n    union {\n      T value;\n      uint32_t bits;\n    } a = {v};\n    if (a.bits == 0x7F800000) {\n      return false; // +inf\n    } else if (a.bits == 0xFF800000) {\n      return false; // -inf\n    } else if (a.bits >= 0x7F800001 && a.bits <= 0x7FBFFFFF) {\n      return false; // signaling NaN\n    } else if (a.bits >= 0xFF800001 && a.bits <= 0xFFBFFFFF) {\n      return false; // signaling NaN\n    } else if (a.bits >= 0x7FC00000 && a.bits <= 0x7FFFFFFF) {\n      return false; // quiet NaN\n    } else if (a.bits >= 0xFFC00000 && a.bits <= 0xFFFFFFFF) {\n      return false; // quiet NaN\n    }\n    return true;\n  }\n};\n\ntemplate <typename T>\nstruct ExplicitFiniteChecker<T, 8U> {\n  static_assert(std::numeric_limits<T>::is_iec559, \"Need IEEE floating point\");\n  bool isFinite(T v) {\n    union {\n      T value;\n      uint64_t bits;\n    } a = {v};\n    if (a.bits == 0x7FF0000000000000) {\n      return false; // +inf\n    } else if (a.bits == 0xFFF0000000000000) {\n      return false; // -inf\n    } else if (a.bits >= 0x7FF0000000000001 && a.bits <= 0x7FF7FFFFFFFFFFFF) {\n      return false; // signaling NaN\n    } else if (a.bits >= 0xFFF0000000000001 && a.bits <= 0xFFF7FFFFFFFFFFFF) {\n      return false; // signaling NaN\n    } else if (a.bits >= 0x7FF8000000000000 && a.bits <= 0x7FFFFFFFFFFFFFFF) {\n      return false; // quiet NaN\n    } else if 
(a.bits >= 0xFFF8000000000000 && a.bits <= 0xFFFFFFFFFFFFFFFF) {\n      return false; // quiet NaN\n    }\n    return true;\n  }\n};\n\ntemplate <typename T>\nbool isFinite(T v) {\n#ifdef __FAST_MATH__\n  return ExplicitFiniteChecker<T, sizeof(T)>().isFinite(v);\n#else\n  return std::isfinite(v);\n#endif\n}\n\ndouble countFlops(size_t nnz, int rounds, int k) {\n  double flop = 0;\n  if (useExactError) {\n    // dotProduct = 2K, square = 1, sum = 1\n    flop += nnz * (2.0 * k + 1 + 1);\n  } else {\n    // Computed during gradient update: square = 1, sum = 1\n    flop += nnz * (1 + 1);\n  }\n  // dotProduct = 2K, gradient = 10K,\n  flop += rounds * (nnz * (12.0 * k));\n  return flop;\n}\n\n/*\n * Common function to execute different algorithms\n * till convergence.\n *\n * @param StepFunction to be used\n * @param Graph\n * @param fn (algorithm)\n *\n */\ntemplate <typename Graph, typename Fn>\nvoid executeUntilConverged(const StepFunction& sf, Graph& g, Fn fn) {\n  galois::GAccumulator<double> errorAccum;\n  std::vector<LatentValue> steps(updatesPerEdge);\n  LatentValue last    = -1.0;\n  unsigned deltaRound = updatesPerEdge;\n  LatentValue rate    = learningRate;\n\n  galois::StatTimer executeAlgoTimer(\"Algorithm Execution Time\");\n  galois::TimeAccumulator elapsed;\n  elapsed.start();\n\n  unsigned long lastTime = 0;\n\n  for (unsigned int round = 0;; round += deltaRound) {\n    if (fixedRounds > 0 && round >= fixedRounds)\n      break;\n    if (fixedRounds > 0)\n      deltaRound = std::min(deltaRound, fixedRounds - round);\n\n    for (unsigned i = 0; i < updatesPerEdge; ++i) {\n      // Assume that loss decreases\n      if (sf.isBold())\n        steps[i] = i == 0 ? rate : steps[i - 1] * 1.05;\n      else\n        steps[i] = sf.stepSize(round + i);\n    }\n\n    executeAlgoTimer.start();\n    fn(&steps[0], round + deltaRound, useExactError ? &errorAccum : NULL);\n    executeAlgoTimer.stop();\n    double error = useExactError ? 
errorAccum.reduce() : sumSquaredError(g);\n\n    elapsed.stop();\n\n    unsigned long curElapsed = elapsed.get();\n    elapsed.start();\n    unsigned long millis = curElapsed - lastTime;\n    lastTime             = curElapsed;\n\n    double gflops = countFlops(g.sizeEdges(), deltaRound, LATENT_VECTOR_SIZE) /\n                    millis / 1e6;\n\n    int curRound = round + deltaRound;\n    galois::gPrint(\"R: \", curRound, \" elapsed (ms): \", curElapsed,\n                   \" GFLOP/s: \", gflops);\n    if (useExactError) {\n      galois::gPrint(\" RMSE (R \", curRound,\n                     \"): \", std::sqrt(error / g.sizeEdges()), \"\\n\");\n    } else {\n      galois::gPrint(\" Approx. RMSE (R \", (curRound - 1),\n                     \".5): \", std::sqrt(std::abs(error / g.sizeEdges())), \"\\n\");\n    }\n\n    galois::gPrint(\"Error Change : \", std::abs((last - error) / last), \"\\n\");\n    if (!isFinite(error))\n      break;\n    if (fixedRounds <= 0 &&\n        (round >= maxUpdates || std::abs((last - error) / last) < tolerance))\n      break;\n    if (sf.isBold()) {\n      // Assume that loss decreases first round\n      if (last >= 0.0 && last < error)\n        rate = steps[deltaRound - 1] * 0.5;\n      else\n        rate = steps[deltaRound - 1] * 1.05;\n    }\n    last = error;\n  }\n}\n\n/*\n * Divides the Items and users into 2D blocks.\n * Locks each block to work on it.\n */\nstruct SGDBlockJumpAlgo {\n  bool isSgd() const { return true; }\n  typedef galois::substrate::PaddedLock<true> SpinLock;\n  static const bool precomputeOffsets = true; // false;\n\n  std::string name() const { return \"sgdBlockJumpAlgo\"; }\n\n  struct Node {\n    LatentValue latentVector[LATENT_VECTOR_SIZE];\n  };\n\n  typedef galois::graphs::LC_CSR_Graph<Node, EdgeType>\n      //    ::with_numa_alloc<true>::type\n      ::with_no_lockable<true>::type Graph;\n  typedef Graph::GraphNode GNode;\n\n  void readGraph(Graph& g) { galois::graphs::readGraph(g, inputFile); }\n\n  
size_t userIdToUserNode(size_t userId) { return userId + NUM_ITEM_NODES; }\n\n  struct BlockInfo {\n    size_t id;\n    size_t x;\n    size_t y;\n    size_t userStart;\n    size_t userEnd;\n    size_t itemStart;\n    size_t itemEnd;\n    size_t numitems;\n    size_t updates;\n    double error;\n    int* userOffsets;\n\n    std::ostream& print(std::ostream& os) {\n      os << \"id: \" << id << \" x: \" << x << \" y: \" << y\n         << \" userStart: \" << userStart << \" userEnd: \" << userEnd\n         << \" itemStart: \" << itemStart << \" itemEnd: \" << itemEnd\n         << \" updates: \" << updates << \"\\n\";\n      return os;\n    }\n\n    ~BlockInfo() { delete[] userOffsets; }\n  };\n\n  struct Process {\n    Graph& g;\n    SpinLock *xLocks, *yLocks;\n    BlockInfo* blocks;\n    size_t numXBlocks, numYBlocks;\n    const LatentValue* steps;\n    size_t maxUpdates;\n    galois::GAccumulator<double>* errorAccum;\n\n    struct GetDst {\n      Graph* g;\n      GetDst() {}\n      GetDst(Graph* _g) : g(_g) {}\n      GNode operator()(Graph::edge_iterator ii) const {\n        return g->getEdgeDst(ii);\n      }\n    };\n\n    /**\n     * Preconditions: row and column of slice are locked.\n     *\n     * Postconditions: increments update count, does sgd update on each item\n     * and user in the slice\n     */\n    template <bool Enable = precomputeOffsets>\n    size_t runBlock(BlockInfo& si,\n                    typename std::enable_if<!Enable>::type* = 0) {\n      if (si.updates >= maxUpdates)\n        return 0;\n      typedef galois::NoDerefIterator<Graph::edge_iterator> no_deref_iterator;\n      typedef boost::transform_iterator<GetDst, no_deref_iterator>\n          edge_dst_iterator;\n\n      LatentValue stepSize = steps[si.updates - maxUpdates + updatesPerEdge];\n      size_t seen          = 0;\n      double error         = 0.0;\n\n      // Set up item iterators\n      size_t itemId      = 0;\n      Graph::iterator mm = g.begin(), em = g.begin();\n      
std::advance(mm, si.itemStart);\n      std::advance(em, si.itemEnd);\n\n      GetDst fn{&g};\n\n      // For each item in the range\n      for (; mm != em; ++mm, ++itemId) {\n        GNode item      = *mm;\n        Node& itemData  = g.getData(item);\n        size_t lastUser = si.userEnd + NUM_ITEM_NODES;\n\n        edge_dst_iterator start(no_deref_iterator(g.edge_begin(\n                                    item, galois::MethodFlag::UNPROTECTED)),\n                                fn);\n        edge_dst_iterator end(no_deref_iterator(g.edge_end(\n                                  item, galois::MethodFlag::UNPROTECTED)),\n                              fn);\n\n        // For each edge in the range\n        for (auto ii =\n                 std::lower_bound(start, end, si.userStart + NUM_ITEM_NODES);\n             ii != end; ++ii) {\n          GNode user = g.getEdgeDst(*ii.base());\n\n          if (user >= lastUser)\n            break;\n\n          LatentValue e = doGradientUpdate(itemData.latentVector,\n                                           g.getData(user).latentVector, lambda,\n                                           g.getEdgeData(*ii.base()), stepSize);\n          if (errorAccum)\n            error += e * e;\n          ++seen;\n        }\n      }\n\n      si.updates += 1;\n      if (errorAccum) {\n        *errorAccum += (error - si.error);\n        si.error = error;\n      }\n\n      return seen;\n    }\n\n    template <bool Enable = precomputeOffsets>\n    size_t runBlock(BlockInfo& si, typename std::enable_if<Enable>::type* = 0) {\n      if (si.updates >= maxUpdates)\n        return 0;\n      LatentValue stepSize = steps[si.updates - maxUpdates + updatesPerEdge];\n      size_t seen          = 0;\n      double error         = 0.0;\n\n      // Set up item iterators\n      size_t itemId      = 0;\n      Graph::iterator mm = g.begin(), em = g.begin();\n      std::advance(mm, si.itemStart);\n      std::advance(em, si.itemEnd);\n\n      // For each item in the 
range\n      for (; mm != em; ++mm, ++itemId) {\n        if (si.userOffsets[itemId] < 0)\n          continue;\n\n        GNode item      = *mm;\n        Node& itemData  = g.getData(item);\n        size_t lastUser = si.userEnd + NUM_ITEM_NODES;\n\n        // For each edge in the range\n        for (auto ii = g.edge_begin(item) + si.userOffsets[itemId],\n                  ei = g.edge_end(item);\n             ii != ei; ++ii) {\n          GNode user = g.getEdgeDst(ii);\n\n          if (user >= lastUser)\n            break;\n\n          LatentValue e = doGradientUpdate(itemData.latentVector,\n                                           g.getData(user).latentVector, lambda,\n                                           g.getEdgeData(ii), stepSize);\n          if (errorAccum)\n            error += e * e;\n          ++seen;\n        }\n      }\n\n      si.updates += 1;\n      if (errorAccum) {\n        *errorAccum += (error - si.error);\n        si.error = error;\n      }\n\n      return seen;\n    }\n\n    /**\n     * Searches next slice to work on.\n     *\n     * @returns slice id to work on, x and y locks are held on the slice\n     */\n    size_t getNextBlock(BlockInfo* sp) {\n      size_t numBlocks   = numXBlocks * numYBlocks;\n      size_t nextBlockId = sp->id + 1;\n      for (size_t i = 0; i < 2 * numBlocks; ++i, ++nextBlockId) {\n        // Wrap around\n        if (nextBlockId == numBlocks)\n          nextBlockId = 0;\n\n        BlockInfo& nextBlock = blocks[nextBlockId];\n\n        if (nextBlock.updates < maxUpdates && xLocks[nextBlock.x].try_lock()) {\n          if (yLocks[nextBlock.y].try_lock()) {\n            // Return while holding locks\n            return nextBlockId;\n          } else {\n            xLocks[nextBlock.x].unlock();\n          }\n        }\n      }\n\n      return numBlocks;\n    }\n\n    void operator()(unsigned tid, unsigned total) {\n      galois::StatTimer timer(\"PerThreadTime\");\n      // TODO: Report Accumulators at the end\n      
galois::GAccumulator<size_t> edgesVisited;\n      galois::GAccumulator<size_t> blocksVisited;\n      size_t numBlocks = numXBlocks * numYBlocks;\n      size_t xBlock    = (numXBlocks + total - 1) / total;\n      size_t xStart    = std::min(xBlock * tid, numXBlocks - 1);\n      size_t yBlock    = (numYBlocks + total - 1) / total;\n      size_t yStart    = std::min(yBlock * tid, numYBlocks - 1);\n      BlockInfo* sp    = &blocks[xStart + yStart + numXBlocks];\n\n      timer.start();\n\n      while (true) {\n        sp = &blocks[getNextBlock(sp)];\n        if (sp == &blocks[numBlocks])\n          break;\n        blocksVisited += 1;\n        edgesVisited += runBlock(*sp);\n\n        xLocks[sp->x].unlock();\n        yLocks[sp->y].unlock();\n      }\n\n      timer.stop();\n    }\n  };\n\n  void operator()(Graph& g, const StepFunction& sf) {\n    galois::StatTimer preProcessTimer(\"PreProcessingTime\");\n    preProcessTimer.start();\n    const size_t numUsers = g.size() - NUM_ITEM_NODES;\n    const size_t numYBlocks =\n        (NUM_ITEM_NODES + itemsPerBlock - 1) / itemsPerBlock;\n    const size_t numXBlocks = (numUsers + usersPerBlock - 1) / usersPerBlock;\n    const size_t numBlocks  = numXBlocks * numYBlocks;\n\n    SpinLock* xLocks = new SpinLock[numXBlocks];\n    SpinLock* yLocks = new SpinLock[numYBlocks];\n\n    std::cout << \"itemsPerBlock: \" << itemsPerBlock\n              << \" usersPerBlock: \" << usersPerBlock\n              << \" numBlocks: \" << numBlocks << \" numXBlocks: \" << numXBlocks\n              << \" numYBlocks: \" << numYBlocks << \"\\n\";\n\n    // Initialize\n    BlockInfo* blocks = new BlockInfo[numBlocks];\n    for (size_t i = 0; i < numBlocks; i++) {\n      BlockInfo& si = blocks[i];\n      si.id         = i;\n      si.x          = i % numXBlocks;\n      si.y          = i / numXBlocks;\n      si.updates    = 0;\n      si.error      = 0.0;\n      si.userStart  = si.x * usersPerBlock;\n      si.userEnd    = std::min((si.x + 1) * usersPerBlock, 
numUsers);\n      si.itemStart  = si.y * itemsPerBlock;\n      si.itemEnd    = std::min((si.y + 1) * itemsPerBlock, NUM_ITEM_NODES);\n      si.numitems   = si.itemEnd - si.itemStart;\n      if (precomputeOffsets) {\n        si.userOffsets = new int[si.numitems];\n      } else {\n        si.userOffsets = nullptr;\n      }\n    }\n\n    // Partition item edges in blocks to users according to range [userStart,\n    // userEnd)\n    if (precomputeOffsets) {\n      galois::do_all(galois::iterate(g.begin(), g.begin() + NUM_ITEM_NODES),\n                     [&](GNode item) {\n                       size_t sliceY = item / itemsPerBlock;\n                       BlockInfo* s  = &blocks[sliceY * numXBlocks];\n\n                       size_t pos = item - s->itemStart;\n                       auto ii = g.edge_begin(item), ei = g.edge_end(item);\n                       size_t offset = 0;\n                       for (size_t i = 0; i < numXBlocks; ++i, ++s) {\n                         size_t start = userIdToUserNode(s->userStart);\n                         size_t end   = userIdToUserNode(s->userEnd);\n\n                         if (ii != ei && g.getEdgeDst(ii) >= start &&\n                             g.getEdgeDst(ii) < end) {\n                           s->userOffsets[pos] = offset;\n                         } else {\n                           s->userOffsets[pos] = -1;\n                         }\n                         for (; ii != ei && g.getEdgeDst(ii) < end;\n                              ++ii, ++offset)\n                           ;\n                       }\n                     });\n    }\n    preProcessTimer.stop();\n\n    galois::StatTimer executeTimer(\"Time\");\n    executeTimer.start();\n    executeUntilConverged(sf, g,\n                          [&](LatentValue* steps, size_t maxUpdates,\n                              galois::GAccumulator<double>* errorAccum) {\n                            Process fn{g,      xLocks,     yLocks,\n                                   
    blocks, numXBlocks, numYBlocks,\n                                       steps,  maxUpdates, errorAccum};\n                            galois::on_each(fn);\n                          });\n    executeTimer.stop();\n\n    delete[] xLocks;\n    delete[] yLocks;\n    delete[] blocks;\n  }\n};\n\n/*\n * Simple SGD going over all the destination(users) for a given\n * source(Item)\n */\nclass SGDItemsAlgo {\n  static const bool makeSerializable = false;\n\n  struct BasicNode {\n    LatentValue latentVector[LATENT_VECTOR_SIZE];\n  };\n\n  using Node = BasicNode;\n\npublic:\n  bool isSgd() const { return true; }\n\n  typedef typename galois::graphs::LC_CSR_Graph<Node, EdgeType>\n      //::template with_numa_alloc<true>::type\n      ::template with_out_of_line_lockable<true>::type ::\n          template with_no_lockable<!makeSerializable>::type Graph;\n\n  void readGraph(Graph& g) { galois::graphs::readGraph(g, inputFile); }\n\n  std::string name() const { return \"sgdItemsAlgo\"; }\n\n  size_t numItems() const { return NUM_ITEM_NODES; }\n\nprivate:\n  using GNode         = typename Graph::GraphNode;\n  using edge_iterator = typename Graph::edge_iterator;\n\n  struct Execute {\n    Graph& g;\n    galois::GAccumulator<unsigned>& edgesVisited;\n\n    void operator()(LatentValue* steps, int,\n                    galois::GAccumulator<double>* errorAccum) {\n\n      const LatentValue stepSize = steps[0];\n      galois::for_each(\n          galois::iterate(g.begin(), g.begin() + NUM_ITEM_NODES),\n          [&](GNode src, auto&) {\n            for (auto ii : g.edges(src)) {\n\n              GNode dst         = g.getEdgeDst(ii);\n              LatentValue error = doGradientUpdate(\n                  g.getData(src, galois::MethodFlag::UNPROTECTED).latentVector,\n                  g.getData(dst).latentVector, lambda, g.getEdgeData(ii),\n                  stepSize);\n\n              edgesVisited += 1;\n              if (useExactError)\n                *errorAccum += error;\n        
    }\n          },\n          galois::wl<galois::worklists::PerSocketChunkFIFO<64>>(),\n          galois::no_pushes(), galois::loopname(\"sgdItemsAlgo\"));\n    }\n  };\n\npublic:\n  void operator()(Graph& g, const StepFunction& sf) {\n    verify(g, \"sgdItemsAlgo\");\n    galois::GAccumulator<unsigned> edgesVisited;\n\n    galois::StatTimer executeTimer(\"Time\");\n    executeTimer.start();\n\n    Execute fn{g, edgesVisited};\n    executeUntilConverged(sf, g, fn);\n\n    executeTimer.stop();\n\n    galois::runtime::reportStat_Single(\"sgdItemsAlgo\", \"EdgesVisited\",\n                                       edgesVisited.reduce());\n  }\n};\n\n/**\n * Simple by-edge grouped by items (only one edge per item on the WL at any\n * time)\n */\nclass SGDEdgeItem {\n  static const bool makeSerializable = false;\n\n  struct BasicNode {\n    // latent vector to be learned.\n    LatentValue latentVector[LATENT_VECTOR_SIZE];\n    // if a item's update is interrupted, where to start when resuming.\n    unsigned int edge_offset;\n  };\n\n  using Node = BasicNode;\n\npublic:\n  bool isSgd() const { return true; }\n\n  typedef typename galois::graphs::LC_CSR_Graph<Node, EdgeType>\n      //::template with_numa_alloc<true>::type\n      ::template with_out_of_line_lockable<true>::type ::\n          template with_no_lockable<!makeSerializable>::type Graph;\n\n  void readGraph(Graph& g) { galois::graphs::readGraph(g, inputFile); }\n\n  std::string name() const { return \"sgdEdgeItem\"; }\n\n  size_t numItems() const { return NUM_ITEM_NODES; }\n\nprivate:\n  using GNode         = typename Graph::GraphNode;\n  using edge_iterator = typename Graph::edge_iterator;\n\n  struct Execute {\n    Graph& g;\n    galois::GAccumulator<unsigned>& edgesVisited;\n    void operator()(LatentValue* steps, int,\n                    galois::GAccumulator<double>* errorAccum) {\n      const LatentValue stepSize = steps[0];\n      galois::for_each(\n          galois::iterate(g.begin(), g.begin() + 
NUM_ITEM_NODES),\n          [&](GNode src, auto& ctx) {\n            auto ii = g.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n            auto ee = g.edge_end(src, galois::MethodFlag::UNPROTECTED);\n\n            if (ii == ee)\n              return;\n\n            // Do not need lock on the source node, since only one thread can\n            // work on a given src(item).\n            auto& srcData = g.getData(src, galois::MethodFlag::UNPROTECTED);\n            // Advance to the edge that has not been worked yet.\n            std::advance(ii, srcData.edge_offset);\n            // Take lock on the destination as multiple source may update the\n            // same destination.\n            auto& dstData = g.getData(g.getEdgeDst(ii));\n            LatentValue error =\n                doGradientUpdate(srcData.latentVector, dstData.latentVector,\n                                 lambda, g.getEdgeData(ii), stepSize);\n\n            ++srcData.edge_offset;\n            ++ii;\n\n            edgesVisited += 1;\n            if (useExactError)\n              *errorAccum += error;\n\n            if (ii == ee) {\n              // Finished the last edge.\n              // Start from the first edge.\n              srcData.edge_offset = 0;\n              return;\n            } else {\n              // More edges to work on, therefore push the current src\n              // to the worklist.\n              ctx.push(src);\n            }\n          },\n          galois::wl<galois::worklists::PerSocketChunkLIFO<8>>(),\n          galois::loopname(\"sgdEdgeItem\"));\n    }\n  };\n\npublic:\n  void operator()(Graph& g, const StepFunction& sf) {\n    verify(g, \"sgdEdgeItem\");\n    galois::GAccumulator<unsigned> edgesVisited;\n\n    galois::StatTimer executeTimer(\"Time\");\n    executeTimer.start();\n\n    Execute fn{g, edgesVisited};\n    executeUntilConverged(sf, g, fn);\n\n    executeTimer.stop();\n\n    galois::runtime::reportStat_Single(\"sgdEdgeItem\", \"EdgesVisited\",\n        
                               edgesVisited.reduce());\n  }\n};\n\n/*\n * Simple edge-wise operator\n * Use Fixed2DGraphTiledExecutor to divide Items and Users in to blocks.\n * Locks blocks (blocks may share Items or Users) to work on them.\n *\n */\nclass SGDBlockEdgeAlgo {\n  static const bool makeSerializable = false;\n\n  struct BasicNode {\n    LatentValue latentVector[LATENT_VECTOR_SIZE];\n  };\n\n  using Node = BasicNode;\n\npublic:\n  bool isSgd() const { return true; }\n\n  typedef typename galois::graphs::LC_CSR_Graph<Node, EdgeType>\n      //::template with_numa_alloc<true>::type\n      ::template with_out_of_line_lockable<true>::type ::\n          template with_no_lockable<!makeSerializable>::type Graph;\n\n  void readGraph(Graph& g) { galois::graphs::readGraph(g, inputFile); }\n\n  std::string name() const { return \"sgdBlockEdge\"; }\n\n  size_t numItems() const { return NUM_ITEM_NODES; }\n\nprivate:\n  using GNode         = typename Graph::GraphNode;\n  using edge_iterator = typename Graph::edge_iterator;\n\n  struct Execute {\n    Graph& g;\n    galois::GAccumulator<unsigned>& edgesVisited;\n\n    void operator()(LatentValue* steps, int,\n                    galois::GAccumulator<double>* errorAccum) {\n      galois::runtime::Fixed2DGraphTiledExecutor<Graph> executor(g);\n      executor.execute(\n          g.begin(), g.begin() + NUM_ITEM_NODES, g.begin() + NUM_ITEM_NODES,\n          g.end(), itemsPerBlock, usersPerBlock,\n          [&](GNode src, GNode dst, edge_iterator edge) {\n            const LatentValue stepSize = steps[0];\n            LatentValue error          = doGradientUpdate(\n                g.getData(src).latentVector, g.getData(dst).latentVector,\n                lambda, g.getEdgeData(edge), stepSize);\n            edgesVisited += 1;\n            if (useExactError)\n              *errorAccum += error;\n          },\n          true // use locks\n      );\n    }\n  };\n\npublic:\n  void operator()(Graph& g, const StepFunction& sf) {\n  
  verify(g, \"sgdBlockEdgeAlgo\");\n    galois::GAccumulator<unsigned> edgesVisited;\n\n    galois::StatTimer executeTimer(\"Time\");\n    executeTimer.start();\n\n    Execute fn{g, edgesVisited};\n    executeUntilConverged(sf, g, fn);\n\n    executeTimer.stop();\n\n    galois::runtime::reportStat_Single(\"sgdBlockEdgeAlgo\", \"EdgesVisited\",\n                                       edgesVisited.reduce());\n  }\n};\n\n/**\n * ALS algorithms\n */\n\n#ifdef HAS_EIGEN\n\nstruct SimpleALSalgo {\n  bool isSgd() const { return false; }\n  std::string name() const { return \"AlternatingLeastSquares\"; }\n  struct Node {\n    LatentValue latentVector[LATENT_VECTOR_SIZE];\n  };\n\n  typedef typename galois::graphs::LC_CSR_Graph<\n      Node, EdgeType>::with_no_lockable<true>::type Graph;\n  typedef Graph::GraphNode GNode;\n  // Column-major access\n  typedef Eigen::SparseMatrix<LatentValue> Sp;\n  typedef Eigen::Matrix<LatentValue, LATENT_VECTOR_SIZE, Eigen::Dynamic> MT;\n  typedef Eigen::Matrix<LatentValue, LATENT_VECTOR_SIZE, 1> V;\n  typedef Eigen::Map<V> MapV;\n\n  Sp A;\n  Sp AT;\n\n  void readGraph(Graph& g) { galois::graphs::readGraph(g, inputFile); }\n\n  void copyToGraph(Graph& g, MT& WT, MT& HT) {\n    // Copy out\n    for (GNode n : g) {\n      LatentValue* ptr = &g.getData(n).latentVector[0];\n      MapV mapV{ptr};\n      if (n < NUM_ITEM_NODES) {\n        mapV = WT.col(n);\n      } else {\n        mapV = HT.col(n - NUM_ITEM_NODES);\n      }\n    }\n  }\n\n  void copyFromGraph(Graph& g, MT& WT, MT& HT) {\n    for (GNode n : g) {\n      LatentValue* ptr = &g.getData(n).latentVector[0];\n      MapV mapV{ptr};\n      if (n < NUM_ITEM_NODES) {\n        WT.col(n) = mapV;\n      } else {\n        HT.col(n - NUM_ITEM_NODES) = mapV;\n      }\n    }\n  }\n\n  void initializeA(Graph& g) {\n    typedef Eigen::Triplet<int> Triplet;\n    std::vector<Triplet> triplets{g.sizeEdges()};\n    auto it = triplets.begin();\n    for (auto n : g) {\n      for (auto edge : 
g.out_edges(n)) {\n        *it++ = Triplet(n, g.getEdgeDst(edge) - NUM_ITEM_NODES,\n                        g.getEdgeData(edge));\n      }\n    }\n    A.resize(NUM_ITEM_NODES, g.size() - NUM_ITEM_NODES);\n    A.setFromTriplets(triplets.begin(), triplets.end());\n    AT = A.transpose();\n  }\n\n  void operator()(Graph& g, const StepFunction&) {\n    galois::TimeAccumulator elapsed;\n    elapsed.start();\n\n    // Find W, H that minimize ||W H^T - A||_2^2 by solving alternating least\n    // squares problems:\n    //   (W^T W + lambda I) H^T = W^T A (solving for H^T)\n    //   (H^T H + lambda I) W^T = H^T A^T (solving for W^T)\n    MT WT{LATENT_VECTOR_SIZE, NUM_ITEM_NODES};\n    MT HT{LATENT_VECTOR_SIZE, g.size() - NUM_ITEM_NODES};\n    typedef Eigen::Matrix<LatentValue, LATENT_VECTOR_SIZE, LATENT_VECTOR_SIZE>\n        XTX;\n    typedef Eigen::Matrix<LatentValue, LATENT_VECTOR_SIZE, Eigen::Dynamic> XTSp;\n    typedef galois::substrate::PerThreadStorage<XTX> PerThrdXTX;\n\n    galois::gPrint(\"ALS::Start initializeA\\n\");\n    initializeA(g);\n    galois::gPrint(\"ALS::End initializeA\\n\");\n    galois::gPrint(\"ALS::Start copyFromGraph\\n\");\n    copyFromGraph(g, WT, HT);\n    galois::gPrint(\"ALS::End copyFromGraph\\n\");\n\n    double last = -1.0;\n    galois::StatTimer mmTime(\"MMTime\");\n    galois::StatTimer update1Time(\"UpdateTime1\");\n    galois::StatTimer update2Time(\"UpdateTime2\");\n    galois::StatTimer copyTime(\"CopyTime\");\n    galois::StatTimer totalExecTime(\"totalExecTime\");\n    galois::StatTimer totalAlgoTime(\"Time\");\n    PerThrdXTX xtxs;\n\n    totalAlgoTime.start();\n    for (unsigned round = 1;; ++round) {\n      totalExecTime.start();\n      mmTime.start();\n      // TODO parallelize this using tiled executor\n      XTSp WTA = WT * A;\n      mmTime.stop();\n\n      update1Time.start();\n      // TODO: Change to Do_all, pass ints to iterator\n      galois::for_each(\n          galois::iterate(boost::counting_iterator<int>(0),\n       
                   boost::counting_iterator<int>(A.outerSize())),\n          [&](int col, galois::UserContext<int>&) {\n            // Compute WTW = W^T * W for sparse A\n            XTX& WTW = *xtxs.getLocal();\n            WTW.setConstant(0);\n            for (Sp::InnerIterator it(A, col); it; ++it)\n              WTW.triangularView<Eigen::Upper>() +=\n                  WT.col(it.row()) * WT.col(it.row()).transpose();\n            for (int i = 0; i < LATENT_VECTOR_SIZE; ++i)\n              WTW(i, i) += lambda;\n            HT.col(col) =\n                WTW.selfadjointView<Eigen::Upper>().llt().solve(WTA.col(col));\n          });\n      update1Time.stop();\n\n      mmTime.start();\n      XTSp HTAT = HT * AT;\n      mmTime.stop();\n\n      update2Time.start();\n      galois::for_each(\n          galois::iterate(boost::counting_iterator<int>(0),\n                          boost::counting_iterator<int>(AT.outerSize())),\n          [&](int col, galois::UserContext<int>&) {\n            // Compute HTH = H^T * H for sparse A\n            XTX& HTH = *xtxs.getLocal();\n            HTH.setConstant(0);\n            for (Sp::InnerIterator it(AT, col); it; ++it)\n              HTH.triangularView<Eigen::Upper>() +=\n                  HT.col(it.row()) * HT.col(it.row()).transpose();\n            for (int i = 0; i < LATENT_VECTOR_SIZE; ++i)\n              HTH(i, i) += lambda;\n            WT.col(col) =\n                HTH.selfadjointView<Eigen::Upper>().llt().solve(HTAT.col(col));\n          });\n      update2Time.stop();\n\n      copyTime.start();\n      copyToGraph(g, WT, HT);\n      copyTime.stop();\n      totalExecTime.stop();\n\n      double error = sumSquaredError(g);\n      elapsed.stop();\n      std::cout << \"R: \" << round << \" elapsed (ms): \" << elapsed.get()\n                << \" RMSE (R \" << round\n                << \"): \" << std::sqrt(error / g.sizeEdges()) << \"\\n\";\n      elapsed.start();\n\n      if (fixedRounds <= 0 && round > 1 &&\n          
std::abs((last - error) / last) < tolerance)\n        break;\n      if (fixedRounds > 0 && round >= fixedRounds)\n        break;\n\n      last = error;\n    }\n    totalAlgoTime.stop();\n  }\n};\n\nstruct SyncALSalgo {\n\n  bool isSgd() const { return false; }\n\n  std::string name() const { return \"SynchronousAlternatingLeastSquares\"; }\n\n  struct Node {\n    LatentValue latentVector[LATENT_VECTOR_SIZE];\n  };\n\n  static const bool NEEDS_LOCKS = false;\n  typedef typename galois::graphs::LC_CSR_Graph<Node, EdgeType> BaseGraph;\n  typedef typename std::conditional<\n      NEEDS_LOCKS,\n      typename BaseGraph::template with_out_of_line_lockable<true>::type,\n      typename BaseGraph::template with_no_lockable<true>::type>::type Graph;\n  typedef typename Graph::GraphNode GNode;\n  // Column-major access\n  typedef Eigen::SparseMatrix<LatentValue> Sp;\n  typedef Eigen::Matrix<LatentValue, LATENT_VECTOR_SIZE, Eigen::Dynamic> MT;\n  typedef Eigen::Matrix<LatentValue, LATENT_VECTOR_SIZE, 1> V;\n  typedef Eigen::Map<V> MapV;\n  typedef Eigen::Matrix<LatentValue, LATENT_VECTOR_SIZE, LATENT_VECTOR_SIZE>\n      XTX;\n  typedef Eigen::Matrix<LatentValue, LATENT_VECTOR_SIZE, Eigen::Dynamic> XTSp;\n\n  typedef galois::substrate::PerThreadStorage<XTX> PerThrdXTX;\n  typedef galois::substrate::PerThreadStorage<V> PerThrdV;\n\n  Sp A;\n  Sp AT;\n\n  void readGraph(Graph& g) { galois::graphs::readGraph(g, inputFile); }\n\n  void copyToGraph(Graph& g, MT& WT, MT& HT) {\n    // Copy out\n    for (GNode n : g) {\n      LatentValue* ptr = &g.getData(n).latentVector[0];\n      MapV mapV{ptr};\n      if (n < NUM_ITEM_NODES) {\n        mapV = WT.col(n);\n      } else {\n        mapV = HT.col(n - NUM_ITEM_NODES);\n      }\n    }\n  }\n\n  void copyFromGraph(Graph& g, MT& WT, MT& HT) {\n    for (GNode n : g) {\n      LatentValue* ptr = &g.getData(n).latentVector[0];\n      MapV mapV{ptr};\n      if (n < NUM_ITEM_NODES) {\n        WT.col(n) = mapV;\n      } else {\n        HT.col(n - 
NUM_ITEM_NODES) = mapV;\n      }\n    }\n  }\n\n  void initializeA(Graph& g) {\n    typedef Eigen::Triplet<int> Triplet;\n    std::vector<Triplet> triplets{g.sizeEdges()};\n    auto it = triplets.begin();\n    for (auto n : g) {\n      for (auto edge : g.out_edges(n)) {\n        *it++ = Triplet(n, g.getEdgeDst(edge) - NUM_ITEM_NODES,\n                        g.getEdgeData(edge));\n      }\n    }\n    A.resize(NUM_ITEM_NODES, g.size() - NUM_ITEM_NODES);\n    A.setFromTriplets(triplets.begin(), triplets.end());\n    AT = A.transpose();\n  }\n\n  void update(Graph& g, size_t col, MT& WT, MT& HT, PerThrdXTX& xtxs,\n              PerThrdV& rhs) {\n    // Compute WTW = W^T * W for sparse A\n    V& r = *rhs.getLocal();\n    if (col < NUM_ITEM_NODES) {\n      r.setConstant(0);\n      // HTAT = HT * AT; r = HTAT.col(col)\n      for (Sp::InnerIterator it(AT, col); it; ++it)\n        r += it.value() * HT.col(it.row());\n      XTX& HTH = *xtxs.getLocal();\n      HTH.setConstant(0);\n      for (Sp::InnerIterator it(AT, col); it; ++it)\n        HTH.triangularView<Eigen::Upper>() +=\n            HT.col(it.row()) * HT.col(it.row()).transpose();\n      for (int i = 0; i < LATENT_VECTOR_SIZE; ++i)\n        HTH(i, i) += lambda;\n      WT.col(col) = HTH.selfadjointView<Eigen::Upper>().llt().solve(r);\n    } else {\n      col = col - NUM_ITEM_NODES;\n      r.setConstant(0);\n      // WTA = WT * A; x = WTA.col(col)\n      for (Sp::InnerIterator it(A, col); it; ++it)\n        r += it.value() * WT.col(it.row());\n      XTX& WTW = *xtxs.getLocal();\n      WTW.setConstant(0);\n      for (Sp::InnerIterator it(A, col); it; ++it)\n        WTW.triangularView<Eigen::Upper>() +=\n            WT.col(it.row()) * WT.col(it.row()).transpose();\n      for (int i = 0; i < LATENT_VECTOR_SIZE; ++i)\n        WTW(i, i) += lambda;\n      HT.col(col) = WTW.selfadjointView<Eigen::Upper>().llt().solve(r);\n    }\n  }\n\n  struct NonDetTraits {\n    typedef std::tuple<> base_function_traits;\n  };\n\n  struct 
Process {\n    struct LocalState {\n      LocalState(Process&, galois::PerIterAllocTy&) {}\n    };\n\n    struct DeterministicId {\n      uintptr_t operator()(size_t x) const { return x; }\n    };\n\n    typedef std::tuple<galois::per_iter_alloc, galois::intent_to_read,\n                       galois::local_state<LocalState>,\n                       galois::det_id<DeterministicId>>\n        ikdg_function_traits;\n    typedef std::tuple<galois::per_iter_alloc, galois::fixed_neighborhood,\n                       galois::local_state<LocalState>,\n                       galois::det_id<DeterministicId>>\n        add_remove_function_traits;\n    typedef std::tuple<> nondet_function_traits;\n\n    SyncALSalgo& self;\n    Graph& g;\n    MT& WT;\n    MT& HT;\n    PerThrdXTX& xtxs;\n    PerThrdV& rhs;\n\n    Process(SyncALSalgo& self, Graph& g, MT& WT, MT& HT, PerThrdXTX& xtxs,\n            PerThrdV& rhs)\n        : self(self), g(g), WT(WT), HT(HT), xtxs(xtxs), rhs(rhs) {}\n\n    void operator()(size_t col, galois::UserContext<size_t>& ctx) {\n      self.update(g, col, WT, HT, xtxs, rhs);\n    }\n  };\n\n  void operator()(Graph& g, const StepFunction&) {\n    if (!useSameLatentVector) {\n      galois::gWarn(\"Results are not deterministic with different numbers of \"\n                    \"threads unless -useSameLatentVector is true\");\n    }\n    galois::TimeAccumulator elapsed;\n    elapsed.start();\n\n    // Find W, H that minimize ||W H^T - A||_2^2 by solving alternating least\n    // squares problems:\n    //   (W^T W + lambda I) H^T = W^T A (solving for H^T)\n    //   (H^T H + lambda I) W^T = H^T A^T (solving for W^T)\n    MT WT{LATENT_VECTOR_SIZE, NUM_ITEM_NODES};\n    MT HT{LATENT_VECTOR_SIZE, g.size() - NUM_ITEM_NODES};\n\n    initializeA(g);\n    copyFromGraph(g, WT, HT);\n\n    double last = -1.0;\n    galois::StatTimer updateTime(\"UpdateTime\");\n    galois::StatTimer copyTime(\"CopyTime\");\n    galois::StatTimer totalExecTime(\"totalExecTime\");\n    
galois::StatTimer totalAlgoTime(\"Time\");\n    PerThrdXTX xtxs;\n    PerThrdV rhs;\n\n    totalAlgoTime.start();\n    for (unsigned round = 1;; ++round) {\n\n      totalExecTime.start();\n      updateTime.start();\n\n      typedef galois::worklists::PerThreadChunkLIFO<ALS_CHUNK_SIZE> WL_ty;\n      galois::for_each(\n          galois::iterate(boost::counting_iterator<size_t>(0),\n                          boost::counting_iterator<size_t>(NUM_ITEM_NODES)),\n          Process(*this, g, WT, HT, xtxs, rhs), galois::wl<WL_ty>(),\n          galois::loopname(\"syncALS-users\"));\n      galois::for_each(\n          galois::iterate(boost::counting_iterator<size_t>(NUM_ITEM_NODES),\n                          boost::counting_iterator<size_t>(g.size())),\n          Process(*this, g, WT, HT, xtxs, rhs), galois::wl<WL_ty>(),\n          galois::loopname(\"syncALS-items\"));\n\n      updateTime.stop();\n\n      copyTime.start();\n      copyToGraph(g, WT, HT);\n      copyTime.stop();\n      totalExecTime.stop();\n\n      double error = sumSquaredError(g);\n      elapsed.stop();\n      std::cout << \"R: \" << round << \" elapsed (ms): \" << elapsed.get()\n                << \" RMSE (R \" << round\n                << \"): \" << std::sqrt(error / g.sizeEdges()) << \"\\n\";\n      elapsed.start();\n\n      if (fixedRounds <= 0 && round > 1 &&\n          std::abs((last - error) / last) < tolerance)\n        break;\n      if (fixedRounds > 0 && round >= fixedRounds)\n        break;\n\n      last = error;\n    } // end for\n    totalAlgoTime.stop();\n  }\n};\n\n#endif // HAS_EIGEN\n\n/**\n * Initializes latent vector with random values and returns basic graph\n * parameters.\n *\n * @tparam Graph type of g\n * @param g Graph to initialize\n * @returns number of item nodes, i.e. nodes with outgoing edges. 
They should\n * be the first nodes of the graph in memory\n */\n\ntemplate <typename Graph>\nsize_t initializeGraphData(Graph& g) {\n  galois::gPrint(\"initializeGraphData\\n\");\n  galois::StatTimer initTimer(\"InitializeGraph\");\n  initTimer.start();\n  double top = 1.0 / std::sqrt(LATENT_VECTOR_SIZE);\n  galois::substrate::PerThreadStorage<std::mt19937> gen;\n\n#if __cplusplus >= 201103L || defined(HAVE_CXX11_UNIFORM_INT_DISTRIBUTION)\n  std::uniform_real_distribution<LatentValue> dist(0, top);\n#else\n  std::uniform_real<LatentValue> dist(0, top);\n#endif\n\n  if (useDetInit) {\n    galois::do_all(galois::iterate(g), [&](typename Graph::GraphNode n) {\n      auto& data = g.getData(n);\n      auto val   = genVal(n);\n      for (int i = 0; i < LATENT_VECTOR_SIZE; i++) {\n        data.latentVector[i] = val;\n      }\n    });\n  } else {\n    galois::do_all(galois::iterate(g), [&](typename Graph::GraphNode n) {\n      auto& data = g.getData(n);\n\n      // all threads initialize their assignment with same generator or\n      // a thread local one\n      if (useSameLatentVector) {\n        std::mt19937 sameGen;\n        for (int i = 0; i < LATENT_VECTOR_SIZE; i++) {\n          data.latentVector[i] = dist(sameGen);\n        }\n      } else {\n        for (int i = 0; i < LATENT_VECTOR_SIZE; i++) {\n          data.latentVector[i] = dist(*gen.getLocal());\n        }\n      }\n    });\n  }\n\n  auto activeThreads = galois::getActiveThreads();\n  std::vector<uint32_t> largestNodeID_perThread(activeThreads);\n\n  galois::on_each([&](unsigned tid, unsigned nthreads) {\n    unsigned int block_size = g.size() / nthreads;\n    if ((g.size() % nthreads) > 0)\n      ++block_size;\n\n    uint32_t start = tid * block_size;\n    uint32_t end   = (tid + 1) * block_size;\n    if (end > g.size())\n      end = g.size();\n\n    largestNodeID_perThread[tid] = 0;\n    for (uint32_t i = start; i < end; ++i) {\n      if (std::distance(g.edge_begin(i), g.edge_end(i))) {\n        if 
(largestNodeID_perThread[tid] < i)\n          largestNodeID_perThread[tid] = i;\n      }\n    }\n  });\n\n  uint32_t largestNodeID = 0;\n  for (uint32_t t = 0; t < activeThreads; ++t) {\n    if (largestNodeID < largestNodeID_perThread[t])\n      largestNodeID = largestNodeID_perThread[t];\n  }\n  size_t numItemNodes = largestNodeID + 1;\n\n  initTimer.stop();\n  return numItemNodes;\n}\n\nStepFunction* newStepFunction() {\n  switch (learningRateFunction) {\n  case Step::intel:\n    return new IntelStepFunction;\n  case Step::purdue:\n    return new PurdueStepFunction;\n  case Step::bottou:\n    return new BottouStepFunction;\n  case Step::inverse:\n    return new InverseStepFunction;\n  case Step::bold:\n    return new BoldStepFunction;\n  default:\n    GALOIS_DIE(\"unknown step function\");\n  }\n}\n\ntemplate <typename Graph>\nvoid writeBinaryLatentVectors(Graph& g, const std::string& filename) {\n  std::ofstream file(filename);\n  for (auto ii = g.begin(), ei = g.end(); ii != ei; ++ii) {\n    auto& v = g.getData(*ii).latentVector;\n    for (int i = 0; i < LATENT_VECTOR_SIZE; ++i) {\n      file.write(reinterpret_cast<char*>(&v[i]), sizeof(v[i]));\n    }\n  }\n  file.close();\n}\n\ntemplate <typename Graph>\nvoid writeAsciiLatentVectors(Graph& g, const std::string& filename) {\n  std::ofstream file(filename);\n  for (auto ii = g.begin(), ei = g.end(); ii != ei; ++ii) {\n    auto& v = g.getData(*ii).latentVector;\n    for (int i = 0; i < LATENT_VECTOR_SIZE; ++i) {\n      file << v[i] << \" \";\n    }\n    file << \"\\n\";\n  }\n  file.close();\n}\n\n/**\n * Run the provided algorithm (provided through the template argument).\n *\n * @param Algo algorithm to run\n */\ntemplate <typename Algo>\nvoid run() {\n  typename Algo::Graph g;\n  Algo algo;\n\n  galois::runtime::reportNumaAlloc(\"NumaAlloc0\");\n\n  // Bipartite graph in general graph data structure should be following:\n  // * items are the first m nodes\n  // * users are the next n nodes\n  // * only items 
have outedges\n  algo.readGraph(g);\n\n  galois::runtime::reportNumaAlloc(\"NumaAlloc1\");\n\n  // initialize latent vectors and get number of item nodes\n  NUM_ITEM_NODES = initializeGraphData(g);\n\n  galois::runtime::reportNumaAlloc(\"NumaAlloc2\");\n\n  std::cout << \"num users: \" << g.size() - NUM_ITEM_NODES\n            << \" num items: \" << NUM_ITEM_NODES\n            << \" num ratings: \" << g.sizeEdges() << \"\\n\";\n\n  std::unique_ptr<StepFunction> sf{newStepFunction()};\n  std::cout << \"latent vector size: \" << LATENT_VECTOR_SIZE\n            << \" algo: \" << algo.name() << \" lambda: \" << lambda;\n\n  if (algo.isSgd()) {\n    std::cout << \" learning rate: \" << learningRate\n              << \" decay rate: \" << decayRate\n              << \" step function: \" << sf->name();\n  }\n\n  std::cout << \"\\n\";\n\n  if (!skipVerify) {\n    verify(g, \"Initial\");\n  }\n\n  // algorithm call\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  algo(g, *sf);\n  execTime.stop();\n\n  if (!skipVerify) {\n    verify(g, \"Final\");\n  }\n\n  if (outputFilename != \"\") {\n    std::cout << \"Writing latent vectors to \" << outputFilename << \"\\n\";\n    switch (outputType) {\n    case OutputType::binary:\n      writeBinaryLatentVectors(g, outputFilename);\n      break;\n    case OutputType::ascii:\n      writeAsciiLatentVectors(g, outputFilename);\n      break;\n    default:\n      GALOIS_DIE(\"invalid output type for latent vector output\");\n    }\n  }\n\n  galois::runtime::reportNumaAlloc(\"NumaAlloc\");\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, nullptr, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  switch (algo) {\n#ifdef HAS_EIGEN\n  case Algo::syncALS:\n    run<SyncALSalgo>();\n    break;\n  case Algo::simpleALS:\n    run<SimpleALSalgo>();\n    break;\n#endif\n  case Algo::sgdByItems:\n    run<SGDItemsAlgo>();\n    break;\n  
case Algo::sgdByEdges:\n    run<SGDEdgeItem>();\n    break;\n  case Algo::sgdBlockEdge:\n    run<SGDBlockEdgeAlgo>();\n    break;\n  case Algo::sgdBlockJump:\n    run<SGDBlockJumpAlgo>();\n    break;\n  default:\n    GALOIS_DIE(\"unknown algorithm\");\n    break;\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/matrixcompletion/matrixCompletion.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef LONESTAR_MATRIXCOMPLETION_H\n#define LONESTAR_MATRIXCOMPLETION_H\n\n#include <cassert>\n#include <galois/gstl.h>\n#include <string>\n#include \"llvm/Support/CommandLine.h\"\n\ntypedef float LatentValue;\ntypedef float EdgeType;\n\n// Purdue, CSGD: 100; Intel: 20\n// static const int LATENT_VECTOR_SIZE = 100;\nstatic const int LATENT_VECTOR_SIZE = 20; // Purdue, CSGD: 100; Intel: 20\n\n/**\n * Common commandline parameters to for matrix completion algorithms\n */\nenum OutputType { binary, ascii };\n\nnamespace cll = llvm::cl;\n\n/*\n * (Purdue, Neflix): 0.012, (Purdue, Yahoo Music): 0.00075, (Purdue, HugeWiki):\n * 0.001 Intel: 0.001 Bottou: 0.1\n */\nstatic cll::opt<float> learningRate(\"learningRate\",\n                                    cll::desc(\"learning rate parameter [alpha] \"\n                       
                       \"for Bold, Bottou, Intel and \"\n                                              \"Purdue step size function\"),\n                                    cll::init(0.012));\n\n/*\n * (Purdue, Netflix): 0.015, (Purdue, Yahoo Music): 0.01,\n * (Purdue, HugeWiki): 0.0, Intel: 0.9\n */\nstatic cll::opt<float> decayRate(\"decayRate\",\n                                 cll::desc(\"decay rate parameter [beta] for \"\n                                           \"Intel and Purdue step size \"\n                                           \"function\"),\n                                 cll::init(0.015));\n/*\n * (Purdue, Netflix): 0.05, (Purdue, Yahoo Music): 1.0, (Purdue, HugeWiki): 0.01\n * Intel: 0.001\n */\nstatic cll::opt<float> lambda(\"lambda\",\n                              cll::desc(\"regularization parameter [lambda]\"),\n                              cll::init(0.05));\n\nstatic cll::opt<unsigned> usersPerBlock(\"usersPerBlock\",\n                                        cll::desc(\"users per block\"),\n                                        cll::init(2048));\nstatic cll::opt<unsigned> itemsPerBlock(\"itemsPerBlock\",\n                                        cll::desc(\"items per block\"),\n                                        cll::init(350));\nstatic cll::opt<float>\n    tolerance(\"tolerance\", cll::desc(\"convergence tolerance\"), cll::init(0.01));\n\nstatic cll::opt<bool> useSameLatentVector(\"useSameLatentVector\",\n                                          cll::desc(\"initialize all nodes to \"\n                                                    \"use same latent vector\"),\n                                          cll::init(false));\n\n/*\n * Regarding algorithm termination\n */\nstatic cll::opt<unsigned> maxUpdates(\"maxUpdates\",\n                                     cll::desc(\"Max number of times to update \"\n                                               \"latent vectors (default 100)\"),\n                                     
cll::init(100));\n\nstatic cll::opt<std::string>\n    outputFilename(cll::Positional, cll::desc(\"[output file]\"), cll::init(\"\"));\nstatic cll::opt<std::string>\n    transposeGraphName(\"graphTranspose\", cll::desc(\"Transpose of input graph\"));\nstatic cll::opt<OutputType>\n    outputType(\"output\", cll::desc(\"Output type:\"),\n               cll::values(clEnumValN(OutputType::binary, \"binary\", \"Binary\"),\n                           clEnumValN(OutputType::ascii, \"ascii\", \"ASCII\")),\n               cll::init(OutputType::binary));\n\nstatic cll::opt<unsigned int>\n    updatesPerEdge(\"updatesPerEdge\", cll::desc(\"number of updates per edge\"),\n                   cll::init(1));\n\nstatic cll::opt<unsigned int>\n    fixedRounds(\"fixedRounds\", cll::desc(\"run for a fixed number of rounds\"),\n                cll::init(0));\nstatic cll::opt<bool> useExactError(\"useExactError\",\n                                    cll::desc(\"use exact error for testing \"\n                                              \"convergence\"),\n                                    cll::init(false));\nstatic cll::opt<bool>\n    useDetInit(\"useDetInit\",\n               cll::desc(\"initialize all nodes to \"\n                         \"use deterministic values for latent vector\"),\n               cll::init(false));\n\n/**\n * Inner product of 2 vectors.\n *\n * Like std::inner_product but rewritten here to check vectorization\n *\n * @param first1 Pointer to beginning of vector 1\n * @param last1 Pointer to end of vector 1. Should be exactly LATENT_VECTOR_SIZE\n * away from first1\n * @param first2 Pointer to beginning of vector 2. Should have at least\n * LATENT_VECTOR_SIZE elements from this point.\n * @param init Initial value to accumulate sum into\n *\n * @returns init + the inner product (i.e. 
the inner product if init is 0, error\n * if init is -\"ground truth\"\n */\ntemplate <typename T>\nT innerProduct(T* __restrict__ first1,\n               T* __restrict__ GALOIS_USED_ONLY_IN_DEBUG(last1),\n               T* __restrict__ first2, T init) {\n  assert(first1 + LATENT_VECTOR_SIZE == last1);\n  for (int i = 0; i < LATENT_VECTOR_SIZE; ++i) {\n    init += first1[i] * first2[i];\n  }\n  return init;\n}\n\ntemplate <typename T>\nT predictionError(T* __restrict__ itemLatent, T* __restrict__ userLatent,\n                  double actual) {\n  T v = actual;\n  return innerProduct(itemLatent, itemLatent + LATENT_VECTOR_SIZE, userLatent,\n                      -v);\n}\n\n/**\n * Objective: squared loss with weighted-square-norm regularization\n *\n * Updates latent vectors to reduce the error from the edge value.\n *\n * @param itemLatent latent vector of the item\n * @param userLatent latent vector of the user\n * @param lambda learning parameter\n * @param edgeRating Data on the edge, i.e. 
the number that the inner product\n * of the 2 latent vectors should eventually get to\n * @param stepSize learning parameter: how much to adjust vectors by to\n * correct for erro\n *\n * @return Error before gradient update\n */\ntemplate <typename T>\nT doGradientUpdate(T* __restrict__ itemLatent, T* __restrict__ userLatent,\n                   double lambda, double edgeRating, double stepSize) {\n  // Implicit cast to type T\n  T l      = lambda;\n  T step   = stepSize;\n  T rating = edgeRating;\n  T error  = innerProduct(itemLatent, itemLatent + LATENT_VECTOR_SIZE,\n                         userLatent, -rating);\n\n  // Take gradient step to reduce error\n  for (int i = 0; i < LATENT_VECTOR_SIZE; i++) {\n    T prevItem = itemLatent[i];\n    T prevUser = userLatent[i];\n    itemLatent[i] -= step * (error * prevUser + l * prevItem);\n    userLatent[i] -= step * (error * prevItem + l * prevUser);\n  }\n\n  return error;\n}\n\nstruct StepFunction {\n  virtual LatentValue stepSize(int round) const = 0;\n  virtual std::string name() const              = 0;\n  virtual bool isBold() const { return false; }\n  virtual ~StepFunction() {}\n};\n\n/*\n * Generate a number [-1, 1] using node id\n * for deterministic runs\n */\nstatic double genVal(uint32_t n) {\n  return 2.0 * ((double)n / (double)RAND_MAX) - 1.0;\n}\n\nStepFunction* newStepFunction();\n\ntemplate <typename Graph>\nsize_t initializeGraphData(Graph& g);\n\n#endif\n"
  },
  {
    "path": "lonestar/analytics/cpu/matrixcompletion/parselog.sh",
    "content": "#!/bin/bash\nperl -pe 'if(/(\\d+) elapsed \\(ms\\): (\\S+).*GFLOP\\/s: (\\S+) RMSE.*: (\\S+)/){ print \"STAT SINGLE Elapsed$1 (null) $2\\n\";}' \\\n| perl -pe 'if(/(\\d+) elapsed \\(ms\\): (\\S+).*GFLOP\\/s: (\\S+) RMSE.*: (\\S+)/){ print \"STAT SINGLE GFLOPS$1 (null) $3\\n\";}' \\\n| perl -pe 'if(/(\\d+) elapsed \\(ms\\): (\\S+).*GFLOP\\/s: (\\S+) RMSE.*: (\\S+)/){ print \"STAT SINGLE RMSE$1 (null) $4\\n\";}' \\\n| ~/w/GaloisDefault/scripts/report.py\n"
  },
  {
    "path": "lonestar/analytics/cpu/matrixcompletion/plot.R",
    "content": "library(grid)\nlibrary(gridExtra)\nlibrary(ggplot2)\nlibrary(plyr)\nlibrary(reshape2)\n\ntheme_set(theme_bw(base_size=16) + \n  theme(legend.background=element_rect(color=\"white\", fill=\"white\"),\n        legend.key=element_rect(color=\"white\", fill=\"white\"),\n        #legend.margin=unit(0, \"lines\"),\n        #plot.margin=unit(c(0, 0.25, 0, 0), \"lines\"),\n        strip.background=element_rect(color=\"white\", fill=\"white\")))\n\ndd <- read.csv(\"data.csv\", stringsAsFactors=F)\ndd <- dd[,!grepl(\"null\", names(dd))]\ndd <- subset(dd, !is.na(RMSE1))\ndd$Kind <- \"\"\ndd$Kind[grepl(\"mc-ddn\", dd$CommandLine)] <- \"galois\"\ndd$Kind[grepl(\"nomad\", dd$CommandLine)] <- \"nomad\"\ndd$Kind[grepl(\"collaborative\", dd$CommandLine)] <- \"graphlab\"\ndd$Input <- \"\"\ndd$Input[grepl(\"bgg\", dd$CommandLine)] <- \"bgg\"\ndd$Input[grepl(\"yahoo\", dd$CommandLine)] <- \"yahoo\"\ndd$Input[grepl(\"netflix\", dd$CommandLine)] <- \"netflix\"\n\ng1 <- ddply(dd, .(Input, Kind, Threads), function (d) {\n    ldply(grep(\"GFLOPS\", names(d), value=T), function(n) {\n      i=sub(\"GFLOPS(\\\\d+)\", \"\\\\1\", n)\n      if (any(is.na(d[,n]))) data.frame() else data.frame(I=as.numeric(i), GFLOPS=d[,n])})\n})\ng2 <- ddply(dd, .(Input, Kind, Threads), function (d) {\n    ldply(grep(\"Elapsed\", names(d), value=T), function(n) {\n      i=sub(\"Elapsed(\\\\d+)\", \"\\\\1\", n)\n      if (any(is.na(d[,n]))) data.frame() else data.frame(I=as.numeric(i), Elapsed=d[,n])})\n})\ng3 <- ddply(dd, .(Input, Kind, Threads), function (d) {\n    ldply(grep(\"RMSE\", names(d), value=T), function(n) {\n      i=sub(\"RMSE(\\\\d+)\", \"\\\\1\", n)\n      if (any(is.na(d[,n]))) data.frame() else data.frame(I=as.numeric(i), RMSE=d[,n])})\n})\n\ngg <- merge(g1, merge(g2, g3))\ngg <- rbind(gg,\n  data.frame(\n    Input=rep(c(\"bgg\", \"netflix\", \"yahoo\"), 3),\n    Kind=c(rep(\"galois\", 3), rep(\"graphlab\", 3), rep(\"nomad\", 3)),\n    Threads=20,\n    I=0,\n    GFLOPS=NA,\n    
Elapsed=0,\n    RMSE=rep(c(70.6059, 3.40143, 61.7137), 3)\n  ))\ngg$Elapsed <- gg$Elapsed / 1000\n\ng_legend<-function(a.gplot){\n  tmp <- ggplot_gtable(ggplot_build(a.gplot))\n  leg <- which(sapply(tmp$grobs, function(x) x$name) == \"guide-box\")\n  legend <- tmp$grobs[[leg]]\n  return(legend)\n}\n\nerrorLevelY <- ddply(subset(gg, Threads==20), .(Input), summarize, intercept=1.1*min(RMSE))\nerrorLevelX <- ddply(subset(gg, Threads==20), .(Input, Kind), function(d) {\n  fn <- approxfun(d$Elapsed, d$RMSE)\n  target <- subset(errorLevelY, Input==d$Input[1])$intercept[1]\n  opt <- optimize(f=function(x) { (fn(x) - target)^2 }, c(0, 1000))\n  data.frame(xintercept=opt$minimum)\n})\n\np1 <- ggplot(subset(gg, Input==\"bgg\" & Threads==20), aes(x=Elapsed, y=RMSE, color=Kind)) +\n  geom_line(size=1) +\n  scale_x_continuous(\"\") +\n  scale_y_continuous(\"\") +\n  scale_color_brewer(type=\"qual\") +\n  geom_abline(data=subset(errorLevelY, Input==\"bgg\"), aes(intercept=intercept, slope=0)) +\n  geom_vline(data=subset(errorLevelX, Input==\"bgg\"),\n             linetype=\"dashed\",\n             aes(xintercept=xintercept, color=Kind)) +\n  geom_text(data=subset(errorLevelX, Input==\"bgg\"),\n            color=\"black\",\n            aes(x=xintercept, y=0, label=sprintf(\"%.0f\", xintercept), vjust = -1, hjust = 0)) +\n  facet_wrap(~Input, scale=\"free\") +\n  theme(legend.position=\"bottom\") +\n  coord_cartesian(xlim=c(0, 500), ylim=c(0, 15))\np2 <- ggplot(subset(gg, Input==\"netflix\" & Threads==20), aes(x=Elapsed, y=RMSE, color=Kind)) +\n  geom_line(size=1) +\n  scale_x_continuous(\"\") +\n  scale_y_continuous(\"\") +\n  scale_color_brewer(type=\"qual\") +\n  geom_abline(data=subset(errorLevelY, Input==\"netflix\"), aes(intercept=intercept, slope=0)) +\n  geom_vline(data=subset(errorLevelX, Input==\"netflix\"),\n             linetype=\"dashed\",\n             aes(xintercept=xintercept, color=Kind)) +\n  geom_text(data=subset(errorLevelX, Input==\"netflix\"),\n            
color=\"black\",\n            aes(x=xintercept, y=0.75, label=sprintf(\"%.0f\", xintercept), vjust = -1, hjust = 0)) +\n  facet_wrap(~Input, scale=\"free\") +\n  theme(legend.position=\"none\") +\n  coord_cartesian(xlim=c(0, 200), ylim=c(0.75, 1.5))\np3 <- ggplot(subset(gg, Input==\"yahoo\" & Threads==20), aes(x=Elapsed, y=RMSE, color=Kind)) +\n  geom_line(size=1) +\n  scale_x_continuous(\"\") +\n  scale_y_continuous(\"\") +\n  scale_color_brewer(type=\"qual\") +\n  geom_abline(data=subset(errorLevelY, Input==\"yahoo\"), aes(intercept=intercept, slope=0)) +\n  geom_vline(data=subset(errorLevelX, Input==\"yahoo\"),\n             linetype=\"dashed\",\n             aes(xintercept=xintercept, color=Kind)) +\n  geom_text(data=subset(errorLevelX, Input==\"yahoo\"),\n            color=\"black\",\n            aes(x=xintercept, y=c(18, 0, 17), label=sprintf(\"%.0f\", xintercept), vjust = -1, hjust = 0)) +\n  facet_wrap(~Input, scale=\"free\") +\n  theme(legend.position=\"none\") +\n  coord_cartesian(xlim=c(0, 700), ylim=c(17, 30))\nplegend <- g_legend(p1)\npp <- arrangeGrob(\n  arrangeGrob(p1 + theme(legend.position=\"none\"), p2, p3, nrow=1),\n  plegend, nrow=2, heights=c(1, 0.1))\n\nggplot(gg, aes(x=as.factor(Threads), y=GFLOPS, color=Kind)) + \n  geom_boxplot(position=\"identity\", outlier.colour = NULL) +\n  scale_color_brewer(type=\"qual\") +\n  scale_x_discrete(\"Threads\") +\n  facet_wrap(~Input, scale=\"free\")\n\n\ndd <- read.csv(\"/net/peltier/workspace/ddn/build/default/tyahoo-transpose.degreehist\")\ndd <- read.csv(\"/net/peltier/workspace/ddn/build/default/tyahoo.degreehist\")\ndd <- subset(dd, Degree != 0)\nmod <- nls(Count ~ exp(a + b * Degree), data=dd, start = list(a=0, b=0))\ndd$Kind <- \"O\"\npp <- data.frame(Hist=0, Degree=dd$Degree, Count=predict(mod, list(x=dd$Degree)), Kind=\"P\")\ndd <- rbind(pp, dd)\nggplot(dd, aes(x=Degree, y=Count, color=Kind)) +\n  geom_point() +\n  scale_x_log10()\n"
  },
  {
    "path": "lonestar/analytics/cpu/matrixcompletion/runexp.py",
    "content": "#!/usr/bin/env python\n\nimport os\nimport sys\nimport subprocess\nimport math\n\nRootDir = '/net/gilbert/workspace/ddn/build'\nRunPy = '%s/w/GaloisDefault/scripts/run.py' % os.environ['HOME']\nCommandTable = [{\n  'name': 'bgg',\n  'lambda': '0.01',\n  'learningRate': '0.0005',\n  'decayRate': '0.001',\n  'time': 500,\n  'secondsPerIteration': {'nomad': 1, 'galois': 0.25, 'graphlab': 0.5 },\n  'nomadSecondsPerIteration': 10,\n  'galoisOption': '-itemsPerBlock 200 -usersPerBlock 1024',\n}, {\n  'name': 'netflix',\n  'lambda': '0.05',\n  'learningRate': '0.012',\n  'decayRate': '0.015',\n  'time': 1000,\n  'secondsPerIteration': {'nomad': 10, 'galois': 1, 'graphlab': 30 },\n  'galoisOption': '-itemsPerBlock 150 -usersPerBlock 2048'\n}, {\n  'name': 'yahoo',\n  'lambda': '1.0',\n  'learningRate': '0.00075',\n  'decayRate': '0.01',\n  'time': 1000,\n  'secondsPerIteration': {'nomad': 25, 'galois': 25, 'graphlab': 100 },\n  #'galoisOption': '-itemsPerBlock 1500 -usersPerBlock 40000' # transpose\n  'galoisOption': '-itemsPerBlock 4625 -usersPerBlock 7025'\n}]\n\ndef galoisCommand(c):\n  # exp/apps/sgd/sgd-ddn /net/faraday/workspace/inputs/weighted/bipartite/bgg.gr -t 20 -algo blockedEdge -lambda 0.01 -learningRate 0.0005 -decayRate 0.001 -learningRateFunction Purdue -itemsPerBlock 200 -usersPerBlock 1024 -fixedRounds 400 -useExactError\n  cmd = os.path.join(RootDir, 'default/exp/apps/matrixcompletion/mc-ddn')\n  input = os.path.join('/net/faraday/workspace/inputs/weighted/bipartite', c['name'] + '.gr')\n  \n  iterations = int(math.ceil(c['time'] / c['secondsPerIteration']['galois']))\n  opts = ['-useSameLatentVector', '-algo blockedEdge', '-useExactError', '-fixedRounds', str(iterations)]\n  opts += ['-learningRateFunction purdue']\n  opts += [c['galoisOption']]\n  nparams = ['-lambda', '-learningRate', '-decayRate']\n  cparams = ['lambda', 'learningRate', 'decayRate']\n  params = [(n, c[p]) for (n,p) in zip(nparams, cparams)]\n\n  s = [RunPy, '-t 
20,40', '--', cmd, input]\n  #s = [RunPy, '-t 20', '--', cmd, input]\n  s += [v for s in params for v in s]\n  s += opts\n  subprocess.call(' '.join(s), shell=True)\n\ndef graphlabCommand(c):\n  # toolkits/collaborative_filtering/sgd --matrix /net/faraday/workspace/ddn/inputs/graphlab/bgg --D 100 --lambda 0.01 --gamma 0.0005 --step_dec 0.001 --ncpus 40 --max_iter 400\n  cmd = os.path.join(RootDir, 'graphlab/toolkits/collaborative_filtering/sgd')\n  input = os.path.join('/net/faraday/workspace/ddn/inputs/graphlab', c['name'])\n  \n  iterations = int(math.ceil(c['time'] / c['secondsPerIteration']['graphlab']))\n  opts = ['--D 100', '--max_iter', str(iterations)]\n  nparams = ['--lambda', '--gamma', '--step_dec']\n  cparams = ['lambda', 'learningRate', 'decayRate']\n  params = [(n, c[p]) for (n,p) in zip(nparams, cparams)]\n\n  s = [RunPy, '--no-default-thread', '-x Threads::--ncpus::20,40', '--', cmd, input]\n  s += [v for s in params for v in s]\n  s += opts\n  subprocess.call(' '.join(s), shell=True)\n\ndef nomadCommand(c):\n  # mpirun ./nomad_double --path /net/faraday/workspace/ddn/inputs/nomad/bgg --nthreads 40 --lrate 0.0005  --drate 0.001  --dim  100  --reg 0.01 --timeout 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200\n  cmd = os.path.join(RootDir, 'nomad/nomad_double')\n  input = os.path.join('/net/faraday/workspace/ddn/inputs/nomad', c['name'])\n  iterations = int(math.ceil(c['time'] / c['secondsPerIteration']['nomad']))\n  timeouts = [str(c['secondsPerIteration']['nomad'] * x) for x in range(1, iterations + 1)]\n  opts = ['--dim 100', '--timeout', ' '.join(timeouts)]\n  nparams = ['--reg', '--lrate', '--drate']\n  cparams = ['lambda', 'learningRate', 'decayRate']\n  params = [(n, c[p]) for (n,p) in zip(nparams, cparams)]\n\n  s = [RunPy, '--no-default-thread', '--append-arguments', '-x Threads::--nthreads::20,40', '--', 'mpirun', cmd, '--path', input]\n  s += [v for s in params for v in s]\n  s += opts\n  subprocess.call(' '.join(s), 
shell=True)\n\ndef main():\n  sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)\n  for c in CommandTable:\n    #for t in [galoisCommand, graphlabCommand, nomadCommand]:\n    for t in [galoisCommand]:\n      t(c)\n\n\nif __name__ == '__main__':\n  main()\n"
  },
  {
    "path": "lonestar/analytics/cpu/pagerank/CMakeLists.txt",
    "content": "add_executable(pagerank-pull-cpu PageRank-pull.cpp)\nadd_dependencies(apps pagerank-pull-cpu)\ntarget_link_libraries(pagerank-pull-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS pagerank-pull-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_scale(small pagerank-pull-cpu -transposedGraph -tolerance=0.01 \"${BASEINPUT}/scalefree/transpose/rmat10.tgr\")\nadd_test_scale(small-topo pagerank-pull-cpu -transposedGraph -tolerance=0.01 -algo=Topo \"${BASEINPUT}/scalefree/transpose/rmat10.tgr\")\n\nadd_executable(pagerank-push-cpu PageRank-push.cpp)\nadd_dependencies(apps pagerank-push-cpu)\ntarget_link_libraries(pagerank-push-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS pagerank-push-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_scale(small pagerank-push-cpu -tolerance=0.01 \"${BASEINPUT}/scalefree/transpose/rmat10.tgr\")\nadd_test_scale(small-sync pagerank-push-cpu -tolerance=0.01 -algo=Sync \"${BASEINPUT}/scalefree/transpose/rmat10.tgr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/pagerank/PageRank-constants.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef LONESTAR_PAGERANK_CONSTANTS_H\n#define LONESTAR_PAGERANK_CONSTANTS_H\n\n#include <iostream>\n\n#define DEBUG 0\n\nstatic const char* name = \"Page Rank\";\nstatic const char* url  = nullptr;\n\n//! All PageRank algorithm variants use the same constants for ease of\n//! 
comparison.\nconstexpr static const float ALPHA         = 0.85;\nconstexpr static const float INIT_RESIDUAL = 1 - ALPHA;\n\nconstexpr static const float TOLERANCE   = 1.0e-3;\nconstexpr static const unsigned MAX_ITER = 1000;\n\nconstexpr static const unsigned PRINT_TOP = 20;\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<float> tolerance(\"tolerance\", cll::desc(\"tolerance\"),\n                                 cll::init(TOLERANCE));\nstatic cll::opt<unsigned int> maxIterations(\n    \"maxIterations\",\n    cll::desc(\"Maximum iterations, applies round-based versions only\"),\n    cll::init(MAX_ITER));\n\n//! Type definitions.\ntypedef float PRTy;\n\ntemplate <typename GNode>\nstruct TopPair {\n  float value;\n  GNode id;\n\n  TopPair(float v, GNode i) : value(v), id(i) {}\n\n  bool operator<(const TopPair& b) const {\n    if (value == b.value)\n      return id > b.id;\n    return value < b.value;\n  }\n};\n\n//! 
Helper functions.\n\nPRTy atomicAdd(std::atomic<PRTy>& v, PRTy delta) {\n  PRTy old;\n  do {\n    old = v;\n  } while (!v.compare_exchange_strong(old, old + delta));\n  return old;\n}\n\ntemplate <typename Graph>\nvoid printTop(Graph& graph, unsigned topn = PRINT_TOP) {\n\n  using GNode = typename Graph::GraphNode;\n  typedef TopPair<GNode> Pair;\n  typedef std::map<Pair, GNode> TopMap;\n\n  TopMap top;\n\n  for (auto ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n    GNode src  = *ii;\n    auto& n    = graph.getData(src);\n    PRTy value = n.value;\n    Pair key(value, src);\n\n    if (top.size() < topn) {\n      top.insert(std::make_pair(key, src));\n      continue;\n    }\n\n    if (top.begin()->first < key) {\n      top.erase(top.begin());\n      top.insert(std::make_pair(key, src));\n    }\n  }\n\n  int rank = 1;\n  std::cout << \"Rank PageRank Id\\n\";\n  for (auto ii = top.rbegin(), ei = top.rend(); ii != ei; ++ii, ++rank) {\n    std::cout << rank << \": \" << ii->first.value << \" \" << ii->first.id << \"\\n\";\n  }\n}\n\n#if DEBUG\ntemplate <typename Graph>\nvoid printPageRank(Graph& graph) {\n  std::cout << \"Id\\tPageRank\\n\";\n  int counter = 0;\n  for (auto ii = graph.begin(), ei = graph.end(); ii != ei; ii++) {\n    auto& sd = graph.getData(*ii);\n    std::cout << counter << \" \" << sd.value << \"\\n\";\n    counter++;\n  }\n}\n#endif\n\n#endif\n"
  },
  {
    "path": "lonestar/analytics/cpu/pagerank/PageRank-pull.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Lonestar/BoilerPlate.h\"\n#include \"PageRank-constants.h\"\n#include \"galois/Galois.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"galois/gstl.h\"\n\nconst char* desc =\n    \"Computes page ranks a la Page and Brin. This is a pull-style algorithm.\";\n\nenum Algo { Topo = 0, Residual };\n\nstatic cll::opt<Algo> algo(\"algo\", cll::desc(\"Choose an algorithm:\"),\n                           cll::values(clEnumVal(Topo, \"Topological\"),\n                                       clEnumVal(Residual, \"Residual\")),\n                           cll::init(Residual));\n\n//! Flag that forces user to be aware that they should be passing in a\n//! 
transposed graph.\nstatic cll::opt<bool>\n    transposedGraph(\"transposedGraph\",\n                    cll::desc(\"Specify that the input graph is transposed\"),\n                    cll::init(false));\n\nconstexpr static const unsigned CHUNK_SIZE = 32;\n\nstruct LNode {\n  PRTy value;\n  uint32_t nout;\n};\n\ntypedef galois::graphs::LC_CSR_Graph<LNode, void>::with_no_lockable<\n    true>::type ::with_numa_alloc<true>::type Graph;\ntypedef typename Graph::GraphNode GNode;\n\nusing DeltaArray    = galois::LargeArray<PRTy>;\nusing ResidualArray = galois::LargeArray<PRTy>;\n\n//! Initialize nodes for the topological algorithm.\nvoid initNodeDataTopological(Graph& g) {\n  PRTy init_value = 1.0f / g.size();\n  galois::do_all(\n      galois::iterate(g),\n      [&](const GNode& n) {\n        auto& sdata = g.getData(n, galois::MethodFlag::UNPROTECTED);\n        sdata.value = init_value;\n        sdata.nout  = 0;\n      },\n      galois::no_stats(), galois::loopname(\"initNodeData\"));\n}\n\n//! Initialize nodes for the residual algorithm.\nvoid initNodeDataResidual(Graph& g, DeltaArray& delta,\n                          ResidualArray& residual) {\n  galois::do_all(\n      galois::iterate(g),\n      [&](const GNode& n) {\n        auto& sdata = g.getData(n, galois::MethodFlag::UNPROTECTED);\n        sdata.value = 0;\n        sdata.nout  = 0;\n        delta[n]    = 0;\n        residual[n] = INIT_RESIDUAL;\n      },\n      galois::no_stats(), galois::loopname(\"initNodeData\"));\n}\n\n//! Computing outdegrees in the tranpose graph is equivalent to computing the\n//! 
indegrees in the original graph.\nvoid computeOutDeg(Graph& graph) {\n  galois::StatTimer outDegreeTimer(\"computeOutDegFunc\");\n  outDegreeTimer.start();\n\n  galois::LargeArray<std::atomic<size_t>> vec;\n  vec.allocateInterleaved(graph.size());\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](const GNode& src) { vec.constructAt(src, 0ul); }, galois::no_stats(),\n      galois::loopname(\"InitDegVec\"));\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](const GNode& src) {\n        for (auto nbr : graph.edges(src)) {\n          GNode dst = graph.getEdgeDst(nbr);\n          vec[dst].fetch_add(1ul);\n        };\n      },\n      galois::steal(), galois::chunk_size<CHUNK_SIZE>(), galois::no_stats(),\n      galois::loopname(\"computeOutDeg\"));\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](const GNode& src) {\n        auto& srcData = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n        srcData.nout  = vec[src];\n      },\n      galois::no_stats(), galois::loopname(\"CopyDeg\"));\n\n  outDegreeTimer.stop();\n}\n\n/**\n * It does not calculate the pagerank for each iteration,\n * but only calculate the residual to be added from the previous pagerank to\n * the current one.\n * If the residual is smaller than the tolerance, that is not reflected to\n * the next pagerank.\n */\n//! [scalarreduction]\nvoid computePRResidual(Graph& graph, DeltaArray& delta,\n                       ResidualArray& residual) {\n  unsigned int iterations = 0;\n  galois::GAccumulator<unsigned int> accum;\n\n  while (true) {\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          auto& sdata = graph.getData(src);\n          delta[src]  = 0;\n\n          //! Only the residual higher than tolerance will be reflected\n          //! 
to the pagerank.\n          if (residual[src] > tolerance) {\n            PRTy oldResidual = residual[src];\n            residual[src]    = 0.0;\n            sdata.value += oldResidual;\n            if (sdata.nout > 0) {\n              delta[src] = oldResidual * ALPHA / sdata.nout;\n              accum += 1;\n            }\n          }\n        },\n        galois::no_stats(), galois::loopname(\"PageRank_delta\"));\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          float sum = 0;\n          for (auto nbr : graph.edges(src)) {\n            GNode dst = graph.getEdgeDst(nbr);\n            if (delta[dst] > 0) {\n              sum += delta[dst];\n            }\n          }\n          if (sum > 0) {\n            residual[src] = sum;\n          }\n        },\n        galois::steal(), galois::chunk_size<CHUNK_SIZE>(), galois::no_stats(),\n        galois::loopname(\"PageRank\"));\n\n#if DEBUG\n    std::cout << \"iteration: \" << iterations << \"\\n\";\n#endif\n    iterations++;\n    if (iterations >= maxIterations || !accum.reduce()) {\n      break;\n    }\n    accum.reset();\n  } ///< End while(true).\n    //! 
[scalarreduction]\n\n  if (iterations >= maxIterations) {\n    std::cerr << \"ERROR: failed to converge in \" << iterations\n              << \" iterations\\n\";\n  }\n}\n\n/**\n * PageRank pull topological.\n * Always calculate the new pagerank for each iteration.\n */\nvoid computePRTopological(Graph& graph) {\n  unsigned int iteration = 0;\n  galois::GAccumulator<float> accum;\n\n  float base_score = (1.0f - ALPHA) / graph.size();\n  while (true) {\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          constexpr const galois::MethodFlag flag =\n              galois::MethodFlag::UNPROTECTED;\n\n          LNode& sdata = graph.getData(src, flag);\n          float sum    = 0.0;\n\n          for (auto jj = graph.edge_begin(src, flag),\n                    ej = graph.edge_end(src, flag);\n               jj != ej; ++jj) {\n            GNode dst = graph.getEdgeDst(jj);\n\n            LNode& ddata = graph.getData(dst, flag);\n            sum += ddata.value / ddata.nout;\n          }\n\n          //! New value of pagerank after computing contributions from\n          //! incoming edges in the original graph.\n          float value = sum * ALPHA + base_score;\n          //! Find the delta in new and old pagerank values.\n          float diff = std::fabs(value - sdata.value);\n\n          //! Do not update pagerank before the diff is computed since\n          //! 
there is a data dependence on the pagerank value.\n          sdata.value = value;\n          accum += diff;\n        },\n        galois::no_stats(), galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n        galois::loopname(\"PageRank\"));\n\n#if DEBUG\n    std::cout << \"iteration: \" << iteration << \" max delta: \" << delta << \"\\n\";\n#endif\n\n    iteration += 1;\n    if (accum.reduce() <= tolerance || iteration >= maxIterations) {\n      break;\n    }\n    accum.reset();\n\n  } ///< End while(true).\n\n  galois::runtime::reportStat_Single(\"PageRank\", \"Rounds\", iteration);\n  if (iteration >= maxIterations) {\n    std::cerr << \"ERROR: failed to converge in \" << iteration\n              << \" iterations\\n\";\n  }\n}\n\nvoid prTopological(Graph& graph) {\n  initNodeDataTopological(graph);\n  computeOutDeg(graph);\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  computePRTopological(graph);\n  execTime.stop();\n}\n\nvoid prResidual(Graph& graph) {\n  DeltaArray delta;\n  delta.allocateInterleaved(graph.size());\n  ResidualArray residual;\n  residual.allocateInterleaved(graph.size());\n\n  initNodeDataResidual(graph, delta, residual);\n  computeOutDeg(graph);\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  computePRResidual(graph, delta, residual);\n  execTime.stop();\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  if (!transposedGraph) {\n    GALOIS_DIE(\"This application requires a transposed graph input;\"\n               \" please use the -transposedGraph flag \"\n               \" to indicate the input is a transposed graph.\");\n  }\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  Graph transposeGraph;\n  std::cout << \"WARNING: pull style algorithms work on the transpose of the \"\n               \"actual graph\\n\"\n            << \"WARNING: this program assumes that \" << inputFile\n            
<< \" contains transposed representation\\n\\n\"\n            << \"Reading graph: \" << inputFile << \"\\n\";\n\n  galois::graphs::readGraph(transposeGraph, inputFile);\n  std::cout << \"Read \" << transposeGraph.size() << \" nodes, \"\n            << transposeGraph.sizeEdges() << \" edges\\n\";\n\n  galois::preAlloc(2 * numThreads + (3 * transposeGraph.size() *\n                                     sizeof(typename Graph::node_data_type)) /\n                                        galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  switch (algo) {\n  case Topo:\n    std::cout << \"Running Pull Topological version, tolerance:\" << tolerance\n              << \", maxIterations:\" << maxIterations << \"\\n\";\n    prTopological(transposeGraph);\n    break;\n  case Residual:\n    std::cout << \"Running Pull Residual version, tolerance:\" << tolerance\n              << \", maxIterations:\" << maxIterations << \"\\n\";\n    prResidual(transposeGraph);\n    break;\n  default:\n    std::abort();\n  }\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  //! Sanity checking code.\n  galois::GReduceMax<PRTy> maxRank;\n  galois::GReduceMin<PRTy> minRank;\n  galois::GAccumulator<PRTy> distanceSum;\n  maxRank.reset();\n  minRank.reset();\n  distanceSum.reset();\n\n  //! [example of no_stats]\n  galois::do_all(\n      galois::iterate(transposeGraph),\n      [&](uint64_t i) {\n        PRTy rank = transposeGraph.getData(i).value;\n\n        maxRank.update(rank);\n        minRank.update(rank);\n        distanceSum += rank;\n      },\n      galois::loopname(\"Sanity check\"), galois::no_stats());\n  //! 
[example of no_stats]\n\n  PRTy rMaxRank = maxRank.reduce();\n  PRTy rMinRank = minRank.reduce();\n  PRTy rSum     = distanceSum.reduce();\n  galois::gInfo(\"Max rank is \", rMaxRank);\n  galois::gInfo(\"Min rank is \", rMinRank);\n  galois::gInfo(\"Sum is \", rSum);\n\n  if (!skipVerify) {\n    printTop(transposeGraph);\n  }\n\n#if DEBUG\n  printPageRank(transposeGraph);\n#endif\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/pagerank/PageRank-push.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Lonestar/BoilerPlate.h\"\n#include \"PageRank-constants.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n\n/**\n * These implementations are based on the Push-based PageRank computation\n * (Algorithm 4) as described in the PageRank Europar 2015 paper.\n *\n * WHANG, Joyce Jiyoung, et al. Scalable data-driven pagerank: Algorithms,\n * system issues, and lessons learned. In: European Conference on Parallel\n * Processing. Springer, Berlin, Heidelberg, 2015. p. 438-450.\n */\n\nconst char* desc =\n    \"Computes page ranks a la Page and Brin. 
This is a push-style algorithm.\";\n\nconstexpr static const unsigned CHUNK_SIZE = 16;\n\nenum Algo { Async, Sync }; ///< Async has better asbolute performance.\n\nstatic cll::opt<Algo> algo(\"algo\", cll::desc(\"Choose an algorithm:\"),\n                           cll::values(clEnumVal(Async, \"Async\"),\n                                       clEnumVal(Sync, \"Sync\")),\n                           cll::init(Async));\n\nstruct LNode {\n  PRTy value;\n  std::atomic<PRTy> residual;\n\n  void init() {\n    value    = 0.0;\n    residual = INIT_RESIDUAL;\n  }\n\n  friend std::ostream& operator<<(std::ostream& os, const LNode& n) {\n    os << \"{PR \" << n.value << \", residual \" << n.residual << \"}\";\n    return os;\n  }\n};\n\ntypedef galois::graphs::LC_CSR_Graph<LNode, void>::with_numa_alloc<\n    true>::type ::with_no_lockable<true>::type Graph;\ntypedef typename Graph::GraphNode GNode;\n\nvoid asyncPageRank(Graph& graph) {\n  typedef galois::worklists::PerSocketChunkFIFO<CHUNK_SIZE> WL;\n  galois::for_each(\n      galois::iterate(graph),\n      [&](GNode src, auto& ctx) {\n        LNode& sdata = graph.getData(src);\n        constexpr const galois::MethodFlag flag =\n            galois::MethodFlag::UNPROTECTED;\n\n        if (sdata.residual > tolerance) {\n          PRTy oldResidual = sdata.residual.exchange(0.0);\n          sdata.value += oldResidual;\n          int src_nout = std::distance(graph.edge_begin(src, flag),\n                                       graph.edge_end(src, flag));\n          if (src_nout > 0) {\n            PRTy delta = oldResidual * ALPHA / src_nout;\n            //! 
For each out-going neighbors.\n            for (auto jj : graph.edges(src, flag)) {\n              GNode dst    = graph.getEdgeDst(jj);\n              LNode& ddata = graph.getData(dst, flag);\n              if (delta > 0) {\n                auto old = atomicAdd(ddata.residual, delta);\n                if ((old < tolerance) && (old + delta >= tolerance)) {\n                  ctx.push(dst);\n                }\n              }\n            }\n          }\n        }\n      },\n      galois::loopname(\"PushResidualAsync\"),\n      galois::disable_conflict_detection(), galois::no_stats(),\n      galois::wl<WL>());\n}\n\nvoid syncPageRank(Graph& graph) {\n  struct Update {\n    PRTy delta;\n    Graph::edge_iterator beg;\n    Graph::edge_iterator end;\n  };\n\n  constexpr ptrdiff_t EDGE_TILE_SIZE = 128;\n\n  galois::InsertBag<Update> updates;\n  galois::InsertBag<GNode> activeNodes;\n\n  galois::do_all(\n      galois::iterate(graph), [&](const GNode& src) { activeNodes.push(src); },\n      galois::no_stats());\n\n  size_t iter = 0;\n  for (; !activeNodes.empty() && iter < maxIterations; ++iter) {\n    galois::do_all(\n        galois::iterate(activeNodes),\n        [&](const GNode& src) {\n          constexpr const galois::MethodFlag flag =\n              galois::MethodFlag::UNPROTECTED;\n          LNode& sdata = graph.getData(src, flag);\n\n          if (sdata.residual > tolerance) {\n            PRTy oldResidual = sdata.residual;\n            sdata.value += oldResidual;\n            sdata.residual = 0.0;\n\n            int src_nout = std::distance(graph.edge_begin(src, flag),\n                                         graph.edge_end(src, flag));\n            PRTy delta   = oldResidual * ALPHA / src_nout;\n\n            auto beg       = graph.edge_begin(src, flag);\n            const auto end = graph.edge_end(src, flag);\n\n            assert(beg <= end);\n\n            //! 
Edge tiling for large outdegree nodes.\n            if ((end - beg) > EDGE_TILE_SIZE) {\n              for (; beg + EDGE_TILE_SIZE < end;) {\n                auto ne = beg + EDGE_TILE_SIZE;\n                updates.push(Update{delta, beg, ne});\n                beg = ne;\n              }\n            }\n\n            if ((end - beg) > 0) {\n              updates.push(Update{delta, beg, end});\n            }\n          }\n        },\n        galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n        galois::loopname(\"CreateEdgeTiles\"), galois::no_stats());\n\n    activeNodes.clear();\n\n    galois::do_all(\n        galois::iterate(updates),\n        [&](const Update& up) {\n          constexpr const galois::MethodFlag flag =\n              galois::MethodFlag::UNPROTECTED;\n          //! For each out-going neighbors.\n          for (auto jj = up.beg; jj != up.end; ++jj) {\n            GNode dst    = graph.getEdgeDst(jj);\n            LNode& ddata = graph.getData(dst, flag);\n            auto old     = atomicAdd(ddata.residual, up.delta);\n            //! If fabs(old) is greater than tolerance, then it would\n            //! already have been processed in the previous do_all\n            //! 
loop.\n            if ((old <= tolerance) && (old + up.delta >= tolerance)) {\n              activeNodes.push(dst);\n            }\n          }\n        },\n        galois::steal(), galois::chunk_size<CHUNK_SIZE>(),\n        galois::loopname(\"PushResidualSync\"), galois::no_stats());\n\n    updates.clear();\n  }\n\n  if (iter >= maxIterations) {\n    std::cerr << \"ERROR: failed to converge in \" << iter << \" iterations\\n\";\n  }\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  Graph graph;\n  galois::graphs::readGraph(graph, inputFile);\n  std::cout << \"Read \" << graph.size() << \" nodes, \" << graph.sizeEdges()\n            << \" edges\\n\";\n\n  galois::preAlloc(5 * numThreads +\n                   (5 * graph.size() * sizeof(typename Graph::node_data_type)) /\n                       galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  std::cout << \"tolerance:\" << tolerance << \", maxIterations:\" << maxIterations\n            << \"\\n\";\n\n  galois::do_all(\n      galois::iterate(graph), [&graph](GNode n) { graph.getData(n).init(); },\n      galois::no_stats(), galois::loopname(\"Initialize\"));\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n\n  switch (algo) {\n  case Async:\n    std::cout << \"Running Edge Async push version,\";\n    asyncPageRank(graph);\n    break;\n\n  case Sync:\n    std::cout << \"Running Edge Sync push version,\";\n    syncPageRank(graph);\n    break;\n\n  default:\n    std::abort();\n  }\n\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  if (!skipVerify) {\n    printTop(graph);\n  }\n\n#if DEBUG\n  printPageRank(graph);\n#endif\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/pagerank/README.md",
    "content": "Pagerank\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nWe implement both pull and push-style PageRank algorithms. The push-style\nalgorithms are based on the computations (Algorithm 4) described in the \nPageRank Europar 2015 paper.\n\nWhang et al. Scalable Data-driven PageRank: Algorithms, System Issues, and \nLessons Learned. Europar 2015.\n\nThere are two variants, topological and residual, of the pull-style algorithm \nthat are implemented. The pull variants perform better than the push variants \nsince there are no atomic operations. The residual version performs and scales \nthe best. It does less work and uses separate arrays for storing delta and \nresidual information to improve locality and use of memory bandwidth.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThe push variant takes in Galois .gr format.\nThe pull variant takes in transposed Galois .gr graphs.\nYou must specify the -transposedGraph flag when running the pull variant.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run `cmake` at the BUILD directory (refer to top-level README for instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/cpu/pagerank; make -j` \n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few examples of invoking PageRank.\n\n* `$ ./pagerank-pull-cpu <path-transpose-graph> -tolerance=0.001 -transposedGraph`\n\n* `$ ./pagerank-pull-cpu <path-transpose-graph> -t=20 -tolerance=0.001 -algo=Residual -transposedGraph`\n\n* `$ ./pagerank-push-cpu <path-graph> -t=40 -tolerance=0.001 -algo=Async`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\nThe performance of the push and the pull versions depends on an optimal choice \nof the compile-time constant, CHUNK_SIZE. For the pull version, CHUNK_SIZE \ndenotes the granularity of stolen work when work stealing is enabled (via \ngalois::steal()). The optimal value of the constant might depend on the \narchitecture, so you might want to evaluate the performance over a range of \nvalues (say [16-4096]).\n"
  },
  {
    "path": "lonestar/analytics/cpu/pointstoanalysis/CMakeLists.txt",
    "content": "add_executable(pointstoanalysis-cpu PointsTo.cpp)\nadd_dependencies(apps pointstoanalysis-cpu)\ntarget_link_libraries(pointstoanalysis-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS pointstoanalysis-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_scale(small pointstoanalysis-cpu \"${BASEINPUT}/java/pta/gap_constraints.txt\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/pointstoanalysis/PointsTo.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"llvm/Support/CommandLine.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include <iostream>\n#include <fstream>\n#include <deque>\n#include \"SparseBitVector.h\"\n\n////////////////////////////////////////////////////////////////////////////////\n// Command line parameters\n////////////////////////////////////////////////////////////////////////////////\n\nnamespace cll = llvm::cl;\n\nconst char* name = \"Points-to Analysis\";\nconst char* desc = \"Performs inclusion-based points-to analysis over the input \"\n                   \"constraints.\";\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<bool>\n    useSerial(\"serial\",\n              cll::desc(\"Runs serial version of the algorithm \"\n              
          \"(i.e. 1 thread, no galois::for_each) \"\n                        \"(default false)\"),\n              cll::init(false));\n\nstatic cll::opt<bool>\n    printAnswer(\"printAnswer\",\n                cll::desc(\"If set, prints all points to facts \"\n                          \"at the end\"),\n                cll::init(false));\n\nstatic cll::opt<bool>\n    useCycleDetection(\"ocd\",\n                      cll::desc(\"If set, online cycle detection is\"\n                                \" used in algorithm (serial only) \"\n                                \"(default false)\"),\n                      cll::init(false));\n\nstatic cll::opt<unsigned>\n    THRESHOLD_LS(\"lsThreshold\",\n                 cll::desc(\"Determines how many constraints to \"\n                           \"process before load/store constraints \"\n                           \"are reprocessed (serial only) \"\n                           \"(default 500000)\"),\n                 cll::init(500000));\n\n////////////////////////////////////////////////////////////////////////////////\n// Declaration of strutures, types, and variables\n////////////////////////////////////////////////////////////////////////////////\n\n/**\n * Class representing a points-to constraint.\n */\nclass PtsToCons {\npublic:\n  using ConstraintType = enum { AddressOf = 0, Copy, Load, Store, GEP };\n\nprivate:\n  unsigned src;\n  unsigned dst;\n  ConstraintType type;\n\npublic:\n  PtsToCons(ConstraintType tt, unsigned ss, unsigned dd) {\n    src  = ss;\n    dst  = dd;\n    type = tt;\n  }\n\n  /**\n   * @returns This constraint's src and dst node\n   */\n  std::pair<unsigned, unsigned> getSrcDst() const {\n    return std::pair<unsigned, unsigned>(src, dst);\n  }\n\n  /**\n   * @returns The type of this constraint\n   */\n  ConstraintType getType() const { return type; }\n\n  /**\n   * Print out this constraint to stderr\n   */\n  void print() const {\n    if (type == Store) {\n      std::cerr << \"*\";\n    }\n\n    
std::cerr << \"v\" << dst;\n    std::cerr << \" = \";\n\n    if (type == Load) {\n      std::cerr << \"*\";\n    } else if (type == AddressOf) {\n      std::cerr << \"&\";\n    }\n\n    std::cerr << \"v\" << src;\n\n    std::cerr << \"\\n\";\n  }\n};\n\n/**\n * Points to analysis runner base class. Does not have a run method itself.\n *\n * @tparam IsConcurrent if set to true, the data structures used for points\n * to results and outgoing edges will be thread safe\n */\ntemplate <bool IsConcurrent>\nclass PTABase {\n  // sparse bit vector is concurrent or serial based on template parameter\n  using SparseBitVector = galois::SparseBitVector<IsConcurrent>;\n\n  using PointsToConstraints = std::vector<PtsToCons>;\n  using PointsToInfo        = std::vector<SparseBitVector>;\n  using EdgeVector          = std::vector<SparseBitVector>;\n\n  using NodeAllocator =\n      galois::FixedSizeAllocator<typename SparseBitVector::Node>;\n\nprotected:\n  PointsToInfo pointsToResult; // pointsTo results for nodes\n  EdgeVector outgoingEdges;    // holds outgoing edges of a node\n\n  PointsToConstraints addressCopyConstraints;\n  PointsToConstraints loadStoreConstraints;\n\n  size_t numNodes = 0;\n\n  ////////////////////////////////////////////////////////////////////////////////\n  /**\n   * Online Cycle Detection and elimination structure + functions.\n   */\n  struct OnlineCycleDetection {\n  private:\n    PTABase<IsConcurrent>&\n        outerPTA; // reference to outer PTA instance to get runtime info\n\n    galois::gstl::Vector<unsigned> ancestors; // TODO find better representation\n    galois::gstl::Vector<bool> visited;       // TODO use better representation\n    galois::gstl::Vector<unsigned> representative;\n\n    unsigned NoRepresentative; // \"constant\" that represents no representative\n\n    /**\n     * @returns true of the nodeid is an ancestor node\n     * FIXME find better way to do this instead of linear scan\n     */\n    bool isAncestor(unsigned nodeid) {\n    
  for (unsigned ii : ancestors) {\n        if (ii == nodeid) {\n          return true;\n        }\n      }\n      return false;\n    }\n\n    /**\n     * Depth first recursion of the nodeid to see if it eventually\n     * reaches an ancestor, in which case there is a cycle. The cycle is\n     * then collapsed, i.e. all nodes in the cycle have their representative\n     * changed to the representative of the node where the cycle starts.\n     *\n     * Note it is okay not to detect all cycles as it is only an efficiency\n     * concern.\n     *\n     * @param nodeid nodeid to check the existance of a cycle\n     * @param cyclenode This is used as OUTPUT parameter; if a node is\n     * detected to be an ancestor, it is returned via this variable.\n     * @returns true if a cycle has been detected (i.e. a node has a path\n     * to an ancestor), false otherwise\n     */\n    bool cycleDetect(unsigned nodeID, unsigned& cycleNode) {\n      unsigned nodeRep = getFinalRepresentative(nodeID);\n\n      // if the node is an ancestor, that means there's a path from the ancestor\n      // to the ancestor (i.e. 
cycle)\n      if (isAncestor(nodeRep)) {\n        cycleNode = nodeRep;\n        return true;\n      }\n\n      if (visited[nodeRep]) {\n        return false;\n      }\n\n      visited[nodeRep] = true;\n\n      // keep track of the current depth first search path\n      ancestors.push_back(nodeRep);\n\n      // don't use an iterator here because outgoing edges might get updated\n      // during this loop\n      std::vector<unsigned> repOutgoingEdges =\n          outerPTA.outgoingEdges[nodeRep].getAllSetBits();\n\n      for (auto dst : repOutgoingEdges) {\n        // recursive depth first cycle detection; if a cycle is found,\n        // collapse the path\n        if (cycleDetect(dst, cycleNode)) {\n          cycleCollapse(cycleNode);\n        }\n      }\n      ancestors.pop_back();\n\n      return false;\n    }\n\n    /**\n     * Make all nodes that are a part of some detected cycle starting at\n     * repr have their representatives changed to the representative of\n     * repr (thereby \"collapsing\" the cycle).\n     *\n     * @param repr The node at which the cycle begins\n     */\n    void cycleCollapse(unsigned repr) {\n      // assert(repr is present in ancestors).\n      unsigned repToChangeTo = getFinalRepresentative(repr);\n\n      for (auto ii = ancestors.begin(); ii != ancestors.end(); ++ii) {\n        if (*ii == repr) {\n          galois::gDebug(\"collapsing cycle for \", repr);\n          // cycle exists between nodes ancestors[*ii..end].\n          for (auto jj = ii; jj != ancestors.end(); ++jj) {\n            // jjRepr has no representative.\n            unsigned jjRepr = getFinalRepresentative(*jj);\n            makeRepr(jjRepr, repToChangeTo);\n          }\n\n          break;\n        }\n      }\n    }\n\n    /**\n     * Make repr the representative of nodeID.\n     *\n     * @param nodeID node to change the representative of\n     * @param repr nodeID will have its representative changed to this\n     */\n    void makeRepr(unsigned nodeID, 
unsigned repr) {\n      if (repr != nodeID) {\n        galois::gDebug(\"change repr[\", nodeID, \"] = \", repr);\n\n        representative[nodeID] = repr;\n\n        // the representative needs to have all of the items that the nodes\n        // it is representing has, so if the node has more than the rep,\n        // unify\n        if (!outerPTA.pointsToResult[nodeID].isSubsetEq(\n                outerPTA.pointsToResult[repr])) {\n          outerPTA.pointsToResult[repr].unify(outerPTA.pointsToResult[nodeID]);\n        }\n\n        // unify edges as well if necessary since rep represents it now\n        if (!outerPTA.outgoingEdges[nodeID].isSubsetEq(\n                outerPTA.outgoingEdges[repr])) {\n          outerPTA.outgoingEdges[repr].unify(outerPTA.outgoingEdges[nodeID]);\n        }\n      }\n    }\n\n  public:\n    OnlineCycleDetection(PTABase<IsConcurrent>& o) : outerPTA(o) {}\n\n    /**\n     * Init fields (outerPTA needs to have numNodes set).\n     */\n    void init() {\n      NoRepresentative = outerPTA.numNodes;\n      visited.resize(outerPTA.numNodes);\n      representative.resize(outerPTA.numNodes);\n\n      for (unsigned ii = 0; ii < outerPTA.numNodes; ++ii) {\n        representative[ii] = NoRepresentative;\n      }\n    }\n\n    /**\n     * Given a node id, find its representative. 
Also, do path compression\n     * of the path to the representative.\n     *\n     * @param nodeid Node id to get the representative of\n     * @returns The representative of nodeid\n     */\n    unsigned getFinalRepresentative(unsigned nodeid) {\n      unsigned finalRep = nodeid;\n\n      // follow chain of representatives until a \"root\" is reached\n      while (representative[finalRep] != NoRepresentative) {\n        finalRep = representative[finalRep];\n      }\n\n      // path compression; make all things along path to final representative\n      // point to the final representative\n      unsigned curRep = representative[nodeid];\n\n      while (curRep != NoRepresentative) {\n        representative[nodeid] = finalRep;\n        nodeid                 = curRep;\n        curRep                 = representative[nodeid];\n      }\n\n      return finalRep;\n    }\n\n    /**\n     * Go over all sources of new edges to see if there are cycles in them.\n     * If so, collapse the cycles.\n     *\n     * @param updates vector of nodes that are sources of new edges.\n     */\n    template <typename VecType>\n    void process(VecType& updates) {\n      if (!useCycleDetection) {\n        return;\n      }\n\n      // TODO this can probably be made more efficient (fill?)\n      for (unsigned ii = 0; ii < outerPTA.numNodes; ++ii) {\n        visited[ii] = false;\n      }\n\n      unsigned cycleNode = NoRepresentative; // set to invalid id.\n\n      for (unsigned update : updates) {\n        galois::gDebug(\"cycle process \", update);\n\n        if (cycleDetect(update, cycleNode)) {\n          cycleCollapse(cycleNode);\n        }\n      }\n    }\n  }; // end struct OnlineCycleDetection\n  ////////////////////////////////////////////////////////////////////////////////\n\n  OnlineCycleDetection ocd; // cycle detector/squasher; only works with serial\n\n  /**\n   * Adds edges to the graph based on load/store constraints.\n   *\n   * A load from src -> dst means anything that 
src points to must also\n   * point to dst.\n   *\n   * A store from src -> dst means src must point to anything\n   * that dst points to.\n   *\n   * Any updated nodes are returned in the updates vector.\n   *\n   * @tparam LoopInvoker Functor that will run the loop\n   * @tparam VecType object that supports a push_back function that represents\n   * nodes to be worked on\n   *\n   * @param constraints Load/store constraints to use to add edges\n   * @param updates output variable that will have updated nodes added to it\n   */\n  template <typename LoopInvoker, typename VecType>\n  void processLoadStore(const PointsToConstraints& constraints,\n                        VecType& updates) {\n\n    LoopInvoker()(galois::iterate(constraints), [&](auto constraint) {\n      unsigned src;\n      unsigned dst;\n      std::tie(src, dst) = constraint.getSrcDst();\n\n      unsigned srcRepr = ocd.getFinalRepresentative(src);\n      unsigned dstRepr = ocd.getFinalRepresentative(dst);\n\n      if (constraint.getType() == PtsToCons::Load) {\n        for (auto pointee = pointsToResult[srcRepr].begin();\n             pointee != pointsToResult[srcRepr].end(); pointee++) {\n          unsigned pointeeRepr = ocd.getFinalRepresentative(*pointee);\n\n          // add edge from pointee to dst if it doesn't already exist\n          if (pointeeRepr != dstRepr &&\n              !outgoingEdges[pointeeRepr].test(dstRepr)) {\n            outgoingEdges[pointeeRepr].set(dstRepr);\n\n            updates.push_back(pointeeRepr);\n          }\n        }\n      } else { // store whatever src has into whatever dst points to\n        bool newEdgeAdded = false;\n\n        for (auto pointee = pointsToResult[dstRepr].begin();\n             pointee != pointsToResult[dstRepr].end(); pointee++) {\n          unsigned pointeeRepr = ocd.getFinalRepresentative(*pointee);\n\n          // add edge from src -> pointee if it doesn't exist\n          if (srcRepr != pointeeRepr &&\n              
!outgoingEdges[srcRepr].test(pointeeRepr)) {\n            outgoingEdges[srcRepr].set(pointeeRepr);\n\n            newEdgeAdded = true;\n          }\n        }\n\n        if (newEdgeAdded) {\n          updates.push_back(srcRepr);\n        }\n      }\n    });\n  }\n\n  /**\n   * Processes the AddressOf, Copy constraints.\n   *\n   * Sets the bitvector for AddressOf constraints, i.e. a set bit means\n   * that you point to whatever that bit represents.\n   *\n   * Creates edges for Copy constraints, i.e. edge from a to b indicates\n   * b is a copy of a.\n   *\n   * @tparam LoopInvoker Functor that will run the loop\n   * @tparam VecType object that supports a push_back function as well\n   * as iteration over pushed objects\n   *\n   * @param constraints vector of AddressOf and Copy constraints\n   * @returns vector of UpdatesRequests from all sources with new edges\n   * added by the Copy constraint\n   */\n  template <typename LoopInvoker, typename VecType>\n  VecType processAddressOfCopy(const PointsToConstraints& constraints) {\n    VecType updates;\n\n    LoopInvoker()(galois::iterate(constraints), [&](auto ii) {\n      unsigned src;\n      unsigned dst;\n\n      std::tie(src, dst) = ii.getSrcDst();\n\n      if (ii.getType() == PtsToCons::AddressOf) { // addressof; save point info\n        pointsToResult[dst].set(src);\n      } else if (src != dst) { // copy constraint; add an edge\n        outgoingEdges[src].set(dst);\n        updates.push_back(src);\n      }\n    });\n\n    return updates;\n  }\n\n  /**\n   * If an edge exists from src to dst, then dst is a copy of src.\n   * Propogate any points to information from source to dest.\n   *\n   * @param src Source node in graph\n   * @param dst Dest node in graph\n   * @returns non-negative value if any bitvector has changed\n   */\n  unsigned propagate(unsigned src, unsigned dst) {\n    unsigned newPtsTo = 0;\n\n    if (src != dst) {\n      unsigned srcRepr = ocd.getFinalRepresentative(src);\n      unsigned 
dstRepr = ocd.getFinalRepresentative(dst);\n\n      // if src is a not subset of dst... (i.e. src has more), then\n      // propogate src's points to info to dst\n      if (srcRepr != dstRepr &&\n          !pointsToResult[srcRepr].isSubsetEq(pointsToResult[dstRepr])) {\n        // galois::gDebug(\"unifying \", dstRepr, \" by \", srcRepr);\n        // newPtsTo is positive if changes are made\n        newPtsTo += pointsToResult[dstRepr].unify(pointsToResult[srcRepr]);\n      }\n    }\n\n    return newPtsTo;\n  }\n\npublic:\n  PTABase() : ocd(*this) {}\n\n  /**\n   * Given the number of nodes in the constraint graph, initialize the\n   * structures needed for the points-to algorithm.\n   *\n   * @param n Number of nodes in the constraint graph\n   * @param nodeAllocator galois allocator object to allocate nodes in the\n   * sparse bit vector\n   */\n  void initialize(size_t n, NodeAllocator& nodeAllocator) {\n    numNodes = n;\n\n    // initialize different constructs based on which version is being run\n    pointsToResult.resize(numNodes);\n    outgoingEdges.resize(numNodes);\n\n    // initialize vectors\n    for (unsigned i = 0; i < numNodes; i++) {\n      pointsToResult[i].init(&nodeAllocator);\n      outgoingEdges[i].init(&nodeAllocator);\n    }\n\n    ocd.init();\n  }\n\n  //! 
frees memory allocated by the node allocator\n  void freeNodeAllocatorMemory() {\n    for (unsigned i = 0; i < numNodes; i++) {\n      pointsToResult[i].freeAll();\n      outgoingEdges[i].freeAll();\n    }\n  }\n\n  /**\n   * Read a constraint file and load its contents into memory.\n   *\n   * @param file filename to read\n   * @returns number of nodes in the constraint graph\n   */\n  unsigned readConstraints(const char* file) {\n    galois::gInfo(\"GEP constraints (constraint type 4) and any constraints \"\n                  \"with offsets are ignored.\");\n\n    unsigned numNodes     = 0;\n    unsigned nconstraints = 0;\n\n    std::ifstream cfile(file);\n    std::string cstr;\n\n    getline(cfile, cstr); // # of vars.\n    sscanf(cstr.c_str(), \"%d\", &numNodes);\n\n    getline(cfile, cstr); // # of constraints.\n    sscanf(cstr.c_str(), \"%d\", &nconstraints);\n\n    addressCopyConstraints.clear();\n    loadStoreConstraints.clear();\n\n    unsigned constraintNum;\n    unsigned src;\n    unsigned dst;\n    unsigned offset;\n\n    PtsToCons::ConstraintType type;\n\n    // Create constraint objects and save them to appropriate location\n    for (unsigned ii = 0; ii < nconstraints; ++ii) {\n      getline(cfile, cstr);\n      union {\n        int as_int;\n        PtsToCons::ConstraintType as_ctype;\n      } type_converter;\n      sscanf(cstr.c_str(), \"%d,%d,%d,%d,%d\", &constraintNum, &src, &dst,\n             &type_converter.as_int, &offset);\n\n      type = type_converter.as_ctype;\n\n      PtsToCons cc(type, src, dst);\n\n      if (type == PtsToCons::AddressOf || type == PtsToCons::Copy) {\n        addressCopyConstraints.push_back(cc);\n      } else if (type == PtsToCons::Load || type == PtsToCons::Store) {\n        if (offset == 0) { // ignore load/stores with offsets\n          loadStoreConstraints.push_back(cc);\n        }\n      }\n      // ignore GEP constraints\n    }\n\n    cfile.close();\n\n    return numNodes;\n  }\n\n  
//////////////////////////////////////////////////////////////////////////////\n  // Debugging/output functions\n  //////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Prints the constraints in the passed in vector of constraints.\n   *\n   * @param constraints vector of PtsToCons\n   */\n  void printConstraints(PointsToConstraints& constraints) {\n    for (auto ii = constraints.begin(); ii != constraints.end(); ++ii) {\n      ii->print();\n    }\n  }\n\n  /**\n   * Checks to make sure that all representative point to at LEAST\n   * what the nodes that it represents are pointing to. Necessary but not\n   * sufficient check for correctness.\n   */\n  void checkReprPointsTo() {\n    for (unsigned ii = 0; ii < pointsToResult.size(); ++ii) {\n      unsigned repr = ocd.getFinalRepresentative(ii);\n      if (repr != ii && !pointsToResult[ii].isSubsetEq(pointsToResult[repr])) {\n        galois::gError(\"pointsto(\", ii,\n                       \") is not less than its \"\n                       \"representative pointsto(\",\n                       repr, \").\");\n      }\n    }\n  }\n\n  /**\n   * Makes sure that the representative of a set of nodes has all of the\n   * edges that the nodes it is representing has.\n   */\n  void checkReprEdges() {\n    for (unsigned ii = 0; ii < outgoingEdges.size(); ++ii) {\n      unsigned repr = ocd.getFinalRepresentative(ii);\n      if (repr != ii && !outgoingEdges[ii].isSubsetEq(outgoingEdges[repr])) {\n        galois::gError(\"edges(\", ii,\n                       \") is not less than its \"\n                       \"representative edges(\",\n                       repr, \").\");\n      }\n    }\n  }\n\n  /**\n   * @returns The total number of points to facts in the system.\n   */\n  unsigned countPointsToFacts() {\n    unsigned count = 0;\n\n    for (auto ii = pointsToResult.begin(); ii != pointsToResult.end(); ++ii) {\n      unsigned repr = ocd.getFinalRepresentative(ii - 
pointsToResult.begin());\n      count += pointsToResult[repr].count();\n    }\n\n    return count;\n  }\n\n  /**\n   * Prints out points to info for all verticies in the constraint graph.\n   */\n  void printPointsToInfo() {\n    std::string prefix = \"v\";\n\n    for (auto ii = pointsToResult.begin(); ii != pointsToResult.end(); ++ii) {\n      std::cerr << prefix << ii - pointsToResult.begin() << \": \";\n      unsigned repr = ocd.getFinalRepresentative(ii - pointsToResult.begin());\n      pointsToResult[repr].print(std::cerr, prefix);\n    }\n  }\n}; // end class PTA\n\n/**\n * Serial points to executor.\n */\nclass PTASerial : public PTABase<false> {\npublic:\n  /**\n   * Run points-to-analysis on a single thread.\n   */\n  void run() {\n    galois::gDebug(\n        \"no of addr+copy constraints = \", addressCopyConstraints.size(),\n        \", no of load+store constraints = \", loadStoreConstraints.size());\n    galois::gDebug(\"no of nodes = \", numNodes);\n\n    std::deque<unsigned> updates;\n    updates = processAddressOfCopy<galois::StdForEach, std::deque<unsigned>>(\n        addressCopyConstraints);\n    processLoadStore<galois::StdForEach>(loadStoreConstraints, updates);\n\n    unsigned numUps = 0;\n\n    // FIFO\n    while (!updates.empty()) {\n      unsigned src = updates.front();\n      updates.pop_front();\n\n      for (auto dst = outgoingEdges[src].begin();\n           dst != outgoingEdges[src].end(); dst++) {\n        unsigned newPtsTo = propagate(src, *dst);\n\n        if (newPtsTo) { // newPtsTo is positive if dst changed\n          updates.push_back(ocd.getFinalRepresentative(*dst));\n        }\n\n        numUps++;\n      }\n\n      if (updates.empty() || numUps >= THRESHOLD_LS) {\n        galois::gDebug(\"No of points-to facts computed = \",\n                       countPointsToFacts());\n        numUps = 0;\n\n        // After propagating all constraints, see if load/store\n        // constraints need to be added in since graph was potentially 
updated\n        processLoadStore<galois::StdForEach>(loadStoreConstraints, updates);\n\n        // do cycle squashing\n        ocd.process(updates);\n      }\n    }\n  }\n};\n\n/**\n * Concurrent points to executor.\n */\nclass PTAConcurrent : public PTABase<true> {\npublic:\n  /**\n   * Run points-to-analysis using galois::for_each as the main loop.\n   */\n  void run() {\n    galois::gDebug(\n        \"no of addr+copy constraints = \", addressCopyConstraints.size(),\n        \", no of load+store constraints = \", loadStoreConstraints.size());\n    galois::gDebug(\"no of nodes = \", numNodes);\n\n    galois::InsertBag<unsigned> updates;\n    updates = processAddressOfCopy<galois::DoAll, galois::InsertBag<unsigned>>(\n        addressCopyConstraints);\n    processLoadStore<galois::DoAll>(loadStoreConstraints, updates);\n\n    while (!updates.empty()) {\n      galois::for_each(\n          galois::iterate(updates),\n          [this](unsigned req, auto& ctx) {\n            for (auto dst = this->outgoingEdges[req].begin();\n                 dst != this->outgoingEdges[req].end(); dst++) {\n              unsigned newPtsTo = this->propagate(req, *dst);\n\n              if (newPtsTo)\n                ctx.push(this->ocd.getFinalRepresentative(*dst));\n            }\n          },\n          galois::loopname(\"PointsToMainUpdateLoop\"),\n          galois::disable_conflict_detection(),\n          galois::wl<galois::worklists::PerSocketChunkFIFO<8>>());\n\n      galois::gDebug(\"No of points-to facts computed = \", countPointsToFacts());\n\n      updates.clear();\n\n      // After propagating all constraints, see if load/store constraints need\n      // to be added in since graph was potentially updated\n      processLoadStore<galois::DoAll>(loadStoreConstraints, updates);\n\n      // do cycle squashing\n      // ocd.process(updates); // TODO have parallel OCD, if possible\n    }\n  }\n};\n\n/**\n * Method from running PTA.\n */\ntemplate <typename PTAClass, typename 
Alloc>\nvoid runPTA(PTAClass& pta, Alloc& nodeAllocator) {\n  size_t numNodes = pta.readConstraints(inputFile.c_str());\n  pta.initialize(numNodes, nodeAllocator);\n\n  galois::StatTimer execTime(\"Timer_0\");\n\n  execTime.start();\n  pta.run();\n  execTime.stop();\n\n  galois::gInfo(\"No of points-to facts computed = \", pta.countPointsToFacts());\n\n  if (!skipVerify) {\n    galois::gInfo(\"Doing verification step\");\n    pta.checkReprPointsTo();\n    pta.checkReprEdges();\n  }\n\n  if (printAnswer) {\n    pta.printPointsToInfo();\n  }\n\n  // free everything nodeallocator allocated\n  pta.freeNodeAllocatorMemory();\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, nullptr, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  // depending on serial or concurrent, create the correct class and pass it\n  // into the run harness which takes care of the rest\n  if (!useSerial) {\n    galois::gInfo(\"-------- Parallel version: \", galois::getActiveThreads(),\n                  \" threads.\");\n    galois::gInfo(\"Note correctness of this version is relative to the serial \"\n                  \"version.\");\n\n    PTAConcurrent p;\n    galois::FixedSizeAllocator<typename galois::SparseBitVector<true>::Node>\n        nodeAllocator;\n    runPTA(p, nodeAllocator);\n  } else {\n    galois::gInfo(\"-------- Sequential version.\");\n    galois::gInfo(\n        \"The load store threshold (-lsThreshold) may need tweaking for \"\n        \"best performance; its current setting may not be the best for \"\n        \"your input and may actually degrade performance.\");\n    PTASerial p;\n    galois::FixedSizeAllocator<typename galois::SparseBitVector<false>::Node>\n        nodeAllocator;\n    runPTA(p, nodeAllocator);\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/pointstoanalysis/README.md",
    "content": "Points To Analysis\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nPoints-to analysis based on Hardekopf and Lin's points-to analysis algorithm.\n\nGiven a constraint file (format detailed below), runs a graph based points-to\nanalysis algorithm to determine which nodes point to which other nodes.\nBoth a serial and a multi-threaded version exist, and the serial version\nsupports online cycle detection.\n\nPerformance is achieved by using a sparse bit vector to represent both\nedges and points-to information.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThe input is a constraint file in the following format:\n\n```\n<num vars> <num constraints>\n<constraint num> <src> <dst> <type> <offset>\n<constraint num> <src> <dst> <type> <offset>\n<constraint num> <src> <dst> <type> <offset>\n.\n.\n.\n<constraint num> <src> <dst> <type> <offset>\n<EOF>\n```\n\n`<src>` and `<dst>` are node IDs, and `<type>` specifies the relation\nbetween them. `<offset>` is not supported in the implementation: it must be\nset to 0. If it is not, the entire constraint will be ignored.\n\nThe constraint types supported are the following:\n\n0 = Address Of Constraint\n1 = Copy Constraint\n2 = Load Constraint\n3 = Store Constraint\n\nAll other constraint types will be ignored.\n\nNote that the correctness of the parallel version is relative to the serial\nversion, which may or may not match other implementations of points-to\nanalysis.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/cpu/pointstoanalysis; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nRun serial points-to analysis with the following command:\n`./pointstoanalysis-cpu <constraint file> -serial`\n\nRun serial points-to analysis with online cycle detection with the following \ncommand:\n`./pointstoanalysis-cpu <constraint file> -serial -ocd`\n\nRun serial points-to analysis that reprocesses load/store constraints after\nN constraints with the following command:\n`./pointstoanalysis-cpu <constraint file> -serial -lsThreshold=N`\n\nRun the parallel version of points-to analysis with the following command:\n`./pointstoanalysis-cpu <constraint file> -t=<num threads>`\n\nRun the parallel version of points-to analysis and print the results with\nthe following command (the serial version also supports printAnswer):\n`./pointstoanalysis-cpu <constraint file> -t=<num threads> -printAnswer`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\nOnline cycle detection in the serial version may or may not help depending on the\ninput. There are cases where it can hurt performance. The serial version also\nhas a threshold that determines how often load/store constraints are\nreprocessed. Depending on your input, you may get better performance by tuning\nthe frequency at which these constraints are reprocessed (the idea is that it\nmay eliminate redundant constraints that currently exist in the worklist).\n"
  },
  {
    "path": "lonestar/analytics/cpu/pointstoanalysis/SparseBitVector.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef _GALOIS_SPARSEBITVECTOR_\n#define _GALOIS_SPARSEBITVECTOR_\n\n#include <galois/AtomicWrapper.h>\n#include <galois/Mem.h>\n#include <utility>\n#include <boost/iterator/iterator_facade.hpp>\n\nnamespace galois {\n\n/**\n * Sparse bit vector using a linked list. Also thread safe; however, only\n * guarantee is that functions return values based on the state of the\n * vector AT THE TIME THE FUNCTION IS CALLED. (i.e. 
if concurrent update\n * happens in a function call, the update may or may not be visible).\n */\ntemplate <bool IsConcurrent>\nstruct SparseBitVector {\n  using WORD                     = unsigned int;\n  static const unsigned wordSize = sizeof(WORD) * 8;\n\n  //////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Node in sparse bit vector linked list\n   */\n  struct Node {\n    unsigned _base; // base to multiply by/used to sort linked list\n    // If concurrent, then wrap these in a copyable atomic\n    using WordType =\n        typename std::conditional<IsConcurrent, galois::CopyableAtomic<WORD>,\n                                  WORD>::type;\n    WordType _bitVector; // stores set bits for a base\n\n    using NodeType =\n        typename std::conditional<IsConcurrent, galois::CopyableAtomic<Node*>,\n                                  Node*>::type;\n    NodeType _next; // pointer to next node in linked list\n\n    /**\n     * Needs a base when being constructed.\n     *\n     * @param base\n     */\n    Node(unsigned base) {\n      _base      = base;\n      _bitVector = 0;\n      _next      = nullptr;\n    }\n\n    /**\n     * Construct with an already set\n     *\n     * @param base\n     */\n    Node(unsigned base, unsigned offset) {\n      _base = base;\n\n      // set bit at offset\n      unsigned toStore = 0;\n      toStore |= ((WORD)1 << offset);\n      _bitVector = toStore;\n\n      _next = nullptr;\n    }\n\n    /**\n     * Thread safe set. 
Uses compare and swap to atomically update the\n     * bitvector.\n     *\n     * @param offset Offset to set the bit at\n     * @returns true if the set bit wasn't set previously\n     */\n    template <bool A                            = IsConcurrent,\n              typename std::enable_if<A>::type* = nullptr>\n    bool set(unsigned offset) {\n      unsigned expected = _bitVector;\n      unsigned newValue = expected | ((WORD)1 << offset);\n      bool changed      = (expected != newValue);\n\n      while (changed && !std::atomic_compare_exchange_weak(\n                            &_bitVector, &expected, newValue)) {\n        // if cas fails, then update new value\n        newValue = expected | ((WORD)1 << offset);\n        changed  = (expected != newValue);\n      }\n\n      return changed;\n    }\n\n    /**\n     * Sets the bit at the provided offset. Not thread safe/ not concurrent\n     * version.\n     *\n     * @param offset Offset to set the bit at\n     * @returns true if the set bit wasn't set previously\n     */\n    template <bool A                             = IsConcurrent,\n              typename std::enable_if<!A>::type* = nullptr>\n    bool set(unsigned offset) {\n      WORD beforeBits = _bitVector;\n      _bitVector |= ((WORD)1 << offset);\n      return _bitVector != beforeBits;\n    }\n\n    /**\n     * @param offset Offset into bits to check status of\n     * @returns true if bit at offset is set, false otherwise\n     */\n    bool test(unsigned offset) const {\n      WORD mask = (WORD)1 << offset;\n      return ((_bitVector & mask) == mask);\n    }\n\n    /**\n     * Determines if second has set all of the bits that this objects has set.\n     *\n     * @param second pointer to compare against\n     * @returns true if second word's bits has everything that this\n     * word's bits have\n     */\n    bool isSubsetEq(Node* second) const {\n      WORD current = _bitVector;\n      return (current & (second->_bitVector)) == current;\n    }\n\n    
/**\n     * Bitwise or with second's bits field on our field.\n     *\n     * @param second sbv node to do a bitwise or with\n     * @returns 1 if something changed, 0 otherwise\n     */\n    template <bool A                            = IsConcurrent,\n              typename std::enable_if<A>::type* = nullptr>\n    unsigned unify(Node* second) {\n      if (second) {\n        WORD oldVector = _bitVector;\n        WORD newVector = oldVector | (second->_bitVector);\n        bool changed   = (oldVector != newVector);\n\n        while (changed && !std::atomic_compare_exchange_weak(\n                              &_bitVector, &oldVector, newVector)) {\n          // if cas fails, update again\n          newVector = oldVector | (second->_bitVector);\n          changed   = (oldVector != newVector);\n        }\n\n        return changed;\n      }\n\n      return 0;\n    }\n\n    /**\n     * Bitwise or with second's bits field on our field. Non-concurrent\n     * version.\n     *\n     * @param second Node to do a bitwise or with\n     * @returns 1 if something changed, 0 otherwise\n     */\n    template <bool A                             = IsConcurrent,\n              typename std::enable_if<!A>::type* = nullptr>\n    unsigned unify(Node* second) {\n      if (second) {\n        WORD oldBits = _bitVector;\n        _bitVector |= second->_bitVector;\n        return (_bitVector != oldBits);\n      }\n\n      return 0;\n    }\n\n    // TODO revise this to use a Galois allocator\n    /**\n     * @returns a pointer to a copy of this word without the preservation\n     * of the linked list\n     */\n    Node* clone(galois::FixedSizeAllocator<Node>* nodeAllocator) const {\n      Node* newWord = nodeAllocator->allocate(1);\n      nodeAllocator->construct(newWord, 0);\n\n      newWord->_base      = _base;\n      newWord->_bitVector = _bitVector;\n      newWord->_next      = nullptr;\n\n      return newWord;\n    }\n\n    /**\n     * TODO can probably use bit twiddling to get count more 
efficiently\n     *\n     * @returns The number of set bits in this word\n     */\n    unsigned count() const {\n      unsigned numElements = 0;\n\n      WORD bitMask = 1;\n      WORD bits    = _bitVector;\n\n      for (unsigned ii = 0; ii < wordSize; ++ii) {\n        if (bits & bitMask) {\n          ++numElements;\n        }\n\n        bitMask <<= 1;\n      }\n      return numElements;\n    }\n\n    /**\n     * Gets the set bits in this word and adds them to the passed in\n     * vector.\n     *\n     * @tparam VectorTy vector type that supports push_back\n     * @param setBits Vector to add set bits to\n     * @returns Number of set bits in this word\n     */\n    template <typename VectorTy>\n    unsigned getAllSetBits(VectorTy& setbits) const {\n      // or mask used to mask set bits\n      WORD orMask     = 1;\n      unsigned numSet = 0;\n      WORD bits       = _bitVector;\n\n      for (unsigned curBit = 0; curBit < wordSize; ++curBit) {\n        if (bits & orMask) {\n          setbits.push_back(_base * wordSize + curBit);\n          numSet++;\n        }\n\n        orMask <<= 1;\n      }\n\n      return numSet;\n    }\n  };\n\n  //////////////////////////////////////////////////////////////////////////////\n\n  /**\n   * Iterator for SparseBitVector\n   *\n   * BEHAVIOR IF THE BIT VECTOR IS ALTERED DURING ITERATION IS UNDEFINED.\n   * (i.e. 
correctness is not guaranteed)\n   */\n  class SBVIterator\n      : public boost::iterator_facade<SBVIterator, const unsigned,\n                                      boost::forward_traversal_tag> {\n    Node* currentHead;\n    unsigned currentBit{0};\n    unsigned currentValue{~0U};\n\n    void advanceToNextBit(bool inclusive) {\n      if (!inclusive) {\n        currentBit++; // current bit doesn't count for checking\n      }\n\n      bool found = false;\n      while (!found && currentHead != nullptr) {\n        while (currentBit < wordSize) {\n          if (currentHead->test(currentBit)) {\n            found = true;\n            break;\n          } else {\n            currentBit++;\n          }\n        }\n\n        if (!found) {\n          currentHead = (currentHead->_next);\n          currentBit  = 0;\n        }\n      }\n\n      if (currentHead != nullptr) {\n        currentValue = (currentHead->_base * wordSize) + currentBit;\n      } else {\n        currentValue = -1;\n      }\n    }\n\n  public:\n    /**\n     * This is the end for an iterator.\n     */\n    SBVIterator() : currentHead(nullptr), currentBit(0), currentValue(-1) {\n      currentValue = -1;\n    }\n\n    SBVIterator(Node* firstHead)\n        : currentHead(firstHead), currentBit(0), currentValue(-1) {\n      advanceToNextBit(true);\n    }\n\n    SBVIterator(SparseBitVector* bv) : currentBit(0), currentValue(-1) {\n      currentHead = (bv->head);\n      advanceToNextBit(true);\n    }\n\n  private:\n    friend class boost::iterator_core_access;\n\n    /**\n     * Goes to next bit of bitvector.\n     */\n    void increment() {\n      if (currentHead != nullptr) {\n        advanceToNextBit(false); // false = increment currentBit\n      }                          // do nothing if head is nullptr (i.e. 
the end)\n    }\n\n    /**\n     * @param other Another iterator to compare against\n     * @returns true if other iterator currently points to the same location\n     */\n    bool equal(const SBVIterator& other) const {\n      if (currentHead != nullptr) {\n        if (other.currentHead == currentHead &&\n            other.currentBit == currentBit) {\n          return true;\n        } else {\n          return false;\n        }\n      } else {\n        if (other.currentHead == nullptr) {\n          return true;\n        } else {\n          return false;\n        }\n      }\n    }\n\n    /**\n     * @returns the current value that the iterator is pointing to\n     */\n    const unsigned& dereference() const { return currentValue; }\n  };\n\n  //////////////////////////////////////////////////////////////////////////////\n\n  using NodeType =\n      typename std::conditional<IsConcurrent, galois::CopyableAtomic<Node*>,\n                                Node*>::type;\n  // head of linked list\n  NodeType head;\n  // allocator of new nodes\n  galois::FixedSizeAllocator<Node>* nodeAllocator;\n\n  /**\n   * Default constructor = nullptrs\n   */\n  SparseBitVector() {\n    head          = nullptr;\n    nodeAllocator = nullptr;\n  }\n\n  /**\n   * Free all nodes allocated with the node allocator\n   */\n  ~SparseBitVector() {\n    if (nodeAllocator) {\n      this->freeAll();\n    }\n  }\n\n  void freeAll() {\n    if (nodeAllocator) { // check to make sure nodeAllocator isn't null\n      while (head) {\n        Node* current = head;\n        head          = current->_next;\n        nodeAllocator->destroy(current);\n        nodeAllocator->deallocate(current, 1);\n      }\n    }\n\n    // set to null pointer so it doesn't try this again later on destructor\n    nodeAllocator = nullptr;\n  }\n\n  /**\n   * Initialize by setting head to nullptr and saving the word allocator.\n   *\n   * @param _nodeAllocator allocator object for nodes to use when creating\n   * a new linked list 
node\n   */\n  void init(galois::FixedSizeAllocator<Node>* _nodeAllocator) {\n    head          = nullptr;\n    nodeAllocator = _nodeAllocator;\n  }\n\n  /**\n   * @returns iterator to first set element of this bitvector\n   */\n  SBVIterator begin() { return SBVIterator(this); }\n\n  /**\n   * @returns end iterator of this bitvector.\n   */\n  SBVIterator end() { return SBVIterator(); }\n\n  /**\n   * Set the provided bit num in the bitvector. Will create a new word if the\n   * word needed to set the bit doesn't exist yet + will rearrange linked\n   * list of words as necessary.\n   *\n   * @param num The bit to set in the bitvector\n   * @returns true if the bit set wasn't set previously\n   */\n  template <bool A = IsConcurrent, typename std::enable_if<A>::type* = nullptr>\n  bool set(unsigned num) {\n    unsigned baseWord;\n    unsigned offsetIntoWord;\n\n    // determine base word and bit that corresponds to the num\n    std::tie(baseWord, offsetIntoWord) = getOffsets(num);\n\n    Node* curPtr = head;\n    Node* prev   = nullptr;\n\n    // while true due to fact that compare and swap (CAS) may fail\n    while (true) {\n      // pointers should be in sorted order\n      // loop through linked list to find the correct base word (if it exists)\n      while (curPtr != nullptr && curPtr->_base < baseWord) {\n        prev   = curPtr;\n        curPtr = (curPtr->_next);\n      }\n\n      // if base already exists, then set the correct offset bit\n      if (curPtr != nullptr && curPtr->_base == baseWord) {\n        return curPtr->set(offsetIntoWord);\n        // else the base wasn't found; create and set, then rearrange linked\n        // list accordingly\n      } else {\n        Node* newWord = nodeAllocator->allocate(1);\n        nodeAllocator->construct(newWord, baseWord, offsetIntoWord);\n\n        // note at this point curPtr is the next element in the list that\n        // the new one we create should point to\n        newWord->_next = curPtr;\n\n        // 
attempt a compare and swap: if it fails, that means the list was\n        // altered, so go back to beginning of this loop to check again\n        if (prev) {\n          if (std::atomic_compare_exchange_weak(&(prev->_next), &curPtr,\n                                                newWord)) {\n            return true;\n          } else {\n            // if it fails, return to the top; current pointer has new value\n            // that needs to be checked\n            nodeAllocator->destroy(newWord);\n            nodeAllocator->deallocate(newWord, 1);\n          }\n        } else {\n          if (std::atomic_compare_exchange_weak(&head, &curPtr, newWord)) {\n            return true;\n          } else {\n            // if it fails, return to the top; current pointer has new value\n            // that needs to be checked\n            nodeAllocator->destroy(newWord);\n            nodeAllocator->deallocate(newWord, 1);\n          }\n        }\n      }\n    }\n  }\n\n  /**\n   * Set the provided bit in the bitvector. 
Will create a new word if the\n   * word needed to set the bit doesn't exist yet + will rearrange linked\n   * list of words as necessary.\n   *\n   * Not thread safe.\n   *\n   * @param bit The bit to set in the bitvector\n   * @returns true if the bit set wasn't set previously\n   */\n  template <bool A = IsConcurrent, typename std::enable_if<!A>::type* = nullptr>\n  bool set(unsigned bit) {\n    unsigned baseWord;\n    unsigned offsetIntoWord;\n\n    std::tie(baseWord, offsetIntoWord) = getOffsets(bit);\n\n    Node* curPtr = head;\n    Node* prev   = nullptr;\n\n    // pointers should be in sorted order\n    // loop through linked list to find the correct base word (if it exists)\n    while (curPtr != nullptr && curPtr->_base < baseWord) {\n      prev   = curPtr;\n      curPtr = curPtr->_next;\n    }\n\n    // if base already exists, then set the correct offset bit\n    if (curPtr != nullptr && curPtr->_base == baseWord) {\n      return curPtr->set(offsetIntoWord);\n      // else the base wasn't found; create and set, then rearrange linked list\n      // accordingly\n    } else {\n      Node* newWord = nodeAllocator->allocate(1);\n      nodeAllocator->construct(newWord, baseWord, offsetIntoWord);\n\n      // this should point to prev's next, prev should point to this\n      if (prev) {\n        newWord->_next = prev->_next;\n        prev->_next    = newWord;\n      } else {\n        if (curPtr == nullptr) {\n          // this is the first word we are adding since both prev and head are\n          // null; next is nothing\n          newWord->_next = nullptr;\n        } else {\n          // this new word goes before curptr; if prev is null and curptr isn't,\n          // it means it had to go before\n          newWord->_next = head;\n        }\n\n        head = newWord;\n      }\n\n      return true;\n    }\n  }\n\n  /**\n   * Determines if a particular number bit in the bitvector is set.\n   *\n   * Note it may return false if a bit is set concurrently by 
another\n   * thread.\n   *\n   * @param num Bit in bitvector to check status of\n   * @returns true if the argument bit is set in this bitvector, false\n   * otherwise. May also return false if the bit being tested for is set\n   * concurrently by another thread.\n   */\n  bool test(unsigned num) const {\n    unsigned baseWord;\n    unsigned offsetIntoWord;\n\n    std::tie(baseWord, offsetIntoWord) = getOffsets(num);\n    Node* curPointer                   = head;\n\n    while (curPointer != nullptr && curPointer->_base < baseWord) {\n      curPointer = (curPointer->_next);\n    }\n\n    if (curPointer != nullptr && curPointer->_base == baseWord) {\n      return curPointer->test(offsetIntoWord);\n    } else {\n      return false;\n    }\n  }\n\n  /**\n   * READ THIS REGARDING THE CONCURRENT VERSION OF THIS:\n   *\n   * This function, in some sense, will not return false incorrectly (i.e.\n   * if it returns false, then at that point in time this vector actually\n   * isnt' a subset of the other vector; however, it is entirely possible\n   * that a concurrent update will change that as this function is returning\n   * \"false\").\n   *\n   * So basically, no guarantees. 
It exists to somewhat optimize some steps\n   * of PointsTo, but it should not be relied on if writes to THIS bitvector\n   * can happen concurrently (writes to second are fine).\n   *\n   * In serial execution it will work as expected.\n   *\n   * @param second Vector to check if this vector is a subset of\n   * @returns true if this vector is a subset of the second vector\n   */\n  bool isSubsetEq(const SparseBitVector& second) const {\n    Node* ptrOne = head;\n    Node* ptrTwo = second.head;\n\n    while (ptrOne != nullptr && ptrTwo != nullptr) {\n      if (ptrOne->_base == ptrTwo->_base) {\n        if (!ptrOne->isSubsetEq(ptrTwo)) {\n          return false;\n        }\n\n        // subset check successful; advance both pointers\n        ptrOne = (ptrOne->_next);\n        ptrTwo = (ptrTwo->_next);\n      } else if (ptrOne->_base < ptrTwo->_base) {\n        // ptrTwo has overtaken ptrOne, i.e. one has something (a base)\n        // two doesn't\n        return false;\n      } else { // ptrOne > ptrTwo\n        // greater than case; advance ptrTwo to see if it eventually\n        // reaches what ptrOne is currently at\n        ptrTwo = (ptrTwo->_next);\n      }\n    }\n\n    if (ptrOne != nullptr) {\n      // if ptrOne is not null, the loop exited because ptrTwo is nullptr,\n      // meaning this vector has more than the other vector, i.e. 
not a subset\n      return false;\n    } else {\n      // here means ptrOne == nullptr => it has sucessfully subset checked all\n      // words that matter\n      return true;\n    }\n  }\n\n  /**\n   * Concurrent version.\n   *\n   * Takes the passed in bitvector and does an \"or\" with it to update this\n   * bitvector.\n   *\n   * ONLY GUARANTEE IS THAT YOU WILL GET THINGS IN second THAT EXISTED\n   * AT TIME OF CALL; IF second IS UPDATED CONCURRENTLY, YOU MAY OR MAY\n   * NOT GET THOSE UPDATES.\n   *\n   * @param second BitVector to merge this one with\n   * @returns a non-negative value if something changed\n   */\n  template <bool A = IsConcurrent, typename std::enable_if<A>::type* = nullptr>\n  unsigned unify(const SparseBitVector& second) {\n    unsigned changed = 0;\n\n    Node* prev   = nullptr;\n    Node* ptrOne = head;\n    Node* ptrTwo = second.head;\n\n    while (ptrTwo != nullptr) {\n      while (ptrOne != nullptr && ptrTwo != nullptr) {\n        if (ptrOne->_base == ptrTwo->_base) {\n          // merged ptrTwo's word with our word, then advance both\n          changed += ptrOne->unify(ptrTwo);\n\n          prev   = ptrOne;\n          ptrOne = (ptrOne->_next);\n          ptrTwo = (ptrTwo->_next);\n        } else if (ptrOne->_base < ptrTwo->_base) {\n          // advance our pointer until we reach new bases we don't have\n          prev   = ptrOne;\n          ptrOne = (ptrOne->_next);\n        } else { // oneBase > twoBase\n          // two has something we don't have; add it between prev and current\n          // ptrone\n          Node* newWord = ptrTwo->clone(nodeAllocator);\n          // newWord comes before our current word\n          (newWord->_next) = ptrOne;\n\n          if (prev) {\n            if (!std::atomic_compare_exchange_weak(&(prev->_next), &ptrOne,\n                                                   newWord)) {\n              // if it fails, return to the top; ptrOne has new value\n              // that needs to be checked\n           
   nodeAllocator->destroy(newWord);\n              nodeAllocator->deallocate(newWord, 1);\n              continue;\n            }\n          } else {\n            if (!std::atomic_compare_exchange_weak(&head, &ptrOne, newWord)) {\n              // if it fails, return to the top; ptrOne has new value\n              // that needs to be checked\n              nodeAllocator->destroy(newWord);\n              nodeAllocator->deallocate(newWord, 1);\n              continue;\n            }\n          }\n\n          prev = newWord;\n          // done with ptrTwo's word, advance\n          ptrTwo = (ptrTwo->_next);\n\n          changed++;\n        }\n      }\n\n      // ptrOne = nullptr, but ptrTwo still has values; clone values\n      // and attempt to add\n      while (ptrTwo) {\n        Node* newWord = ptrTwo->clone(nodeAllocator);\n\n        // note ptrOne in below cases should be nullptr...\n        if (prev) {\n          if (!std::atomic_compare_exchange_weak(&(prev->_next), &ptrOne,\n                                                 newWord)) {\n            // if it fails, return to the top; ptrOne has new value\n            // that needs to be checked\n            nodeAllocator->destroy(newWord);\n            nodeAllocator->deallocate(newWord, 1);\n            break; // goes out to outermost while loop\n          }\n        } else {\n          if (!std::atomic_compare_exchange_weak(&head, &ptrOne, newWord)) {\n            // if it fails, return to the top; ptrOne has new value\n            // that needs to be checked\n            nodeAllocator->destroy(newWord);\n            nodeAllocator->deallocate(newWord, 1);\n            break; // goes out to outermost while loop\n          }\n        }\n\n        prev   = newWord;\n        ptrTwo = (ptrTwo->_next);\n\n        changed++;\n      }\n    }\n\n    return changed;\n  }\n\n  /**\n   * Non-concurrent version.\n   *\n   * Takes the passed in bitvector and does an \"or\" with it to update this\n   * bitvector.\n   *\n   * 
@param second BitVector to merge this one with\n   * @returns a non-negative value if something changed\n   */\n  template <bool A = IsConcurrent, typename std::enable_if<!A>::type* = nullptr>\n  unsigned unify(const SparseBitVector& second) {\n    unsigned changed = 0;\n\n    Node* prev   = nullptr;\n    Node* ptrOne = head;\n    Node* ptrTwo = second.head;\n\n    while (ptrOne != nullptr && ptrTwo != nullptr) {\n      if (ptrOne->_base == ptrTwo->_base) {\n        // merged ptrTwo's word with our word, then advance both\n        changed += ptrOne->unify(ptrTwo);\n\n        prev   = ptrOne;\n        ptrOne = ptrOne->_next;\n        ptrTwo = ptrTwo->_next;\n      } else if (ptrOne->_base < ptrTwo->_base) {\n        // advance our pointer until we reach new bases we don't have\n        prev   = ptrOne;\n        ptrOne = (ptrOne->_next);\n      } else { // oneBase > twoBase\n        // two has something we don't have; add it between prev and current\n        // ptrone\n        Node* newWord = ptrTwo->clone(nodeAllocator);\n        // newWord comes before our current word\n        newWord->_next = ptrOne;\n\n        if (prev) {\n          prev->_next = newWord;\n        } else {\n          head = newWord;\n        }\n\n        prev = newWord;\n        // done with ptrTwo's word, advance\n        ptrTwo = ptrTwo->_next;\n\n        changed++;\n      }\n    }\n\n    // ptrOne = nullptr, but ptrTwo still has values; clone values and add\n    while (ptrTwo) {\n      Node* newWord = ptrTwo->clone(nodeAllocator);\n\n      if (prev) {\n        prev->_next = newWord;\n      } else {\n        head = newWord;\n      }\n\n      prev   = newWord;\n      ptrTwo = (ptrTwo->_next);\n\n      changed++;\n    }\n\n    return changed;\n  }\n\n  /**\n   * @returns number of bits set by all words in this bitvector\n   */\n  unsigned count() const {\n    unsigned nbits = 0;\n\n    for (Node* ptr = head; ptr; ptr = (ptr->_next)) {\n      nbits += ptr->count();\n    }\n\n    return nbits;\n  
}\n\n  /**\n   * Gets the set bits in this bitvector and returns them in a vector type.\n   *\n   * @tparam VectorTy vector type that supports push_back\n   * @returns Vector with all set bits\n   */\n  std::vector<unsigned> getAllSetBits() const {\n    std::vector<unsigned> setBits;\n\n    // loop through all words in the bitvector and get their set bits\n    for (Node* curPtr = head; curPtr != nullptr; curPtr = curPtr->_next) {\n      curPtr->getAllSetBits(setBits);\n    }\n\n    return setBits;\n  }\n\n  /**\n   * Output the bits that are set in this bitvector.\n   *\n   * @param out Stream to output to\n   * @param prefix A string to append to the set bit numbers\n   */\n  void print(std::ostream& out, std::string prefix = std::string(\"\")) const {\n    std::vector<unsigned> setBits = getAllSetBits();\n    out << \"Elements(\" << setBits.size() << \"): \";\n\n    for (auto setBitNum : setBits) {\n      out << prefix << setBitNum << \", \";\n    }\n\n    out << \"\\n\";\n  }\n\nprivate:\n  /**\n   * @param num Bit that needs to be set\n   * @returns a pair signifying a base word and the offset into a\n   * baseword that corresponds to num\n   */\n  std::pair<unsigned, unsigned> getOffsets(unsigned num) const {\n    unsigned baseWord       = num / wordSize;\n    unsigned offsetIntoWord = num % wordSize;\n\n    return std::pair<unsigned, unsigned>(baseWord, offsetIntoWord);\n  }\n};\n\n} // namespace galois\n\n#endif\n"
  },
  {
    "path": "lonestar/analytics/cpu/preflowpush/CMakeLists.txt",
    "content": "add_executable(preflowpush-cpu Preflowpush.cpp)\nadd_dependencies(apps preflowpush-cpu)\ntarget_link_libraries(preflowpush-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS preflowpush-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small1 preflowpush-cpu \"${BASEINPUT}/reference/structured/torus5.gr\" \"-sourceNode=0\" \"-sinkNode=10\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/preflowpush/Preflowpush.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"llvm/Support/CommandLine.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <boost/iterator/iterator_adaptor.hpp>\n\n#include <fstream>\n#include <iostream>\n\nnamespace cll = llvm::cl;\n\nconst char* name = \"Preflow Push\";\nconst char* desc =\n    \"Finds the maximum flow in a network using the preflow push technique\";\nconst char* url = \"preflow_push\";\n\nenum DetAlgo { nondet = 0, detBase, detDisjoint };\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<uint32_t> sourceId(\"sourceNode\", cll::desc(\"Source node\"),\n                                   
cll::Required);\nstatic cll::opt<uint32_t> sinkId(\"sinkNode\", cll::desc(\"Sink node\"),\n                                 cll::Required);\nstatic cll::opt<bool> useHLOrder(\"useHLOrder\",\n                                 cll::desc(\"Use HL ordering heuristic\"),\n                                 cll::init(false));\nstatic cll::opt<bool>\n    useUnitCapacity(\"useUnitCapacity\",\n                    cll::desc(\"Assume all capacities are unit\"),\n                    cll::init(false));\nstatic cll::opt<bool> useSymmetricDirectly(\n    \"useSymmetricDirectly\",\n    cll::desc(\"Assume input graph is symmetric and has unit capacities\"),\n    cll::init(false));\nstatic cll::opt<int>\n    relabelInt(\"relabel\",\n               cll::desc(\"relabel interval X: relabel every X iterations \"\n                         \"(default 0 uses default interval)\"),\n               cll::init(0));\nstatic cll::opt<DetAlgo>\n    detAlgo(cll::desc(\"Deterministic algorithm:\"),\n            cll::values(clEnumVal(nondet, \"Non-deterministic (default)\"),\n                        clEnumVal(detBase, \"Base execution\"),\n                        clEnumVal(detDisjoint, \"Disjoint execution\")),\n            cll::init(nondet));\n\n/**\n * Alpha parameter the original Goldberg algorithm to control when global\n * relabeling occurs. For comparison purposes, we keep them the same as\n * before, but it is possible to achieve much better performance by adjusting\n * the global relabel frequency.\n */\nstatic const int ALPHA = 6;\n\n/**\n * Beta parameter the original Goldberg algorithm to control when global\n * relabeling occurs. 
For comparison purposes, we keep them the same as\n * before, but it is possible to achieve much better performance by adjusting\n * the global relabel frequency.\n */\nstatic const int BETA = 12;\n\nstruct Node {\n  uint32_t id;\n  int64_t excess;\n  int height;\n  int current;\n\n  Node() : excess(0), height(1), current(0) {}\n};\n\nstd::ostream& operator<<(std::ostream& os, const Node& n) {\n  os << \"(\"\n     << \"id: \" << n.id << \", excess: \" << n.excess << \", height: \" << n.height\n     << \", current: \" << n.current << \")\";\n  return os;\n}\n\nusing Graph =\n    galois::graphs::LC_CSR_Graph<Node, int32_t>::with_numa_alloc<false>::type;\nusing GNode   = Graph::GraphNode;\nusing Counter = galois::GAccumulator<int>;\n\nstruct PreflowPush {\n\n  Graph graph;\n  GNode sink;\n  GNode source;\n  int global_relabel_interval;\n  bool should_global_relabel = false;\n  galois::LargeArray<Graph::edge_iterator>\n      reverseDirectionEdgeIterator; // ideally should be on the graph as\n                                    // graph.getReverseEdgeIterator()\n\n  void reduceCapacity(const Graph::edge_iterator& ii, int64_t amount) {\n    Graph::edge_data_type& cap1 = graph.getEdgeData(ii);\n    Graph::edge_data_type& cap2 =\n        graph.getEdgeData(reverseDirectionEdgeIterator[*ii]);\n    cap1 -= amount;\n    cap2 += amount;\n  }\n\n  Graph::edge_iterator findEdge(GNode src, GNode dst) {\n\n    auto i     = graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n    auto end_i = graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n\n    if ((end_i - i) < 32) {\n      return findEdgeLinear(dst, i, end_i);\n\n    } else {\n      return findEdgeLog2(dst, i, end_i);\n    }\n  }\n\n  Graph::edge_iterator findEdgeLinear(GNode dst, Graph::edge_iterator beg_e,\n                                      Graph::edge_iterator end_e) {\n\n    auto ii = beg_e;\n    for (; ii != end_e; ++ii) {\n      if (graph.getEdgeDst(ii) == dst)\n        break;\n    }\n    assert(ii != end_e); 
// Never return the end iterator\n    return ii;\n  }\n\n  Graph::edge_iterator findEdgeLog2(GNode dst, Graph::edge_iterator i,\n                                    Graph::edge_iterator end_i) {\n\n    struct EdgeDstIter\n        : public boost::iterator_facade<\n              EdgeDstIter, GNode, boost::random_access_traversal_tag, GNode> {\n      Graph* g;\n      Graph::edge_iterator ei;\n\n      EdgeDstIter() : g(nullptr) {}\n\n      EdgeDstIter(Graph* g, Graph::edge_iterator ei) : g(g), ei(ei) {}\n\n    private:\n      friend boost::iterator_core_access;\n\n      GNode dereference() const { return g->getEdgeDst(ei); }\n\n      void increment() { ++ei; }\n\n      void decrement() { --ei; }\n\n      bool equal(const EdgeDstIter& that) const {\n        assert(this->g == that.g);\n        return this->ei == that.ei;\n      }\n\n      void advance(ptrdiff_t n) { ei += n; }\n\n      ptrdiff_t distance_to(const EdgeDstIter& that) const {\n        assert(this->g == that.g);\n\n        return that.ei - this->ei;\n      }\n    };\n\n    EdgeDstIter ai(&graph, i);\n    EdgeDstIter end_ai(&graph, end_i);\n\n    auto ret = std::lower_bound(ai, end_ai, dst);\n\n    assert(ret != end_ai);\n    assert(*ret == dst);\n\n    return ret.ei;\n  }\n\n  void acquire(const GNode& src) {\n    // LC Graphs have a different idea of locking\n    for (auto ii : graph.edges(src, galois::MethodFlag::WRITE)) {\n      GNode dst = graph.getEdgeDst(ii);\n      graph.getData(dst, galois::MethodFlag::WRITE);\n    }\n  }\n\n  void relabel(const GNode& src) {\n    int minHeight = std::numeric_limits<int>::max();\n    int minEdge   = 0;\n\n    int current = 0;\n    for (auto ii : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n      GNode dst   = graph.getEdgeDst(ii);\n      int64_t cap = graph.getEdgeData(ii);\n      if (cap > 0) {\n        const Node& dnode = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n        if (dnode.height < minHeight) {\n          minHeight = dnode.height;\n     
     minEdge   = current;\n        }\n      }\n      ++current;\n    }\n\n    assert(minHeight != std::numeric_limits<int>::max());\n    ++minHeight;\n\n    Node& node = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n    if (minHeight < (int)graph.size()) {\n      node.height  = minHeight;\n      node.current = minEdge;\n    } else {\n      node.height = graph.size();\n    }\n  }\n\n  template <typename C>\n  bool discharge(const GNode& src, C& ctx) {\n    Node& node     = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n    bool relabeled = false;\n\n    if (node.excess == 0 || node.height >= (int)graph.size()) {\n      return false;\n    }\n\n    while (true) {\n      galois::MethodFlag flag = galois::MethodFlag::UNPROTECTED;\n      bool finished           = false;\n      int current             = node.current;\n\n      auto ii = graph.edge_begin(src, flag);\n      auto ee = graph.edge_end(src, flag);\n\n      std::advance(ii, node.current);\n\n      for (; ii != ee; ++ii, ++current) {\n        GNode dst   = graph.getEdgeDst(ii);\n        int64_t cap = graph.getEdgeData(ii);\n        if (cap == 0) // || current < node.current)\n          continue;\n\n        Node& dnode = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n        if (node.height - 1 != dnode.height)\n          continue;\n\n        // Push flow\n        int64_t amount = std::min(node.excess, cap);\n        reduceCapacity(ii, amount);\n\n        // Only add once\n        if (dst != sink && dst != source && dnode.excess == 0)\n          ctx.push(dst);\n\n        assert(node.excess >= amount);\n        node.excess -= amount;\n        dnode.excess += amount;\n\n        if (node.excess == 0) {\n          finished     = true;\n          node.current = current;\n          break;\n        }\n      }\n\n      if (finished)\n        break;\n\n      relabel(src);\n      relabeled = true;\n\n      if (node.height == (int)graph.size())\n        break;\n\n      // prevHeight = node.height;\n    
}\n\n    return relabeled;\n  }\n\n  template <DetAlgo version>\n  void detDischarge(galois::InsertBag<GNode>& initial, Counter& counter) {\n    typedef galois::worklists::Deterministic<> DWL;\n\n    auto detIDfn = [this](const GNode& item) -> uint32_t {\n      return graph.getData(item, galois::MethodFlag::UNPROTECTED).id;\n    };\n\n    const int relabel_interval =\n        global_relabel_interval / galois::getActiveThreads();\n\n    auto detBreakFn = [&, this](void) -> bool {\n      if (this->global_relabel_interval > 0 &&\n          counter.getLocal() >= relabel_interval) {\n        this->should_global_relabel = true;\n        return true;\n      } else {\n        return false;\n      }\n    };\n\n    galois::for_each(\n        galois::iterate(initial),\n        [&, this](GNode& src, auto& ctx) {\n          if (version != nondet) {\n            if (ctx.isFirstPass()) {\n              this->acquire(src);\n            }\n            if (version == detDisjoint && ctx.isFirstPass()) {\n              return;\n            } else {\n              this->graph.getData(src, galois::MethodFlag::WRITE);\n              ctx.cautiousPoint();\n            }\n          }\n\n          int increment = 1;\n          if (this->discharge(src, ctx)) {\n            increment += BETA;\n          }\n\n          counter += increment;\n        },\n        galois::loopname(\"detDischarge\"), galois::wl<DWL>(),\n        galois::per_iter_alloc(), galois::det_id<decltype(detIDfn)>(detIDfn),\n        galois::det_parallel_break<decltype(detBreakFn)>(detBreakFn));\n  }\n\n  template <typename W>\n  void nonDetDischarge(galois::InsertBag<GNode>& initial, Counter& counter,\n                       const W& wl_opt) {\n\n    // per thread\n    const int relabel_interval =\n        global_relabel_interval / galois::getActiveThreads();\n\n    galois::for_each(\n        galois::iterate(initial),\n        [&counter, relabel_interval, this](GNode& src, auto& ctx) {\n          int increment = 1;\n          
this->acquire(src);\n          if (this->discharge(src, ctx)) {\n            increment += BETA;\n          }\n\n          counter += increment;\n          if (this->global_relabel_interval > 0 &&\n              counter.getLocal() >= relabel_interval) { // local check\n\n            this->should_global_relabel = true;\n            ctx.breakLoop();\n            return;\n          }\n        },\n        galois::loopname(\"nonDetDischarge\"), galois::parallel_break(), wl_opt);\n  }\n\n  /**\n   * Do reverse BFS on residual graph.\n   */\n  template <DetAlgo version, typename WL, bool useCAS = true>\n  void updateHeights() {\n\n    galois::for_each(\n        galois::iterate({sink}),\n        [&, this](const GNode& src, auto& ctx) {\n          if (version != nondet) {\n\n            if (ctx.isFirstPass()) {\n              for (auto ii :\n                   this->graph.edges(src, galois::MethodFlag::WRITE)) {\n                GNode dst = this->graph.getEdgeDst(ii);\n                int64_t rdata =\n                    this->graph.getEdgeData(reverseDirectionEdgeIterator[*ii]);\n                if (rdata > 0) {\n                  this->graph.getData(dst, galois::MethodFlag::WRITE);\n                }\n              }\n            }\n\n            if (version == detDisjoint && ctx.isFirstPass()) {\n              return;\n            } else {\n              this->graph.getData(src, galois::MethodFlag::WRITE);\n              ctx.cautiousPoint();\n            }\n          }\n\n          for (auto ii :\n               this->graph.edges(src, useCAS ? 
galois::MethodFlag::UNPROTECTED\n                                             : galois::MethodFlag::WRITE)) {\n            GNode dst = this->graph.getEdgeDst(ii);\n            int64_t rdata =\n                this->graph.getEdgeData(reverseDirectionEdgeIterator[*ii]);\n            if (rdata > 0) {\n              Node& node =\n                  this->graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n              int newHeight =\n                  this->graph.getData(src, galois::MethodFlag::UNPROTECTED)\n                      .height +\n                  1;\n              if (useCAS) {\n                int oldHeight = 0;\n                while (newHeight < (oldHeight = node.height)) {\n                  if (__sync_bool_compare_and_swap(&node.height, oldHeight,\n                                                   newHeight)) {\n                    ctx.push(dst);\n                    break;\n                  }\n                }\n              } else {\n                if (newHeight < node.height) {\n                  node.height = newHeight;\n                  ctx.push(dst);\n                }\n              }\n            }\n          } // end for\n        },\n        galois::wl<WL>(), galois::disable_conflict_detection(),\n        galois::loopname(\"updateHeights\"));\n  }\n\n  template <typename IncomingWL>\n  void globalRelabel(IncomingWL& incoming) {\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& src) {\n          Node& node   = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          node.height  = graph.size();\n          node.current = 0;\n          if (src == sink)\n            node.height = 0;\n        },\n        galois::loopname(\"ResetHeights\"));\n\n    using BSWL = galois::worklists::BulkSynchronous<>;\n    using DWL  = galois::worklists::Deterministic<>;\n    switch (detAlgo) {\n    case nondet:\n      updateHeights<nondet, BSWL>();\n      break;\n    case detBase:\n      updateHeights<detBase, DWL>();\n  
    break;\n    case detDisjoint:\n      updateHeights<detDisjoint, DWL>();\n      break;\n    default:\n      std::cerr << \"Unknown algorithm\" << detAlgo << \"\\n\";\n      abort();\n    }\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&incoming, this](const GNode& src) {\n          Node& node =\n              this->graph.getData(src, galois::MethodFlag::UNPROTECTED);\n          if (src == this->sink || src == this->source ||\n              node.height >= (int)this->graph.size())\n            return;\n          if (node.excess > 0)\n            incoming.push_back(src);\n        },\n        galois::loopname(\"FindWork\"));\n  }\n\n  template <typename C>\n  void initializePreflow(C& initial) {\n    for (auto ii : graph.edges(source)) {\n      GNode dst   = graph.getEdgeDst(ii);\n      int64_t cap = graph.getEdgeData(ii);\n      reduceCapacity(ii, cap);\n      Node& node = graph.getData(dst);\n      node.excess += cap;\n      if (cap > 0)\n        initial.push_back(dst);\n    }\n  }\n\n  void run() {\n    Graph* captured_graph = &graph;\n    auto obimIndexer      = [=](const GNode& n) {\n      return -captured_graph->getData(n, galois::MethodFlag::UNPROTECTED)\n                  .height;\n    };\n\n    typedef galois::worklists::PerSocketChunkFIFO<16> Chunk;\n    typedef galois::worklists::OrderedByIntegerMetric<decltype(obimIndexer),\n                                                      Chunk>\n        OBIM;\n\n    galois::InsertBag<GNode> initial;\n    initializePreflow(initial);\n\n    while (initial.begin() != initial.end()) {\n      galois::StatTimer T_discharge(\"DischargeTime\");\n      T_discharge.start();\n      Counter counter;\n      switch (detAlgo) {\n      case nondet:\n        if (useHLOrder) {\n          nonDetDischarge(initial, counter, galois::wl<OBIM>(obimIndexer));\n        } else {\n          nonDetDischarge(initial, counter, galois::wl<Chunk>());\n        }\n        break;\n      case detBase:\n        
detDischarge<detBase>(initial, counter);\n        break;\n      case detDisjoint:\n        detDischarge<detDisjoint>(initial, counter);\n        break;\n      default:\n        std::cerr << \"Unknown algorithm\" << detAlgo << \"\\n\";\n        abort();\n      }\n      T_discharge.stop();\n\n      if (should_global_relabel) {\n        galois::StatTimer T_global_relabel(\"GlobalRelabelTime\");\n        T_global_relabel.start();\n        initial.clear();\n        globalRelabel(initial);\n        should_global_relabel = false;\n        std::cout << \" Flow after global relabel: \"\n                  << graph.getData(sink).excess << \"\\n\";\n        T_global_relabel.stop();\n      } else {\n        break;\n      }\n    }\n  }\n\n  template <typename EdgeTy>\n  static void writePfpGraph(const std::string& inputFile,\n                            const std::string& outputFile) {\n    typedef galois::graphs::FileGraph ReaderGraph;\n    typedef ReaderGraph::GraphNode ReaderGNode;\n\n    ReaderGraph reader;\n    reader.fromFile(inputFile);\n\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Writer p;\n\n    // Count edges\n    size_t numEdges = 0;\n    for (ReaderGraph::iterator ii = reader.begin(), ei = reader.end(); ii != ei;\n         ++ii) {\n      ReaderGNode rsrc = *ii;\n      for (auto jj : reader.edges(rsrc)) {\n        ReaderGNode rdst = reader.getEdgeDst(jj);\n        if (rsrc == rdst)\n          continue;\n        if (!reader.hasNeighbor(rdst, rsrc))\n          ++numEdges;\n        ++numEdges;\n      }\n    }\n\n    p.setNumNodes(reader.size());\n    p.setNumEdges<EdgeTy>(numEdges);\n\n    p.phase1();\n    for (ReaderGraph::iterator ii = reader.begin(), ei = reader.end(); ii != ei;\n         ++ii) {\n      ReaderGNode rsrc = *ii;\n      for (auto jj : reader.edges(rsrc)) {\n        ReaderGNode rdst = reader.getEdgeDst(jj);\n        if (rsrc == rdst)\n          continue;\n        if (!reader.hasNeighbor(rdst, rsrc))\n          p.incrementDegree(rdst);\n   
     p.incrementDegree(rsrc);\n      }\n    }\n\n    EdgeTy one = 1;\n    static_assert(sizeof(one) == sizeof(uint32_t), \"Unexpected edge data size\");\n    one = galois::convert_le32toh(one);\n\n    p.phase2();\n    for (ReaderGraph::iterator ii = reader.begin(), ei = reader.end(); ii != ei;\n         ++ii) {\n      ReaderGNode rsrc = *ii;\n      for (auto jj : reader.edges(rsrc)) {\n        ReaderGNode rdst = reader.getEdgeDst(jj);\n        if (rsrc == rdst)\n          continue;\n        if (!reader.hasNeighbor(rdst, rsrc))\n          p.addNeighbor<EdgeTy>(rdst, rsrc, 0);\n        EdgeTy cap = useUnitCapacity ? one : reader.getEdgeData<EdgeTy>(jj);\n        p.addNeighbor<EdgeTy>(rsrc, rdst, cap);\n      }\n    }\n\n    p.finish();\n\n    using Wnode = Writer::GraphNode;\n\n    struct IdLess {\n      bool\n      operator()(const galois::graphs::EdgeSortValue<Wnode, EdgeTy>& e1,\n                 const galois::graphs::EdgeSortValue<Wnode, EdgeTy>& e2) const {\n        return e1.dst < e2.dst;\n      }\n    };\n\n    for (Writer::iterator i = p.begin(), end_i = p.end(); i != end_i; ++i) {\n      p.sortEdges<EdgeTy>(*i, IdLess());\n    }\n\n    p.toFile(outputFile);\n  }\n\n  void initializeGraph(std::string inputFile, uint32_t sourceId,\n                       uint32_t sinkId) {\n    if (useSymmetricDirectly) {\n      galois::graphs::readGraph(graph, inputFile);\n      for (auto ss : graph)\n        for (auto ii : graph.edges(ss))\n          graph.getEdgeData(ii) = 1;\n    } else {\n      if (inputFile.find(\".gr.pfp\") != inputFile.size() - strlen(\".gr.pfp\")) {\n        std::string pfpName = inputFile + \".pfp\";\n        std::ifstream pfpFile(pfpName.c_str());\n        if (!pfpFile.good()) {\n          galois::gPrint(\"Writing new input file: \", pfpName, \"\\n\");\n          writePfpGraph<Graph::edge_data_type>(inputFile, pfpName);\n        }\n        inputFile = pfpName;\n      }\n      galois::gPrint(\"Reading graph: \", inputFile, \"\\n\");\n      
galois::graphs::readGraph(graph, inputFile);\n\n      // Assume that input edge data has already been converted instead\n#if 0 // def HAVE_BIG_ENDIAN\n      // Convert edge data to host ordering\n      for (auto ss : newApp->graph) {\n        for (auto ii : newApp->graph.edges(ss)) {\n          Graph::edge_data_type& cap = newApp->graph.getEdgeData(ii);\n          static_assert(sizeof(cap) == sizeof(uint32_t), \"Unexpected edge data size\");\n          cap = galois::convert_le32toh(cap);\n        }\n      }\n#endif\n    }\n\n    if (sourceId == sinkId || sourceId >= graph.size() ||\n        sinkId >= graph.size()) {\n      std::cerr << \"invalid source or sink id\\n\";\n      abort();\n    }\n\n    uint32_t id = 0;\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei;\n         ++ii, ++id) {\n      if (id == sourceId) {\n        source                       = *ii;\n        graph.getData(source).height = graph.size();\n      } else if (id == sinkId) {\n        sink = *ii;\n      }\n      graph.getData(*ii).id = id;\n    }\n\n    reverseDirectionEdgeIterator.allocateInterleaved(graph.sizeEdges());\n    // memoize the reverse direction edge-iterators\n    galois::do_all(\n        galois::iterate(graph.begin(), graph.end()),\n        [&, this](const GNode& src) {\n          for (auto ii :\n               this->graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n            GNode dst                         = this->graph.getEdgeDst(ii);\n            reverseDirectionEdgeIterator[*ii] = this->findEdge(dst, src);\n          }\n        },\n        galois::loopname(\"FindReverseDirectionEdges\"));\n  }\n\n  void checkSorting(void) {\n    for (auto n : graph) {\n      galois::optional<GNode> prevDst;\n      for (auto e : graph.edges(n, galois::MethodFlag::UNPROTECTED)) {\n        GNode dst = graph.getEdgeDst(e);\n        if (prevDst.is_initialized()) {\n          Node& prevNode =\n              graph.getData(*prevDst, galois::MethodFlag::UNPROTECTED);\n  
        Node& currNode = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n          GALOIS_ASSERT(prevNode.id != currNode.id,\n                        \"Adjacency list cannot have duplicates\");\n          GALOIS_ASSERT(prevNode.id <= currNode.id, \"Adjacency list unsorted\");\n        }\n        prevDst = dst;\n      }\n    }\n  }\n\n  void checkAugmentingPath() {\n    // Use id field as visited flag\n    for (Graph::iterator ii = graph.begin(), ee = graph.end(); ii != ee; ++ii) {\n      GNode src             = *ii;\n      graph.getData(src).id = 0;\n    }\n\n    std::deque<GNode> queue;\n\n    graph.getData(source).id = 1;\n    queue.push_back(source);\n\n    while (!queue.empty()) {\n      GNode& src = queue.front();\n      queue.pop_front();\n      for (auto ii : graph.edges(src)) {\n        GNode dst = graph.getEdgeDst(ii);\n        if (graph.getData(dst).id == 0 && graph.getEdgeData(ii) > 0) {\n          graph.getData(dst).id = 1;\n          queue.push_back(dst);\n        }\n      }\n    }\n\n    if (graph.getData(sink).id != 0) {\n      assert(false && \"Augmenting path exisits\");\n      abort();\n    }\n  }\n\n  void checkHeights() {\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      int sh    = graph.getData(src).height;\n      for (auto jj : graph.edges(src)) {\n        GNode dst   = graph.getEdgeDst(jj);\n        int64_t cap = graph.getEdgeData(jj);\n        int dh      = graph.getData(dst).height;\n        if (cap > 0 && sh > dh + 1) {\n          std::cerr << \"height violated at \" << graph.getData(src) << \"\\n\";\n          abort();\n        }\n      }\n    }\n  }\n\n  void checkConservation(PreflowPush& orig) {\n    std::vector<GNode> map;\n    map.resize(graph.size());\n\n    // Setup ids assuming same iteration order in both graphs\n    uint32_t id = 0;\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei;\n         ++ii, ++id) {\n      graph.getData(*ii).id = 
id;\n    }\n    id = 0;\n    for (Graph::iterator ii = orig.graph.begin(), ei = orig.graph.end();\n         ii != ei; ++ii, ++id) {\n      orig.graph.getData(*ii).id = id;\n      map[id]                    = *ii;\n    }\n\n    // Now do some checking\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src        = *ii;\n      const Node& node = graph.getData(src);\n      uint32_t srcId   = node.id;\n\n      if (src == source || src == sink)\n        continue;\n\n      if (node.excess != 0 && node.height != (int)graph.size()) {\n        std::cerr << \"Non-zero excess at \" << node << \"\\n\";\n        abort();\n      }\n\n      int64_t sum = 0;\n      for (auto jj : graph.edges(src)) {\n        GNode dst      = graph.getEdgeDst(jj);\n        uint32_t dstId = graph.getData(dst).id;\n        int64_t ocap =\n            orig.graph.getEdgeData(orig.findEdge(map[srcId], map[dstId]));\n        int64_t delta = 0;\n        if (ocap > 0)\n          delta -= (ocap - graph.getEdgeData(jj));\n        else\n          delta += graph.getEdgeData(jj);\n        sum += delta;\n      }\n\n      if (node.excess != sum) {\n        std::cerr << \"Not pseudoflow: \" << node.excess << \" != \" << sum\n                  << \" at \" << node << \"\\n\";\n        abort();\n      }\n    }\n  }\n\n  void verify(PreflowPush& orig) {\n    // FIXME: doesn't fully check result\n    checkHeights();\n    checkConservation(orig);\n    checkAugmentingPath();\n  }\n};\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  PreflowPush app;\n  app.initializeGraph(inputFile, sourceId, sinkId);\n\n  app.checkSorting();\n\n  if (relabelInt == 0) {\n    app.global_relabel_interval =\n        app.graph.size() * ALPHA + app.graph.sizeEdges() / 3;\n  } else {\n    app.global_relabel_interval = relabelInt;\n  }\n  std::cout << 
\"Number of nodes: \" << app.graph.size() << \"\\n\";\n  std::cout << \"Global relabel interval: \" << app.global_relabel_interval\n            << \"\\n\";\n\n  galois::preAlloc(numThreads * app.graph.size() /\n                   galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  app.run();\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  std::cout << \"Flow is \" << app.graph.getData(app.sink).excess << \"\\n\";\n\n  if (!skipVerify) {\n    PreflowPush orig;\n    orig.initializeGraph(inputFile, sourceId, sinkId);\n    app.verify(orig);\n    std::cout << \"(Partially) Verified\\n\";\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/preflowpush/README.md",
    "content": "Preflow Push algorithm\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program computes the maximum flow from a given source to a given sink \nin a given directed graph using the preflow-push algorithm (also called \npush-relabel algorithm):\n\nA. Goldberg. Efficient Graph Algorithms for Sequential and Parallel Computers. \nPhD thesis. Dept. of EECS, MIT. 1987.\n\nIt also incorporates global relabel and gap detection heuristics:\n\nB. Cherkassky, A. Goldberg. On implementing the push-relabel method for the \nmaximum flow problem. Algorithmica. 1997.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/preflowpush; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./preflowpush-cpu <path-to-graph> <source-ID> <sink-ID>`\n- `$ ./preflowpush-cpu <path-to-graph> <source-ID> <sink-ID> -t=20`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* In our experience, the deterministic algorithms perform much slower than the \n  non-deterministic one.\n\n* The performance of all algorithms depends on an optimal choice of the compile \n  time constant, CHUNK_SIZE, the granularity of stolen work when work stealing is \n  enabled (via galois::steal()). The optimal value of the constant might depend on \n  the architecture, so you might want to evaluate the performance over a range of \n  values (say [16-4096]).\n"
  },
  {
    "path": "lonestar/analytics/cpu/spanningtree/Boruvka.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Bag.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/UnionFind.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/runtime/Profile.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <atomic>\n#include <utility>\n#include <algorithm>\n#include <iostream>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Boruvka's Minimum Spanning Tree Algorithm\";\nstatic const char* desc = \"Computes the minimum spanning forest of a graph\";\nstatic const char* url  = \"mst\";\n\nenum Algo { parallel, exp_parallel };\n\nstatic cll::opt<std::string>\n    inputFilename(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<Algo>\n    
algo(\"algo\", cll::desc(\"Choose an algorithm (default value parallel):\"),\n         cll::values(clEnumVal(parallel, \"Parallel\")), cll::init(parallel));\n\ntypedef int EdgeData;\n\nstruct Node : public galois::UnionFindNode<Node> {\n  std::atomic<EdgeData*> lightest;\n  Node() : galois::UnionFindNode<Node>(const_cast<Node*>(this)) {}\n};\n\ntypedef galois::graphs::LC_CSR_Graph<Node, EdgeData>::with_numa_alloc<\n    true>::type ::with_no_lockable<true>::type Graph;\n\ntypedef Graph::GraphNode GNode;\n\nstd::ostream& operator<<(std::ostream& os, const Node& n) {\n  os << \"[id: \" << &n << \", c: \" << n.find() << \"]\";\n  return os;\n}\n\nstruct Edge {\n  GNode src;\n  GNode dst;\n  const EdgeData* weight;\n  Edge(const GNode& s, const GNode& d, const EdgeData* w)\n      : src(s), dst(d), weight(w) {}\n};\n\n/**\n * Boruvka's algorithm. Implemented bulk-synchronously in order to avoid the\n * need to merge edge lists.\n */\ntemplate <bool useExp>\nstruct ParallelAlgo {\n  struct WorkItem {\n    Edge edge;\n    int cur;\n    WorkItem(const GNode& s, const GNode& d, const EdgeData* w, int c)\n        : edge(s, d, w), cur(c) {}\n  };\n\n  typedef galois::InsertBag<WorkItem> WL;\n\n  Graph graph;\n\n  WL wls[3];\n  WL* current;\n  WL* next;\n  WL* pending;\n  EdgeData limit;\n  galois::InsertBag<Edge> mst;\n  EdgeData inf;\n  EdgeData heaviest;\n\n  /**\n   * Find lightest edge between components leaving a node and add it to the\n   * worklist.\n   */\n  template <bool useLimit, typename Context, typename Pending>\n  static void findLightest(ParallelAlgo* self, const GNode& src, int cur,\n                           Context& ctx, Pending& pending) {\n    Node& sdata = self->graph.getData(src, galois::MethodFlag::UNPROTECTED);\n    Graph::edge_iterator ii =\n        self->graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n    Graph::edge_iterator ei =\n        self->graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n\n    std::advance(ii, cur);\n\n    for 
(; ii != ei; ++ii, ++cur) {\n      GNode dst   = self->graph.getEdgeDst(ii);\n      Node& ddata = self->graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n      EdgeData& weight = self->graph.getEdgeData(ii);\n      if (useLimit && weight > self->limit) {\n        pending.push(WorkItem(src, dst, &weight, cur));\n        return;\n      }\n      Node* rep;\n      if ((rep = sdata.findAndCompress()) != ddata.findAndCompress()) {\n        // const EdgeData& weight = self->graph.getEdgeData(ii);\n        EdgeData* old;\n        ctx.push(WorkItem(src, dst, &weight, cur));\n        while (weight < *(old = rep->lightest)) {\n          if (rep->lightest.compare_exchange_strong(old, &weight))\n            break;\n        }\n        return;\n      }\n    }\n  }\n\n  /**\n   * Merge step specialized for first round of the algorithm.\n   */\n  struct Initialize {\n    ParallelAlgo* self;\n\n    Initialize(ParallelAlgo* s) : self(s) {}\n\n    void operator()(const GNode& src) const {\n      (*this)(src, *self->next, *self->pending);\n    }\n\n    template <typename Context>\n    void operator()(const GNode& src, Context& ctx) const {\n      (*this)(src, ctx, *self->pending);\n    }\n\n    template <typename Context, typename Pending>\n    void operator()(const GNode& src, Context& ctx, Pending& pending) const {\n      Node& sdata = self->graph.getData(src, galois::MethodFlag::UNPROTECTED);\n      sdata.lightest = &self->inf;\n      findLightest<false>(self, src, 0, ctx, pending);\n    }\n  };\n\n  struct Merge {\n\n    ParallelAlgo* self;\n\n    Merge(ParallelAlgo* s) : self(s) {}\n\n    void operator()(const WorkItem& item) const {\n      (*this)(item, *self->next, *self->pending);\n    }\n\n    template <typename Context>\n    void operator()(const WorkItem& item, Context& ctx) const {\n      (*this)(item, ctx, *self->pending);\n    }\n\n    template <typename Context, typename Pending>\n    void operator()(const WorkItem& item, Context&, Pending&) const {\n      GNode src  
 = item.edge.src;\n      Node& sdata = self->graph.getData(src, galois::MethodFlag::UNPROTECTED);\n      Node* rep   = sdata.findAndCompress();\n      int cur     = item.cur;\n\n      if (rep->lightest == item.edge.weight) {\n        GNode dst   = item.edge.dst;\n        Node& ddata = self->graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n        if ((rep = sdata.merge(&ddata))) {\n          rep->lightest = &self->inf;\n          self->mst.push(Edge(src, dst, item.edge.weight));\n        }\n        ++cur;\n      }\n    }\n  };\n\n  struct Find {\n    ParallelAlgo* self;\n\n    Find(ParallelAlgo* s) : self(s) {}\n\n    void operator()(const WorkItem& item) const {\n      (*this)(item, *self->next, *self->pending);\n    }\n\n    template <typename Context>\n    void operator()(const WorkItem& item, Context& ctx) const {\n      (*this)(item, ctx, *self->pending);\n    }\n\n    template <typename Context, typename Pending>\n    void operator()(const WorkItem& item, Context& ctx,\n                    Pending& pending) const {\n      findLightest<true>(self, item.edge.src, item.cur, ctx, pending);\n    }\n  };\n\n  void init() {\n    current = &wls[0];\n    next    = &wls[1];\n    pending = &wls[2];\n\n    EdgeData delta = std::max(heaviest / 5, 1);\n    limit          = delta;\n  }\n\n  void process() {\n\n    constexpr unsigned CHUNK_SIZE = 16;\n\n    size_t rounds = 0;\n\n    init();\n\n    galois::do_all(galois::iterate(graph), Initialize(this),\n                   galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n                   galois::loopname(\"Initialize\"));\n\n    while (true) {\n      while (true) {\n        rounds += 1;\n\n        std::swap(current, next);\n        galois::do_all(galois::iterate(*current), Merge(this), galois::steal(),\n                       galois::chunk_size<CHUNK_SIZE>(),\n                       galois::loopname(\"Merge\"));\n        galois::do_all(galois::iterate(*current), Find(this), galois::steal(),\n                       
galois::chunk_size<CHUNK_SIZE>(),\n                       galois::loopname(\"Find\"));\n        current->clear();\n\n        if (next->empty())\n          break;\n      }\n\n      if (pending->empty())\n        break;\n\n      std::swap(next, pending);\n\n      limit *= 2;\n    }\n\n    galois::runtime::reportStat_Single(\"Boruvka\", \"rounds\", rounds);\n  }\n\n  void processExp() { GALOIS_DIE(\"not supported\"); }\n\n  void operator()() {\n    if (useExp) {\n      processExp();\n    } else {\n      process();\n    }\n  }\n\n  bool checkAcyclic(void) {\n    galois::GAccumulator<unsigned> roots;\n\n    galois::do_all(galois::iterate(graph), [&roots, this](const GNode& n) {\n      const auto& data = graph.getData(n, galois::MethodFlag::UNPROTECTED);\n      if (data.isRep())\n        roots += 1;\n    });\n\n    unsigned numRoots = roots.reduce();\n    unsigned numEdges = std::distance(mst.begin(), mst.end());\n\n    if (graph.size() - numRoots != numEdges) {\n      std::cerr << \"Generated graph is not a forest. \"\n                << \"Expected \" << graph.size() - numRoots << \" edges but \"\n                << \"found \" << numEdges << \"\\n\";\n      return false;\n    }\n\n    std::cout << \"Num trees: \" << numRoots << \"\\n\";\n    std::cout << \"Tree edges: \" << numEdges << \"\\n\";\n    return true;\n  }\n\n  EdgeData sortEdges() {\n\n    galois::GReduceMax<EdgeData> heavy;\n\n    galois::do_all(galois::iterate(graph), [&heavy, this](const GNode& src) {\n      //! [sortEdgeByEdgeData]\n      graph.sortEdgesByEdgeData(src, std::less<EdgeData>(),\n                                galois::MethodFlag::UNPROTECTED);\n      //! 
[sortEdgeByEdgeData]\n\n      Graph::edge_iterator ii =\n          graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n      Graph::edge_iterator ei =\n          graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n      ptrdiff_t dist = std::distance(ii, ei);\n      if (dist == 0)\n        return;\n      std::advance(ii, dist - 1);\n      heavy.update(graph.getEdgeData(ii));\n    });\n\n    return heavy.reduce();\n  }\n\n  bool verify() {\n\n    auto is_bad_graph = [this](const GNode& n) {\n      Node& me = graph.getData(n);\n      for (auto ii : graph.edges(n)) {\n        GNode dst  = graph.getEdgeDst(ii);\n        Node& data = graph.getData(dst);\n        if (me.findAndCompress() != data.findAndCompress()) {\n          std::cerr << \"not in same component: \" << me << \" and \" << data\n                    << \"\\n\";\n          return true;\n        }\n      }\n      return false;\n    };\n\n    auto is_bad_mst = [this](const Edge& e) {\n      return graph.getData(e.src).findAndCompress() !=\n             graph.getData(e.dst).findAndCompress();\n    };\n\n    if (galois::ParallelSTL::find_if(graph.begin(), graph.end(),\n                                     is_bad_graph) == graph.end()) {\n      if (galois::ParallelSTL::find_if(mst.begin(), mst.end(), is_bad_mst) ==\n          mst.end()) {\n        return checkAcyclic();\n      }\n    }\n    return false;\n  }\n\n  void initializeGraph() {\n    galois::graphs::FileGraph origGraph;\n    galois::graphs::FileGraph symGraph;\n\n    origGraph.fromFileInterleaved<EdgeData>(inputFilename);\n    if (!symmetricGraph)\n      galois::graphs::makeSymmetric<EdgeData>(origGraph, symGraph);\n    else\n      std::swap(symGraph, origGraph);\n\n    galois::graphs::readGraph(graph, symGraph);\n\n    galois::StatTimer Tsort(\"InitializeSortTime\");\n    Tsort.start();\n    heaviest = sortEdges();\n    if (heaviest == std::numeric_limits<EdgeData>::max() ||\n        heaviest == std::numeric_limits<EdgeData>::min()) {\n      
GALOIS_DIE(\"Edge weights of graph out of range\");\n    }\n    inf = heaviest + 1;\n\n    Tsort.stop();\n\n    std::cout << \"Nodes: \" << graph.size() << \" edges: \" << graph.sizeEdges()\n              << \" heaviest edge: \" << heaviest << \"\\n\";\n  }\n};\n\ntemplate <typename Algo>\nvoid run() {\n\n  Algo algo;\n\n  galois::StatTimer Tinitial(\"InitializeTime\");\n  Tinitial.start();\n  algo.initializeGraph();\n  Tinitial.stop();\n\n  galois::preAlloc(8 * galois::getActiveThreads() +\n                   16 * (algo.graph.size() + algo.graph.sizeEdges()) /\n                       galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  galois::runtime::profileVtune([&](void) { algo(); }, \"boruvka\");\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  auto get_weight = [](const Edge& e) { return *e.weight; };\n\n  auto w = galois::ParallelSTL::map_reduce(\n      algo.mst.begin(), algo.mst.end(), get_weight, std::plus<size_t>(), 0UL);\n\n  std::cout << \"MST weight: \" << w << \"\\n\";\n\n  if (!skipVerify && !algo.verify()) {\n    GALOIS_DIE(\"verification failed\");\n  }\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFilename);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  switch (algo) {\n  case parallel:\n    run<ParallelAlgo<false>>();\n    break;\n  case exp_parallel:\n    run<ParallelAlgo<true>>();\n    break;\n  default:\n    std::cerr << \"Unknown algo: \" << algo << \"\\n\";\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/spanningtree/CMakeLists.txt",
    "content": "add_executable(minimum-spanningtree-cpu Boruvka.cpp)\nadd_dependencies(apps minimum-spanningtree-cpu)\ntarget_link_libraries(minimum-spanningtree-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS minimum-spanningtree-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_scale(small1 minimum-spanningtree-cpu \"${BASEINPUT}/scalefree/rmat10.gr\")\nadd_test_scale(small2 minimum-spanningtree-cpu \"${BASEINPUT}/reference/structured/rome99.gr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/spanningtree/README.md",
    "content": "Minimum Weight Spanning Tree\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program computes a minimum-weight spanning tree (MST) of an input graph.\n\nThis implementation uses a Union-Find (aka Disjoint Set) data structure to keep\ntrack of spanning trees and to avoid cycles in the tree.  The algorithm proceeds in multiple rounds, \nwhere in each round, it performs two\nparallel phases. One phase performs *Find* operations while the other phase\nperforms *Union* operations. \n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in Galois .gr graphs.\n\n- If the input is a non-symmetric graph, the program first converts it into a symmetric\n  graph (MST is defined for undirected/symmetric graphs only).\n- If the input is a symmetric graph, the user must provide the -symmetricGraph flag on\n  the command line.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake in the BUILD directory (refer to the top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/spanningtree; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./minimum-spanningtree-cpu <path-to-directed-graph> -algo parallel -t 40`\n- `$ ./minimum-spanningtree-cpu <path-to-symmetric-graph> -symmetricGraph -algo parallel -t 40`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\n* All parallel loops in the 'parallel' algorithm rely on the CHUNK_SIZE parameter for load balancing,\n  which needs to be tuned for the machine and input graph. \n"
  },
  {
    "path": "lonestar/analytics/cpu/spanningtree/UnionFind.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_UNION_FIND\n#define GALOIS_UNION_FIND\n\n#include <cstddef>\n\ntemplate <typename ElTy, ElTy initializer>\nstruct UnionFind {\n\n  ElTy* parents;\n  const size_t size;\n\n  explicit UnionFind(size_t sz) : size(sz) {\n\n    parents = new ElTy[size];\n    for (size_t s = 0; s < sz; s++)\n      parents[s] = initializer;\n  }\n\n  ElTy uf_find(ElTy e) {\n    if (parents[e] == initializer)\n      return e;\n    ElTy tmp = e;\n    ElTy rep = initializer;\n    while (parents[tmp] != initializer)\n      tmp = parents[tmp];\n    rep = tmp;\n    tmp = e;\n    while (parents[tmp] != initializer) {\n      parents[tmp] = rep;\n      tmp          = parents[tmp];\n    }\n    return rep;\n  }\n\n  void uf_union(ElTy e1, ElTy e2) { parents[e1] = e2; }\n\n  ~UnionFind() { delete parents; }\n};\n\nvoid test_uf() { 
UnionFind<int, -1> sample(10000); }\n#endif // def GALOIS_UNION_FIND\n"
  },
  {
    "path": "lonestar/analytics/cpu/sssp/CMakeLists.txt",
    "content": "add_executable(sssp-cpu SSSP.cpp)\nadd_dependencies(apps sssp-cpu)\ntarget_link_libraries(sssp-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS sssp-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_scale(small1 sssp-cpu \"${BASEINPUT}/reference/structured/rome99.gr\" -delta 8)\nadd_test_scale(small2 sssp-cpu \"${BASEINPUT}/scalefree/rmat10.gr\" -delta 8)\n"
  },
  {
    "path": "lonestar/analytics/cpu/sssp/README.md",
    "content": "Single Source Shortest Path\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program computes the distance of shortest paths in a graph, starting from a\nsource node (specified by the -startNode option). \n\n- deltaStep implements a variation on the Delta-Stepping algorithm by Meyer and\n  Sanders, 2003. serDelta is its serial implementation \n- dijkstra is a serial implementation of Dijkstra's algorithm\n- topo is a variation on the Bellman-Ford algorithm, which visits all the nodes in the\n  graph, every round, until convergence\n\nEach algorithm has a variant that implements edge tiling, e.g. deltaTile, which\ndivides the edges of high-degree nodes into multiple work items for better\nload balancing. \n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in Galois .gr graphs having integer edge weights.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake in the BUILD directory (refer to the top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/sssp; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./sssp-cpu <path-to-graph> -algo deltaStep -delta 13 -t 40`\n- `$ ./sssp-cpu <path-to-graph> -algo deltaTile -delta 13 -t 40`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\n* deltaStep/deltaTile algorithms typically perform the best on high-diameter\n  graphs, such as road networks. Their performance is sensitive to the *delta* parameter, which is\n  provided as a power-of-2 exponent on the command line (e.g. `-delta 13` selects a step of 2^13). The *delta* parameter should be tuned\n  for every input graph.\n* topo/topoTile algorithms typically perform the best on low-diameter graphs, such\n  as social networks and RMAT graphs\n* All algorithms rely on CHUNK_SIZE for load balancing, which needs to be\n  tuned for the machine and input graph. \n* Tile variants of algorithms provide better load balancing and performance\n  for graphs with high-degree nodes. Tile size is controlled via\n  the EDGE_TILE_SIZE constant, which needs to be tuned. \n"
  },
  {
    "path": "lonestar/analytics/cpu/sssp/SSSP.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/PriorityQueue.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include \"Lonestar/BFS_SSSP.h\"\n#include \"Lonestar/Utils.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <iostream>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Single Source Shortest Path\";\nstatic const char* desc =\n    \"Computes the shortest path from a source node to all nodes in a directed \"\n    \"graph using a modified chaotic iteration algorithm\";\nstatic const char* url = \"single_source_shortest_path\";\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), 
cll::Required);\nstatic cll::opt<unsigned int>\n    startNode(\"startNode\",\n              cll::desc(\"Node to start search from (default value 0)\"),\n              cll::init(0));\nstatic cll::opt<unsigned int>\n    reportNode(\"reportNode\",\n               cll::desc(\"Node to report distance to(default value 1)\"),\n               cll::init(1));\nstatic cll::opt<unsigned int>\n    stepShift(\"delta\",\n              cll::desc(\"Shift value for the deltastep (default value 13)\"),\n              cll::init(13));\n\nenum Algo {\n  deltaTile = 0,\n  deltaStep,\n  deltaStepBarrier,\n  serDeltaTile,\n  serDelta,\n  dijkstraTile,\n  dijkstra,\n  topo,\n  topoTile,\n  AutoAlgo\n};\n\nconst char* const ALGO_NAMES[] = {\n    \"deltaTile\", \"deltaStep\",    \"deltaStepBarrier\", \"serDeltaTile\",\n    \"serDelta\",  \"dijkstraTile\", \"dijkstra\",         \"topo\",\n    \"topoTile\",  \"Auto\"};\n\nstatic cll::opt<Algo> algo(\n    \"algo\", cll::desc(\"Choose an algorithm (default value auto):\"),\n    cll::values(clEnumVal(deltaTile, \"deltaTile\"),\n                clEnumVal(deltaStep, \"deltaStep\"),\n                clEnumVal(deltaStepBarrier, \"deltaStepBarrier\"),\n                clEnumVal(serDeltaTile, \"serDeltaTile\"),\n                clEnumVal(serDelta, \"serDelta\"),\n                clEnumVal(dijkstraTile, \"dijkstraTile\"),\n                clEnumVal(dijkstra, \"dijkstra\"), clEnumVal(topo, \"topo\"),\n                clEnumVal(topoTile, \"topoTile\"),\n                clEnumVal(AutoAlgo,\n                          \"auto: choose among the algorithms automatically\")),\n    cll::init(AutoAlgo));\n\n//! [withnumaalloc]\nusing Graph = galois::graphs::LC_CSR_Graph<std::atomic<uint32_t>, uint32_t>::\n    with_no_lockable<true>::type ::with_numa_alloc<true>::type;\n//! 
[withnumaalloc]\ntypedef Graph::GraphNode GNode;\n\nconstexpr static const bool TRACK_WORK          = false;\nconstexpr static const unsigned CHUNK_SIZE      = 64U;\nconstexpr static const ptrdiff_t EDGE_TILE_SIZE = 512;\n\nusing SSSP                 = BFS_SSSP<Graph, uint32_t, true, EDGE_TILE_SIZE>;\nusing Dist                 = SSSP::Dist;\nusing UpdateRequest        = SSSP::UpdateRequest;\nusing UpdateRequestIndexer = SSSP::UpdateRequestIndexer;\nusing SrcEdgeTile          = SSSP::SrcEdgeTile;\nusing SrcEdgeTileMaker     = SSSP::SrcEdgeTileMaker;\nusing SrcEdgeTilePushWrap  = SSSP::SrcEdgeTilePushWrap;\nusing ReqPushWrap          = SSSP::ReqPushWrap;\nusing OutEdgeRangeFn       = SSSP::OutEdgeRangeFn;\nusing TileRangeFn          = SSSP::TileRangeFn;\n\nnamespace gwl = galois::worklists;\nusing PSchunk = gwl::PerSocketChunkFIFO<CHUNK_SIZE>;\nusing OBIM    = gwl::OrderedByIntegerMetric<UpdateRequestIndexer, PSchunk>;\nusing OBIM_Barrier =\n    gwl::OrderedByIntegerMetric<UpdateRequestIndexer,\n                                PSchunk>::with_barrier<true>::type;\n\ntemplate <typename T, typename OBIMTy = OBIM, typename P, typename R>\nvoid deltaStepAlgo(Graph& graph, GNode source, const P& pushWrap,\n                   const R& edgeRange) {\n\n  //! [reducible for self-defined stats]\n  galois::GAccumulator<size_t> BadWork;\n  //! 
[reducible for self-defined stats]\n  galois::GAccumulator<size_t> WLEmptyWork;\n\n  graph.getData(source) = 0;\n\n  galois::InsertBag<T> initBag;\n  pushWrap(initBag, source, 0, \"parallel\");\n\n  galois::for_each(\n      galois::iterate(initBag),\n      [&](const T& item, auto& ctx) {\n        constexpr galois::MethodFlag flag = galois::MethodFlag::UNPROTECTED;\n        const auto& sdata                 = graph.getData(item.src, flag);\n\n        if (sdata < item.dist) {\n          if (TRACK_WORK)\n            WLEmptyWork += 1;\n          return;\n        }\n\n        for (auto ii : edgeRange(item)) {\n\n          GNode dst          = graph.getEdgeDst(ii);\n          auto& ddist        = graph.getData(dst, flag);\n          Dist ew            = graph.getEdgeData(ii, flag);\n          const Dist newDist = sdata + ew;\n          Dist oldDist       = galois::atomicMin<uint32_t>(ddist, newDist);\n          if (newDist < oldDist) {\n            if (TRACK_WORK) {\n              //! [per-thread contribution of self-defined stats]\n              if (oldDist != SSSP::DIST_INFINITY) {\n                BadWork += 1;\n              }\n              //! [per-thread contribution of self-defined stats]\n            }\n            pushWrap(ctx, dst, newDist);\n          }\n        }\n      },\n      galois::wl<OBIMTy>(UpdateRequestIndexer{stepShift}),\n      galois::disable_conflict_detection(), galois::loopname(\"SSSP\"));\n\n  if (TRACK_WORK) {\n    //! [report self-defined stats]\n    galois::runtime::reportStat_Single(\"SSSP\", \"BadWork\", BadWork.reduce());\n    //! 
[report self-defined stats]\n    galois::runtime::reportStat_Single(\"SSSP\", \"WLEmptyWork\",\n                                       WLEmptyWork.reduce());\n  }\n}\n\ntemplate <typename T, typename P, typename R>\nvoid serDeltaAlgo(Graph& graph, const GNode& source, const P& pushWrap,\n                  const R& edgeRange) {\n\n  SerialBucketWL<T, UpdateRequestIndexer> wl(UpdateRequestIndexer{stepShift});\n  ;\n  graph.getData(source) = 0;\n\n  pushWrap(wl, source, 0);\n\n  size_t iter = 0UL;\n  while (!wl.empty()) {\n\n    auto& curr = wl.minBucket();\n\n    while (!curr.empty()) {\n      ++iter;\n      auto item = curr.front();\n      curr.pop_front();\n\n      if (graph.getData(item.src) < item.dist) {\n        // empty work\n        continue;\n      }\n\n      for (auto e : edgeRange(item)) {\n\n        GNode dst   = graph.getEdgeDst(e);\n        auto& ddata = graph.getData(dst);\n\n        const auto newDist = item.dist + graph.getEdgeData(e);\n\n        if (newDist < ddata) {\n          ddata = newDist;\n          pushWrap(wl, dst, newDist);\n        }\n      }\n    }\n\n    wl.goToNextBucket();\n  }\n\n  if (!wl.allEmpty()) {\n    std::abort();\n  }\n  galois::runtime::reportStat_Single(\"SSSP-Serial-Delta\", \"Iterations\", iter);\n}\n\ntemplate <typename T, typename P, typename R>\nvoid dijkstraAlgo(Graph& graph, const GNode& source, const P& pushWrap,\n                  const R& edgeRange) {\n\n  using WL = galois::MinHeap<T>;\n\n  graph.getData(source) = 0;\n\n  WL wl;\n  pushWrap(wl, source, 0);\n\n  size_t iter = 0;\n\n  while (!wl.empty()) {\n    ++iter;\n\n    T item = wl.pop();\n\n    if (graph.getData(item.src) < item.dist) {\n      // empty work\n      continue;\n    }\n\n    for (auto e : edgeRange(item)) {\n\n      GNode dst   = graph.getEdgeDst(e);\n      auto& ddata = graph.getData(dst);\n\n      const auto newDist = item.dist + graph.getEdgeData(e);\n\n      if (newDist < ddata) {\n        ddata = newDist;\n        pushWrap(wl, dst, 
newDist);\n      }\n    }\n  }\n\n  galois::runtime::reportStat_Single(\"SSSP-Dijkstra\", \"Iterations\", iter);\n}\n\nvoid topoAlgo(Graph& graph, const GNode& source) {\n\n  galois::LargeArray<Dist> oldDist;\n  oldDist.allocateInterleaved(graph.size());\n\n  constexpr Dist INFTY = SSSP::DIST_INFINITY;\n  galois::do_all(\n      galois::iterate(size_t{0}, graph.size()),\n      [&](size_t i) { oldDist.constructAt(i, INFTY); }, galois::no_stats(),\n      galois::loopname(\"initDistArray\"));\n\n  graph.getData(source) = 0;\n\n  galois::GReduceLogicalOr changed;\n  size_t rounds = 0;\n\n  do {\n\n    ++rounds;\n    changed.reset();\n\n    galois::do_all(\n        galois::iterate(graph),\n        [&](const GNode& n) {\n          const auto& sdata = graph.getData(n);\n\n          if (oldDist[n] > sdata) {\n\n            oldDist[n] = sdata;\n            changed.update(true);\n\n            for (auto e : graph.edges(n)) {\n              const auto newDist = sdata + graph.getEdgeData(e);\n              auto dst           = graph.getEdgeDst(e);\n              auto& ddata        = graph.getData(dst);\n              galois::atomicMin(ddata, newDist);\n            }\n          }\n        },\n        galois::steal(), galois::loopname(\"Update\"));\n\n  } while (changed.reduce());\n\n  galois::runtime::reportStat_Single(\"SSSP-topo\", \"rounds\", rounds);\n}\n\nvoid topoTileAlgo(Graph& graph, const GNode& source) {\n\n  galois::InsertBag<SrcEdgeTile> tiles;\n\n  graph.getData(source) = 0;\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](const GNode& n) {\n        SSSP::pushEdgeTiles(tiles, graph, n,\n                            SrcEdgeTileMaker{n, SSSP::DIST_INFINITY});\n      },\n      galois::steal(), galois::loopname(\"MakeTiles\"));\n\n  galois::GReduceLogicalOr changed;\n  size_t rounds = 0;\n\n  do {\n    ++rounds;\n    changed.reset();\n\n    galois::do_all(\n        galois::iterate(tiles),\n        [&](SrcEdgeTile& t) {\n          const auto& sdata = 
graph.getData(t.src);\n\n          if (t.dist > sdata) {\n\n            t.dist = sdata;\n            changed.update(true);\n\n            for (auto e = t.beg; e != t.end; ++e) {\n              const auto newDist = sdata + graph.getEdgeData(e);\n              auto dst           = graph.getEdgeDst(e);\n              auto& ddata        = graph.getData(dst);\n              galois::atomicMin(ddata, newDist);\n            }\n          }\n        },\n        galois::steal(), galois::loopname(\"Update\"));\n\n  } while (changed.reduce());\n\n  galois::runtime::reportStat_Single(\"SSSP-topo\", \"rounds\", rounds);\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  Graph graph;\n  GNode source;\n  GNode report;\n\n  std::cout << \"Reading from file: \" << inputFile << \"\\n\";\n  galois::graphs::readGraph(graph, inputFile);\n  std::cout << \"Read \" << graph.size() << \" nodes, \" << graph.sizeEdges()\n            << \" edges\\n\";\n\n  if (startNode >= graph.size() || reportNode >= graph.size()) {\n    std::cerr << \"failed to set report: \" << reportNode\n              << \" or failed to set source: \" << startNode << \"\\n\";\n    assert(0);\n    abort();\n  }\n\n  auto it = graph.begin();\n  std::advance(it, startNode.getValue());\n  source = *it;\n  it     = graph.begin();\n  std::advance(it, reportNode.getValue());\n  report = *it;\n\n  size_t approxNodeData = graph.size() * 64;\n  galois::preAlloc(numThreads +\n                   approxNodeData / galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  if (algo == deltaStep || algo == deltaTile || algo == serDelta ||\n      algo == serDeltaTile) {\n    std::cout << \"INFO: Using delta-step of \" << (1 << stepShift) << \"\\n\";\n    std::cout\n        << \"WARNING: Performance varies considerably due to delta parameter.\\n\";\n    std::cout\n   
     << \"WARNING: Do not expect the default to be good for your graph.\\n\";\n  }\n\n  galois::do_all(galois::iterate(graph),\n                 [&graph](GNode n) { graph.getData(n) = SSSP::DIST_INFINITY; });\n\n  graph.getData(source) = 0;\n\n  std::cout << \"Running \" << ALGO_NAMES[algo] << \" algorithm\\n\";\n\n  galois::StatTimer autoAlgoTimer(\"AutoAlgo_0\");\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n\n  if (algo == AutoAlgo) {\n    autoAlgoTimer.start();\n    if (isApproximateDegreeDistributionPowerLaw(graph)) {\n      algo = deltaStep;\n    } else {\n      algo = deltaStepBarrier;\n    }\n    autoAlgoTimer.stop();\n    galois::gInfo(\"Choosing \", ALGO_NAMES[algo], \" algorithm\");\n  }\n\n  switch (algo) {\n  case deltaTile:\n    deltaStepAlgo<SrcEdgeTile>(graph, source, SrcEdgeTilePushWrap{graph},\n                               TileRangeFn());\n    break;\n  case deltaStep:\n    deltaStepAlgo<UpdateRequest>(graph, source, ReqPushWrap(),\n                                 OutEdgeRangeFn{graph});\n    break;\n  case serDeltaTile:\n    serDeltaAlgo<SrcEdgeTile>(graph, source, SrcEdgeTilePushWrap{graph},\n                              TileRangeFn());\n    break;\n  case serDelta:\n    serDeltaAlgo<UpdateRequest>(graph, source, ReqPushWrap(),\n                                OutEdgeRangeFn{graph});\n    break;\n  case dijkstraTile:\n    dijkstraAlgo<SrcEdgeTile>(graph, source, SrcEdgeTilePushWrap{graph},\n                              TileRangeFn());\n    break;\n  case dijkstra:\n    dijkstraAlgo<UpdateRequest>(graph, source, ReqPushWrap(),\n                                OutEdgeRangeFn{graph});\n    break;\n  case topo:\n    topoAlgo(graph, source);\n    break;\n  case topoTile:\n    topoTileAlgo(graph, source);\n    break;\n\n  case deltaStepBarrier:\n    deltaStepAlgo<UpdateRequest, OBIM_Barrier>(graph, source, ReqPushWrap(),\n                                               OutEdgeRangeFn{graph});\n    break;\n\n  default:\n    
std::abort();\n  }\n\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  std::cout << \"Node \" << reportNode << \" has distance \"\n            << graph.getData(report) << \"\\n\";\n\n  // Sanity checking code\n  galois::GReduceMax<uint64_t> maxDistance;\n  galois::GAccumulator<uint64_t> distanceSum;\n  galois::GAccumulator<uint32_t> visitedNode;\n  maxDistance.reset();\n  distanceSum.reset();\n  visitedNode.reset();\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](uint64_t i) {\n        uint32_t myDistance = graph.getData(i);\n\n        if (myDistance != SSSP::DIST_INFINITY) {\n          maxDistance.update(myDistance);\n          distanceSum += myDistance;\n          visitedNode += 1;\n        }\n      },\n      galois::loopname(\"Sanity check\"), galois::no_stats());\n\n  // report sanity stats\n  uint64_t rMaxDistance = maxDistance.reduce();\n  uint64_t rDistanceSum = distanceSum.reduce();\n  uint64_t rVisitedNode = visitedNode.reduce();\n  galois::gInfo(\"# visited nodes is \", rVisitedNode);\n  galois::gInfo(\"Max distance is \", rMaxDistance);\n  galois::gInfo(\"Sum of visited distances is \", rDistanceSum);\n\n  if (!skipVerify) {\n    if (SSSP::verify(graph, source)) {\n      std::cout << \"Verification successful.\\n\";\n    } else {\n      GALOIS_DIE(\"verification failed\");\n    }\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/cpu/triangle-counting/CMakeLists.txt",
    "content": "add_executable(triangle-counting-cpu Triangles.cpp)\nadd_dependencies(apps triangle-counting-cpu)\ntarget_link_libraries(triangle-counting-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS triangle-counting-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_scale(small-ordered-relabel triangle-counting-cpu -symmetricGraph -algo=orderedCount --relabel=true \"${BASEINPUT}/scalefree/symmetric/rmat15.csgr\" NOT_QUICK)\nadd_test_scale(small-ordered triangle-counting-cpu -symmetricGraph -algo=orderedCount \"${BASEINPUT}/scalefree/symmetric/rmat15.csgr\")\nadd_test_scale(small-node triangle-counting-cpu -symmetricGraph -algo=nodeiterator \"${BASEINPUT}/scalefree/symmetric/rmat15.csgr\")\nadd_test_scale(small-edge triangle-counting-cpu -symmetricGraph -algo=edgeiterator \"${BASEINPUT}/scalefree/symmetric/rmat15.csgr\")\n"
  },
  {
    "path": "lonestar/analytics/cpu/triangle-counting/README.md",
    "content": "Triangle counting\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program counts the number of triangles in a given undirected graph. We \nimplement both node-iterator and edge-iterator algorithms from the following:\n\nThomas Schank. Algorithmic Aspects of Triangle-Based Network Analysis. PhD\nThesis. Universitat Karlsruhe. 2007.\n\nWe also have an ordered count algorithm that sorts the nodes by degree before\nexecution: this has been found to give good performance. We implement the\nordered count algorithm from the following:\n\nhttp://gap.cs.berkeley.edu/benchmark.html\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric Galois .gr graphs.\nYou must specify the -symmetricGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake in the BUILD directory (refer to the top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/cpu/triangle-counting; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./triangle-counting-cpu <path-symmetric-graph> -algo edgeiterator -t 40 -symmetricGraph`\n- `$ ./triangle-counting-cpu <path-symmetric-graph> -t 20 -algo nodeiterator -symmetricGraph`\n- `$ ./triangle-counting-cpu <path-symmetric-graph> -t 20 -algo orderedCount -symmetricGraph`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* In our experience, the orderedCount algorithm gives the best performance.\n\n* The performance of the algorithms depends on an optimal choice of the \n  compile-time constant, CHUNK_SIZE, the granularity of stolen work when work stealing is \n  enabled (via galois::steal()). The optimal value of the constant might depend on \n  the architecture, so you might want to evaluate the performance over a range of \n  values (say [16-4096]).\n"
  },
  {
    "path": "lonestar/analytics/cpu/triangle-counting/Triangles.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Bag.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/BufferedGraph.h\"\n#include \"galois/runtime/Profile.h\"\n#include \"llvm/Support/CommandLine.h\"\n#include \"Lonestar/Utils.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <boost/iterator/transform_iterator.hpp>\n\n#include <utility>\n#include <vector>\n#include <algorithm>\n#include <iostream>\n#include <fstream>\n\nconst char* name = \"Triangles\";\nconst char* desc = \"Counts the triangles in a graph\";\n\nconstexpr static const unsigned CHUNK_SIZE = 64U;\nenum Algo { nodeiterator, edgeiterator, orderedCount };\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<std::string>\n    
inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<Algo> algo(\n    \"algo\", cll::desc(\"Choose an algorithm:\"),\n    cll::values(clEnumValN(Algo::nodeiterator, \"nodeiterator\", \"Node Iterator\"),\n                clEnumValN(Algo::edgeiterator, \"edgeiterator\", \"Edge Iterator\"),\n                clEnumValN(Algo::orderedCount, \"orderedCount\",\n                           \"Ordered Simple Count (default)\")),\n    cll::init(Algo::orderedCount));\n\nstatic cll::opt<bool>\n    relabel(\"relabel\",\n            cll::desc(\"Relabel nodes of the graph (default value of false => \"\n                      \"choose automatically)\"),\n            cll::init(false));\n\ntypedef galois::graphs::LC_CSR_Graph<void, void>::with_numa_alloc<\n    true>::type ::with_no_lockable<true>::type Graph;\n\ntypedef Graph::GraphNode GNode;\n\n/**\n * Like std::lower_bound but doesn't dereference iterators. Returns the first\n * element for which comp is not true.\n */\ntemplate <typename Iterator, typename Compare>\nIterator lowerBound(Iterator first, Iterator last, Compare comp) {\n  using difference_type =\n      typename std::iterator_traits<Iterator>::difference_type;\n\n  Iterator it;\n  difference_type count;\n  difference_type half;\n\n  count = std::distance(first, last);\n  while (count > 0) {\n    it   = first;\n    half = count / 2;\n    std::advance(it, half);\n    if (comp(it)) {\n      first = ++it;\n      count -= half + 1;\n    } else {\n      count = half;\n    }\n  }\n  return first;\n}\n\n/**\n * std::set_intersection over edge_iterators.\n */\ntemplate <typename G>\nsize_t countEqual(G& g, typename G::edge_iterator aa,\n                  typename G::edge_iterator ea, typename G::edge_iterator bb,\n                  typename G::edge_iterator eb) {\n  size_t retval = 0;\n  while (aa != ea && bb != eb) {\n    typename G::GraphNode a = g.getEdgeDst(aa);\n    typename G::GraphNode b = g.getEdgeDst(bb);\n    if (a < b) {\n      
++aa;\n    } else if (b < a) {\n      ++bb;\n    } else {\n      retval += 1;\n      ++aa;\n      ++bb;\n    }\n  }\n  return retval;\n}\n\ntemplate <typename G>\nstruct LessThan {\n  G& g;\n  typename G::GraphNode n;\n  LessThan(G& g, typename G::GraphNode n) : g(g), n(n) {}\n  bool operator()(typename G::edge_iterator it) { return g.getEdgeDst(it) < n; }\n};\n\ntemplate <typename G>\nstruct GreaterThanOrEqual {\n  G& g;\n  typename G::GraphNode n;\n  GreaterThanOrEqual(G& g, typename G::GraphNode n) : g(g), n(n) {}\n  bool operator()(typename G::edge_iterator it) {\n    return !(n < g.getEdgeDst(it));\n  }\n};\n\ntemplate <typename G>\nstruct DegreeLess {\n  typedef typename G::GraphNode N;\n  G* g;\n  DegreeLess(G& g) : g(&g) {}\n\n  bool operator()(const N& n1, const N& n2) const {\n    return std::distance(g->edge_begin(n1), g->edge_end(n1)) <\n           std::distance(g->edge_begin(n2), g->edge_end(n2));\n  }\n};\ntemplate <typename G>\nstruct DegreeGreater {\n  typedef typename G::GraphNode N;\n  G* g;\n  DegreeGreater(G& g) : g(&g) {}\n\n  bool operator()(const N& n1, const N& n2) const {\n    return std::distance(g->edge_begin(n1), g->edge_end(n1)) >\n           std::distance(g->edge_begin(n2), g->edge_end(n2));\n  }\n};\ntemplate <typename G>\nstruct GetDegree {\n  typedef typename G::GraphNode N;\n  G* g;\n  GetDegree(G& g) : g(&g) {}\n\n  ptrdiff_t operator()(const N& n) const {\n    return std::distance(g->edge_begin(n), g->edge_end(n));\n  }\n};\n\ntemplate <typename GraphNode, typename EdgeTy>\nstruct IdLess {\n  bool\n  operator()(const galois::graphs::EdgeSortValue<GraphNode, EdgeTy>& e1,\n             const galois::graphs::EdgeSortValue<GraphNode, EdgeTy>& e2) const {\n    return e1.dst < e2.dst;\n  }\n};\n\n/**\n * Node Iterator algorithm for counting triangles.\n * <code>\n * for (v in G)\n *   for (all pairs of neighbors (a, b) of v)\n *     if ((a,b) in G and a < v < b)\n *       triangle += 1\n * </code>\n *\n * Thomas Schank. 
Algorithmic Aspects of Triangle-Based Network Analysis. PhD\n * Thesis. Universitat Karlsruhe. 2007.\n */\nvoid nodeIteratingAlgo(Graph& graph) {\n\n  galois::GAccumulator<size_t> numTriangles;\n\n  //! [profile w/ vtune]\n  galois::runtime::profileVtune(\n      [&]() {\n        galois::do_all(\n            galois::iterate(graph),\n            [&](const GNode& n) {\n              // Partition neighbors\n              // [first, ea) [n] [bb, last)\n              Graph::edge_iterator first =\n                  graph.edge_begin(n, galois::MethodFlag::UNPROTECTED);\n              Graph::edge_iterator last =\n                  graph.edge_end(n, galois::MethodFlag::UNPROTECTED);\n              Graph::edge_iterator ea =\n                  lowerBound(first, last, LessThan<Graph>(graph, n));\n              Graph::edge_iterator bb =\n                  lowerBound(first, last, GreaterThanOrEqual<Graph>(graph, n));\n\n              for (; bb != last; ++bb) {\n                GNode B = graph.getEdgeDst(bb);\n                for (auto aa = first; aa != ea; ++aa) {\n                  GNode A = graph.getEdgeDst(aa);\n                  Graph::edge_iterator vv =\n                      graph.edge_begin(A, galois::MethodFlag::UNPROTECTED);\n                  Graph::edge_iterator ev =\n                      graph.edge_end(A, galois::MethodFlag::UNPROTECTED);\n                  Graph::edge_iterator it =\n                      lowerBound(vv, ev, LessThan<Graph>(graph, B));\n                  if (it != ev && graph.getEdgeDst(it) == B) {\n                    numTriangles += 1;\n                  }\n                }\n              }\n            },\n            galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n            galois::loopname(\"nodeIteratingAlgo\"));\n      },\n      \"nodeIteratorAlgo\");\n  //! 
[profile w/ vtune]\n\n  std::cout << \"Num Triangles: \" << numTriangles.reduce() << \"\\n\";\n}\n\n/**\n * Lambda function to count triangles\n */\nvoid orderedCountFunc(Graph& graph, GNode n,\n                      galois::GAccumulator<size_t>& numTriangles) {\n  size_t numTriangles_local = 0;\n  for (auto it_v : graph.edges(n)) {\n    auto v = graph.getEdgeDst(it_v);\n    if (v >= n)\n      break;\n    Graph::edge_iterator it_n =\n        graph.edge_begin(n, galois::MethodFlag::UNPROTECTED);\n\n    for (auto it_vv : graph.edges(v)) {\n      auto vv = graph.getEdgeDst(it_vv);\n      if (vv >= v)\n        break;\n      while (graph.getEdgeDst(it_n) < vv)\n        it_n++;\n      if (vv == graph.getEdgeDst(it_n)) {\n\n        Graph::edge_iterator multi_it_n = it_n;\n\n        while (multi_it_n !=\n                   graph.edge_end(n, galois::MethodFlag::UNPROTECTED) &&\n               graph.getEdgeDst(multi_it_n) == vv) {\n          numTriangles_local += 1;\n          multi_it_n++;\n        }\n      }\n    }\n  }\n  numTriangles += numTriangles_local;\n}\n\n/*\n * Simple counting loop, instead of binary searching.\n */\nvoid orderedCountAlgo(Graph& graph) {\n  galois::GAccumulator<size_t> numTriangles;\n  galois::do_all(\n      galois::iterate(graph),\n      [&](const GNode& n) { orderedCountFunc(graph, n, numTriangles); },\n      galois::chunk_size<CHUNK_SIZE>(), galois::steal(),\n      galois::loopname(\"orderedCountAlgo\"));\n\n  galois::gPrint(\"Num Triangles: \", numTriangles.reduce(), \"\\n\");\n}\n\n/**\n * Edge Iterator algorithm for counting triangles.\n * <code>\n * for ((a, b) in E)\n *   if (a < b)\n *     for (v in intersect(neighbors(a), neighbors(b)))\n *       if (a < v < b)\n *         triangle += 1\n * </code>\n *\n * Thomas Schank. Algorithmic Aspects of Triangle-Based Network Analysis. PhD\n * Thesis. Universitat Karlsruhe. 
2007.\n */\nvoid edgeIteratingAlgo(Graph& graph) {\n\n  struct WorkItem {\n    GNode src;\n    GNode dst;\n    WorkItem(const GNode& a1, const GNode& a2) : src(a1), dst(a2) {}\n  };\n\n  galois::InsertBag<WorkItem> items;\n  galois::GAccumulator<size_t> numTriangles;\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](GNode n) {\n        for (Graph::edge_iterator edge :\n             graph.out_edges(n, galois::MethodFlag::UNPROTECTED)) {\n          GNode dst = graph.getEdgeDst(edge);\n          if (n < dst)\n            items.push(WorkItem(n, dst));\n        }\n      },\n      galois::loopname(\"Initialize\"));\n\n  //  galois::runtime::profileVtune(\n  //! [profile w/ papi]\n  galois::runtime::profilePapi(\n      [&]() {\n        galois::do_all(\n            galois::iterate(items),\n            [&](const WorkItem& w) {\n              // Compute intersection of range (w.src, w.dst) in neighbors of\n              // w.src and w.dst\n              Graph::edge_iterator abegin =\n                  graph.edge_begin(w.src, galois::MethodFlag::UNPROTECTED);\n              Graph::edge_iterator aend =\n                  graph.edge_end(w.src, galois::MethodFlag::UNPROTECTED);\n              Graph::edge_iterator bbegin =\n                  graph.edge_begin(w.dst, galois::MethodFlag::UNPROTECTED);\n              Graph::edge_iterator bend =\n                  graph.edge_end(w.dst, galois::MethodFlag::UNPROTECTED);\n\n              Graph::edge_iterator aa = lowerBound(\n                  abegin, aend, GreaterThanOrEqual<Graph>(graph, w.src));\n              Graph::edge_iterator ea =\n                  lowerBound(abegin, aend, LessThan<Graph>(graph, w.dst));\n              Graph::edge_iterator bb = lowerBound(\n                  bbegin, bend, GreaterThanOrEqual<Graph>(graph, w.src));\n              Graph::edge_iterator eb =\n                  lowerBound(bbegin, bend, LessThan<Graph>(graph, w.dst));\n\n              numTriangles += countEqual(graph, aa, ea, bb, eb);\n   
         },\n            galois::loopname(\"edgeIteratingAlgo\"),\n            galois::chunk_size<CHUNK_SIZE>(), galois::steal());\n      },\n      \"edgeIteratorAlgo\");\n  //! [profile w/ papi]\n\n  std::cout << \"NumTriangles: \" << numTriangles.reduce() << \"\\n\";\n}\n\n//! Sorts read graph by degree (high degree nodes are reindexed to beginning)\nvoid makeSortedGraph(Graph& graph) {\n  galois::StatTimer readTimer(\"ReadGraphTimer\");\n  readTimer.start();\n  // read original graph\n  galois::graphs::BufferedGraph<void> initial;\n  initial.loadGraph(inputFile);\n  readTimer.stop();\n\n  galois::StatTimer Trelabel(\"GraphRelabelTimer\");\n  Trelabel.start();\n\n  size_t numGraphNodes = initial.size();\n  // create node -> degree pairs\n  using DegreeNodePair = std::pair<uint64_t, uint32_t>;\n  std::vector<DegreeNodePair> dnPairs(numGraphNodes);\n  galois::do_all(\n      galois::iterate(size_t{0}, numGraphNodes),\n      [&](size_t nodeID) {\n        size_t nodeDegree =\n            std::distance(initial.edgeBegin(nodeID), initial.edgeEnd(nodeID));\n        dnPairs[nodeID] = DegreeNodePair(nodeDegree, nodeID);\n      },\n      galois::loopname(\"CreateDegreeNodeVector\"));\n\n  galois::StatTimer degSortTimer(\"DegreeSortTimer\");\n  degSortTimer.start();\n  // sort by degree (first item)\n  galois::ParallelSTL::sort(dnPairs.begin(), dnPairs.end(),\n                            std::greater<DegreeNodePair>());\n  degSortTimer.stop();\n\n  // create mapping, get degrees out to another vector to get prefix sum\n  std::vector<uint32_t> oldToNewMapping(numGraphNodes);\n  std::vector<uint64_t> inProgressPrefixSum(numGraphNodes);\n  galois::do_all(\n      galois::iterate(size_t{0}, numGraphNodes),\n      [&](size_t index) {\n        // save degree, which is pair.first\n        inProgressPrefixSum[index] = dnPairs[index].first;\n        // save mapping; original index is in .second, map it to current index\n        oldToNewMapping[dnPairs[index].second] = index;\n      
},\n      galois::loopname(\"CreateRemappingGetPrefixSum\"));\n\n  std::vector<uint64_t> newPrefixSum(numGraphNodes);\n  galois::ParallelSTL::partial_sum(inProgressPrefixSum.begin(),\n                                   inProgressPrefixSum.end(),\n                                   newPrefixSum.begin());\n\n  // allocate graph\n  graph.allocateFrom(numGraphNodes, initial.sizeEdges());\n  // construct nodes\n  graph.constructNodes();\n  // set edge endpoints using prefix sum\n  galois::do_all(\n      galois::iterate(size_t{0}, numGraphNodes),\n      [&](size_t nodeIndex) {\n        graph.fixEndEdge(nodeIndex, newPrefixSum[nodeIndex]);\n      },\n      galois::loopname(\"SetEdgeEndpoints\"));\n\n  // construct edges by looping through filegraph and saving to correct\n  // locations\n  galois::do_all(\n      galois::iterate(0u, initial.size()),\n      [&](uint32_t oldNodeID) {\n        uint32_t newIndex = oldToNewMapping[oldNodeID];\n\n        // get the start location of this reindex'd nodes edges\n        uint64_t currentEdgeIndex;\n        if (newIndex != 0) {\n          currentEdgeIndex = newPrefixSum[newIndex - 1];\n        } else {\n          currentEdgeIndex = 0;\n        }\n\n        // construct the graph, reindexing as it goes along\n        for (auto e = initial.edgeBegin(oldNodeID);\n             e < initial.edgeEnd(oldNodeID); e++) {\n          // get destination, reindex\n          uint32_t oldEdgeDst       = initial.edgeDestination(*e);\n          uint32_t reindexedEdgeDst = oldToNewMapping[oldEdgeDst];\n\n          // construct edge\n          graph.constructEdge(currentEdgeIndex, reindexedEdgeDst);\n          currentEdgeIndex++;\n        }\n        // this assert makes sure reindex was correct + makes sure all edges\n        // are accounted for\n        assert(currentEdgeIndex == newPrefixSum[newIndex]);\n      },\n      galois::steal(), galois::loopname(\"ReindexingGraph\"));\n\n  galois::StatTimer edgeSortTimer(\"EdgeSortTimer\");\n  
edgeSortTimer.start();\n  // sort by destinations\n  graph.sortAllEdgesByDst();\n  edgeSortTimer.stop();\n\n  // initialize local ranges\n  graph.initializeLocalRanges();\n\n  Trelabel.stop();\n}\n\nvoid readGraph(Graph& graph) {\n  galois::StatTimer autoAlgoTimer(\"AutoAlgo_0\");\n  if (!relabel) {\n    galois::graphs::FileGraph degreeGraph;\n    degreeGraph.fromFile(inputFile);\n    degreeGraph.initNodeDegrees();\n    autoAlgoTimer.start();\n    relabel = isApproximateDegreeDistributionPowerLaw(degreeGraph);\n    autoAlgoTimer.stop();\n  }\n  if (relabel) {\n    galois::gInfo(\"Relabeling and sorting graph...\");\n    makeSortedGraph(graph);\n  } else {\n    galois::graphs::readGraph(graph, inputFile);\n    // algorithm correctness requires sorting edges by destination\n    graph.sortAllEdgesByDst();\n  }\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, nullptr, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  Graph graph;\n\n  galois::StatTimer initialTime(\"GraphReadingTime\");\n  initialTime.start();\n  readGraph(graph);\n  initialTime.stop();\n\n  galois::preAlloc(numThreads + 16 * (graph.size() + graph.sizeEdges()) /\n                                    galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  galois::gInfo(\"Starting triangle counting...\");\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  // case by case preAlloc to avoid allocating unnecessarily\n  switch (algo) {\n  case nodeiterator:\n    nodeIteratingAlgo(graph);\n    break;\n\n  case edgeiterator:\n    edgeIteratingAlgo(graph);\n    break;\n\n  case orderedCount:\n    orderedCountAlgo(graph);\n    break;\n\n  
default:\n    std::cerr << \"Unknown algo: \" << algo << \"\\n\";\n  }\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/CMakeLists.txt",
    "content": "include_directories(\"${PROJECT_SOURCE_DIR}/libgluon/include\")\n\nadd_subdirectory(betweennesscentrality)\nadd_subdirectory(bfs)\nadd_subdirectory(connected-components)\nadd_subdirectory(k-core)\nadd_subdirectory(pagerank)\nadd_subdirectory(partition)\nadd_subdirectory(matrixcompletion)\nadd_subdirectory(sssp)\nadd_subdirectory(triangle-counting)\n"
  },
  {
    "path": "lonestar/analytics/distributed/README.md",
    "content": "Overview of Distributed and Heterogeneous Systems in Galois\n================================================================================\n\nThis directory contains benchmarks that run using D-Galois and D-IrGL.\n\nD-Galois is distributed Galois built using the Gluon communication substrate.\nSimilarly, D-IrGL is distributed IrGL built using Gluon.\nGluon is just the communication substrate: it is not a standalone system.\n\nBasic Compiling Through CMake (Distributed and Heterogeneous Galois)\n================================================================================\n\nThe dependencies for distributed Galois are exactly the same as shared-memory\nGalois except that it requires an MPI library (e.g. mpich2) to be on the\nsystem as well.\n\nTo build distributed/heterogeneous Galois, certain CMake flags must be\nspecified.\n\nFor distributed Galois, i.e. D-Galois:\n\n`cmake ${GALOIS_ROOT} -DGALOIS_ENABLE_DIST=1`\n\nFor distributed and heterogeneous Galois, i.e. D-IrGL:\n\n`cmake ${GALOIS_ROOT} -DGALOIS_ENABLE_DIST=1 -DGALOIS_CUDA_CAPABILITY=<insert CUDA capability here>`\n\nThe CUDA capability should be one that your GPU supports. 
For example, if you\nwanted to use a GTX 1080 and a K80, the command would look like this:\n\n`cmake ${GALOIS_ROOT} -DGALOIS_ENABLE_DIST=1 -DGALOIS_CUDA_CAPABILITY=\"3.7;6.1\"`\n\nNote that heterogeneous Galois requires CUDA 8.0 or above and a compiler\nthat is compatible with the CUDA version that you use.\n\nNote that heterogeneous Galois requires the cub and moderngpu git submodules, which can be cloned using the following commands.\n\n```Shell\ncd $GALOIS_ROOT\ngit submodule init\ngit submodule update \n```\nThese modules will be cloned in the ${GALOIS\\_ROOT}/external directory.\n\nCompiling with distributed Galois will add the `distributed` directory under\n`lonestar/analytics` to the build folder.\n\nCompiling Provided Apps\n================================================================================\n\nOnce CMake has completed successfully, you can build the provided apps with the\nfollowing command in the lonestar/analytics/distributed directory.\n\n`make -j`\n\nYou can compile specific apps by going to their directories and running make.\n\nRunning Provided Apps\n================================================================================\n\nYou can learn how to run compiled applications by running them with the -help\ncommand line option:\n\n`./bfs-push -help`\n\nMost of the provided graph applications take graphs in a .gr format, which\nis a Galois graph format that stores the graph in a CSR or CSC format. 
We\nprovide a graph converter tool under 'tools/graph-convert' that can take\nvarious graph formats and convert them to the Galois format.\n\nRunning Provided Apps (Distributed Apps)\n================================================================================\n\nFirst, note that if running multiple processes on a single machine (e.g.,\nsingle-host multi-GPU or multi-host multi-GPU where a process is spawned for\neach GPU), specifying `GALOIS_DO_NOT_BIND_THREADS=1` as an environment variable\nis crucial for performance.\n\nIf using MPI, multiple processes split across multiple hosts can be specified\nwith the following:\n\n`GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=<# of processes> -hosts=<machines to run on> ./bfs-push <input graph>`\n\nThe distributed applications have a few common command line flags that are\nworth noting. More details can be found by running a distributed application\nwith the -help flag.\n\n`-partition=<partitioning policy>`\n\nSpecifies the partitioning that you would like to use when splitting the graph\namong multiple hosts.\n\n`-exec=Sync,Async`\n\nSpecifies synchronous communication (bulk-synchronous parallel where every host\nblocks for messages from other hosts at the end of a round of execution)\nor asynchronous communication (bulk-asynchronous parallel where a host does\nnot have to block on messages from other hosts at the end of the round and\nmay continue execution).\n\n`-graphTranspose`\n\nSpecifies the transpose of the provided input graph. This is used to\ncreate certain partitions of the graph (and is required for some of the\npartitioning policies).\n\n`-runs`\n\nNumber of times to run an application.\n\n`-statFile`\n\nSpecify the file in which to output run statistics to.\n\n`-t`\n\nNumber of threads to use on a single machine excluding a communication thread\nthat is used by all of the provided distributed benchmarks. 
Note that\nGPUs only use 1 thread (excluding the communication thread).\n\n`-output` / `-outputLocation=<directory>`\n\nOutputs the result of running the application to a file. For example,\nspecifying this flag on a bfs application will output the shortest distances to\neach node.\n\nRunning Provided Apps (Distributed Heterogeneous Apps)\n================================================================================\n\nHeterogeneous apps have additional command line parameters:\n\n`-num_nodes=<num>`\n\nSpecifies the total number of PHYSICAL machines on the system. For example,\nyou could have 2 machines with 8 GPUs each for a total of 16 processes,\nbut you still would only have 2 machines. Therefore, you would use\n`-num_nodes=2`. Note that there **must** be one process per GPU in use.\n\n`-pset=<string>`\n\nSpecifies the architecture to run on on a single machine using \"c\" (CPUs) and\n\"g\" (GPUs). For example, if you have 2 machines with 8 GPUs each,\nbut you want to run with 3 GPUs on each machine, you would use `-pset=\"ggg\"`.\nTherefore, combined with `-num_nodes=2`, you would have a total of 6 units of\nexecution: 3 GPUs on 2 machines for a total of 6. 
This creates a total of\n6 processes across the 2 machines (1 for each GPU).\n\nAlso, it suffices to use only one \"c\" in pset to run on CPUs on your machines:\nyou can specify the number of cores/hyperthreads to use with the\naforementioned thread option `-t`.\n\nExamples for Running Provided Apps\n================================================================================\n\nTo run 3 processes all on a single machine, use the following:\n`GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=3 ./bfs_push rmat15.gr -graphTranspose=rmat15.tgr -t=4 -num_nodes=1 -partition=oec`\nNote: the command above is only correct if heterogeneous execution is enabled\nby specifying the CUDA capability (`GALOIS_CUDA_CAPABILITY`). When heterogeneous\nexecution is not enabled, `-num_nodes=1` is invalid and will not appear as an\noption, so it must be omitted.\n\nTo run on 3 CPUs on h1, h2, and h3, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./cc_push rmat15.sgr -symmetricGraph -t=1 -num_nodes=1 -partition=iec`\n\nTo run on 3 GPUs on a single machine, use the following:\n`GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=3 ./sssp_pull rmat15.gr -graphTranspose=rmat15.tgr -t=1 -num_nodes=1 -pset=\"ggg\" -partition=cvc`\n\nTo run on 4 GPUs on 2 machines h1 and h2 (each with 2 GPUs), use the following:\n`GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 -hosts=h1,h2 ./bfs_pull rmat15.gr -graphTranspose=rmat15.tgr -t=1 -num_nodes=2 -pset=\"gg\" -partition=cvc-iec`\nNote that the `-n` value passed to `mpirun` is 4 because there are a total of 4 execution units being used.\n\nTo run on 1 CPU and 1 GPU each on 2 machines h1 and h2, use the following:\n`GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 -hosts=h1,h2 ./pagerank_pull rmat15.gr -graphTranspose=rmat15.tgr -t=1 -num_nodes=2 -pset=\"cg\" -partition=oec`\n\nPerformance Considerations\n================================================================================\n\n* As mentioned above, if running multiple processes on a single machine,\n  
specifying `GALOIS_DO_NOT_BIND_THREADS=1` as an environment variable is\n  crucial for performance.\n\n* We have also observed `GALOIS_DO_NOT_BIND_THREADS=1` to improve\n  performance in a distributed setting as well (multiple hosts each with its\n  own process).\n\n* For 16 or fewer hosts/GPUs, we recommend using an\n  **edge-cut** partitioning policy (OEC or IEC) with **synchronous**\n  communication for performance.\n\n* For 32 or more hosts/GPUs, we recommend using the\n  **Cartesian vertex-cut** partitioning policy (CVC) with **asynchronous**\n  communication for performance.\n\nPublications Related to Distributed Applications\n================================================================================\n\nPlease see the publications listed below for information on the distributed\nruntime as well as performance studies we have conducted over the years.\n\nRoshan Dathathri, Gurbinder Gill, Loc Hoang, Hoang-Vu Dang, Alex Brooks,\nNikoli Dryden, Marc Snir, Keshav Pingali, “Gluon: A Communication-Optimizing\nSubstrate for Distributed Heterogeneous Graph Analytics,” Proceedings of the\n39th ACM SIGPLAN Conference on Programming Language Design and Implementation\n(PLDI), June 2018.\n\nGurbinder Gill, Roshan Dathathri, Loc Hoang, Andrew Lenharth, Keshav Pingali,\n“Abelian: A Compiler for Graph Analytics on Distributed, Heterogeneous\nPlatforms,” Proceedings of the 24th International European Conference on\nParallel and Distributed Computing (Euro-Par), August 2018.\n\nGurbinder Gill, Roshan Dathathri, Loc Hoang, Keshav Pingali, “A Study of\nPartitioning Policies for Graph Analytics on Large-scale Distributed\nPlatforms,” Proceedings of the 45th International Conference on Very Large Data\nBases (PVLDB), 12(4): 321-334, December 2018.\n\nLoc Hoang, Matteo Pontecorvi, Roshan Dathathri, Gurbinder Gill, Bozhi You,\nKeshav Pingali, Vijaya Ramachandran, “A Round-Efficient Distributed\nBetweenness Centrality Algorithm,” 
Proceedings of the 24th ACM SIGPLAN\nSymposium on Principles and Practice of Parallel Programming (PPoPP), February\n2019.\n\nRoshan Dathathri, Gurbinder Gill, Loc Hoang, Keshav Pingali, “Phoenix: A\nSubstrate for Resilient Distributed Graph Analytics,” Proceedings of the 24th\nACM International Conference on Architectural Support for Programming Languages\nand Operating Systems (ASPLOS), April 2019.\n\nLoc Hoang, Roshan Dathathri, Gurbinder Gill, Keshav Pingali, “CuSP: A\nCustomizable Streaming Edge Partitioner for Distributed Graph Analytics,”\nProceedings of the 33rd IEEE International Parallel and Distributed Processing\nSymposium (IPDPS), May 2019.\n\nLoc Hoang, Vishwesh Jatala, Xuhao Chen, Udit Agarwal, Roshan Dathathri,\nGurbinder Gill, Keshav Pingali, “DistTC: High Performance Distributed Triangle\nCounting,” Proceedings of the IEEE International Conference on High Performance\nExtreme Computing (HPEC), September 2019.\n\nRoshan Dathathri, Gurbinder Gill, Loc Hoang, Hoang-Vu Dang, Vishwesh Jatala, V.\nKrishna Nandivada, Marc Snir, Keshav Pingali, “Gluon-Async: A Bulk-Asynchronous\nSystem for Distributed and Heterogeneous Graph Analytics,” Proceedings of the\n28th IEEE International Conference on Parallel Architectures and Compilation\nTechniques (PACT), September 2019.\n\nVishwesh Jatala, Roshan Dathathri, Gurbinder Gill, Loc Hoang, V. Krishna\nNandivada, Keshav Pingali, “A Study of Graph Analytics for Massive Datasets on\nDistributed GPUs,” Proceedings of the 34th IEEE International Parallel and\nDistributed Processing Symposium (IPDPS), May 2020.\n\nBasic Use (Creating Your Own Applications)\n================================================================================\n\nYou can run the sample applications and make your own Galois programs directly\nin the build tree without installing anything. 
Just add a subdirectory under\ndistributed, copy a CMakeLists.txt file from another application to your new\napplication, and add the subdirectory to the CMakeLists in distributed.\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/CMakeLists.txt",
    "content": "app_dist(bc_level betweennesscentrality-level)\nadd_test_dist(betweennesscentrality-level-dist rmat15 NO_ASYNC ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -numOfSources=4)\n\napp_dist(bc_mr betweennesscentrality-minrounds NO_GPU)\nadd_test_dist(betweennesscentrality-minrounds-dist rmat15 NO_ASYNC NO_GPU ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -numOfSources=4 -numRoundSources=4)\nadd_test_dist(betweennesscentrality-minrounds-dist rmat15all NO_ASYNC NO_GPU NOT_QUICK ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -numRoundSources=4096)\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/README.md",
    "content": "Betweenness Centrality\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nbetweenesscentrality-level is a bulk synchronous parallel version of Brandes's\nBetweenness Centrality that does both the forward and backward phases of\nBrandes's algorithm in a level by level, work-efficient fashion. The algorithm\nsolves dependencies for a single source at a time.\n\nbetweenesscentrality-minrounds is a provably round efficient distributed\nalgorithm that can solve for betweenness centrality dependencies for multiple\nsources at a time. It leverages a proven insight that allows the algorithm to\nknow exactly which round that synchronization of source data needs to occur:\nthis results in communication only when necessary which further improves the\nalgorithms efficiency in the distributed setting. Details of the algorithm, the\nproofs of correctness, and performance comparisons can be found in our paper:\n\nLoc Hoang, Matteo Pontecorvi, Roshan Dathathri, Gurbinder Gill, Bozhi You,\nKeshav Pingali, Vijaya Ramachandran, “A Round-Efficient Distributed\nBetweenness Centrality Algorithm,” Proceedings of the 24th ACM SIGPLAN\nSymposium on Principles and Practice of Parallel Programming (PPoPP), February\n2019.\n\n\nINPUT\n--------------------------------------------------------------------------------\n\nTakes in Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/distributed/betweennesscentrality; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nAll the command line arguments used by both apps are the same except for\n`-numRoundSources`, which is used by minrounds to control the number of sources\nbeing batched at any given point.\n\nTo run solving for all sources, use the following:\n`./betweennesscentrality-level-dist <input-graph> -t=<num-threads>`\n\nTo run for the first n sources, use the following:\n`./betweennesscentrality-level-dist <input-graph> -t=<num-threads> -numOfSources=n`\n\nTo run using specified sources from a file, use the following:\n`./betweennesscentrality-level-dist <input-graph> -t=<num-threads> -sourcesToUse=<filename>`\n\nTo run on 3 hosts h1, h2, and h3 with a Cartesian vertex cut partition for all\nsources, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./betweennesscentrality-level-dist <input-graph> -t=<num-threads> -partition=cvc`\n\nTo run for all sources in batches of k on 3 hosts, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./betweennesscentrality-minrounds-dist <input-graph> -t=<num-threads> -numRoundSources=k`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* The minrounds implementation performs significantly better than the level\nimplementation on high diameter graphs as it batches multiple sources together\nat once and significantly reduces (1) rounds executed and (2) the communication\noverhead. \n\n* Batching more sources in minrounds is a tradeoff between memory usage\nand efficiency: more sources generally leads to fewer rounds executed but\nrequires a linear increase in memory used by the implementation to store data\nfor all of the sources being batched.\n\n* More details on the differences between level and minrounds can be found in\nour performance study in the MRBC paper cited above.\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/bc_level.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * This version of BC-Level uses an option in the synchronization runtime to\n * avoid the overheads of having 2 extra accumulator variables.\n */\n\n//#define BCDEBUG\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/gstl.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iomanip>\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"bc_level_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\n// type of the num shortest paths variable\nusing ShortPathType = double;\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"BC\";\n\n/******************************************************************************/\n/* Declaration of 
command line arguments */\n/******************************************************************************/\nnamespace cll = llvm::cl;\nstatic cll::opt<std::string>\n    sourcesToUse(\"sourcesToUse\",\n                 cll::desc(\"Whitespace separated list \"\n                           \"of sources in a file to \"\n                           \"use in BC (default empty)\"),\n                 cll::init(\"\"));\nstatic cll::opt<bool>\n    singleSourceBC(\"singleSource\",\n                   cll::desc(\"Use for single source BC (default off)\"),\n                   cll::init(false));\nstatic cll::opt<uint64_t>\n    startSource(\"startNode\",\n                cll::desc(\"Starting source node used for \"\n                          \"betweeness-centrality (default 0)\"),\n                cll::init(0));\nstatic cll::opt<unsigned int>\n    numberOfSources(\"numOfSources\",\n                    cll::desc(\"Number of sources to use for \"\n                              \"betweeness-centraility (default all)\"),\n                    cll::init(0));\n\n/******************************************************************************/\n/* Graph structure declarations */\n/******************************************************************************/\nconst uint32_t infinity          = std::numeric_limits<uint32_t>::max() / 4;\nstatic uint64_t current_src_node = 0;\n// global round numbers; 1 for forward, 1 for back; used in sync structs as well\nuint32_t globalRoundNumber = 0;\nuint32_t backRoundCount    = 0;\n\n// NOTE: types assume that these values will not reach uint64_t: it may\n// need to be changed for very large graphs\nstruct NodeData {\n  // SSSP vars\n  std::atomic<uint32_t> current_length;\n  // Betweeness centrality vars\n  std::atomic<ShortPathType> num_shortest_paths;\n  float dependency;\n  float betweeness_centrality;\n\n  //#ifdef BCDEBUG\n  void dump() {\n    galois::gPrint(\"DUMP: \", current_length.load(), \" \",\n                   num_shortest_paths.load(), 
\" \", dependency, \"\\n\");\n  }\n  //#endif\n};\n\n// reading in list of sources to operate on if provided\nstd::ifstream sourceFile;\nstd::vector<uint64_t> sourceVector;\n\nusing Graph = galois::graphs::DistGraph<NodeData, void>;\nusing GNode = typename Graph::GraphNode;\n\n// bitsets for tracking updates\ngalois::DynamicBitSet bitset_num_shortest_paths;\ngalois::DynamicBitSet bitset_current_length;\ngalois::DynamicBitSet bitset_dependency;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n// sync structures\n#include \"bc_level_sync.hh\"\n\n/******************************************************************************/\n/* Functors for running the algorithm */\n/******************************************************************************/\n\nstruct InitializeGraph {\n  Graph* graph;\n\n  InitializeGraph(Graph* _graph) : graph(_graph) {}\n\n  /* Initialize the graph */\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\n          syncSubstrate->get_run_identifier(\"InitializeGraph\"));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph_allNodes_cuda(cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          // pass in begin/end to not use local thread ranges\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          InitializeGraph{&_graph}, galois::no_stats(),\n          galois::loopname(\"InitializeGraph\"));\n    }\n  }\n\n  /* Functor passed into the Galois operator to carry out initialization;\n   * reset everything */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    src_data.betweeness_centrality = 0;\n    src_data.num_shortest_paths    = 0;\n    src_data.dependency            = 0;\n  
}\n};\n\n/* This is used to reset node data when switching to a difference source */\nstruct InitializeIteration {\n  const uint32_t& local_infinity;\n  const uint64_t& local_current_src_node;\n  Graph* graph;\n\n  InitializeIteration(const uint32_t& _local_infinity,\n                      const uint64_t& _local_current_src_node, Graph* _graph)\n      : local_infinity(_local_infinity),\n        local_current_src_node(_local_current_src_node), graph(_graph) {}\n\n  /* Reset necessary graph metadata for next iteration of SSSP */\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\n          syncSubstrate->get_run_identifier(\"InitializeIteration\"));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeIteration_allNodes_cuda(infinity, current_src_node, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          InitializeIteration{infinity, current_src_node, &_graph},\n          galois::loopname(syncSubstrate\n                               ->get_run_identifier(std::string(REGION_NAME) +\n                                                    \"_InitializeIteration\")\n                               .c_str()),\n          galois::no_stats());\n    }\n  }\n\n  /* Functor passed into the Galois operator to carry out reset of node data\n   * (aside from betweeness centrality measure */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    bool is_source = graph->getGID(src) == local_current_src_node;\n\n    if (!is_source) {\n      src_data.current_length     = local_infinity;\n      src_data.num_shortest_paths = 0;\n    } else {\n      src_data.current_length     = 0;\n      
src_data.num_shortest_paths = 1;\n    }\n    src_data.dependency = 0;\n  }\n};\n\n/**\n * Forward pass does level by level BFS to find distances and number of\n * shortest paths\n */\nstruct ForwardPass {\n  Graph* graph;\n  galois::DGAccumulator<uint32_t>& dga;\n  uint32_t local_r;\n\n  ForwardPass(Graph* _graph, galois::DGAccumulator<uint32_t>& _dga,\n              uint32_t roundNum)\n      : graph(_graph), dga(_dga), local_r(roundNum) {}\n\n  /**\n   * Level by level BFS while also finding number of shortest paths to a\n   * particular node in the BFS tree.\n   *\n   * @param _graph Graph to use\n   * @param _dga distributed accumulator\n   * @param[out] roundNumber Number of rounds taken to finish\n   */\n  void static go(Graph& _graph, galois::DGAccumulator<uint32_t>& _dga) {\n    globalRoundNumber          = 0;\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    bool moreThanOne = galois::runtime::getSystemNetworkInterface().Num > 1;\n\n    do {\n      _dga.reset();\n\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(syncSubstrate->get_run_identifier(\n            std::string(REGION_NAME) + \"_ForwardPass\"));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        unsigned int __retval = 0;\n        ForwardPass_nodesWithEdges_cuda(__retval, globalRoundNumber, cuda_ctx);\n        _dga += __retval;\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(nodesWithEdges),\n            ForwardPass(&_graph, _dga, globalRoundNumber),\n            galois::loopname(syncSubstrate\n                                 ->get_run_identifier(std::string(REGION_NAME) +\n                                                      \"_ForwardPass\")\n                                 .c_str()),\n            galois::steal(), galois::no_stats());\n      }\n\n      
// synchronize distances and shortest paths\n      // read any because a destination node without the correct distance\n      // may use a different distance (leading to incorrectness)\n      if (moreThanOne) {\n        syncSubstrate->sync<writeDestination, readAny,\n                            Reduce_min_current_length, Bitset_current_length>(\n            std::string(REGION_NAME) + \"_ForwardPass\");\n        syncSubstrate\n            ->sync<writeDestination, readSource, Reduce_add_num_shortest_paths,\n                   Bitset_num_shortest_paths>(std::string(REGION_NAME) +\n                                              \"_ForwardPass\");\n      }\n\n      globalRoundNumber++;\n    } while (_dga.reduce(syncSubstrate->get_run_identifier()));\n  }\n\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.current_length == local_r) {\n      for (auto current_edge : graph->edges(src)) {\n        GNode dst         = graph->getEdgeDst(current_edge);\n        auto& dst_data    = graph->getData(dst);\n        uint32_t new_dist = 1 + src_data.current_length;\n        uint32_t old = galois::atomicMin(dst_data.current_length, new_dist);\n\n        if (old > new_dist) {\n          // assert(dst_data.current_length == r + 1);\n          // assert(src_data.num_shortest_paths > 0);\n\n          bitset_current_length.set(dst);\n          double nsp = src_data.num_shortest_paths;\n          galois::atomicAdd(dst_data.num_shortest_paths, nsp);\n          bitset_num_shortest_paths.set(dst);\n\n          dga += 1;\n        } else if (old == new_dist) {\n          // assert(src_data.num_shortest_paths > 0);\n          // assert(dst_data.current_length == r + 1);\n\n          double nsp = src_data.num_shortest_paths;\n          galois::atomicAdd(dst_data.num_shortest_paths, nsp);\n          bitset_num_shortest_paths.set(dst);\n\n          dga += 1;\n        }\n      }\n    }\n  }\n};\n\n/**\n * Synchronize num shortest paths on 
destinations (should already\n * exist on all sources).\n */\nstruct MiddleSync {\n  Graph* graph;\n  const uint32_t local_infinity;\n\n  MiddleSync(Graph* _graph, const uint32_t li)\n      : graph(_graph), local_infinity(li){};\n\n  void static go(Graph& _graph, const uint32_t _li) {\n    // step only required if more than one host\n    if (galois::runtime::getSystemNetworkInterface().Num > 1) {\n      const auto& masters = _graph.masterNodesRange();\n\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(syncSubstrate->get_run_identifier(\"MiddleSync\"));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        MiddleSync_masterNodes_cuda(infinity, cuda_ctx);\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(masters.begin(), masters.end()),\n            MiddleSync(&_graph, _li),\n            galois::loopname(\n                syncSubstrate->get_run_identifier(\"MiddleSync\").c_str()),\n            galois::no_stats());\n      }\n\n      syncSubstrate->sync<writeSource, readAny, Reduce_set_num_shortest_paths>(\n          std::string(REGION_NAME) + \"_MiddleSync\");\n    }\n  }\n\n  /**\n   * Set node for sync if it has a non-zero distance\n   */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.current_length != local_infinity) {\n      bitset_num_shortest_paths.set(src);\n    }\n  }\n};\n\n/**\n * Propagate dependency backward by iterating backward over levels of BFS tree\n */\nstruct BackwardPass {\n  Graph* graph;\n  uint32_t local_r;\n\n  BackwardPass(Graph* _graph, uint32_t roundNum)\n      : graph(_graph), local_r(roundNum) {}\n\n  void static go(Graph& _graph, uint32_t roundNumber) {\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n    bool moreThanOne = 
galois::runtime::getSystemNetworkInterface().Num > 1;\n\n    backRoundCount = roundNumber - 1;\n\n    for (; backRoundCount > 0; backRoundCount--) {\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(syncSubstrate->get_run_identifier(\"BackwardPass\"));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        BackwardPass_nodesWithEdges_cuda(backRoundCount, cuda_ctx);\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(nodesWithEdges),\n            BackwardPass(&_graph, backRoundCount),\n            galois::loopname(syncSubstrate\n                                 ->get_run_identifier(std::string(REGION_NAME) +\n                                                      \"_BackwardPass\")\n                                 .c_str()),\n            galois::steal(), galois::no_stats());\n      }\n\n      if (moreThanOne) {\n        syncSubstrate->sync<writeSource, readDestination, Reduce_add_dependency,\n                            Bitset_dependency>(std::string(REGION_NAME) +\n                                               \"_BackwardPass\");\n      }\n    }\n  }\n\n  /**\n   * If on the correct level, calculate self-depndency by checking successor\n   * nodes.\n   */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.current_length == local_r) {\n      uint32_t dest_to_find = src_data.current_length + 1;\n      for (auto current_edge : graph->edges(src)) {\n        GNode dst      = graph->getEdgeDst(current_edge);\n        auto& dst_data = graph->getData(dst);\n\n        if (dest_to_find == dst_data.current_length) {\n          float contrib =\n              ((float)1 + dst_data.dependency) / dst_data.num_shortest_paths;\n          src_data.dependency = src_data.dependency + contrib;\n          
bitset_dependency.set(src);\n        }\n      }\n      src_data.dependency *= src_data.num_shortest_paths;\n    }\n  }\n};\n\nstruct BC {\n  Graph* graph;\n\n  BC(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<uint32_t>& dga) {\n    globalRoundNumber = 0;\n    // reset the graph aside from the between-cent measure\n    InitializeIteration::go(_graph);\n    // get distances and num paths\n    ForwardPass::go(_graph, dga);\n\n    // dependency calc only matters if there's a node with distance at\n    // least 2\n    if (globalRoundNumber > 2) {\n      MiddleSync::go(_graph, infinity);\n      BackwardPass::go(_graph, globalRoundNumber - 1);\n\n      const auto& masters = _graph.masterNodesRange();\n      // finally, since dependencies are finalized for this round at this\n      // point, add them to the betweeness centrality measure on each node\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(syncSubstrate->get_run_identifier(\"BC\"));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        BC_masterNodes_cuda(cuda_ctx);\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(masters.begin(), masters.end()), BC(&_graph),\n            galois::no_stats(),\n            galois::loopname(\n                syncSubstrate->get_run_identifier(std::string(REGION_NAME))\n                    .c_str()));\n      }\n    }\n  }\n\n  /**\n   * Adds dependency measure to BC measure\n   */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.dependency > 0) {\n      src_data.betweeness_centrality += src_data.dependency;\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check 
*/\n/******************************************************************************/\n\nstruct Sanity {\n  Graph* graph;\n\n  galois::DGReduceMax<float>& DGAccumulator_max;\n  galois::DGReduceMin<float>& DGAccumulator_min;\n  galois::DGAccumulator<float>& DGAccumulator_sum;\n\n  Sanity(Graph* _graph, galois::DGReduceMax<float>& _DGAccumulator_max,\n         galois::DGReduceMin<float>& _DGAccumulator_min,\n         galois::DGAccumulator<float>& _DGAccumulator_sum)\n      : graph(_graph), DGAccumulator_max(_DGAccumulator_max),\n        DGAccumulator_min(_DGAccumulator_min),\n        DGAccumulator_sum(_DGAccumulator_sum) {}\n\n  void static go(Graph& _graph, galois::DGReduceMax<float>& DGA_max,\n                 galois::DGReduceMin<float>& DGA_min,\n                 galois::DGAccumulator<float>& DGA_sum) {\n\n    DGA_max.reset();\n    DGA_min.reset();\n    DGA_sum.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      // std::string impl_str(syncSubstrate->get_run_identifier(\"Sanity\"));\n      std::string impl_str(\"Sanity\");\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      float sum, max, min;\n      Sanity_masterNodes_cuda(sum, max, min, cuda_ctx);\n      DGA_sum += sum;\n      DGA_max.update(max);\n      DGA_min.update(min);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     Sanity(&_graph, DGA_max, DGA_min, DGA_sum),\n                     galois::no_stats(), galois::loopname(\"Sanity\"));\n    }\n\n    float max_bc = DGA_max.reduce();\n    float min_bc = DGA_min.reduce();\n    float bc_sum = DGA_sum.reduce();\n\n    // Only node 0 will print data\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Max BC is \", max_bc, 
\"\\n\");\n      galois::gPrint(\"Min BC is \", min_bc, \"\\n\");\n      galois::gPrint(\"BC sum is \", bc_sum, \"\\n\");\n    }\n  }\n\n  /* Gets the max, min rank from all owned nodes and\n   * also the sum of ranks */\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n\n    DGAccumulator_max.update(sdata.betweeness_centrality);\n    DGAccumulator_min.update(sdata.betweeness_centrality);\n    DGAccumulator_sum += sdata.betweeness_centrality;\n  }\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<float> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<float> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).betweeness_centrality);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<float> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<float> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_betweeness_centrality_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<float> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<float> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main method for running */\n/******************************************************************************/\n\nconstexpr static const char* const name =\n    \"Betweeness Centrality Level by Level\";\nconstexpr static const char* const desc =\n    \"Betweeness Centrality Level by Level on Distributed Galois.\";\nconstexpr 
static const char* const url = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  std::unique_ptr<Graph> h_graph;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(h_graph, syncSubstrate) =\n      distGraphInitialization<NodeData, void>(&cuda_ctx);\n#else\n  std::tie(h_graph, syncSubstrate) = distGraphInitialization<NodeData, void>();\n#endif\n\n  if (!sourcesToUse.empty()) {\n    sourceFile.open(sourcesToUse);\n    std::vector<uint64_t> t(std::istream_iterator<uint64_t>{sourceFile},\n                            std::istream_iterator<uint64_t>{});\n    sourceVector = t;\n    sourceFile.close();\n  }\n\n  bitset_num_shortest_paths.resize(h_graph->size());\n  bitset_current_length.resize(h_graph->size());\n  bitset_dependency.resize(h_graph->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n\n  InitializeGraph::go((*h_graph));\n  galois::runtime::getHostBarrier().wait();\n\n  // shared DG accumulator among all steps\n  galois::DGAccumulator<uint32_t> dga;\n\n  // sanity dg accumulators\n  galois::DGReduceMax<float> dga_max;\n  galois::DGReduceMin<float> dga_min;\n  galois::DGAccumulator<float> dga_sum;\n\n  galois::runtime::reportStat_Single(std::string(REGION_NAME),\n                                     std::string(\"NumSources\"),\n                                     (unsigned int)numberOfSources);\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] BC::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n\n    uint64_t loop_end = 1;\n    bool sSources     = false;\n\n    if (!singleSourceBC) {\n      if (!numberOfSources) {\n        loop_end = h_graph->globalSize();\n      } else {\n        loop_end = numberOfSources;\n      }\n\n      
// if provided a file of sources to work with, use that\n      if (!sourceVector.empty()) {\n        if (loop_end > sourceVector.size()) {\n          loop_end = sourceVector.size();\n        }\n        sSources = true;\n      }\n    }\n\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    for (uint64_t i = 0; i < loop_end; i++) {\n      if (singleSourceBC) {\n        // only 1 source; specified start source in command line\n        assert(loop_end == 1);\n        galois::gDebug(\"This is single source node BC\");\n        current_src_node = startSource;\n      } else if (sSources) {\n        current_src_node = sourceVector[i];\n      } else {\n        // all sources\n        current_src_node = i;\n      }\n\n      globalRoundNumber = 0;\n      backRoundCount    = 0;\n\n      StatTimer_main.start();\n      BC::go(*h_graph, dga);\n      StatTimer_main.stop();\n\n      // Round reporting\n      if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n        galois::runtime::reportStat_Single(\n            REGION_NAME, syncSubstrate->get_run_identifier(\"NumRounds\", i),\n            globalRoundNumber);\n        uint32_t backRounds;\n        if (globalRoundNumber > 2) {\n          backRounds = globalRoundNumber - 2;\n        } else {\n          backRounds = 0;\n        }\n        galois::runtime::reportStat_Single(\n            REGION_NAME,\n            syncSubstrate->get_run_identifier(\"NumForwardRounds\", i),\n            globalRoundNumber);\n        galois::runtime::reportStat_Single(\n            REGION_NAME, syncSubstrate->get_run_identifier(\"NumBackRounds\", i),\n            backRounds);\n        galois::runtime::reportStat_Single(\n            REGION_NAME, std::string(\"TotalRounds_\") + std::to_string(run),\n            globalRoundNumber + backRounds);\n      }\n    }\n\n    Sanity::go(*h_graph, dga_max, dga_min, dga_sum);\n\n    // re-init graph for next run\n    if ((run + 1) != numRuns) {\n      
galois::runtime::getHostBarrier().wait();\n      (*syncSubstrate).set_num_run(run + 1);\n\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_num_shortest_paths_reset_cuda(cuda_ctx);\n        bitset_current_length_reset_cuda(cuda_ctx);\n        bitset_dependency_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        bitset_num_shortest_paths.reset();\n        bitset_current_length.reset();\n        bitset_dependency.reset();\n      }\n\n      InitializeGraph::go(*h_graph);\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<float> results = makeResults(h_graph);\n    auto globalIDs             = h_graph->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"betweenness_centrality\", results.data(),\n                results.size(), globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/bc_level_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=False $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nbool enable_lb = false;\n#include \"bc_level_cuda.cuh\"\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, float * p_betweeness_centrality, float * p_dependency, ShortPathType * p_num_shortest_paths)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_betweeness_centrality[src] = 0;\n      p_num_shortest_paths[src]    = 0;\n      p_dependency[src]            = 0;\n    }\n  }\n  // FP: \"9 -> 10;\n}\n__global__ void InitializeIteration(CSRGraph graph, unsigned int __begin, unsigned int __end, const uint64_t  local_current_src_node, const uint32_t  local_infinity, uint32_t * p_current_length, float * p_dependency, ShortPathType * p_num_shortest_paths)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  bool is_source;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < 
src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      is_source = graph.node_data[src] == local_current_src_node;\n      if (!is_source)\n      {\n        p_current_length[src]     = local_infinity;\n        p_num_shortest_paths[src] = 0;\n      }\n      else\n      {\n        p_current_length[src]     = 0;\n        p_num_shortest_paths[src] = 1;\n      }\n      p_dependency[src]       = 0;\n    }\n  }\n  // FP: \"15 -> 16;\n}\n__global__ void ForwardPass(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_r, uint32_t * p_current_length, ShortPathType * p_num_shortest_paths, DynamicBitset& bitset_current_length, DynamicBitset& bitset_num_shortest_paths, HGAccumulator<uint32_t> dga)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint32_t, TB_SIZE>::TempStorage dga_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  dga.thread_entry();\n  // FP: \"3 -> 4;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    index_type current_edge_end;\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_current_length[src] == local_r)\n      {\n      }\n      else\n      {\n        pop = false;\n      }\n    }\n    if (!pop)\n    {\n      continue;\n    }\n    current_edge_end = (graph).getFirstEdge((src) + 1);\n    for (index_type current_edge = (graph).getFirstEdge(src) + 0; current_edge < current_edge_end; current_edge += 1)\n    {\n      index_type dst;\n      uint32_t new_dist;\n      uint32_t old;\n      dst = graph.getAbsDestination(current_edge);\n      new_dist = 1 + p_current_length[src];\n      old = atomicTestMin(&p_current_length[dst], new_dist);\n      if (old > new_dist)\n      {\n        double nsp;\n        bitset_current_length.set(dst);\n        nsp = p_num_shortest_paths[src];\n        atomicTestAdd(&p_num_shortest_paths[dst], 
nsp);\n        bitset_num_shortest_paths.set(dst);\n        dga.reduce( 1);\n      }\n      else\n      {\n        if (old == new_dist)\n        {\n          double nsp;\n          nsp = p_num_shortest_paths[src];\n          atomicTestAdd(&p_num_shortest_paths[dst], nsp);\n          bitset_num_shortest_paths.set(dst);\n          dga.reduce( 1);\n        }\n      }\n    }\n  }\n  // FP: \"37 -> 38;\n  dga.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(dga_ts);\n  // FP: \"38 -> 39;\n}\n__global__ void MiddleSync(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_infinity, uint32_t * p_current_length, DynamicBitset& bitset_num_shortest_paths)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_current_length[src] != local_infinity)\n      {\n        bitset_num_shortest_paths.set(src);\n      }\n    }\n  }\n  // FP: \"9 -> 10;\n}\n__global__ void BackwardPass(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_r, uint32_t * p_current_length, float * p_dependency, ShortPathType * p_num_shortest_paths, DynamicBitset& bitset_dependency)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  uint32_t dest_to_find;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    index_type current_edge_end;\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_current_length[src] == local_r)\n      {\n        dest_to_find = p_current_length[src] + 1;\n      }\n      else\n      {\n        pop = false;\n      }\n    }\n    if (!pop)\n    {\n      continue;\n    }\n    current_edge_end = 
(graph).getFirstEdge((src) + 1);\n    for (index_type current_edge = (graph).getFirstEdge(src) + 0; current_edge < current_edge_end; current_edge += 1)\n    {\n      index_type dst;\n      dst = graph.getAbsDestination(current_edge);\n      if (dest_to_find == p_current_length[dst])\n      {\n        float contrib;\n        contrib = ((float)1 + p_dependency[dst]) / p_num_shortest_paths[dst];\n        p_dependency[src] = p_dependency[src] + contrib;\n        bitset_dependency.set(src);\n      }\n    }\n    p_dependency[src] *= p_num_shortest_paths[src];\n  }\n  // FP: \"25 -> 26;\n}\n__global__ void BC(CSRGraph graph, unsigned int __begin, unsigned int __end, float * p_betweeness_centrality, float * p_dependency)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_dependency[src] > 0)\n      {\n        p_betweeness_centrality[src] += p_dependency[src];\n      }\n    }\n  }\n  // FP: \"9 -> 10;\n}\n__global__ void Sanity(CSRGraph graph, unsigned int __begin, unsigned int __end, float * p_betweeness_centrality, HGAccumulator<float> DGAccumulator_sum, HGReduceMax<float> DGAccumulator_max, HGReduceMin<float> DGAccumulator_min)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage DGAccumulator_sum_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage DGAccumulator_max_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage DGAccumulator_min_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  DGAccumulator_sum.thread_entry();\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  DGAccumulator_max.thread_entry();\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  
DGAccumulator_min.thread_entry();\n  // FP: \"7 -> 8;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      DGAccumulator_max.reduce(p_betweeness_centrality[src]);\n      DGAccumulator_min.reduce(p_betweeness_centrality[src]);\n      DGAccumulator_sum.reduce( p_betweeness_centrality[src]);\n    }\n  }\n  // FP: \"15 -> 16;\n  DGAccumulator_sum.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_ts);\n  // FP: \"16 -> 17;\n  DGAccumulator_max.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_max_ts);\n  // FP: \"17 -> 18;\n  DGAccumulator_min.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_min_ts);\n  // FP: \"18 -> 19;\n}\nvoid InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->betweeness_centrality.data.gpu_wr_ptr(), ctx->dependency.data.gpu_wr_ptr(), ctx->num_shortest_paths.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeIteration_cuda(unsigned int  __begin, unsigned int  __end, const uint32_t & local_infinity, const uint64_t & local_current_src_node, struct CUDA_Context*  
ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeIteration <<<blocks, threads>>>(ctx->gg, __begin, __end, local_current_src_node, local_infinity, ctx->current_length.data.gpu_wr_ptr(), ctx->dependency.data.gpu_wr_ptr(), ctx->num_shortest_paths.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeIteration_allNodes_cuda(const uint32_t & local_infinity, const uint64_t & local_current_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeIteration_cuda(0, ctx->gg.nnodes, local_infinity, local_current_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeIteration_masterNodes_cuda(const uint32_t & local_infinity, const uint64_t & local_current_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeIteration_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_current_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeIteration_nodesWithEdges_cuda(const uint32_t & local_infinity, const uint64_t & local_current_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeIteration_cuda(0, ctx->numNodesWithEdges, local_infinity, local_current_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ForwardPass_cuda(unsigned int  __begin, unsigned int  __end, uint32_t & dga, uint32_t local_r, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint32_t> _dga;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint32_t> dgaval  = Shared<uint32_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(dgaval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _dga.rv = dgaval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  ForwardPass <<<blocks, threads>>>(ctx->gg, __begin, __end, local_r, ctx->current_length.data.gpu_wr_ptr(), 
ctx->num_shortest_paths.data.gpu_wr_ptr(), *(ctx->current_length.is_updated.gpu_rd_ptr()), *(ctx->num_shortest_paths.is_updated.gpu_rd_ptr()), _dga);\n  cudaDeviceSynchronize();\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  dga = *(dgaval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid ForwardPass_allNodes_cuda(uint32_t & dga, uint32_t local_r, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ForwardPass_cuda(0, ctx->gg.nnodes, dga, local_r, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ForwardPass_masterNodes_cuda(uint32_t & dga, uint32_t local_r, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ForwardPass_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, dga, local_r, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ForwardPass_nodesWithEdges_cuda(uint32_t & dga, uint32_t local_r, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ForwardPass_cuda(0, ctx->numNodesWithEdges, dga, local_r, ctx);\n  // FP: \"2 -> 3;\n}\nvoid MiddleSync_cuda(unsigned int  __begin, unsigned int  __end, const uint32_t local_infinity, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  MiddleSync <<<blocks, threads>>>(ctx->gg, __begin, __end, local_infinity, ctx->current_length.data.gpu_wr_ptr(), *(ctx->num_shortest_paths.is_updated.gpu_rd_ptr()));\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid MiddleSync_allNodes_cuda(const uint32_t local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  MiddleSync_cuda(0, ctx->gg.nnodes, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid MiddleSync_masterNodes_cuda(const uint32_t local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  MiddleSync_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid MiddleSync_nodesWithEdges_cuda(const uint32_t local_infinity, struct CUDA_Context*  
ctx)\n{\n  // FP: \"1 -> 2;\n  MiddleSync_cuda(0, ctx->numNodesWithEdges, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BackwardPass_cuda(unsigned int  __begin, unsigned int  __end, uint32_t local_r, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  BackwardPass <<<blocks, threads>>>(ctx->gg, __begin, __end, local_r, ctx->current_length.data.gpu_wr_ptr(), ctx->dependency.data.gpu_wr_ptr(), ctx->num_shortest_paths.data.gpu_wr_ptr(), *(ctx->dependency.is_updated.gpu_rd_ptr()));\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid BackwardPass_allNodes_cuda(uint32_t local_r, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BackwardPass_cuda(0, ctx->gg.nnodes, local_r, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BackwardPass_masterNodes_cuda(uint32_t local_r, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BackwardPass_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_r, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BackwardPass_nodesWithEdges_cuda(uint32_t local_r, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BackwardPass_cuda(0, ctx->numNodesWithEdges, local_r, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BC_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  BC <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->betweeness_centrality.data.gpu_wr_ptr(), ctx->dependency.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid BC_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BC_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BC_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BC_cuda(ctx->beginMaster, 
ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BC_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BC_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid Sanity_cuda(unsigned int  __begin, unsigned int  __end, float & DGAccumulator_sum, float & DGAccumulator_max, float & DGAccumulator_min, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<float> _DGAccumulator_sum;\n  HGReduceMax<float> _DGAccumulator_max;\n  HGReduceMin<float> _DGAccumulator_min;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<float> DGAccumulator_sumval  = Shared<float>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(DGAccumulator_sumval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  Shared<float> DGAccumulator_maxval  = Shared<float>(1);\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  *(DGAccumulator_maxval.cpu_wr_ptr()) = 0;\n  // FP: \"11 -> 12;\n  _DGAccumulator_max.rv = DGAccumulator_maxval.gpu_wr_ptr();\n  // FP: \"12 -> 13;\n  Shared<float> DGAccumulator_minval  = Shared<float>(1);\n  // FP: \"13 -> 14;\n  // FP: \"14 -> 15;\n  *(DGAccumulator_minval.cpu_wr_ptr()) = 1073741823;\n  // FP: \"15 -> 16;\n  _DGAccumulator_min.rv = DGAccumulator_minval.gpu_wr_ptr();\n  // FP: \"16 -> 17;\n  Sanity <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->betweeness_centrality.data.gpu_wr_ptr(), _DGAccumulator_sum, _DGAccumulator_max, _DGAccumulator_min);\n  cudaDeviceSynchronize();\n  // FP: \"17 -> 18;\n  check_cuda_kernel;\n  // FP: \"18 -> 19;\n  DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr());\n  // FP: \"19 -> 20;\n  DGAccumulator_max = *(DGAccumulator_maxval.cpu_rd_ptr());\n  // FP: \"20 -> 21;\n  DGAccumulator_min = *(DGAccumulator_minval.cpu_rd_ptr());\n  // FP: \"21 -> 22;\n}\nvoid Sanity_allNodes_cuda(float & DGAccumulator_sum, float & 
DGAccumulator_max, float & DGAccumulator_min, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  Sanity_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, DGAccumulator_max, DGAccumulator_min, ctx);\n  // FP: \"2 -> 3;\n}\nvoid Sanity_masterNodes_cuda(float & DGAccumulator_sum, float & DGAccumulator_max, float & DGAccumulator_min, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  Sanity_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, DGAccumulator_max, DGAccumulator_min, ctx);\n  // FP: \"2 -> 3;\n}\nvoid Sanity_nodesWithEdges_cuda(float & DGAccumulator_sum, float & DGAccumulator_max, float & DGAccumulator_min, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  Sanity_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, DGAccumulator_max, DGAccumulator_min, ctx);\n  // FP: \"2 -> 3;\n}"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/bc_level_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"bc_level_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<float> betweeness_centrality;\n\tstruct CUDA_Context_Field<uint32_t> current_length;\n\tstruct CUDA_Context_Field<float> dependency;\n\tstruct CUDA_Context_Field<ShortPathType> num_shortest_paths;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->betweeness_centrality, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->current_length, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->dependency, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->num_shortest_paths, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->betweeness_centrality, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->current_length, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->dependency, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->num_shortest_paths, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->betweeness_centrality.data.zero_gpu();\n\tctx->current_length.data.zero_gpu();\n\tctx->dependency.data.zero_gpu();\n\tctx->num_shortest_paths.data.zero_gpu();\n}\n\nvoid get_bitset_betweeness_centrality_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) 
{\n\tctx->betweeness_centrality.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_betweeness_centrality_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->betweeness_centrality.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_betweeness_centrality_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->betweeness_centrality, begin, end);\n}\n\nfloat get_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tfloat *betweeness_centrality = ctx->betweeness_centrality.data.cpu_rd_ptr();\n\treturn betweeness_centrality[LID];\n}\n\nvoid set_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *betweeness_centrality = ctx->betweeness_centrality.data.cpu_wr_ptr();\n\tbetweeness_centrality[LID] = v;\n}\n\nvoid add_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *betweeness_centrality = ctx->betweeness_centrality.data.cpu_wr_ptr();\n\tbetweeness_centrality[LID] += v;\n}\n\nbool min_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *betweeness_centrality = ctx->betweeness_centrality.data.cpu_wr_ptr();\n\tif (betweeness_centrality[LID] > v){\n\t\tbetweeness_centrality[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->betweeness_centrality, from_id, v);\n}\n\nvoid batch_get_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->betweeness_centrality, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, 
&ctx->betweeness_centrality, from_id, v);\n}\n\nvoid batch_get_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->betweeness_centrality, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->betweeness_centrality, from_id, v, i);\n}\n\nvoid batch_get_reset_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->betweeness_centrality, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, setOp>(ctx, &ctx->betweeness_centrality, from_id, v, data_mode);\n}\n\nvoid batch_set_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, setOp>(ctx, &ctx->betweeness_centrality, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, addOp>(ctx, &ctx->betweeness_centrality, from_id, v, data_mode);\n}\n\nvoid batch_add_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, addOp>(ctx, &ctx->betweeness_centrality, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) 
{\n\tbatch_set_shared_field<float, sharedMirror, minOp>(ctx, &ctx->betweeness_centrality, from_id, v, data_mode);\n}\n\nvoid batch_min_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, minOp>(ctx, &ctx->betweeness_centrality, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, float v) {\n\treset_data_field<float>(&ctx->betweeness_centrality, begin, end, v);\n}\n\nvoid get_bitset_current_length_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->current_length.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_current_length_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->current_length.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_current_length_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->current_length, begin, end);\n}\n\nuint32_t get_node_current_length_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *current_length = ctx->current_length.data.cpu_rd_ptr();\n\treturn current_length[LID];\n}\n\nvoid set_node_current_length_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *current_length = ctx->current_length.data.cpu_wr_ptr();\n\tcurrent_length[LID] = v;\n}\n\nvoid add_node_current_length_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *current_length = ctx->current_length.data.cpu_wr_ptr();\n\tcurrent_length[LID] += v;\n}\n\nbool min_node_current_length_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *current_length = ctx->current_length.data.cpu_wr_ptr();\n\tif (current_length[LID] > v){\n\t\tcurrent_length[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, 
false>(ctx, &ctx->current_length, from_id, v);\n}\n\nvoid batch_get_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->current_length, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->current_length, from_id, v);\n}\n\nvoid batch_get_mirror_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->current_length, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->current_length, from_id, v, i);\n}\n\nvoid batch_get_reset_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->current_length, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->current_length, from_id, v, data_mode);\n}\n\nvoid batch_set_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->current_length, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->current_length, from_id, v, 
data_mode);\n}\n\nvoid batch_add_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->current_length, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->current_length, from_id, v, data_mode);\n}\n\nvoid batch_min_node_current_length_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->current_length, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_current_length_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->current_length, begin, end, v);\n}\n\nvoid get_bitset_dependency_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->dependency.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_dependency_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->dependency.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_dependency_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->dependency, begin, end);\n}\n\nfloat get_node_dependency_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tfloat *dependency = ctx->dependency.data.cpu_rd_ptr();\n\treturn dependency[LID];\n}\n\nvoid set_node_dependency_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *dependency = ctx->dependency.data.cpu_wr_ptr();\n\tdependency[LID] = v;\n}\n\nvoid add_node_dependency_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *dependency = ctx->dependency.data.cpu_wr_ptr();\n\tdependency[LID] += v;\n}\n\nbool min_node_dependency_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *dependency = ctx->dependency.data.cpu_wr_ptr();\n\tif 
(dependency[LID] > v){\n\t\tdependency[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->dependency, from_id, v);\n}\n\nvoid batch_get_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->dependency, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->dependency, from_id, v);\n}\n\nvoid batch_get_mirror_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->dependency, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->dependency, from_id, v, i);\n}\n\nvoid batch_get_reset_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->dependency, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, setOp>(ctx, &ctx->dependency, from_id, v, data_mode);\n}\n\nvoid batch_set_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, setOp>(ctx, &ctx->dependency, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_dependency_cuda(struct CUDA_Context* ctx, unsigned 
from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, addOp>(ctx, &ctx->dependency, from_id, v, data_mode);\n}\n\nvoid batch_add_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, addOp>(ctx, &ctx->dependency, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, minOp>(ctx, &ctx->dependency, from_id, v, data_mode);\n}\n\nvoid batch_min_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, minOp>(ctx, &ctx->dependency, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_dependency_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, float v) {\n\treset_data_field<float>(&ctx->dependency, begin, end, v);\n}\n\nvoid get_bitset_num_shortest_paths_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->num_shortest_paths.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_num_shortest_paths_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->num_shortest_paths.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_num_shortest_paths_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->num_shortest_paths, begin, end);\n}\n\nShortPathType get_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tShortPathType *num_shortest_paths = ctx->num_shortest_paths.data.cpu_rd_ptr();\n\treturn num_shortest_paths[LID];\n}\n\nvoid set_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned LID, ShortPathType v) {\n\tShortPathType *num_shortest_paths = ctx->num_shortest_paths.data.cpu_wr_ptr();\n\tnum_shortest_paths[LID] = v;\n}\n\nvoid add_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned LID, 
ShortPathType v) {\n\tShortPathType *num_shortest_paths = ctx->num_shortest_paths.data.cpu_wr_ptr();\n\tnum_shortest_paths[LID] += v;\n}\n\nbool min_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned LID, ShortPathType v) {\n\tShortPathType *num_shortest_paths = ctx->num_shortest_paths.data.cpu_wr_ptr();\n\tif (num_shortest_paths[LID] > v){\n\t\tnum_shortest_paths[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<ShortPathType, sharedMaster, false>(ctx, &ctx->num_shortest_paths, from_id, v);\n}\n\nvoid batch_get_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<ShortPathType, sharedMaster, false>(ctx, &ctx->num_shortest_paths, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<ShortPathType, sharedMirror, false>(ctx, &ctx->num_shortest_paths, from_id, v);\n}\n\nvoid batch_get_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<ShortPathType, sharedMirror, false>(ctx, &ctx->num_shortest_paths, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, ShortPathType i) {\n\tbatch_get_shared_field<ShortPathType, sharedMirror, true>(ctx, &ctx->num_shortest_paths, from_id, v, i);\n}\n\nvoid batch_get_reset_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, ShortPathType i) {\n\tbatch_get_shared_field<ShortPathType, sharedMirror, true>(ctx, &ctx->num_shortest_paths, from_id, v, v_size, data_mode, i);\n}\n\nvoid 
batch_set_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<ShortPathType, sharedMirror, setOp>(ctx, &ctx->num_shortest_paths, from_id, v, data_mode);\n}\n\nvoid batch_set_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<ShortPathType, sharedMaster, setOp>(ctx, &ctx->num_shortest_paths, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<ShortPathType, sharedMirror, addOp>(ctx, &ctx->num_shortest_paths, from_id, v, data_mode);\n}\n\nvoid batch_add_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<ShortPathType, sharedMaster, addOp>(ctx, &ctx->num_shortest_paths, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<ShortPathType, sharedMirror, minOp>(ctx, &ctx->num_shortest_paths, from_id, v, data_mode);\n}\n\nvoid batch_min_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<ShortPathType, sharedMaster, minOp>(ctx, &ctx->num_shortest_paths, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, ShortPathType v) {\n\treset_data_field<ShortPathType>(&ctx->num_shortest_paths, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/bc_level_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\n// type of the num shortest paths variable\nusing ShortPathType = double;\n\nvoid get_bitset_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                           uint64_t* bitset_compute);\nvoid bitset_betweeness_centrality_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_betweeness_centrality_reset_cuda(struct CUDA_Context* ctx,\n                                             size_t begin, size_t end);\nfloat get_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                          unsigned LID);\nvoid set_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                         float v);\nvoid add_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                         float v);\nbool min_node_betweeness_centrality_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                         float v);\nvoid batch_get_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v);\nvoid batch_get_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               size_t* v_size,\n                                               DataCommMode* data_mode);\nvoid batch_get_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                                      unsigned from_id,\n                                                      uint8_t* v);\nvoid batch_get_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                                      unsigned from_id,\n                                                      uint8_t* v,\n                                 
                     size_t* v_size,\n                                                      DataCommMode* data_mode);\nvoid batch_get_reset_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                                     unsigned from_id,\n                                                     uint8_t* v, float i);\nvoid batch_get_reset_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                                     unsigned from_id,\n                                                     uint8_t* v, size_t* v_size,\n                                                     DataCommMode* data_mode,\n                                                     float i);\nvoid batch_set_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                                      unsigned from_id,\n                                                      uint8_t* v,\n                                                      DataCommMode data_mode);\nvoid batch_set_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_add_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                                      unsigned from_id,\n                                                      uint8_t* v,\n                                                      DataCommMode data_mode);\nvoid batch_add_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_min_mirror_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                                      unsigned from_id,\n                                                      uint8_t* v,\n   
                                                   DataCommMode data_mode);\nvoid batch_min_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_reset_node_betweeness_centrality_cuda(struct CUDA_Context* ctx,\n                                                 size_t begin, size_t end,\n                                                 float v);\n\nvoid get_bitset_current_length_cuda(struct CUDA_Context* ctx,\n                                    uint64_t* bitset_compute);\nvoid bitset_current_length_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_current_length_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                      size_t end);\nuint32_t get_node_current_length_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_current_length_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                  uint32_t v);\nvoid add_node_current_length_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                  uint32_t v);\nbool min_node_current_length_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                  uint32_t v);\nvoid batch_get_node_current_length_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v);\nvoid batch_get_node_current_length_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        size_t* v_size,\n                                        DataCommMode* data_mode);\nvoid batch_get_mirror_node_current_length_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_current_length_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n     
                                          size_t* v_size,\n                                               DataCommMode* data_mode);\nvoid batch_get_reset_node_current_length_cuda(struct CUDA_Context* ctx,\n                                              unsigned from_id, uint8_t* v,\n                                              uint32_t i);\nvoid batch_get_reset_node_current_length_cuda(struct CUDA_Context* ctx,\n                                              unsigned from_id, uint8_t* v,\n                                              size_t* v_size,\n                                              DataCommMode* data_mode,\n                                              uint32_t i);\nvoid batch_set_mirror_node_current_length_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_set_node_current_length_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        DataCommMode data_mode);\nvoid batch_add_mirror_node_current_length_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_add_node_current_length_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        DataCommMode data_mode);\nvoid batch_min_mirror_node_current_length_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_min_node_current_length_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        DataCommMode data_mode);\nvoid 
batch_reset_node_current_length_cuda(struct CUDA_Context* ctx,\n                                          size_t begin, size_t end, uint32_t v);\n\nvoid get_bitset_dependency_cuda(struct CUDA_Context* ctx,\n                                uint64_t* bitset_compute);\nvoid bitset_dependency_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_dependency_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                  size_t end);\nfloat get_node_dependency_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_dependency_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid add_node_dependency_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nbool min_node_dependency_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid batch_get_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v);\nvoid batch_get_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, size_t* v_size,\n                                    DataCommMode* data_mode);\nvoid batch_get_mirror_node_dependency_cuda(struct CUDA_Context* ctx,\n                                           unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_dependency_cuda(struct CUDA_Context* ctx,\n                                           unsigned from_id, uint8_t* v,\n                                           size_t* v_size,\n                                           DataCommMode* data_mode);\nvoid batch_get_reset_node_dependency_cuda(struct CUDA_Context* ctx,\n                                          unsigned from_id, uint8_t* v,\n                                          float i);\nvoid batch_get_reset_node_dependency_cuda(struct CUDA_Context* ctx,\n                                          unsigned from_id, uint8_t* v,\n                                          size_t* v_size,\n                                          DataCommMode* data_mode, float 
i);\nvoid batch_set_mirror_node_dependency_cuda(struct CUDA_Context* ctx,\n                                           unsigned from_id, uint8_t* v,\n                                           DataCommMode data_mode);\nvoid batch_set_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_dependency_cuda(struct CUDA_Context* ctx,\n                                           unsigned from_id, uint8_t* v,\n                                           DataCommMode data_mode);\nvoid batch_add_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_dependency_cuda(struct CUDA_Context* ctx,\n                                           unsigned from_id, uint8_t* v,\n                                           DataCommMode data_mode);\nvoid batch_min_node_dependency_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_dependency_cuda(struct CUDA_Context* ctx, size_t begin,\n                                      size_t end, float v);\n\nvoid get_bitset_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                        uint64_t* bitset_compute);\nvoid bitset_num_shortest_paths_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_num_shortest_paths_reset_cuda(struct CUDA_Context* ctx,\n                                          size_t begin, size_t end);\nShortPathType get_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                               unsigned LID);\nvoid set_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                      ShortPathType v);\nvoid add_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                      ShortPathType 
v);\nbool min_node_num_shortest_paths_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                      ShortPathType v);\nvoid batch_get_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v);\nvoid batch_get_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n                                            size_t* v_size,\n                                            DataCommMode* data_mode);\nvoid batch_get_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                                   unsigned from_id,\n                                                   uint8_t* v);\nvoid batch_get_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                                   unsigned from_id, uint8_t* v,\n                                                   size_t* v_size,\n                                                   DataCommMode* data_mode);\nvoid batch_get_reset_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                                  unsigned from_id, uint8_t* v,\n                                                  ShortPathType i);\nvoid batch_get_reset_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                                  unsigned from_id, uint8_t* v,\n                                                  size_t* v_size,\n                                                  DataCommMode* data_mode,\n                                                  ShortPathType i);\nvoid batch_set_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                                   unsigned from_id, uint8_t* v,\n                                                   DataCommMode data_mode);\nvoid batch_set_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n          
                                  unsigned from_id, uint8_t* v,\n                                            DataCommMode data_mode);\nvoid batch_add_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                                   unsigned from_id, uint8_t* v,\n                                                   DataCommMode data_mode);\nvoid batch_add_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n                                            DataCommMode data_mode);\nvoid batch_min_mirror_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                                   unsigned from_id, uint8_t* v,\n                                                   DataCommMode data_mode);\nvoid batch_min_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n                                            DataCommMode data_mode);\nvoid batch_reset_node_num_shortest_paths_cuda(struct CUDA_Context* ctx,\n                                              size_t begin, size_t end,\n                                              ShortPathType v);\n\nvoid BC_cuda(unsigned int __begin, unsigned int __end,\n             struct CUDA_Context* ctx);\nvoid BC_allNodes_cuda(struct CUDA_Context* ctx);\nvoid BC_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid BC_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid BackwardPass_cuda(unsigned int __begin, unsigned int __end,\n                       uint32_t local_r, struct CUDA_Context* ctx);\nvoid BackwardPass_allNodes_cuda(uint32_t local_r, struct CUDA_Context* ctx);\nvoid BackwardPass_masterNodes_cuda(uint32_t local_r, struct CUDA_Context* ctx);\nvoid BackwardPass_nodesWithEdges_cuda(uint32_t local_r,\n                                      struct CUDA_Context* ctx);\nvoid ForwardPass_cuda(unsigned int __begin, unsigned int __end, 
uint32_t& dga,\n                      uint32_t local_r, struct CUDA_Context* ctx);\nvoid ForwardPass_allNodes_cuda(uint32_t& dga, uint32_t local_r,\n                               struct CUDA_Context* ctx);\nvoid ForwardPass_masterNodes_cuda(uint32_t& dga, uint32_t local_r,\n                                  struct CUDA_Context* ctx);\nvoid ForwardPass_nodesWithEdges_cuda(uint32_t& dga, uint32_t local_r,\n                                     struct CUDA_Context* ctx);\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid InitializeIteration_cuda(unsigned int __begin, unsigned int __end,\n                              const uint32_t& local_infinity,\n                              const uint64_t& local_current_src_node,\n                              struct CUDA_Context* ctx);\nvoid InitializeIteration_allNodes_cuda(const uint32_t& local_infinity,\n                                       const uint64_t& local_current_src_node,\n                                       struct CUDA_Context* ctx);\nvoid InitializeIteration_masterNodes_cuda(\n    const uint32_t& local_infinity, const uint64_t& local_current_src_node,\n    struct CUDA_Context* ctx);\nvoid InitializeIteration_nodesWithEdges_cuda(\n    const uint32_t& local_infinity, const uint64_t& local_current_src_node,\n    struct CUDA_Context* ctx);\nvoid MiddleSync_cuda(unsigned int __begin, unsigned int __end,\n                     const uint32_t local_infinity, struct CUDA_Context* ctx);\nvoid MiddleSync_allNodes_cuda(const uint32_t local_infinity,\n                              struct CUDA_Context* ctx);\nvoid MiddleSync_masterNodes_cuda(const uint32_t local_infinity,\n                                 struct CUDA_Context* ctx);\nvoid 
MiddleSync_nodesWithEdges_cuda(const uint32_t local_infinity,\n                                    struct CUDA_Context* ctx);\nvoid Sanity_cuda(unsigned int __begin, unsigned int __end,\n                 float& DGAccumulator_sum, float& DGAccumulator_max,\n                 float& DGAccumulator_min, struct CUDA_Context* ctx);\nvoid Sanity_allNodes_cuda(float& DGAccumulator_sum, float& DGAccumulator_max,\n                          float& DGAccumulator_min, struct CUDA_Context* ctx);\nvoid Sanity_masterNodes_cuda(float& DGAccumulator_sum, float& DGAccumulator_max,\n                             float& DGAccumulator_min,\n                             struct CUDA_Context* ctx);\nvoid Sanity_nodesWithEdges_cuda(float& DGAccumulator_sum,\n                                float& DGAccumulator_max,\n                                float& DGAccumulator_min,\n                                struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/bc_level_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"bc_level_cuda.cuh\", system = False)], parse = False),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('float *', 'p_betweeness_centrality'), ('float *', 'p_dependency'), ('ShortPathType *', 'p_num_shortest_paths')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_betweeness_centrality[src] = 0\"]),\nCBlock([\"p_num_shortest_paths[src]    = 0\"]),\nCBlock([\"p_dependency[src]            = 0\"]),\n]),\n]),\n]),\nKernel(\"InitializeIteration\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const uint64_t ', 'local_current_src_node'), ('const uint32_t ', 'local_infinity'), ('uint32_t *', 'p_current_length'), ('float *', 'p_dependency'), ('ShortPathType *', 'p_num_shortest_paths')],\n[\nCDecl([(\"bool\", \"is_source\", \"\")]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"is_source = graph.node_data[src] == local_current_src_node\"]),\nIf(\"!is_source\",\n[\nCBlock([\"p_current_length[src]     = local_infinity\"]),\nCBlock([\"p_num_shortest_paths[src] = 0\"]),\n],\n[\nCBlock([\"p_current_length[src]     = 0\"]),\nCBlock([\"p_num_shortest_paths[src] = 1\"]),\n]),\nCBlock([\"p_dependency[src]       = 0\"]),\n]),\n]),\n]),\nKernel(\"ForwardPass\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t', 'local_r'), ('uint32_t *', 'p_current_length'), ('ShortPathType *', 'p_num_shortest_paths'), ('DynamicBitset&', 'bitset_current_length'), ('DynamicBitset&', 'bitset_num_shortest_paths'), ('HGAccumulator<uint32_t>', 'dga')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint32_t, 
TB_SIZE>::TempStorage\", \"dga_ts\", \"\")]),\nCBlock([\"dga.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_current_length[src] == local_r\",\n[\n], [ CBlock([\"pop = false\"]), ]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"current_edge\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(current_edge)\"]),\nCDecl([(\"uint32_t\", \"new_dist\", \"\")]),\nCBlock([\"new_dist = 1 + p_current_length[src]\"]),\nCDecl([(\"uint32_t\", \"old\", \"\")]),\nCBlock([\"old = atomicTestMin(&p_current_length[dst], new_dist)\"]),\nIf(\"old > new_dist\",\n[\nCBlock([\"bitset_current_length.set(dst)\"]),\nCDecl([(\"double\", \"nsp\", \"\")]),\nCBlock([\"nsp = p_num_shortest_paths[src]\"]),\nCBlock([\"atomicTestAdd(&p_num_shortest_paths[dst], nsp)\"]),\nCBlock([\"bitset_num_shortest_paths.set(dst)\"]),\nCBlock([\"dga.reduce( 1)\"]),\n],\n[\nIf(\"old == new_dist\",\n[\nCDecl([(\"double\", \"nsp\", \"\")]),\nCBlock([\"nsp = p_num_shortest_paths[src]\"]),\nCBlock([\"atomicTestAdd(&p_num_shortest_paths[dst], nsp)\"]),\nCBlock([\"bitset_num_shortest_paths.set(dst)\"]),\nCBlock([\"dga.reduce( 1)\"]),\n]),\n]),\n]),\n),\n]),\nCBlock([\"dga.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(dga_ts)\"], parse = False),\n]),\nKernel(\"MiddleSync\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t', 'local_infinity'), ('uint32_t *', 'p_current_length'), ('DynamicBitset&', 'bitset_num_shortest_paths')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_current_length[src] != local_infinity\",\n[\nCBlock([\"bitset_num_shortest_paths.set(src)\"]),\n]),\n]),\n]),\n]),\nKernel(\"BackwardPass\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', 
'__end'), ('uint32_t', 'local_r'), ('uint32_t *', 'p_current_length'), ('float *', 'p_dependency'), ('ShortPathType *', 'p_num_shortest_paths'), ('DynamicBitset&', 'bitset_dependency')],\n[\nCDecl([(\"uint32_t\", \"dest_to_find\", \"\")]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_current_length[src] == local_r\",\n[\nCBlock([\"dest_to_find = p_current_length[src] + 1\"]),\n], [ CBlock([\"pop = false\"]), ]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"current_edge\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(current_edge)\"]),\nIf(\"dest_to_find == p_current_length[dst]\",\n[\nCDecl([(\"float\", \"contrib\", \"\")]),\nCBlock([\"contrib = ((float)1 + p_dependency[dst]) / p_num_shortest_paths[dst]\"]),\nCBlock([\"p_dependency[src] = p_dependency[src] + contrib\"]),\nCBlock([\"bitset_dependency.set(src)\"]),\n]),\n]),\n),\nCBlock([\"p_dependency[src] *= p_num_shortest_paths[src]\"]),\n]),\n]),\nKernel(\"BC\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('float *', 'p_betweeness_centrality'), ('float *', 'p_dependency')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_dependency[src] > 0\",\n[\nCBlock([\"p_betweeness_centrality[src] += p_dependency[src]\"]),\n]),\n]),\n]),\n]),\nKernel(\"Sanity\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('float *', 'p_betweeness_centrality'), ('HGAccumulator<float>', 'DGAccumulator_sum'), ('HGReduceMax<float>', 'DGAccumulator_max'), ('HGReduceMin<float>', 'DGAccumulator_min')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"DGAccumulator_sum_ts\", \"\")]),\nCBlock([\"DGAccumulator_sum.thread_entry()\"]),\nCDecl([(\"__shared__ 
cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"DGAccumulator_max_ts\", \"\")]),\nCBlock([\"DGAccumulator_max.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"DGAccumulator_min_ts\", \"\")]),\nCBlock([\"DGAccumulator_min.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"DGAccumulator_max.reduce(p_betweeness_centrality[src])\"]),\nCBlock([\"DGAccumulator_min.reduce(p_betweeness_centrality[src])\"]),\nCBlock([\"DGAccumulator_sum.reduce( p_betweeness_centrality[src])\"]),\n]),\n]),\nCBlock([\"DGAccumulator_sum.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_ts)\"], parse = False),\nCBlock([\"DGAccumulator_max.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_max_ts)\"], parse = False),\nCBlock([\"DGAccumulator_min.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_min_ts)\"], parse = False),\n]),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->betweeness_centrality.data.gpu_wr_ptr()\", \"ctx->dependency.data.gpu_wr_ptr()\", \"ctx->num_shortest_paths.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 
'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"InitializeIteration_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('const uint32_t &', 'local_infinity'), ('const uint64_t &', 'local_current_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeIteration\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_current_src_node\", \"local_infinity\", \"ctx->current_length.data.gpu_wr_ptr()\", \"ctx->dependency.data.gpu_wr_ptr()\", \"ctx->num_shortest_paths.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeIteration_allNodes_cuda\", [('const uint32_t &', 'local_infinity'), ('const uint64_t &', 'local_current_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeIteration_cuda(0, ctx->gg.nnodes, local_infinity, local_current_src_node, ctx)\"]),\n], host = True),\nKernel(\"InitializeIteration_masterNodes_cuda\", [('const uint32_t &', 'local_infinity'), ('const uint64_t &', 'local_current_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeIteration_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_current_src_node, ctx)\"]),\n], host = True),\nKernel(\"InitializeIteration_nodesWithEdges_cuda\", [('const uint32_t &', 'local_infinity'), ('const uint64_t &', 'local_current_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeIteration_cuda(0, ctx->numNodesWithEdges, local_infinity, local_current_src_node, ctx)\"]),\n], host = True),\nKernel(\"ForwardPass_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint32_t &', 'dga'), ('uint32_t', 'local_r'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, 
threads)\"]),\nCDecl([(\"Shared<uint32_t>\", \"dgaval\", \" = Shared<uint32_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint32_t>\", \"_dga\", \"\")]),\nCBlock([\"*(dgaval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_dga.rv = dgaval.gpu_wr_ptr()\"]),\nInvoke(\"ForwardPass\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_r\", \"ctx->current_length.data.gpu_wr_ptr()\", \"ctx->num_shortest_paths.data.gpu_wr_ptr()\", \"*(ctx->current_length.is_updated.gpu_rd_ptr())\", \"*(ctx->num_shortest_paths.is_updated.gpu_rd_ptr())\", \"_dga\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"dga = *(dgaval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"ForwardPass_allNodes_cuda\", [('uint32_t &', 'dga'), ('uint32_t', 'local_r'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ForwardPass_cuda(0, ctx->gg.nnodes, dga, local_r, ctx)\"]),\n], host = True),\nKernel(\"ForwardPass_masterNodes_cuda\", [('uint32_t &', 'dga'), ('uint32_t', 'local_r'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ForwardPass_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, dga, local_r, ctx)\"]),\n], host = True),\nKernel(\"ForwardPass_nodesWithEdges_cuda\", [('uint32_t &', 'dga'), ('uint32_t', 'local_r'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ForwardPass_cuda(0, ctx->numNodesWithEdges, dga, local_r, ctx)\"]),\n], host = True),\nKernel(\"MiddleSync_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('const uint32_t', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"MiddleSync\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_infinity\", \"ctx->current_length.data.gpu_wr_ptr()\", \"*(ctx->num_shortest_paths.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"MiddleSync_allNodes_cuda\", [('const uint32_t', 'local_infinity'), ('struct CUDA_Context* ', 
'ctx')],\n[\nCBlock([\"MiddleSync_cuda(0, ctx->gg.nnodes, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"MiddleSync_masterNodes_cuda\", [('const uint32_t', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"MiddleSync_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"MiddleSync_nodesWithEdges_cuda\", [('const uint32_t', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"MiddleSync_cuda(0, ctx->numNodesWithEdges, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"BackwardPass_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint32_t', 'local_r'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"BackwardPass\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_r\", \"ctx->current_length.data.gpu_wr_ptr()\", \"ctx->dependency.data.gpu_wr_ptr()\", \"ctx->num_shortest_paths.data.gpu_wr_ptr()\", \"*(ctx->dependency.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"BackwardPass_allNodes_cuda\", [('uint32_t', 'local_r'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BackwardPass_cuda(0, ctx->gg.nnodes, local_r, ctx)\"]),\n], host = True),\nKernel(\"BackwardPass_masterNodes_cuda\", [('uint32_t', 'local_r'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BackwardPass_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_r, ctx)\"]),\n], host = True),\nKernel(\"BackwardPass_nodesWithEdges_cuda\", [('uint32_t', 'local_r'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BackwardPass_cuda(0, ctx->numNodesWithEdges, local_r, ctx)\"]),\n], host = True),\nKernel(\"BC_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", 
\"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"BC\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->betweeness_centrality.data.gpu_wr_ptr()\", \"ctx->dependency.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"BC_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BC_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"BC_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BC_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"BC_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BC_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"Sanity_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_max'), ('float &', 'DGAccumulator_min'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<float>\", \"DGAccumulator_sumval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGAccumulator<float>\", \"_DGAccumulator_sum\", \"\")]),\nCBlock([\"*(DGAccumulator_sumval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"DGAccumulator_maxval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMax<float>\", \"_DGAccumulator_max\", \"\")]),\nCBlock([\"*(DGAccumulator_maxval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_max.rv = DGAccumulator_maxval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"DGAccumulator_minval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMin<float>\", \"_DGAccumulator_min\", \"\")]),\nCBlock([\"*(DGAccumulator_minval.cpu_wr_ptr()) = 1073741823\"]),\nCBlock([\"_DGAccumulator_min.rv = DGAccumulator_minval.gpu_wr_ptr()\"]),\nInvoke(\"Sanity\", (\"ctx->gg\", \"__begin\", \"__end\", 
\"ctx->betweeness_centrality.data.gpu_wr_ptr()\", \"_DGAccumulator_sum\", \"_DGAccumulator_max\", \"_DGAccumulator_min\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr())\"]),\nCBlock([\"DGAccumulator_max = *(DGAccumulator_maxval.cpu_rd_ptr())\"]),\nCBlock([\"DGAccumulator_min = *(DGAccumulator_minval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"Sanity_allNodes_cuda\", [('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_max'), ('float &', 'DGAccumulator_min'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"Sanity_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, DGAccumulator_max, DGAccumulator_min, ctx)\"]),\n], host = True),\nKernel(\"Sanity_masterNodes_cuda\", [('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_max'), ('float &', 'DGAccumulator_min'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"Sanity_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, DGAccumulator_max, DGAccumulator_min, ctx)\"]),\n], host = True),\nKernel(\"Sanity_nodesWithEdges_cuda\", [('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_max'), ('float &', 'DGAccumulator_min'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"Sanity_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, DGAccumulator_max, DGAccumulator_min, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/bc_level_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\n////////////////////////////////////////////////////////////////////////////\n// # short paths\n////////////////////////////////////////////////////////////////////////////\n\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(num_shortest_paths, ShortPathType);\n// used for middle sync only\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(num_shortest_paths, ShortPathType);\nGALOIS_SYNC_STRUCTURE_BITSET(num_shortest_paths);\n\n////////////////////////////////////////////////////////////////////////////\n// Current Lengths\n////////////////////////////////////////////////////////////////////////////\n\nGALOIS_SYNC_STRUCTURE_REDUCE_MIN(current_length, uint32_t);\nGALOIS_SYNC_STRUCTURE_BITSET(current_length);\n\n////////////////////////////////////////////////////////////////////////////\n// 
Dependency\n////////////////////////////////////////////////////////////////////////////\n\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(dependency, float);\nGALOIS_SYNC_STRUCTURE_BITSET(dependency);\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/bc_mr.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iomanip>\n#include <iostream>\n\n// type of short path\nusing ShortPathType = double;\n\n/**\n * Structure for holding data calculated during BC\n */\nstruct BCData {\n  uint32_t minDistance;\n  ShortPathType shortPathCount;\n  galois::CopyableAtomic<float> dependencyValue;\n};\n\nconstexpr static const char* const REGION_NAME = \"MRBC\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\nnamespace cll = llvm::cl;\n\nstatic cll::opt<std::string> sourcesToUse(\"sourcesToUse\",\n    
                                      cll::desc(\"Sources to use in BC\"),\n                                          cll::init(\"\"));\nstatic cll::opt<unsigned int>\n    numSourcesPerRound(\"numRoundSources\",\n                       cll::desc(\"Number of sources to use for APSP\"),\n                       cll::init(1));\nstatic cll::opt<unsigned int>\n    totalNumSources(\"numOfSources\",\n                    cll::desc(\"Total number of sources to do BC\"),\n                    cll::init(0));\nstatic cll::opt<bool> useSingleSource(\"singleSource\",\n                                      cll::desc(\"Use a single source.\"),\n                                      cll::init(false));\nstatic cll::opt<unsigned long long>\n    startNode(\"startNode\", cll::desc(\"Single source start node.\"),\n              cll::init(0));\nstatic cll::opt<unsigned int> vIndex(\"index\",\n                                     cll::desc(\"DEBUG: Index to print for \"\n                                               \"dist/short paths\"),\n                                     cll::init(0), cll::Hidden);\nstatic cll::opt<unsigned int>\n    vectorSize(\"vectorSize\",\n               cll::desc(\"DEBUG: Specify size of vector \"\n                         \"used for node data\"),\n               cll::init(0), cll::Hidden);\n\n// moved here so MRBCTree has access to numSourcesPerRound\n#include \"mrbc_tree.h\"\n\n/******************************************************************************/\n/* Graph structure declarations */\n/******************************************************************************/\n\n// NOTE: declared types assume that these values will not reach uint64_t: it may\n// need to be changed for very large graphs\nstruct NodeData {\n  galois::gstl::Vector<BCData> sourceData;\n  // distance map\n  MRBCTree dTree;\n  // final bc value\n  float bc;\n  // index that needs to be pulled in a round\n  uint32_t roundIndexToSend;\n};\n\nusing Graph = 
galois::graphs::DistGraph<NodeData, void>;\nusing GNode = typename Graph::GraphNode;\n\n// Bitsets for tracking which nodes need to be sync'd with respect to a\n// particular field\ngalois::DynamicBitSet bitset_minDistances;\ngalois::DynamicBitSet bitset_dependency;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n// moved here for access to ShortPathType, NodeData, DynamicBitSets\n#include \"mrbc_sync.hh\"\n\n/******************************************************************************/\n/* Functions for running the algorithm */\n/******************************************************************************/\n\n/**\n * Graph initialization. Initialize all of the node data fields.\n *\n * @param graph Local graph to operate on\n */\nvoid InitializeGraph(Graph& graph) {\n  const auto& allNodes = graph.allNodesRange();\n\n  galois::do_all(\n      galois::iterate(allNodes.begin(), allNodes.end()),\n      [&](GNode curNode) {\n        NodeData& cur_data = graph.getData(curNode);\n        cur_data.sourceData.resize(vectorSize);\n        cur_data.bc = 0.0;\n      },\n      galois::loopname(\n          syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()),\n      galois::no_stats()); // Only stats the runtime by loopname\n}\n\n/**\n * This is used to reset node data when switching to a different\n * source set. Initializes everything for the coming source set.\n *\n * @param graph Local graph to operate on\n * @param nodesToConsider Global IDs of this batch's sources, chosen by the\n * caller from its offset into sources (i.e. 
number of sources already done)\n **/\nvoid InitializeIteration(Graph& graph,\n                         const std::vector<uint64_t>& nodesToConsider) {\n  const auto& allNodes = graph.allNodesRange();\n\n  galois::do_all(\n      galois::iterate(allNodes.begin(), allNodes.end()),\n      [&](GNode curNode) {\n        NodeData& cur_data        = graph.getData(curNode);\n        cur_data.roundIndexToSend = infinity;\n        cur_data.dTree.initialize();\n        for (unsigned i = 0; i < numSourcesPerRound; i++) {\n          // min distance and short path count setup\n          if (nodesToConsider[i] == graph.getGID(curNode)) { // source node\n            cur_data.sourceData[i].minDistance     = 0;\n            cur_data.sourceData[i].shortPathCount  = 1;\n            cur_data.sourceData[i].dependencyValue = 0.0;\n            cur_data.dTree.setDistance(i, 0);\n          } else { // non-source node\n            cur_data.sourceData[i].minDistance     = infinity;\n            cur_data.sourceData[i].shortPathCount  = 0;\n            cur_data.sourceData[i].dependencyValue = 0.0;\n          }\n        }\n      },\n      galois::loopname(\n          syncSubstrate->get_run_identifier(\"InitializeIteration\").c_str()),\n      galois::no_stats());\n};\n\n/**\n * Find the message to send out from each node every round (if any exists to\n * be sent).\n *\n * @param graph Local graph to operate on\n * @param roundNumber current round number\n * @param dga Distributed accumulator for determining if work was done in\n * an iteration across all hosts\n */\nvoid FindMessageToSync(Graph& graph, const uint32_t roundNumber,\n                       galois::DGAccumulator<uint32_t>& dga) {\n  const auto& allNodes = graph.allNodesRange();\n\n  galois::do_all(\n      galois::iterate(allNodes.begin(), allNodes.end()),\n      [&](GNode curNode) {\n        NodeData& cur_data        = graph.getData(curNode);\n        cur_data.roundIndexToSend = cur_data.dTree.getIndexToSend(roundNumber);\n\n        
if (cur_data.roundIndexToSend != infinity) {\n          if (cur_data.sourceData[cur_data.roundIndexToSend].minDistance != 0) {\n            bitset_minDistances.set(curNode);\n          }\n          dga += 1;\n        } else if (cur_data.dTree.moreWork()) {\n          dga += 1;\n        }\n      },\n      galois::loopname(syncSubstrate\n                           ->get_run_identifier(std::string(REGION_NAME) +\n                                                \"_FindMessageToSync\")\n                           .c_str()),\n      galois::steal(), galois::no_stats());\n}\n\n/**\n * Mark index we're sending out this round as sent + update metadata as\n * necessary.\n *\n * @param graph Local graph to operate on\n * @param roundNumber current round number\n */\nvoid ConfirmMessageToSend(Graph& graph, const uint32_t roundNumber) {\n  const auto& allNodes = graph.allNodesRange();\n\n  galois::do_all(\n      galois::iterate(allNodes.begin(), allNodes.end()),\n      [&](GNode curNode) {\n        NodeData& cur_data = graph.getData(curNode);\n        if (cur_data.roundIndexToSend != infinity) {\n          cur_data.dTree.markSent(roundNumber);\n        }\n      },\n      galois::loopname(syncSubstrate\n                           ->get_run_identifier(std::string(REGION_NAME) +\n                                                \"_ConfirmMessageToSend\")\n                           .c_str()),\n      galois::no_stats());\n}\n\n/**\n * If a node has something to send (as indicated by its indexToSend variable),\n * it will be pulled by all of its outgoing neighbors.\n *\n * Pull-style is used here to avoid the need for locks as 2 variables must be\n * updated at once.\n *\n * @param graph Local graph to operate on\n * @param dga Distributed accumulator for determining if work was done in\n * an iteration across all hosts\n */\nvoid SendAPSPMessagesOp(GNode dst, Graph& graph,\n                        galois::DGAccumulator<uint32_t>& dga) {\n  auto& dnode     = graph.getData(dst);\n  
auto& dnodeData = dnode.sourceData;\n\n  for (auto inEdge : graph.edges(dst)) {\n    NodeData& src_data   = graph.getData(graph.getEdgeDst(inEdge));\n    uint32_t indexToSend = src_data.roundIndexToSend;\n\n    if (indexToSend != infinity) {\n      uint32_t distValue = src_data.sourceData[indexToSend].minDistance;\n      uint32_t newValue  = distValue + 1;\n      // Update minDistance vector\n      auto& dnodeIndex  = dnodeData[indexToSend];\n      uint32_t oldValue = dnodeIndex.minDistance;\n\n      if (oldValue > newValue) {\n        dnodeIndex.minDistance = newValue;\n        dnode.dTree.setDistance(indexToSend, oldValue, newValue);\n        // overwrite short path with this node's shortest path\n        dnodeIndex.shortPathCount =\n            src_data.sourceData[indexToSend].shortPathCount;\n      } else if (oldValue == newValue) {\n        assert(src_data.sourceData[indexToSend].shortPathCount != 0);\n        // add to short path\n        dnodeIndex.shortPathCount +=\n            src_data.sourceData[indexToSend].shortPathCount;\n      }\n\n      dga += 1;\n    }\n  }\n}\n\nvoid SendAPSPMessages(Graph& graph, galois::DGAccumulator<uint32_t>& dga) {\n  const auto& allNodesWithEdges = graph.allNodesWithEdgesRange();\n\n  galois::do_all(\n      galois::iterate(allNodesWithEdges),\n      [&](GNode dst) { SendAPSPMessagesOp(dst, graph, dga); },\n      galois::loopname(syncSubstrate\n                           ->get_run_identifier(std::string(REGION_NAME) +\n                                                \"_SendAPSPMessages\")\n                           .c_str()),\n      galois::steal(), galois::no_stats());\n}\n\n/**\n * Find all pairs shortest paths for the sources currently being worked on\n * as well as the number of shortest paths for each source.\n *\n * @param graph Local graph to operate on\n * @param dga Distributed accumulator for determining if work was done in\n * an iteration across all hosts\n *\n * @returns total number of rounds needed to do this 
phase\n */\nuint32_t APSP(Graph& graph, galois::DGAccumulator<uint32_t>& dga) {\n  uint32_t roundNumber = 0;\n\n  do {\n    dga.reset();\n    galois::gDebug(\"[\", galois::runtime::getSystemNetworkInterface().ID, \"]\",\n                   \" Round \", roundNumber);\n    syncSubstrate->set_num_round(roundNumber);\n\n    // you can think of this FindMessageToSync call being a part of the sync\n    FindMessageToSync(graph, roundNumber, dga);\n\n    // Template parameters are struct names\n    syncSubstrate->sync<writeAny, readAny, APSPReduce, Bitset_minDistances>(\n        std::string(std::string(REGION_NAME) + \"_APSP\"));\n\n    // confirm message to send after sync potentially changes what you were\n    // planning on sending\n    ConfirmMessageToSend(graph, roundNumber);\n\n    // send messages (if any)\n    SendAPSPMessages(graph, dga);\n\n    roundNumber++;\n  } while (dga.reduce(syncSubstrate->get_run_identifier()));\n\n  return roundNumber;\n}\n\n/**\n * Get the round number for the backward propagation phase using the round\n * number from the APSP phase. 
This round number determines when a node should\n * send out a message for the backward propagation of dependency values.\n *\n * @param graph Local graph to operate on\n */\nvoid RoundUpdate(Graph& graph) {\n  const auto& allNodes = graph.allNodesRange();\n  syncSubstrate->set_num_round(0);\n\n  galois::do_all(\n      galois::iterate(allNodes.begin(), allNodes.end()),\n      [&](GNode node) {\n        NodeData& cur_data = graph.getData(node);\n        cur_data.dTree.prepForBackPhase();\n      },\n      galois::loopname(\n          syncSubstrate\n              ->get_run_identifier(std::string(REGION_NAME) + \"_RoundUpdate\")\n              .c_str()),\n      galois::no_stats());\n}\n\n/**\n * Find the message that needs to be back propagated this round by checking\n * round number.\n */\nvoid BackFindMessageToSend(Graph& graph, const uint32_t roundNumber,\n                           const uint32_t lastRoundNumber) {\n  // has to be all nodes because even nodes without edges may have dependency\n  // that needs to be sync'd\n  const auto& allNodes = graph.allNodesRange();\n\n  galois::do_all(\n      galois::iterate(allNodes.begin(), allNodes.end()),\n      [&](GNode dst) {\n        NodeData& dst_data = graph.getData(dst);\n\n        // if zero distances already reached, there is no point sending things\n        // out since we don't care about dependency for sources (i.e. 
distance\n        // 0)\n        if (!dst_data.dTree.isZeroReached()) {\n          dst_data.roundIndexToSend =\n              dst_data.dTree.backGetIndexToSend(roundNumber, lastRoundNumber);\n\n          if (dst_data.roundIndexToSend != infinity) {\n            // only comm if not redundant 0\n            if (dst_data.sourceData[dst_data.roundIndexToSend]\n                    .dependencyValue != 0) {\n              bitset_dependency.set(dst);\n            }\n          }\n        }\n      },\n      galois::loopname(syncSubstrate\n                           ->get_run_identifier(std::string(REGION_NAME) +\n                                                \"_BackFindMessageToSend\")\n                           .c_str()),\n      galois::no_stats());\n}\n\n/**\n * Back propagate dependency values depending on the round that a node\n * sent out the shortest path message.\n *\n * @param dst Node whose dependency value is propagated to its predecessors\n * @param graph Local graph to operate on\n */\nvoid BackPropOp(GNode dst, Graph& graph) {\n  NodeData& dst_data = graph.getData(dst);\n  unsigned i         = dst_data.roundIndexToSend;\n\n  if (i != infinity) {\n    uint32_t myDistance = dst_data.sourceData[i].minDistance;\n\n    // calculate final dependency value\n    dst_data.sourceData[i].dependencyValue =\n        dst_data.sourceData[i].dependencyValue *\n        dst_data.sourceData[i].shortPathCount;\n\n    // get the value to add to predecessors\n    float toAdd = ((float)1 + dst_data.sourceData[i].dependencyValue) /\n                  dst_data.sourceData[i].shortPathCount;\n\n    for (auto inEdge : graph.edges(dst)) {\n      GNode src               = graph.getEdgeDst(inEdge);\n      auto& src_data          = graph.getData(src);\n      uint32_t sourceDistance = src_data.sourceData[i].minDistance;\n\n      // source nodes of this batch (i.e. 
distance 0) can be safely\n      // ignored\n      if (sourceDistance != 0) {\n        // determine if this source is a predecessor\n        if (myDistance == (sourceDistance + 1)) {\n          // add to dependency of predecessor using our finalized one\n          galois::atomicAdd(src_data.sourceData[i].dependencyValue, toAdd);\n        }\n      }\n    }\n  }\n}\n\nvoid BackProp(Graph& graph, const uint32_t lastRoundNumber) {\n  // All nodes WITH EDGES (another at SendMessage)\n  const auto& allNodesWithEdges = graph.allNodesWithEdgesRange();\n\n  uint32_t currentRound = 0;\n\n  while (currentRound <= lastRoundNumber) {\n    syncSubstrate->set_num_round(currentRound);\n\n    BackFindMessageToSend(graph, currentRound, lastRoundNumber);\n\n    // write destination in this case being the source in the actual graph\n    // since we're using the transpose graph\n    syncSubstrate->sync<writeDestination, readSource, DependencyReduce,\n                        Bitset_dependency>(\n        std::string(std::string(REGION_NAME) + \"_DependencySync\"));\n\n    galois::do_all(\n        galois::iterate(allNodesWithEdges),\n        [&](GNode dst) { BackPropOp(dst, graph); },\n        galois::loopname(\n            syncSubstrate\n                ->get_run_identifier(std::string(REGION_NAME) + \"_BackProp\")\n                .c_str()),\n        galois::steal(), galois::no_stats());\n\n    currentRound++;\n  }\n}\n\n/**\n * BC sum: take the dependency value for each source and add it to the\n * final BC value.\n *\n * @param graph Local graph to operate on\n * @param nodesToConsider Global IDs of this batch's sources, chosen by the\n * caller from its offset into sources (i.e. 
number of sources already done)\n */\nvoid BC(Graph& graph, const std::vector<uint64_t>& nodesToConsider) {\n  const auto& masterNodes = graph.masterNodesRange();\n  syncSubstrate->set_num_round(0);\n\n  galois::do_all(\n      galois::iterate(masterNodes.begin(), masterNodes.end()),\n      [&](GNode node) {\n        NodeData& cur_data = graph.getData(node);\n\n        for (unsigned i = 0; i < numSourcesPerRound; i++) {\n          // exclude sources themselves from BC calculation\n          if (graph.getGID(node) != nodesToConsider[i]) {\n            cur_data.bc += cur_data.sourceData[i].dependencyValue;\n          }\n        }\n      },\n      galois::loopname(\n          syncSubstrate->get_run_identifier(std::string(REGION_NAME)).c_str()),\n      galois::no_stats());\n};\n\n/******************************************************************************/\n/* Sanity check */\n/******************************************************************************/\n\nvoid Sanity(Graph& graph) {\n  galois::DGReduceMax<float> DGA_max;\n  galois::DGReduceMin<float> DGA_min;\n  galois::DGAccumulator<float> DGA_sum;\n\n  DGA_max.reset();\n  DGA_min.reset();\n  DGA_sum.reset();\n\n  galois::do_all(\n      galois::iterate(graph.masterNodesRange().begin(),\n                      graph.masterNodesRange().end()),\n      [&](auto src) {\n        NodeData& sdata = graph.getData(src);\n\n        DGA_max.update(sdata.bc);\n        DGA_min.update(sdata.bc);\n        DGA_sum += sdata.bc;\n      },\n      galois::no_stats(), galois::loopname(\"Sanity\"));\n\n  float max_bc = DGA_max.reduce();\n  float min_bc = DGA_min.reduce();\n  float bc_sum = DGA_sum.reduce();\n\n  // Only node 0 will print data\n  if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n    galois::gPrint(\"Max BC is \", max_bc, \"\\n\");\n    galois::gPrint(\"Min BC is \", min_bc, \"\\n\");\n    galois::gPrint(\"BC sum is \", bc_sum, \"\\n\");\n  
}\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<float> makeResults(std::unique_ptr<Graph>& hg) {\n  std::vector<float> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).bc);\n  }\n\n  return values;\n}\n\n/******************************************************************************/\n/* Main method for running */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"Min-Rounds Betweeness Centrality\";\nconstexpr static const char* const desc = \"Min-Rounds Betweeness \"\n                                          \"Centrality on Distributed Galois.\";\nconstexpr static const char* const url = nullptr;\n\nuint64_t macroRound = 0; // macro round, i.e. number of batches done so far\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  // Total timer\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n  StatTimer_total.start();\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph\\n\");\n  std::unique_ptr<Graph> hg;\n  // false = iterate over in edges\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, void, false>();\n\n  if (totalNumSources == 0) {\n    galois::gDebug(\"Total num sources unspecified\");\n    totalNumSources = hg->globalSize();\n  }\n\n  if (useSingleSource) {\n    totalNumSources    = 1;\n    numSourcesPerRound = 1;\n  }\n\n  // set vector size in node data\n  if (vectorSize == 0) {\n    vectorSize = numSourcesPerRound.getValue();\n  }\n  GALOIS_ASSERT(vectorSize >= numSourcesPerRound);\n\n  // Backup the number of sources per round\n  uint64_t origNumRoundSources = 
numSourcesPerRound;\n\n  // Start graph initialization\n  galois::StatTimer StatTimer_graph_init(\"TIMER_GRAPH_INIT\", REGION_NAME);\n  StatTimer_graph_init.start();\n  InitializeGraph(*hg);\n  StatTimer_graph_init.stop();\n\n  galois::runtime::getHostBarrier().wait();\n\n  // shared DG accumulator among all steps\n  galois::DGAccumulator<uint32_t> dga;\n\n  // reading in list of sources to operate on if provided\n  std::ifstream sourceFile;\n  std::vector<uint64_t> sourceVector;\n  if (!sourcesToUse.empty()) {\n    sourceFile.open(sourcesToUse);\n    std::vector<uint64_t> t(std::istream_iterator<uint64_t>{sourceFile},\n                            std::istream_iterator<uint64_t>{});\n    sourceVector = t; // stored in source vector\n    sourceFile.close();\n  }\n\n  if (startNode && !sourceVector.empty()) {\n    GALOIS_DIE(\"startNode option not compatible with sourcesToUse\");\n  }\n\n  // \"sourceVector\" if file not provided\n  std::vector<uint64_t> nodesToConsider;\n  nodesToConsider.resize(numSourcesPerRound);\n\n  // bitset initialization\n  bitset_dependency.resize(hg->size());\n  bitset_minDistances.resize(hg->size());\n\n  ////////////////////////////////////////////////////////////////////////////////\n\n  galois::runtime::reportStat_Single(std::string(REGION_NAME),\n                                     std::string(\"NumSources\"),\n                                     (unsigned int)totalNumSources);\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] Run \", run, \" started\\n\");\n\n    // Timer per RUN\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    // Associated to totalNumSources\n    uint64_t totalSourcesFound = 0;\n\n    // offset into sources to operate on\n    uint64_t offset = startNode;\n    // node boundary to end search at\n    uint64_t nodeBoundary =\n        sourceVector.empty() ? 
hg->globalSize() : sourceVector.size();\n\n    while (offset < nodeBoundary && totalSourcesFound < totalNumSources) {\n      if (useSingleSource) {\n        nodesToConsider[0] = startNode;\n        totalNumSources    = 1;\n        totalSourcesFound  = 1;\n      } else {\n        unsigned sourcesFound = 0;\n        while (sourcesFound < numSourcesPerRound && offset < nodeBoundary &&\n               totalSourcesFound < totalNumSources) {\n          // choose from read source file or from beginning (0 to n)\n          nodesToConsider[sourcesFound] =\n              sourceVector.empty() ? offset : sourceVector[offset];\n          offset++;\n          sourcesFound++;\n          totalSourcesFound++;\n        }\n\n        if (sourcesFound == 0) {\n          assert((offset - startNode) == totalNumSources ||\n                 totalSourcesFound == totalNumSources);\n          break;\n        }\n\n        if ((offset - startNode) < totalNumSources) {\n          assert(numSourcesPerRound == sourcesFound);\n        } else {\n          // >= totalNumSources\n          assert((offset - startNode) == totalNumSources);\n          galois::gDebug(\"Out of sources (found \", sourcesFound, \")\");\n          numSourcesPerRound = sourcesFound;\n        }\n      }\n\n      galois::gDebug(\"Using the following sources\");\n      for (unsigned i = 0; i < numSourcesPerRound; i++) {\n        galois::gDebug(nodesToConsider[i]);\n      }\n\n      if (net.ID == 0) {\n        galois::gPrint(\"Begin batch #\", macroRound, \"\\n\");\n      }\n\n      // accumulate time per batch\n      StatTimer_main.start();\n      InitializeIteration(*hg, nodesToConsider);\n\n      // APSP returns total number of rounds taken\n      // subtract 2 to get to last round where message was sent (round\n      // after that is empty round where nothing is done)\n      uint32_t lastRoundNumber = APSP(*hg, dga) - 2;\n      RoundUpdate(*hg);\n      BackProp(*hg, lastRoundNumber);\n      BC(*hg, nodesToConsider);\n\n      
StatTimer_main.stop();\n\n      syncSubstrate->set_num_round(0);\n      // report num rounds\n      if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n        galois::runtime::reportStat_Single(\n            REGION_NAME,\n            // hg->get_run_identifier(\"NumForwardRounds\", macroRound),\n            syncSubstrate->get_run_identifier(\"NumForwardRounds\"),\n            lastRoundNumber + 2);\n        galois::runtime::reportStat_Single(\n            REGION_NAME,\n            // hg->get_run_identifier(\"NumBackwardRounds\", macroRound),\n            syncSubstrate->get_run_identifier(\"NumBackwardRounds\"),\n            lastRoundNumber + 1);\n        galois::runtime::reportStat_Single(\n            REGION_NAME, syncSubstrate->get_run_identifier(\"TotalRounds\"),\n            lastRoundNumber + lastRoundNumber + 3);\n      }\n\n      macroRound++;\n    }\n\n    Sanity(*hg);\n\n    // re-init graph for next run\n    if ((run + 1) != numRuns) { // not the last run\n      galois::runtime::getHostBarrier().wait();\n      syncSubstrate->set_num_run(run + 1);\n      syncSubstrate->set_num_round(0);\n      offset             = 0;\n      macroRound         = 0;\n      numSourcesPerRound = origNumRoundSources;\n\n      bitset_dependency.reset();\n      bitset_minDistances.reset();\n\n      InitializeGraph(*hg);\n      galois::runtime::getHostBarrier().wait();\n    }\n\n    // Current run finished\n  }\n\n  StatTimer_total.stop();\n\n  ////////////////////////////////////////////////////////////////////////////////\n\n  if (output) {\n    std::vector<float> results = makeResults(hg);\n    auto globalIDs             = hg->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"betweenness_centrality\", results.data(),\n                results.size(), globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/mrbc_bitset.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n#ifndef _MR_BC_BIT_SET_\n#define _MR_BC_BIT_SET_\n\n#include \"galois/DynamicBitset.h\"\n#include <boost/random/detail/integer_log2.hpp>\n\n/** OPTIONS **********/\n/** 1. Optimized mode: enable ONLY ONE of them at most **/\n// #define REVERSE_MODE //! Not applicable for v6\n// #define FLIP_MODE  //! Not applicable for v6\n/** 2. Do you need an indicator? **/\n#define USE_INDICATOR\n/*********************/\n\n/**\n * Derivate from DynamicBitSet\n **/\nclass MRBCBitSet : public galois::DynamicBitSet {\n  // using Base = galois::DynamicBitSet;\n  // @DynamicBitSet (protected)\n  // using Base::bitvec;\n  // using Base::num_bits;\n  // using Base::bits_uint64;\n\n  #ifdef USE_INDICATOR\n    //! 
indicate the index of bit to process\n    size_t indicator;\n  #endif\n\n  // Member functions\n  inline size_t get_word(size_t pos) const { return pos < bits_uint64? 0 : pos / bits_uint64; }\n  inline size_t get_offset(size_t pos) const { return pos < bits_uint64? pos : pos % bits_uint64; }\n  inline uint64_t get_mask(size_t pos) const { return uint64_t(1) << get_offset(pos); }\n\n  #if defined(REVERSE_MODE) || defined(FLIP_MODE)\n    size_t reverse(size_t pos) {\n      return pos == npos? npos : num_bits - pos - 1;\n    }\n  #endif\n\n  #ifdef FLIP_MODE\n    void flip_recursive(size_t pos) {\n      size_t next = find_next(pos);\n      if (next != npos)\n        flip_recursive(next);\n      // do the flip for pos\n      uint64_t block = get_word(pos), mask = get_mask(pos);\n      uint64_t rBlock = get_word(reverse(pos)), rMask = get_mask(reverse(pos));\n      // flip if asymmetrical\n      if (!(bitvec[rBlock] & rMask)) {\n        bitvec[block].fetch_and(~mask);\n        size_t r_old = bitvec[rBlock];\n        while (!bitvec[rBlock].compare_exchange_weak(\n          r_old, r_old | rMask, std::memory_order_relaxed));\n      }\n    }\n  #endif\n\n  size_t right_most_bit(uint64_t w) const {\n    // assert(w >= 1);\n    return boost::integer_log2<uint64_t>(w & -w);\n  }\n\n  size_t left_most_bit(uint64_t w) const {\n      return boost::integer_log2<uint64_t>(w);\n  }\n\n  size_t find_from_block(size_t first, bool fore=true) const {\n    size_t i;\n    if (fore) {\n      for (i = first; i < bitvec.size() && bitvec[i] == 0; i++);\n      if (i >= bitvec.size())\n          return npos;\n      return i * bits_uint64 + right_most_bit(bitvec[i]);\n    }\n    else {\n      for (i = first; i > 0 && bitvec[i] == 0; i--);\n      if (i <= 0 && bitvec[i] == 0)\n        return npos;\n      return i * bits_uint64 + left_most_bit(bitvec[i]);\n    }\n  }\n\npublic:\n  typedef size_t iterator;\n  typedef size_t reverse_iterator;\n\n  //! 
sign for N/A\n  static const size_t npos = std::numeric_limits<size_t>::max();\n\n  #ifdef FLIP_MODE\n    void flip() {\n      size_t first = find_first();\n      if (first != npos)\n        flip_recursive(first);\n    }\n  #endif\n\n  #ifdef USE_INDICATOR\n  // Accessors\n  size_t getIndicator() const { return indicator; }\n  void setIndicator(size_t index) { indicator = index; }\n  #endif\n\n  // @DynamicBitSet\n  #ifdef _GALOIS_DYNAMIC_BIT_SET_\n  // using Base::size;\n  #else\n  size_t size() const { return num_bits; }\n  #endif\n\n  //! Constructor which initializes to an empty bitset.\n  MRBCBitSet() {\n    resize(numSourcesPerRound);\n    #ifdef USE_INDICATOR\n      indicator = npos;\n    #endif\n  }\n\n  #ifdef _GALOIS_DYNAMIC_BIT_SET_\n  // using Base::test;\n  #else\n  // @DynamicBitSet\n  bool test(size_t index) const {\n    size_t bit_index = get_word(index);\n    uint64_t bit_mask = get_mask(index);\n    return ((bitvec[bit_index] & bit_mask) != 0);\n  }\n  #endif\n\n  /**\n   * Test and then set\n   *\n   * @returns test result before set\n   */\n  bool test_set(size_t pos, bool val=true) {\n    bool const ret = test(pos);\n    if (ret != val) {\n      uint64_t old_val = bitvec[get_word(pos)];\n      if (val) {\n        while (!bitvec[get_word(pos)].compare_exchange_weak(\n          old_val, (old_val | get_mask(pos)), \n          std::memory_order_relaxed));\n      }\n      else {\n        while (!bitvec[get_word(pos)].compare_exchange_weak(\n          old_val, (old_val & ~get_mask(pos)), \n          std::memory_order_relaxed));\n      }\n    }\n    return ret;\n  }\n\n  #ifdef _GALOIS_DYNAMIC_BIT_SET_\n  // using Base::set;\n  #else\n  // @DynamicBitSet\n  void set(size_t index) {\n    size_t bit_index = get_word(index);\n    uint64_t bit_mask = get_mask(index);\n    if ((bitvec[bit_index] & bit_mask) == 0) { // test and set\n      size_t old_val = bitvec[bit_index];\n      while (!bitvec[bit_index].compare_exchange_weak(\n          old_val, old_val 
| bit_mask, \n          std::memory_order_relaxed));\n    }\n  }\n  #endif\n\n  // DISABLED @DynamicBitSet\n  void reset(size_t index) {\n    size_t bit_index = get_word(index);\n    uint64_t bit_mask = get_mask(index);\n    bitvec[bit_index].fetch_and(~bit_mask);\n    // @ Better implementation:\n    // while (!bitvec[bit_index].compare_exchange_weak(\n    //   old_val, old_val & ~bit_mask, \n    //   std::memory_order_relaxed));\n  }\n\n  #ifdef _GALOIS_DYNAMIC_BIT_SET_\n  // using Base::reset;\n  // using Base::resize;\n  #else\n  // @DynamicBitSet\n  void reset() { std::fill(bitvec.begin(), bitvec.end(), uint64_t(0)); }\n\n  // @DynamicBitSet\n  void resize(uint64_t n) {\n    assert(bits_uint64 == 64); // compatibility with other devices\n    num_bits = n;\n    bitvec.resize((n + bits_uint64 - 1) / bits_uint64);\n    reset();\n  }\n  #endif\n\n  bool none() {\n    for (size_t i = 0; i < bitvec.size(); ++i)\n      if (bitvec[i])\n        return false;\n    return true;\n  }\n\n  #ifdef USE_INDICATOR\n    /**\n     * Set a bit with the side-effect updating indicator to the first.\n     */\n    void set_indicator(size_t pos) {\n      #ifdef REVERSE_MODE\n        set(reverse(pos));\n      #else\n        set(pos);\n      #endif\n      if (pos < indicator) {\n        indicator = pos;\n      }\n    }\n\n    bool test_set_indicator(size_t pos, bool val=true) {\n      #ifdef REVERSE_MODE\n        if (test_set(reverse(pos), val)) {\n          if (pos == indicator) {\n            forward_indicator();\n          }\n          return true;\n        }\n        else\n          return false;\n      #else\n        if (test_set(pos, val)) {\n          if (pos == indicator) {\n            forward_indicator();\n          }\n          return true;\n        }\n        else\n          return false;\n      #endif\n    }\n\n    /**\n     * Return true if indicator is npos\n     */\n    bool nposInd() {\n      return indicator == npos;\n    }\n  #endif\n  /**\n   * Returns: the lowest 
index i such as bit i is set, or npos if *this has no on bits.\n   */\n  size_t find_first() const {\n    return find_from_block(0);\n  }\n\n  size_t find_last() const {\n    return find_from_block(bitvec.size() - 1, false);\n  }\n\n  #ifdef REVERSE_MODE\n    inline size_t begin() { return reverse(find_last()); }\n  #else\n    inline size_t begin() { return find_first(); }\n  #endif\n  inline size_t end() { return npos; }\n\n  #if defined(REVERSE_MODE) || defined(FLIP_MODE)\n    inline size_t rbegin() { return reverse(find_first()); }\n  #else\n    inline size_t rbegin() { return find_last(); }\n  #endif\n  inline size_t rend() { return npos; }\n\n  /**\n   * Returns: the lowest index i greater than pos such as bit i is set, or npos if no such index exists.\n   */\n  size_t find_next(size_t pos) const {\n    if (pos == npos) {\n      return find_first();\n    }\n    if (++pos >= size() || size() == 0) {\n      return npos;\n    }\n    size_t curBlock = get_word(pos);\n    auto curOffset = get_offset(pos);\n    auto seg = bitvec[curBlock];\n    while (seg != bitvec[curBlock])\n      seg = bitvec[curBlock];\n    uint64_t res = seg >> curOffset;\n    return res?\n      pos + right_most_bit(res) : find_from_block(++curBlock);\n  }\n\n  size_t find_prev(size_t pos) const{\n    if (pos >= size()) {\n      return find_last();\n    }\n    // Return npos if no bit set\n    if (pos-- == 0 || size() == 0) {\n      return npos;\n    }\n    size_t curBlock = get_word(pos);\n    uint64_t res = bitvec[curBlock] & ((uint64_t(2) << get_offset(pos)) - 1);\n    return res?\n      curBlock * bits_uint64 + left_most_bit(res) : \n      (curBlock?\n        find_from_block(--curBlock, false) : npos);\n  }\n\n  /**\n   * To move iterator to the previous set bit, and return the old value.\n   */\n  inline size_t forward_iterate(size_t& i) {\n    size_t old = i;\n    #ifdef REVERSE_MODE\n      i = reverse(find_prev(reverse(i)));\n    #else\n      i = find_next(i);\n    #endif\n    return 
old;\n  }\n\n  /**\n   * To move iterator to the previous set bit, and return the old value.\n   */\n  inline size_t backward_iterate(size_t& i) {\n    size_t old = i;\n    #ifdef FLIP_MODE\n      i = nposInd()? find_first() : find_next(i);\n      return reverse(old);\n    #else\n      #ifdef REVERSE_MODE\n      i = reverse(find_next(reverse(i)));\n      #else\n      i = find_prev(i);\n      #endif\n      return old;\n    #endif\n  }\n\n  #ifdef USE_INDICATOR\n    /**\n     * To move indicator to the next set bit, and return the old value.\n     */\n    size_t forward_indicator() {\n      return forward_iterate(indicator);\n    }\n\n    /**\n     * To move indicator to the previous set bit, and return the old value.\n     */\n    size_t backward_indicator() {\n      return backward_iterate(indicator);\n    }\n  #endif\n};\n#endif\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/mrbc_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n////////////////////////////////////////////////////////////////////////////////\n// APSP synchronization\n////////////////////////////////////////////////////////////////////////////////\n\nstruct APSPReduce {\n  using ValTy = galois::TupleOfThree<uint32_t, uint32_t, ShortPathType>;\n\n  static ValTy extract(uint32_t, struct NodeData& node) {\n    uint32_t indexToGet = node.roundIndexToSend;\n\n    uint32_t a;\n    uint32_t b;\n    ShortPathType c;\n\n    a = indexToGet;\n    if (indexToGet != infinity) {\n      // get min distance and # shortest paths\n      b = node.sourceData[indexToGet].minDistance;\n      c = node.sourceData[indexToGet].shortPathCount;\n    } else {\n      // no-op\n      b = infinity;\n      c = 0;\n    }\n\n    return ValTy(a, b, c);\n  }\n\n  static bool extract_batch(unsigned, uint8_t*, 
size_t*,\n                            DataCommMode*) { return false; }\n\n  static bool extract_batch(unsigned, uint8_t*) { return false; }\n\n  static bool extract_reset_batch(unsigned, uint8_t*, size_t*,\n                                  DataCommMode*) { return false; }\n\n  static bool extract_reset_batch(unsigned, uint8_t*) { return false; }\n\n  static bool reduce(uint32_t, struct NodeData& node, ValTy y) {\n    uint32_t rIndex = y.first;\n\n    if (rIndex != infinity) {\n      uint32_t rDistance = y.second;\n      ShortPathType rNumPaths = y.third;\n\n      // do updates based on received numbers\n      uint32_t old = galois::min(node.sourceData[rIndex].minDistance, rDistance);\n\n      // reset shortest paths if min dist changed (i.e. don't add to it)\n      if (old > rDistance) {\n        node.dTree.setDistance(rIndex, old, rDistance);\n        assert(rNumPaths != 0);\n        node.sourceData[rIndex].shortPathCount = rNumPaths;\n      } else if (old == rDistance) {\n        // add to short path\n        node.sourceData[rIndex].shortPathCount += rNumPaths;\n      }\n\n      // if received distance is smaller than current candidate for sending, send\n      // it out instead (if tie breaker wins i.e. lower in position)\n      if (node.roundIndexToSend == infinity ||\n          (node.sourceData[rIndex].minDistance <\n            node.sourceData[node.roundIndexToSend].minDistance)) {\n          node.roundIndexToSend = rIndex;\n      } else if (node.sourceData[rIndex].minDistance ==\n                 node.sourceData[node.roundIndexToSend].minDistance) {\n        if (rIndex < node.roundIndexToSend) {\n          node.roundIndexToSend = rIndex;\n        }\n      }\n\n      // return true: if it received a message for some node, then that\n      // node on a mirror needs to get the most updated value (i.e. 
value on\n      // master)\n      return true;\n    }\n\n    return false;\n  }\n\n  static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; }\n\n  static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { return false; }\n\n  // reset the number of shortest paths (the master will now have it)\n  static void reset(uint32_t, struct NodeData &node) {\n    if (node.roundIndexToSend != infinity) {\n      node.sourceData[node.roundIndexToSend].shortPathCount = 0;\n    }\n  }\n\n  static void setVal(uint32_t, struct NodeData & node, ValTy y) {\n    uint32_t rIndex = y.first;\n    if (rIndex != infinity) {\n      uint32_t rDistance = y.second;\n      ShortPathType rNumPaths = y.third;\n\n      // values from master are canonical ones for this round\n      node.roundIndexToSend = rIndex;\n      uint32_t oldDistance = node.sourceData[rIndex].minDistance;\n      node.sourceData[rIndex].minDistance = rDistance;\n      node.sourceData[rIndex].shortPathCount = rNumPaths;\n      node.dTree.setDistance(rIndex, oldDistance, rDistance);\n    }\n  }\n\n  static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; }\n};\n\n////////////////////////////////////////////////////////////////////////////////\n\nstruct DependencyReduce {\n  using ValTy = galois::Pair<uint32_t, float>;\n\n  static ValTy extract(uint32_t, struct NodeData& node) {\n    uint32_t indexToGet = node.roundIndexToSend;\n    float thing;\n    if (indexToGet != infinity) {\n      thing = node.sourceData[indexToGet].dependencyValue;\n    } else {\n      thing = 0;\n    }\n\n    return ValTy(indexToGet, thing);\n  }\n\n  static bool extract_batch(unsigned, uint8_t*, size_t*,\n                            DataCommMode*) { return false; }\n\n  static bool extract_batch(unsigned, uint8_t*) { return false; }\n\n  static bool extract_reset_batch(unsigned, uint8_t*, size_t*,\n                                  DataCommMode*) { return false; }\n\n  static bool 
extract_reset_batch(unsigned, uint8_t*) { return false; }\n\n  static bool reduce(uint32_t, struct NodeData& node, ValTy y) {\n    uint32_t rIndex = y.first;\n\n    if (rIndex != infinity) {\n      if (node.roundIndexToSend != rIndex) {\n        galois::gError(node.roundIndexToSend, \" \", rIndex);\n      }\n      assert(node.roundIndexToSend == rIndex);\n\n      float rToAdd = y.second;\n      galois::atomicAdd(node.sourceData[rIndex].dependencyValue, rToAdd);\n      return true;\n    }\n\n    return false;\n  }\n\n  static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; }\n\n  static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { return false; }\n\n  // reset the dependency value (the master will now have it)\n  static void reset(uint32_t, struct NodeData &node) {\n    if (node.roundIndexToSend != infinity) {\n      node.sourceData[node.roundIndexToSend].dependencyValue = 0;\n    }\n  }\n\n  static void setVal(uint32_t, struct NodeData & node, ValTy y) {\n    uint32_t rIndex = y.first;\n    if (rIndex != infinity) {\n      float rDep = y.second;\n      assert(node.roundIndexToSend == rIndex);\n      node.sourceData[rIndex].dependencyValue = rDep;\n    }\n  }\n\n  static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; }\n};\n\n////////////////////////////////////////////////////////////////////////////////\n// Bitsets\n////////////////////////////////////////////////////////////////////////////////\n\nGALOIS_SYNC_STRUCTURE_BITSET(minDistances);\nGALOIS_SYNC_STRUCTURE_BITSET(dependency);\n"
  },
  {
    "path": "lonestar/analytics/distributed/betweennesscentrality/mrbc_tree.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef _MRBCTREE_\n#define _MRBCTREE_\n#include <boost/container/flat_map.hpp>\n#include \"mrbc_bitset.hh\"\n\nconst uint32_t infinity = std::numeric_limits<uint32_t>::max() >> 2;\n\n/**\n * Binary tree class to make finding a source's message to send out during MRBC\n * easier.\n */\nclass MRBCTree {\n  using BitSet  = MRBCBitSet;\n  using FlatMap = boost::container::flat_map<\n      uint32_t, BitSet, std::less<uint32_t>,\n      galois::gstl::Pow2Alloc<std::pair<uint32_t, BitSet>>>;\n\n  //! map to a bitset of nodes that belong in a particular distance group\n  FlatMap distanceTree;\n  //! number of sources that have already been sent out\n  uint32_t numSentSources;\n  //! number of non-infinity values (i.e. number of sources added already)\n  uint32_t numNonInfinity;\n  //! 
indicates if zero distance has been reached for backward iteration\n  bool zeroReached;\n\n  //! reverse iterator over map\n  using TreeIter = typename FlatMap::reverse_iterator;\n  //! Current iterator for reverse map\n  TreeIter curKey;\n  //! End key for reverse map iterator\n  TreeIter endCurKey;\n\npublic:\n  /*** InitializeIteration\n   * *****************************************************/\n\n  /**\n   * Reset the map, initialize all distances to infinity, and reset the \"sent\"\n   * vector and num sent sources.\n   */\n  void initialize() {\n    distanceTree.clear();\n    // reset number of sent sources\n    numSentSources = 0;\n    // reset number of non infinity sources that exist\n    numNonInfinity = 0;\n    // reset the flag for backward phase\n    zeroReached = false;\n  }\n\n  /**\n   * Assumes you're adding a NEW distance; i.e. there better not be a duplicate\n   * of index somewhere.\n   */\n  void setDistance(uint32_t index, uint32_t newDistance) {\n    // Only for iterstion initialization\n    // assert(newDistance == 0);\n    // assert(distanceTree[newDistance].size() == numSourcesPerRound);\n    distanceTree[newDistance].set_indicator(index);\n    numNonInfinity++;\n  }\n\n  /*** FindMessageToSync\n   * ********************************************************/\n\n  /**\n   * Get the index that needs to be sent out this round given the round number.\n   */\n  uint32_t getIndexToSend(uint32_t roundNumber) {\n    uint32_t distanceToCheck = roundNumber - numSentSources;\n    uint32_t indexToSend     = infinity;\n\n    auto setIter = distanceTree.find(distanceToCheck);\n    if (setIter != distanceTree.end()) {\n      BitSet& setToCheck = setIter->second;\n      auto index         = setToCheck.getIndicator();\n      if (index != setToCheck.npos) {\n        indexToSend = index;\n      }\n    }\n    return indexToSend;\n  }\n\n  /**\n   * Return true if potentially more work exists to be done\n   */\n  bool moreWork() { return numNonInfinity > 
numSentSources; }\n\n  /*** ConfirmMessageToSend\n   * *****************************************************/\n\n  /**\n   * Note that a particular source's message has already been sent in the data\n   * structure and increment the number of sent sources.\n   */\n  void markSent(uint32_t roundNumber) {\n    uint32_t distanceToCheck = roundNumber - numSentSources;\n    BitSet& setToCheck       = distanceTree[distanceToCheck];\n    setToCheck.forward_indicator();\n\n    numSentSources++;\n  }\n\n  /*** SendAPSPMessages\n   * *********************************************************/\n\n  /**\n   * Update the distance map: given an index to update as well as its old\n   * distance, remove the old distance and replace with new distance.\n   */\n  void setDistance(uint32_t index, uint32_t oldDistance, uint32_t newDistance) {\n    if (oldDistance == newDistance) {\n      return;\n    }\n\n    auto setIter = distanceTree.find(oldDistance);\n    bool existed = false;\n    // if it exists, remove it\n    if (setIter != distanceTree.end()) {\n      BitSet& setToChange = setIter->second;\n      existed =\n          setToChange.test_set_indicator(index, false); // Test, set, update\n    }\n\n    // if it didn't exist before, add to number of non-infinity nodes\n    if (!existed) {\n      numNonInfinity++;\n    }\n\n    // asset(distanceTree[newDistance].size() == numSourcesPerRound);\n    distanceTree[newDistance].set_indicator(index);\n  }\n\n  /*** RoundUpdate\n   * **************************************************************/\n\n  /**\n   * Begin the setup for the back propagation phase by setting up the\n   * iterators.\n   */\n  void prepForBackPhase() {\n    curKey    = distanceTree.rbegin();\n    endCurKey = distanceTree.rend();\n\n    if (curKey != endCurKey) {\n      // find non-empty distance if first one happens to be empty\n      if (curKey->second.none()) {\n        for (++curKey; curKey != endCurKey && curKey->second.none(); ++curKey)\n          ;\n      }\n   
 }\n\n    // setup if not empty\n    if (curKey != endCurKey) {\n      BitSet& curSet = curKey->second;\n#ifdef FLIP_MODE\n      curSet.flip();\n#endif\n      curSet.backward_indicator();\n    }\n  }\n\n  /*** BackFindMessageToSend\n   * *****************************************************/\n\n  /**\n   * Given a round number, figure out which index needs to be sent out for the\n   * back propagation phase.\n   */\n  uint32_t backGetIndexToSend(const uint32_t roundNumber,\n                              const uint32_t lastRound) {\n    uint32_t indexToReturn = infinity;\n\n    while (curKey != endCurKey) {\n      uint32_t distance = curKey->first;\n      if ((distance + numSentSources - 1) != (lastRound - roundNumber)) {\n        // round to send not reached yet; get out\n        return infinity;\n      }\n\n      if (distance == 0) {\n        zeroReached = true;\n        return infinity;\n      }\n\n      BitSet& curSet = curKey->second;\n      if (!curSet.nposInd()) {\n        // this number should be sent out this round\n        indexToReturn = curSet.backward_indicator();\n        numSentSources--;\n        break;\n      } else {\n        // set exhausted; go onto next set\n        for (++curKey; curKey != endCurKey && curKey->second.none(); ++curKey)\n          ;\n\n        // if another set exists, set it up, else do nothing\n        if (curKey != endCurKey) {\n          BitSet& nextSet = curKey->second;\n#ifdef FLIP_MODE\n          nextSet.flip();\n#endif\n          nextSet.backward_indicator();\n        }\n      }\n    }\n\n    if (curKey == endCurKey) {\n      assert(numSentSources == 0);\n    }\n\n    return indexToReturn;\n  }\n\n  /**\n   * Returns zeroReached variable.\n   */\n  bool isZeroReached() { return zeroReached; }\n};\n\n#endif\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/CMakeLists.txt",
    "content": "app_dist(bfs_push bfs-push)\nadd_test_dist(bfs-push-dist rmat15 ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr)\n\napp_dist(bfs_pull bfs-pull)\nadd_test_dist(bfs-pull-dist rmat15 ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr)\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/README.md",
    "content": "Breadth First Search\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program performs breadth-first search on an input graph, starting from a\nsource node (specified by -startNode option). \n\nThe algorithm supports both a bulk-synchronous and a bulk-asynchronous\nparallel algorithms. This benchmark consists of two algorithms,\npush- and pull-based. In the push-based algorithm, a node that has been\nupdated from the last round will push out its distance value to its neighbors\nand update them if necessary in each round. In the pull-based algorithm,\nevery node will check its neighbors' distance values and update their own\nvalues based on what they see in each round.\n\nINPUT\n--------------------------------------------------------------------------------\n\nTakes in Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/distributed/bfs; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run on 1 host with start node 0, use the following:\n`./bfs-push-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads>`\n`./bfs-pull-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads>`\n\nTo run on 3 hosts h1, h2, and h3 for start node 0, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./bfs-push-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads>`\n`mpirun -n=3 -hosts=h1,h2,h3 ./bfs-pull-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads>`\n\nTo run on 3 hosts h1, h2, and h3 for start node 10 with an incoming edge cut, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./bfs-push-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -startNode=10 -partition=iec`\n`mpirun -n=3 -hosts=h1,h2,h3 ./bfs-pull-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -startNode=10 -partition=iec`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* The push variant generally performs better in our experience.\n\n* For 16 or fewer hosts/GPUs, we recommend using an\n  **edge-cut** partitioning policy (OEC or IEC) with **synchronous**\n  communication for performance.\n\n* For 32 or more hosts/GPUs, we recommend using the\n  **Cartesian vertex-cut** partitioning policy (CVC) with **asynchronous**\n  communication for performance.\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_pull.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"bfs_pull_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"BFS\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<unsigned int> maxIterations(\"maxIterations\",\n                        
                    cll::desc(\"Maximum iterations: \"\n                                                      \"Default 1000\"),\n                                            cll::init(1000));\n\nstatic cll::opt<uint64_t>\n    src_node(\"startNode\", cll::desc(\"ID of the source node\"), cll::init(0));\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other initialization */\n/******************************************************************************/\n\nconst uint32_t infinity = std::numeric_limits<uint32_t>::max() / 4;\n\nstruct NodeData {\n  uint32_t dist_current;\n};\n\ntypedef galois::graphs::DistGraph<NodeData, void> Graph;\ntypedef typename Graph::GraphNode GNode;\n\ngalois::DynamicBitSet bitset_dist_current;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n#include \"bfs_pull_sync.hh\"\n\n/******************************************************************************/\n/* Algorithm structures */\n/******************************************************************************/\n\nstruct InitializeGraph {\n  const uint32_t& local_infinity;\n  cll::opt<uint64_t>& local_src_node;\n  Graph* graph;\n\n  InitializeGraph(cll::opt<uint64_t>& _src_node, const uint32_t& _infinity,\n                  Graph* _graph)\n      : local_infinity(_infinity), local_src_node(_src_node), graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph_\" +\n                           (syncSubstrate->get_run_identifier()));\n 
     galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph_allNodes_cuda(infinity, src_node, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes),\n          InitializeGraph(src_node, infinity, &_graph), galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n    sdata.dist_current =\n        (graph->getGID(src) == local_src_node) ? 0 : local_infinity;\n  }\n};\n\ntemplate <bool async>\nstruct BFS {\n  Graph* graph;\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  DGTerminatorDetector& active_vertices;\n\n  BFS(Graph* _graph, DGTerminatorDetector& _dga)\n      : graph(_graph), active_vertices(_dga) {}\n\n  void static go(Graph& _graph) {\n    unsigned _num_iterations = 0;\n    DGTerminatorDetector dga;\n\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n    do {\n      syncSubstrate->set_num_round(_num_iterations);\n      dga.reset();\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(\"BFS_\" + (syncSubstrate->get_run_identifier()));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        unsigned int __retval = 0;\n        BFS_nodesWithEdges_cuda(__retval, cuda_ctx);\n        dga += __retval;\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(nodesWithEdges), BFS(&_graph, dga),\n            galois::no_stats(), galois::steal(),\n            
galois::loopname(syncSubstrate->get_run_identifier(\"BFS\").c_str()));\n      }\n      syncSubstrate->sync<writeSource, readDestination, Reduce_min_dist_current,\n                          Bitset_dist_current, async>(\"BFS\");\n\n      galois::runtime::reportStat_Tsum(\n          REGION_NAME, syncSubstrate->get_run_identifier(\"NumWorkItems\"),\n          (unsigned long)dga.read_local());\n      ++_num_iterations;\n    } while ((async || (_num_iterations < maxIterations)) &&\n             dga.reduce(syncSubstrate->get_run_identifier()));\n\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::runtime::reportStat_Single(\n          REGION_NAME,\n          \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n          (unsigned long)_num_iterations);\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& snode = graph->getData(src);\n\n    for (auto jj : graph->edges(src)) {\n      GNode dst         = graph->getEdgeDst(jj);\n      auto& dnode       = graph->getData(dst);\n      uint32_t new_dist = dnode.dist_current + 1;\n      uint32_t old_dist = galois::min(snode.dist_current, new_dist);\n      if (old_dist > new_dist) {\n        bitset_dist_current.set(src);\n        active_vertices += 1;\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n/* Prints total number of nodes visited + max distance */\nstruct BFSSanityCheck {\n  const uint32_t& local_infinity;\n  Graph* graph;\n\n  galois::DGAccumulator<uint64_t>& DGAccumulator_sum;\n  galois::DGReduceMax<uint32_t>& DGMax;\n\n  BFSSanityCheck(const uint32_t& _infinity, Graph* _graph,\n                 galois::DGAccumulator<uint64_t>& dgas,\n                 galois::DGReduceMax<uint32_t>& dgm)\n      : local_infinity(_infinity), graph(_graph), DGAccumulator_sum(dgas),\n        DGMax(dgm) {}\n\n  
void static go(Graph& _graph, galois::DGAccumulator<uint64_t>& dgas,\n                 galois::DGReduceMax<uint32_t>& dgm) {\n    dgas.reset();\n    dgm.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      uint64_t sum;\n      uint32_t max;\n      BFSSanityCheck_masterNodes_cuda(sum, max, infinity, cuda_ctx);\n      dgas += sum;\n      dgm.update(max);\n#else\n      abort();\n#endif\n    } else {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     BFSSanityCheck(infinity, &_graph, dgas, dgm),\n                     galois::no_stats(), galois::loopname(\"BFSSanityCheck\"));\n    }\n\n    uint64_t num_visited  = dgas.reduce();\n    uint32_t max_distance = dgm.reduce();\n\n    // Only host 0 will print the info\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Number of nodes visited from source \", src_node, \" is \",\n                     num_visited, \"\\n\");\n      galois::gPrint(\"Max distance from source \", src_node, \" is \",\n                     max_distance, \"\\n\");\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.dist_current < local_infinity) {\n      DGAccumulator_sum += 1;\n      DGMax.update(src_data.dist_current);\n    }\n  }\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<uint32_t> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).dist_current);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> 
values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_dist_current_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<uint32_t> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\n\nstatic const char* const name = \"BFS pull - Distributed Heterogeneous\";\nstatic const char* const desc = \"BFS pull on Distributed Galois.\";\nstatic const char* const url  = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Source Node ID\", src_node);\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n  }\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, void, false>(&cuda_ctx);\n#else\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, void, false>();\n#endif\n\n  bitset_dist_current.resize(hg->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n\n  InitializeGraph::go((*hg));\n  galois::runtime::getHostBarrier().wait();\n\n  // accumulators for use in operators\n  galois::DGAccumulator<uint64_t> DGAccumulator_sum;\n  galois::DGReduceMax<uint32_t> m;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    
galois::gPrint(\"[\", net.ID, \"] BFS::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      BFS<true>::go(*hg);\n    } else {\n      BFS<false>::go(*hg);\n    }\n    StatTimer_main.stop();\n\n    // sanity check\n    BFSSanityCheck::go(*hg, DGAccumulator_sum, m);\n\n    if ((run + 1) != numRuns) {\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_dist_current_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_dist_current.reset();\n      }\n\n      (*syncSubstrate).set_num_run(run + 1);\n      InitializeGraph::go(*hg);\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<uint32_t> results = makeResults(hg);\n    auto globalIDs                = hg->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"level\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_pull_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = true;\n#include \"bfs_pull_cuda.cuh\"\nstatic const int __tb_BFS = TB_SIZE;\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, const uint32_t  local_infinity, uint64_t local_src_node, uint32_t * p_dist_current)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_dist_current[src] = (graph.node_data[src] == local_src_node) ? 
0 : local_infinity;\n    }\n  }\n  // FP: \"7 -> 8;\n}\n__global__ void BFS_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_dist_current, DynamicBitset& bitset_dist_current, HGAccumulator<unsigned int> active_vertices, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned 
src_index;\n      index_type jj;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      jj = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        uint32_t new_dist;\n        uint32_t old_dist;\n        dst = graph.getAbsDestination(jj);\n        new_dist = p_dist_current[dst] + 1;\n        old_dist = atomicTestMin(&p_dist_current[src], new_dist);\n        if (old_dist > new_dist)\n        {\n          bitset_dist_current.set(src);\n          active_vertices.reduce( 1);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"50 -> 51;\n}\n__global__ void BFS(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_dist_current, DynamicBitset& bitset_dist_current, HGAccumulator<unsigned int> active_vertices, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_BFS;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  active_vertices.thread_entry();\n  // FP: \"7 -> 8;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - 
(__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"8 -> 9;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? true: false);\n    // FP: \"9 -> 10;\n    if (pop)\n    {\n    }\n    // FP: \"11 -> 12;\n    // FP: \"14 -> 15;\n    // FP: \"15 -> 16;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"16 -> 17;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"19 -> 20;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"20 -> 21;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"21 -> 22;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"22 -> 23;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"25 -> 26;\n    // FP: \"26 -> 27;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"27 -> 28;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"28 -> 29;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"31 -> 32;\n    __syncthreads();\n    // FP: \"32 -> 33;\n    while (true)\n    {\n      // FP: \"33 -> 34;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"36 -> 37;\n      __syncthreads();\n      // FP: \"37 -> 38;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"38 -> 39;\n        __syncthreads();\n        // FP: \"39 -> 40;\n        break;\n      }\n      // FP: \"41 -> 42;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"44 -> 45;\n      __syncthreads();\n      // FP: \"45 -> 46;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"46 -> 47;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"49 -> 50;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"50 -> 51;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type jj;\n        jj = ns +_np_j;\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = p_dist_current[dst] + 1;\n          old_dist = atomicTestMin(&p_dist_current[src], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(src);\n            active_vertices.reduce( 1);\n          }\n        }\n      }\n      // FP: \"64 -> 65;\n      __syncthreads();\n    }\n    // FP: \"66 -> 67;\n\n    // FP: \"67 -> 68;\n    {\n      const int warpid = 
threadIdx.x / 32;\n      // FP: \"68 -> 69;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"69 -> 70;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type jj;\n          jj = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            uint32_t new_dist;\n            uint32_t old_dist;\n            dst = graph.getAbsDestination(jj);\n            new_dist = p_dist_current[dst] + 1;\n            old_dist = atomicTestMin(&p_dist_current[src], new_dist);\n            if (old_dist > new_dist)\n            {\n              bitset_dist_current.set(src);\n              active_vertices.reduce( 1);\n            }\n          }\n        }\n      }\n      // FP: \"93 -> 94;\n      __syncthreads();\n      // FP: \"94 -> 95;\n    }\n\n    // FP: \"95 -> 96;\n    __syncthreads();\n    // FP: \"96 -> 97;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"97 -> 98;\n    while (_np.work())\n    {\n      // FP: \"98 -> 99;\n      int _np_i =0;\n      // FP: \"99 -> 100;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"100 -> 101;\n      __syncthreads();\n      // FP: \"101 -> 102;\n\n      // FP: \"102 -> 
103;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type jj;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        jj= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = p_dist_current[dst] + 1;\n          old_dist = atomicTestMin(&p_dist_current[src], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(src);\n            active_vertices.reduce( 1);\n          }\n        }\n      }\n      // FP: \"117 -> 118;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"118 -> 119;\n      __syncthreads();\n    }\n    // FP: \"120 -> 121;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"122 -> 123;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"123 -> 124;\n}\n__global__ void BFSSanityCheck(CSRGraph graph, unsigned int __begin, unsigned int __end, const uint32_t  local_infinity, uint32_t * p_dist_current, HGAccumulator<uint64_t> DGAccumulator_sum, HGReduceMax<uint32_t> DGMax)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage DGAccumulator_sum_ts;\n  __shared__ cub::BlockReduce<uint32_t, TB_SIZE>::TempStorage DGMax_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  DGAccumulator_sum.thread_entry();\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  DGMax.thread_entry();\n  // FP: \"5 -> 6;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_dist_current[src] < local_infinity)\n      {\n        
DGAccumulator_sum.reduce( 1);\n        DGMax.reduce(p_dist_current[src]);\n      }\n    }\n  }\n  // FP: \"14 -> 15;\n  DGAccumulator_sum.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_sum_ts);\n  // FP: \"15 -> 16;\n  DGMax.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(DGMax_ts);\n  // FP: \"16 -> 17;\n}\nvoid InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, const uint32_t & local_infinity, uint64_t local_src_node, struct CUDA_Context*  ctx)\n{\n  t_work.init_thread_work(ctx->gg.nnodes);\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, local_infinity, local_src_node, ctx->dist_current.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(const uint32_t & local_infinity, uint64_t local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(const uint32_t & local_infinity, uint64_t local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_nodesWithEdges_cuda(const uint32_t & local_infinity, uint64_t local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFS_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, 
threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  BFS <<<blocks, __tb_BFS>>>(ctx->gg, __begin, __end, ctx->dist_current.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      BFS_TB_LB <<<blocks, __tb_BFS>>>(ctx->gg, __begin, __end, ctx->dist_current.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid BFS_allNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFS_cuda(0, ctx->gg.nnodes, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFS_masterNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFS_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFS_nodesWithEdges_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFS_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFSSanityCheck_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & DGAccumulator_sum, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  
HGAccumulator<uint64_t> _DGAccumulator_sum;\n  HGReduceMax<uint32_t> _DGMax;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> DGAccumulator_sumval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(DGAccumulator_sumval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  Shared<uint32_t> DGMaxval  = Shared<uint32_t>(1);\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  *(DGMaxval.cpu_wr_ptr()) = 0;\n  // FP: \"11 -> 12;\n  _DGMax.rv = DGMaxval.gpu_wr_ptr();\n  // FP: \"12 -> 13;\n  BFSSanityCheck <<<blocks, threads>>>(ctx->gg, __begin, __end, local_infinity, ctx->dist_current.data.gpu_wr_ptr(), _DGAccumulator_sum, _DGMax);\n  cudaDeviceSynchronize();\n  // FP: \"13 -> 14;\n  check_cuda_kernel;\n  // FP: \"14 -> 15;\n  DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr());\n  // FP: \"15 -> 16;\n  DGMax = *(DGMaxval.cpu_rd_ptr());\n  // FP: \"16 -> 17;\n}\nvoid BFSSanityCheck_allNodes_cuda(uint64_t & DGAccumulator_sum, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFSSanityCheck_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFSSanityCheck_masterNodes_cuda(uint64_t & DGAccumulator_sum, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFSSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFSSanityCheck_nodesWithEdges_cuda(uint64_t & DGAccumulator_sum, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFSSanityCheck_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_pull_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"bfs_pull_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<uint32_t> dist_current;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->dist_current, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->dist_current, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->dist_current.data.zero_gpu();\n}\n\nvoid get_bitset_dist_current_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->dist_current.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->dist_current.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->dist_current, begin, end);\n}\n\nuint32_t get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_rd_ptr();\n\treturn dist_current[LID];\n}\n\nvoid set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tdist_current[LID] = v;\n}\n\nvoid 
add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tdist_current[LID] += v;\n}\n\nbool min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tif (dist_current[LID] > v){\n\t\tdist_current[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_current, from_id, v);\n}\n\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_current, from_id, v);\n}\n\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_current, from_id, v, i);\n}\n\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) 
{\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_dist_current_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->dist_current, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_pull_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_dist_current_cuda(struct CUDA_Context* ctx,\n                                  uint64_t* bitset_compute);\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end);\nuint32_t get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nbool min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             size_t* v_size,\n                                             DataCommMode* data_mode);\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n                                            uint32_t i);\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n    
                                        size_t* v_size,\n                                            DataCommMode* data_mode,\n                                            uint32_t i);\nvoid batch_set_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_set_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_add_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_min_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_reset_node_dist_current_cuda(struct CUDA_Context* ctx, size_t begin,\n                                        size_t end, uint32_t v);\n\nvoid BFS_cuda(unsigned int __begin, unsigned int __end,\n              unsigned int& active_vertices, struct CUDA_Context* ctx);\nvoid BFSSanityCheck_cuda(unsigned int __begin, unsigned int __end,\n                         uint64_t& DGAccumulator_sum, uint32_t& DGMax,\n                         const uint32_t& local_infinity,\n                         struct CUDA_Context* ctx);\nvoid 
BFSSanityCheck_allNodes_cuda(uint64_t& DGAccumulator_sum, uint32_t& DGMax,\n                                  const uint32_t& local_infinity,\n                                  struct CUDA_Context* ctx);\nvoid BFSSanityCheck_masterNodes_cuda(uint64_t& DGAccumulator_sum,\n                                     uint32_t& DGMax,\n                                     const uint32_t& local_infinity,\n                                     struct CUDA_Context* ctx);\nvoid BFSSanityCheck_nodesWithEdges_cuda(uint64_t& DGAccumulator_sum,\n                                        uint32_t& DGMax,\n                                        const uint32_t& local_infinity,\n                                        struct CUDA_Context* ctx);\nvoid BFS_allNodes_cuda(unsigned int& active_vertices, struct CUDA_Context* ctx);\nvoid BFS_masterNodes_cuda(unsigned int& active_vertices,\n                          struct CUDA_Context* ctx);\nvoid BFS_nodesWithEdges_cuda(unsigned int& active_vertices,\n                             struct CUDA_Context* ctx);\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          const uint32_t& local_infinity,\n                          uint64_t local_src_node, struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(const uint32_t& local_infinity,\n                                   uint64_t local_src_node,\n                                   struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(const uint32_t& local_infinity,\n                                      uint64_t local_src_node,\n                                      struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(const uint32_t& local_infinity,\n                                         uint64_t local_src_node,\n                                         struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_pull_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"bfs_pull_cuda.cuh\", system = False)], parse = False),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const uint32_t ', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('uint32_t *', 'p_dist_current')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_dist_current[src] = (graph.node_data[src] == local_src_node) ? 0 : local_infinity\"]),\n]),\n]),\n]),\nKernel(\"BFS\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_dist_current'), ('DynamicBitset&', 'bitset_dist_current'), ('HGAccumulator<unsigned int>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"jj\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(jj)\"]),\nCDecl([(\"uint32_t\", \"new_dist\", \"\")]),\nCBlock([\"new_dist = p_dist_current[dst] + 1\"]),\nCDecl([(\"uint32_t\", \"old_dist\", \"\")]),\nCBlock([\"old_dist = atomicTestMin(&p_dist_current[src], new_dist)\"]),\nIf(\"old_dist > new_dist\",\n[\nCBlock([\"bitset_dist_current.set(src)\"]),\nCBlock([\"active_vertices.reduce( 1)\"]),\n]),\n]),\n),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"BFSSanityCheck\", 
[G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const uint32_t ', 'local_infinity'), ('uint32_t *', 'p_dist_current'), ('HGAccumulator<uint64_t>', 'DGAccumulator_sum'), ('HGReduceMax<uint32_t>', 'DGMax')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"DGAccumulator_sum_ts\", \"\")]),\nCBlock([\"DGAccumulator_sum.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<uint32_t, TB_SIZE>::TempStorage\", \"DGMax_ts\", \"\")]),\nCBlock([\"DGMax.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_dist_current[src] < local_infinity\",\n[\nCBlock([\"DGAccumulator_sum.reduce( 1)\"]),\nCBlock([\"DGMax.reduce(p_dist_current[src])\"]),\n]),\n]),\n]),\nCBlock([\"DGAccumulator_sum.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_sum_ts)\"], parse = False),\nCBlock([\"DGMax.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(DGMax_ts)\"], parse = False),\n]),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_infinity\", \"local_src_node\", \"ctx->dist_current.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct 
CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"BFS_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"BFS\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"*(ctx->dist_current.is_updated.gpu_rd_ptr())\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"BFS_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFS_cuda(0, ctx->gg.nnodes, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"BFS_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFS_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"BFS_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFS_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx)\"]),\n], host = 
True),\nKernel(\"BFSSanityCheck_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint64_t &', 'DGAccumulator_sum'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"DGAccumulator_sumval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_DGAccumulator_sum\", \"\")]),\nCBlock([\"*(DGAccumulator_sumval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<uint32_t>\", \"DGMaxval\", \" = Shared<uint32_t>(1)\")]),\nCDecl([(\"HGReduceMax<uint32_t>\", \"_DGMax\", \"\")]),\nCBlock([\"*(DGMaxval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGMax.rv = DGMaxval.gpu_wr_ptr()\"]),\nInvoke(\"BFSSanityCheck\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_infinity\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"_DGAccumulator_sum\", \"_DGMax\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr())\"]),\nCBlock([\"DGMax = *(DGMaxval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"BFSSanityCheck_allNodes_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFSSanityCheck_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, DGMax, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"BFSSanityCheck_masterNodes_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFSSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, DGMax, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"BFSSanityCheck_nodesWithEdges_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint32_t &', 'DGMax'), ('const 
uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFSSanityCheck_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, DGMax, local_infinity, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_pull_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(dist_current, unsigned int);\nGALOIS_SYNC_STRUCTURE_REDUCE_MIN(dist_current, unsigned int);\nGALOIS_SYNC_STRUCTURE_BITSET(dist_current);\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_push.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/gstl.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"bfs_push_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"BFS\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<unsigned int> maxIterations(\"maxIterations\",\n                        
                    cll::desc(\"Maximum iterations: \"\n                                                      \"Default 1000\"),\n                                            cll::init(1000));\n\nstatic cll::opt<uint64_t>\n    src_node(\"startNode\", cll::desc(\"ID of the source node\"), cll::init(0));\n\nstatic cll::opt<uint32_t>\n    delta(\"delta\",\n          cll::desc(\"Shift value for the delta step (default value 0)\"),\n          cll::init(0));\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other initialization */\n/******************************************************************************/\n\nconst uint32_t infinity = std::numeric_limits<uint32_t>::max() / 4;\n\nstruct NodeData {\n  std::atomic<uint32_t> dist_current;\n  uint32_t dist_old;\n};\n\ngalois::DynamicBitSet bitset_dist_current;\n\ntypedef galois::graphs::DistGraph<NodeData, void> Graph;\ntypedef typename Graph::GraphNode GNode;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n#include \"bfs_push_sync.hh\"\n\n/******************************************************************************/\n/* Algorithm structures */\n/******************************************************************************/\n\nstruct InitializeGraph {\n  const uint32_t& local_infinity;\n  cll::opt<uint64_t>& local_src_node;\n  Graph* graph;\n\n  InitializeGraph(cll::opt<uint64_t>& _src_node, const uint32_t& _infinity,\n                  Graph* _graph)\n      : local_infinity(_infinity), local_src_node(_src_node), graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& allNodes = 
_graph.allNodesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\n          syncSubstrate->get_run_identifier(\"InitializeGraph_\"));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph_allNodes_cuda(infinity, src_node, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          InitializeGraph{src_node, infinity, &_graph}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n    sdata.dist_current =\n        (graph->getGID(src) == local_src_node) ? 0 : local_infinity;\n    sdata.dist_old =\n        (graph->getGID(src) == local_src_node) ? 0 : local_infinity;\n  }\n};\n\ntemplate <bool async>\nstruct FirstItr_BFS {\n  Graph* graph;\n\n  FirstItr_BFS(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    uint32_t __begin, __end;\n    if (_graph.isLocal(src_node)) {\n      __begin = _graph.getLID(src_node);\n      __end   = __begin + 1;\n    } else {\n      __begin = 0;\n      __end   = 0;\n    }\n    syncSubstrate->set_num_round(0);\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(syncSubstrate->get_run_identifier(\"BFS\"));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      FirstItr_BFS_cuda(__begin, __end, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      // one node\n      galois::do_all(\n          galois::iterate(__begin, __end), FirstItr_BFS{&_graph},\n          galois::no_stats(),\n          
galois::loopname(syncSubstrate->get_run_identifier(\"BFS\").c_str()));\n    }\n\n    syncSubstrate->sync<writeDestination, readSource, Reduce_min_dist_current,\n                        Bitset_dist_current, async>(\"BFS\");\n\n    galois::runtime::reportStat_Tsum(\n        REGION_NAME, syncSubstrate->get_run_identifier(\"NumWorkItems\"),\n        __end - __begin);\n  }\n\n  void operator()(GNode src) const {\n    NodeData& snode = graph->getData(src);\n    snode.dist_old  = snode.dist_current;\n\n    for (auto jj : graph->edges(src)) {\n      GNode dst         = graph->getEdgeDst(jj);\n      auto& dnode       = graph->getData(dst);\n      uint32_t new_dist = 1 + snode.dist_current;\n      uint32_t old_dist = galois::atomicMin(dnode.dist_current, new_dist);\n      if (old_dist > new_dist)\n        bitset_dist_current.set(dst);\n    }\n  }\n};\n\ntemplate <bool async>\nstruct BFS {\n  uint32_t local_priority;\n  Graph* graph;\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n  using DGAccumulatorTy = galois::DGAccumulator<unsigned int>;\n\n  DGTerminatorDetector& active_vertices;\n  DGAccumulatorTy& work_edges;\n\n  BFS(uint32_t _local_priority, Graph* _graph, DGTerminatorDetector& _dga,\n      DGAccumulatorTy& _work_edges)\n      : local_priority(_local_priority), graph(_graph), active_vertices(_dga),\n        work_edges(_work_edges) {}\n\n  void static go(Graph& _graph) {\n    FirstItr_BFS<async>::go(_graph);\n\n    unsigned _num_iterations = 1;\n\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    uint32_t priority;\n    if (delta == 0)\n      priority = std::numeric_limits<uint32_t>::max();\n    else\n      priority = 0;\n    DGTerminatorDetector dga;\n    DGAccumulatorTy work_edges;\n\n    do {\n\n      // if (work_edges.reduce() == 0)\n      priority += delta;\n\n      
syncSubstrate->set_num_round(_num_iterations);\n      dga.reset();\n      work_edges.reset();\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(syncSubstrate->get_run_identifier(\"BFS\"));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        unsigned int __retval  = 0;\n        unsigned int __retval2 = 0;\n        BFS_nodesWithEdges_cuda(__retval, __retval2, priority, cuda_ctx);\n        dga += __retval;\n        work_edges += __retval2;\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(nodesWithEdges),\n            BFS(priority, &_graph, dga, work_edges), galois::steal(),\n            galois::no_stats(),\n            galois::loopname(syncSubstrate->get_run_identifier(\"BFS\").c_str()));\n      }\n      syncSubstrate->sync<writeDestination, readSource, Reduce_min_dist_current,\n                          Bitset_dist_current, async>(\"BFS\");\n\n      galois::runtime::reportStat_Tsum(\n          REGION_NAME, syncSubstrate->get_run_identifier(\"NumWorkItems\"),\n          (unsigned long)work_edges.read_local());\n\n      ++_num_iterations;\n    } while ((async || (_num_iterations < maxIterations)) &&\n             dga.reduce(syncSubstrate->get_run_identifier()));\n\n    galois::runtime::reportStat_Tmax(\n        REGION_NAME,\n        \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n        (unsigned long)_num_iterations);\n  }\n\n  void operator()(GNode src) const {\n    NodeData& snode = graph->getData(src);\n\n    if (snode.dist_old > snode.dist_current) {\n      active_vertices += 1;\n\n      if (local_priority > snode.dist_current) {\n        snode.dist_old = snode.dist_current;\n\n        for (auto jj : graph->edges(src)) {\n          work_edges += 1;\n\n          GNode dst         = graph->getEdgeDst(jj);\n          auto& dnode     
  = graph->getData(dst);\n          uint32_t new_dist = 1 + snode.dist_current;\n          uint32_t old_dist = galois::atomicMin(dnode.dist_current, new_dist);\n          if (old_dist > new_dist)\n            bitset_dist_current.set(dst);\n        }\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n/* Prints total number of nodes visited + max distance */\nstruct BFSSanityCheck {\n  const uint32_t& local_infinity;\n  Graph* graph;\n\n  galois::DGAccumulator<uint64_t>& DGAccumulator_sum;\n  galois::DGReduceMax<uint32_t>& DGMax;\n\n  BFSSanityCheck(const uint32_t& _infinity, Graph* _graph,\n                 galois::DGAccumulator<uint64_t>& dgas,\n                 galois::DGReduceMax<uint32_t>& dgm)\n      : local_infinity(_infinity), graph(_graph), DGAccumulator_sum(dgas),\n        DGMax(dgm) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<uint64_t>& dgas,\n                 galois::DGReduceMax<uint32_t>& dgm) {\n    dgas.reset();\n    dgm.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      uint64_t sum;\n      uint32_t max;\n      BFSSanityCheck_masterNodes_cuda(sum, max, infinity, cuda_ctx);\n      dgas += sum;\n      dgm.update(max);\n#else\n      abort();\n#endif\n    } else {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     BFSSanityCheck(infinity, &_graph, dgas, dgm),\n                     galois::no_stats(), galois::loopname(\"BFSSanityCheck\"));\n    }\n\n    uint64_t num_visited  = dgas.reduce();\n    uint32_t max_distance = dgm.reduce();\n\n    // Only host 0 will print the info\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Number of nodes visited from source \", src_node, \" is \",\n       
              num_visited, \"\\n\");\n      galois::gPrint(\"Max distance from source \", src_node, \" is \",\n                     max_distance, \"\\n\");\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.dist_current < local_infinity) {\n      DGAccumulator_sum += 1;\n      DGMax.update(src_data.dist_current);\n    }\n  }\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<uint32_t> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).dist_current);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_dist_current_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<uint32_t> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\n\nconstexpr static const char* const name =\n    \"BFS - Distributed Heterogeneous with \"\n    \"worklist.\";\nconstexpr static const char* const desc = \"BFS on Distributed Galois.\";\nconstexpr static const char* const url  = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, 
argv, name, desc, url);\n\n  const auto& net = galois::runtime::getSystemNetworkInterface();\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n    galois::runtime::reportParam(REGION_NAME, \"Source Node ID\", src_node);\n  }\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, void>(&cuda_ctx);\n#else\n  std::tie(hg, syncSubstrate) = distGraphInitialization<NodeData, void>();\n#endif\n  // bitset comm setup\n  bitset_dist_current.resize(hg->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n\n  InitializeGraph::go((*hg));\n  galois::runtime::getHostBarrier().wait();\n\n  // accumulators for use in operators\n  galois::DGAccumulator<uint64_t> DGAccumulator_sum;\n  galois::DGReduceMax<uint32_t> m;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] BFS::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      BFS<true>::go(*hg);\n    } else {\n      BFS<false>::go(*hg);\n    }\n    StatTimer_main.stop();\n\n    // sanity check\n    BFSSanityCheck::go(*hg, DGAccumulator_sum, m);\n\n    if ((run + 1) != numRuns) {\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_dist_current_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_dist_current.reset();\n      }\n\n      syncSubstrate->set_num_run(run + 1);\n      InitializeGraph::go((*hg));\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<uint32_t> results = makeResults(hg);\n    auto globalIDs                = 
hg->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"level\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_push_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = true;\n#include \"bfs_push_cuda.cuh\"\nstatic const int __tb_BFS = TB_SIZE;\nstatic const int __tb_FirstItr_BFS = TB_SIZE;\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, const uint32_t  local_infinity, uint64_t local_src_node, uint32_t * p_dist_current, uint32_t * p_dist_old)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_dist_current[src] = (graph.node_data[src] == local_src_node) ? 0 : local_infinity;\n      p_dist_old[src] = (graph.node_data[src] == local_src_node) ? 
0 : local_infinity;\n    }\n  }\n  // FP: \"8 -> 9;\n}\n__global__ void FirstItr_BFS_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_dist_current, uint32_t * p_dist_old, DynamicBitset& bitset_dist_current, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n     
 index_type jj;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      jj = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        uint32_t new_dist;\n        uint32_t old_dist;\n        dst = graph.getAbsDestination(jj);\n        new_dist = 1 + p_dist_current[src];\n        old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n        if (old_dist > new_dist)\n        {\n          bitset_dist_current.set(dst);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"49 -> 50;\n}\n__global__ void FirstItr_BFS(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_dist_current, uint32_t * p_dist_old, DynamicBitset& bitset_dist_current, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_FirstItr_BFS;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"6 -> 
7;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? true: false);\n    // FP: \"7 -> 8;\n    if (pop)\n    {\n      p_dist_old[src]  = p_dist_current[src];\n    }\n    // FP: \"10 -> 11;\n    // FP: \"13 -> 14;\n    // FP: \"14 -> 15;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"15 -> 16;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"18 -> 19;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"19 -> 20;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"20 -> 21;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"21 -> 22;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"24 -> 25;\n    // FP: \"25 -> 26;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"26 -> 27;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"27 -> 28;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"30 -> 31;\n    __syncthreads();\n    // FP: \"31 -> 32;\n    while (true)\n    {\n      // FP: \"32 -> 33;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"35 -> 36;\n      __syncthreads();\n      // FP: \"36 -> 37;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"37 -> 38;\n        __syncthreads();\n        // FP: \"38 -> 39;\n        break;\n      }\n      // FP: \"40 -> 41;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"43 -> 44;\n      __syncthreads();\n      // FP: \"44 -> 45;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"45 -> 46;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"48 -> 49;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"49 -> 50;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type jj;\n        jj = ns +_np_j;\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = 1 + p_dist_current[src];\n          old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(dst);\n          }\n        }\n      }\n      // FP: \"62 -> 63;\n      __syncthreads();\n    }\n    // FP: \"64 -> 65;\n\n    // FP: \"65 -> 66;\n    {\n      const int warpid = threadIdx.x / 32;\n      // FP: \"66 -> 
67;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"67 -> 68;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type jj;\n          jj = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            uint32_t new_dist;\n            uint32_t old_dist;\n            dst = graph.getAbsDestination(jj);\n            new_dist = 1 + p_dist_current[src];\n            old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n            if (old_dist > new_dist)\n            {\n              bitset_dist_current.set(dst);\n            }\n          }\n        }\n      }\n      // FP: \"90 -> 91;\n      __syncthreads();\n      // FP: \"91 -> 92;\n    }\n\n    // FP: \"92 -> 93;\n    __syncthreads();\n    // FP: \"93 -> 94;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"94 -> 95;\n    while (_np.work())\n    {\n      // FP: \"95 -> 96;\n      int _np_i =0;\n      // FP: \"96 -> 97;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"97 -> 98;\n      __syncthreads();\n      // FP: \"98 -> 99;\n\n      // FP: \"99 -> 100;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += 
BLKSIZE)\n      {\n        index_type jj;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        jj= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = 1 + p_dist_current[src];\n          old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(dst);\n          }\n        }\n      }\n      // FP: \"113 -> 114;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"114 -> 115;\n      __syncthreads();\n    }\n    // FP: \"116 -> 117;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"118 -> 119;\n}\n__global__ void BFS_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_priority, uint32_t * p_dist_current, uint32_t * p_dist_old, DynamicBitset& bitset_dist_current, HGAccumulator<unsigned int> active_vertices, HGAccumulator<unsigned int> work_items, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  
// FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type jj;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      jj = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        uint32_t new_dist;\n        uint32_t old_dist;\n        work_items.reduce( 1);\n        dst = graph.getAbsDestination(jj);\n        new_dist = 1 + p_dist_current[src];\n        old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n        if (old_dist > new_dist)\n        {\n          bitset_dist_current.set(dst);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"50 -> 51;\n}\n__global__ void BFS(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_priority, uint32_t * p_dist_current, uint32_t * p_dist_old, DynamicBitset& bitset_dist_current, HGAccumulator<unsigned int> active_vertices, HGAccumulator<unsigned int> work_items, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool 
enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_BFS;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage work_items_ts;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  active_vertices.thread_entry();\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  work_items.thread_entry();\n  // FP: \"9 -> 10;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"10 -> 11;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? 
true: false);\n    // FP: \"11 -> 12;\n    if (pop)\n    {\n      if (p_dist_old[src] > p_dist_current[src])\n      {\n        active_vertices.reduce( 1);\n        if (local_priority > p_dist_current[src])\n        {\n          p_dist_old[src] = p_dist_current[src];\n        }\n        else\n        {\n          pop = false;\n        }\n      }\n      else\n      {\n        pop = false;\n      }\n    }\n    // FP: \"19 -> 20;\n    // FP: \"22 -> 23;\n    // FP: \"23 -> 24;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"24 -> 25;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"27 -> 28;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"28 -> 29;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"29 -> 30;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"30 -> 31;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"33 -> 34;\n    // FP: \"34 -> 35;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"35 -> 36;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"36 -> 37;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"39 -> 40;\n    __syncthreads();\n    // FP: \"40 -> 41;\n    while (true)\n    {\n      // FP: \"41 -> 42;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"44 -> 45;\n      __syncthreads();\n      // FP: \"45 -> 46;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"46 -> 47;\n        __syncthreads();\n        // FP: \"47 -> 48;\n        break;\n      }\n      // FP: \"49 -> 50;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"52 -> 53;\n      __syncthreads();\n      // FP: \"53 -> 54;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"54 -> 55;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"57 -> 58;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"58 -> 59;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type jj;\n        jj = ns +_np_j;\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          work_items.reduce( 1);\n          dst = graph.getAbsDestination(jj);\n          new_dist = 1 + p_dist_current[src];\n          old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(dst);\n          }\n        }\n      }\n      // FP: \"72 -> 73;\n      __syncthreads();\n    }\n    // FP: \"74 -> 75;\n\n    // FP: \"75 -> 76;\n    {\n      const int warpid = 
threadIdx.x / 32;\n      // FP: \"76 -> 77;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"77 -> 78;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type jj;\n          jj = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            uint32_t new_dist;\n            uint32_t old_dist;\n            work_items.reduce( 1);\n            dst = graph.getAbsDestination(jj);\n            new_dist = 1 + p_dist_current[src];\n            old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n            if (old_dist > new_dist)\n            {\n              bitset_dist_current.set(dst);\n            }\n          }\n        }\n      }\n      // FP: \"101 -> 102;\n      __syncthreads();\n      // FP: \"102 -> 103;\n    }\n\n    // FP: \"103 -> 104;\n    __syncthreads();\n    // FP: \"104 -> 105;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"105 -> 106;\n    while (_np.work())\n    {\n      // FP: \"106 -> 107;\n      int _np_i =0;\n      // FP: \"107 -> 108;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"108 -> 109;\n      __syncthreads();\n      // FP: \"109 -> 110;\n\n      // FP: \"110 
-> 111;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type jj;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        jj= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          work_items.reduce( 1);\n          dst = graph.getAbsDestination(jj);\n          new_dist = 1 + p_dist_current[src];\n          old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(dst);\n          }\n        }\n      }\n      // FP: \"125 -> 126;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"126 -> 127;\n      __syncthreads();\n    }\n    // FP: \"128 -> 129;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"132 -> 133;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"133 -> 134;\n  work_items.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(work_items_ts);\n  // FP: \"134 -> 135;\n}\n__global__ void BFSSanityCheck(CSRGraph graph, unsigned int __begin, unsigned int __end, const uint32_t  local_infinity, uint32_t * p_dist_current, HGAccumulator<uint64_t> DGAccumulator_sum, HGReduceMax<uint32_t> DGMax)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage DGAccumulator_sum_ts;\n  __shared__ cub::BlockReduce<uint32_t, TB_SIZE>::TempStorage DGMax_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  DGAccumulator_sum.thread_entry();\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  DGMax.thread_entry();\n  // FP: \"5 -> 6;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    
if (pop)\n    {\n      if (p_dist_current[src] < local_infinity)\n      {\n        DGAccumulator_sum.reduce( 1);\n        DGMax.reduce(p_dist_current[src]);\n      }\n    }\n  }\n  // FP: \"14 -> 15;\n  DGAccumulator_sum.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_sum_ts);\n  // FP: \"15 -> 16;\n  DGMax.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(DGMax_ts);\n  // FP: \"16 -> 17;\n}\nvoid InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, const uint32_t & local_infinity, uint64_t local_src_node, struct CUDA_Context*  ctx)\n{\n  t_work.init_thread_work(ctx->gg.nnodes);\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, local_infinity, local_src_node, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(const uint32_t & local_infinity, uint64_t local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(const uint32_t & local_infinity, uint64_t local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_nodesWithEdges_cuda(const uint32_t & local_infinity, uint64_t local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid FirstItr_BFS_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: 
\"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  FirstItr_BFS <<<blocks, __tb_FirstItr_BFS>>>(ctx->gg, __begin, __end, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      FirstItr_BFS_TB_LB <<<blocks, __tb_FirstItr_BFS>>>(ctx->gg, __begin, __end, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid FirstItr_BFS_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  FirstItr_BFS_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid FirstItr_BFS_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  FirstItr_BFS_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid FirstItr_BFS_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  FirstItr_BFS_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFS_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, unsigned int & work_items, uint32_t local_priority, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  HGAccumulator<unsigned int> _work_items;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  
*(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  Shared<unsigned int> work_itemsval  = Shared<unsigned int>(1);\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  *(work_itemsval.cpu_wr_ptr()) = 0;\n  // FP: \"11 -> 12;\n  _work_items.rv = work_itemsval.gpu_wr_ptr();\n  // FP: \"12 -> 13;\n  BFS <<<blocks, __tb_BFS>>>(ctx->gg, __begin, __end, local_priority, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), _active_vertices, _work_items, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      BFS_TB_LB <<<blocks, __tb_BFS>>>(ctx->gg, __begin, __end, local_priority, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), _active_vertices, _work_items, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"13 -> 14;\n  check_cuda_kernel;\n  // FP: \"14 -> 15;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"15 -> 16;\n  work_items = *(work_itemsval.cpu_rd_ptr());\n  // FP: \"16 -> 17;\n}\nvoid BFS_allNodes_cuda(unsigned int & active_vertices, unsigned int & work_items, uint32_t local_priority, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFS_cuda(0, ctx->gg.nnodes, active_vertices, work_items, local_priority, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFS_masterNodes_cuda(unsigned int & active_vertices, unsigned int & work_items, uint32_t local_priority, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFS_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, work_items, local_priority, 
ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFS_nodesWithEdges_cuda(unsigned int & active_vertices, unsigned int & work_items, uint32_t local_priority, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFS_cuda(0, ctx->numNodesWithEdges, active_vertices, work_items, local_priority, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFSSanityCheck_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & DGAccumulator_sum, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint64_t> _DGAccumulator_sum;\n  HGReduceMax<uint32_t> _DGMax;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> DGAccumulator_sumval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(DGAccumulator_sumval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  Shared<uint32_t> DGMaxval  = Shared<uint32_t>(1);\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  *(DGMaxval.cpu_wr_ptr()) = 0;\n  // FP: \"11 -> 12;\n  _DGMax.rv = DGMaxval.gpu_wr_ptr();\n  // FP: \"12 -> 13;\n  BFSSanityCheck <<<blocks, threads>>>(ctx->gg, __begin, __end, local_infinity, ctx->dist_current.data.gpu_wr_ptr(), _DGAccumulator_sum, _DGMax);\n  cudaDeviceSynchronize();\n  // FP: \"13 -> 14;\n  check_cuda_kernel;\n  // FP: \"14 -> 15;\n  DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr());\n  // FP: \"15 -> 16;\n  DGMax = *(DGMaxval.cpu_rd_ptr());\n  // FP: \"16 -> 17;\n}\nvoid BFSSanityCheck_allNodes_cuda(uint64_t & DGAccumulator_sum, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFSSanityCheck_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFSSanityCheck_masterNodes_cuda(uint64_t & DGAccumulator_sum, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // 
FP: \"1 -> 2;\n  BFSSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid BFSSanityCheck_nodesWithEdges_cuda(uint64_t & DGAccumulator_sum, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  BFSSanityCheck_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_push_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"bfs_push_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<uint32_t> dist_current;\n\tstruct CUDA_Context_Field<uint32_t> dist_old;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->dist_current, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->dist_old, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->dist_current, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->dist_old, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->dist_current.data.zero_gpu();\n\tctx->dist_old.data.zero_gpu();\n}\n\nvoid get_bitset_dist_current_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->dist_current.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->dist_current.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->dist_current, begin, end);\n}\n\nuint32_t get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_rd_ptr();\n\treturn dist_current[LID];\n}\n\nvoid 
set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tdist_current[LID] = v;\n}\n\nvoid add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tdist_current[LID] += v;\n}\n\nbool min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tif (dist_current[LID] > v){\n\t\tdist_current[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_current, from_id, v);\n}\n\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_current, from_id, v);\n}\n\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_current, from_id, v, i);\n}\n\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_current, 
from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_dist_current_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->dist_current, begin, end, v);\n}\n\nvoid get_bitset_dist_old_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->dist_old.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_dist_old_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->dist_old.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_dist_old_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) 
{\n\treset_bitset_field(&ctx->dist_old, begin, end);\n}\n\nuint32_t get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *dist_old = ctx->dist_old.data.cpu_rd_ptr();\n\treturn dist_old[LID];\n}\n\nvoid set_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_old = ctx->dist_old.data.cpu_wr_ptr();\n\tdist_old[LID] = v;\n}\n\nvoid add_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_old = ctx->dist_old.data.cpu_wr_ptr();\n\tdist_old[LID] += v;\n}\n\nbool min_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_old = ctx->dist_old.data.cpu_wr_ptr();\n\tif (dist_old[LID] > v){\n\t\tdist_old[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_old, from_id, v);\n}\n\nvoid batch_get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_old, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_old, from_id, v);\n}\n\nvoid batch_get_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_old, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_old, from_id, v, i);\n}\n\nvoid batch_get_reset_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* 
v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_old, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_set_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_add_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_min_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_dist_old_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->dist_old, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_push_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_dist_current_cuda(struct CUDA_Context* ctx,\n                                  uint64_t* bitset_compute);\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end);\nuint32_t get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nbool min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             size_t* v_size,\n                                             DataCommMode* data_mode);\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n                                            uint32_t i);\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n    
                                        size_t* v_size,\n                                            DataCommMode* data_mode,\n                                            uint32_t i);\nvoid batch_set_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_set_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_add_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_min_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_reset_node_dist_current_cuda(struct CUDA_Context* ctx, size_t begin,\n                                        size_t end, uint32_t v);\n\nvoid get_bitset_dist_old_cuda(struct CUDA_Context* ctx,\n                              uint64_t* bitset_compute);\nvoid bitset_dist_old_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_dist_old_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end);\nuint32_t get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_dist_old_cuda(struct 
CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid add_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nbool min_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid batch_get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v);\nvoid batch_get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, size_t* v_size,\n                                  DataCommMode* data_mode);\nvoid batch_get_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         size_t* v_size,\n                                         DataCommMode* data_mode);\nvoid batch_get_reset_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        uint32_t i);\nvoid batch_get_reset_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        size_t* v_size, DataCommMode* data_mode,\n                                        uint32_t i);\nvoid batch_set_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_set_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid 
batch_add_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_min_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_dist_old_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end, uint32_t v);\n\nvoid BFS_cuda(unsigned int __begin, unsigned int __end,\n              unsigned int& active_vertices, unsigned int& work_items,\n              uint32_t local_priority, struct CUDA_Context* ctx);\nvoid BFSSanityCheck_cuda(unsigned int __begin, unsigned int __end,\n                         uint64_t& DGAccumulator_sum, uint32_t& DGMax,\n                         const uint32_t& local_infinity,\n                         struct CUDA_Context* ctx);\nvoid BFSSanityCheck_allNodes_cuda(uint64_t& DGAccumulator_sum, uint32_t& DGMax,\n                                  const uint32_t& local_infinity,\n                                  struct CUDA_Context* ctx);\nvoid BFSSanityCheck_masterNodes_cuda(uint64_t& DGAccumulator_sum,\n                                     uint32_t& DGMax,\n                                     const uint32_t& local_infinity,\n                                     struct CUDA_Context* ctx);\nvoid BFSSanityCheck_nodesWithEdges_cuda(uint64_t& DGAccumulator_sum,\n                                        uint32_t& DGMax,\n                                        const uint32_t& local_infinity,\n                                        struct CUDA_Context* ctx);\nvoid BFS_allNodes_cuda(unsigned int& active_vertices, unsigned int& work_items,\n                       uint32_t local_priority, struct CUDA_Context* ctx);\nvoid 
BFS_masterNodes_cuda(unsigned int& active_vertices,\n                          unsigned int& work_items, uint32_t local_priority,\n                          struct CUDA_Context* ctx);\nvoid BFS_nodesWithEdges_cuda(unsigned int& active_vertices,\n                             unsigned int& work_items, uint32_t local_priority,\n                             struct CUDA_Context* ctx);\nvoid FirstItr_BFS_cuda(unsigned int __begin, unsigned int __end,\n                       struct CUDA_Context* ctx);\nvoid FirstItr_BFS_allNodes_cuda(struct CUDA_Context* ctx);\nvoid FirstItr_BFS_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid FirstItr_BFS_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          const uint32_t& local_infinity,\n                          uint64_t local_src_node, struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(const uint32_t& local_infinity,\n                                   uint64_t local_src_node,\n                                   struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(const uint32_t& local_infinity,\n                                      uint64_t local_src_node,\n                                      struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(const uint32_t& local_infinity,\n                                         uint64_t local_src_node,\n                                         struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_push_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"bfs_push_cuda.cuh\", system = False)], parse = False),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const uint32_t ', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('uint32_t *', 'p_dist_current'), ('uint32_t *', 'p_dist_old')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_dist_current[src] = (graph.node_data[src] == local_src_node) ? 0 : local_infinity\"]),\nCBlock([\"p_dist_old[src] = (graph.node_data[src] == local_src_node) ? 0 : local_infinity\"]),\n]),\n]),\n]),\nKernel(\"FirstItr_BFS\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_dist_current'), ('uint32_t *', 'p_dist_old'), ('DynamicBitset&', 'bitset_dist_current')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_dist_old[src]  = p_dist_current[src]\"]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"jj\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(jj)\"]),\nCDecl([(\"uint32_t\", \"new_dist\", \"\")]),\nCBlock([\"new_dist = 1 + p_dist_current[src]\"]),\nCDecl([(\"uint32_t\", \"old_dist\", \"\")]),\nCBlock([\"old_dist = atomicTestMin(&p_dist_current[dst], new_dist)\"]),\nIf(\"old_dist > new_dist\",\n[\nCBlock([\"bitset_dist_current.set(dst)\"]),\n]),\n]),\n),\n]),\n]),\nKernel(\"BFS\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t', 'local_priority'), ('uint32_t *', 'p_dist_current'), ('uint32_t *', 'p_dist_old'), 
('DynamicBitset&', 'bitset_dist_current'), ('HGAccumulator<unsigned int>', 'active_vertices'), ('HGAccumulator<unsigned int>', 'work_items')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"work_items_ts\", \"\")]),\nCBlock([\"work_items.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_dist_old[src] > p_dist_current[src]\",\n[\nCBlock([\"active_vertices.reduce( 1)\"]),\nIf(\"local_priority > p_dist_current[src]\",\n[\nCBlock([\"p_dist_old[src] = p_dist_current[src]\"]),\n], [ CBlock([\"pop = false\"]), ]),\n], [ CBlock([\"pop = false\"]), ]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"jj\", G.edges(\"src\"),\n[\nCBlock([\"work_items.reduce( 1)\"]),\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(jj)\"]),\nCDecl([(\"uint32_t\", \"new_dist\", \"\")]),\nCBlock([\"new_dist = 1 + p_dist_current[src]\"]),\nCDecl([(\"uint32_t\", \"old_dist\", \"\")]),\nCBlock([\"old_dist = atomicTestMin(&p_dist_current[dst], new_dist)\"]),\nIf(\"old_dist > new_dist\",\n[\nCBlock([\"bitset_dist_current.set(dst)\"]),\n]),\n]),\n),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = False),\nCBlock([\"work_items.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(work_items_ts)\"], parse = False),\n]),\nKernel(\"BFSSanityCheck\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const uint32_t ', 'local_infinity'), ('uint32_t *', 'p_dist_current'), ('HGAccumulator<uint64_t>', 'DGAccumulator_sum'), ('HGReduceMax<uint32_t>', 'DGMax')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, 
TB_SIZE>::TempStorage\", \"DGAccumulator_sum_ts\", \"\")]),\nCBlock([\"DGAccumulator_sum.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<uint32_t, TB_SIZE>::TempStorage\", \"DGMax_ts\", \"\")]),\nCBlock([\"DGMax.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_dist_current[src] < local_infinity\",\n[\nCBlock([\"DGAccumulator_sum.reduce( 1)\"]),\nCBlock([\"DGMax.reduce(p_dist_current[src])\"]),\n]),\n]),\n]),\nCBlock([\"DGAccumulator_sum.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_sum_ts)\"], parse = False),\nCBlock([\"DGMax.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(DGMax_ts)\"], parse = False),\n]),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_infinity\", \"local_src_node\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"ctx->dist_old.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('const uint32_t &', 
'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"FirstItr_BFS_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"FirstItr_BFS\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"ctx->dist_old.data.gpu_wr_ptr()\", \"*(ctx->dist_current.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"FirstItr_BFS_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"FirstItr_BFS_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"FirstItr_BFS_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"FirstItr_BFS_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"FirstItr_BFS_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"FirstItr_BFS_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"BFS_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'active_vertices'), ('unsigned int &', 'work_items'), ('uint32_t', 'local_priority'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<unsigned int>\", \"work_itemsval\", \" = Shared<unsigned 
int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_work_items\", \"\")]),\nCBlock([\"*(work_itemsval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_work_items.rv = work_itemsval.gpu_wr_ptr()\"]),\nInvoke(\"BFS\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_priority\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"ctx->dist_old.data.gpu_wr_ptr()\", \"*(ctx->dist_current.is_updated.gpu_rd_ptr())\", \"_active_vertices\", \"_work_items\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\nCBlock([\"work_items = *(work_itemsval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"BFS_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('unsigned int &', 'work_items'), ('uint32_t', 'local_priority'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFS_cuda(0, ctx->gg.nnodes, active_vertices, work_items, local_priority, ctx)\"]),\n], host = True),\nKernel(\"BFS_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('unsigned int &', 'work_items'), ('uint32_t', 'local_priority'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFS_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, work_items, local_priority, ctx)\"]),\n], host = True),\nKernel(\"BFS_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('unsigned int &', 'work_items'), ('uint32_t', 'local_priority'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFS_cuda(0, ctx->numNodesWithEdges, active_vertices, work_items, local_priority, ctx)\"]),\n], host = True),\nKernel(\"BFSSanityCheck_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint64_t &', 'DGAccumulator_sum'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"DGAccumulator_sumval\", \" = 
Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_DGAccumulator_sum\", \"\")]),\nCBlock([\"*(DGAccumulator_sumval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<uint32_t>\", \"DGMaxval\", \" = Shared<uint32_t>(1)\")]),\nCDecl([(\"HGReduceMax<uint32_t>\", \"_DGMax\", \"\")]),\nCBlock([\"*(DGMaxval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGMax.rv = DGMaxval.gpu_wr_ptr()\"]),\nInvoke(\"BFSSanityCheck\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_infinity\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"_DGAccumulator_sum\", \"_DGMax\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr())\"]),\nCBlock([\"DGMax = *(DGMaxval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"BFSSanityCheck_allNodes_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFSSanityCheck_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, DGMax, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"BFSSanityCheck_masterNodes_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFSSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, DGMax, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"BFSSanityCheck_nodesWithEdges_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"BFSSanityCheck_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, DGMax, local_infinity, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/bfs/bfs_push_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(dist_current, unsigned int);\nGALOIS_SYNC_STRUCTURE_REDUCE_MIN(dist_current, unsigned int);\nGALOIS_SYNC_STRUCTURE_BITSET(dist_current);\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/CMakeLists.txt",
    "content": "app_dist(cc_push connected-components-push)\nadd_test_dist(connected-components-push-dist rmat15 ${BASEINPUT}/scalefree/symmetric/rmat15.sgr -symmetricGraph)\n\napp_dist(cc_pull connected-components-pull)\nadd_test_dist(connected-components-pull-dist rmat15 ${BASEINPUT}/scalefree/symmetric/rmat15.sgr -symmetricGraph)\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/README.md",
    "content": "Connected Components\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nFind all connected components of an undirected (symmetric) graph. Set the same \nlabel to nodes which belong to the same component.\n\nThe algorithm supports both a bulk-synchronous and a bulk-asynchronous\nparallel algorithms. This benchmark consists of two algorithms,\npush- and pull-based.  In the push variant of the algorithm, nodes with a label\nthat has changed from the last round will push this label out to its neighbors\nand update their labels with a min operation. In the pull variant of the\nalgorithm, all nodes check their neighbors to see if they have a lower label,\nand they will adopt the lowest label among its neighbors/itself as its component.\n\nINPUT\n--------------------------------------------------------------------------------\n\nTakes in symmetric Galois .gr graphs. You must specify the -symmetricGraph\nflag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/distributed/connected-components/; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run on 1 machine, use the following:\n`./connected-components-push-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph`\n`./connected-components-pull-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph`\n\nTo run on 3 hosts h1, h2, and h3, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./connected-components-push-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph`\n`mpirun -n=3 -hosts=h1,h2,h3 ./connected-components-pull-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph`\n\nTo run on 3 hosts h1, h2, and h3 with an incoming edge cut, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./connected-components-push-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph -partition=iec`\n`mpirun -n=3 -hosts=h1,h2,h3 ./connected-components-pull-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph -partition=iec`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* The push variant generally performs better in our experience.\n\n* For 16 or fewer hosts/GPUs, we recommend using an\n  **edge-cut** partitioning policy (OEC or IEC) with **synchronous**\n  communication for performance.\n\n* For 32 or more hosts/GPUs, we recommend using the\n  **Cartesian vertex-cut** partitioning policy (CVC) with **asynchronous**\n  communication for performance.\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_pull.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"cc_pull_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"ConnectedComp\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\n\nnamespace cll = llvm::cl;\nstatic cll::opt<unsigned int> maxIterations(\"maxIterations\",\n                 
                           cll::desc(\"Maximum iterations: \"\n                                                      \"Default 1000\"),\n                                            cll::init(1000));\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other initialization */\n/******************************************************************************/\n\nstruct NodeData {\n  uint32_t comp_current;\n};\n\ngalois::DynamicBitSet bitset_comp_current;\n\ntypedef galois::graphs::DistGraph<NodeData, void> Graph;\ntypedef typename Graph::GraphNode GNode;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n#include \"cc_pull_sync.hh\"\n\n/******************************************************************************/\n/* Algorithm structures */\n/******************************************************************************/\n\nstruct InitializeGraph {\n  Graph* graph;\n\n  InitializeGraph(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n\n      InitializeGraph_allNodes_cuda(cuda_ctx);\n\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          InitializeGraph{&_graph}, 
galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata    = graph->getData(src);\n    sdata.comp_current = graph->getGID(src);\n  }\n};\n\ntemplate <bool async>\nstruct ConnectedComp {\n  Graph* graph;\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  DGTerminatorDetector& active_vertices;\n\n  ConnectedComp(Graph* _graph, DGTerminatorDetector& _dga)\n      : graph(_graph), active_vertices(_dga) {}\n\n  void static go(Graph& _graph) {\n    unsigned _num_iterations = 0;\n    DGTerminatorDetector dga;\n\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n    do {\n      syncSubstrate->set_num_round(_num_iterations);\n      dga.reset();\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(\"ConnectedComp_\" +\n                             (syncSubstrate->get_run_identifier()));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        unsigned int __retval = 0;\n        ConnectedComp_nodesWithEdges_cuda(__retval, cuda_ctx);\n        dga += __retval;\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(nodesWithEdges), ConnectedComp(&_graph, dga),\n            galois::steal(), galois::no_stats(),\n            galois::loopname(\n                syncSubstrate->get_run_identifier(\"ConnectedComp\").c_str()));\n      }\n\n      syncSubstrate->sync<writeSource, readDestination, Reduce_min_comp_current,\n                          Bitset_comp_current, async>(\"ConnectedComp\");\n\n      galois::runtime::reportStat_Tsum(\n          REGION_NAME, \"NumWorkItems_\" + 
(syncSubstrate->get_run_identifier()),\n          (unsigned long)dga.read_local());\n      ++_num_iterations;\n    } while ((async || (_num_iterations < maxIterations)) &&\n             dga.reduce(syncSubstrate->get_run_identifier()));\n\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::runtime::reportStat_Single(\n          REGION_NAME,\n          \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n          (unsigned long)_num_iterations);\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& snode = graph->getData(src);\n\n    for (auto jj : graph->edges(src)) {\n      GNode dst         = graph->getEdgeDst(jj);\n      auto& dnode       = graph->getData(dst);\n      uint32_t new_comp = dnode.comp_current;\n      uint32_t old_comp = galois::min(snode.comp_current, new_comp);\n      if (old_comp > new_comp) {\n        bitset_comp_current.set(src);\n        active_vertices += 1;\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n/* Get/print the number of components */\nstruct ConnectedCompSanityCheck {\n  Graph* graph;\n\n  galois::DGAccumulator<uint64_t>& active_vertices;\n\n  ConnectedCompSanityCheck(Graph* _graph, galois::DGAccumulator<uint64_t>& _dga)\n      : graph(_graph), active_vertices(_dga) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<uint64_t>& dga) {\n    dga.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      uint64_t sum;\n      ConnectedCompSanityCheck_masterNodes_cuda(sum, cuda_ctx);\n      dga += sum;\n#else\n      abort();\n#endif\n    } else {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     ConnectedCompSanityCheck(&_graph, dga), galois::no_stats(),\n          
           galois::loopname(\"ConnectedCompSanityCheck\"));\n    }\n\n    uint64_t num_components = dga.reduce();\n\n    // Only node 0 will print the number visited\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Number of components is \", num_components, \"\\n\");\n    }\n  }\n\n  /* Check if a node's component is the same as its ID.\n   * if yes, then increment an accumulator */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.comp_current == graph->getGID(src)) {\n      active_vertices += 1;\n    }\n  }\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<uint32_t> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).comp_current);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_comp_current_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<uint32_t> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"ConnectedComp Pull - Distributed \"\n                             
             \"Heterogeneous\";\nconstexpr static const char* const desc = \"ConnectedComp pull on Distributed \"\n                                          \"Galois.\";\nconstexpr static const char* const url = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n  }\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      symmetricDistGraphInitialization<NodeData, void>(&cuda_ctx);\n#else\n  std::tie(hg, syncSubstrate) =\n      symmetricDistGraphInitialization<NodeData, void>();\n#endif\n\n  bitset_comp_current.resize(hg->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n  InitializeGraph::go((*hg));\n  galois::runtime::getHostBarrier().wait();\n\n  galois::DGAccumulator<uint64_t> active_vertices64;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] ConnectedComp::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      ConnectedComp<true>::go(*hg);\n    } else {\n      ConnectedComp<false>::go(*hg);\n    }\n    StatTimer_main.stop();\n\n    ConnectedCompSanityCheck::go(*hg, active_vertices64);\n\n    if ((run + 1) != numRuns) {\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_comp_current_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_comp_current.reset();\n      }\n\n      (*syncSubstrate).set_num_run(run + 1);\n      InitializeGraph::go((*hg));\n      
galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<uint32_t> results = makeResults(hg);\n    auto globalIDs                = hg->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"component\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_pull_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = true;\n#include \"cc_pull_cuda.cuh\"\nstatic const int __tb_ConnectedComp = TB_SIZE;\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_comp_current[src] = graph.node_data[src];\n    }\n  }\n  // FP: \"7 -> 8;\n}\n__global__ void ConnectedComp_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current, DynamicBitset& bitset_comp_current, HGAccumulator<unsigned int> active_vertices, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int 
offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type jj;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      jj = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        uint32_t new_comp;\n        uint32_t old_comp;\n        dst = graph.getAbsDestination(jj);\n        new_comp = p_comp_current[dst];\n        old_comp = atomicTestMin(&p_comp_current[src], new_comp);\n        if (old_comp > new_comp)\n        {\n          bitset_comp_current.set(src);\n          active_vertices.reduce( 1);\n   
     }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"50 -> 51;\n}\n__global__ void ConnectedComp(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current, DynamicBitset& bitset_comp_current, HGAccumulator<unsigned int> active_vertices, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_ConnectedComp;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  active_vertices.thread_entry();\n  // FP: \"7 -> 8;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"8 -> 9;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? 
true: false);\n    // FP: \"9 -> 10;\n    if (pop)\n    {\n    }\n    // FP: \"11 -> 12;\n    // FP: \"14 -> 15;\n    // FP: \"15 -> 16;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"16 -> 17;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"19 -> 20;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"20 -> 21;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"21 -> 22;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"22 -> 23;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"25 -> 26;\n    // FP: \"26 -> 27;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"27 -> 28;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"28 -> 29;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"31 -> 32;\n    __syncthreads();\n    // FP: \"32 -> 33;\n    while (true)\n    {\n      // FP: \"33 -> 34;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"36 -> 37;\n      __syncthreads();\n      // FP: \"37 -> 38;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"38 -> 39;\n        __syncthreads();\n        // FP: \"39 -> 40;\n        break;\n      }\n      // FP: \"41 -> 42;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"44 -> 45;\n      __syncthreads();\n      // FP: \"45 -> 46;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"46 -> 47;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"49 -> 50;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"50 -> 51;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type jj;\n        jj = ns +_np_j;\n        {\n          index_type dst;\n          uint32_t new_comp;\n          uint32_t old_comp;\n          dst = graph.getAbsDestination(jj);\n          new_comp = p_comp_current[dst];\n          old_comp = atomicTestMin(&p_comp_current[src], new_comp);\n          if (old_comp > new_comp)\n          {\n            bitset_comp_current.set(src);\n            active_vertices.reduce( 1);\n          }\n        }\n      }\n      // FP: \"64 -> 65;\n      __syncthreads();\n    }\n    // FP: \"66 -> 67;\n\n    // FP: \"67 -> 68;\n    {\n      const int warpid = 
threadIdx.x / 32;\n      // FP: \"68 -> 69;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"69 -> 70;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type jj;\n          jj = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            uint32_t new_comp;\n            uint32_t old_comp;\n            dst = graph.getAbsDestination(jj);\n            new_comp = p_comp_current[dst];\n            old_comp = atomicTestMin(&p_comp_current[src], new_comp);\n            if (old_comp > new_comp)\n            {\n              bitset_comp_current.set(src);\n              active_vertices.reduce( 1);\n            }\n          }\n        }\n      }\n      // FP: \"93 -> 94;\n      __syncthreads();\n      // FP: \"94 -> 95;\n    }\n\n    // FP: \"95 -> 96;\n    __syncthreads();\n    // FP: \"96 -> 97;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"97 -> 98;\n    while (_np.work())\n    {\n      // FP: \"98 -> 99;\n      int _np_i =0;\n      // FP: \"99 -> 100;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"100 -> 101;\n      __syncthreads();\n      // FP: \"101 -> 102;\n\n      // FP: \"102 -> 103;\n 
     for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type jj;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        jj= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          uint32_t new_comp;\n          uint32_t old_comp;\n          dst = graph.getAbsDestination(jj);\n          new_comp = p_comp_current[dst];\n          old_comp = atomicTestMin(&p_comp_current[src], new_comp);\n          if (old_comp > new_comp)\n          {\n            bitset_comp_current.set(src);\n            active_vertices.reduce( 1);\n          }\n        }\n      }\n      // FP: \"117 -> 118;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"118 -> 119;\n      __syncthreads();\n    }\n    // FP: \"120 -> 121;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"122 -> 123;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"123 -> 124;\n}\n__global__ void ConnectedCompSanityCheck(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current, HGAccumulator<uint64_t> active_vertices)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  active_vertices.thread_entry();\n  // FP: \"3 -> 4;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_comp_current[src] == graph.node_data[src])\n      {\n        active_vertices.reduce( 1);\n      }\n    }\n  }\n  // FP: \"11 -> 12;\n  active_vertices.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(active_vertices_ts);\n  // FP: \"12 -> 13;\n}\nvoid 
InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  t_work.init_thread_work(ctx->gg.nnodes);\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedComp_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  ConnectedComp <<<blocks, __tb_ConnectedComp>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr(), *(ctx->comp_current.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      
t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      ConnectedComp_TB_LB <<<blocks, __tb_ConnectedComp>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr(), *(ctx->comp_current.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid ConnectedComp_allNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedComp_cuda(0, ctx->gg.nnodes, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedComp_masterNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedComp_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedComp_nodesWithEdges_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedComp_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedCompSanityCheck_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint64_t> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> active_verticesval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  ConnectedCompSanityCheck <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr(), _active_vertices);\n  cudaDeviceSynchronize();\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: 
\"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid ConnectedCompSanityCheck_allNodes_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedCompSanityCheck_cuda(0, ctx->gg.nnodes, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedCompSanityCheck_masterNodes_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedCompSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedCompSanityCheck_nodesWithEdges_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedCompSanityCheck_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_pull_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"cc_pull_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<uint32_t> comp_current;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->comp_current, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->comp_current, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->comp_current.data.zero_gpu();\n}\n\nvoid get_bitset_comp_current_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->comp_current.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_comp_current_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->comp_current.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_comp_current_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->comp_current, begin, end);\n}\n\nuint32_t get_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *comp_current = ctx->comp_current.data.cpu_rd_ptr();\n\treturn comp_current[LID];\n}\n\nvoid set_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *comp_current = ctx->comp_current.data.cpu_wr_ptr();\n\tcomp_current[LID] = v;\n}\n\nvoid 
add_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *comp_current = ctx->comp_current.data.cpu_wr_ptr();\n\tcomp_current[LID] += v;\n}\n\nbool min_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *comp_current = ctx->comp_current.data.cpu_wr_ptr();\n\tif (comp_current[LID] > v){\n\t\tcomp_current[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->comp_current, from_id, v);\n}\n\nvoid batch_get_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->comp_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->comp_current, from_id, v);\n}\n\nvoid batch_get_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->comp_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->comp_current, from_id, v, i);\n}\n\nvoid batch_get_reset_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->comp_current, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) 
{\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_set_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_add_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_min_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_comp_current_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->comp_current, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_pull_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_comp_current_cuda(struct CUDA_Context* ctx,\n                                  uint64_t* bitset_compute);\nvoid bitset_comp_current_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_comp_current_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end);\nuint32_t get_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid add_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nbool min_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid batch_get_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             size_t* v_size,\n                                             DataCommMode* data_mode);\nvoid batch_get_reset_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n                                            uint32_t i);\nvoid batch_get_reset_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n    
                                        size_t* v_size,\n                                            DataCommMode* data_mode,\n                                            uint32_t i);\nvoid batch_set_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_set_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_add_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_min_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_reset_node_comp_current_cuda(struct CUDA_Context* ctx, size_t begin,\n                                        size_t end, uint32_t v);\n\nvoid ConnectedComp_cuda(unsigned int __begin, unsigned int __end,\n                        unsigned int& active_vertices,\n                        struct CUDA_Context* ctx);\nvoid ConnectedCompSanityCheck_cuda(unsigned int __begin, unsigned int __end,\n                                   uint64_t& active_vertices,\n                                   struct CUDA_Context* ctx);\nvoid 
ConnectedCompSanityCheck_allNodes_cuda(uint64_t& active_vertices,\n                                            struct CUDA_Context* ctx);\nvoid ConnectedCompSanityCheck_masterNodes_cuda(uint64_t& active_vertices,\n                                               struct CUDA_Context* ctx);\nvoid ConnectedCompSanityCheck_nodesWithEdges_cuda(uint64_t& active_vertices,\n                                                  struct CUDA_Context* ctx);\nvoid ConnectedComp_allNodes_cuda(unsigned int& active_vertices,\n                                 struct CUDA_Context* ctx);\nvoid ConnectedComp_masterNodes_cuda(unsigned int& active_vertices,\n                                    struct CUDA_Context* ctx);\nvoid ConnectedComp_nodesWithEdges_cuda(unsigned int& active_vertices,\n                                       struct CUDA_Context* ctx);\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_pull_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"cc_pull_cuda.cuh\", system = False)], parse = False),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_comp_current')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_comp_current[src] = graph.node_data[src]\"]),\n]),\n]),\n]),\nKernel(\"ConnectedComp\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_comp_current'), ('DynamicBitset&', 'bitset_comp_current'), ('HGAccumulator<unsigned int>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"jj\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(jj)\"]),\nCDecl([(\"uint32_t\", \"new_comp\", \"\")]),\nCBlock([\"new_comp = p_comp_current[dst]\"]),\nCDecl([(\"uint32_t\", \"old_comp\", \"\")]),\nCBlock([\"old_comp = atomicTestMin(&p_comp_current[src], new_comp)\"]),\nIf(\"old_comp > new_comp\",\n[\nCBlock([\"bitset_comp_current.set(src)\"]),\nCBlock([\"active_vertices.reduce( 1)\"]),\n]),\n]),\n),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"ConnectedCompSanityCheck\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_comp_current'), 
('HGAccumulator<uint64_t>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_comp_current[src] == graph.node_data[src]\",\n[\nCBlock([\"active_vertices.reduce( 1)\"]),\n]),\n]),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->comp_current.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"ConnectedComp_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", 
\"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"ConnectedComp\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->comp_current.data.gpu_wr_ptr()\", \"*(ctx->comp_current.is_updated.gpu_rd_ptr())\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"ConnectedComp_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedComp_cuda(0, ctx->gg.nnodes, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedComp_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedComp_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedComp_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedComp_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedCompSanityCheck_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"active_verticesval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"ConnectedCompSanityCheck\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->comp_current.data.gpu_wr_ptr()\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = 
True),\nKernel(\"ConnectedCompSanityCheck_allNodes_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedCompSanityCheck_cuda(0, ctx->gg.nnodes, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedCompSanityCheck_masterNodes_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedCompSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedCompSanityCheck_nodesWithEdges_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedCompSanityCheck_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_pull_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(comp_current, uint32_t);\nGALOIS_SYNC_STRUCTURE_REDUCE_MIN(comp_current, uint32_t);\nGALOIS_SYNC_STRUCTURE_BITSET(comp_current);\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_push.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"cc_push_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"ConnectedComp\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\n\nnamespace cll = llvm::cl;\nstatic cll::opt<unsigned int> maxIterations(\"maxIterations\",\n                 
                           cll::desc(\"Maximum iterations: \"\n                                                      \"Default 1000\"),\n                                            cll::init(1000));\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other initialization */\n/******************************************************************************/\n\nstruct NodeData {\n  std::atomic<uint32_t> comp_current;\n  uint32_t comp_old;\n};\n\ngalois::DynamicBitSet bitset_comp_current;\n\ntypedef galois::graphs::DistGraph<NodeData, void> Graph;\ntypedef typename Graph::GraphNode GNode;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n#include \"cc_push_sync.hh\"\n\n/******************************************************************************/\n/* Algorithm structures */\n/******************************************************************************/\n\nstruct InitializeGraph {\n  Graph* graph;\n\n  InitializeGraph(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph_allNodes_cuda(cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          
InitializeGraph{&_graph}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata    = graph->getData(src);\n    sdata.comp_current = graph->getGID(src);\n    sdata.comp_old     = graph->getGID(src);\n  }\n};\n\ntemplate <bool async>\nstruct FirstItr_ConnectedComp {\n  Graph* graph;\n  FirstItr_ConnectedComp(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n    syncSubstrate->set_num_round(0);\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"ConnectedComp_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      StatTimer_cuda.stop();\n      FirstItr_ConnectedComp_nodesWithEdges_cuda(cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(nodesWithEdges), FirstItr_ConnectedComp{&_graph},\n          galois::steal(), galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"ConnectedComp\").c_str()));\n    }\n\n    syncSubstrate->sync<writeDestination, readSource, Reduce_min_comp_current,\n                        Bitset_comp_current, async>(\"ConnectedComp\");\n\n    galois::runtime::reportStat_Tsum(\n        REGION_NAME, \"NumWorkItems_\" + (syncSubstrate->get_run_identifier()),\n        _graph.allNodesRange().end() - _graph.allNodesRange().begin());\n  }\n\n  void operator()(GNode src) const {\n    NodeData& snode = graph->getData(src);\n    snode.comp_old  = snode.comp_current;\n\n    for (auto jj : graph->edges(src)) {\n      GNode dst         = graph->getEdgeDst(jj);\n      auto& dnode       = graph->getData(dst);\n      
uint32_t new_dist = snode.comp_current;\n      uint32_t old_dist = galois::atomicMin(dnode.comp_current, new_dist);\n      if (old_dist > new_dist)\n        bitset_comp_current.set(dst);\n    }\n  }\n};\n\ntemplate <bool async>\nstruct ConnectedComp {\n  Graph* graph;\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  DGTerminatorDetector& active_vertices;\n\n  ConnectedComp(Graph* _graph, DGTerminatorDetector& _dga)\n      : graph(_graph), active_vertices(_dga) {}\n\n  void static go(Graph& _graph) {\n    using namespace galois::worklists;\n\n    FirstItr_ConnectedComp<async>::go(_graph);\n\n    unsigned _num_iterations = 1;\n    DGTerminatorDetector dga;\n\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    do {\n      syncSubstrate->set_num_round(_num_iterations);\n      dga.reset();\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(\"ConnectedComp_\" +\n                             (syncSubstrate->get_run_identifier()));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        unsigned int __retval = 0;\n        ConnectedComp_nodesWithEdges_cuda(__retval, cuda_ctx);\n        dga += __retval;\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(nodesWithEdges), ConnectedComp(&_graph, dga),\n            galois::no_stats(), galois::steal(),\n            galois::loopname(\n                syncSubstrate->get_run_identifier(\"ConnectedComp\").c_str()));\n      }\n\n      syncSubstrate->sync<writeDestination, readSource, Reduce_min_comp_current,\n                          Bitset_comp_current, async>(\"ConnectedComp\");\n\n      galois::runtime::reportStat_Tsum(\n          REGION_NAME, \"NumWorkItems_\" + 
(syncSubstrate->get_run_identifier()),\n          (unsigned long)dga.read_local());\n      ++_num_iterations;\n    } while ((async || (_num_iterations < maxIterations)) &&\n             dga.reduce(syncSubstrate->get_run_identifier()));\n\n    galois::runtime::reportStat_Tmax(\n        REGION_NAME,\n        \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n        (unsigned long)_num_iterations);\n  }\n\n  void operator()(GNode src) const {\n    NodeData& snode = graph->getData(src);\n\n    if (snode.comp_old > snode.comp_current) {\n      snode.comp_old = snode.comp_current;\n\n      for (auto jj : graph->edges(src)) {\n        active_vertices += 1;\n\n        GNode dst         = graph->getEdgeDst(jj);\n        auto& dnode       = graph->getData(dst);\n        uint32_t new_dist = snode.comp_current;\n        uint32_t old_dist = galois::atomicMin(dnode.comp_current, new_dist);\n        if (old_dist > new_dist)\n          bitset_comp_current.set(dst);\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n/* Get/print the number of components */\nstruct ConnectedCompSanityCheck {\n  Graph* graph;\n\n  galois::DGAccumulator<uint64_t>& active_vertices;\n\n  ConnectedCompSanityCheck(Graph* _graph, galois::DGAccumulator<uint64_t>& _dga)\n      : graph(_graph), active_vertices(_dga) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<uint64_t>& dga) {\n    dga.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      uint64_t sum;\n      ConnectedCompSanityCheck_masterNodes_cuda(sum, cuda_ctx);\n      dga += sum;\n#else\n      abort();\n#endif\n    } else {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     ConnectedCompSanityCheck(&_graph, 
dga), galois::no_stats(),\n                     galois::loopname(\"ConnectedCompSanityCheck\"));\n    }\n\n    uint64_t num_components = dga.reduce();\n\n    // Only node 0 will print the number visited\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Number of components is \", num_components, \"\\n\");\n    }\n  }\n\n  /* Check if a node's component is the same as its ID.\n   * if yes, then increment an accumulator */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.comp_current == graph->getGID(src)) {\n      active_vertices += 1;\n    }\n  }\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<uint32_t> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).comp_current);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_comp_current_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<uint32_t> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"ConnectedComp - Distributed 
\"\n                                          \"Heterogeneous with filter.\";\nconstexpr static const char* const desc =\n    \"ConnectedComp on Distributed Galois.\";\nconstexpr static const char* const url = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n  }\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      symmetricDistGraphInitialization<NodeData, void>(&cuda_ctx);\n#else\n  std::tie(hg, syncSubstrate) =\n      symmetricDistGraphInitialization<NodeData, void>();\n#endif\n\n  bitset_comp_current.resize(hg->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n\n  InitializeGraph::go((*hg));\n  galois::runtime::getHostBarrier().wait();\n\n  galois::DGAccumulator<uint64_t> active_vertices64;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] ConnectedComp::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      ConnectedComp<true>::go(*hg);\n    } else {\n      ConnectedComp<false>::go(*hg);\n    }\n    StatTimer_main.stop();\n\n    ConnectedCompSanityCheck::go(*hg, active_vertices64);\n\n    if ((run + 1) != numRuns) {\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_comp_current_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_comp_current.reset();\n      }\n\n      (*syncSubstrate).set_num_run(run + 1);\n      InitializeGraph::go((*hg));\n      
galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<uint32_t> results = makeResults(hg);\n    auto globalIDs                = hg->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"component\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_push_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = true;\n#include \"cc_push_cuda.cuh\"\nstatic const int __tb_FirstItr_ConnectedComp = TB_SIZE;\nstatic const int __tb_ConnectedComp = TB_SIZE;\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current, uint32_t * p_comp_old)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_comp_current[src] = graph.node_data[src];\n      p_comp_old[src]     = graph.node_data[src];\n    }\n  }\n  // FP: \"8 -> 9;\n}\n__global__ void FirstItr_ConnectedComp_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current, uint32_t * p_comp_old, DynamicBitset& bitset_comp_current, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned 
block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type jj;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      jj = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        uint32_t new_dist;\n        uint32_t old_dist;\n        dst = graph.getAbsDestination(jj);\n        new_dist = p_comp_current[src];\n        old_dist = atomicTestMin(&p_comp_current[dst], new_dist);\n       
 if (old_dist > new_dist)\n        {\n          bitset_comp_current.set(dst);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"49 -> 50;\n}\n__global__ void FirstItr_ConnectedComp(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current, uint32_t * p_comp_old, DynamicBitset& bitset_comp_current, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_FirstItr_ConnectedComp;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"6 -> 7;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? 
true: false);\n    // FP: \"7 -> 8;\n    if (pop)\n    {\n      p_comp_old[src]  = p_comp_current[src];\n    }\n    // FP: \"10 -> 11;\n    // FP: \"13 -> 14;\n    // FP: \"14 -> 15;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"15 -> 16;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"18 -> 19;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"19 -> 20;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"20 -> 21;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"21 -> 22;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"24 -> 25;\n    // FP: \"25 -> 26;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"26 -> 27;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"27 -> 28;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"30 -> 31;\n    __syncthreads();\n    // FP: \"31 -> 32;\n    while (true)\n    {\n      // FP: \"32 -> 33;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"35 -> 36;\n      __syncthreads();\n      // FP: \"36 -> 37;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"37 -> 38;\n        __syncthreads();\n        // FP: \"38 -> 39;\n        break;\n      }\n      // FP: \"40 -> 41;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"43 -> 44;\n      __syncthreads();\n      // FP: \"44 -> 45;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"45 -> 46;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"48 -> 49;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"49 -> 50;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type jj;\n        jj = ns +_np_j;\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = p_comp_current[src];\n          old_dist = atomicTestMin(&p_comp_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_comp_current.set(dst);\n          }\n        }\n      }\n      // FP: \"62 -> 63;\n      __syncthreads();\n    }\n    // FP: \"64 -> 65;\n\n    // FP: \"65 -> 66;\n    {\n      const int warpid = threadIdx.x / 32;\n      // FP: \"66 -> 67;\n  
    const int _np_laneid = cub::LaneId();\n      // FP: \"67 -> 68;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type jj;\n          jj = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            uint32_t new_dist;\n            uint32_t old_dist;\n            dst = graph.getAbsDestination(jj);\n            new_dist = p_comp_current[src];\n            old_dist = atomicTestMin(&p_comp_current[dst], new_dist);\n            if (old_dist > new_dist)\n            {\n              bitset_comp_current.set(dst);\n            }\n          }\n        }\n      }\n      // FP: \"90 -> 91;\n      __syncthreads();\n      // FP: \"91 -> 92;\n    }\n\n    // FP: \"92 -> 93;\n    __syncthreads();\n    // FP: \"93 -> 94;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"94 -> 95;\n    while (_np.work())\n    {\n      // FP: \"95 -> 96;\n      int _np_i =0;\n      // FP: \"96 -> 97;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"97 -> 98;\n      __syncthreads();\n      // FP: \"98 -> 99;\n\n      // FP: \"99 -> 100;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n 
       index_type jj;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        jj= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = p_comp_current[src];\n          old_dist = atomicTestMin(&p_comp_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_comp_current.set(dst);\n          }\n        }\n      }\n      // FP: \"113 -> 114;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"114 -> 115;\n      __syncthreads();\n    }\n    // FP: \"116 -> 117;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"118 -> 119;\n}\n__global__ void ConnectedComp_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current, uint32_t * p_comp_old, DynamicBitset& bitset_comp_current, HGAccumulator<unsigned int> active_vertices, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  
// FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type jj;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      jj = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        uint32_t new_dist;\n        uint32_t old_dist;\n        active_vertices.reduce( 1);\n        dst = graph.getAbsDestination(jj);\n        new_dist = p_comp_current[src];\n        old_dist = atomicTestMin(&p_comp_current[dst], new_dist);\n        if (old_dist > new_dist)\n        {\n          bitset_comp_current.set(dst);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"50 -> 51;\n}\n__global__ void ConnectedComp(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current, uint32_t * p_comp_old, DynamicBitset& bitset_comp_current, HGAccumulator<unsigned int> active_vertices, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = 
__tb_ConnectedComp;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  active_vertices.thread_entry();\n  // FP: \"7 -> 8;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"8 -> 9;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? 
true: false);\n    // FP: \"9 -> 10;\n    if (pop)\n    {\n      if (p_comp_old[src] > p_comp_current[src])\n      {\n        p_comp_old[src] = p_comp_current[src];\n      }\n      else\n      {\n        pop = false;\n      }\n    }\n    // FP: \"14 -> 15;\n    // FP: \"17 -> 18;\n    // FP: \"18 -> 19;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"19 -> 20;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"22 -> 23;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"23 -> 24;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"24 -> 25;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"25 -> 26;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"28 -> 29;\n    // FP: \"29 -> 30;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"30 -> 31;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"31 -> 32;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"34 -> 35;\n    __syncthreads();\n    // FP: \"35 -> 36;\n    while (true)\n    {\n      // FP: \"36 -> 37;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"39 -> 40;\n      __syncthreads();\n      // FP: \"40 -> 41;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"41 -> 42;\n        __syncthreads();\n        // FP: \"42 -> 43;\n        break;\n      }\n      // FP: \"44 -> 45;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"47 -> 48;\n      __syncthreads();\n      // FP: \"48 -> 49;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"49 -> 50;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"52 -> 53;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"53 -> 54;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type jj;\n        jj = ns +_np_j;\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          active_vertices.reduce( 1);\n          dst = graph.getAbsDestination(jj);\n          new_dist = p_comp_current[src];\n          old_dist = atomicTestMin(&p_comp_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_comp_current.set(dst);\n          }\n        }\n      }\n      // FP: \"67 -> 68;\n      __syncthreads();\n    }\n    // FP: \"69 -> 70;\n\n    // FP: \"70 -> 71;\n    {\n      const int warpid = 
threadIdx.x / 32;\n      // FP: \"71 -> 72;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"72 -> 73;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type jj;\n          jj = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            uint32_t new_dist;\n            uint32_t old_dist;\n            active_vertices.reduce( 1);\n            dst = graph.getAbsDestination(jj);\n            new_dist = p_comp_current[src];\n            old_dist = atomicTestMin(&p_comp_current[dst], new_dist);\n            if (old_dist > new_dist)\n            {\n              bitset_comp_current.set(dst);\n            }\n          }\n        }\n      }\n      // FP: \"96 -> 97;\n      __syncthreads();\n      // FP: \"97 -> 98;\n    }\n\n    // FP: \"98 -> 99;\n    __syncthreads();\n    // FP: \"99 -> 100;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"100 -> 101;\n    while (_np.work())\n    {\n      // FP: \"101 -> 102;\n      int _np_i =0;\n      // FP: \"102 -> 103;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"103 -> 104;\n      __syncthreads();\n      // FP: \"104 -> 105;\n\n      // FP: \"105 -> 
106;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type jj;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        jj= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          active_vertices.reduce( 1);\n          dst = graph.getAbsDestination(jj);\n          new_dist = p_comp_current[src];\n          old_dist = atomicTestMin(&p_comp_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_comp_current.set(dst);\n          }\n        }\n      }\n      // FP: \"120 -> 121;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"121 -> 122;\n      __syncthreads();\n    }\n    // FP: \"123 -> 124;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"126 -> 127;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"127 -> 128;\n}\n__global__ void ConnectedCompSanityCheck(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_comp_current, HGAccumulator<uint64_t> active_vertices)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  active_vertices.thread_entry();\n  // FP: \"3 -> 4;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_comp_current[src] == graph.node_data[src])\n      {\n        active_vertices.reduce( 1);\n      }\n    }\n  }\n  // FP: \"11 -> 12;\n  active_vertices.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(active_vertices_ts);\n  // FP: \"12 -> 13;\n}\nvoid 
InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  t_work.init_thread_work(ctx->gg.nnodes);\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr(), ctx->comp_old.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid FirstItr_ConnectedComp_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  FirstItr_ConnectedComp <<<blocks, __tb_FirstItr_ConnectedComp>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr(), ctx->comp_old.data.gpu_wr_ptr(), *(ctx->comp_current.is_updated.gpu_rd_ptr()), t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      FirstItr_ConnectedComp_TB_LB <<<blocks, __tb_FirstItr_ConnectedComp>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr(), ctx->comp_old.data.gpu_wr_ptr(), *(ctx->comp_current.is_updated.gpu_rd_ptr()), 
t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid FirstItr_ConnectedComp_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  FirstItr_ConnectedComp_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid FirstItr_ConnectedComp_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  FirstItr_ConnectedComp_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid FirstItr_ConnectedComp_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  FirstItr_ConnectedComp_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedComp_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  ConnectedComp <<<blocks, __tb_ConnectedComp>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr(), ctx->comp_old.data.gpu_wr_ptr(), *(ctx->comp_current.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      ConnectedComp_TB_LB <<<blocks, __tb_ConnectedComp>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr(), ctx->comp_old.data.gpu_wr_ptr(), 
*(ctx->comp_current.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid ConnectedComp_allNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedComp_cuda(0, ctx->gg.nnodes, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedComp_masterNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedComp_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedComp_nodesWithEdges_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedComp_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedCompSanityCheck_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint64_t> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> active_verticesval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  ConnectedCompSanityCheck <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->comp_current.data.gpu_wr_ptr(), _active_vertices);\n  cudaDeviceSynchronize();\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid ConnectedCompSanityCheck_allNodes_cuda(uint64_t & active_vertices, struct 
CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedCompSanityCheck_cuda(0, ctx->gg.nnodes, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedCompSanityCheck_masterNodes_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedCompSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ConnectedCompSanityCheck_nodesWithEdges_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ConnectedCompSanityCheck_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_push_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"cc_push_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<uint32_t> comp_current;\n\tstruct CUDA_Context_Field<uint32_t> comp_old;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->comp_current, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->comp_old, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->comp_current, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->comp_old, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->comp_current.data.zero_gpu();\n\tctx->comp_old.data.zero_gpu();\n}\n\nvoid get_bitset_comp_current_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->comp_current.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_comp_current_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->comp_current.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_comp_current_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->comp_current, begin, end);\n}\n\nuint32_t get_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *comp_current = ctx->comp_current.data.cpu_rd_ptr();\n\treturn comp_current[LID];\n}\n\nvoid 
set_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *comp_current = ctx->comp_current.data.cpu_wr_ptr();\n\tcomp_current[LID] = v;\n}\n\nvoid add_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *comp_current = ctx->comp_current.data.cpu_wr_ptr();\n\tcomp_current[LID] += v;\n}\n\nbool min_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *comp_current = ctx->comp_current.data.cpu_wr_ptr();\n\tif (comp_current[LID] > v){\n\t\tcomp_current[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->comp_current, from_id, v);\n}\n\nvoid batch_get_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->comp_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->comp_current, from_id, v);\n}\n\nvoid batch_get_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->comp_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->comp_current, from_id, v, i);\n}\n\nvoid batch_get_reset_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->comp_current, 
from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_set_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_add_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_min_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->comp_current, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_comp_current_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->comp_current, begin, end, v);\n}\n\nvoid get_bitset_comp_old_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->comp_old.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_comp_old_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->comp_old.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_comp_old_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) 
{\n\treset_bitset_field(&ctx->comp_old, begin, end);\n}\n\nuint32_t get_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *comp_old = ctx->comp_old.data.cpu_rd_ptr();\n\treturn comp_old[LID];\n}\n\nvoid set_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *comp_old = ctx->comp_old.data.cpu_wr_ptr();\n\tcomp_old[LID] = v;\n}\n\nvoid add_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *comp_old = ctx->comp_old.data.cpu_wr_ptr();\n\tcomp_old[LID] += v;\n}\n\nbool min_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *comp_old = ctx->comp_old.data.cpu_wr_ptr();\n\tif (comp_old[LID] > v){\n\t\tcomp_old[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->comp_old, from_id, v);\n}\n\nvoid batch_get_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->comp_old, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->comp_old, from_id, v);\n}\n\nvoid batch_get_mirror_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->comp_old, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->comp_old, from_id, v, i);\n}\n\nvoid batch_get_reset_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* 
v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->comp_old, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->comp_old, from_id, v, data_mode);\n}\n\nvoid batch_set_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->comp_old, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->comp_old, from_id, v, data_mode);\n}\n\nvoid batch_add_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->comp_old, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->comp_old, from_id, v, data_mode);\n}\n\nvoid batch_min_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->comp_old, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_comp_old_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->comp_old, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_push_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_comp_current_cuda(struct CUDA_Context* ctx,\n                                  uint64_t* bitset_compute);\nvoid bitset_comp_current_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_comp_current_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end);\nuint32_t get_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid add_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nbool min_node_comp_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid batch_get_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             size_t* v_size,\n                                             DataCommMode* data_mode);\nvoid batch_get_reset_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n                                            uint32_t i);\nvoid batch_get_reset_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n    
                                        size_t* v_size,\n                                            DataCommMode* data_mode,\n                                            uint32_t i);\nvoid batch_set_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_set_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_add_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_mirror_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_min_node_comp_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_reset_node_comp_current_cuda(struct CUDA_Context* ctx, size_t begin,\n                                        size_t end, uint32_t v);\n\nvoid get_bitset_comp_old_cuda(struct CUDA_Context* ctx,\n                              uint64_t* bitset_compute);\nvoid bitset_comp_old_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_comp_old_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end);\nuint32_t get_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_comp_old_cuda(struct 
CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid add_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nbool min_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid batch_get_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v);\nvoid batch_get_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, size_t* v_size,\n                                  DataCommMode* data_mode);\nvoid batch_get_mirror_node_comp_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_comp_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         size_t* v_size,\n                                         DataCommMode* data_mode);\nvoid batch_get_reset_node_comp_old_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        uint32_t i);\nvoid batch_get_reset_node_comp_old_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        size_t* v_size, DataCommMode* data_mode,\n                                        uint32_t i);\nvoid batch_set_mirror_node_comp_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_set_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_comp_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid 
batch_add_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_comp_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_min_node_comp_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_comp_old_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end, uint32_t v);\n\nvoid ConnectedComp_cuda(unsigned int __begin, unsigned int __end,\n                        unsigned int& active_vertices,\n                        struct CUDA_Context* ctx);\nvoid ConnectedCompSanityCheck_cuda(unsigned int __begin, unsigned int __end,\n                                   uint64_t& active_vertices,\n                                   struct CUDA_Context* ctx);\nvoid ConnectedCompSanityCheck_allNodes_cuda(uint64_t& active_vertices,\n                                            struct CUDA_Context* ctx);\nvoid ConnectedCompSanityCheck_masterNodes_cuda(uint64_t& active_vertices,\n                                               struct CUDA_Context* ctx);\nvoid ConnectedCompSanityCheck_nodesWithEdges_cuda(uint64_t& active_vertices,\n                                                  struct CUDA_Context* ctx);\nvoid ConnectedComp_allNodes_cuda(unsigned int& active_vertices,\n                                 struct CUDA_Context* ctx);\nvoid ConnectedComp_masterNodes_cuda(unsigned int& active_vertices,\n                                    struct CUDA_Context* ctx);\nvoid ConnectedComp_nodesWithEdges_cuda(unsigned int& active_vertices,\n                                       struct CUDA_Context* ctx);\nvoid FirstItr_ConnectedComp_cuda(unsigned int __begin, unsigned int __end,\n                                 struct CUDA_Context* 
ctx);\nvoid FirstItr_ConnectedComp_allNodes_cuda(struct CUDA_Context* ctx);\nvoid FirstItr_ConnectedComp_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid FirstItr_ConnectedComp_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_push_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"cc_push_cuda.cuh\", system = False)], parse = False),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_comp_current'), ('uint32_t *', 'p_comp_old')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_comp_current[src] = graph.node_data[src]\"]),\nCBlock([\"p_comp_old[src]     = graph.node_data[src]\"]),\n]),\n]),\n]),\nKernel(\"FirstItr_ConnectedComp\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_comp_current'), ('uint32_t *', 'p_comp_old'), ('DynamicBitset&', 'bitset_comp_current')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_comp_old[src]  = p_comp_current[src]\"]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"jj\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(jj)\"]),\nCDecl([(\"uint32_t\", \"new_dist\", \"\")]),\nCBlock([\"new_dist = p_comp_current[src]\"]),\nCDecl([(\"uint32_t\", \"old_dist\", \"\")]),\nCBlock([\"old_dist = atomicTestMin(&p_comp_current[dst], new_dist)\"]),\nIf(\"old_dist > new_dist\",\n[\nCBlock([\"bitset_comp_current.set(dst)\"]),\n]),\n]),\n),\n]),\n]),\nKernel(\"ConnectedComp\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_comp_current'), ('uint32_t *', 'p_comp_old'), ('DynamicBitset&', 'bitset_comp_current'), ('HGAccumulator<unsigned int>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", 
\"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_comp_old[src] > p_comp_current[src]\",\n[\nCBlock([\"p_comp_old[src] = p_comp_current[src]\"]),\n], [ CBlock([\"pop = false\"]), ]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"jj\", G.edges(\"src\"),\n[\nCBlock([\"active_vertices.reduce( 1)\"]),\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(jj)\"]),\nCDecl([(\"uint32_t\", \"new_dist\", \"\")]),\nCBlock([\"new_dist = p_comp_current[src]\"]),\nCDecl([(\"uint32_t\", \"old_dist\", \"\")]),\nCBlock([\"old_dist = atomicTestMin(&p_comp_current[dst], new_dist)\"]),\nIf(\"old_dist > new_dist\",\n[\nCBlock([\"bitset_comp_current.set(dst)\"]),\n]),\n]),\n),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"ConnectedCompSanityCheck\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_comp_current'), ('HGAccumulator<uint64_t>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_comp_current[src] == graph.node_data[src]\",\n[\nCBlock([\"active_vertices.reduce( 1)\"]),\n]),\n]),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", 
\"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->comp_current.data.gpu_wr_ptr()\", \"ctx->comp_old.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"FirstItr_ConnectedComp_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"FirstItr_ConnectedComp\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->comp_current.data.gpu_wr_ptr()\", \"ctx->comp_old.data.gpu_wr_ptr()\", \"*(ctx->comp_current.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"FirstItr_ConnectedComp_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"FirstItr_ConnectedComp_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"FirstItr_ConnectedComp_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"FirstItr_ConnectedComp_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"FirstItr_ConnectedComp_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"FirstItr_ConnectedComp_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"ConnectedComp_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', 
'__end'), ('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"ConnectedComp\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->comp_current.data.gpu_wr_ptr()\", \"ctx->comp_old.data.gpu_wr_ptr()\", \"*(ctx->comp_current.is_updated.gpu_rd_ptr())\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"ConnectedComp_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedComp_cuda(0, ctx->gg.nnodes, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedComp_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedComp_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedComp_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedComp_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedCompSanityCheck_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"active_verticesval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_active_vertices\", 
\"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"ConnectedCompSanityCheck\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->comp_current.data.gpu_wr_ptr()\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"ConnectedCompSanityCheck_allNodes_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedCompSanityCheck_cuda(0, ctx->gg.nnodes, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedCompSanityCheck_masterNodes_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedCompSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"ConnectedCompSanityCheck_nodesWithEdges_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ConnectedCompSanityCheck_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/connected-components/cc_push_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(comp_current, uint32_t);\nGALOIS_SYNC_STRUCTURE_REDUCE_MIN(comp_current, uint32_t);\nGALOIS_SYNC_STRUCTURE_BITSET(comp_current);\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/CMakeLists.txt",
    "content": "app_dist(kcore_push k-core-push)\nadd_test_dist(k-core-push-dist rmat15 ${BASEINPUT}/scalefree/symmetric/rmat15.sgr -symmetricGraph -kcore=100)\n\napp_dist(kcore_pull k-core-pull)\nadd_test_dist(k-core-pull-dist rmat15 ${BASEINPUT}/scalefree/symmetric/rmat15.sgr -symmetricGraph -kcore=100)\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/README.md",
    "content": "K-Core Decomposition\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nFinds the <b>k-core</b> in a graph. A k-core is defined as a subgraph of a \ngraph in which all vertices of degree less than k have been removed from the \ngraph. The remaining vertices must all have a degree of at least k.\n\nThe algorithm supports both a bulk-synchronous and a bulk-asynchronous\nparallel algorithms. This benchmark consists of two algorithms,\npush- and pull-based. In the push-based algorithm, all non-removed nodes\ncheck to see if their degree has fallen below k in each round. If so,\nit removes itself and decrements the degree on its neighbors.\nIn the pull-based algorithm, a node will check which of its neighbors have\nrecently been removed from the graph and decrement its own degree in each round.\nIf the degree falls below k, then it removes itself from the graph.\n\n\nINPUT\n--------------------------------------------------------------------------------\n\nTakes in symmetric Galois .gr graphs. You must specify the -symmetricGraph\nflag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/distributed/k-core; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run on 1 machine with a k value of 4, use the following:\n`./k-core-push-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph -kcore=4`\n`./k-core-pull-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph -kcore=4`\n\nTo run on 3 hosts h1, h2, and h3 with a k value of 4, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./k-core-push-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph -kcore=4`\n`mpirun -n=3 -hosts=h1,h2,h3 ./k-core-pull-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph -kcore=4`\n\nTo run on 3 hosts h1, h2, and h3 with a k value of 4 with an incoming edge cut, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./k-core-push-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph -partition=iec -kcore=4`\n`mpirun -n=3 -hosts=h1,h2,h3 ./k-core-pull-dist <symmetric-input-graph> -t=<num-threads> -symmetricGraph -partition=iec -kcore=4`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* The push variant generally performs better in our experience.\n\n* For 16 or fewer hosts/GPUs, we recommend using an\n  **edge-cut** partitioning policy (OEC or IEC) with **synchronous**\n  communication for performance.\n\n* For 32 or more hosts/GPUs, we recommend using the\n  **Cartesian vertex-cut** partitioning policy (CVC) with **asynchronous**\n  communication for performance.\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_pull.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/******************************************************************************/\n/* Sync code/calls was manually written, not compiler generated */\n/******************************************************************************/\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"kcore_pull_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"KCore\";\n\n/******************************************************************************/\n/* 
Declaration of command line arguments */\n/******************************************************************************/\nnamespace cll = llvm::cl;\nstatic cll::opt<unsigned int>\n    maxIterations(\"maxIterations\",\n                  cll::desc(\"Maximum iterations: Default 10000\"),\n                  cll::init(10000));\n// required k specification for k-core\nstatic cll::opt<unsigned int> k_core_num(\"kcore\", cll::desc(\"KCore value\"),\n                                         cll::Required);\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other inits */\n/******************************************************************************/\n\nstruct NodeData {\n  uint32_t current_degree;\n  uint32_t trim;\n  uint8_t flag;\n  uint8_t pull_flag;\n};\n\ntypedef galois::graphs::DistGraph<NodeData, void> Graph;\ntypedef typename Graph::GraphNode GNode;\n\n// bitset for tracking updates\ngalois::DynamicBitSet bitset_current_degree;\ngalois::DynamicBitSet bitset_trim;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n// add all sync/bitset structs (needs above declarations)\n#include \"kcore_pull_sync.hh\"\n\n/******************************************************************************/\n/* Functors for running the algorithm */\n/******************************************************************************/\n\n/* Degree counting\n * Called by InitializeGraph1 */\nstruct DegreeCounting {\n  Graph* graph;\n\n  DegreeCounting(Graph* _graph) : graph(_graph) {}\n\n  /* Initialize the entire graph node-by-node */\n  void static go(Graph& _graph) {\n    const auto& 
nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"DegreeCounting_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      DegreeCounting_nodesWithEdges_cuda(cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(nodesWithEdges), DegreeCounting{&_graph},\n          galois::steal(), galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"DegreeCounting\").c_str()));\n    }\n\n    syncSubstrate->sync<writeSource, readAny, Reduce_add_current_degree,\n                        Bitset_current_degree>(\"DegreeCounting\");\n  }\n\n  /* Calculate degree of nodes by checking how many nodes have it as a dest and\n   * adding for every dest (works same way in pull version since it's a\n   * symmetric graph) */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    src_data.current_degree =\n        std::distance(graph->edge_begin(src), graph->edge_end(src));\n    bitset_current_degree.set(src);\n\n    //// technically can use std::dist above, but this is more easily\n    //// recognizable by dist compiler + this is init so it doesn't matter much\n    // for (auto current_edge : graph->edges(src)) {\n    //  src_data.current_degree++;\n    //  bitset_current_degree.set(src);\n    //}\n  }\n};\n\n/* Initialize: initial field setup */\nstruct InitializeGraph {\n  Graph* graph;\n\n  InitializeGraph(Graph* _graph) : graph(_graph) {}\n\n  /* Initialize the entire graph node-by-node */\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph_\" +\n   
                        (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph_allNodes_cuda(cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          InitializeGraph{&_graph}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()));\n    }\n\n    // degree calculation\n    DegreeCounting::go(_graph);\n  }\n\n  /* Setup intial fields */\n  void operator()(GNode src) const {\n    NodeData& src_data      = graph->getData(src);\n    src_data.flag           = true;\n    src_data.trim           = 0;\n    src_data.current_degree = 0;\n    src_data.pull_flag      = false;\n  }\n};\n\n/* Updates liveness of a node + updates flag that says if node has been pulled\n * from */\ntemplate <bool async>\nstruct LiveUpdate {\n  cll::opt<uint32_t>& local_k_core_num;\n  Graph* graph;\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  DGTerminatorDetector& active_vertices;\n\n  LiveUpdate(cll::opt<uint32_t>& _kcore, Graph* _graph,\n             DGTerminatorDetector& _dga)\n      : local_k_core_num(_kcore), graph(_graph), active_vertices(_dga) {}\n\n  void static go(Graph& _graph, DGTerminatorDetector& dga) {\n    const auto& allNodes = _graph.allNodesRange();\n    dga.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"LiveUpdate_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      unsigned int __retval = 0;\n      
LiveUpdate_allNodes_cuda(__retval, k_core_num, cuda_ctx);\n      dga += __retval;\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          LiveUpdate{k_core_num, &_graph, dga}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"LiveUpdate\").c_str()));\n    }\n\n    // no sync necessary as all nodes should have updated\n  }\n\n  /**\n   * Mark a node dead if degree is under kcore number and mark it\n   * available for pulling from.\n   *\n   * If dead, and pull flag is on, then turn off flag as you don't want to\n   * be pulled from more than once.\n   */\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n\n    if (sdata.flag) {\n      if (sdata.trim > 0) {\n        sdata.current_degree = sdata.current_degree - sdata.trim;\n      }\n\n      if (sdata.current_degree < local_k_core_num) {\n        sdata.flag = false;\n        active_vertices += 1;\n\n        // let neighbors pull from me next round\n        // assert(sdata.pull_flag == false);\n        sdata.pull_flag = true;\n      }\n    } else {\n      // dead\n      if (sdata.pull_flag) {\n        // do not allow neighbors to pull value from this node anymore\n        sdata.pull_flag = false;\n      }\n    }\n\n    // always reset trim\n    sdata.trim = 0;\n  }\n};\n\n/* Step that determines if a node is dead and updates its neighbors' trim\n * if it is */\ntemplate <bool async>\nstruct KCore {\n  Graph* graph;\n\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  KCore(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    unsigned iterations = 0;\n    DGTerminatorDetector dga;\n\n    const auto& nodesWithEdges = 
_graph.allNodesWithEdgesRange();\n\n    do {\n      syncSubstrate->set_num_round(iterations);\n\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(\"KCore_\" + (syncSubstrate->get_run_identifier()));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        KCore_nodesWithEdges_cuda(cuda_ctx);\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(galois::iterate(nodesWithEdges), KCore{&_graph},\n                       galois::no_stats(), galois::steal(),\n                       galois::loopname(\n                           syncSubstrate->get_run_identifier(\"KCore\").c_str()));\n      }\n\n      syncSubstrate\n          ->sync<writeSource, readAny, Reduce_add_trim, Bitset_trim, async>(\n              \"KCore\");\n\n      // update live/deadness\n      LiveUpdate<async>::go(_graph, dga);\n\n      iterations++;\n    } while ((async || (iterations < maxIterations)) &&\n             dga.reduce(syncSubstrate->get_run_identifier()));\n\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::runtime::reportStat_Single(\n          REGION_NAME,\n          \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n          (unsigned long)iterations);\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    // only if node is alive we do things\n    if (src_data.flag) {\n      // if dst node is dead, increment trim by one so we can decrement\n      // our degree later\n      for (auto current_edge : graph->edges(src)) {\n        GNode dst          = graph->getEdgeDst(current_edge);\n        NodeData& dst_data = graph->getData(dst);\n\n        if (dst_data.pull_flag) {\n          galois::add(src_data.trim, (uint32_t)1);\n          bitset_trim.set(src);\n        }\n      }\n    }\n  
}\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n/* Gets the total number of nodes that are still alive */\nstruct KCoreSanityCheck {\n  Graph* graph;\n  galois::DGAccumulator<uint64_t>& active_vertices;\n\n  KCoreSanityCheck(Graph* _graph,\n                   galois::DGAccumulator<uint64_t>& _active_vertices)\n      : graph(_graph), active_vertices(_active_vertices) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<uint64_t>& dga) {\n    dga.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      uint64_t sum = 0;\n      KCoreSanityCheck_masterNodes_cuda(sum, cuda_ctx);\n      dga += sum;\n#else\n      abort();\n#endif\n    } else {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     KCoreSanityCheck(&_graph, dga), galois::no_stats(),\n                     galois::loopname(\"KCoreSanityCheck\"));\n    }\n\n    uint64_t num_nodes = dga.reduce();\n\n    // Only node 0 will print data\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Number of nodes in the \", k_core_num, \"-core is \",\n                     num_nodes, \"\\n\");\n    }\n  }\n\n  /* Check if an owned node is alive/dead: increment appropriate accumulator */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.flag) {\n      active_vertices += 1;\n    }\n  }\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<unsigned> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<unsigned> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : 
hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).flag);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<unsigned> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<unsigned> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_flag_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<unsigned> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<unsigned> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main method for running */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"KCore - Distributed Heterogeneous \"\n                                          \"Pull Topological.\";\nconstexpr static const char* const desc = \"KCore on Distributed Galois.\";\nconstexpr static const char* const url  = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n  }\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  std::unique_ptr<Graph> h_graph;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(h_graph, syncSubstrate) =\n      symmetricDistGraphInitialization<NodeData, void>(&cuda_ctx);\n#else\n  std::tie(h_graph, syncSubstrate) =\n      symmetricDistGraphInitialization<NodeData, void>();\n#endif\n\n  bitset_current_degree.resize(h_graph->size());\n  bitset_trim.resize(h_graph->size());\n\n  galois::gPrint(\"[\", net.ID, \"] 
InitializeGraph::go functions called\\n\");\n\n  InitializeGraph::go((*h_graph));\n  galois::runtime::getHostBarrier().wait();\n\n  galois::DGAccumulator<uint64_t> dga;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] KCore::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      KCore<true>::go(*h_graph);\n    } else {\n      KCore<false>::go(*h_graph);\n    }\n    StatTimer_main.stop();\n\n    // sanity check\n    KCoreSanityCheck::go(*h_graph, dga);\n\n    // re-init graph for next run\n    if ((run + 1) != numRuns) {\n      (*syncSubstrate).set_num_run(run + 1);\n\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_current_degree_reset_cuda(cuda_ctx);\n        bitset_trim_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_current_degree.reset();\n        bitset_trim.reset();\n      }\n\n      InitializeGraph::go(*h_graph);\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<unsigned> results = makeResults(h_graph);\n    auto globalIDs                = h_graph->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"in_kcore\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_pull_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = true;\n#include \"kcore_pull_cuda.cuh\"\nstatic const int __tb_KCore = TB_SIZE;\n__global__ void DegreeCounting(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_current_degree, DynamicBitset& bitset_current_degree)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_current_degree[src] = graph.getOutDegree(src);\n      bitset_current_degree.set(src);\n    }\n  }\n  // FP: \"8 -> 9;\n}\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_current_degree, uint8_t * p_flag, uint8_t * p_pull_flag, uint32_t * p_trim)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_flag[src]           
= true;\n      p_trim[src]           = 0;\n      p_current_degree[src] = 0;\n      p_pull_flag[src]      = false;\n    }\n  }\n  // FP: \"10 -> 11;\n}\n__global__ void LiveUpdate(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_k_core_num, uint32_t * p_current_degree, uint8_t * p_flag, uint8_t * p_pull_flag, uint32_t * p_trim, HGAccumulator<unsigned int> active_vertices)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  active_vertices.thread_entry();\n  // FP: \"3 -> 4;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_flag[src])\n      {\n        if (p_trim[src] > 0)\n        {\n          p_current_degree[src] = p_current_degree[src] - p_trim[src];\n        }\n        if (p_current_degree[src] < local_k_core_num)\n        {\n          p_flag[src] = false;\n          active_vertices.reduce( 1);\n          p_pull_flag[src] = true;\n        }\n      }\n      else\n      {\n        if (p_pull_flag[src])\n        {\n          p_pull_flag[src] = false;\n        }\n      }\n      p_trim[src] = 0;\n    }\n  }\n  // FP: \"22 -> 23;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"23 -> 24;\n}\n__global__ void KCore_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint8_t * p_flag, uint8_t * p_pull_flag, uint32_t * p_trim, DynamicBitset& bitset_trim, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ 
unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type current_edge;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      current_edge = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        dst = graph.getAbsDestination(current_edge);\n        if (p_pull_flag[dst])\n        {\n          atomicTestAdd(&p_trim[src], (uint32_t)1);\n          bitset_trim.set(src);\n        }\n      }\n      current_work = 
current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"46 -> 47;\n}\n__global__ void KCore(CSRGraph graph, unsigned int __begin, unsigned int __end, uint8_t * p_flag, uint8_t * p_pull_flag, uint32_t * p_trim, DynamicBitset& bitset_trim, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_KCore;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"6 -> 7;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? 
true: false);\n    // FP: \"7 -> 8;\n    if (pop)\n    {\n      if (p_flag[src])\n      {\n      }\n      else\n      {\n        pop = false;\n      }\n    }\n    // FP: \"12 -> 13;\n    // FP: \"15 -> 16;\n    // FP: \"16 -> 17;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"17 -> 18;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"20 -> 21;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"21 -> 22;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"22 -> 23;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"23 -> 24;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"26 -> 27;\n    // FP: \"27 -> 28;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"28 -> 29;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"29 -> 30;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"32 -> 33;\n    __syncthreads();\n    // FP: \"33 -> 34;\n    while (true)\n    {\n      // FP: \"34 -> 35;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"37 -> 38;\n      __syncthreads();\n      // FP: \"38 -> 39;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"39 -> 40;\n        __syncthreads();\n        // FP: \"40 -> 41;\n        break;\n      }\n      // FP: \"42 -> 43;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"45 -> 46;\n      __syncthreads();\n      // FP: \"46 -> 47;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"47 -> 48;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"50 -> 51;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"51 -> 52;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type current_edge;\n        current_edge = ns +_np_j;\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(current_edge);\n          if (p_pull_flag[dst])\n          {\n            atomicTestAdd(&p_trim[src], (uint32_t)1);\n            bitset_trim.set(src);\n          }\n        }\n      }\n      // FP: \"61 -> 62;\n      __syncthreads();\n    }\n    // FP: \"63 -> 64;\n\n    // FP: \"64 -> 65;\n    {\n      const int warpid = threadIdx.x / 32;\n      // FP: \"65 -> 66;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"66 -> 67;\n      while 
(__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type current_edge;\n          current_edge = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            dst = graph.getAbsDestination(current_edge);\n            if (p_pull_flag[dst])\n            {\n              atomicTestAdd(&p_trim[src], (uint32_t)1);\n              bitset_trim.set(src);\n            }\n          }\n        }\n      }\n      // FP: \"86 -> 87;\n      __syncthreads();\n      // FP: \"87 -> 88;\n    }\n\n    // FP: \"88 -> 89;\n    __syncthreads();\n    // FP: \"89 -> 90;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"90 -> 91;\n    while (_np.work())\n    {\n      // FP: \"91 -> 92;\n      int _np_i =0;\n      // FP: \"92 -> 93;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"93 -> 94;\n      __syncthreads();\n      // FP: \"94 -> 95;\n\n      // FP: \"95 -> 96;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type current_edge;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        current_edge= nps.fg.itvalue[_np_i];\n 
       {\n          index_type dst;\n          dst = graph.getAbsDestination(current_edge);\n          if (p_pull_flag[dst])\n          {\n            atomicTestAdd(&p_trim[src], (uint32_t)1);\n            bitset_trim.set(src);\n          }\n        }\n      }\n      // FP: \"106 -> 107;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"107 -> 108;\n      __syncthreads();\n    }\n    // FP: \"109 -> 110;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"111 -> 112;\n}\n__global__ void KCoreSanityCheck(CSRGraph graph, unsigned int __begin, unsigned int __end, uint8_t * p_flag, HGAccumulator<uint64_t> active_vertices)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  active_vertices.thread_entry();\n  // FP: \"3 -> 4;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_flag[src])\n      {\n        active_vertices.reduce( 1);\n      }\n    }\n  }\n  // FP: \"11 -> 12;\n  active_vertices.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(active_vertices_ts);\n  // FP: \"12 -> 13;\n}\nvoid DegreeCounting_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  DegreeCounting <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->current_degree.data.gpu_wr_ptr(), *(ctx->current_degree.is_updated.gpu_rd_ptr()));\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid DegreeCounting_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  DegreeCounting_cuda(0, ctx->gg.nnodes, 
ctx);\n  // FP: \"2 -> 3;\n}\nvoid DegreeCounting_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  DegreeCounting_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid DegreeCounting_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  DegreeCounting_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  t_work.init_thread_work(ctx->gg.nnodes);\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->current_degree.data.gpu_wr_ptr(), ctx->flag.data.gpu_wr_ptr(), ctx->pull_flag.data.gpu_wr_ptr(), ctx->trim.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid LiveUpdate_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, uint32_t local_k_core_num, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) 
= 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  LiveUpdate <<<blocks, threads>>>(ctx->gg, __begin, __end, local_k_core_num, ctx->current_degree.data.gpu_wr_ptr(), ctx->flag.data.gpu_wr_ptr(), ctx->pull_flag.data.gpu_wr_ptr(), ctx->trim.data.gpu_wr_ptr(), _active_vertices);\n  cudaDeviceSynchronize();\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid LiveUpdate_allNodes_cuda(unsigned int & active_vertices, uint32_t local_k_core_num, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  LiveUpdate_cuda(0, ctx->gg.nnodes, active_vertices, local_k_core_num, ctx);\n  // FP: \"2 -> 3;\n}\nvoid LiveUpdate_masterNodes_cuda(unsigned int & active_vertices, uint32_t local_k_core_num, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  LiveUpdate_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, local_k_core_num, ctx);\n  // FP: \"2 -> 3;\n}\nvoid LiveUpdate_nodesWithEdges_cuda(unsigned int & active_vertices, uint32_t local_k_core_num, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  LiveUpdate_cuda(0, ctx->numNodesWithEdges, active_vertices, local_k_core_num, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCore_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  KCore <<<blocks, __tb_KCore>>>(ctx->gg, __begin, __end, ctx->flag.data.gpu_wr_ptr(), ctx->pull_flag.data.gpu_wr_ptr(), ctx->trim.data.gpu_wr_ptr(), *(ctx->trim.is_updated.gpu_rd_ptr()), t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      KCore_TB_LB 
<<<blocks, __tb_KCore>>>(ctx->gg, __begin, __end, ctx->flag.data.gpu_wr_ptr(), ctx->pull_flag.data.gpu_wr_ptr(), ctx->trim.data.gpu_wr_ptr(), *(ctx->trim.is_updated.gpu_rd_ptr()), t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid KCore_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCore_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCore_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCore_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCore_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCore_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreSanityCheck_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint64_t> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> active_verticesval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  KCoreSanityCheck <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->flag.data.gpu_wr_ptr(), _active_vertices);\n  cudaDeviceSynchronize();\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid KCoreSanityCheck_allNodes_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreSanityCheck_cuda(0, ctx->gg.nnodes, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreSanityCheck_masterNodes_cuda(uint64_t & active_vertices, 
struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreSanityCheck_nodesWithEdges_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreSanityCheck_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_pull_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"kcore_pull_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<uint32_t> current_degree;\n\tstruct CUDA_Context_Field<uint8_t> flag;\n\tstruct CUDA_Context_Field<uint8_t> pull_flag;\n\tstruct CUDA_Context_Field<uint32_t> trim;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->current_degree, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->flag, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->pull_flag, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->trim, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->current_degree, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->flag, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->pull_flag, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->trim, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->current_degree.data.zero_gpu();\n\tctx->flag.data.zero_gpu();\n\tctx->pull_flag.data.zero_gpu();\n\tctx->trim.data.zero_gpu();\n}\n\nvoid get_bitset_current_degree_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->current_degree.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_current_degree_reset_cuda(struct 
CUDA_Context* ctx) {\n\tctx->current_degree.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_current_degree_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->current_degree, begin, end);\n}\n\nuint32_t get_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *current_degree = ctx->current_degree.data.cpu_rd_ptr();\n\treturn current_degree[LID];\n}\n\nvoid set_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *current_degree = ctx->current_degree.data.cpu_wr_ptr();\n\tcurrent_degree[LID] = v;\n}\n\nvoid add_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *current_degree = ctx->current_degree.data.cpu_wr_ptr();\n\tcurrent_degree[LID] += v;\n}\n\nbool min_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *current_degree = ctx->current_degree.data.cpu_wr_ptr();\n\tif (current_degree[LID] > v){\n\t\tcurrent_degree[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->current_degree, from_id, v);\n}\n\nvoid batch_get_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->current_degree, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->current_degree, from_id, v);\n}\n\nvoid batch_get_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->current_degree, from_id, v, v_size, 
data_mode);\n}\n\nvoid batch_get_reset_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->current_degree, from_id, v, i);\n}\n\nvoid batch_get_reset_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->current_degree, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_set_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_add_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_min_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid 
batch_reset_node_current_degree_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->current_degree, begin, end, v);\n}\n\nvoid get_bitset_flag_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->flag.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_flag_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->flag.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_flag_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->flag, begin, end);\n}\n\nuint8_t get_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint8_t *flag = ctx->flag.data.cpu_rd_ptr();\n\treturn flag[LID];\n}\n\nvoid set_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v) {\n\tuint8_t *flag = ctx->flag.data.cpu_wr_ptr();\n\tflag[LID] = v;\n}\n\nvoid add_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v) {\n\tuint8_t *flag = ctx->flag.data.cpu_wr_ptr();\n\tflag[LID] += v;\n}\n\nbool min_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v) {\n\tuint8_t *flag = ctx->flag.data.cpu_wr_ptr();\n\tif (flag[LID] > v){\n\t\tflag[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint8_t, sharedMaster, false>(ctx, &ctx->flag, from_id, v);\n}\n\nvoid batch_get_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint8_t, sharedMaster, false>(ctx, &ctx->flag, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, false>(ctx, &ctx->flag, from_id, v);\n}\n\nvoid batch_get_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) 
{\n\tbatch_get_shared_field<uint8_t, sharedMirror, false>(ctx, &ctx->flag, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint8_t i) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, true>(ctx, &ctx->flag, from_id, v, i);\n}\n\nvoid batch_get_reset_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint8_t i) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, true>(ctx, &ctx->flag, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMirror, setOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_set_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMaster, setOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMirror, addOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_add_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMaster, addOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMirror, minOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_min_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMaster, minOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_flag_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint8_t v) 
{\n\treset_data_field<uint8_t>(&ctx->flag, begin, end, v);\n}\n\nvoid get_bitset_pull_flag_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->pull_flag.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_pull_flag_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->pull_flag.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_pull_flag_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->pull_flag, begin, end);\n}\n\nuint8_t get_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint8_t *pull_flag = ctx->pull_flag.data.cpu_rd_ptr();\n\treturn pull_flag[LID];\n}\n\nvoid set_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v) {\n\tuint8_t *pull_flag = ctx->pull_flag.data.cpu_wr_ptr();\n\tpull_flag[LID] = v;\n}\n\nvoid add_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v) {\n\tuint8_t *pull_flag = ctx->pull_flag.data.cpu_wr_ptr();\n\tpull_flag[LID] += v;\n}\n\nbool min_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v) {\n\tuint8_t *pull_flag = ctx->pull_flag.data.cpu_wr_ptr();\n\tif (pull_flag[LID] > v){\n\t\tpull_flag[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint8_t, sharedMaster, false>(ctx, &ctx->pull_flag, from_id, v);\n}\n\nvoid batch_get_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint8_t, sharedMaster, false>(ctx, &ctx->pull_flag, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, false>(ctx, &ctx->pull_flag, from_id, v);\n}\n\nvoid batch_get_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* 
data_mode) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, false>(ctx, &ctx->pull_flag, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint8_t i) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, true>(ctx, &ctx->pull_flag, from_id, v, i);\n}\n\nvoid batch_get_reset_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint8_t i) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, true>(ctx, &ctx->pull_flag, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMirror, setOp>(ctx, &ctx->pull_flag, from_id, v, data_mode);\n}\n\nvoid batch_set_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMaster, setOp>(ctx, &ctx->pull_flag, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMirror, addOp>(ctx, &ctx->pull_flag, from_id, v, data_mode);\n}\n\nvoid batch_add_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMaster, addOp>(ctx, &ctx->pull_flag, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMirror, minOp>(ctx, &ctx->pull_flag, from_id, v, data_mode);\n}\n\nvoid batch_min_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMaster, minOp>(ctx, &ctx->pull_flag, from_id, v, data_mode);\n}\n\nvoid 
batch_reset_node_pull_flag_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint8_t v) {\n\treset_data_field<uint8_t>(&ctx->pull_flag, begin, end, v);\n}\n\nvoid get_bitset_trim_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->trim.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_trim_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->trim.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_trim_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->trim, begin, end);\n}\n\nuint32_t get_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *trim = ctx->trim.data.cpu_rd_ptr();\n\treturn trim[LID];\n}\n\nvoid set_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *trim = ctx->trim.data.cpu_wr_ptr();\n\ttrim[LID] = v;\n}\n\nvoid add_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *trim = ctx->trim.data.cpu_wr_ptr();\n\ttrim[LID] += v;\n}\n\nbool min_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *trim = ctx->trim.data.cpu_wr_ptr();\n\tif (trim[LID] > v){\n\t\ttrim[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->trim, from_id, v);\n}\n\nvoid batch_get_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->trim, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->trim, from_id, v);\n}\n\nvoid batch_get_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) 
{\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->trim, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->trim, from_id, v, i);\n}\n\nvoid batch_get_reset_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->trim, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_set_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_add_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_min_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_trim_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) 
{\n\treset_data_field<uint32_t>(&ctx->trim, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_pull_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_current_degree_cuda(struct CUDA_Context* ctx,\n                                    uint64_t* bitset_compute);\nvoid bitset_current_degree_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_current_degree_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                      size_t end);\nuint32_t get_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                  uint32_t v);\nvoid add_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                  uint32_t v);\nbool min_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                  uint32_t v);\nvoid batch_get_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v);\nvoid batch_get_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        size_t* v_size,\n                                        DataCommMode* data_mode);\nvoid batch_get_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               size_t* v_size,\n                                               DataCommMode* data_mode);\nvoid batch_get_reset_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                              unsigned from_id, uint8_t* v,\n                                              uint32_t i);\nvoid batch_get_reset_node_current_degree_cuda(struct 
CUDA_Context* ctx,\n                                              unsigned from_id, uint8_t* v,\n                                              size_t* v_size,\n                                              DataCommMode* data_mode,\n                                              uint32_t i);\nvoid batch_set_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_set_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        DataCommMode data_mode);\nvoid batch_add_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_add_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        DataCommMode data_mode);\nvoid batch_min_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_min_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        DataCommMode data_mode);\nvoid batch_reset_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                          size_t begin, size_t end, uint32_t v);\n\nvoid get_bitset_flag_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_flag_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_flag_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end);\nuint8_t get_node_flag_cuda(struct CUDA_Context* 
ctx, unsigned LID);\nvoid set_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v);\nvoid add_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v);\nbool min_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v);\nvoid batch_get_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v);\nvoid batch_get_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, size_t* v_size,\n                              DataCommMode* data_mode);\nvoid batch_get_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v);\nvoid batch_get_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                                     DataCommMode* data_mode);\nvoid batch_get_reset_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, uint8_t i);\nvoid batch_get_reset_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, size_t* v_size,\n                                    DataCommMode* data_mode, uint8_t i);\nvoid batch_set_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_set_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_add_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, 
DataCommMode data_mode);\nvoid batch_min_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_flag_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end, uint8_t v);\n\nvoid get_bitset_pull_flag_cuda(struct CUDA_Context* ctx,\n                               uint64_t* bitset_compute);\nvoid bitset_pull_flag_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_pull_flag_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                 size_t end);\nuint8_t get_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v);\nvoid add_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v);\nbool min_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v);\nvoid batch_get_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                   uint8_t* v);\nvoid batch_get_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                   uint8_t* v, size_t* v_size,\n                                   DataCommMode* data_mode);\nvoid batch_get_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx,\n                                          unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx,\n                                          unsigned from_id, uint8_t* v,\n                                          size_t* v_size,\n                                          DataCommMode* data_mode);\nvoid batch_get_reset_node_pull_flag_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         uint8_t i);\nvoid batch_get_reset_node_pull_flag_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* 
v,\n                                         size_t* v_size,\n                                         DataCommMode* data_mode, uint8_t i);\nvoid batch_set_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx,\n                                          unsigned from_id, uint8_t* v,\n                                          DataCommMode data_mode);\nvoid batch_set_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                   uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx,\n                                          unsigned from_id, uint8_t* v,\n                                          DataCommMode data_mode);\nvoid batch_add_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                   uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_pull_flag_cuda(struct CUDA_Context* ctx,\n                                          unsigned from_id, uint8_t* v,\n                                          DataCommMode data_mode);\nvoid batch_min_node_pull_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                   uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_pull_flag_cuda(struct CUDA_Context* ctx, size_t begin,\n                                     size_t end, uint8_t v);\n\nvoid get_bitset_trim_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_trim_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_trim_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end);\nuint32_t get_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid add_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nbool min_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid batch_get_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              
uint8_t* v);\nvoid batch_get_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, size_t* v_size,\n                              DataCommMode* data_mode);\nvoid batch_get_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v);\nvoid batch_get_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                                     DataCommMode* data_mode);\nvoid batch_get_reset_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, uint32_t i);\nvoid batch_get_reset_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, size_t* v_size,\n                                    DataCommMode* data_mode, uint32_t i);\nvoid batch_set_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_set_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_add_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_min_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_trim_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end, uint32_t v);\n\nvoid DegreeCounting_cuda(unsigned int __begin, 
unsigned int __end,\n                         struct CUDA_Context* ctx);\nvoid DegreeCounting_allNodes_cuda(struct CUDA_Context* ctx);\nvoid DegreeCounting_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid DegreeCounting_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid KCore_cuda(unsigned int __begin, unsigned int __end,\n                struct CUDA_Context* ctx);\nvoid KCoreSanityCheck_cuda(unsigned int __begin, unsigned int __end,\n                           uint64_t& active_vertices, struct CUDA_Context* ctx);\nvoid KCoreSanityCheck_allNodes_cuda(uint64_t& active_vertices,\n                                    struct CUDA_Context* ctx);\nvoid KCoreSanityCheck_masterNodes_cuda(uint64_t& active_vertices,\n                                       struct CUDA_Context* ctx);\nvoid KCoreSanityCheck_nodesWithEdges_cuda(uint64_t& active_vertices,\n                                          struct CUDA_Context* ctx);\nvoid KCore_allNodes_cuda(struct CUDA_Context* ctx);\nvoid KCore_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid KCore_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid LiveUpdate_cuda(unsigned int __begin, unsigned int __end,\n                     unsigned int& active_vertices, uint32_t local_k_core_num,\n                     struct CUDA_Context* ctx);\nvoid LiveUpdate_allNodes_cuda(unsigned int& active_vertices,\n                              uint32_t local_k_core_num,\n                              struct CUDA_Context* ctx);\nvoid LiveUpdate_masterNodes_cuda(unsigned int& active_vertices,\n                                 uint32_t local_k_core_num,\n                                 struct CUDA_Context* ctx);\nvoid 
LiveUpdate_nodesWithEdges_cuda(unsigned int& active_vertices,\n                                    uint32_t local_k_core_num,\n                                    struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_pull_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"kcore_pull_cuda.cuh\", system = False)], parse = False),\nKernel(\"DegreeCounting\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_current_degree'), ('DynamicBitset&', 'bitset_current_degree')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_current_degree[src] = graph.getOutDegree(src)\"]),\nCBlock([\"bitset_current_degree.set(src)\"]),\n]),\n]),\n]),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_current_degree'), ('uint8_t *', 'p_flag'), ('uint8_t *', 'p_pull_flag'), ('uint32_t *', 'p_trim')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_flag[src]           = true\"]),\nCBlock([\"p_trim[src]           = 0\"]),\nCBlock([\"p_current_degree[src] = 0\"]),\nCBlock([\"p_pull_flag[src]      = false\"]),\n]),\n]),\n]),\nKernel(\"LiveUpdate\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t', 'local_k_core_num'), ('uint32_t *', 'p_current_degree'), ('uint8_t *', 'p_flag'), ('uint8_t *', 'p_pull_flag'), ('uint32_t *', 'p_trim'), ('HGAccumulator<unsigned int>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_flag[src]\",\n[\nIf(\"p_trim[src] > 0\",\n[\nCBlock([\"p_current_degree[src] = p_current_degree[src] - p_trim[src]\"]),\n]),\nIf(\"p_current_degree[src] < 
local_k_core_num\",\n[\nCBlock([\"p_flag[src] = false\"]),\nCBlock([\"active_vertices.reduce( 1)\"]),\nCBlock([\"p_pull_flag[src] = true\"]),\n]),\n],\n[\nIf(\"p_pull_flag[src]\",\n[\nCBlock([\"p_pull_flag[src] = false\"]),\n]),\n]),\nCBlock([\"p_trim[src] = 0\"]),\n]),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"KCore\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint8_t *', 'p_flag'), ('uint8_t *', 'p_pull_flag'), ('uint32_t *', 'p_trim'), ('DynamicBitset&', 'bitset_trim')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_flag[src]\",\n[\n], [ CBlock([\"pop = false\"]), ]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"current_edge\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(current_edge)\"]),\nIf(\"p_pull_flag[dst]\",\n[\nCBlock([\"atomicTestAdd(&p_trim[src], (uint32_t)1)\"]),\nCBlock([\"bitset_trim.set(src)\"]),\n]),\n]),\n),\n]),\n]),\nKernel(\"KCoreSanityCheck\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint8_t *', 'p_flag'), ('HGAccumulator<uint64_t>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_flag[src]\",\n[\nCBlock([\"active_vertices.reduce( 1)\"]),\n]),\n]),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"DegreeCounting_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 
'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"DegreeCounting\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->current_degree.data.gpu_wr_ptr()\", \"*(ctx->current_degree.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"DegreeCounting_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"DegreeCounting_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"DegreeCounting_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"DegreeCounting_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"DegreeCounting_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"DegreeCounting_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->current_degree.data.gpu_wr_ptr()\", \"ctx->flag.data.gpu_wr_ptr()\", \"ctx->pull_flag.data.gpu_wr_ptr()\", \"ctx->trim.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = 
True),\nKernel(\"LiveUpdate_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'active_vertices'), ('uint32_t', 'local_k_core_num'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"LiveUpdate\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_k_core_num\", \"ctx->current_degree.data.gpu_wr_ptr()\", \"ctx->flag.data.gpu_wr_ptr()\", \"ctx->pull_flag.data.gpu_wr_ptr()\", \"ctx->trim.data.gpu_wr_ptr()\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"LiveUpdate_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('uint32_t', 'local_k_core_num'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"LiveUpdate_cuda(0, ctx->gg.nnodes, active_vertices, local_k_core_num, ctx)\"]),\n], host = True),\nKernel(\"LiveUpdate_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('uint32_t', 'local_k_core_num'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"LiveUpdate_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, local_k_core_num, ctx)\"]),\n], host = True),\nKernel(\"LiveUpdate_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('uint32_t', 'local_k_core_num'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"LiveUpdate_cuda(0, ctx->numNodesWithEdges, active_vertices, local_k_core_num, ctx)\"]),\n], host = True),\nKernel(\"KCore_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", 
\"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"KCore\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->flag.data.gpu_wr_ptr()\", \"ctx->pull_flag.data.gpu_wr_ptr()\", \"ctx->trim.data.gpu_wr_ptr()\", \"*(ctx->trim.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"KCore_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCore_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"KCore_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCore_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"KCore_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCore_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"KCoreSanityCheck_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"active_verticesval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"KCoreSanityCheck\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->flag.data.gpu_wr_ptr()\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"KCoreSanityCheck_allNodes_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreSanityCheck_cuda(0, ctx->gg.nnodes, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"KCoreSanityCheck_masterNodes_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* 
', 'ctx')],\n[\nCBlock([\"KCoreSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"KCoreSanityCheck_nodesWithEdges_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreSanityCheck_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_pull_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\n////////////////////////////////////////////////////////////////////////////////\n// current_degree\n////////////////////////////////////////////////////////////////////////////////\n\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(current_degree, uint32_t);\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(current_degree, uint32_t);\nGALOIS_SYNC_STRUCTURE_BITSET(current_degree);\n\n////////////////////////////////////////////////////////////////////////////////\n// trim\n////////////////////////////////////////////////////////////////////////////////\n\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(trim, uint32_t);\nGALOIS_SYNC_STRUCTURE_BITSET(trim);\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_push.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/******************************************************************************/\n/* Sync code/calls was manually written, not compiler generated */\n/******************************************************************************/\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"kcore_push_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"KCore\";\n\n/******************************************************************************/\n/* 
Declaration of command line arguments */\n/******************************************************************************/\nnamespace cll = llvm::cl;\nstatic cll::opt<unsigned int>\n    maxIterations(\"maxIterations\",\n                  cll::desc(\"Maximum iterations: Default 10000\"),\n                  cll::init(10000));\n// required k specification for k-core\nstatic cll::opt<unsigned int> k_core_num(\"kcore\", cll::desc(\"KCore value\"),\n                                         cll::Required);\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other inits */\n/******************************************************************************/\n\nstruct NodeData {\n  std::atomic<uint32_t> current_degree;\n  std::atomic<uint32_t> trim;\n  uint8_t flag;\n};\n\ntypedef galois::graphs::DistGraph<NodeData, void> Graph;\ntypedef typename Graph::GraphNode GNode;\n\n// bitset for tracking updates\ngalois::DynamicBitSet bitset_current_degree;\ngalois::DynamicBitSet bitset_trim;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n// add all sync/bitset structs (needs above declarations)\n#include \"kcore_push_sync.hh\"\n\n/******************************************************************************/\n/* Functors for running the algorithm */\n/******************************************************************************/\n\n/* Degree counting\n * Called by InitializeGraph1 */\nstruct InitializeGraph2 {\n  Graph* graph;\n\n  InitializeGraph2(Graph* _graph) : graph(_graph) {}\n\n  /* Initialize the entire graph node-by-node */\n  void static go(Graph& _graph) {\n    const 
auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph2_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph2_nodesWithEdges_cuda(cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(nodesWithEdges), InitializeGraph2{&_graph},\n          galois::steal(), galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph2\").c_str()));\n    }\n\n    syncSubstrate->sync<writeDestination, readSource, Reduce_add_current_degree,\n                        Bitset_current_degree>(\"InitializeGraph2\");\n  }\n\n  /* Calculate degree of nodes by checking how many nodes have it as a dest and\n   * adding for every dest */\n  void operator()(GNode src) const {\n    for (auto current_edge : graph->edges(src)) {\n      GNode dest_node = graph->getEdgeDst(current_edge);\n\n      NodeData& dest_data = graph->getData(dest_node);\n      galois::atomicAdd(dest_data.current_degree, (uint32_t)1);\n\n      bitset_current_degree.set(dest_node);\n    }\n  }\n};\n\n/* Initialize: initial field setup */\nstruct InitializeGraph1 {\n  Graph* graph;\n\n  InitializeGraph1(Graph* _graph) : graph(_graph) {}\n\n  /* Initialize the entire graph node-by-node */\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph1_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph1_allNodes_cuda(cuda_ctx);\n      
StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          InitializeGraph1{&_graph}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph1\").c_str()));\n    }\n\n    // degree calculation\n    InitializeGraph2::go(_graph);\n  }\n\n  /* Setup intial fields */\n  void operator()(GNode src) const {\n    NodeData& src_data      = graph->getData(src);\n    src_data.flag           = true;\n    src_data.trim           = 0;\n    src_data.current_degree = 0;\n  }\n};\n\n/* Use the trim value (i.e. number of incident nodes that have been removed)\n * to update degrees.\n * Called by KCoreStep1 */\nstruct KCoreStep2 {\n  Graph* graph;\n\n  KCoreStep2(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"KCore_\" + (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      KCoreStep2_nodesWithEdges_cuda(cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(nodesWithEdges.begin(), nodesWithEdges.end()),\n          KCoreStep2{&_graph}, galois::no_stats(),\n          galois::loopname(syncSubstrate->get_run_identifier(\"KCore\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    // we currently do not care about degree for dead nodes,\n    // so we ignore those (i.e. 
if flag isn't set, do nothing)\n    if (src_data.flag) {\n      if (src_data.trim > 0) {\n        src_data.current_degree = src_data.current_degree - src_data.trim;\n      }\n    }\n\n    src_data.trim = 0;\n  }\n};\n\n/* Step that determines if a node is dead and updates its neighbors' trim\n * if it is */\ntemplate <bool async>\nstruct KCoreStep1 {\n  cll::opt<uint32_t>& local_k_core_num;\n  Graph* graph;\n\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  DGTerminatorDetector& active_vertices;\n\n  KCoreStep1(cll::opt<uint32_t>& _kcore, Graph* _graph,\n             DGTerminatorDetector& _dga)\n      : local_k_core_num(_kcore), graph(_graph), active_vertices(_dga) {}\n\n  void static go(Graph& _graph) {\n    unsigned iterations = 0;\n    DGTerminatorDetector dga;\n\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    do {\n      syncSubstrate->set_num_round(iterations);\n      dga.reset();\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(\"KCore_\" + (syncSubstrate->get_run_identifier()));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        unsigned int __retval = 0;\n        KCoreStep1_nodesWithEdges_cuda(__retval, k_core_num, cuda_ctx);\n        dga += __retval;\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(galois::iterate(nodesWithEdges),\n                       KCoreStep1{k_core_num, &_graph, dga}, galois::steal(),\n                       galois::no_stats(),\n                       galois::loopname(\n                           syncSubstrate->get_run_identifier(\"KCore\").c_str()));\n      }\n\n      // do the trim sync; readSource because in symmetric graph\n      // source=destination; not a readAny because any 
will grab non\n      // source/dest nodes (which have degree 0, so they won't have a trim\n      // anyways)\n      syncSubstrate->sync<writeDestination, readSource, Reduce_add_trim,\n                          Bitset_trim, async>(\"KCore\");\n\n      // handle trimming (locally)\n      KCoreStep2::go(_graph);\n\n      iterations++;\n    } while ((async || (iterations < maxIterations)) &&\n             dga.reduce(syncSubstrate->get_run_identifier()));\n\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::runtime::reportStat_Single(\n          REGION_NAME,\n          \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n          (unsigned long)iterations);\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    // only if node is alive we do things\n    if (src_data.flag) {\n      if (src_data.current_degree < local_k_core_num) {\n        // set flag to 0 (false) and increment trim on outgoing neighbors\n        // (if they exist)\n        src_data.flag = false;\n        active_vertices += 1; // can be optimized: node may not have edges\n\n        for (auto current_edge : graph->edges(src)) {\n          GNode dst = graph->getEdgeDst(current_edge);\n\n          auto& dst_data = graph->getData(dst);\n\n          galois::atomicAdd(dst_data.trim, (uint32_t)1);\n          bitset_trim.set(dst);\n        }\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n/* Gets the total number of nodes that are still alive */\nstruct KCoreSanityCheck {\n  Graph* graph;\n  galois::DGAccumulator<uint64_t>& active_vertices;\n\n  KCoreSanityCheck(Graph* _graph,\n                   galois::DGAccumulator<uint64_t>& _active_vertices)\n      : graph(_graph), active_vertices(_active_vertices) {}\n\n  void static go(Graph& _graph, 
galois::DGAccumulator<uint64_t>& dga) {\n    dga.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      uint64_t sum;\n      KCoreSanityCheck_masterNodes_cuda(sum, cuda_ctx);\n      dga += sum;\n#else\n      abort();\n#endif\n    } else {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     KCoreSanityCheck(&_graph, dga), galois::no_stats(),\n                     galois::loopname(\"KCoreSanityCheck\"));\n    }\n\n    uint64_t num_nodes = dga.reduce();\n\n    // Only node 0 will print data\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Number of nodes in the \", k_core_num, \"-core is \",\n                     num_nodes, \"\\n\");\n    }\n  }\n\n  /* Check if an owned node is alive/dead: increment appropriate accumulator */\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.flag) {\n      active_vertices += 1;\n    }\n  }\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<unsigned> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<unsigned> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).flag);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<unsigned> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<unsigned> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_flag_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<unsigned> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<unsigned> makeResults(std::unique_ptr<Graph>& hg) {\n  
switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main method for running */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"KCore - Distributed Heterogeneous \"\n                                          \"Push Filter.\";\nconstexpr static const char* const desc = \"KCore on Distributed Galois.\";\nconstexpr static const char* const url  = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n  }\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  std::unique_ptr<Graph> h_graph;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(h_graph, syncSubstrate) =\n      symmetricDistGraphInitialization<NodeData, void>(&cuda_ctx);\n#else\n  std::tie(h_graph, syncSubstrate) =\n      symmetricDistGraphInitialization<NodeData, void>();\n#endif\n\n  bitset_current_degree.resize(h_graph->size());\n  bitset_trim.resize(h_graph->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go functions called\\n\");\n  InitializeGraph1::go((*h_graph));\n  galois::runtime::getHostBarrier().wait();\n\n  galois::DGAccumulator<uint64_t> dga;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] KCoreStep1::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      KCoreStep1<true>::go(*h_graph);\n    } else {\n      
KCoreStep1<false>::go(*h_graph);\n    }\n    StatTimer_main.stop();\n\n    // sanity check\n    KCoreSanityCheck::go(*h_graph, dga);\n\n    // re-init graph for next run\n    if ((run + 1) != numRuns) {\n      (*syncSubstrate).set_num_run(run + 1);\n\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_current_degree_reset_cuda(cuda_ctx);\n        bitset_trim_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_current_degree.reset();\n        bitset_trim.reset();\n      }\n\n      InitializeGraph1::go(*h_graph);\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<unsigned> results = makeResults(h_graph);\n    auto globalIDs                = h_graph->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"in_kcore\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_push_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = true;\n#include \"kcore_push_cuda.cuh\"\nstatic const int __tb_InitializeGraph2 = TB_SIZE;\nstatic const int __tb_KCoreStep1 = TB_SIZE;\n__global__ void InitializeGraph2_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_current_degree, DynamicBitset& bitset_current_degree, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 
-> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type current_edge;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      current_edge = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dest_node;\n        dest_node = graph.getAbsDestination(current_edge);\n        atomicTestAdd(&p_current_degree[dest_node], (uint32_t)1);\n        bitset_current_degree.set(dest_node);\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"44 -> 45;\n}\n__global__ void InitializeGraph2(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_current_degree, DynamicBitset& bitset_current_degree, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_InitializeGraph2;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = 
__kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"6 -> 7;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? true: false);\n    // FP: \"7 -> 8;\n    if (pop)\n    {\n    }\n    // FP: \"9 -> 10;\n    // FP: \"12 -> 13;\n    // FP: \"13 -> 14;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"14 -> 15;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"17 -> 18;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"18 -> 19;\n    __shared__ struct { ; } _np_closure [TB_SIZE];\n    // FP: \"19 -> 20;\n    // FP: \"20 -> 21;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"23 -> 24;\n    // FP: \"24 -> 25;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"25 -> 26;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"26 -> 27;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"29 -> 30;\n    __syncthreads();\n    // FP: \"30 -> 31;\n    while (true)\n    {\n      // FP: \"31 -> 32;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"34 -> 35;\n      __syncthreads();\n      // FP: \"35 -> 36;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"36 -> 37;\n        __syncthreads();\n        // FP: \"37 -> 38;\n        break;\n      }\n      // FP: \"39 -> 40;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"42 -> 43;\n      __syncthreads();\n      // FP: \"43 -> 44;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"44 -> 45;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"47 -> 48;\n      assert(nps.tb.src < __kernel_tb_size);\n      // FP: \"48 -> 49;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type current_edge;\n        current_edge = ns +_np_j;\n        {\n          index_type dest_node;\n          dest_node = graph.getAbsDestination(current_edge);\n          atomicTestAdd(&p_current_degree[dest_node], (uint32_t)1);\n          bitset_current_degree.set(dest_node);\n        }\n      }\n      // FP: \"56 -> 57;\n      __syncthreads();\n    }\n    // FP: \"58 -> 59;\n\n    // FP: \"59 -> 60;\n    {\n      const int warpid = threadIdx.x / 32;\n      // FP: \"60 -> 61;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"61 -> 62;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < 
_NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type current_edge;\n          current_edge = _np_w_start +_np_ii;\n          {\n            index_type dest_node;\n            dest_node = graph.getAbsDestination(current_edge);\n            atomicTestAdd(&p_current_degree[dest_node], (uint32_t)1);\n            bitset_current_degree.set(dest_node);\n          }\n        }\n      }\n      // FP: \"79 -> 80;\n      __syncthreads();\n      // FP: \"80 -> 81;\n    }\n\n    // FP: \"81 -> 82;\n    __syncthreads();\n    // FP: \"82 -> 83;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"83 -> 84;\n    while (_np.work())\n    {\n      // FP: \"84 -> 85;\n      int _np_i =0;\n      // FP: \"85 -> 86;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"86 -> 87;\n      __syncthreads();\n      // FP: \"87 -> 88;\n\n      // FP: \"88 -> 89;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type current_edge;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        current_edge= nps.fg.itvalue[_np_i];\n        {\n          index_type dest_node;\n          dest_node = graph.getAbsDestination(current_edge);\n          atomicTestAdd(&p_current_degree[dest_node], (uint32_t)1);\n          
bitset_current_degree.set(dest_node);\n        }\n      }\n      // FP: \"97 -> 98;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"98 -> 99;\n      __syncthreads();\n    }\n    // FP: \"100 -> 101;\n    assert(threadIdx.x < __kernel_tb_size);\n  }\n  // FP: \"102 -> 103;\n}\n__global__ void InitializeGraph1(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_current_degree, uint8_t * p_flag, uint32_t * p_trim)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_flag[src]           = true;\n      p_trim[src]           = 0;\n      p_current_degree[src] = 0;\n    }\n  }\n  // FP: \"9 -> 10;\n}\n__global__ void KCoreStep2(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_current_degree, uint8_t * p_flag, uint32_t * p_trim)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_flag[src])\n      {\n        if (p_trim[src] > 0)\n        {\n          p_current_degree[src] = p_current_degree[src] - p_trim[src];\n        }\n      }\n      p_trim[src] = 0;\n    }\n  }\n  // FP: \"12 -> 13;\n}\n__global__ void KCoreStep1_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_k_core_num, uint32_t * p_current_degree, uint8_t * p_flag, uint32_t * p_trim, DynamicBitset& bitset_trim, HGAccumulator<unsigned int> active_vertices, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = 
TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type current_edge;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      current_edge = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        dst = graph.getAbsDestination(current_edge);\n        
atomicTestAdd(&p_trim[dst], (uint32_t)1);\n        bitset_trim.set(dst);\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"44 -> 45;\n}\n__global__ void KCoreStep1(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_k_core_num, uint32_t * p_current_degree, uint8_t * p_flag, uint32_t * p_trim, DynamicBitset& bitset_trim, HGAccumulator<unsigned int> active_vertices, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_KCoreStep1;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  active_vertices.thread_entry();\n  // FP: \"7 -> 8;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"8 -> 9;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? 
true: false);\n    // FP: \"9 -> 10;\n    if (pop)\n    {\n      if (p_flag[src])\n      {\n        if (p_current_degree[src] < local_k_core_num)\n        {\n          p_flag[src] = false;\n          active_vertices.reduce( 1);\n        }\n        else\n        {\n          pop = false;\n        }\n      }\n      else\n      {\n        pop = false;\n      }\n    }\n    // FP: \"17 -> 18;\n    // FP: \"20 -> 21;\n    // FP: \"21 -> 22;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"22 -> 23;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"25 -> 26;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"26 -> 27;\n    __shared__ struct { ; } _np_closure [TB_SIZE];\n    // FP: \"27 -> 28;\n    // FP: \"28 -> 29;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"31 -> 32;\n    // FP: \"32 -> 33;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"33 -> 34;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"34 -> 35;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"37 -> 38;\n    __syncthreads();\n    // FP: \"38 -> 39;\n    while (true)\n    {\n      // FP: \"39 -> 40;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"42 -> 43;\n      __syncthreads();\n      // FP: \"43 -> 44;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"44 -> 45;\n        __syncthreads();\n        // FP: \"45 -> 46;\n        break;\n      }\n      // FP: \"47 -> 48;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"50 -> 51;\n      __syncthreads();\n      // FP: \"51 -> 52;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"52 -> 53;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"55 -> 56;\n      assert(nps.tb.src < __kernel_tb_size);\n      // FP: \"56 -> 57;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type current_edge;\n        current_edge = ns +_np_j;\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(current_edge);\n          atomicTestAdd(&p_trim[dst], (uint32_t)1);\n          bitset_trim.set(dst);\n        }\n      }\n      // FP: \"64 -> 65;\n      __syncthreads();\n    }\n    // FP: \"66 -> 67;\n\n    // FP: \"67 -> 68;\n    {\n      const int warpid = threadIdx.x / 32;\n      // FP: \"68 -> 69;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"69 -> 70;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= 
_NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type current_edge;\n          current_edge = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            dst = graph.getAbsDestination(current_edge);\n            atomicTestAdd(&p_trim[dst], (uint32_t)1);\n            bitset_trim.set(dst);\n          }\n        }\n      }\n      // FP: \"87 -> 88;\n      __syncthreads();\n      // FP: \"88 -> 89;\n    }\n\n    // FP: \"89 -> 90;\n    __syncthreads();\n    // FP: \"90 -> 91;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"91 -> 92;\n    while (_np.work())\n    {\n      // FP: \"92 -> 93;\n      int _np_i =0;\n      // FP: \"93 -> 94;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"94 -> 95;\n      __syncthreads();\n      // FP: \"95 -> 96;\n\n      // FP: \"96 -> 97;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type current_edge;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        current_edge= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(current_edge);\n          atomicTestAdd(&p_trim[dst], (uint32_t)1);\n          bitset_trim.set(dst);\n        }\n      }\n      // FP: \"105 -> 106;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"106 -> 
107;\n      __syncthreads();\n    }\n    // FP: \"108 -> 109;\n    assert(threadIdx.x < __kernel_tb_size);\n  }\n  // FP: \"112 -> 113;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"113 -> 114;\n}\n__global__ void KCoreSanityCheck(CSRGraph graph, unsigned int __begin, unsigned int __end, uint8_t * p_flag, HGAccumulator<uint64_t> active_vertices)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  active_vertices.thread_entry();\n  // FP: \"3 -> 4;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_flag[src])\n      {\n        active_vertices.reduce( 1);\n      }\n    }\n  }\n  // FP: \"11 -> 12;\n  active_vertices.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(active_vertices_ts);\n  // FP: \"12 -> 13;\n}\nvoid InitializeGraph2_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph2 <<<blocks, __tb_InitializeGraph2>>>(ctx->gg, __begin, __end, ctx->current_degree.data.gpu_wr_ptr(), *(ctx->current_degree.is_updated.gpu_rd_ptr()), t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      InitializeGraph2_TB_LB <<<blocks, __tb_InitializeGraph2>>>(ctx->gg, __begin, __end, ctx->current_degree.data.gpu_wr_ptr(), *(ctx->current_degree.is_updated.gpu_rd_ptr()), 
t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph2_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph2_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph2_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph2_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph2_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph2_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph1_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  t_work.init_thread_work(ctx->gg.nnodes);\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph1 <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->current_degree.data.gpu_wr_ptr(), ctx->flag.data.gpu_wr_ptr(), ctx->trim.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph1_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph1_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph1_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph1_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph1_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph1_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreStep2_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 
3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  KCoreStep2 <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->current_degree.data.gpu_wr_ptr(), ctx->flag.data.gpu_wr_ptr(), ctx->trim.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid KCoreStep2_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreStep2_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreStep2_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreStep2_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreStep2_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreStep2_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreStep1_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, uint32_t local_k_core_num, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  KCoreStep1 <<<blocks, __tb_KCoreStep1>>>(ctx->gg, __begin, __end, local_k_core_num, ctx->current_degree.data.gpu_wr_ptr(), ctx->flag.data.gpu_wr_ptr(), ctx->trim.data.gpu_wr_ptr(), *(ctx->trim.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      KCoreStep1_TB_LB <<<blocks, __tb_KCoreStep1>>>(ctx->gg, 
__begin, __end, local_k_core_num, ctx->current_degree.data.gpu_wr_ptr(), ctx->flag.data.gpu_wr_ptr(), ctx->trim.data.gpu_wr_ptr(), *(ctx->trim.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid KCoreStep1_allNodes_cuda(unsigned int & active_vertices, uint32_t local_k_core_num, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreStep1_cuda(0, ctx->gg.nnodes, active_vertices, local_k_core_num, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreStep1_masterNodes_cuda(unsigned int & active_vertices, uint32_t local_k_core_num, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreStep1_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, local_k_core_num, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreStep1_nodesWithEdges_cuda(unsigned int & active_vertices, uint32_t local_k_core_num, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreStep1_cuda(0, ctx->numNodesWithEdges, active_vertices, local_k_core_num, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreSanityCheck_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint64_t> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> active_verticesval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  KCoreSanityCheck <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->flag.data.gpu_wr_ptr(), _active_vertices);\n  cudaDeviceSynchronize();\n  // FP: \"9 -> 10;\n  
check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid KCoreSanityCheck_allNodes_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreSanityCheck_cuda(0, ctx->gg.nnodes, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreSanityCheck_masterNodes_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid KCoreSanityCheck_nodesWithEdges_cuda(uint64_t & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  KCoreSanityCheck_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_push_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"kcore_push_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<uint32_t> current_degree;\n\tstruct CUDA_Context_Field<uint8_t> flag;\n\tstruct CUDA_Context_Field<uint32_t> trim;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->current_degree, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->flag, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->trim, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->current_degree, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->flag, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->trim, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->current_degree.data.zero_gpu();\n\tctx->flag.data.zero_gpu();\n\tctx->trim.data.zero_gpu();\n}\n\nvoid get_bitset_current_degree_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->current_degree.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_current_degree_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->current_degree.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_current_degree_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->current_degree, 
begin, end);\n}\n\nuint32_t get_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *current_degree = ctx->current_degree.data.cpu_rd_ptr();\n\treturn current_degree[LID];\n}\n\nvoid set_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *current_degree = ctx->current_degree.data.cpu_wr_ptr();\n\tcurrent_degree[LID] = v;\n}\n\nvoid add_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *current_degree = ctx->current_degree.data.cpu_wr_ptr();\n\tcurrent_degree[LID] += v;\n}\n\nbool min_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *current_degree = ctx->current_degree.data.cpu_wr_ptr();\n\tif (current_degree[LID] > v){\n\t\tcurrent_degree[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->current_degree, from_id, v);\n}\n\nvoid batch_get_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->current_degree, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->current_degree, from_id, v);\n}\n\nvoid batch_get_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->current_degree, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->current_degree, from_id, 
v, i);\n}\n\nvoid batch_get_reset_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->current_degree, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_set_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_add_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_min_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->current_degree, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_current_degree_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->current_degree, begin, end, v);\n}\n\nvoid get_bitset_flag_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) 
{\n\tctx->flag.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_flag_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->flag.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_flag_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->flag, begin, end);\n}\n\nuint8_t get_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint8_t *flag = ctx->flag.data.cpu_rd_ptr();\n\treturn flag[LID];\n}\n\nvoid set_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v) {\n\tuint8_t *flag = ctx->flag.data.cpu_wr_ptr();\n\tflag[LID] = v;\n}\n\nvoid add_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v) {\n\tuint8_t *flag = ctx->flag.data.cpu_wr_ptr();\n\tflag[LID] += v;\n}\n\nbool min_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v) {\n\tuint8_t *flag = ctx->flag.data.cpu_wr_ptr();\n\tif (flag[LID] > v){\n\t\tflag[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint8_t, sharedMaster, false>(ctx, &ctx->flag, from_id, v);\n}\n\nvoid batch_get_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint8_t, sharedMaster, false>(ctx, &ctx->flag, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, false>(ctx, &ctx->flag, from_id, v);\n}\n\nvoid batch_get_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, false>(ctx, &ctx->flag, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint8_t i) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, true>(ctx, 
&ctx->flag, from_id, v, i);\n}\n\nvoid batch_get_reset_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint8_t i) {\n\tbatch_get_shared_field<uint8_t, sharedMirror, true>(ctx, &ctx->flag, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMirror, setOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_set_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMaster, setOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMirror, addOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_add_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMaster, addOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMirror, minOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_min_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint8_t, sharedMaster, minOp>(ctx, &ctx->flag, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_flag_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint8_t v) {\n\treset_data_field<uint8_t>(&ctx->flag, begin, end, v);\n}\n\nvoid get_bitset_trim_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->trim.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_trim_reset_cuda(struct CUDA_Context* ctx) 
{\n\tctx->trim.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_trim_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->trim, begin, end);\n}\n\nuint32_t get_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *trim = ctx->trim.data.cpu_rd_ptr();\n\treturn trim[LID];\n}\n\nvoid set_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *trim = ctx->trim.data.cpu_wr_ptr();\n\ttrim[LID] = v;\n}\n\nvoid add_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *trim = ctx->trim.data.cpu_wr_ptr();\n\ttrim[LID] += v;\n}\n\nbool min_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *trim = ctx->trim.data.cpu_wr_ptr();\n\tif (trim[LID] > v){\n\t\ttrim[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->trim, from_id, v);\n}\n\nvoid batch_get_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->trim, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->trim, from_id, v);\n}\n\nvoid batch_get_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->trim, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->trim, from_id, v, i);\n}\n\nvoid batch_get_reset_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, 
uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->trim, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_set_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_add_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_min_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->trim, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_trim_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->trim, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_push_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_current_degree_cuda(struct CUDA_Context* ctx,\n                                    uint64_t* bitset_compute);\nvoid bitset_current_degree_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_current_degree_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                      size_t end);\nuint32_t get_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                  uint32_t v);\nvoid add_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                  uint32_t v);\nbool min_node_current_degree_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                  uint32_t v);\nvoid batch_get_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v);\nvoid batch_get_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        size_t* v_size,\n                                        DataCommMode* data_mode);\nvoid batch_get_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               size_t* v_size,\n                                               DataCommMode* data_mode);\nvoid batch_get_reset_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                              unsigned from_id, uint8_t* v,\n                                              uint32_t i);\nvoid batch_get_reset_node_current_degree_cuda(struct 
CUDA_Context* ctx,\n                                              unsigned from_id, uint8_t* v,\n                                              size_t* v_size,\n                                              DataCommMode* data_mode,\n                                              uint32_t i);\nvoid batch_set_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_set_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        DataCommMode data_mode);\nvoid batch_add_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_add_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        DataCommMode data_mode);\nvoid batch_min_mirror_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                               unsigned from_id, uint8_t* v,\n                                               DataCommMode data_mode);\nvoid batch_min_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        DataCommMode data_mode);\nvoid batch_reset_node_current_degree_cuda(struct CUDA_Context* ctx,\n                                          size_t begin, size_t end, uint32_t v);\n\nvoid get_bitset_flag_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_flag_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_flag_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end);\nuint8_t get_node_flag_cuda(struct CUDA_Context* 
ctx, unsigned LID);\nvoid set_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v);\nvoid add_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v);\nbool min_node_flag_cuda(struct CUDA_Context* ctx, unsigned LID, uint8_t v);\nvoid batch_get_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v);\nvoid batch_get_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, size_t* v_size,\n                              DataCommMode* data_mode);\nvoid batch_get_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v);\nvoid batch_get_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                                     DataCommMode* data_mode);\nvoid batch_get_reset_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, uint8_t i);\nvoid batch_get_reset_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, size_t* v_size,\n                                    DataCommMode* data_mode, uint8_t i);\nvoid batch_set_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_set_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_add_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, 
DataCommMode data_mode);\nvoid batch_min_node_flag_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_flag_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end, uint8_t v);\n\nvoid get_bitset_trim_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_trim_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_trim_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end);\nuint32_t get_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid add_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nbool min_node_trim_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid batch_get_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v);\nvoid batch_get_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, size_t* v_size,\n                              DataCommMode* data_mode);\nvoid batch_get_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v);\nvoid batch_get_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                                     DataCommMode* data_mode);\nvoid batch_get_reset_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, uint32_t i);\nvoid batch_get_reset_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, size_t* v_size,\n                                    DataCommMode* data_mode, uint32_t i);\nvoid batch_set_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode 
data_mode);\nvoid batch_set_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_add_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_min_node_trim_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_trim_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end, uint32_t v);\n\nvoid InitializeGraph1_cuda(unsigned int __begin, unsigned int __end,\n                           struct CUDA_Context* ctx);\nvoid InitializeGraph1_allNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph1_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph1_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph2_cuda(unsigned int __begin, unsigned int __end,\n                           struct CUDA_Context* ctx);\nvoid InitializeGraph2_allNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph2_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph2_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid KCoreSanityCheck_cuda(unsigned int __begin, unsigned int __end,\n                           uint64_t& active_vertices, struct CUDA_Context* ctx);\nvoid KCoreSanityCheck_allNodes_cuda(uint64_t& active_vertices,\n                                    struct CUDA_Context* ctx);\nvoid KCoreSanityCheck_masterNodes_cuda(uint64_t& active_vertices,\n                                       struct CUDA_Context* ctx);\nvoid 
KCoreSanityCheck_nodesWithEdges_cuda(uint64_t& active_vertices,\n                                          struct CUDA_Context* ctx);\nvoid KCoreStep1_cuda(unsigned int __begin, unsigned int __end,\n                     unsigned int& active_vertices, uint32_t local_k_core_num,\n                     struct CUDA_Context* ctx);\nvoid KCoreStep1_allNodes_cuda(unsigned int& active_vertices,\n                              uint32_t local_k_core_num,\n                              struct CUDA_Context* ctx);\nvoid KCoreStep1_masterNodes_cuda(unsigned int& active_vertices,\n                                 uint32_t local_k_core_num,\n                                 struct CUDA_Context* ctx);\nvoid KCoreStep1_nodesWithEdges_cuda(unsigned int& active_vertices,\n                                    uint32_t local_k_core_num,\n                                    struct CUDA_Context* ctx);\nvoid KCoreStep2_cuda(unsigned int __begin, unsigned int __end,\n                     struct CUDA_Context* ctx);\nvoid KCoreStep2_allNodes_cuda(struct CUDA_Context* ctx);\nvoid KCoreStep2_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid KCoreStep2_nodesWithEdges_cuda(struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_push_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"kcore_push_cuda.cuh\", system = False)], parse = False),\nKernel(\"InitializeGraph2\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_current_degree'), ('DynamicBitset&', 'bitset_current_degree')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"current_edge\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dest_node\", \"\")]),\nCBlock([\"dest_node = graph.getAbsDestination(current_edge)\"]),\nCBlock([\"atomicTestAdd(&p_current_degree[dest_node], (uint32_t)1)\"]),\nCBlock([\"bitset_current_degree.set(dest_node)\"]),\n]),\n),\n]),\n]),\nKernel(\"InitializeGraph1\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_current_degree'), ('uint8_t *', 'p_flag'), ('uint32_t *', 'p_trim')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_flag[src]           = true\"]),\nCBlock([\"p_trim[src]           = 0\"]),\nCBlock([\"p_current_degree[src] = 0\"]),\n]),\n]),\n]),\nKernel(\"KCoreStep2\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_current_degree'), ('uint8_t *', 'p_flag'), ('uint32_t *', 'p_trim')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_flag[src]\",\n[\nIf(\"p_trim[src] > 0\",\n[\nCBlock([\"p_current_degree[src] = p_current_degree[src] - p_trim[src]\"]),\n]),\n]),\nCBlock([\"p_trim[src] = 0\"]),\n]),\n]),\n]),\nKernel(\"KCoreStep1\", [G.param(), 
('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t', 'local_k_core_num'), ('uint32_t *', 'p_current_degree'), ('uint8_t *', 'p_flag'), ('uint32_t *', 'p_trim'), ('DynamicBitset&', 'bitset_trim'), ('HGAccumulator<unsigned int>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_flag[src]\",\n[\nIf(\"p_current_degree[src] < local_k_core_num\",\n[\nCBlock([\"p_flag[src] = false\"]),\nCBlock([\"active_vertices.reduce( 1)\"]),\n], [ CBlock([\"pop = false\"]), ]),\n], [ CBlock([\"pop = false\"]), ]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"current_edge\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(current_edge)\"]),\nCBlock([\"atomicTestAdd(&p_trim[dst], (uint32_t)1)\"]),\nCBlock([\"bitset_trim.set(dst)\"]),\n]),\n),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"KCoreSanityCheck\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint8_t *', 'p_flag'), ('HGAccumulator<uint64_t>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_flag[src]\",\n[\nCBlock([\"active_vertices.reduce( 1)\"]),\n]),\n]),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"InitializeGraph2_cuda\", [('unsigned int ', '__begin'), 
('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph2\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->current_degree.data.gpu_wr_ptr()\", \"*(ctx->current_degree.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph2_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph2_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph2_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph2_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph2_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph2_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph1_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph1\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->current_degree.data.gpu_wr_ptr()\", \"ctx->flag.data.gpu_wr_ptr()\", \"ctx->trim.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph1_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph1_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph1_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph1_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph1_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph1_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], 
host = True),\nKernel(\"KCoreStep2_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"KCoreStep2\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->current_degree.data.gpu_wr_ptr()\", \"ctx->flag.data.gpu_wr_ptr()\", \"ctx->trim.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"KCoreStep2_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreStep2_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"KCoreStep2_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreStep2_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"KCoreStep2_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreStep2_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"KCoreStep1_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'active_vertices'), ('uint32_t', 'local_k_core_num'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"KCoreStep1\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_k_core_num\", \"ctx->current_degree.data.gpu_wr_ptr()\", \"ctx->flag.data.gpu_wr_ptr()\", \"ctx->trim.data.gpu_wr_ptr()\", \"*(ctx->trim.is_updated.gpu_rd_ptr())\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = 
*(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"KCoreStep1_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('uint32_t', 'local_k_core_num'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreStep1_cuda(0, ctx->gg.nnodes, active_vertices, local_k_core_num, ctx)\"]),\n], host = True),\nKernel(\"KCoreStep1_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('uint32_t', 'local_k_core_num'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreStep1_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, local_k_core_num, ctx)\"]),\n], host = True),\nKernel(\"KCoreStep1_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('uint32_t', 'local_k_core_num'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreStep1_cuda(0, ctx->numNodesWithEdges, active_vertices, local_k_core_num, ctx)\"]),\n], host = True),\nKernel(\"KCoreSanityCheck_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"active_verticesval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"KCoreSanityCheck\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->flag.data.gpu_wr_ptr()\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"KCoreSanityCheck_allNodes_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreSanityCheck_cuda(0, ctx->gg.nnodes, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"KCoreSanityCheck_masterNodes_cuda\", [('uint64_t &', 
'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"KCoreSanityCheck_nodesWithEdges_cuda\", [('uint64_t &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"KCoreSanityCheck_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/k-core/kcore_push_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\n////////////////////////////////////////////////////////////////////////////////\n// current_degree\n////////////////////////////////////////////////////////////////////////////////\n\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(current_degree, uint32_t);\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(current_degree, uint32_t);\nGALOIS_SYNC_STRUCTURE_BITSET(current_degree);\n\n////////////////////////////////////////////////////////////////////////////////\n// trim\n////////////////////////////////////////////////////////////////////////////////\n\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(trim, uint32_t);\nGALOIS_SYNC_STRUCTURE_BITSET(trim);\n"
  },
  {
    "path": "lonestar/analytics/distributed/matrixcompletion/CMakeLists.txt",
    "content": "app_dist(matrixCompletion matrixcompletion NO_GPU)\nadd_test_dist(matrixcompletion-dist Epinions_dataset NO_ASYNC NO_GPU ${BASEINPUT}/weighted/bipartite/Epinions_dataset.gr -graphTranspose=${BASEINPUT}/weighted/bipartite/Epinions_dataset.tgr -maxIterations=2)\n"
  },
  {
    "path": "lonestar/analytics/distributed/matrixcompletion/README.md",
    "content": "Matrix Completion\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nRuns matrix completion using stochastic gradient descent.\n\nThe algorithm is a bulk synchronous parallel residual based algorithm. In\neach round, updates to the latent vectors are calcuated based on the current\nerror between 2 nodes and then applied at the end of the round.\n\nINPUT\n--------------------------------------------------------------------------------\n\nTakes in bipartite Galois .gr graphs: all nodes with edges should be located\nin the prefix of the graph.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/distributed/matrixcompletion; make -j\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run for a max of 10 iterations, do the following\n`./matrixcompletion-dist <bipartite-input-graph> -t=<num-threads> -maxIterations=10`\n\nTo run on 3 hosts h1, h2, and h3 with changes to the learning parameters, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./matrixcompletion-dist <bipartite-input-graph> -t=<num-threads> -DECAY_RATE=0.5 -LAMBDA=0.001 -LEARNING_RATE=0.001`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\n* Convergence/time to convergence may be affected by the different learning \n  parameters (e.g. decay rate, lambda, learning rate). They may need tuning for\n  best performance. The best parameters are input dependent.\n\n* For 16 or less hosts/GPUs, for performance, we recommend using an\n  **edge-cut** partitioning policy (OEC or IEC).\n\n* For 32 or more hosts/GPUs, for performance, we recommend using the\n  **Cartesian vertex-cut** partitioning policy (CVC).\n"
  },
  {
    "path": "lonestar/analytics/distributed/matrixcompletion/matrixCompletion.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include <iostream>\n#include <limits>\n#include <cmath>\n#include \"DistBench/Start.h\"\n#include \"galois/ArrayWrapper.h\"\n#include \"galois/AtomicWrapper.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"galois/cuda/cuda_device.h\"\n#include \"sgd_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"SGD\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<unsigned int>\n    maxIterations(\"maxIterations\",\n                  
cll::desc(\"Maximum iterations: Default 10000\"),\n                  cll::init(10000));\nstatic cll::opt<double>\n    LEARNING_RATE(\"LEARNING_RATE\",\n                  cll::desc(\"Learning rate (GAMMA): Default 0.00001\"),\n                  cll::init(0.00001));\nstatic cll::opt<double> LAMBDA(\"LAMBDA\", cll::desc(\"LAMBDA: Default 0.0001\"),\n                               cll::init(0.0001));\nstatic cll::opt<double>\n    DECAY_RATE(\"DECAY_RATE\",\n               cll::desc(\"Decay rate to be used in step size function \"\n                         \"(DECAY_RATE): Default 0.9\"),\n               cll::init(0.9));\n\n/******************************************************************************/\n/* Graph structure declarations + helper functions + other initialization */\n/******************************************************************************/\n\n#define LATENT_VECTOR_SIZE 20\n// static const double LEARNING_RATE = 0.00001; // GAMMA, Purdue: 0.01 Intel:\n// 0.001 static const double DECAY_RATE = 0.9; // STEP_DEC, Purdue: 0.1 Intel:\n// 0.9 static const double LAMBDA = 0.0001; // Purdue: 1.0 Intel: 0.001\nstatic const double MINVAL = -1e+100;\nstatic const double MAXVAL = 1e+100;\n\nstruct NodeData {\n\n  std::vector<galois::CopyableAtomic<double>> residual_latent_vector;\n  std::vector<double> latent_vector;\n};\n\ntypedef galois::graphs::DistGraph<NodeData, double> Graph;\ntypedef typename Graph::GraphNode GNode;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n#include \"matrixCompletion_sync.hh\"\n// TODO: Set seed\nstatic double genRand() {\n  // generate a random double in (-1,1)\n  return 2.0 * ((double)std::rand() / (double)RAND_MAX) - 1.0;\n}\n\n// Purdue learning function\ndouble getstep_size(unsigned int round) {\n  return LEARNING_RATE * 1.5 / (1.0 + DECAY_RATE * pow(round + 1, 1.5));\n}\n\n/**\n * Prediction of edge weight based on 2 latent vectors\n */\ndouble calcPrediction(const NodeData& movie_data, const 
NodeData& user_data) {\n  double pred = galois::innerProduct(movie_data.latent_vector,\n                                     user_data.latent_vector, 0.0);\n\n  pred = std::min(MAXVAL, pred);\n  pred = std::max(MINVAL, pred);\n\n  return pred;\n}\n\n/******************************************************************************/\n/* Algorithm structures */\n/******************************************************************************/\n\nstruct InitializeGraph {\n  Graph* graph;\n\n  InitializeGraph(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    auto& allNodes = _graph.allNodesRange();\n\n#ifdef GALOIS_ENABLE_GPU\n    if (personality == GPU_CUDA) {\n      std::string impl_str(\n          syncSubstrate->get_run_identifier(\"InitializeGraph\"));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str());\n      StatTimer_cuda.start();\n      InitializeGraph_cuda(*allNodes.begin(), *allNodes.end(), cuda_ctx);\n      StatTimer_cuda.stop();\n    } else if (personality == CPU)\n#endif\n      galois::do_all(galois::iterate(allNodes.begin(), allNodes.end()),\n                     InitializeGraph{&_graph}, galois::loopname(\"Init\"));\n\n    // due to latent_vector being generated randomly, it should be sync'd\n    // to 1 consistent version across all hosts\n    syncSubstrate->sync<writeSource, readAny, Reduce_set_latent_vector>(\n        \"InitializeGraph\");\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n\n    // resize vectors\n    sdata.latent_vector.resize(LATENT_VECTOR_SIZE);\n    sdata.residual_latent_vector.resize(LATENT_VECTOR_SIZE);\n\n    for (int i = 0; i < LATENT_VECTOR_SIZE; i++) {\n      sdata.latent_vector[i] = genRand();  // randomly create latent vector\n      sdata.residual_latent_vector[i] = 0; // randomly create latent vector\n\n#ifndef NDEBUG\n      if (!std::isnormal(sdata.latent_vector[i]))\n        galois::gDebug(\"GEN for \", i, \" \", sdata.latent_vector[i]);\n#endif\n    
}\n  }\n};\n\nstruct SGD_mergeResidual {\n  Graph* graph;\n\n  SGD_mergeResidual(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n\n    auto& allNodes = _graph.allNodesRange();\n\n#ifdef GALOIS_ENABLE_GPU\n    if (personality == GPU_CUDA) {\n      std::string impl_str(\"SGD_\" + (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str());\n      StatTimer_cuda.start();\n      int __retval = 0;\n      SGD_all_cuda(__retval, cuda_ctx);\n      // DGAccumulator_accum += __retval;\n      StatTimer_cuda.stop();\n    } else if (personality == CPU)\n#endif\n\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          SGD_mergeResidual{&_graph},\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"SGD_merge\").c_str()),\n          galois::steal(), galois::no_stats());\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata              = graph->getData(src);\n    auto& latent_vector          = sdata.latent_vector;\n    auto& residual_latent_vector = sdata.residual_latent_vector;\n\n    for (int i = 0; i < LATENT_VECTOR_SIZE; ++i) {\n      latent_vector[i] += residual_latent_vector[i];\n      residual_latent_vector[i] = 0;\n\n#ifndef NDEBUG\n      if (!std::isnormal(sdata.latent_vector[i]))\n        galois::gDebug(\"GEN for \", i, \" \", sdata.latent_vector[i]);\n#endif\n    }\n  }\n};\n\nstruct SGD {\n  Graph* graph;\n  double step_size;\n  galois::DGAccumulator<double>& DGAccumulator_accum;\n\n  SGD(Graph* _graph, double _step_size, galois::DGAccumulator<double>& _dga)\n      : graph(_graph), step_size(_step_size), DGAccumulator_accum(_dga) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<double>& dga) {\n    unsigned _num_iterations = 0;\n    double rms_normalized    = 0.0;\n    auto& nodesWithEdges     = _graph.allNodesWithEdgesRange();\n    const auto& net          = galois::runtime::getSystemNetworkInterface();\n    
galois::gPrint(\"Nodes with edges on : \", net.ID, \" : \",\n                   std::distance(nodesWithEdges.begin(), nodesWithEdges.end()),\n                   \"\\n\");\n    do {\n      galois::gPrint(\"ITERATION : \", _num_iterations, \"\\n\");\n\n      auto step_size = getstep_size(_num_iterations);\n      syncSubstrate->set_num_round(_num_iterations);\n      dga.reset();\n      galois::do_all(\n          galois::iterate(nodesWithEdges), SGD(&_graph, step_size, dga),\n          galois::loopname(syncSubstrate->get_run_identifier(\"SGD\").c_str()),\n          galois::steal(), galois::no_stats());\n\n      // sync all residual latent vectors\n      syncSubstrate->sync<writeAny, readAny,\n                          Reduce_pair_wise_add_array_residual_latent_vector>(\n          \"SGD\");\n\n      SGD_mergeResidual::go(_graph);\n\n      ++_num_iterations;\n\n      // calculate root mean squared error\n      rms_normalized = std::sqrt(dga.reduce() / _graph.globalSizeEdges());\n      galois::gDebug(\"RMS Normalized : \", rms_normalized);\n      galois::gPrint(\"RMS Normalized: \", rms_normalized, \"\\n\");\n    } while ((_num_iterations < maxIterations) && (rms_normalized > 1));\n\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::runtime::reportStat_Single(\n          REGION_NAME,\n          \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n          _num_iterations);\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata           = graph->getData(src);\n    auto& movie_node          = sdata.latent_vector;\n    auto& residual_movie_node = sdata.residual_latent_vector;\n\n    for (auto jj = graph->edge_begin(src), ej = graph->edge_end(src); jj != ej;\n         ++jj) {\n      GNode dst   = graph->getEdgeDst(jj);\n      auto& ddata = graph->getData(dst);\n\n      auto& user_node          = ddata.latent_vector;\n      auto& residual_user_node = ddata.residual_latent_vector;\n      // auto& sdata_up = 
sdata.updates;\n\n      double edge_rating = graph->getEdgeData(dst);\n\n      // doGradientUpdate\n      double old_dp = galois::innerProduct(user_node, movie_node, double(0));\n\n      double cur_error = edge_rating - old_dp;\n      DGAccumulator_accum += (cur_error * cur_error);\n\n      assert(cur_error < 10000 && cur_error > -10000);\n\n      // update both vectors based on error derived from 2 previous vectors\n      for (int i = 0; i < LATENT_VECTOR_SIZE; ++i) {\n\n        double prevUser  = user_node[i];\n        double prevMovie = movie_node[i];\n\n        galois::atomicAdd(\n            residual_user_node[i],\n            double(step_size * (cur_error * prevMovie - LAMBDA * prevUser)));\n        assert(std::isnormal(residual_user_node[i].load()));\n\n        galois::atomicAdd(\n            residual_movie_node[i],\n            double(step_size * (cur_error * prevUser - LAMBDA * prevMovie)));\n        assert(std::isnormal(residual_movie_node[i].load()));\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\nconstexpr static const char* const name = \"SGD - Distributed Heterogeneous\";\nconstexpr static const char* const desc = \"SGD on Distributed Galois.\";\nconstexpr static const char* const url  = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  const auto& net = galois::runtime::getSystemNetworkInterface();\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n  }\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n  std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, double>(&cuda_ctx);\n#else\n  std::tie(hg, syncSubstrate) = distGraphInitialization<NodeData, 
double>();\n#endif\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n\n  InitializeGraph::go((*hg));\n\n  galois::runtime::getHostBarrier().wait();\n\n  // accumulators for use in operators\n  galois::DGAccumulator<double> DGAccumulator_accum;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] SGD::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    SGD::go((*hg), DGAccumulator_accum);\n    StatTimer_main.stop();\n\n    if ((run + 1) != numRuns) {\n      syncSubstrate->set_num_run(run + 1);\n      InitializeGraph::go(*hg);\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    galois::gError(\"output requested but this application doesn't support it\");\n    return 1;\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/matrixcompletion/matrixCompletion_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n#include \"galois/AtomicWrapper.h\"\n#include \"galois/ArrayWrapper.h\"\n\n#define LATENT_VECTOR_SIZE 20\n\ntypedef galois::CopyableArray<double, LATENT_VECTOR_SIZE> ArrTy;\ntypedef galois::CopyableArray<galois::CopyableAtomic<double>, LATENT_VECTOR_SIZE> ArrAtomicTy;\ntypedef std::vector<galois::CopyableAtomic<double>> VecAtomicTy;\ntypedef std::vector<double> VecTy;\n\n//GALOIS_SYNC_STRUCTURE_REDUCE_SET(residual_latent_vector, ArrAtomicTy);\n//GALOIS_SYNC_STRUCTURE_REDUCE_PAIR_WISE_ADD_ARRAY(residual_latent_vector, ArrAtomicTy);\n\n//New vector type\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(residual_latent_vector, VecAtomicTy);\nGALOIS_SYNC_STRUCTURE_REDUCE_PAIR_WISE_ADD_ARRAY(residual_latent_vector, 
VecAtomicTy);\n\n\n//GALOIS_SYNC_STRUCTURE_REDUCE_PAIR_WISE_AVG_ARRAY(residual_latent_vector, ArrAtomicTy);\n//GALOIS_SYNC_STRUCTURE_REDUCE_SET(latent_vector, ArrTy);\n//GALOIS_SYNC_STRUCTURE_REDUCE_PAIR_WISE_AVG_ARRAY(latent_vector, ArrTy);\n\n//New vector type\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(latent_vector, VecTy);\nGALOIS_SYNC_STRUCTURE_REDUCE_PAIR_WISE_AVG_ARRAY(latent_vector, VecTy);\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/CMakeLists.txt",
    "content": "app_dist(pagerank_pull pagerank-pull)\nadd_test_dist(pagerank-pull-dist rmat15 ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -maxIterations=100)\n\napp_dist(pagerank_push pagerank-push)\nadd_test_dist(pagerank-push-dist rmat15 ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr -maxIterations=100)\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/README.md",
    "content": "PageRank\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nGiven a graph, ranks nodes in order of their importance using the PageRank\nalgorithm.\n\nThe algorithm supports both a bulk-synchronous and a bulk-asynchronous\nparallel algorithms. This benchmark consists of two algorithms,\npush- and pull-based. In the push-based algorithm, if a node has new\ncontributions to its neighbors' page rank values, it will push them out\nto them, in each round. In the pull-based algorithm, every node will\ncontribute to its own pagerank from its neighbors if they have any new\ncontributions to give, in each round.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/distributed/pagerank/; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run on 1 host for a max of 100 iterations, use the following:\n`./pagerank-push-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -maxIterations=100` \n`./pagerank-pull-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -maxIterations=100` \n\nTo run on 3 hosts h1, h2, and h3 for a max of 100 iterations with tolerance 0.001, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./pagerank-push-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -maxIterations=100 -tolerance=0.001`\n`mpirun -n=3 -hosts=h1,h2,h3 ./pagerank-pull-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -maxIterations=100 -tolerance=0.001`\n\nTo run on 3 hosts h1, h2, and h3 with an incoming edge cut, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./pagerank-push-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -partition=iec`\n`mpirun -n=3 -hosts=h1,h2,h3 ./pagerank-pull-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -partition=iec`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* The pull variant generally performs better in our experience.\n\n* For 16 or fewer hosts/GPUs, for performance, we recommend using an\n  **edge-cut** partitioning policy (OEC or IEC) with **synchronous**\n  communication.\n\n* For 32 or more hosts/GPUs, for performance, we recommend using the\n  **Cartesian vertex-cut** partitioning policy (CVC) with **asynchronous**\n  communication.\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_pull.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/gstl.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <algorithm>\n#include <iostream>\n#include <limits>\n#include <vector>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"pagerank_pull_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"PageRank\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\nnamespace cll = llvm::cl;\n\nstatic cll::opt<float> 
tolerance(\"tolerance\",\n                                 cll::desc(\"tolerance for residual\"),\n                                 cll::init(0.000001));\nstatic cll::opt<unsigned int>\n    maxIterations(\"maxIterations\",\n                  cll::desc(\"Maximum iterations: Default 1000\"),\n                  cll::init(1000));\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other initialization */\n/******************************************************************************/\n\nstatic const float alpha = (1.0 - 0.85);\nstruct NodeData {\n  float value;\n  std::atomic<uint32_t> nout;\n  float residual;\n  float delta;\n};\n\ngalois::DynamicBitSet bitset_residual;\ngalois::DynamicBitSet bitset_nout;\n\ntypedef galois::graphs::DistGraph<NodeData, void> Graph;\ntypedef typename Graph::GraphNode GNode;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n#include \"pagerank_pull_sync.hh\"\n\n/******************************************************************************/\n/* Algorithm structures */\n/******************************************************************************/\n\n/* (Re)initialize all fields to 0 except for residual which needs to be 0.15\n * everywhere */\nstruct ResetGraph {\n  const float& local_alpha;\n  Graph* graph;\n\n  ResetGraph(const float& _local_alpha, Graph* _graph)\n      : local_alpha(_local_alpha), graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"ResetGraph_\" +\n                     
      (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      ResetGraph_allNodes_cuda(alpha, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          ResetGraph{alpha, &_graph}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"ResetGraph\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    auto& sdata    = graph->getData(src);\n    sdata.value    = 0;\n    sdata.nout     = 0;\n    sdata.delta    = 0;\n    sdata.residual = local_alpha;\n  }\n};\n\nstruct InitializeGraph {\n  Graph* graph;\n\n  InitializeGraph(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    // init graph\n    ResetGraph::go(_graph);\n\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph_nodesWithEdges_cuda(cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      // doing a local do all because we are looping over edges\n      galois::do_all(\n          galois::iterate(nodesWithEdges), InitializeGraph{&_graph},\n          galois::steal(), galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()));\n    }\n\n    syncSubstrate\n        ->sync<writeDestination, readAny, Reduce_add_nout, Bitset_nout>(\n            \"InitializeGraph\");\n  }\n\n  // Calculate \"outgoing\" edges for destination nodes (note we are using\n  // the 
tranpose graph for pull algorithms)\n  void operator()(GNode src) const {\n    for (auto nbr : graph->edges(src)) {\n      GNode dst   = graph->getEdgeDst(nbr);\n      auto& ddata = graph->getData(dst);\n      galois::atomicAdd(ddata.nout, (uint32_t)1);\n      bitset_nout.set(dst);\n    }\n  }\n};\n\ntemplate <bool async>\nstruct PageRank_delta {\n  const float& local_alpha;\n  cll::opt<float>& local_tolerance;\n  Graph* graph;\n\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  DGTerminatorDetector& active_vertices;\n\n  PageRank_delta(const float& _local_alpha, cll::opt<float>& _local_tolerance,\n                 Graph* _graph, DGTerminatorDetector& _dga)\n      : local_alpha(_local_alpha), local_tolerance(_local_tolerance),\n        graph(_graph), active_vertices(_dga) {}\n\n  void static go(Graph& _graph, DGTerminatorDetector& dga) {\n    const auto& allNodes = _graph.allNodesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"PageRank_\" + (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      unsigned int __retval = 0;\n      PageRank_delta_allNodes_cuda(__retval, alpha, tolerance, cuda_ctx);\n      dga += __retval;\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          PageRank_delta{alpha, tolerance, &_graph, dga}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"PageRank_delta\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    auto& sdata = graph->getData(src);\n    sdata.delta = 0;\n\n    if (sdata.residual > 0) {\n      sdata.value += sdata.residual;\n      if 
(sdata.residual > this->local_tolerance) {\n        if (sdata.nout > 0) {\n          sdata.delta = sdata.residual * (1 - local_alpha) / sdata.nout;\n          active_vertices += 1;\n        }\n      }\n      sdata.residual = 0;\n    }\n  }\n};\n\n// TODO: GPU code operator does not match CPU's operator (cpu accumulates sum\n// and adds all at once, GPU adds each pulled value individually/atomically)\ntemplate <bool async>\nstruct PageRank {\n  Graph* graph;\n\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  PageRank(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    unsigned _num_iterations   = 0;\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n    DGTerminatorDetector dga;\n\n    // unsigned int reduced = 0;\n\n    do {\n      syncSubstrate->set_num_round(_num_iterations);\n      dga.reset();\n      PageRank_delta<async>::go(_graph, dga);\n      // reset residual on mirrors\n      syncSubstrate->reset_mirrorField<Reduce_add_residual>();\n\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(\"PageRank_\" +\n                             (syncSubstrate->get_run_identifier()));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        PageRank_nodesWithEdges_cuda(cuda_ctx);\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(nodesWithEdges), PageRank{&_graph}, galois::steal(),\n            galois::no_stats(),\n            galois::loopname(\n                syncSubstrate->get_run_identifier(\"PageRank\").c_str()));\n      }\n\n      syncSubstrate->sync<writeSource, readDestination, Reduce_add_residual,\n                          Bitset_residual, async>(\"PageRank\");\n\n      
galois::runtime::reportStat_Tsum(\n          REGION_NAME, \"NumWorkItems_\" + (syncSubstrate->get_run_identifier()),\n          (unsigned long)_graph.sizeEdges());\n\n      ++_num_iterations;\n    } while ((async || (_num_iterations < maxIterations)) &&\n             dga.reduce(syncSubstrate->get_run_identifier()));\n\n    galois::runtime::reportStat_Tmax(\n        REGION_NAME,\n        \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n        (unsigned long)_num_iterations);\n  }\n\n  // Pull deltas from neighbor nodes, then add to self-residual\n  void operator()(GNode src) const {\n    auto& sdata = graph->getData(src);\n\n    for (auto nbr : graph->edges(src)) {\n      GNode dst   = graph->getEdgeDst(nbr);\n      auto& ddata = graph->getData(dst);\n\n      if (ddata.delta > 0) {\n        galois::add(sdata.residual, ddata.delta);\n\n        bitset_residual.set(src);\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n// Gets various values from the pageranks values/residuals of the graph\nstruct PageRankSanity {\n  cll::opt<float>& local_tolerance;\n  Graph* graph;\n\n  galois::DGAccumulator<float>& DGAccumulator_sum;\n  galois::DGAccumulator<float>& DGAccumulator_sum_residual;\n  galois::DGAccumulator<uint64_t>& DGAccumulator_residual_over_tolerance;\n\n  galois::DGReduceMax<float>& max_value;\n  galois::DGReduceMin<float>& min_value;\n  galois::DGReduceMax<float>& max_residual;\n  galois::DGReduceMin<float>& min_residual;\n\n  PageRankSanity(\n      cll::opt<float>& _local_tolerance, Graph* _graph,\n      galois::DGAccumulator<float>& _DGAccumulator_sum,\n      galois::DGAccumulator<float>& _DGAccumulator_sum_residual,\n      galois::DGAccumulator<uint64_t>& _DGAccumulator_residual_over_tolerance,\n      galois::DGReduceMax<float>& _max_value,\n      
galois::DGReduceMin<float>& _min_value,\n      galois::DGReduceMax<float>& _max_residual,\n      galois::DGReduceMin<float>& _min_residual)\n      : local_tolerance(_local_tolerance), graph(_graph),\n        DGAccumulator_sum(_DGAccumulator_sum),\n        DGAccumulator_sum_residual(_DGAccumulator_sum_residual),\n        DGAccumulator_residual_over_tolerance(\n            _DGAccumulator_residual_over_tolerance),\n        max_value(_max_value), min_value(_min_value),\n        max_residual(_max_residual), min_residual(_min_residual) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<float>& DGA_sum,\n                 galois::DGAccumulator<float>& DGA_sum_residual,\n                 galois::DGAccumulator<uint64_t>& DGA_residual_over_tolerance,\n                 galois::DGReduceMax<float>& max_value,\n                 galois::DGReduceMin<float>& min_value,\n                 galois::DGReduceMax<float>& max_residual,\n                 galois::DGReduceMin<float>& min_residual) {\n    DGA_sum.reset();\n    DGA_sum_residual.reset();\n    max_value.reset();\n    max_residual.reset();\n    min_value.reset();\n    min_residual.reset();\n    DGA_residual_over_tolerance.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      float _max_value;\n      float _min_value;\n      float _sum_value;\n      float _sum_residual;\n      uint64_t num_residual_over_tolerance;\n      float _max_residual;\n      float _min_residual;\n      PageRankSanity_masterNodes_cuda(\n          num_residual_over_tolerance, _sum_value, _sum_residual, _max_residual,\n          _max_value, _min_residual, _min_value, tolerance, cuda_ctx);\n      DGA_sum += _sum_value;\n      DGA_sum_residual += _sum_residual;\n      DGA_residual_over_tolerance += num_residual_over_tolerance;\n      max_value.update(_max_value);\n      max_residual.update(_max_residual);\n      min_value.update(_min_value);\n      min_residual.update(_min_residual);\n#endif\n    } else {\n      
galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     PageRankSanity(tolerance, &_graph, DGA_sum,\n                                    DGA_sum_residual,\n                                    DGA_residual_over_tolerance, max_value,\n                                    min_value, max_residual, min_residual),\n                     galois::no_stats(), galois::loopname(\"PageRankSanity\"));\n    }\n\n    float max_rank          = max_value.reduce();\n    float min_rank          = min_value.reduce();\n    float rank_sum          = DGA_sum.reduce();\n    float residual_sum      = DGA_sum_residual.reduce();\n    uint64_t over_tolerance = DGA_residual_over_tolerance.reduce();\n    float max_res           = max_residual.reduce();\n    float min_res           = min_residual.reduce();\n\n    // Only node 0 will print data\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Max rank is \", max_rank, \"\\n\");\n      galois::gPrint(\"Min rank is \", min_rank, \"\\n\");\n      galois::gPrint(\"Rank sum is \", rank_sum, \"\\n\");\n      galois::gPrint(\"Residual sum is \", residual_sum, \"\\n\");\n      galois::gPrint(\"# nodes with residual over \", tolerance,\n                     \" (tolerance) is \", over_tolerance, \"\\n\");\n      galois::gPrint(\"Max residual is \", max_res, \"\\n\");\n      galois::gPrint(\"Min residual is \", min_res, \"\\n\");\n    }\n  }\n\n  /* Gets the max, min rank from all owned nodes and\n   * also the sum of ranks */\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n\n    max_value.update(sdata.value);\n    min_value.update(sdata.value);\n    max_residual.update(sdata.residual);\n    min_residual.update(sdata.residual);\n\n    DGAccumulator_sum += sdata.value;\n    DGAccumulator_sum_residual += sdata.residual;\n\n    if (sdata.residual > local_tolerance) {\n      
DGAccumulator_residual_over_tolerance += 1;\n    }\n  }\n};\n\nstd::vector<float> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<float> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).value);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<float> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<float> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_value_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<float> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<float> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"PageRank - Compiler Generated \"\n                                          \"Distributed Heterogeneous\";\nconstexpr static const char* const desc = \"PageRank Residual Pull version on \"\n                                          \"Distributed Galois.\";\nconstexpr static const char* const url = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n    std::ostringstream ss;\n    ss << tolerance;\n    galois::runtime::reportParam(REGION_NAME, \"Tolerance\", ss.str());\n  }\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  
std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, void, false>(&cuda_ctx);\n#else\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, void, false>();\n#endif\n\n  bitset_residual.resize(hg->size());\n  bitset_nout.resize(hg->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n\n  InitializeGraph::go(*hg);\n  galois::runtime::getHostBarrier().wait();\n\n  galois::DGAccumulator<float> DGA_sum;\n  galois::DGAccumulator<float> DGA_sum_residual;\n  galois::DGAccumulator<uint64_t> DGA_residual_over_tolerance;\n  galois::DGReduceMax<float> max_value;\n  galois::DGReduceMin<float> min_value;\n  galois::DGReduceMax<float> max_residual;\n  galois::DGReduceMin<float> min_residual;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] PageRank::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      PageRank<true>::go(*hg);\n    } else {\n      PageRank<false>::go(*hg);\n    }\n    StatTimer_main.stop();\n\n    // sanity check\n    PageRankSanity::go(*hg, DGA_sum, DGA_sum_residual,\n                       DGA_residual_over_tolerance, max_value, min_value,\n                       max_residual, min_residual);\n\n    if ((run + 1) != numRuns) {\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_residual_reset_cuda(cuda_ctx);\n        bitset_nout_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_residual.reset();\n        bitset_nout.reset();\n      }\n\n      syncSubstrate->set_num_run(run + 1);\n      InitializeGraph::go(*hg);\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<float> results = 
makeResults(hg);\n    auto globalIDs             = hg->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"pagerank\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_pull_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = true;\n#include \"pagerank_pull_cuda.cuh\"\nstatic const int __tb_PageRank = TB_SIZE;\nstatic const int __tb_InitializeGraph = TB_SIZE;\n__global__ void ResetGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, const float  local_alpha, float * p_delta, uint32_t * p_nout, float * p_residual, float * p_value)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_value[src]    = 0;\n      p_nout[src]     = 0;\n      p_delta[src]    = 0;\n      p_residual[src] = local_alpha;\n    }\n  }\n  // FP: \"10 -> 11;\n}\n__global__ void InitializeGraph_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_nout, DynamicBitset& bitset_nout, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  
__shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type nbr;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      nbr = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        dst = graph.getAbsDestination(nbr);\n        atomicTestAdd(&p_nout[dst], (uint32_t)1);\n        bitset_nout.set(dst);\n      }\n      current_work = current_work + nthreads;\n    }\n   
 __syncthreads();\n  }\n  // FP: \"44 -> 45;\n}\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_nout, DynamicBitset& bitset_nout, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_InitializeGraph;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"6 -> 7;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? 
true: false);\n    // FP: \"7 -> 8;\n    if (pop)\n    {\n    }\n    // FP: \"9 -> 10;\n    // FP: \"12 -> 13;\n    // FP: \"13 -> 14;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"14 -> 15;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"17 -> 18;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"18 -> 19;\n    __shared__ struct { ; } _np_closure [TB_SIZE];\n    // FP: \"19 -> 20;\n    // FP: \"20 -> 21;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"23 -> 24;\n    // FP: \"24 -> 25;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? _np.size : 0;\n    // FP: \"25 -> 26;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"26 -> 27;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"29 -> 30;\n    __syncthreads();\n    // FP: \"30 -> 31;\n    while (true)\n    {\n      // FP: \"31 -> 32;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"34 -> 35;\n      __syncthreads();\n      // FP: \"35 -> 36;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"36 -> 37;\n        __syncthreads();\n        // FP: \"37 -> 38;\n        break;\n      }\n      // FP: \"39 -> 40;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"42 -> 43;\n      __syncthreads();\n      // FP: \"43 -> 44;\n      int ns = nps.tb.start;\n   
   int ne = nps.tb.size;\n      // FP: \"44 -> 45;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"47 -> 48;\n      assert(nps.tb.src < __kernel_tb_size);\n      // FP: \"48 -> 49;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type nbr;\n        nbr = ns +_np_j;\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(nbr);\n          atomicTestAdd(&p_nout[dst], (uint32_t)1);\n          bitset_nout.set(dst);\n        }\n      }\n      // FP: \"56 -> 57;\n      __syncthreads();\n    }\n    // FP: \"58 -> 59;\n\n    // FP: \"59 -> 60;\n    {\n      const int warpid = threadIdx.x / 32;\n      // FP: \"60 -> 61;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"61 -> 62;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type nbr;\n          nbr = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            dst = graph.getAbsDestination(nbr);\n            atomicTestAdd(&p_nout[dst], (uint32_t)1);\n            bitset_nout.set(dst);\n          }\n        }\n      }\n      // FP: \"79 -> 80;\n      __syncthreads();\n      // FP: \"80 -> 81;\n    }\n\n    // FP: \"81 -> 82;\n    
__syncthreads();\n    // FP: \"82 -> 83;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"83 -> 84;\n    while (_np.work())\n    {\n      // FP: \"84 -> 85;\n      int _np_i =0;\n      // FP: \"85 -> 86;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"86 -> 87;\n      __syncthreads();\n      // FP: \"87 -> 88;\n\n      // FP: \"88 -> 89;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type nbr;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        nbr= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(nbr);\n          atomicTestAdd(&p_nout[dst], (uint32_t)1);\n          bitset_nout.set(dst);\n        }\n      }\n      // FP: \"97 -> 98;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"98 -> 99;\n      __syncthreads();\n    }\n    // FP: \"100 -> 101;\n    assert(threadIdx.x < __kernel_tb_size);\n  }\n  // FP: \"102 -> 103;\n}\n__global__ void PageRank_delta(CSRGraph graph, unsigned int __begin, unsigned int __end, const float  local_alpha, float local_tolerance, float * p_delta, uint32_t * p_nout, float * p_residual, float * p_value, HGAccumulator<unsigned int> active_vertices)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  active_vertices.thread_entry();\n  // FP: \"3 -> 4;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_delta[src] = 0;\n      if (p_residual[src] > 0)\n      {\n        p_value[src] += p_residual[src];\n        if (p_residual[src] > local_tolerance)\n        {\n          if (p_nout[src] > 0)\n          {\n            
p_delta[src] = p_residual[src] * (1 - local_alpha) / p_nout[src];\n            active_vertices.reduce( 1);\n          }\n        }\n        p_residual[src] = 0;\n      }\n    }\n  }\n  // FP: \"19 -> 20;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"20 -> 21;\n}\n__global__ void PageRank_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, float * p_delta, float * p_residual, DynamicBitset& bitset_residual, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n 
       block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type nbr;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      nbr = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        dst = graph.getAbsDestination(nbr);\n        if (p_delta[dst] > 0)\n        {\n          atomicTestAdd(&p_residual[src], p_delta[dst]);\n          bitset_residual.set(src);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"46 -> 47;\n}\n__global__ void PageRank(CSRGraph graph, unsigned int __begin, unsigned int __end, float * p_delta, float * p_residual, DynamicBitset& bitset_residual, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_PageRank;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int 
index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"6 -> 7;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? true: false);\n    // FP: \"7 -> 8;\n    if (pop)\n    {\n    }\n    // FP: \"9 -> 10;\n    // FP: \"12 -> 13;\n    // FP: \"13 -> 14;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"14 -> 15;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"17 -> 18;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"18 -> 19;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"19 -> 20;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"20 -> 21;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"23 -> 24;\n    // FP: \"24 -> 25;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"25 -> 26;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"26 -> 27;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"29 -> 30;\n    __syncthreads();\n    // FP: \"30 -> 31;\n    while (true)\n    {\n      // FP: \"31 -> 32;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"34 -> 35;\n      __syncthreads();\n      // FP: \"35 -> 36;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"36 -> 37;\n        __syncthreads();\n        // FP: \"37 -> 38;\n        break;\n      }\n      // FP: \"39 -> 40;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"42 -> 43;\n      __syncthreads();\n      // FP: \"43 -> 44;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"44 -> 45;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"47 -> 48;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"48 -> 49;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type nbr;\n        nbr = ns +_np_j;\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(nbr);\n          if (p_delta[dst] > 0)\n          {\n            atomicTestAdd(&p_residual[src], p_delta[dst]);\n            bitset_residual.set(src);\n          }\n        }\n      }\n      // FP: \"58 -> 59;\n      __syncthreads();\n    }\n    // FP: \"60 -> 61;\n\n    // FP: \"61 -> 62;\n    {\n      const int warpid = threadIdx.x / 32;\n      // FP: \"62 -> 63;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"63 -> 64;\n      while (__any_sync(0xffffffff, _np.size 
>= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type nbr;\n          nbr = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            dst = graph.getAbsDestination(nbr);\n            if (p_delta[dst] > 0)\n            {\n              atomicTestAdd(&p_residual[src], p_delta[dst]);\n              bitset_residual.set(src);\n            }\n          }\n        }\n      }\n      // FP: \"83 -> 84;\n      __syncthreads();\n      // FP: \"84 -> 85;\n    }\n\n    // FP: \"85 -> 86;\n    __syncthreads();\n    // FP: \"86 -> 87;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"87 -> 88;\n    while (_np.work())\n    {\n      // FP: \"88 -> 89;\n      int _np_i =0;\n      // FP: \"89 -> 90;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"90 -> 91;\n      __syncthreads();\n      // FP: \"91 -> 92;\n\n      // FP: \"92 -> 93;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type nbr;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        nbr= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          dst = 
graph.getAbsDestination(nbr);\n          if (p_delta[dst] > 0)\n          {\n            atomicTestAdd(&p_residual[src], p_delta[dst]);\n            bitset_residual.set(src);\n          }\n        }\n      }\n      // FP: \"103 -> 104;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"104 -> 105;\n      __syncthreads();\n    }\n    // FP: \"106 -> 107;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"108 -> 109;\n}\n__global__ void PageRankSanity(CSRGraph graph, unsigned int __begin, unsigned int __end, float local_tolerance, float * p_residual, float * p_value, HGAccumulator<uint64_t> DGAccumulator_residual_over_tolerance, HGAccumulator<float> DGAccumulator_sum, HGAccumulator<float> DGAccumulator_sum_residual, HGReduceMax<float> max_residual, HGReduceMax<float> max_value, HGReduceMin<float> min_residual, HGReduceMin<float> min_value)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage DGAccumulator_residual_over_tolerance_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage DGAccumulator_sum_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage DGAccumulator_sum_residual_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage max_residual_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage max_value_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage min_residual_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage min_value_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  DGAccumulator_residual_over_tolerance.thread_entry();\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  DGAccumulator_sum.thread_entry();\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  DGAccumulator_sum_residual.thread_entry();\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  max_residual.thread_entry();\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 
11;\n  max_value.thread_entry();\n  // FP: \"11 -> 12;\n  // FP: \"12 -> 13;\n  min_residual.thread_entry();\n  // FP: \"13 -> 14;\n  // FP: \"14 -> 15;\n  min_value.thread_entry();\n  // FP: \"15 -> 16;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      max_value.reduce(p_value[src]);\n      min_value.reduce(p_value[src]);\n      max_residual.reduce(p_residual[src]);\n      min_residual.reduce(p_residual[src]);\n      DGAccumulator_sum.reduce( p_value[src]);\n      DGAccumulator_sum_residual.reduce( p_residual[src]);\n      if (p_residual[src] > local_tolerance)\n      {\n        DGAccumulator_residual_over_tolerance.reduce( 1);\n      }\n    }\n  }\n  // FP: \"29 -> 30;\n  DGAccumulator_residual_over_tolerance.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_residual_over_tolerance_ts);\n  // FP: \"30 -> 31;\n  DGAccumulator_sum.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_ts);\n  // FP: \"31 -> 32;\n  DGAccumulator_sum_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_residual_ts);\n  // FP: \"32 -> 33;\n  max_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(max_residual_ts);\n  // FP: \"33 -> 34;\n  max_value.thread_exit<cub::BlockReduce<float, TB_SIZE> >(max_value_ts);\n  // FP: \"34 -> 35;\n  min_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(min_residual_ts);\n  // FP: \"35 -> 36;\n  min_value.thread_exit<cub::BlockReduce<float, TB_SIZE> >(min_value_ts);\n  // FP: \"36 -> 37;\n}\nvoid ResetGraph_cuda(unsigned int  __begin, unsigned int  __end, const float & local_alpha, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  ResetGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, local_alpha, ctx->delta.data.gpu_wr_ptr(), ctx->nout.data.gpu_wr_ptr(), 
ctx->residual.data.gpu_wr_ptr(), ctx->value.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid ResetGraph_allNodes_cuda(const float & local_alpha, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ResetGraph_cuda(0, ctx->gg.nnodes, local_alpha, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ResetGraph_masterNodes_cuda(const float & local_alpha, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ResetGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_alpha, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ResetGraph_nodesWithEdges_cuda(const float & local_alpha, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ResetGraph_cuda(0, ctx->numNodesWithEdges, local_alpha, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  t_work.init_thread_work(ctx->gg.nnodes);\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, __tb_InitializeGraph>>>(ctx->gg, __begin, __end, ctx->nout.data.gpu_wr_ptr(), *(ctx->nout.is_updated.gpu_rd_ptr()), t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      InitializeGraph_TB_LB <<<blocks, __tb_InitializeGraph>>>(ctx->gg, __begin, __end, ctx->nout.data.gpu_wr_ptr(), *(ctx->nout.is_updated.gpu_rd_ptr()), t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, ctx);\n  // 
FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_delta_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, const float & local_alpha, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  PageRank_delta <<<blocks, threads>>>(ctx->gg, __begin, __end, local_alpha, local_tolerance, ctx->delta.data.gpu_wr_ptr(), ctx->nout.data.gpu_wr_ptr(), ctx->residual.data.gpu_wr_ptr(), ctx->value.data.gpu_wr_ptr(), _active_vertices);\n  cudaDeviceSynchronize();\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid PageRank_delta_allNodes_cuda(unsigned int & active_vertices, const float & local_alpha, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_delta_cuda(0, ctx->gg.nnodes, active_vertices, local_alpha, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_delta_masterNodes_cuda(unsigned int & active_vertices, const float & local_alpha, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_delta_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, local_alpha, 
local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_delta_nodesWithEdges_cuda(unsigned int & active_vertices, const float & local_alpha, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_delta_cuda(0, ctx->numNodesWithEdges, active_vertices, local_alpha, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  PageRank <<<blocks, __tb_PageRank>>>(ctx->gg, __begin, __end, ctx->delta.data.gpu_wr_ptr(), ctx->residual.data.gpu_wr_ptr(), *(ctx->residual.is_updated.gpu_rd_ptr()), t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      PageRank_TB_LB <<<blocks, __tb_PageRank>>>(ctx->gg, __begin, __end, ctx->delta.data.gpu_wr_ptr(), ctx->residual.data.gpu_wr_ptr(), *(ctx->residual.is_updated.gpu_rd_ptr()), t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid PageRank_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRankSanity_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & 
DGAccumulator_residual_over_tolerance, float & DGAccumulator_sum, float & DGAccumulator_sum_residual, float & max_residual, float & max_value, float & min_residual, float & min_value, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint64_t> _DGAccumulator_residual_over_tolerance;\n  HGAccumulator<float> _DGAccumulator_sum;\n  HGAccumulator<float> _DGAccumulator_sum_residual;\n  HGReduceMax<float> _max_residual;\n  HGReduceMax<float> _max_value;\n  HGReduceMin<float> _min_residual;\n  HGReduceMin<float> _min_value;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> DGAccumulator_residual_over_toleranceval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(DGAccumulator_residual_over_toleranceval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _DGAccumulator_residual_over_tolerance.rv = DGAccumulator_residual_over_toleranceval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  Shared<float> DGAccumulator_sumval  = Shared<float>(1);\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  *(DGAccumulator_sumval.cpu_wr_ptr()) = 0;\n  // FP: \"11 -> 12;\n  _DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr();\n  // FP: \"12 -> 13;\n  Shared<float> DGAccumulator_sum_residualval  = Shared<float>(1);\n  // FP: \"13 -> 14;\n  // FP: \"14 -> 15;\n  *(DGAccumulator_sum_residualval.cpu_wr_ptr()) = 0;\n  // FP: \"15 -> 16;\n  _DGAccumulator_sum_residual.rv = DGAccumulator_sum_residualval.gpu_wr_ptr();\n  // FP: \"16 -> 17;\n  Shared<float> max_residualval  = Shared<float>(1);\n  // FP: \"17 -> 18;\n  // FP: \"18 -> 19;\n  *(max_residualval.cpu_wr_ptr()) = 0;\n  // FP: \"19 -> 20;\n  _max_residual.rv = max_residualval.gpu_wr_ptr();\n  // FP: \"20 -> 21;\n  Shared<float> max_valueval  = Shared<float>(1);\n  // FP: \"21 -> 22;\n  // FP: \"22 -> 23;\n  *(max_valueval.cpu_wr_ptr()) = 0;\n  // FP: \"23 -> 24;\n  _max_value.rv = 
max_valueval.gpu_wr_ptr();\n  // FP: \"24 -> 25;\n  Shared<float> min_residualval  = Shared<float>(1);\n  // FP: \"25 -> 26;\n  // FP: \"26 -> 27;\n  *(min_residualval.cpu_wr_ptr()) = 1073741823;\n  // FP: \"27 -> 28;\n  _min_residual.rv = min_residualval.gpu_wr_ptr();\n  // FP: \"28 -> 29;\n  Shared<float> min_valueval  = Shared<float>(1);\n  // FP: \"29 -> 30;\n  // FP: \"30 -> 31;\n  *(min_valueval.cpu_wr_ptr()) = 1073741823;\n  // FP: \"31 -> 32;\n  _min_value.rv = min_valueval.gpu_wr_ptr();\n  // FP: \"32 -> 33;\n  PageRankSanity <<<blocks, threads>>>(ctx->gg, __begin, __end, local_tolerance, ctx->residual.data.gpu_wr_ptr(), ctx->value.data.gpu_wr_ptr(), _DGAccumulator_residual_over_tolerance, _DGAccumulator_sum, _DGAccumulator_sum_residual, _max_residual, _max_value, _min_residual, _min_value);\n  cudaDeviceSynchronize();\n  // FP: \"33 -> 34;\n  check_cuda_kernel;\n  // FP: \"34 -> 35;\n  DGAccumulator_residual_over_tolerance = *(DGAccumulator_residual_over_toleranceval.cpu_rd_ptr());\n  // FP: \"35 -> 36;\n  DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr());\n  // FP: \"36 -> 37;\n  DGAccumulator_sum_residual = *(DGAccumulator_sum_residualval.cpu_rd_ptr());\n  // FP: \"37 -> 38;\n  max_residual = *(max_residualval.cpu_rd_ptr());\n  // FP: \"38 -> 39;\n  max_value = *(max_valueval.cpu_rd_ptr());\n  // FP: \"39 -> 40;\n  min_residual = *(min_residualval.cpu_rd_ptr());\n  // FP: \"40 -> 41;\n  min_value = *(min_valueval.cpu_rd_ptr());\n  // FP: \"41 -> 42;\n}\nvoid PageRankSanity_allNodes_cuda(uint64_t & DGAccumulator_residual_over_tolerance, float & DGAccumulator_sum, float & DGAccumulator_sum_residual, float & max_residual, float & max_value, float & min_residual, float & min_value, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRankSanity_cuda(0, ctx->gg.nnodes, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, 
ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRankSanity_masterNodes_cuda(uint64_t & DGAccumulator_residual_over_tolerance, float & DGAccumulator_sum, float & DGAccumulator_sum_residual, float & max_residual, float & max_value, float & min_residual, float & min_value, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRankSanity_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRankSanity_nodesWithEdges_cuda(uint64_t & DGAccumulator_residual_over_tolerance, float & DGAccumulator_sum, float & DGAccumulator_sum_residual, float & max_residual, float & max_value, float & min_residual, float & min_value, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRankSanity_cuda(0, ctx->numNodesWithEdges, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_pull_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"pagerank_pull_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<float> delta;\n\tstruct CUDA_Context_Field<uint32_t> nout;\n\tstruct CUDA_Context_Field<float> residual;\n\tstruct CUDA_Context_Field<float> value;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->delta, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->nout, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->residual, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->value, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->delta, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->nout, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->residual, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->value, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->delta.data.zero_gpu();\n\tctx->nout.data.zero_gpu();\n\tctx->residual.data.zero_gpu();\n\tctx->value.data.zero_gpu();\n}\n\nvoid get_bitset_delta_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->delta.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_delta_reset_cuda(struct CUDA_Context* ctx) 
{\n\tctx->delta.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_delta_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->delta, begin, end);\n}\n\nfloat get_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tfloat *delta = ctx->delta.data.cpu_rd_ptr();\n\treturn delta[LID];\n}\n\nvoid set_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *delta = ctx->delta.data.cpu_wr_ptr();\n\tdelta[LID] = v;\n}\n\nvoid add_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *delta = ctx->delta.data.cpu_wr_ptr();\n\tdelta[LID] += v;\n}\n\nbool min_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *delta = ctx->delta.data.cpu_wr_ptr();\n\tif (delta[LID] > v){\n\t\tdelta[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->delta, from_id, v);\n}\n\nvoid batch_get_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->delta, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->delta, from_id, v);\n}\n\nvoid batch_get_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->delta, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->delta, from_id, v, i);\n}\n\nvoid batch_get_reset_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, 
size_t* v_size, DataCommMode* data_mode, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->delta, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, setOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_set_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, setOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, addOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_add_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, addOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, minOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_min_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, minOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_delta_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, float v) {\n\treset_data_field<float>(&ctx->delta, begin, end, v);\n}\n\nvoid get_bitset_nout_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->nout.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_nout_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->nout.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_nout_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) 
{\n\treset_bitset_field(&ctx->nout, begin, end);\n}\n\nuint32_t get_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *nout = ctx->nout.data.cpu_rd_ptr();\n\treturn nout[LID];\n}\n\nvoid set_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *nout = ctx->nout.data.cpu_wr_ptr();\n\tnout[LID] = v;\n}\n\nvoid add_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *nout = ctx->nout.data.cpu_wr_ptr();\n\tnout[LID] += v;\n}\n\nbool min_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *nout = ctx->nout.data.cpu_wr_ptr();\n\tif (nout[LID] > v){\n\t\tnout[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->nout, from_id, v);\n}\n\nvoid batch_get_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->nout, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->nout, from_id, v);\n}\n\nvoid batch_get_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->nout, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->nout, from_id, v, i);\n}\n\nvoid batch_get_reset_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->nout, 
from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_set_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_add_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_min_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_nout_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->nout, begin, end, v);\n}\n\nvoid get_bitset_residual_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->residual.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_residual_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->residual.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_residual_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->residual, begin, end);\n}\n\nfloat get_node_residual_cuda(struct CUDA_Context* ctx, 
unsigned LID) {\n\tfloat *residual = ctx->residual.data.cpu_rd_ptr();\n\treturn residual[LID];\n}\n\nvoid set_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *residual = ctx->residual.data.cpu_wr_ptr();\n\tresidual[LID] = v;\n}\n\nvoid add_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *residual = ctx->residual.data.cpu_wr_ptr();\n\tresidual[LID] += v;\n}\n\nbool min_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *residual = ctx->residual.data.cpu_wr_ptr();\n\tif (residual[LID] > v){\n\t\tresidual[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->residual, from_id, v);\n}\n\nvoid batch_get_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->residual, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->residual, from_id, v);\n}\n\nvoid batch_get_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->residual, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->residual, from_id, v, i);\n}\n\nvoid batch_get_reset_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->residual, from_id, v, v_size, data_mode, 
i);\n}\n\nvoid batch_set_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, setOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_set_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, setOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, addOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_add_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, addOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, minOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_min_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, minOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_residual_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, float v) {\n\treset_data_field<float>(&ctx->residual, begin, end, v);\n}\n\nvoid get_bitset_value_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->value.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_value_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->value.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_value_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->value, begin, end);\n}\n\nfloat get_node_value_cuda(struct CUDA_Context* ctx, unsigned LID) 
{\n\tfloat *value = ctx->value.data.cpu_rd_ptr();\n\treturn value[LID];\n}\n\nvoid set_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *value = ctx->value.data.cpu_wr_ptr();\n\tvalue[LID] = v;\n}\n\nvoid add_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *value = ctx->value.data.cpu_wr_ptr();\n\tvalue[LID] += v;\n}\n\nbool min_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *value = ctx->value.data.cpu_wr_ptr();\n\tif (value[LID] > v){\n\t\tvalue[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->value, from_id, v);\n}\n\nvoid batch_get_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->value, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->value, from_id, v);\n}\n\nvoid batch_get_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->value, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->value, from_id, v, i);\n}\n\nvoid batch_get_reset_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->value, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, 
DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, setOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_set_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, setOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, addOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_add_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, addOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, minOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_min_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, minOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_value_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, float v) {\n\treset_data_field<float>(&ctx->value, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_pull_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_delta_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_delta_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_delta_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                             size_t end);\nfloat get_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid add_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nbool min_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid batch_get_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v);\nvoid batch_get_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, size_t* v_size,\n                               DataCommMode* data_mode);\nvoid batch_get_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_reset_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, float i);\nvoid batch_get_reset_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                                     DataCommMode* data_mode, float i);\nvoid batch_set_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_set_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               
uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_delta_cuda(struct CUDA_Context* ctx, size_t begin,\n                                 size_t end, float v);\n\nvoid get_bitset_nout_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_nout_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_nout_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end);\nuint32_t get_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid add_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nbool min_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid batch_get_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v);\nvoid batch_get_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, size_t* v_size,\n                              DataCommMode* data_mode);\nvoid batch_get_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v);\nvoid batch_get_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                               
      DataCommMode* data_mode);\nvoid batch_get_reset_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, uint32_t i);\nvoid batch_get_reset_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, size_t* v_size,\n                                    DataCommMode* data_mode, uint32_t i);\nvoid batch_set_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_set_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_add_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_min_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_nout_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end, uint32_t v);\n\nvoid get_bitset_residual_cuda(struct CUDA_Context* ctx,\n                              uint64_t* bitset_compute);\nvoid bitset_residual_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_residual_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end);\nfloat get_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid add_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nbool min_node_residual_cuda(struct 
CUDA_Context* ctx, unsigned LID, float v);\nvoid batch_get_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v);\nvoid batch_get_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, size_t* v_size,\n                                  DataCommMode* data_mode);\nvoid batch_get_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         size_t* v_size,\n                                         DataCommMode* data_mode);\nvoid batch_get_reset_node_residual_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v, float i);\nvoid batch_get_reset_node_residual_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        size_t* v_size, DataCommMode* data_mode,\n                                        float i);\nvoid batch_set_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_set_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_add_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                
                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_min_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_residual_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end, float v);\n\nvoid get_bitset_value_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_value_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_value_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                             size_t end);\nfloat get_node_value_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid add_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nbool min_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid batch_get_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v);\nvoid batch_get_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, size_t* v_size,\n                               DataCommMode* data_mode);\nvoid batch_get_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_reset_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, float i);\nvoid batch_get_reset_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                                     DataCommMode* data_mode, float i);\nvoid 
batch_set_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_set_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_value_cuda(struct CUDA_Context* ctx, size_t begin,\n                                 size_t end, float v);\n\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid PageRank_cuda(unsigned int __begin, unsigned int __end,\n                   struct CUDA_Context* ctx);\nvoid PageRankSanity_cuda(unsigned int __begin, unsigned int __end,\n                         uint64_t& DGAccumulator_residual_over_tolerance,\n                         float& DGAccumulator_sum,\n                         float& DGAccumulator_sum_residual, float& max_residual,\n                         float& max_value, float& min_residual,\n                         float& min_value, float local_tolerance,\n     
                    struct CUDA_Context* ctx);\nvoid PageRankSanity_allNodes_cuda(\n    uint64_t& DGAccumulator_residual_over_tolerance, float& DGAccumulator_sum,\n    float& DGAccumulator_sum_residual, float& max_residual, float& max_value,\n    float& min_residual, float& min_value, float local_tolerance,\n    struct CUDA_Context* ctx);\nvoid PageRankSanity_masterNodes_cuda(\n    uint64_t& DGAccumulator_residual_over_tolerance, float& DGAccumulator_sum,\n    float& DGAccumulator_sum_residual, float& max_residual, float& max_value,\n    float& min_residual, float& min_value, float local_tolerance,\n    struct CUDA_Context* ctx);\nvoid PageRankSanity_nodesWithEdges_cuda(\n    uint64_t& DGAccumulator_residual_over_tolerance, float& DGAccumulator_sum,\n    float& DGAccumulator_sum_residual, float& max_residual, float& max_value,\n    float& min_residual, float& min_value, float local_tolerance,\n    struct CUDA_Context* ctx);\nvoid PageRank_allNodes_cuda(struct CUDA_Context* ctx);\nvoid PageRank_delta_cuda(unsigned int __begin, unsigned int __end,\n                         unsigned int& active_vertices,\n                         const float& local_alpha, float local_tolerance,\n                         struct CUDA_Context* ctx);\nvoid PageRank_delta_allNodes_cuda(unsigned int& active_vertices,\n                                  const float& local_alpha,\n                                  float local_tolerance,\n                                  struct CUDA_Context* ctx);\nvoid PageRank_delta_masterNodes_cuda(unsigned int& active_vertices,\n                                     const float& local_alpha,\n                                     float local_tolerance,\n                                     struct CUDA_Context* ctx);\nvoid PageRank_delta_nodesWithEdges_cuda(unsigned int& active_vertices,\n                                        const float& local_alpha,\n                                        float local_tolerance,\n                                        
struct CUDA_Context* ctx);\nvoid PageRank_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid PageRank_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid ResetGraph_cuda(unsigned int __begin, unsigned int __end,\n                     const float& local_alpha, struct CUDA_Context* ctx);\nvoid ResetGraph_allNodes_cuda(const float& local_alpha,\n                              struct CUDA_Context* ctx);\nvoid ResetGraph_masterNodes_cuda(const float& local_alpha,\n                                 struct CUDA_Context* ctx);\nvoid ResetGraph_nodesWithEdges_cuda(const float& local_alpha,\n                                    struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_pull_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"pagerank_pull_cuda.cuh\", system = False)], parse = False),\nKernel(\"ResetGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const float ', 'local_alpha'), ('float *', 'p_delta'), ('uint32_t *', 'p_nout'), ('float *', 'p_residual'), ('float *', 'p_value')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_value[src]    = 0\"]),\nCBlock([\"p_nout[src]     = 0\"]),\nCBlock([\"p_delta[src]    = 0\"]),\nCBlock([\"p_residual[src] = local_alpha\"]),\n]),\n]),\n]),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_nout'), ('DynamicBitset&', 'bitset_nout')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"nbr\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(nbr)\"]),\nCBlock([\"atomicTestAdd(&p_nout[dst], (uint32_t)1)\"]),\nCBlock([\"bitset_nout.set(dst)\"]),\n]),\n),\n]),\n]),\nKernel(\"PageRank_delta\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const float ', 'local_alpha'), ('float', 'local_tolerance'), ('float *', 'p_delta'), ('uint32_t *', 'p_nout'), ('float *', 'p_residual'), ('float *', 'p_value'), ('HGAccumulator<unsigned int>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", 
\"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_delta[src] = 0\"]),\nIf(\"p_residual[src] > 0\",\n[\nCBlock([\"p_value[src] += p_residual[src]\"]),\nIf(\"p_residual[src] > local_tolerance\",\n[\nIf(\"p_nout[src] > 0\",\n[\nCBlock([\"p_delta[src] = p_residual[src] * (1 - local_alpha) / p_nout[src]\"]),\nCBlock([\"active_vertices.reduce( 1)\"]),\n]),\n]),\nCBlock([\"p_residual[src] = 0\"]),\n]),\n]),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"PageRank\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('float *', 'p_delta'), ('float *', 'p_residual'), ('DynamicBitset&', 'bitset_residual')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"nbr\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(nbr)\"]),\nIf(\"p_delta[dst] > 0\",\n[\nCBlock([\"atomicTestAdd(&p_residual[src], p_delta[dst])\"]),\nCBlock([\"bitset_residual.set(src)\"]),\n]),\n]),\n),\n]),\n]),\nKernel(\"PageRankSanity\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('float', 'local_tolerance'), ('float *', 'p_residual'), ('float *', 'p_value'), ('HGAccumulator<uint64_t>', 'DGAccumulator_residual_over_tolerance'), ('HGAccumulator<float>', 'DGAccumulator_sum'), ('HGAccumulator<float>', 'DGAccumulator_sum_residual'), ('HGReduceMax<float>', 'max_residual'), ('HGReduceMax<float>', 'max_value'), ('HGReduceMin<float>', 'min_residual'), ('HGReduceMin<float>', 'min_value')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"DGAccumulator_residual_over_tolerance_ts\", 
\"\")]),\nCBlock([\"DGAccumulator_residual_over_tolerance.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"DGAccumulator_sum_ts\", \"\")]),\nCBlock([\"DGAccumulator_sum.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"DGAccumulator_sum_residual_ts\", \"\")]),\nCBlock([\"DGAccumulator_sum_residual.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"max_residual_ts\", \"\")]),\nCBlock([\"max_residual.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"max_value_ts\", \"\")]),\nCBlock([\"max_value.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"min_residual_ts\", \"\")]),\nCBlock([\"min_residual.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"min_value_ts\", \"\")]),\nCBlock([\"min_value.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"max_value.reduce(p_value[src])\"]),\nCBlock([\"min_value.reduce(p_value[src])\"]),\nCBlock([\"max_residual.reduce(p_residual[src])\"]),\nCBlock([\"min_residual.reduce(p_residual[src])\"]),\nCBlock([\"DGAccumulator_sum.reduce( p_value[src])\"]),\nCBlock([\"DGAccumulator_sum.reduce( p_residual[src])\"]),\nIf(\"p_residual[src] > local_tolerance\",\n[\nCBlock([\"DGAccumulator_residual_over_tolerance.reduce( 1)\"]),\n]),\n]),\n]),\nCBlock([\"DGAccumulator_residual_over_tolerance.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_residual_over_tolerance_ts)\"], parse = False),\nCBlock([\"DGAccumulator_sum.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_ts)\"], parse = False),\nCBlock([\"DGAccumulator_sum_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_residual_ts)\"], parse = 
False),\nCBlock([\"max_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(max_residual_ts)\"], parse = False),\nCBlock([\"max_value.thread_exit<cub::BlockReduce<float, TB_SIZE> >(max_value_ts)\"], parse = False),\nCBlock([\"min_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(min_residual_ts)\"], parse = False),\nCBlock([\"min_value.thread_exit<cub::BlockReduce<float, TB_SIZE> >(min_value_ts)\"], parse = False),\n]),\nKernel(\"ResetGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('const float &', 'local_alpha'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"ResetGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_alpha\", \"ctx->delta.data.gpu_wr_ptr()\", \"ctx->nout.data.gpu_wr_ptr()\", \"ctx->residual.data.gpu_wr_ptr()\", \"ctx->value.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"ResetGraph_allNodes_cuda\", [('const float &', 'local_alpha'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ResetGraph_cuda(0, ctx->gg.nnodes, local_alpha, ctx)\"]),\n], host = True),\nKernel(\"ResetGraph_masterNodes_cuda\", [('const float &', 'local_alpha'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ResetGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_alpha, ctx)\"]),\n], host = True),\nKernel(\"ResetGraph_nodesWithEdges_cuda\", [('const float &', 'local_alpha'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ResetGraph_cuda(0, ctx->numNodesWithEdges, local_alpha, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", 
\"ctx->nout.data.gpu_wr_ptr()\", \"*(ctx->nout.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"PageRank_delta_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'active_vertices'), ('const float &', 'local_alpha'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"PageRank_delta\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_alpha\", \"local_tolerance\", \"ctx->delta.data.gpu_wr_ptr()\", \"ctx->nout.data.gpu_wr_ptr()\", \"ctx->residual.data.gpu_wr_ptr()\", \"ctx->value.data.gpu_wr_ptr()\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"PageRank_delta_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('const float &', 'local_alpha'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_delta_cuda(0, ctx->gg.nnodes, active_vertices, local_alpha, 
local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRank_delta_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('const float &', 'local_alpha'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_delta_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, local_alpha, local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRank_delta_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('const float &', 'local_alpha'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_delta_cuda(0, ctx->numNodesWithEdges, active_vertices, local_alpha, local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRank_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"PageRank\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->delta.data.gpu_wr_ptr()\", \"ctx->residual.data.gpu_wr_ptr()\", \"*(ctx->residual.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"PageRank_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"PageRank_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"PageRank_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"PageRankSanity_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint64_t &', 'DGAccumulator_residual_over_tolerance'), ('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_sum_residual'), ('float &', 'max_residual'), ('float &', 'max_value'), ('float &', 
'min_residual'), ('float &', 'min_value'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"DGAccumulator_residual_over_toleranceval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_DGAccumulator_residual_over_tolerance\", \"\")]),\nCBlock([\"*(DGAccumulator_residual_over_toleranceval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_residual_over_tolerance.rv = DGAccumulator_residual_over_toleranceval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"DGAccumulator_sumval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGAccumulator<float>\", \"_DGAccumulator_sum\", \"\")]),\nCBlock([\"*(DGAccumulator_sumval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"DGAccumulator_sum_residualval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGAccumulator<float>\", \"_DGAccumulator_sum_residual\", \"\")]),\nCBlock([\"*(DGAccumulator_sum_residualval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_sum_residual.rv = DGAccumulator_sum_residualval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"max_residualval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMax<float>\", \"_max_residual\", \"\")]),\nCBlock([\"*(max_residualval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_max_residual.rv = max_residualval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"max_valueval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMax<float>\", \"_max_value\", \"\")]),\nCBlock([\"*(max_valueval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_max_value.rv = max_valueval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"min_residualval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMin<float>\", \"_min_residual\", \"\")]),\nCBlock([\"*(min_residualval.cpu_wr_ptr()) = 1073741823\"]),\nCBlock([\"_min_residual.rv = 
min_residualval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"min_valueval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMin<float>\", \"_min_value\", \"\")]),\nCBlock([\"*(min_valueval.cpu_wr_ptr()) = 1073741823\"]),\nCBlock([\"_min_value.rv = min_valueval.gpu_wr_ptr()\"]),\nInvoke(\"PageRankSanity\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_tolerance\", \"ctx->residual.data.gpu_wr_ptr()\", \"ctx->value.data.gpu_wr_ptr()\", \"_DGAccumulator_residual_over_tolerance\", \"_DGAccumulator_sum\", \"_DGAccumulator_sum_residual\", \"_max_residual\", \"_max_value\", \"_min_residual\", \"_min_value\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"DGAccumulator_residual_over_tolerance = *(DGAccumulator_residual_over_toleranceval.cpu_rd_ptr())\"]),\nCBlock([\"DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr())\"]),\nCBlock([\"DGAccumulator_sum_residual = *(DGAccumulator_sum_residualval.cpu_rd_ptr())\"]),\nCBlock([\"max_residual = *(max_residualval.cpu_rd_ptr())\"]),\nCBlock([\"max_value = *(max_valueval.cpu_rd_ptr())\"]),\nCBlock([\"min_residual = *(min_residualval.cpu_rd_ptr())\"]),\nCBlock([\"min_value = *(min_valueval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"PageRankSanity_allNodes_cuda\", [('uint64_t &', 'DGAccumulator_residual_over_tolerance'), ('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_sum_residual'), ('float &', 'max_residual'), ('float &', 'max_value'), ('float &', 'min_residual'), ('float &', 'min_value'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRankSanity_cuda(0, ctx->gg.nnodes, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRankSanity_masterNodes_cuda\", [('uint64_t &', 'DGAccumulator_residual_over_tolerance'), ('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_sum_residual'), ('float &', 'max_residual'), ('float 
&', 'max_value'), ('float &', 'min_residual'), ('float &', 'min_value'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRankSanity_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRankSanity_nodesWithEdges_cuda\", [('uint64_t &', 'DGAccumulator_residual_over_tolerance'), ('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_sum_residual'), ('float &', 'max_residual'), ('float &', 'max_value'), ('float &', 'min_residual'), ('float &', 'min_value'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRankSanity_cuda(0, ctx->numNodesWithEdges, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_pull_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(nout, unsigned int);\nGALOIS_SYNC_STRUCTURE_BITSET(nout);\n\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(residual, float);\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(residual, float);\nGALOIS_SYNC_STRUCTURE_BITSET(residual);\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_push.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/gstl.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <algorithm>\n#include <iostream>\n#include <limits>\n#include <vector>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"pagerank_push_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"PageRank\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\nnamespace cll = llvm::cl;\n\nstatic cll::opt<float> 
tolerance(\"tolerance\",\n                                 cll::desc(\"tolerance for residual\"),\n                                 cll::init(0.000001));\nstatic cll::opt<unsigned int>\n    maxIterations(\"maxIterations\",\n                  cll::desc(\"Maximum iterations: Default 1000\"),\n                  cll::init(1000));\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other initialization */\n/******************************************************************************/\n\nstatic const float alpha = (1.0 - 0.85);\nstruct NodeData {\n  float value;\n  std::atomic<uint32_t> nout;\n  float delta;\n  std::atomic<float> residual;\n};\n\ngalois::DynamicBitSet bitset_residual;\ngalois::DynamicBitSet bitset_nout;\n\ntypedef galois::graphs::DistGraph<NodeData, void> Graph;\ntypedef typename Graph::GraphNode GNode;\ntypedef GNode WorkItem;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n#include \"pagerank_push_sync.hh\"\n\n/******************************************************************************/\n/* Algorithm structures */\n/******************************************************************************/\n\n// Reset all fields of all nodes to 0\nstruct ResetGraph {\n  Graph* graph;\n\n  ResetGraph(Graph* _graph) : graph(_graph) {}\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"ResetGraph_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), 
REGION_NAME);\n      StatTimer_cuda.start();\n      ResetGraph_allNodes_cuda(cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          ResetGraph{&_graph}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"ResetGraph\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n    sdata.value     = 0;\n    sdata.nout      = 0;\n    sdata.residual  = 0;\n    sdata.delta     = 0;\n  }\n};\n\n// Initialize residual at nodes with outgoing edges + find nout for\n// nodes with outgoing edges\nstruct InitializeGraph {\n  const float& local_alpha;\n  Graph* graph;\n\n  InitializeGraph(const float& _alpha, Graph* _graph)\n      : local_alpha(_alpha), graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    // first initialize all fields to 0 via ResetGraph (can't assume all zero\n    // at start)\n    ResetGraph::go(_graph);\n\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph_nodesWithEdges_cuda(alpha, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      // regular do all without stealing; just initialization of nodes with\n      // outgoing edges\n      galois::do_all(\n          galois::iterate(nodesWithEdges.begin(), nodesWithEdges.end()),\n          InitializeGraph{alpha, &_graph}, galois::steal(), galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()));\n    }\n\n    
syncSubstrate->sync<writeSource, readSource, Reduce_add_nout, Bitset_nout>(\n        \"InitializeGraphNout\");\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n    sdata.residual  = local_alpha;\n    uint32_t num_edges =\n        std::distance(graph->edge_begin(src), graph->edge_end(src));\n    galois::atomicAdd(sdata.nout, num_edges);\n    bitset_nout.set(src);\n  }\n};\n\nstruct PageRank_delta {\n  const float& local_alpha;\n  cll::opt<float>& local_tolerance;\n  Graph* graph;\n\n  PageRank_delta(const float& _local_alpha, cll::opt<float>& _local_tolerance,\n                 Graph* _graph)\n      : local_alpha(_local_alpha), local_tolerance(_local_tolerance),\n        graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"PageRank_\" + (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      PageRank_delta_nodesWithEdges_cuda(alpha, tolerance, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(nodesWithEdges.begin(), nodesWithEdges.end()),\n          PageRank_delta{alpha, tolerance, &_graph}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"PageRank_delta\").c_str()));\n    }\n  }\n\n  void operator()(WorkItem src) const {\n    NodeData& sdata = graph->getData(src);\n\n    if (sdata.residual > 0) {\n      float residual_old = sdata.residual;\n      sdata.residual     = 0;\n      sdata.value += residual_old;\n      if (residual_old > this->local_tolerance) {\n        if (sdata.nout > 0) {\n          sdata.delta = residual_old * (1 - local_alpha) / sdata.nout;\n        }\n      }\n    }\n  }\n};\n\ntemplate <bool 
async>\nstruct PageRank {\n  Graph* graph;\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  DGTerminatorDetector& active_vertices;\n\n  PageRank(Graph* _g, DGTerminatorDetector& _dga)\n      : graph(_g), active_vertices(_dga) {}\n\n  void static go(Graph& _graph) {\n    unsigned _num_iterations   = 0;\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n    DGTerminatorDetector dga;\n\n    do {\n      syncSubstrate->set_num_round(_num_iterations);\n      PageRank_delta::go(_graph);\n      dga.reset();\n      // reset residual on mirrors\n      syncSubstrate->reset_mirrorField<Reduce_add_residual>();\n\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(\"PageRank_\" +\n                             (syncSubstrate->get_run_identifier()));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        unsigned int __retval = 0;\n        PageRank_nodesWithEdges_cuda(__retval, cuda_ctx);\n        dga += __retval;\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(nodesWithEdges), PageRank{&_graph, dga},\n            galois::no_stats(), galois::steal(),\n            galois::loopname(\n                syncSubstrate->get_run_identifier(\"PageRank\").c_str()));\n      }\n\n      syncSubstrate->sync<writeDestination, readSource, Reduce_add_residual,\n                          Bitset_residual, async>(\"PageRank\");\n\n      galois::runtime::reportStat_Tsum(\n          REGION_NAME, \"NumWorkItems_\" + (syncSubstrate->get_run_identifier()),\n          (unsigned long)dga.read_local());\n\n      ++_num_iterations;\n    } while ((async || (_num_iterations < maxIterations)) &&\n             
dga.reduce(syncSubstrate->get_run_identifier()));\n\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::runtime::reportStat_Single(\n          REGION_NAME,\n          \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n          (unsigned long)_num_iterations);\n    }\n  }\n\n  void operator()(WorkItem src) const {\n    NodeData& sdata = graph->getData(src);\n    if (sdata.delta > 0) {\n      float _delta = sdata.delta;\n      sdata.delta  = 0;\n\n      active_vertices += 1; // this should be moved to Pagerank_delta operator\n\n      for (auto nbr : graph->edges(src)) {\n        GNode dst       = graph->getEdgeDst(nbr);\n        NodeData& ddata = graph->getData(dst);\n\n        galois::atomicAdd(ddata.residual, _delta);\n\n        bitset_residual.set(dst);\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n// Gets various values from the pageranks values/residuals of the graph\nstruct PageRankSanity {\n  cll::opt<float>& local_tolerance;\n  Graph* graph;\n\n  galois::DGAccumulator<float>& DGAccumulator_sum;\n  galois::DGAccumulator<float>& DGAccumulator_sum_residual;\n  galois::DGAccumulator<uint64_t>& DGAccumulator_residual_over_tolerance;\n\n  galois::DGReduceMax<float>& max_value;\n  galois::DGReduceMin<float>& min_value;\n  galois::DGReduceMax<float>& max_residual;\n  galois::DGReduceMin<float>& min_residual;\n\n  PageRankSanity(\n      cll::opt<float>& _local_tolerance, Graph* _graph,\n      galois::DGAccumulator<float>& _DGAccumulator_sum,\n      galois::DGAccumulator<float>& _DGAccumulator_sum_residual,\n      galois::DGAccumulator<uint64_t>& _DGAccumulator_residual_over_tolerance,\n      galois::DGReduceMax<float>& _max_value,\n      galois::DGReduceMin<float>& _min_value,\n      galois::DGReduceMax<float>& _max_residual,\n      
galois::DGReduceMin<float>& _min_residual)\n      : local_tolerance(_local_tolerance), graph(_graph),\n        DGAccumulator_sum(_DGAccumulator_sum),\n        DGAccumulator_sum_residual(_DGAccumulator_sum_residual),\n        DGAccumulator_residual_over_tolerance(\n            _DGAccumulator_residual_over_tolerance),\n        max_value(_max_value), min_value(_min_value),\n        max_residual(_max_residual), min_residual(_min_residual) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<float>& DGA_sum,\n                 galois::DGAccumulator<float>& DGA_sum_residual,\n                 galois::DGAccumulator<uint64_t>& DGA_residual_over_tolerance,\n                 galois::DGReduceMax<float>& max_value,\n                 galois::DGReduceMin<float>& min_value,\n                 galois::DGReduceMax<float>& max_residual,\n                 galois::DGReduceMin<float>& min_residual) {\n    DGA_sum.reset();\n    DGA_sum_residual.reset();\n    max_value.reset();\n    max_residual.reset();\n    min_value.reset();\n    min_residual.reset();\n    DGA_residual_over_tolerance.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      float _max_value;\n      float _min_value;\n      float _sum_value;\n      float _sum_residual;\n      uint64_t num_residual_over_tolerance;\n      float _max_residual;\n      float _min_residual;\n      PageRankSanity_masterNodes_cuda(\n          num_residual_over_tolerance, _sum_value, _sum_residual, _max_residual,\n          _max_value, _min_residual, _min_value, tolerance, cuda_ctx);\n      DGA_sum += _sum_value;\n      DGA_sum_residual += _sum_residual;\n      DGA_residual_over_tolerance += num_residual_over_tolerance;\n      max_value.update(_max_value);\n      max_residual.update(_max_residual);\n      min_value.update(_min_value);\n      min_residual.update(_min_residual);\n#else\n      abort();\n#endif\n    } else {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                         
            _graph.masterNodesRange().end()),\n                     PageRankSanity(tolerance, &_graph, DGA_sum,\n                                    DGA_sum_residual,\n                                    DGA_residual_over_tolerance, max_value,\n                                    min_value, max_residual, min_residual),\n                     galois::no_stats(), galois::loopname(\"PageRankSanity\"));\n    }\n\n    float max_rank          = max_value.reduce();\n    float min_rank          = min_value.reduce();\n    float rank_sum          = DGA_sum.reduce();\n    float residual_sum      = DGA_sum_residual.reduce();\n    uint64_t over_tolerance = DGA_residual_over_tolerance.reduce();\n    float max_res           = max_residual.reduce();\n    float min_res           = min_residual.reduce();\n\n    // Only node 0 will print data\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Max rank is \", max_rank, \"\\n\");\n      galois::gPrint(\"Min rank is \", min_rank, \"\\n\");\n      galois::gPrint(\"Rank sum is \", rank_sum, \"\\n\");\n      galois::gPrint(\"Residual sum is \", residual_sum, \"\\n\");\n      galois::gPrint(\"# nodes with residual over \", tolerance,\n                     \" (tolerance) is \", over_tolerance, \"\\n\");\n      galois::gPrint(\"Max residual is \", max_res, \"\\n\");\n      galois::gPrint(\"Min residual is \", min_res, \"\\n\");\n    }\n  }\n\n  /* Gets the max, min rank from all owned nodes and\n   * also the sum of ranks */\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n\n    max_value.update(sdata.value);\n    min_value.update(sdata.value);\n    max_residual.update(sdata.residual);\n    min_residual.update(sdata.residual);\n\n    DGAccumulator_sum += sdata.value;\n    DGAccumulator_sum_residual += sdata.residual;\n\n    if (sdata.residual > local_tolerance) {\n      DGAccumulator_residual_over_tolerance += 1;\n    }\n  
}\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<float> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<float> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).value);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<float> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<float> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_value_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<float> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<float> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"PageRank - Compiler Generated \"\n                                          \"Distributed Heterogeneous\";\nconstexpr static const char* const desc = \"Residual PageRank on Distributed \"\n                                          \"Galois.\";\nconstexpr static const char* const url = 0;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n    std::ostringstream ss;\n    ss << tolerance;\n    galois::runtime::reportParam(REGION_NAME, \"Tolerance\", ss.str());\n  }\n  
galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, void>(&cuda_ctx);\n#else\n  std::tie(hg, syncSubstrate) = distGraphInitialization<NodeData, void>();\n#endif\n\n  bitset_residual.resize(hg->size());\n  bitset_nout.resize(hg->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n\n  InitializeGraph::go((*hg));\n  galois::runtime::getHostBarrier().wait();\n\n  galois::DGAccumulator<float> DGA_sum;\n  galois::DGAccumulator<float> DGA_sum_residual;\n  galois::DGAccumulator<uint64_t> DGA_residual_over_tolerance;\n  galois::DGReduceMax<float> max_value;\n  galois::DGReduceMin<float> min_value;\n  galois::DGReduceMax<float> max_residual;\n  galois::DGReduceMin<float> min_residual;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] PageRank::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      PageRank<true>::go(*hg);\n    } else {\n      PageRank<false>::go(*hg);\n    }\n    StatTimer_main.stop();\n\n    // sanity check\n    PageRankSanity::go(*hg, DGA_sum, DGA_sum_residual,\n                       DGA_residual_over_tolerance, max_value, min_value,\n                       max_residual, min_residual);\n\n    if ((run + 1) != numRuns) {\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_residual_reset_cuda(cuda_ctx);\n        bitset_nout_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_residual.reset();\n        bitset_nout.reset();\n      }\n\n      (*syncSubstrate).set_num_run(run + 1);\n      InitializeGraph::go(*hg);\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  
StatTimer_total.stop();\n\n  if (output) {\n    std::vector<float> results = makeResults(hg);\n    auto globalIDs             = hg->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"pagerank\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_push_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=False $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nbool enable_lb = false;\n#include \"pagerank_push_cuda.cuh\"\nstatic const int __tb_PageRank = TB_SIZE;\n__global__ void ResetGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, float * p_delta, uint32_t * p_nout, float * p_residual, float * p_value)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_value[src]     = 0;\n      p_nout[src]      = 0;\n      p_residual[src]  = 0;\n      p_delta[src]     = 0;\n    }\n  }\n  // FP: \"10 -> 11;\n}\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, const float  local_alpha, uint32_t * p_nout, float * p_residual, DynamicBitset& bitset_nout)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  uint32_t num_edges;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < 
__end;\n    if (pop)\n    {\n      p_residual[src]  = local_alpha;\n      num_edges = graph.getOutDegree(src);\n      atomicTestAdd(&p_nout[src], num_edges);\n      bitset_nout.set(src);\n    }\n  }\n  // FP: \"11 -> 12;\n}\n__global__ void PageRank_delta(CSRGraph graph, unsigned int __begin, unsigned int __end, const float  local_alpha, float local_tolerance, float * p_delta, uint32_t * p_nout, float * p_residual, float * p_value)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  float residual_old;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_residual[src] > 0)\n      {\n        residual_old = p_residual[src];\n        p_residual[src]     = 0;\n        p_value[src] += residual_old;\n        if (residual_old > local_tolerance)\n        {\n          if (p_nout[src] > 0)\n          {\n            p_delta[src] = residual_old * (1 - local_alpha) / p_nout[src];\n          }\n        }\n      }\n    }\n  }\n  // FP: \"17 -> 18;\n}\n__global__ void PageRank(CSRGraph graph, unsigned int __begin, unsigned int __end, float * p_delta, float * p_residual, DynamicBitset& bitset_residual, HGAccumulator<unsigned int> active_vertices, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_PageRank;\n  float _delta;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, 
BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  active_vertices.thread_entry();\n  // FP: \"8 -> 9;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"9 -> 10;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? true: false);\n    // FP: \"10 -> 11;\n    if (pop)\n    {\n      if (p_delta[src] > 0)\n      {\n        _delta = p_delta[src];\n        p_delta[src]  = 0;\n        active_vertices.reduce( 1);\n      }\n      else\n      {\n        pop = false;\n      }\n    }\n    // FP: \"17 -> 18;\n    // FP: \"20 -> 21;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"21 -> 22;\n    __shared__ struct { float _delta; } _np_closure [TB_SIZE];\n    // FP: \"22 -> 23;\n    _np_closure[threadIdx.x]._delta = _delta;\n    // FP: \"23 -> 24;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"26 -> 27;\n    // FP: \"27 -> 28;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"28 -> 29;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"29 -> 30;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"32 -> 33;\n    __syncthreads();\n    // FP: \"33 -> 34;\n    while (true)\n    {\n      // FP: \"34 -> 35;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"37 -> 38;\n      __syncthreads();\n      // FP: \"38 -> 39;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"39 -> 40;\n        __syncthreads();\n        // FP: \"40 -> 41;\n        break;\n      }\n      // FP: \"42 -> 43;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"45 -> 46;\n      __syncthreads();\n      // FP: \"46 -> 47;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"47 -> 48;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"50 -> 51;\n      assert(nps.tb.src < __kernel_tb_size);\n      _delta = _np_closure[nps.tb.src]._delta;\n      // FP: \"51 -> 52;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type nbr;\n        nbr = ns +_np_j;\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(nbr);\n          atomicTestAdd(&p_residual[dst], _delta);\n          bitset_residual.set(dst);\n        }\n      }\n      // FP: \"59 -> 60;\n      __syncthreads();\n    }\n    // FP: \"61 -> 62;\n\n    // FP: \"62 -> 63;\n    {\n      const int warpid = threadIdx.x / 32;\n      // FP: \"63 -> 64;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"64 -> 65;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n  
      if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        _delta = _np_closure[nps.warp.src[warpid]]._delta;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type nbr;\n          nbr = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            dst = graph.getAbsDestination(nbr);\n            atomicTestAdd(&p_residual[dst], _delta);\n            bitset_residual.set(dst);\n          }\n        }\n      }\n      // FP: \"82 -> 83;\n      __syncthreads();\n      // FP: \"83 -> 84;\n    }\n\n    // FP: \"84 -> 85;\n    __syncthreads();\n    // FP: \"85 -> 86;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"86 -> 87;\n    while (_np.work())\n    {\n      // FP: \"87 -> 88;\n      int _np_i =0;\n      // FP: \"88 -> 89;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"89 -> 90;\n      __syncthreads();\n      // FP: \"90 -> 91;\n\n      // FP: \"91 -> 92;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type nbr;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        _delta = _np_closure[nps.fg.src[_np_i]]._delta;\n        nbr= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(nbr);\n          atomicTestAdd(&p_residual[dst], _delta);\n          bitset_residual.set(dst);\n        }\n      }\n    
  // FP: \"100 -> 101;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"101 -> 102;\n      __syncthreads();\n    }\n    // FP: \"103 -> 104;\n    assert(threadIdx.x < __kernel_tb_size);\n    _delta = _np_closure[threadIdx.x]._delta;\n  }\n  // FP: \"106 -> 107;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"107 -> 108;\n}\n__global__ void PageRankSanity(CSRGraph graph, unsigned int __begin, unsigned int __end, float local_tolerance, float * p_residual, float * p_value, HGAccumulator<uint64_t> DGAccumulator_residual_over_tolerance, HGAccumulator<float> DGAccumulator_sum, HGAccumulator<float> DGAccumulator_sum_residual, HGReduceMax<float> max_residual, HGReduceMax<float> max_value, HGReduceMin<float> min_residual, HGReduceMin<float> min_value)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage DGAccumulator_residual_over_tolerance_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage DGAccumulator_sum_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage DGAccumulator_sum_residual_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage max_residual_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage max_value_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage min_residual_ts;\n  __shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage min_value_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  DGAccumulator_residual_over_tolerance.thread_entry();\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  DGAccumulator_sum.thread_entry();\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  DGAccumulator_sum_residual.thread_entry();\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  max_residual.thread_entry();\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  max_value.thread_entry();\n  // FP: \"11 -> 12;\n  // FP: \"12 -> 13;\n  
min_residual.thread_entry();\n  // FP: \"13 -> 14;\n  // FP: \"14 -> 15;\n  min_value.thread_entry();\n  // FP: \"15 -> 16;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      max_value.reduce(p_value[src]);\n      min_value.reduce(p_value[src]);\n      max_residual.reduce(p_residual[src]);\n      min_residual.reduce(p_residual[src]);\n      DGAccumulator_sum.reduce( p_value[src]);\n      DGAccumulator_sum.reduce( p_residual[src]);\n      if (p_residual[src] > local_tolerance)\n      {\n        DGAccumulator_residual_over_tolerance.reduce( 1);\n      }\n    }\n  }\n  // FP: \"29 -> 30;\n  DGAccumulator_residual_over_tolerance.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_residual_over_tolerance_ts);\n  // FP: \"30 -> 31;\n  DGAccumulator_sum.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_ts);\n  // FP: \"31 -> 32;\n  DGAccumulator_sum_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_residual_ts);\n  // FP: \"32 -> 33;\n  max_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(max_residual_ts);\n  // FP: \"33 -> 34;\n  max_value.thread_exit<cub::BlockReduce<float, TB_SIZE> >(max_value_ts);\n  // FP: \"34 -> 35;\n  min_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(min_residual_ts);\n  // FP: \"35 -> 36;\n  min_value.thread_exit<cub::BlockReduce<float, TB_SIZE> >(min_value_ts);\n  // FP: \"36 -> 37;\n}\nvoid ResetGraph_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  ResetGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, ctx->delta.data.gpu_wr_ptr(), ctx->nout.data.gpu_wr_ptr(), ctx->residual.data.gpu_wr_ptr(), ctx->value.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  
// FP: \"6 -> 7;\n}\nvoid ResetGraph_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ResetGraph_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ResetGraph_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ResetGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid ResetGraph_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  ResetGraph_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, const float & local_alpha, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, local_alpha, ctx->nout.data.gpu_wr_ptr(), ctx->residual.data.gpu_wr_ptr(), *(ctx->nout.is_updated.gpu_rd_ptr()));\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(const float & local_alpha, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, local_alpha, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(const float & local_alpha, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_alpha, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_nodesWithEdges_cuda(const float & local_alpha, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_alpha, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_delta_cuda(unsigned int  __begin, unsigned int  __end, const float & local_alpha, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 
-> 5;\n  PageRank_delta <<<blocks, threads>>>(ctx->gg, __begin, __end, local_alpha, local_tolerance, ctx->delta.data.gpu_wr_ptr(), ctx->nout.data.gpu_wr_ptr(), ctx->residual.data.gpu_wr_ptr(), ctx->value.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid PageRank_delta_allNodes_cuda(const float & local_alpha, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_delta_cuda(0, ctx->gg.nnodes, local_alpha, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_delta_masterNodes_cuda(const float & local_alpha, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_delta_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_alpha, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_delta_nodesWithEdges_cuda(const float & local_alpha, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_delta_cuda(0, ctx->numNodesWithEdges, local_alpha, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  PageRank <<<blocks, __tb_PageRank>>>(ctx->gg, __begin, __end, ctx->delta.data.gpu_wr_ptr(), ctx->residual.data.gpu_wr_ptr(), *(ctx->residual.is_updated.gpu_rd_ptr()), _active_vertices, enable_lb);\n  cudaDeviceSynchronize();\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = 
*(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid PageRank_allNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_cuda(0, ctx->gg.nnodes, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_masterNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRank_nodesWithEdges_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRank_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRankSanity_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & DGAccumulator_residual_over_tolerance, float & DGAccumulator_sum, float & DGAccumulator_sum_residual, float & max_residual, float & max_value, float & min_residual, float & min_value, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint64_t> _DGAccumulator_residual_over_tolerance;\n  HGAccumulator<float> _DGAccumulator_sum;\n  HGAccumulator<float> _DGAccumulator_sum_residual;\n  HGReduceMax<float> _max_residual;\n  HGReduceMax<float> _max_value;\n  HGReduceMin<float> _min_residual;\n  HGReduceMin<float> _min_value;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> DGAccumulator_residual_over_toleranceval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(DGAccumulator_residual_over_toleranceval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _DGAccumulator_residual_over_tolerance.rv = DGAccumulator_residual_over_toleranceval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  Shared<float> DGAccumulator_sumval  = Shared<float>(1);\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  *(DGAccumulator_sumval.cpu_wr_ptr()) = 0;\n  // FP: \"11 -> 12;\n  _DGAccumulator_sum.rv = 
DGAccumulator_sumval.gpu_wr_ptr();\n  // FP: \"12 -> 13;\n  Shared<float> DGAccumulator_sum_residualval  = Shared<float>(1);\n  // FP: \"13 -> 14;\n  // FP: \"14 -> 15;\n  *(DGAccumulator_sum_residualval.cpu_wr_ptr()) = 0;\n  // FP: \"15 -> 16;\n  _DGAccumulator_sum_residual.rv = DGAccumulator_sum_residualval.gpu_wr_ptr();\n  // FP: \"16 -> 17;\n  Shared<float> max_residualval  = Shared<float>(1);\n  // FP: \"17 -> 18;\n  // FP: \"18 -> 19;\n  *(max_residualval.cpu_wr_ptr()) = 0;\n  // FP: \"19 -> 20;\n  _max_residual.rv = max_residualval.gpu_wr_ptr();\n  // FP: \"20 -> 21;\n  Shared<float> max_valueval  = Shared<float>(1);\n  // FP: \"21 -> 22;\n  // FP: \"22 -> 23;\n  *(max_valueval.cpu_wr_ptr()) = 0;\n  // FP: \"23 -> 24;\n  _max_value.rv = max_valueval.gpu_wr_ptr();\n  // FP: \"24 -> 25;\n  Shared<float> min_residualval  = Shared<float>(1);\n  // FP: \"25 -> 26;\n  // FP: \"26 -> 27;\n  *(min_residualval.cpu_wr_ptr()) = 1073741823;\n  // FP: \"27 -> 28;\n  _min_residual.rv = min_residualval.gpu_wr_ptr();\n  // FP: \"28 -> 29;\n  Shared<float> min_valueval  = Shared<float>(1);\n  // FP: \"29 -> 30;\n  // FP: \"30 -> 31;\n  *(min_valueval.cpu_wr_ptr()) = 1073741823;\n  // FP: \"31 -> 32;\n  _min_value.rv = min_valueval.gpu_wr_ptr();\n  // FP: \"32 -> 33;\n  PageRankSanity <<<blocks, threads>>>(ctx->gg, __begin, __end, local_tolerance, ctx->residual.data.gpu_wr_ptr(), ctx->value.data.gpu_wr_ptr(), _DGAccumulator_residual_over_tolerance, _DGAccumulator_sum, _DGAccumulator_sum_residual, _max_residual, _max_value, _min_residual, _min_value);\n  cudaDeviceSynchronize();\n  // FP: \"33 -> 34;\n  check_cuda_kernel;\n  // FP: \"34 -> 35;\n  DGAccumulator_residual_over_tolerance = *(DGAccumulator_residual_over_toleranceval.cpu_rd_ptr());\n  // FP: \"35 -> 36;\n  DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr());\n  // FP: \"36 -> 37;\n  DGAccumulator_sum_residual = *(DGAccumulator_sum_residualval.cpu_rd_ptr());\n  // FP: \"37 -> 38;\n  max_residual = 
*(max_residualval.cpu_rd_ptr());\n  // FP: \"38 -> 39;\n  max_value = *(max_valueval.cpu_rd_ptr());\n  // FP: \"39 -> 40;\n  min_residual = *(min_residualval.cpu_rd_ptr());\n  // FP: \"40 -> 41;\n  min_value = *(min_valueval.cpu_rd_ptr());\n  // FP: \"41 -> 42;\n}\nvoid PageRankSanity_allNodes_cuda(uint64_t & DGAccumulator_residual_over_tolerance, float & DGAccumulator_sum, float & DGAccumulator_sum_residual, float & max_residual, float & max_value, float & min_residual, float & min_value, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRankSanity_cuda(0, ctx->gg.nnodes, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRankSanity_masterNodes_cuda(uint64_t & DGAccumulator_residual_over_tolerance, float & DGAccumulator_sum, float & DGAccumulator_sum_residual, float & max_residual, float & max_value, float & min_residual, float & min_value, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRankSanity_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}\nvoid PageRankSanity_nodesWithEdges_cuda(uint64_t & DGAccumulator_residual_over_tolerance, float & DGAccumulator_sum, float & DGAccumulator_sum_residual, float & max_residual, float & max_value, float & min_residual, float & min_value, float local_tolerance, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  PageRankSanity_cuda(0, ctx->numNodesWithEdges, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx);\n  // FP: \"2 -> 3;\n}"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_push_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"pagerank_push_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<float> delta;\n\tstruct CUDA_Context_Field<uint32_t> nout;\n\tstruct CUDA_Context_Field<float> residual;\n\tstruct CUDA_Context_Field<float> value;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->delta, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->nout, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->residual, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->value, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->delta, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->nout, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->residual, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->value, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->delta.data.zero_gpu();\n\tctx->nout.data.zero_gpu();\n\tctx->residual.data.zero_gpu();\n\tctx->value.data.zero_gpu();\n}\n\nvoid get_bitset_delta_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->delta.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_delta_reset_cuda(struct CUDA_Context* ctx) 
{\n\tctx->delta.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_delta_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->delta, begin, end);\n}\n\nfloat get_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tfloat *delta = ctx->delta.data.cpu_rd_ptr();\n\treturn delta[LID];\n}\n\nvoid set_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *delta = ctx->delta.data.cpu_wr_ptr();\n\tdelta[LID] = v;\n}\n\nvoid add_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *delta = ctx->delta.data.cpu_wr_ptr();\n\tdelta[LID] += v;\n}\n\nbool min_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *delta = ctx->delta.data.cpu_wr_ptr();\n\tif (delta[LID] > v){\n\t\tdelta[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->delta, from_id, v);\n}\n\nvoid batch_get_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->delta, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->delta, from_id, v);\n}\n\nvoid batch_get_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->delta, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->delta, from_id, v, i);\n}\n\nvoid batch_get_reset_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, 
size_t* v_size, DataCommMode* data_mode, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->delta, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, setOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_set_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, setOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, addOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_add_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, addOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, minOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_min_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, minOp>(ctx, &ctx->delta, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_delta_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, float v) {\n\treset_data_field<float>(&ctx->delta, begin, end, v);\n}\n\nvoid get_bitset_nout_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->nout.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_nout_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->nout.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_nout_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) 
{\n\treset_bitset_field(&ctx->nout, begin, end);\n}\n\nuint32_t get_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *nout = ctx->nout.data.cpu_rd_ptr();\n\treturn nout[LID];\n}\n\nvoid set_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *nout = ctx->nout.data.cpu_wr_ptr();\n\tnout[LID] = v;\n}\n\nvoid add_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *nout = ctx->nout.data.cpu_wr_ptr();\n\tnout[LID] += v;\n}\n\nbool min_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *nout = ctx->nout.data.cpu_wr_ptr();\n\tif (nout[LID] > v){\n\t\tnout[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->nout, from_id, v);\n}\n\nvoid batch_get_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->nout, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->nout, from_id, v);\n}\n\nvoid batch_get_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->nout, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->nout, from_id, v, i);\n}\n\nvoid batch_get_reset_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->nout, 
from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_set_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_add_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_min_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->nout, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_nout_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->nout, begin, end, v);\n}\n\nvoid get_bitset_residual_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->residual.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_residual_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->residual.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_residual_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->residual, begin, end);\n}\n\nfloat get_node_residual_cuda(struct CUDA_Context* ctx, 
unsigned LID) {\n\tfloat *residual = ctx->residual.data.cpu_rd_ptr();\n\treturn residual[LID];\n}\n\nvoid set_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *residual = ctx->residual.data.cpu_wr_ptr();\n\tresidual[LID] = v;\n}\n\nvoid add_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *residual = ctx->residual.data.cpu_wr_ptr();\n\tresidual[LID] += v;\n}\n\nbool min_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *residual = ctx->residual.data.cpu_wr_ptr();\n\tif (residual[LID] > v){\n\t\tresidual[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->residual, from_id, v);\n}\n\nvoid batch_get_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->residual, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->residual, from_id, v);\n}\n\nvoid batch_get_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->residual, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->residual, from_id, v, i);\n}\n\nvoid batch_get_reset_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->residual, from_id, v, v_size, data_mode, 
i);\n}\n\nvoid batch_set_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, setOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_set_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, setOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, addOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_add_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, addOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, minOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_min_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, minOp>(ctx, &ctx->residual, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_residual_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, float v) {\n\treset_data_field<float>(&ctx->residual, begin, end, v);\n}\n\nvoid get_bitset_value_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->value.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_value_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->value.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_value_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->value, begin, end);\n}\n\nfloat get_node_value_cuda(struct CUDA_Context* ctx, unsigned LID) 
{\n\tfloat *value = ctx->value.data.cpu_rd_ptr();\n\treturn value[LID];\n}\n\nvoid set_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *value = ctx->value.data.cpu_wr_ptr();\n\tvalue[LID] = v;\n}\n\nvoid add_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *value = ctx->value.data.cpu_wr_ptr();\n\tvalue[LID] += v;\n}\n\nbool min_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v) {\n\tfloat *value = ctx->value.data.cpu_wr_ptr();\n\tif (value[LID] > v){\n\t\tvalue[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->value, from_id, v);\n}\n\nvoid batch_get_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMaster, false>(ctx, &ctx->value, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->value, from_id, v);\n}\n\nvoid batch_get_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<float, sharedMirror, false>(ctx, &ctx->value, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->value, from_id, v, i);\n}\n\nvoid batch_get_reset_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, float i) {\n\tbatch_get_shared_field<float, sharedMirror, true>(ctx, &ctx->value, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, 
DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, setOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_set_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, setOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, addOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_add_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, addOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMirror, minOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_min_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<float, sharedMaster, minOp>(ctx, &ctx->value, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_value_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, float v) {\n\treset_data_field<float>(&ctx->value, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_push_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_delta_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_delta_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_delta_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                             size_t end);\nfloat get_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid add_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nbool min_node_delta_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid batch_get_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v);\nvoid batch_get_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, size_t* v_size,\n                               DataCommMode* data_mode);\nvoid batch_get_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_reset_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, float i);\nvoid batch_get_reset_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                                     DataCommMode* data_mode, float i);\nvoid batch_set_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_set_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               
uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_delta_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_node_delta_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_delta_cuda(struct CUDA_Context* ctx, size_t begin,\n                                 size_t end, float v);\n\nvoid get_bitset_nout_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_nout_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_nout_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end);\nuint32_t get_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid add_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nbool min_node_nout_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid batch_get_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v);\nvoid batch_get_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, size_t* v_size,\n                              DataCommMode* data_mode);\nvoid batch_get_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v);\nvoid batch_get_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                               
      DataCommMode* data_mode);\nvoid batch_get_reset_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, uint32_t i);\nvoid batch_get_reset_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                    uint8_t* v, size_t* v_size,\n                                    DataCommMode* data_mode, uint32_t i);\nvoid batch_set_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_set_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_add_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, DataCommMode data_mode);\nvoid batch_min_node_nout_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                              uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_nout_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end, uint32_t v);\n\nvoid get_bitset_residual_cuda(struct CUDA_Context* ctx,\n                              uint64_t* bitset_compute);\nvoid bitset_residual_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_residual_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end);\nfloat get_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid add_node_residual_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nbool min_node_residual_cuda(struct 
CUDA_Context* ctx, unsigned LID, float v);\nvoid batch_get_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v);\nvoid batch_get_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, size_t* v_size,\n                                  DataCommMode* data_mode);\nvoid batch_get_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         size_t* v_size,\n                                         DataCommMode* data_mode);\nvoid batch_get_reset_node_residual_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v, float i);\nvoid batch_get_reset_node_residual_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        size_t* v_size, DataCommMode* data_mode,\n                                        float i);\nvoid batch_set_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_set_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_add_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_residual_cuda(struct CUDA_Context* ctx,\n                
                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_min_node_residual_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_residual_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end, float v);\n\nvoid get_bitset_value_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute);\nvoid bitset_value_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_value_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                             size_t end);\nfloat get_node_value_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid add_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nbool min_node_value_cuda(struct CUDA_Context* ctx, unsigned LID, float v);\nvoid batch_get_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v);\nvoid batch_get_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, size_t* v_size,\n                               DataCommMode* data_mode);\nvoid batch_get_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_reset_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, float i);\nvoid batch_get_reset_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                     uint8_t* v, size_t* v_size,\n                                     DataCommMode* data_mode, float i);\nvoid 
batch_set_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_set_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_value_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_node_value_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                               uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_value_cuda(struct CUDA_Context* ctx, size_t begin,\n                                 size_t end, float v);\n\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          const float& local_alpha, struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(const float& local_alpha,\n                                   struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(const float& local_alpha,\n                                      struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(const float& local_alpha,\n                                         struct CUDA_Context* ctx);\nvoid PageRank_cuda(unsigned int __begin, unsigned int __end,\n                   unsigned int& active_vertices, struct CUDA_Context* ctx);\nvoid PageRankSanity_cuda(unsigned int __begin, unsigned int __end,\n                         uint64_t& DGAccumulator_residual_over_tolerance,\n                   
      float& DGAccumulator_sum,\n                         float& DGAccumulator_sum_residual, float& max_residual,\n                         float& max_value, float& min_residual,\n                         float& min_value, float local_tolerance,\n                         struct CUDA_Context* ctx);\nvoid PageRankSanity_allNodes_cuda(\n    uint64_t& DGAccumulator_residual_over_tolerance, float& DGAccumulator_sum,\n    float& DGAccumulator_sum_residual, float& max_residual, float& max_value,\n    float& min_residual, float& min_value, float local_tolerance,\n    struct CUDA_Context* ctx);\nvoid PageRankSanity_masterNodes_cuda(\n    uint64_t& DGAccumulator_residual_over_tolerance, float& DGAccumulator_sum,\n    float& DGAccumulator_sum_residual, float& max_residual, float& max_value,\n    float& min_residual, float& min_value, float local_tolerance,\n    struct CUDA_Context* ctx);\nvoid PageRankSanity_nodesWithEdges_cuda(\n    uint64_t& DGAccumulator_residual_over_tolerance, float& DGAccumulator_sum,\n    float& DGAccumulator_sum_residual, float& max_residual, float& max_value,\n    float& min_residual, float& min_value, float local_tolerance,\n    struct CUDA_Context* ctx);\nvoid PageRank_allNodes_cuda(unsigned int& active_vertices,\n                            struct CUDA_Context* ctx);\nvoid PageRank_delta_cuda(unsigned int __begin, unsigned int __end,\n                         const float& local_alpha, float local_tolerance,\n                         struct CUDA_Context* ctx);\nvoid PageRank_delta_allNodes_cuda(const float& local_alpha,\n                                  float local_tolerance,\n                                  struct CUDA_Context* ctx);\nvoid PageRank_delta_masterNodes_cuda(const float& local_alpha,\n                                     float local_tolerance,\n                                     struct CUDA_Context* ctx);\nvoid PageRank_delta_nodesWithEdges_cuda(const float& local_alpha,\n                                        float 
local_tolerance,\n                                        struct CUDA_Context* ctx);\nvoid PageRank_masterNodes_cuda(unsigned int& active_vertices,\n                               struct CUDA_Context* ctx);\nvoid PageRank_nodesWithEdges_cuda(unsigned int& active_vertices,\n                                  struct CUDA_Context* ctx);\nvoid ResetGraph_cuda(unsigned int __begin, unsigned int __end,\n                     struct CUDA_Context* ctx);\nvoid ResetGraph_allNodes_cuda(struct CUDA_Context* ctx);\nvoid ResetGraph_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid ResetGraph_nodesWithEdges_cuda(struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_push_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"pagerank_push_cuda.cuh\", system = False)], parse = False),\nKernel(\"ResetGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('float *', 'p_delta'), ('uint32_t *', 'p_nout'), ('float *', 'p_residual'), ('float *', 'p_value')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_value[src]     = 0\"]),\nCBlock([\"p_nout[src]      = 0\"]),\nCBlock([\"p_residual[src]  = 0\"]),\nCBlock([\"p_delta[src]     = 0\"]),\n]),\n]),\n]),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const float ', 'local_alpha'), ('uint32_t *', 'p_nout'), ('float *', 'p_residual'), ('DynamicBitset&', 'bitset_nout')],\n[\nCDecl([(\"uint32_t\", \"num_edges\", \"\")]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_residual[src]  = local_alpha\"]),\nCBlock([\"num_edges = graph.getOutDegree(src)\"]),\nCBlock([\"atomicTestAdd(&p_nout[src], num_edges)\"]),\nCBlock([\"bitset_nout.set(src)\"]),\n]),\n]),\n]),\nKernel(\"PageRank_delta\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const float ', 'local_alpha'), ('float', 'local_tolerance'), ('float *', 'p_delta'), ('uint32_t *', 'p_nout'), ('float *', 'p_residual'), ('float *', 'p_value')],\n[\nCDecl([(\"float\", \"residual_old\", \"\")]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_residual[src] > 0\",\n[\nCBlock([\"residual_old = p_residual[src]\"]),\nCBlock([\"p_residual[src]     = 0\"]),\nCBlock([\"p_value[src] += residual_old\"]),\nIf(\"residual_old > 
local_tolerance\",\n[\nIf(\"p_nout[src] > 0\",\n[\nCBlock([\"p_delta[src] = residual_old * (1 - local_alpha) / p_nout[src]\"]),\n]),\n]),\n]),\n]),\n]),\n]),\nKernel(\"PageRank\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('float *', 'p_delta'), ('float *', 'p_residual'), ('DynamicBitset&', 'bitset_residual'), ('HGAccumulator<unsigned int>', 'active_vertices')],\n[\nCDecl([(\"float\", \"_delta\", \"\")]),\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_delta[src] > 0\",\n[\nCBlock([\"_delta = p_delta[src]\"]),\nCBlock([\"p_delta[src]  = 0\"]),\nCBlock([\"active_vertices.reduce( 1)\"]),\n], [ CBlock([\"pop = false\"]), ]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"nbr\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(nbr)\"]),\nCBlock([\"atomicTestAdd(&p_residual[dst], _delta)\"]),\nCBlock([\"bitset_residual.set(dst)\"]),\n]),\n),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = False),\n]),\nKernel(\"PageRankSanity\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('float', 'local_tolerance'), ('float *', 'p_residual'), ('float *', 'p_value'), ('HGAccumulator<uint64_t>', 'DGAccumulator_residual_over_tolerance'), ('HGAccumulator<float>', 'DGAccumulator_sum'), ('HGAccumulator<float>', 'DGAccumulator_sum_residual'), ('HGReduceMax<float>', 'max_residual'), ('HGReduceMax<float>', 'max_value'), ('HGReduceMin<float>', 'min_residual'), ('HGReduceMin<float>', 'min_value')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", 
\"DGAccumulator_residual_over_tolerance_ts\", \"\")]),\nCBlock([\"DGAccumulator_residual_over_tolerance.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"DGAccumulator_sum_ts\", \"\")]),\nCBlock([\"DGAccumulator_sum.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"DGAccumulator_sum_residual_ts\", \"\")]),\nCBlock([\"DGAccumulator_sum_residual.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"max_residual_ts\", \"\")]),\nCBlock([\"max_residual.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"max_value_ts\", \"\")]),\nCBlock([\"max_value.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"min_residual_ts\", \"\")]),\nCBlock([\"min_residual.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<float, TB_SIZE>::TempStorage\", \"min_value_ts\", \"\")]),\nCBlock([\"min_value.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"max_value.reduce(p_value[src])\"]),\nCBlock([\"min_value.reduce(p_value[src])\"]),\nCBlock([\"max_residual.reduce(p_residual[src])\"]),\nCBlock([\"min_residual.reduce(p_residual[src])\"]),\nCBlock([\"DGAccumulator_sum.reduce( p_value[src])\"]),\nCBlock([\"DGAccumulator_sum.reduce( p_residual[src])\"]),\nIf(\"p_residual[src] > local_tolerance\",\n[\nCBlock([\"DGAccumulator_residual_over_tolerance.reduce( 1)\"]),\n]),\n]),\n]),\nCBlock([\"DGAccumulator_residual_over_tolerance.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_residual_over_tolerance_ts)\"], parse = False),\nCBlock([\"DGAccumulator_sum.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_ts)\"], parse = False),\nCBlock([\"DGAccumulator_sum_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(DGAccumulator_sum_residual_ts)\"], parse = 
False),\nCBlock([\"max_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(max_residual_ts)\"], parse = False),\nCBlock([\"max_value.thread_exit<cub::BlockReduce<float, TB_SIZE> >(max_value_ts)\"], parse = False),\nCBlock([\"min_residual.thread_exit<cub::BlockReduce<float, TB_SIZE> >(min_residual_ts)\"], parse = False),\nCBlock([\"min_value.thread_exit<cub::BlockReduce<float, TB_SIZE> >(min_value_ts)\"], parse = False),\n]),\nKernel(\"ResetGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"ResetGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->delta.data.gpu_wr_ptr()\", \"ctx->nout.data.gpu_wr_ptr()\", \"ctx->residual.data.gpu_wr_ptr()\", \"ctx->value.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"ResetGraph_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ResetGraph_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"ResetGraph_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ResetGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"ResetGraph_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"ResetGraph_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('const float &', 'local_alpha'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_alpha\", \"ctx->nout.data.gpu_wr_ptr()\", \"ctx->residual.data.gpu_wr_ptr()\", \"*(ctx->nout.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = 
False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", [('const float &', 'local_alpha'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, local_alpha, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('const float &', 'local_alpha'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_alpha, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('const float &', 'local_alpha'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_alpha, ctx)\"]),\n], host = True),\nKernel(\"PageRank_delta_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('const float &', 'local_alpha'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"PageRank_delta\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_alpha\", \"local_tolerance\", \"ctx->delta.data.gpu_wr_ptr()\", \"ctx->nout.data.gpu_wr_ptr()\", \"ctx->residual.data.gpu_wr_ptr()\", \"ctx->value.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"PageRank_delta_allNodes_cuda\", [('const float &', 'local_alpha'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_delta_cuda(0, ctx->gg.nnodes, local_alpha, local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRank_delta_masterNodes_cuda\", [('const float &', 'local_alpha'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_delta_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_alpha, local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRank_delta_nodesWithEdges_cuda\", [('const float &', 'local_alpha'), ('float', 'local_tolerance'), ('struct 
CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_delta_cuda(0, ctx->numNodesWithEdges, local_alpha, local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRank_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"PageRank\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->delta.data.gpu_wr_ptr()\", \"ctx->residual.data.gpu_wr_ptr()\", \"*(ctx->residual.is_updated.gpu_rd_ptr())\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"PageRank_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_cuda(0, ctx->gg.nnodes, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"PageRank_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"PageRank_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRank_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"PageRankSanity_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint64_t &', 'DGAccumulator_residual_over_tolerance'), ('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_sum_residual'), ('float &', 'max_residual'), ('float &', 
'max_value'), ('float &', 'min_residual'), ('float &', 'min_value'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"DGAccumulator_residual_over_toleranceval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_DGAccumulator_residual_over_tolerance\", \"\")]),\nCBlock([\"*(DGAccumulator_residual_over_toleranceval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_residual_over_tolerance.rv = DGAccumulator_residual_over_toleranceval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"DGAccumulator_sumval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGAccumulator<float>\", \"_DGAccumulator_sum\", \"\")]),\nCBlock([\"*(DGAccumulator_sumval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"DGAccumulator_sum_residualval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGAccumulator<float>\", \"_DGAccumulator_sum_residual\", \"\")]),\nCBlock([\"*(DGAccumulator_sum_residualval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_sum_residual.rv = DGAccumulator_sum_residualval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"max_residualval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMax<float>\", \"_max_residual\", \"\")]),\nCBlock([\"*(max_residualval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_max_residual.rv = max_residualval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"max_valueval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMax<float>\", \"_max_value\", \"\")]),\nCBlock([\"*(max_valueval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_max_value.rv = max_valueval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"min_residualval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMin<float>\", \"_min_residual\", \"\")]),\nCBlock([\"*(min_residualval.cpu_wr_ptr()) = 1073741823\"]),\nCBlock([\"_min_residual.rv = 
min_residualval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<float>\", \"min_valueval\", \" = Shared<float>(1)\")]),\nCDecl([(\"HGReduceMin<float>\", \"_min_value\", \"\")]),\nCBlock([\"*(min_valueval.cpu_wr_ptr()) = 1073741823\"]),\nCBlock([\"_min_value.rv = min_valueval.gpu_wr_ptr()\"]),\nInvoke(\"PageRankSanity\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_tolerance\", \"ctx->residual.data.gpu_wr_ptr()\", \"ctx->value.data.gpu_wr_ptr()\", \"_DGAccumulator_residual_over_tolerance\", \"_DGAccumulator_sum\", \"_DGAccumulator_sum_residual\", \"_max_residual\", \"_max_value\", \"_min_residual\", \"_min_value\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"DGAccumulator_residual_over_tolerance = *(DGAccumulator_residual_over_toleranceval.cpu_rd_ptr())\"]),\nCBlock([\"DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr())\"]),\nCBlock([\"DGAccumulator_sum_residual = *(DGAccumulator_sum_residualval.cpu_rd_ptr())\"]),\nCBlock([\"max_residual = *(max_residualval.cpu_rd_ptr())\"]),\nCBlock([\"max_value = *(max_valueval.cpu_rd_ptr())\"]),\nCBlock([\"min_residual = *(min_residualval.cpu_rd_ptr())\"]),\nCBlock([\"min_value = *(min_valueval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"PageRankSanity_allNodes_cuda\", [('uint64_t &', 'DGAccumulator_residual_over_tolerance'), ('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_sum_residual'), ('float &', 'max_residual'), ('float &', 'max_value'), ('float &', 'min_residual'), ('float &', 'min_value'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRankSanity_cuda(0, ctx->gg.nnodes, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRankSanity_masterNodes_cuda\", [('uint64_t &', 'DGAccumulator_residual_over_tolerance'), ('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_sum_residual'), ('float &', 'max_residual'), ('float 
&', 'max_value'), ('float &', 'min_residual'), ('float &', 'min_value'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRankSanity_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx)\"]),\n], host = True),\nKernel(\"PageRankSanity_nodesWithEdges_cuda\", [('uint64_t &', 'DGAccumulator_residual_over_tolerance'), ('float &', 'DGAccumulator_sum'), ('float &', 'DGAccumulator_sum_residual'), ('float &', 'max_residual'), ('float &', 'max_value'), ('float &', 'min_residual'), ('float &', 'min_value'), ('float', 'local_tolerance'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"PageRankSanity_cuda(0, ctx->numNodesWithEdges, DGAccumulator_residual_over_tolerance, DGAccumulator_sum, DGAccumulator_sum_residual, max_residual, max_value, min_residual, min_value, local_tolerance, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/pagerank/pagerank_push_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(nout, unsigned int);\nGALOIS_SYNC_STRUCTURE_BITSET(nout);\n\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(residual, float);\nGALOIS_SYNC_STRUCTURE_REDUCE_ADD(residual, float);\nGALOIS_SYNC_STRUCTURE_BITSET(residual);\n"
  },
  {
    "path": "lonestar/analytics/distributed/partition/CMakeLists.txt",
    "content": "app_dist(partition partition NO_GPU)\n"
  },
  {
    "path": "lonestar/analytics/distributed/partition/partition.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include <iostream>\n#include <limits>\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/gstl.h\"\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\n\nnamespace cll = llvm::cl;\n\n/******************************************************************************/\n/* Graph structure declarations + other initialization */\n/******************************************************************************/\n\nstruct NodeData {\n  uint32_t dummy;\n};\n\ntypedef galois::graphs::DistGraph<NodeData, void> Graph;\ntypedef typename Graph::GraphNode 
GNode;\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"Partition\";\nconstexpr static const char* const desc = \"Partitions a normal graph.\";\nconstexpr static const char* const url  = 0;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n  distGraphInitialization<NodeData, void>();\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/CMakeLists.txt",
    "content": "app_dist(sssp_push sssp-push)\nadd_test_dist(sssp-push-dist rmat15 ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr)\n\napp_dist(sssp_pull sssp-pull)\nadd_test_dist(sssp-pull-dist rmat15 ${BASEINPUT}/scalefree/rmat15.gr -graphTranspose=${BASEINPUT}/scalefree/transpose/rmat15.tgr)\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/README.md",
    "content": "Single Source Shortest Path\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program performs single source shortest path on a weighted input graph, \nstarting from a source node (specified by -startNode option). \n\nThe algorithm supports both a bulk-synchronous and a bulk-asynchronous\nparallel algorithms. This benchmark consists of two algorithms,\npush- and pull-based. In the push based algorithm, a node that has been updated\nfrom the last round will push out its distance value to its neighbors and\nupdate them if necessary after considering the edge weight between itself\nand its neighbor, in each round.\n\nIn the pull based algorithm, every node will check its neighbors' distance \nvalues and update their own values based on the edge weight between the node\nand its neighbor, in each round.\n\n\n\nINPUT\n--------------------------------------------------------------------------------\n\nTakes in weighted Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/distributed/sssp; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run on 1 host with start node 0, use the following:\n`./sssp-push-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads>` \n`./sssp-pull-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads>` \n\nTo run on 3 hosts h1, h2, and h3 for start node 0, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./sssp-push-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads>` \n`mpirun -n=3 -hosts=h1,h2,h3 ./sssp-pull-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads>` \n\nTo run on 3 hosts h1, h2, and h3 for start node 10 with an incoming edge cut, use the following:\n`mpirun -n=3 -hosts=h1,h2,h3 ./sssp-push-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -startNode=10 -partition=iec`\n`mpirun -n=3 -hosts=h1,h2,h3 ./sssp-pull-dist <input-graph> -graphTranspose=<transpose-input-graph> -t=<num-threads> -startNode=10 -partition=iec`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\n* The push variant generally performs better in our experience.\n\n* For 16 or fewer hosts/GPUs, we recommend using an\n  **edge-cut** partitioning policy (OEC or IEC) with **synchronous**\n  communication for performance.\n\n* For 32 or more hosts/GPUs, we recommend using the\n  **Cartesian vertex-cut** partitioning policy (CVC) with **asynchronous**\n  communication for performance.\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_pull.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"sssp_pull_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"SSSP\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\n\nnamespace cll = llvm::cl;\nstatic cll::opt<unsigned int> maxIterations(\"maxIterations\",\n                        
                    cll::desc(\"Maximum iterations: \"\n                                                      \"Default 1000\"),\n                                            cll::init(1000));\nstatic cll::opt<uint64_t>\n    src_node(\"startNode\", cll::desc(\"ID of the source node\"), cll::init(0));\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other initialization */\n/******************************************************************************/\n\nconst uint32_t infinity = std::numeric_limits<uint32_t>::max() / 4;\n\nstruct NodeData {\n  uint32_t dist_current;\n};\n\ngalois::DynamicBitSet bitset_dist_current;\n\ntypedef galois::graphs::DistGraph<NodeData, unsigned int> Graph;\ntypedef typename Graph::GraphNode GNode;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n#include \"sssp_pull_sync.hh\"\n\n/******************************************************************************/\n/* Algorithm structures */\n/******************************************************************************/\n\nstruct InitializeGraph {\n  const uint32_t& local_infinity;\n  cll::opt<uint64_t>& local_src_node;\n  Graph* graph;\n\n  InitializeGraph(cll::opt<uint64_t>& _src_node, const uint32_t& _infinity,\n                  Graph* _graph)\n      : local_infinity(_infinity), local_src_node(_src_node), graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& allNodes = _graph.allNodesRange();\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph_\" +\n                           
(syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph_allNodes_cuda(infinity, src_node, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          InitializeGraph{src_node, infinity, &_graph}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n    sdata.dist_current =\n        (graph->getGID(src) == local_src_node) ? 0 : local_infinity;\n  }\n};\n\ntemplate <bool async>\nstruct SSSP {\n  Graph* graph;\n  using DGTerminatorDetector =\n      typename std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n\n  DGTerminatorDetector& active_vertices;\n\n  SSSP(Graph* _graph, DGTerminatorDetector& _dga)\n      : graph(_graph), active_vertices(_dga) {}\n\n  void static go(Graph& _graph) {\n    unsigned _num_iterations   = 0;\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n    DGTerminatorDetector dga;\n\n    do {\n      syncSubstrate->set_num_round(_num_iterations);\n      dga.reset();\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(\"SSSP_\" + (syncSubstrate->get_run_identifier()));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        unsigned int __retval = 0;\n        SSSP_nodesWithEdges_cuda(__retval, cuda_ctx);\n        dga += __retval;\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(galois::iterate(nodesWithEdges), SSSP{&_graph, dga},\n                  
     galois::no_stats(), galois::steal(),\n                       galois::loopname(\n                           syncSubstrate->get_run_identifier(\"SSSP\").c_str()));\n      }\n\n      syncSubstrate->sync<writeSource, readDestination, Reduce_min_dist_current,\n                          Bitset_dist_current, async>(\"SSSP\");\n\n      galois::runtime::reportStat_Tsum(\n          REGION_NAME, \"NumWorkItems_\" + (syncSubstrate->get_run_identifier()),\n          dga.read_local());\n\n      ++_num_iterations;\n    } while ((async || (_num_iterations < maxIterations)) &&\n             dga.reduce(syncSubstrate->get_run_identifier()));\n\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::runtime::reportStat_Single(\n          REGION_NAME,\n          \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n          _num_iterations);\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& snode = graph->getData(src);\n\n    for (auto jj : graph->edges(src)) {\n      GNode dst         = graph->getEdgeDst(jj);\n      auto& dnode       = graph->getData(dst);\n      uint32_t new_dist = dnode.dist_current + graph->getEdgeData(jj);\n      uint32_t old_dist = galois::min(snode.dist_current, new_dist);\n      if (old_dist > new_dist) {\n        bitset_dist_current.set(src);\n        active_vertices += 1;\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n/* Prints total number of nodes visited + max distance */\nstruct SSSPSanityCheck {\n  const uint32_t& local_infinity;\n  Graph* graph;\n\n  galois::DGAccumulator<uint64_t>& DGAccumulator_sum;\n  galois::DGReduceMax<uint32_t>& DGMax;\n  galois::DGAccumulator<uint64_t>& dg_avg;\n\n  SSSPSanityCheck(const uint32_t& _infinity, Graph* _graph,\n                  galois::DGAccumulator<uint64_t>& dgas,\n                
  galois::DGReduceMax<uint32_t>& dgm,\n                  galois::DGAccumulator<uint64_t>& _dg_avg)\n      : local_infinity(_infinity), graph(_graph), DGAccumulator_sum(dgas),\n        DGMax(dgm), dg_avg(_dg_avg) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<uint64_t>& dgas,\n                 galois::DGReduceMax<uint32_t>& dgm,\n                 galois::DGAccumulator<uint64_t>& dgag) {\n    dgas.reset();\n    dgm.reset();\n    dgag.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      uint64_t sum;\n      uint64_t avg;\n      uint32_t max;\n      SSSPSanityCheck_masterNodes_cuda(sum, avg, max, infinity, cuda_ctx);\n      dgas += sum;\n      dgm.update(max);\n      dgag += avg;\n#else\n      abort();\n#endif\n    } else {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     SSSPSanityCheck(infinity, &_graph, dgas, dgm, dgag),\n                     galois::no_stats(), galois::loopname(\"SSSPSanityCheck\"));\n    }\n\n    uint64_t num_visited  = dgas.reduce();\n    uint32_t max_distance = dgm.reduce();\n\n    float visit_average = ((float)dgag.reduce()) / num_visited;\n\n    // Only host 0 will print the info\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Number of nodes visited from source \", src_node, \" is \",\n                     num_visited, \"\\n\");\n      galois::gPrint(\"Max distance from source \", src_node, \" is \",\n                     max_distance, \"\\n\");\n      galois::gPrint(\"Average distances on visited nodes is \", visit_average,\n                     \"\\n\");\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.dist_current < local_infinity) {\n      DGAccumulator_sum += 1;\n      DGMax.update(src_data.dist_current);\n      dg_avg += src_data.dist_current;\n    }\n  
}\n};\n\n/******************************************************************************/\n/* Make results */\n/******************************************************************************/\n\nstd::vector<uint32_t> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).dist_current);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_dist_current_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<uint32_t> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"SSSP pull - Distributed \"\n                                          \"Heterogeneous\";\nconstexpr static const char* const desc = \"SSSP pull on Distributed Galois.\";\nconstexpr static const char* const url  = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n  if (net.ID == 0) {\n    galois::runtime::reportParam(REGION_NAME, \"Max Iterations\", maxIterations);\n    galois::runtime::reportParam(REGION_NAME, \"Source Node ID\", src_node);\n  }\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  
StatTimer_total.start();\n\n  std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, unsigned int, false>(&cuda_ctx);\n#else\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, unsigned int, false>();\n#endif\n\n  bitset_dist_current.resize(hg->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n\n  InitializeGraph::go((*hg));\n  galois::runtime::getHostBarrier().wait();\n\n  // accumulators for use in operators\n  galois::DGAccumulator<uint64_t> DGAccumulator_sum;\n  galois::DGAccumulator<uint64_t> dg_avge;\n  galois::DGReduceMax<uint32_t> m;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] SSSP::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      SSSP<true>::go(*hg);\n    } else {\n      SSSP<false>::go(*hg);\n    }\n    StatTimer_main.stop();\n\n    // sanity check\n    SSSPSanityCheck::go(*hg, DGAccumulator_sum, m, dg_avge);\n\n    if ((run + 1) != numRuns) {\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_dist_current_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_dist_current.reset();\n      }\n\n      (*syncSubstrate).set_num_run(run + 1);\n      InitializeGraph::go((*hg));\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<uint32_t> results = makeResults(hg);\n    auto globalIDs                = hg->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"distance\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_pull_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = true;\n#include \"sssp_pull_cuda.cuh\"\nstatic const int __tb_SSSP = TB_SIZE;\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, const uint32_t  local_infinity, unsigned long long local_src_node, uint32_t * p_dist_current)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_dist_current[src] = (graph.node_data[src] == local_src_node) ? 
0 : local_infinity;\n    }\n  }\n  // FP: \"7 -> 8;\n}\n__global__ void SSSP_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_dist_current, DynamicBitset& bitset_dist_current, HGAccumulator<unsigned int> active_vertices, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned 
src_index;\n      index_type jj;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      jj = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        uint32_t new_dist;\n        uint32_t old_dist;\n        dst = graph.getAbsDestination(jj);\n        new_dist = p_dist_current[dst] + graph.getAbsWeight(jj);\n        old_dist = atomicTestMin(&p_dist_current[src], new_dist);\n        if (old_dist > new_dist)\n        {\n          bitset_dist_current.set(src);\n          active_vertices.reduce( 1);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"50 -> 51;\n}\n__global__ void SSSP(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_dist_current, DynamicBitset& bitset_dist_current, HGAccumulator<unsigned int> active_vertices, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_SSSP;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  active_vertices.thread_entry();\n  // FP: \"7 -> 8;\n  src_end = __end;\n  src_rup = ((__begin) + 
roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"8 -> 9;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? true: false);\n    // FP: \"9 -> 10;\n    if (pop)\n    {\n    }\n    // FP: \"11 -> 12;\n    // FP: \"14 -> 15;\n    // FP: \"15 -> 16;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"16 -> 17;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"19 -> 20;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"20 -> 21;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"21 -> 22;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"22 -> 23;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"25 -> 26;\n    // FP: \"26 -> 27;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"27 -> 28;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"28 -> 29;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"31 -> 32;\n    __syncthreads();\n    // FP: \"32 -> 33;\n    while (true)\n    {\n      // FP: \"33 -> 34;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"36 -> 37;\n      __syncthreads();\n      // FP: \"37 -> 38;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"38 -> 39;\n        __syncthreads();\n        // FP: \"39 -> 40;\n        break;\n      }\n      // FP: \"41 -> 42;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"44 -> 45;\n      __syncthreads();\n      // FP: \"45 -> 46;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"46 -> 47;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"49 -> 50;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"50 -> 51;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type jj;\n        jj = ns +_np_j;\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = p_dist_current[dst] + graph.getAbsWeight(jj);\n          old_dist = atomicTestMin(&p_dist_current[src], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(src);\n            active_vertices.reduce( 1);\n          }\n        }\n      }\n      // FP: \"64 -> 65;\n      __syncthreads();\n    }\n    // FP: \"66 -> 67;\n\n    // FP: \"67 -> 68;\n    {\n      
const int warpid = threadIdx.x / 32;\n      // FP: \"68 -> 69;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"69 -> 70;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type jj;\n          jj = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            uint32_t new_dist;\n            uint32_t old_dist;\n            dst = graph.getAbsDestination(jj);\n            new_dist = p_dist_current[dst] + graph.getAbsWeight(jj);\n            old_dist = atomicTestMin(&p_dist_current[src], new_dist);\n            if (old_dist > new_dist)\n            {\n              bitset_dist_current.set(src);\n              active_vertices.reduce( 1);\n            }\n          }\n        }\n      }\n      // FP: \"93 -> 94;\n      __syncthreads();\n      // FP: \"94 -> 95;\n    }\n\n    // FP: \"95 -> 96;\n    __syncthreads();\n    // FP: \"96 -> 97;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"97 -> 98;\n    while (_np.work())\n    {\n      // FP: \"98 -> 99;\n      int _np_i =0;\n      // FP: \"99 -> 100;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"100 -> 101;\n      __syncthreads();\n      // FP: 
\"101 -> 102;\n\n      // FP: \"102 -> 103;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type jj;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        jj= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = p_dist_current[dst] + graph.getAbsWeight(jj);\n          old_dist = atomicTestMin(&p_dist_current[src], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(src);\n            active_vertices.reduce( 1);\n          }\n        }\n      }\n      // FP: \"117 -> 118;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"118 -> 119;\n      __syncthreads();\n    }\n    // FP: \"120 -> 121;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"122 -> 123;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"123 -> 124;\n}\n__global__ void SSSPSanityCheck(CSRGraph graph, unsigned int __begin, unsigned int __end, const uint32_t  local_infinity, uint32_t * p_dist_current, HGAccumulator<uint64_t> DGAccumulator_sum, HGAccumulator<uint64_t> dg_avg, HGReduceMax<uint32_t> DGMax)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage DGAccumulator_sum_ts;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage dg_avg_ts;\n  __shared__ cub::BlockReduce<uint32_t, TB_SIZE>::TempStorage DGMax_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  DGAccumulator_sum.thread_entry();\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  dg_avg.thread_entry();\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  DGMax.thread_entry();\n  // FP: \"7 -> 8;\n  
src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_dist_current[src] < local_infinity)\n      {\n        DGAccumulator_sum.reduce( 1);\n        DGMax.reduce(p_dist_current[src]);\n        dg_avg.reduce( p_dist_current[src]);\n      }\n    }\n  }\n  // FP: \"17 -> 18;\n  DGAccumulator_sum.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_sum_ts);\n  // FP: \"18 -> 19;\n  dg_avg.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(dg_avg_ts);\n  // FP: \"19 -> 20;\n  DGMax.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(DGMax_ts);\n  // FP: \"20 -> 21;\n}\nvoid InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, const uint32_t & local_infinity, unsigned long long local_src_node, struct CUDA_Context*  ctx)\n{\n  t_work.init_thread_work(ctx->gg.nnodes);\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, local_infinity, local_src_node, ctx->dist_current.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(const uint32_t & local_infinity, unsigned long long local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(const uint32_t & local_infinity, unsigned long long local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_nodesWithEdges_cuda(const uint32_t & local_infinity, unsigned long long local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  
InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSP_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  SSSP <<<blocks, __tb_SSSP>>>(ctx->gg, __begin, __end, ctx->dist_current.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      SSSP_TB_LB <<<blocks, __tb_SSSP>>>(ctx->gg, __begin, __end, ctx->dist_current.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), _active_vertices, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"9 -> 10;\n  check_cuda_kernel;\n  // FP: \"10 -> 11;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"11 -> 12;\n}\nvoid SSSP_allNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSP_cuda(0, ctx->gg.nnodes, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSP_masterNodes_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSP_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid 
SSSP_nodesWithEdges_cuda(unsigned int & active_vertices, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSP_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSPSanityCheck_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & DGAccumulator_sum, uint64_t & dg_avg, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint64_t> _DGAccumulator_sum;\n  HGAccumulator<uint64_t> _dg_avg;\n  HGReduceMax<uint32_t> _DGMax;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> DGAccumulator_sumval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(DGAccumulator_sumval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  Shared<uint64_t> dg_avgval  = Shared<uint64_t>(1);\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  *(dg_avgval.cpu_wr_ptr()) = 0;\n  // FP: \"11 -> 12;\n  _dg_avg.rv = dg_avgval.gpu_wr_ptr();\n  // FP: \"12 -> 13;\n  Shared<uint32_t> DGMaxval  = Shared<uint32_t>(1);\n  // FP: \"13 -> 14;\n  // FP: \"14 -> 15;\n  *(DGMaxval.cpu_wr_ptr()) = 0;\n  // FP: \"15 -> 16;\n  _DGMax.rv = DGMaxval.gpu_wr_ptr();\n  // FP: \"16 -> 17;\n  SSSPSanityCheck <<<blocks, threads>>>(ctx->gg, __begin, __end, local_infinity, ctx->dist_current.data.gpu_wr_ptr(), _DGAccumulator_sum, _dg_avg, _DGMax);\n  cudaDeviceSynchronize();\n  // FP: \"17 -> 18;\n  check_cuda_kernel;\n  // FP: \"18 -> 19;\n  DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr());\n  // FP: \"19 -> 20;\n  dg_avg = *(dg_avgval.cpu_rd_ptr());\n  // FP: \"20 -> 21;\n  DGMax = *(DGMaxval.cpu_rd_ptr());\n  // FP: \"21 -> 22;\n}\nvoid SSSPSanityCheck_allNodes_cuda(uint64_t & DGAccumulator_sum, uint64_t & dg_avg, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  
SSSPSanityCheck_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSPSanityCheck_masterNodes_cuda(uint64_t & DGAccumulator_sum, uint64_t & dg_avg, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSPSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSPSanityCheck_nodesWithEdges_cuda(uint64_t & DGAccumulator_sum, uint64_t & dg_avg, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSPSanityCheck_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_pull_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"sssp_pull_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<uint32_t> dist_current;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->dist_current, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->dist_current, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->dist_current.data.zero_gpu();\n}\n\nvoid get_bitset_dist_current_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->dist_current.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->dist_current.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->dist_current, begin, end);\n}\n\nuint32_t get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_rd_ptr();\n\treturn dist_current[LID];\n}\n\nvoid set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tdist_current[LID] = v;\n}\n\nvoid 
add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tdist_current[LID] += v;\n}\n\nbool min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tif (dist_current[LID] > v){\n\t\tdist_current[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_current, from_id, v);\n}\n\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_current, from_id, v);\n}\n\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_current, from_id, v, i);\n}\n\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) 
{\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_dist_current_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->dist_current, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_pull_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_dist_current_cuda(struct CUDA_Context* ctx,\n                                  uint64_t* bitset_compute);\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end);\nuint32_t get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nbool min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             size_t* v_size,\n                                             DataCommMode* data_mode);\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n                                            uint32_t i);\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n    
                                        size_t* v_size,\n                                            DataCommMode* data_mode,\n                                            uint32_t i);\nvoid batch_set_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_set_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_add_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_min_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_reset_node_dist_current_cuda(struct CUDA_Context* ctx, size_t begin,\n                                        size_t end, uint32_t v);\n\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          const uint32_t& local_infinity,\n                          unsigned long long local_src_node,\n                          struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(const uint32_t& local_infinity,\n                                   unsigned long long local_src_node,\n                                
   struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(const uint32_t& local_infinity,\n                                      unsigned long long local_src_node,\n                                      struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(const uint32_t& local_infinity,\n                                         unsigned long long local_src_node,\n                                         struct CUDA_Context* ctx);\nvoid SSSP_cuda(unsigned int __begin, unsigned int __end,\n               unsigned int& active_vertices, struct CUDA_Context* ctx);\nvoid SSSPSanityCheck_cuda(unsigned int __begin, unsigned int __end,\n                          uint64_t& DGAccumulator_sum, uint64_t& dg_avg,\n                          uint32_t& DGMax, const uint32_t& local_infinity,\n                          struct CUDA_Context* ctx);\nvoid SSSPSanityCheck_allNodes_cuda(uint64_t& DGAccumulator_sum,\n                                   uint64_t& dg_avg, uint32_t& DGMax,\n                                   const uint32_t& local_infinity,\n                                   struct CUDA_Context* ctx);\nvoid SSSPSanityCheck_masterNodes_cuda(uint64_t& DGAccumulator_sum,\n                                      uint64_t& dg_avg, uint32_t& DGMax,\n                                      const uint32_t& local_infinity,\n                                      struct CUDA_Context* ctx);\nvoid SSSPSanityCheck_nodesWithEdges_cuda(uint64_t& DGAccumulator_sum,\n                                         uint64_t& dg_avg, uint32_t& DGMax,\n                                         const uint32_t& local_infinity,\n                                         struct CUDA_Context* ctx);\nvoid SSSP_allNodes_cuda(unsigned int& active_vertices,\n                        struct CUDA_Context* ctx);\nvoid SSSP_masterNodes_cuda(unsigned int& active_vertices,\n                           struct CUDA_Context* ctx);\nvoid SSSP_nodesWithEdges_cuda(unsigned int& active_vertices,\n            
                  struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_pull_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"sssp_pull_cuda.cuh\", system = False)], parse = False),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const uint32_t ', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('uint32_t *', 'p_dist_current')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_dist_current[src] = (graph.node_data[src] == local_src_node) ? 0 : local_infinity\"]),\n]),\n]),\n]),\nKernel(\"SSSP\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_dist_current'), ('DynamicBitset&', 'bitset_dist_current'), ('HGAccumulator<unsigned int>', 'active_vertices')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"jj\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(jj)\"]),\nCDecl([(\"uint32_t\", \"new_dist\", \"\")]),\nCBlock([\"new_dist = p_dist_current[dst] + graph.getAbsWeight(jj)\"]),\nCDecl([(\"uint32_t\", \"old_dist\", \"\")]),\nCBlock([\"old_dist = atomicTestMin(&p_dist_current[src], new_dist)\"]),\nIf(\"old_dist > new_dist\",\n[\nCBlock([\"bitset_dist_current.set(src)\"]),\nCBlock([\"active_vertices.reduce( 1)\"]),\n]),\n]),\n),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = 
False),\n]),\nKernel(\"SSSPSanityCheck\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const uint32_t ', 'local_infinity'), ('uint32_t *', 'p_dist_current'), ('HGAccumulator<uint64_t>', 'DGAccumulator_sum'), ('HGAccumulator<uint64_t>', 'dg_avg'), ('HGReduceMax<uint32_t>', 'DGMax')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"DGAccumulator_sum_ts\", \"\")]),\nCBlock([\"DGAccumulator_sum.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"dg_avg_ts\", \"\")]),\nCBlock([\"dg_avg.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<uint32_t, TB_SIZE>::TempStorage\", \"DGMax_ts\", \"\")]),\nCBlock([\"DGMax.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_dist_current[src] < local_infinity\",\n[\nCBlock([\"DGAccumulator_sum.reduce( 1)\"]),\nCBlock([\"DGMax.reduce(p_dist_current[src])\"]),\nCBlock([\"dg_avg.reduce( p_dist_current[src])\"]),\n]),\n]),\n]),\nCBlock([\"DGAccumulator_sum.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_sum_ts)\"], parse = False),\nCBlock([\"dg_avg.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(dg_avg_ts)\"], parse = False),\nCBlock([\"DGMax.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(DGMax_ts)\"], parse = False),\n]),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_infinity\", \"local_src_node\", \"ctx->dist_current.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", 
[('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"SSSP_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nInvoke(\"SSSP\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"*(ctx->dist_current.is_updated.gpu_rd_ptr())\", \"_active_vertices\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"SSSP_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSP_cuda(0, ctx->gg.nnodes, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"SSSP_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('struct 
CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSP_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"SSSP_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSP_cuda(0, ctx->numNodesWithEdges, active_vertices, ctx)\"]),\n], host = True),\nKernel(\"SSSPSanityCheck_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('uint64_t &', 'DGAccumulator_sum'), ('uint64_t &', 'dg_avg'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"DGAccumulator_sumval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_DGAccumulator_sum\", \"\")]),\nCBlock([\"*(DGAccumulator_sumval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<uint64_t>\", \"dg_avgval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_dg_avg\", \"\")]),\nCBlock([\"*(dg_avgval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_dg_avg.rv = dg_avgval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<uint32_t>\", \"DGMaxval\", \" = Shared<uint32_t>(1)\")]),\nCDecl([(\"HGReduceMax<uint32_t>\", \"_DGMax\", \"\")]),\nCBlock([\"*(DGMaxval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGMax.rv = DGMaxval.gpu_wr_ptr()\"]),\nInvoke(\"SSSPSanityCheck\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_infinity\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"_DGAccumulator_sum\", \"_dg_avg\", \"_DGMax\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr())\"]),\nCBlock([\"dg_avg = *(dg_avgval.cpu_rd_ptr())\"]),\nCBlock([\"DGMax = *(DGMaxval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"SSSPSanityCheck_allNodes_cuda\", [('uint64_t &', 'DGAccumulator_sum'), 
('uint64_t &', 'dg_avg'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSPSanityCheck_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"SSSPSanityCheck_masterNodes_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint64_t &', 'dg_avg'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSPSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"SSSPSanityCheck_nodesWithEdges_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint64_t &', 'dg_avg'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSPSanityCheck_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_pull_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(dist_current, unsigned int);\nGALOIS_SYNC_STRUCTURE_REDUCE_MIN(dist_current, unsigned int);\nGALOIS_SYNC_STRUCTURE_BITSET(dist_current);\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_push.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Output.h\"\n#include \"DistBench/Start.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"sssp_push_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nconstexpr static const char* const REGION_NAME = \"SSSP\";\n\n/******************************************************************************/\n/* Declaration of command line arguments */\n/******************************************************************************/\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<unsigned int> maxIterations(\"maxIterations\",\n                      
                      cll::desc(\"Maximum iterations: \"\n                                                      \"Default 1000\"),\n                                            cll::init(1000));\nstatic cll::opt<uint64_t>\n    src_node(\"startNode\", cll::desc(\"ID of the source node\"), cll::init(0));\n\nstatic cll::opt<uint32_t>\n    delta(\"delta\",\n          cll::desc(\"Shift value for the delta step (default value 0)\"),\n          cll::init(0));\n\nenum Exec { Sync, Async };\n\nstatic cll::opt<Exec> execution(\n    \"exec\", cll::desc(\"Distributed Execution Model (default value Async):\"),\n    cll::values(clEnumVal(Sync, \"Bulk-synchronous Parallel (BSP)\"),\n                clEnumVal(Async, \"Bulk-asynchronous Parallel (BASP)\")),\n    cll::init(Async));\n\n/******************************************************************************/\n/* Graph structure declarations + other initialization */\n/******************************************************************************/\n\nconst uint32_t infinity = std::numeric_limits<uint32_t>::max() / 4;\n\nstruct NodeData {\n  std::atomic<uint32_t> dist_current;\n  uint32_t dist_old;\n};\n\ngalois::DynamicBitSet bitset_dist_current;\n\ntypedef galois::graphs::DistGraph<NodeData, unsigned int> Graph;\ntypedef typename Graph::GraphNode GNode;\n\nstd::unique_ptr<galois::graphs::GluonSubstrate<Graph>> syncSubstrate;\n\n#include \"sssp_push_sync.hh\"\n\n/******************************************************************************/\n/* Algorithm structures */\n/******************************************************************************/\n\nstruct InitializeGraph {\n  const uint32_t& local_infinity;\n  cll::opt<uint64_t>& local_src_node;\n  Graph* graph;\n\n  InitializeGraph(cll::opt<uint64_t>& _src_node, const uint32_t& _infinity,\n                  Graph* _graph)\n      : local_infinity(_infinity), local_src_node(_src_node), graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    const auto& allNodes = 
_graph.allNodesRange();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"InitializeGraph_\" +\n                           (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      InitializeGraph_allNodes_cuda(infinity, src_node, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      galois::do_all(\n          galois::iterate(allNodes.begin(), allNodes.end()),\n          InitializeGraph{src_node, infinity, &_graph}, galois::no_stats(),\n          galois::loopname(\n              syncSubstrate->get_run_identifier(\"InitializeGraph\").c_str()));\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& sdata = graph->getData(src);\n    sdata.dist_current =\n        (graph->getGID(src) == local_src_node) ? 0 : local_infinity;\n    sdata.dist_old =\n        (graph->getGID(src) == local_src_node) ? 
0 : local_infinity;\n  }\n};\n\ntemplate <bool async>\nstruct FirstItr_SSSP {\n  Graph* graph;\n  FirstItr_SSSP(Graph* _graph) : graph(_graph) {}\n\n  void static go(Graph& _graph) {\n    uint32_t __begin, __end;\n    if (_graph.isLocal(src_node)) {\n      __begin = _graph.getLID(src_node);\n      __end   = __begin + 1;\n    } else {\n      __begin = 0;\n      __end   = 0;\n    }\n    syncSubstrate->set_num_round(0);\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      std::string impl_str(\"SSSP_\" + (syncSubstrate->get_run_identifier()));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      FirstItr_SSSP_cuda(__begin, __end, cuda_ctx);\n      StatTimer_cuda.stop();\n#else\n      abort();\n#endif\n    } else if (personality == CPU) {\n      // one node\n      galois::do_all(\n          galois::iterate(__begin, __end), FirstItr_SSSP{&_graph},\n          galois::no_stats(),\n          galois::loopname(syncSubstrate->get_run_identifier(\"SSSP\").c_str()));\n    }\n\n    syncSubstrate->sync<writeDestination, readSource, Reduce_min_dist_current,\n                        Bitset_dist_current, async>(\"SSSP\");\n\n    galois::runtime::reportStat_Tsum(\n        \"SSSP\", \"NumWorkItems_\" + (syncSubstrate->get_run_identifier()),\n        __end - __begin);\n  }\n\n  void operator()(GNode src) const {\n    NodeData& snode = graph->getData(src);\n    snode.dist_old  = snode.dist_current;\n\n    for (auto jj : graph->edges(src)) {\n      GNode dst         = graph->getEdgeDst(jj);\n      auto& dnode       = graph->getData(dst);\n      uint32_t new_dist = graph->getEdgeData(jj) + snode.dist_current;\n      uint32_t old_dist = galois::atomicMin(dnode.dist_current, new_dist);\n      if (old_dist > new_dist)\n        bitset_dist_current.set(dst);\n    }\n  }\n};\n\ntemplate <bool async>\nstruct SSSP {\n  uint32_t local_priority;\n  Graph* graph;\n  using DGTerminatorDetector =\n      typename 
std::conditional<async, galois::DGTerminator<unsigned int>,\n                                galois::DGAccumulator<unsigned int>>::type;\n  using DGAccumulatorTy = galois::DGAccumulator<unsigned int>;\n\n  DGTerminatorDetector& active_vertices;\n  DGAccumulatorTy& work_edges;\n\n  SSSP(uint32_t _local_priority, Graph* _graph, DGTerminatorDetector& _dga,\n       DGAccumulatorTy& _work_edges)\n      : local_priority(_local_priority), graph(_graph), active_vertices(_dga),\n        work_edges(_work_edges) {}\n\n  void static go(Graph& _graph) {\n    FirstItr_SSSP<async>::go(_graph);\n\n    unsigned _num_iterations = 1;\n\n    const auto& nodesWithEdges = _graph.allNodesWithEdgesRange();\n\n    uint32_t priority;\n    if (delta == 0)\n      priority = std::numeric_limits<uint32_t>::max();\n    else\n      priority = 0;\n    DGTerminatorDetector dga;\n    DGAccumulatorTy work_edges;\n\n    do {\n\n      // if (work_edges.reduce() == 0)\n      priority += delta;\n\n      syncSubstrate->set_num_round(_num_iterations);\n      dga.reset();\n      work_edges.reset();\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        std::string impl_str(\"SSSP_\" + (syncSubstrate->get_run_identifier()));\n        galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n        StatTimer_cuda.start();\n        unsigned int __retval  = 0;\n        unsigned int __retval2 = 0;\n        SSSP_nodesWithEdges_cuda(__retval, __retval2, priority, cuda_ctx);\n        dga += __retval;\n        work_edges += __retval2;\n        StatTimer_cuda.stop();\n#else\n        abort();\n#endif\n      } else if (personality == CPU) {\n        galois::do_all(\n            galois::iterate(nodesWithEdges),\n            SSSP{priority, &_graph, dga, work_edges}, galois::no_stats(),\n            galois::loopname(syncSubstrate->get_run_identifier(\"SSSP\").c_str()),\n            galois::steal());\n      }\n\n      syncSubstrate->sync<writeDestination, readSource, Reduce_min_dist_current,\n    
                      Bitset_dist_current, async>(\"SSSP\");\n\n      galois::runtime::reportStat_Tsum(\n          \"SSSP\", \"NumWorkItems_\" + (syncSubstrate->get_run_identifier()),\n          work_edges.read_local());\n      ++_num_iterations;\n    } while ((async || (_num_iterations < maxIterations)) &&\n             dga.reduce(syncSubstrate->get_run_identifier()));\n\n    galois::runtime::reportStat_Tmax(\n        \"SSSP\", \"NumIterations_\" + std::to_string(syncSubstrate->get_run_num()),\n        _num_iterations);\n  }\n\n  void operator()(GNode src) const {\n    NodeData& snode = graph->getData(src);\n\n    if (snode.dist_old > snode.dist_current) {\n      active_vertices += 1;\n\n      if (local_priority > snode.dist_current) {\n        snode.dist_old = snode.dist_current;\n\n        for (auto jj : graph->edges(src)) {\n          work_edges += 1;\n\n          GNode dst         = graph->getEdgeDst(jj);\n          auto& dnode       = graph->getData(dst);\n          uint32_t new_dist = graph->getEdgeData(jj) + snode.dist_current;\n          uint32_t old_dist = galois::atomicMin(dnode.dist_current, new_dist);\n          if (old_dist > new_dist)\n            bitset_dist_current.set(dst);\n        }\n      }\n    }\n  }\n};\n\n/******************************************************************************/\n/* Sanity check operators */\n/******************************************************************************/\n\n/* Prints total number of nodes visited + max distance */\nstruct SSSPSanityCheck {\n  const uint32_t& local_infinity;\n  Graph* graph;\n\n  galois::DGAccumulator<uint64_t>& DGAccumulator_sum;\n  galois::DGReduceMax<uint32_t>& DGMax;\n  galois::DGAccumulator<uint64_t>& dg_avg;\n\n  SSSPSanityCheck(const uint32_t& _infinity, Graph* _graph,\n                  galois::DGAccumulator<uint64_t>& dgas,\n                  galois::DGReduceMax<uint32_t>& dgm,\n                  galois::DGAccumulator<uint64_t>& _dg_avg)\n      : local_infinity(_infinity), 
graph(_graph), DGAccumulator_sum(dgas),\n        DGMax(dgm), dg_avg(_dg_avg) {}\n\n  void static go(Graph& _graph, galois::DGAccumulator<uint64_t>& dgas,\n                 galois::DGReduceMax<uint32_t>& dgm,\n                 galois::DGAccumulator<uint64_t>& dgag) {\n    dgas.reset();\n    dgm.reset();\n    dgag.reset();\n\n    if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n      uint64_t sum;\n      uint64_t avg;\n      uint32_t max;\n      SSSPSanityCheck_masterNodes_cuda(sum, avg, max, infinity, cuda_ctx);\n      dgas += sum;\n      dgm.update(max);\n      dgag += avg;\n#else\n      abort();\n#endif\n    } else {\n      galois::do_all(galois::iterate(_graph.masterNodesRange().begin(),\n                                     _graph.masterNodesRange().end()),\n                     SSSPSanityCheck(infinity, &_graph, dgas, dgm, dgag),\n                     galois::no_stats(), galois::loopname(\"SSSPSanityCheck\"));\n    }\n\n    uint64_t num_visited  = dgas.reduce();\n    uint32_t max_distance = dgm.reduce();\n\n    float visit_average = ((float)dgag.reduce()) / num_visited;\n\n    // Only host 0 will print the info\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Number of nodes visited from source \", src_node, \" is \",\n                     num_visited, \"\\n\");\n      galois::gPrint(\"Max distance from source \", src_node, \" is \",\n                     max_distance, \"\\n\");\n      galois::gPrint(\"Average distances on visited nodes is \", visit_average,\n                     \"\\n\");\n    }\n  }\n\n  void operator()(GNode src) const {\n    NodeData& src_data = graph->getData(src);\n\n    if (src_data.dist_current < local_infinity) {\n      DGAccumulator_sum += 1;\n      DGMax.update(src_data.dist_current);\n      dg_avg += src_data.dist_current;\n    }\n  }\n};\n\n/******************************************************************************/\n/* Make results 
*/\n/******************************************************************************/\n\nstd::vector<uint32_t> makeResultsCPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(hg->getData(node).dist_current);\n  }\n\n  return values;\n}\n\n#ifdef GALOIS_ENABLE_GPU\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& hg) {\n  std::vector<uint32_t> values;\n\n  values.reserve(hg->numMasters());\n  for (auto node : hg->masterNodesRange()) {\n    values.push_back(get_node_dist_current_cuda(cuda_ctx, node));\n  }\n\n  return values;\n}\n#else\nstd::vector<uint32_t> makeResultsGPU(std::unique_ptr<Graph>& /*unused*/) {\n  abort();\n}\n#endif\n\nstd::vector<uint32_t> makeResults(std::unique_ptr<Graph>& hg) {\n  switch (personality) {\n  case CPU:\n    return makeResultsCPU(hg);\n  case GPU_CUDA:\n    return makeResultsGPU(hg);\n  default:\n    abort();\n  }\n}\n\n/******************************************************************************/\n/* Main */\n/******************************************************************************/\n\nconstexpr static const char* const name = \"SSSP - Distributed Heterogeneous \"\n                                          \"with worklist.\";\nconstexpr static const char* const desc = \"Variant of Chaotic relaxation SSSP \"\n                                          \"on Distributed Galois.\";\nconstexpr static const char* const url = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  if (net.ID == 0) {\n    galois::runtime::reportParam(\"SSSP\", \"Max Iterations\", maxIterations);\n    galois::runtime::reportParam(\"SSSP\", \"Source Node ID\", src_node);\n  }\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n\n  
std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, unsigned int>(&cuda_ctx);\n#else\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<NodeData, unsigned int>();\n#endif\n\n  bitset_dist_current.resize(hg->size());\n\n  galois::gPrint(\"[\", net.ID, \"] InitializeGraph::go called\\n\");\n\n  InitializeGraph::go((*hg));\n  galois::runtime::getHostBarrier().wait();\n\n  // accumulators for use in operators\n  galois::DGAccumulator<uint64_t> DGAccumulator_sum;\n  galois::DGAccumulator<uint64_t> dg_avge;\n  galois::DGReduceMax<uint32_t> m;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] SSSP::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    if (execution == Async) {\n      SSSP<true>::go(*hg);\n    } else {\n      SSSP<false>::go(*hg);\n    }\n    StatTimer_main.stop();\n\n    SSSPSanityCheck::go(*hg, DGAccumulator_sum, m, dg_avge);\n\n    if ((run + 1) != numRuns) {\n      if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n        bitset_dist_current_reset_cuda(cuda_ctx);\n#else\n        abort();\n#endif\n      } else {\n        bitset_dist_current.reset();\n      }\n\n      (*syncSubstrate).set_num_run(run + 1);\n      InitializeGraph::go(*hg);\n      galois::runtime::getHostBarrier().wait();\n    }\n  }\n\n  StatTimer_total.stop();\n\n  if (output) {\n    std::vector<uint32_t> results = makeResults(hg);\n    auto globalIDs                = hg->getMasterGlobalIDs();\n    assert(results.size() == globalIDs.size());\n\n    writeOutput(outputLocation, \"distance\", results.data(), results.size(),\n                globalIDs.data());\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_push_cuda.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=False $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ dyn_lb=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = true;\n#include \"sssp_push_cuda.cuh\"\nstatic const int __tb_FirstItr_SSSP = TB_SIZE;\nstatic const int __tb_SSSP = TB_SIZE;\n__global__ void InitializeGraph(CSRGraph graph, unsigned int __begin, unsigned int __end, const uint32_t  local_infinity, unsigned long long local_src_node, uint32_t * p_dist_current, uint32_t * p_dist_old)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      p_dist_current[src] = (graph.node_data[src] == local_src_node) ? 0 : local_infinity;\n      p_dist_old[src] = (graph.node_data[src] == local_src_node) ? 
0 : local_infinity;\n    }\n  }\n  // FP: \"8 -> 9;\n}\n__global__ void FirstItr_SSSP_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_dist_current, uint32_t * p_dist_old, DynamicBitset& bitset_dist_current, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n    
  index_type jj;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      jj = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        uint32_t new_dist;\n        uint32_t old_dist;\n        dst = graph.getAbsDestination(jj);\n        new_dist = graph.getAbsWeight(jj) + p_dist_current[src];\n        old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n        if (old_dist > new_dist)\n        {\n          bitset_dist_current.set(dst);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"49 -> 50;\n}\n__global__ void FirstItr_SSSP(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t * p_dist_current, uint32_t * p_dist_old, DynamicBitset& bitset_dist_current, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_FirstItr_SSSP;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> 
_np_mps_total;\n    // FP: \"6 -> 7;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? true: false);\n    // FP: \"7 -> 8;\n    if (pop)\n    {\n      p_dist_old[src]  = p_dist_current[src];\n    }\n    // FP: \"10 -> 11;\n    // FP: \"13 -> 14;\n    // FP: \"14 -> 15;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"15 -> 16;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"18 -> 19;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"19 -> 20;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"20 -> 21;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"21 -> 22;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"24 -> 25;\n    // FP: \"25 -> 26;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"26 -> 27;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"27 -> 28;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"30 -> 31;\n    __syncthreads();\n    // FP: \"31 -> 32;\n    while (true)\n    {\n      // FP: \"32 -> 33;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"35 -> 36;\n      __syncthreads();\n      // FP: \"36 -> 37;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"37 -> 38;\n        __syncthreads();\n        // FP: \"38 -> 39;\n        break;\n      }\n      // FP: \"40 -> 41;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"43 -> 44;\n      __syncthreads();\n      // FP: \"44 -> 45;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"45 -> 46;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"48 -> 49;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"49 -> 50;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type jj;\n        jj = ns +_np_j;\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = graph.getAbsWeight(jj) + p_dist_current[src];\n          old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(dst);\n          }\n        }\n      }\n      // FP: \"62 -> 63;\n      __syncthreads();\n    }\n    // FP: \"64 -> 65;\n\n    // FP: \"65 -> 66;\n    {\n      const int warpid = threadIdx.x / 32;\n   
   // FP: \"66 -> 67;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"67 -> 68;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type jj;\n          jj = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            uint32_t new_dist;\n            uint32_t old_dist;\n            dst = graph.getAbsDestination(jj);\n            new_dist = graph.getAbsWeight(jj) + p_dist_current[src];\n            old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n            if (old_dist > new_dist)\n            {\n              bitset_dist_current.set(dst);\n            }\n          }\n        }\n      }\n      // FP: \"90 -> 91;\n      __syncthreads();\n      // FP: \"91 -> 92;\n    }\n\n    // FP: \"92 -> 93;\n    __syncthreads();\n    // FP: \"93 -> 94;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"94 -> 95;\n    while (_np.work())\n    {\n      // FP: \"95 -> 96;\n      int _np_i =0;\n      // FP: \"96 -> 97;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"97 -> 98;\n      __syncthreads();\n      // FP: \"98 -> 99;\n\n      // FP: \"99 -> 100;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE 
&& _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type jj;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        jj= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          dst = graph.getAbsDestination(jj);\n          new_dist = graph.getAbsWeight(jj) + p_dist_current[src];\n          old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(dst);\n          }\n        }\n      }\n      // FP: \"113 -> 114;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"114 -> 115;\n      __syncthreads();\n    }\n    // FP: \"116 -> 117;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"118 -> 119;\n}\n__global__ void SSSP_TB_LB(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_priority, uint32_t * p_dist_current, uint32_t * p_dist_old, DynamicBitset& bitset_dist_current, HGAccumulator<unsigned int> active_vertices, HGAccumulator<unsigned int> work_edges, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned src;\n  unsigned int offset;\n  unsigned int current_work;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  // FP: \"9 -> 10;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  // FP: \"10 -> 11;\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n  // FP: \"11 -> 12;\n\n  // FP: \"12 -> 
13;\n  __syncthreads();\n  // FP: \"13 -> 14;\n\n  // FP: \"14 -> 15;\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  // FP: \"17 -> 18;\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type jj;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      src= thread_src_wl.in_wl().dwl[src_index];\n      jj = (graph).getFirstEdge(src)+ offset;\n      {\n        index_type dst;\n        uint32_t new_dist;\n        uint32_t old_dist;\n        work_edges.reduce( 1);\n        dst = graph.getAbsDestination(jj);\n        new_dist = graph.getAbsWeight(jj) + p_dist_current[src];\n        old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n        if (old_dist > new_dist)\n        {\n          bitset_dist_current.set(dst);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n    __syncthreads();\n  }\n  // FP: \"50 -> 51;\n}\n__global__ void SSSP(CSRGraph graph, unsigned int __begin, unsigned int __end, uint32_t local_priority, uint32_t * p_dist_current, uint32_t * p_dist_old, DynamicBitset& bitset_dist_current, HGAccumulator<unsigned int> active_vertices, HGAccumulator<unsigned int> work_edges, 
PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_SSSP;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage active_vertices_ts;\n  __shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage work_edges_ts;\n  index_type src_end;\n  index_type src_rup;\n  // FP: \"1 -> 2;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  // FP: \"2 -> 3;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n  unsigned d_limit = DEGREE_LIMIT;\n  // FP: \"3 -> 4;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  // FP: \"4 -> 5;\n  __shared__ npsTy nps ;\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  active_vertices.thread_entry();\n  // FP: \"7 -> 8;\n  // FP: \"8 -> 9;\n  work_edges.thread_entry();\n  // FP: \"9 -> 10;\n  src_end = __end;\n  src_rup = ((__begin) + roundup(((__end) - (__begin)), (blockDim.x)));\n  for (index_type src = __begin + tid; src < src_rup; src += nthreads)\n  {\n    int index;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    // FP: \"10 -> 11;\n    bool pop  = src < __end && ((( src < (graph).nnodes )) ? 
true: false);\n    // FP: \"11 -> 12;\n    if (pop)\n    {\n      if (p_dist_old[src] > p_dist_current[src])\n      {\n        active_vertices.reduce( 1);\n        if (local_priority > p_dist_current[src])\n        {\n          p_dist_old[src] = p_dist_current[src];\n        }\n        else\n        {\n          pop = false;\n        }\n      }\n      else\n      {\n        pop = false;\n      }\n    }\n    // FP: \"19 -> 20;\n    // FP: \"22 -> 23;\n    // FP: \"23 -> 24;\n    int threshold = TOTAL_THREADS_1D;\n    // FP: \"24 -> 25;\n    if (pop && (graph).getOutDegree(src) >= threshold)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(src);\n      thread_src_wl.in_wl().dwl[index] = src;\n      pop = false;\n    }\n    // FP: \"27 -> 28;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    // FP: \"28 -> 29;\n    __shared__ struct { index_type src; } _np_closure [TB_SIZE];\n    // FP: \"29 -> 30;\n    _np_closure[threadIdx.x].src = src;\n    // FP: \"30 -> 31;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(src);\n      _np.start = (graph).getFirstEdge(src);\n    }\n    // FP: \"33 -> 34;\n    // FP: \"34 -> 35;\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    // FP: \"35 -> 36;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    // FP: \"36 -> 37;\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    // FP: \"39 -> 40;\n    __syncthreads();\n    // FP: \"40 -> 41;\n    while (true)\n    {\n      // FP: \"41 -> 42;\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      // FP: \"44 -> 45;\n      __syncthreads();\n      // FP: \"45 -> 46;\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        // FP: \"46 -> 47;\n        __syncthreads();\n        // FP: \"47 -> 48;\n        break;\n      }\n      // FP: \"49 -> 50;\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      // FP: \"52 -> 53;\n      __syncthreads();\n      // FP: \"53 -> 54;\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      // FP: \"54 -> 55;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      // FP: \"57 -> 58;\n      assert(nps.tb.src < __kernel_tb_size);\n      src = _np_closure[nps.tb.src].src;\n      // FP: \"58 -> 59;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type jj;\n        jj = ns +_np_j;\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          work_edges.reduce( 1);\n          dst = graph.getAbsDestination(jj);\n          new_dist = graph.getAbsWeight(jj) + p_dist_current[src];\n          old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(dst);\n          }\n        }\n      }\n      // FP: \"72 -> 73;\n      __syncthreads();\n    }\n    // FP: \"74 -> 75;\n\n    // FP: \"75 -> 76;\n    {\n      const 
int warpid = threadIdx.x / 32;\n      // FP: \"76 -> 77;\n      const int _np_laneid = cub::LaneId();\n      // FP: \"77 -> 78;\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        src = _np_closure[nps.warp.src[warpid]].src;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type jj;\n          jj = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            uint32_t new_dist;\n            uint32_t old_dist;\n            work_edges.reduce( 1);\n            dst = graph.getAbsDestination(jj);\n            new_dist = graph.getAbsWeight(jj) + p_dist_current[src];\n            old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n            if (old_dist > new_dist)\n            {\n              bitset_dist_current.set(dst);\n            }\n          }\n        }\n      }\n      // FP: \"101 -> 102;\n      __syncthreads();\n      // FP: \"102 -> 103;\n    }\n\n    // FP: \"103 -> 104;\n    __syncthreads();\n    // FP: \"104 -> 105;\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    // FP: \"105 -> 106;\n    while (_np.work())\n    {\n      // FP: \"106 -> 107;\n      int _np_i =0;\n      // FP: \"107 -> 108;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      // FP: \"108 -> 109;\n      __syncthreads();\n      // FP: 
\"109 -> 110;\n\n      // FP: \"110 -> 111;\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type jj;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        src = _np_closure[nps.fg.src[_np_i]].src;\n        jj= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          uint32_t new_dist;\n          uint32_t old_dist;\n          work_edges.reduce( 1);\n          dst = graph.getAbsDestination(jj);\n          new_dist = graph.getAbsWeight(jj) + p_dist_current[src];\n          old_dist = atomicTestMin(&p_dist_current[dst], new_dist);\n          if (old_dist > new_dist)\n          {\n            bitset_dist_current.set(dst);\n          }\n        }\n      }\n      // FP: \"125 -> 126;\n      _np.execute_round_done(ITSIZE);\n      // FP: \"126 -> 127;\n      __syncthreads();\n    }\n    // FP: \"128 -> 129;\n    assert(threadIdx.x < __kernel_tb_size);\n    src = _np_closure[threadIdx.x].src;\n  }\n  // FP: \"132 -> 133;\n  active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts);\n  // FP: \"133 -> 134;\n  work_edges.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(work_edges_ts);\n  // FP: \"134 -> 135;\n}\n__global__ void SSSPSanityCheck(CSRGraph graph, unsigned int __begin, unsigned int __end, const uint32_t  local_infinity, uint32_t * p_dist_current, HGAccumulator<uint64_t> DGAccumulator_sum, HGAccumulator<uint64_t> dg_avg, HGReduceMax<uint32_t> DGMax)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage DGAccumulator_sum_ts;\n  __shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage dg_avg_ts;\n  __shared__ cub::BlockReduce<uint32_t, TB_SIZE>::TempStorage DGMax_ts;\n  index_type src_end;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  DGAccumulator_sum.thread_entry();\n  // FP: \"3 -> 4;\n  // FP: \"4 -> 5;\n  
dg_avg.thread_entry();\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  DGMax.thread_entry();\n  // FP: \"7 -> 8;\n  src_end = __end;\n  for (index_type src = __begin + tid; src < src_end; src += nthreads)\n  {\n    bool pop  = src < __end;\n    if (pop)\n    {\n      if (p_dist_current[src] < local_infinity)\n      {\n        DGAccumulator_sum.reduce( 1);\n        DGMax.reduce(p_dist_current[src]);\n        dg_avg.reduce( p_dist_current[src]);\n      }\n    }\n  }\n  // FP: \"17 -> 18;\n  DGAccumulator_sum.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_sum_ts);\n  // FP: \"18 -> 19;\n  dg_avg.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(dg_avg_ts);\n  // FP: \"19 -> 20;\n  DGMax.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(DGMax_ts);\n  // FP: \"20 -> 21;\n}\nvoid InitializeGraph_cuda(unsigned int  __begin, unsigned int  __end, const uint32_t & local_infinity, unsigned long long local_src_node, struct CUDA_Context*  ctx)\n{\n  t_work.init_thread_work(ctx->gg.nnodes);\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  InitializeGraph <<<blocks, threads>>>(ctx->gg, __begin, __end, local_infinity, local_src_node, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid InitializeGraph_allNodes_cuda(const uint32_t & local_infinity, unsigned long long local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->gg.nnodes, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid InitializeGraph_masterNodes_cuda(const uint32_t & local_infinity, unsigned long long local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid 
InitializeGraph_nodesWithEdges_cuda(const uint32_t & local_infinity, unsigned long long local_src_node, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_infinity, local_src_node, ctx);\n  // FP: \"2 -> 3;\n}\nvoid FirstItr_SSSP_cuda(unsigned int  __begin, unsigned int  __end, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  FirstItr_SSSP <<<blocks, __tb_FirstItr_SSSP>>>(ctx->gg, __begin, __end, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      FirstItr_SSSP_TB_LB <<<blocks, __tb_FirstItr_SSSP>>>(ctx->gg, __begin, __end, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"5 -> 6;\n  check_cuda_kernel;\n  // FP: \"6 -> 7;\n}\nvoid FirstItr_SSSP_allNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  FirstItr_SSSP_cuda(0, ctx->gg.nnodes, ctx);\n  // FP: \"2 -> 3;\n}\nvoid FirstItr_SSSP_masterNodes_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  FirstItr_SSSP_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx);\n  // FP: \"2 -> 3;\n}\nvoid FirstItr_SSSP_nodesWithEdges_cuda(struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  FirstItr_SSSP_cuda(0, ctx->numNodesWithEdges, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSP_cuda(unsigned int  __begin, unsigned int  __end, unsigned int & active_vertices, 
unsigned int & work_edges, uint32_t local_priority, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<unsigned int> _active_vertices;\n  HGAccumulator<unsigned int> _work_edges;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<unsigned int> active_verticesval  = Shared<unsigned int>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(active_verticesval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _active_vertices.rv = active_verticesval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  Shared<unsigned int> work_edgesval  = Shared<unsigned int>(1);\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  *(work_edgesval.cpu_wr_ptr()) = 0;\n  // FP: \"11 -> 12;\n  _work_edges.rv = work_edgesval.gpu_wr_ptr();\n  // FP: \"12 -> 13;\n  SSSP <<<blocks, __tb_SSSP>>>(ctx->gg, __begin, __end, local_priority, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), _active_vertices, _work_edges, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb);\n  cudaDeviceSynchronize();\n  if (enable_lb)\n  {\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      SSSP_TB_LB <<<blocks, __tb_SSSP>>>(ctx->gg, __begin, __end, local_priority, ctx->dist_current.data.gpu_wr_ptr(), ctx->dist_old.data.gpu_wr_ptr(), *(ctx->dist_current.is_updated.gpu_rd_ptr()), _active_vertices, _work_edges, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n      t_work.reset_thread_work();\n    }\n  }\n  // FP: \"13 -> 14;\n  check_cuda_kernel;\n  // FP: \"14 -> 15;\n  active_vertices = *(active_verticesval.cpu_rd_ptr());\n  // FP: \"15 -> 16;\n  work_edges = *(work_edgesval.cpu_rd_ptr());\n  // FP: \"16 -> 17;\n}\nvoid SSSP_allNodes_cuda(unsigned int & active_vertices, unsigned int & work_edges, uint32_t 
local_priority, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSP_cuda(0, ctx->gg.nnodes, active_vertices, work_edges, local_priority, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSP_masterNodes_cuda(unsigned int & active_vertices, unsigned int & work_edges, uint32_t local_priority, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSP_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, work_edges, local_priority, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSP_nodesWithEdges_cuda(unsigned int & active_vertices, unsigned int & work_edges, uint32_t local_priority, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSP_cuda(0, ctx->numNodesWithEdges, active_vertices, work_edges, local_priority, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSPSanityCheck_cuda(unsigned int  __begin, unsigned int  __end, uint64_t & DGAccumulator_sum, uint64_t & dg_avg, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  dim3 blocks;\n  dim3 threads;\n  HGAccumulator<uint64_t> _DGAccumulator_sum;\n  HGAccumulator<uint64_t> _dg_avg;\n  HGReduceMax<uint32_t> _DGMax;\n  // FP: \"1 -> 2;\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  kernel_sizing(blocks, threads);\n  // FP: \"4 -> 5;\n  Shared<uint64_t> DGAccumulator_sumval  = Shared<uint64_t>(1);\n  // FP: \"5 -> 6;\n  // FP: \"6 -> 7;\n  *(DGAccumulator_sumval.cpu_wr_ptr()) = 0;\n  // FP: \"7 -> 8;\n  _DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  Shared<uint64_t> dg_avgval  = Shared<uint64_t>(1);\n  // FP: \"9 -> 10;\n  // FP: \"10 -> 11;\n  *(dg_avgval.cpu_wr_ptr()) = 0;\n  // FP: \"11 -> 12;\n  _dg_avg.rv = dg_avgval.gpu_wr_ptr();\n  // FP: \"12 -> 13;\n  Shared<uint32_t> DGMaxval  = Shared<uint32_t>(1);\n  // FP: \"13 -> 14;\n  // FP: \"14 -> 15;\n  *(DGMaxval.cpu_wr_ptr()) = 0;\n  // FP: \"15 -> 16;\n  _DGMax.rv = DGMaxval.gpu_wr_ptr();\n  // FP: \"16 -> 17;\n  SSSPSanityCheck <<<blocks, threads>>>(ctx->gg, __begin, __end, local_infinity, 
ctx->dist_current.data.gpu_wr_ptr(), _DGAccumulator_sum, _dg_avg, _DGMax);\n  cudaDeviceSynchronize();\n  // FP: \"17 -> 18;\n  check_cuda_kernel;\n  // FP: \"18 -> 19;\n  DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr());\n  // FP: \"19 -> 20;\n  dg_avg = *(dg_avgval.cpu_rd_ptr());\n  // FP: \"20 -> 21;\n  DGMax = *(DGMaxval.cpu_rd_ptr());\n  // FP: \"21 -> 22;\n}\nvoid SSSPSanityCheck_allNodes_cuda(uint64_t & DGAccumulator_sum, uint64_t & dg_avg, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSPSanityCheck_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSPSanityCheck_masterNodes_cuda(uint64_t & DGAccumulator_sum, uint64_t & dg_avg, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSPSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}\nvoid SSSPSanityCheck_nodesWithEdges_cuda(uint64_t & DGAccumulator_sum, uint64_t & dg_avg, uint32_t & DGMax, const uint32_t & local_infinity, struct CUDA_Context*  ctx)\n{\n  // FP: \"1 -> 2;\n  SSSPSanityCheck_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx);\n  // FP: \"2 -> 3;\n}"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_push_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"sssp_push_cuda.h\"\n#include \"galois/runtime/cuda/DeviceSync.h\"\n\nstruct CUDA_Context : public CUDA_Context_Common {\n\tstruct CUDA_Context_Field<uint32_t> dist_current;\n\tstruct CUDA_Context_Field<uint32_t> dist_old;\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, MarshalGraph &g, unsigned num_hosts) {\n\tsize_t mem_usage = mem_usage_CUDA_common(g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->dist_current, g, num_hosts);\n\tmem_usage += mem_usage_CUDA_field(&ctx->dist_old, g, num_hosts);\n\tprintf(\"[%d] Host memory for communication context: %3u MB\\n\", ctx->id, mem_usage/1048756);\n\tload_graph_CUDA_common(ctx, g, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->dist_current, num_hosts);\n\tload_graph_CUDA_field(ctx, &ctx->dist_old, num_hosts);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n\tctx->dist_current.data.zero_gpu();\n\tctx->dist_old.data.zero_gpu();\n}\n\nvoid get_bitset_dist_current_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->dist_current.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->dist_current.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx, size_t begin, size_t end) {\n\treset_bitset_field(&ctx->dist_current, begin, end);\n}\n\nuint32_t get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_rd_ptr();\n\treturn 
dist_current[LID];\n}\n\nvoid set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tdist_current[LID] = v;\n}\n\nvoid add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tdist_current[LID] += v;\n}\n\nbool min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_current = ctx->dist_current.data.cpu_wr_ptr();\n\tif (dist_current[LID] > v){\n\t\tdist_current[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_current, from_id, v);\n}\n\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_current, from_id, v);\n}\n\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_current, from_id, v, i);\n}\n\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, 
true>(ctx, &ctx->dist_current, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->dist_current, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_dist_current_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->dist_current, begin, end, v);\n}\n\nvoid get_bitset_dist_old_cuda(struct CUDA_Context* ctx, uint64_t* bitset_compute) {\n\tctx->dist_old.is_updated.cpu_rd_ptr()->copy_to_cpu(bitset_compute);\n}\n\nvoid bitset_dist_old_reset_cuda(struct CUDA_Context* ctx) {\n\tctx->dist_old.is_updated.cpu_rd_ptr()->reset();\n}\n\nvoid bitset_dist_old_reset_cuda(struct CUDA_Context* ctx, 
size_t begin, size_t end) {\n\treset_bitset_field(&ctx->dist_old, begin, end);\n}\n\nuint32_t get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID) {\n\tuint32_t *dist_old = ctx->dist_old.data.cpu_rd_ptr();\n\treturn dist_old[LID];\n}\n\nvoid set_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_old = ctx->dist_old.data.cpu_wr_ptr();\n\tdist_old[LID] = v;\n}\n\nvoid add_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_old = ctx->dist_old.data.cpu_wr_ptr();\n\tdist_old[LID] += v;\n}\n\nbool min_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v) {\n\tuint32_t *dist_old = ctx->dist_old.data.cpu_wr_ptr();\n\tif (dist_old[LID] > v){\n\t\tdist_old[LID] = v;\n\t\treturn true;\n\t}\n\treturn false;\n}\n\nvoid batch_get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_old, from_id, v);\n}\n\nvoid batch_get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMaster, false>(ctx, &ctx->dist_old, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_old, from_id, v);\n}\n\nvoid batch_get_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, false>(ctx, &ctx->dist_old, from_id, v, v_size, data_mode);\n}\n\nvoid batch_get_reset_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_old, from_id, v, i);\n}\n\nvoid batch_get_reset_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned 
from_id, uint8_t* v, size_t* v_size, DataCommMode* data_mode, uint32_t i) {\n\tbatch_get_shared_field<uint32_t, sharedMirror, true>(ctx, &ctx->dist_old, from_id, v, v_size, data_mode, i);\n}\n\nvoid batch_set_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, setOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_set_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, setOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_add_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, addOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_add_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, addOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_min_mirror_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMirror, minOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_min_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id, uint8_t* v, DataCommMode data_mode) {\n\tbatch_set_shared_field<uint32_t, sharedMaster, minOp>(ctx, &ctx->dist_old, from_id, v, data_mode);\n}\n\nvoid batch_reset_node_dist_old_cuda(struct CUDA_Context* ctx, size_t begin, size_t end, uint32_t v) {\n\treset_data_field<uint32_t>(&ctx->dist_old, begin, end, v);\n}\n\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_push_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/HostDecls.h\"\n\nvoid get_bitset_dist_current_cuda(struct CUDA_Context* ctx,\n                                  uint64_t* bitset_compute);\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_dist_current_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end);\nuint32_t get_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid add_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nbool min_node_dist_current_cuda(struct CUDA_Context* ctx, unsigned LID,\n                                uint32_t v);\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v);\nvoid batch_get_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      size_t* v_size, DataCommMode* data_mode);\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             size_t* v_size,\n                                             DataCommMode* data_mode);\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n                                            uint32_t i);\nvoid batch_get_reset_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                            unsigned from_id, uint8_t* v,\n    
                                        size_t* v_size,\n                                            DataCommMode* data_mode,\n                                            uint32_t i);\nvoid batch_set_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_set_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_add_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_add_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_min_mirror_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                             unsigned from_id, uint8_t* v,\n                                             DataCommMode data_mode);\nvoid batch_min_node_dist_current_cuda(struct CUDA_Context* ctx,\n                                      unsigned from_id, uint8_t* v,\n                                      DataCommMode data_mode);\nvoid batch_reset_node_dist_current_cuda(struct CUDA_Context* ctx, size_t begin,\n                                        size_t end, uint32_t v);\n\nvoid get_bitset_dist_old_cuda(struct CUDA_Context* ctx,\n                              uint64_t* bitset_compute);\nvoid bitset_dist_old_reset_cuda(struct CUDA_Context* ctx);\nvoid bitset_dist_old_reset_cuda(struct CUDA_Context* ctx, size_t begin,\n                                size_t end);\nuint32_t get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID);\nvoid set_node_dist_old_cuda(struct 
CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid add_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nbool min_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned LID, uint32_t v);\nvoid batch_get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v);\nvoid batch_get_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, size_t* v_size,\n                                  DataCommMode* data_mode);\nvoid batch_get_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v);\nvoid batch_get_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         size_t* v_size,\n                                         DataCommMode* data_mode);\nvoid batch_get_reset_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        uint32_t i);\nvoid batch_get_reset_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                        unsigned from_id, uint8_t* v,\n                                        size_t* v_size, DataCommMode* data_mode,\n                                        uint32_t i);\nvoid batch_set_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_set_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_add_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid 
batch_add_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_min_mirror_node_dist_old_cuda(struct CUDA_Context* ctx,\n                                         unsigned from_id, uint8_t* v,\n                                         DataCommMode data_mode);\nvoid batch_min_node_dist_old_cuda(struct CUDA_Context* ctx, unsigned from_id,\n                                  uint8_t* v, DataCommMode data_mode);\nvoid batch_reset_node_dist_old_cuda(struct CUDA_Context* ctx, size_t begin,\n                                    size_t end, uint32_t v);\n\nvoid FirstItr_SSSP_cuda(unsigned int __begin, unsigned int __end,\n                        struct CUDA_Context* ctx);\nvoid FirstItr_SSSP_allNodes_cuda(struct CUDA_Context* ctx);\nvoid FirstItr_SSSP_masterNodes_cuda(struct CUDA_Context* ctx);\nvoid FirstItr_SSSP_nodesWithEdges_cuda(struct CUDA_Context* ctx);\nvoid InitializeGraph_cuda(unsigned int __begin, unsigned int __end,\n                          const uint32_t& local_infinity,\n                          unsigned long long local_src_node,\n                          struct CUDA_Context* ctx);\nvoid InitializeGraph_allNodes_cuda(const uint32_t& local_infinity,\n                                   unsigned long long local_src_node,\n                                   struct CUDA_Context* ctx);\nvoid InitializeGraph_masterNodes_cuda(const uint32_t& local_infinity,\n                                      unsigned long long local_src_node,\n                                      struct CUDA_Context* ctx);\nvoid InitializeGraph_nodesWithEdges_cuda(const uint32_t& local_infinity,\n                                         unsigned long long local_src_node,\n                                         struct CUDA_Context* ctx);\nvoid SSSP_cuda(unsigned int __begin, unsigned int __end,\n               unsigned int& active_vertices, unsigned int& work_edges,\n               uint32_t 
local_priority, struct CUDA_Context* ctx);\nvoid SSSPSanityCheck_cuda(unsigned int __begin, unsigned int __end,\n                          uint64_t& DGAccumulator_sum, uint64_t& dg_avg,\n                          uint32_t& DGMax, const uint32_t& local_infinity,\n                          struct CUDA_Context* ctx);\nvoid SSSPSanityCheck_allNodes_cuda(uint64_t& DGAccumulator_sum,\n                                   uint64_t& dg_avg, uint32_t& DGMax,\n                                   const uint32_t& local_infinity,\n                                   struct CUDA_Context* ctx);\nvoid SSSPSanityCheck_masterNodes_cuda(uint64_t& DGAccumulator_sum,\n                                      uint64_t& dg_avg, uint32_t& DGMax,\n                                      const uint32_t& local_infinity,\n                                      struct CUDA_Context* ctx);\nvoid SSSPSanityCheck_nodesWithEdges_cuda(uint64_t& DGAccumulator_sum,\n                                         uint64_t& dg_avg, uint32_t& DGMax,\n                                         const uint32_t& local_infinity,\n                                         struct CUDA_Context* ctx);\nvoid SSSP_allNodes_cuda(unsigned int& active_vertices, unsigned int& work_edges,\n                        uint32_t local_priority, struct CUDA_Context* ctx);\nvoid SSSP_masterNodes_cuda(unsigned int& active_vertices,\n                           unsigned int& work_edges, uint32_t local_priority,\n                           struct CUDA_Context* ctx);\nvoid SSSP_nodesWithEdges_cuda(unsigned int& active_vertices,\n                              unsigned int& work_edges, uint32_t local_priority,\n                              struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_push_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"sssp_push_cuda.cuh\", system = False)], parse = False),\nKernel(\"InitializeGraph\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const uint32_t ', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('uint32_t *', 'p_dist_current'), ('uint32_t *', 'p_dist_old')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_dist_current[src] = (graph.node_data[src] == local_src_node) ? 0 : local_infinity\"]),\nCBlock([\"p_dist_old[src] = (graph.node_data[src] == local_src_node) ? 0 : local_infinity\"]),\n]),\n]),\n]),\nKernel(\"FirstItr_SSSP\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t *', 'p_dist_current'), ('uint32_t *', 'p_dist_old'), ('DynamicBitset&', 'bitset_dist_current')],\n[\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nCBlock([\"p_dist_old[src]  = p_dist_current[src]\"]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"jj\", G.edges(\"src\"),\n[\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(jj)\"]),\nCDecl([(\"uint32_t\", \"new_dist\", \"\")]),\nCBlock([\"new_dist = graph.getAbsWeight(jj) + p_dist_current[src]\"]),\nCDecl([(\"uint32_t\", \"old_dist\", \"\")]),\nCBlock([\"old_dist = atomicTestMin(&p_dist_current[dst], new_dist)\"]),\nIf(\"old_dist > new_dist\",\n[\nCBlock([\"bitset_dist_current.set(dst)\"]),\n]),\n]),\n),\n]),\n]),\nKernel(\"SSSP\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('uint32_t', 'local_priority'), ('uint32_t *', 'p_dist_current'), ('uint32_t *', 
'p_dist_old'), ('DynamicBitset&', 'bitset_dist_current'), ('HGAccumulator<unsigned int>', 'active_vertices'), ('HGAccumulator<unsigned int>', 'work_edges')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"active_vertices_ts\", \"\")]),\nCBlock([\"active_vertices.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"work_edges_ts\", \"\")]),\nCBlock([\"work_edges.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_dist_old[src] > p_dist_current[src]\",\n[\nCBlock([\"active_vertices.reduce( 1)\"]),\nIf(\"local_priority > p_dist_current[src]\",\n[\nCBlock([\"p_dist_old[src] = p_dist_current[src]\"]),\n], [ CBlock([\"pop = false\"]), ]),\n], [ CBlock([\"pop = false\"]), ]),\n]),\nUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\nClosureHint(\nForAll(\"jj\", G.edges(\"src\"),\n[\nCBlock([\"work_edges.reduce( 1)\"]),\nCDecl([(\"index_type\", \"dst\", \"\")]),\nCBlock([\"dst = graph.getAbsDestination(jj)\"]),\nCDecl([(\"uint32_t\", \"new_dist\", \"\")]),\nCBlock([\"new_dist = graph.getAbsWeight(jj) + p_dist_current[src]\"]),\nCDecl([(\"uint32_t\", \"old_dist\", \"\")]),\nCBlock([\"old_dist = atomicTestMin(&p_dist_current[dst], new_dist)\"]),\nIf(\"old_dist > new_dist\",\n[\nCBlock([\"bitset_dist_current.set(dst)\"]),\n]),\n]),\n),\n]),\nCBlock([\"active_vertices.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(active_vertices_ts)\"], parse = False),\nCBlock([\"work_edges.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(work_edges_ts)\"], parse = False),\n]),\nKernel(\"SSSPSanityCheck\", [G.param(), ('unsigned int', '__begin'), ('unsigned int', '__end'), ('const uint32_t ', 'local_infinity'), ('uint32_t *', 'p_dist_current'), ('HGAccumulator<uint64_t>', 'DGAccumulator_sum'), ('HGAccumulator<uint64_t>', 'dg_avg'), ('HGReduceMax<uint32_t>', 
'DGMax')],\n[\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"DGAccumulator_sum_ts\", \"\")]),\nCBlock([\"DGAccumulator_sum.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<uint64_t, TB_SIZE>::TempStorage\", \"dg_avg_ts\", \"\")]),\nCBlock([\"dg_avg.thread_entry()\"]),\nCDecl([(\"__shared__ cub::BlockReduce<uint32_t, TB_SIZE>::TempStorage\", \"DGMax_ts\", \"\")]),\nCBlock([\"DGMax.thread_entry()\"]),\nForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n[\nCDecl([(\"bool\", \"pop\", \" = src < __end\")]),\nIf(\"pop\", [\nIf(\"p_dist_current[src] < local_infinity\",\n[\nCBlock([\"DGAccumulator_sum.reduce( 1)\"]),\nCBlock([\"DGMax.reduce(p_dist_current[src])\"]),\nCBlock([\"dg_avg.reduce( p_dist_current[src])\"]),\n]),\n]),\n]),\nCBlock([\"DGAccumulator_sum.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(DGAccumulator_sum_ts)\"], parse = False),\nCBlock([\"dg_avg.thread_exit<cub::BlockReduce<uint64_t, TB_SIZE> >(dg_avg_ts)\"], parse = False),\nCBlock([\"DGMax.thread_exit<cub::BlockReduce<uint32_t, TB_SIZE> >(DGMax_ts)\"], parse = False),\n]),\nKernel(\"InitializeGraph_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"InitializeGraph\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_infinity\", \"local_src_node\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"ctx->dist_old.data.gpu_wr_ptr()\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"InitializeGraph_allNodes_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->gg.nnodes, local_infinity, local_src_node, ctx)\"]),\n], host = 
True),\nKernel(\"InitializeGraph_masterNodes_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"InitializeGraph_nodesWithEdges_cuda\", [('const uint32_t &', 'local_infinity'), ('unsigned long long', 'local_src_node'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"InitializeGraph_cuda(0, ctx->numNodesWithEdges, local_infinity, local_src_node, ctx)\"]),\n], host = True),\nKernel(\"FirstItr_SSSP_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nInvoke(\"FirstItr_SSSP\", (\"ctx->gg\", \"__begin\", \"__end\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"ctx->dist_old.data.gpu_wr_ptr()\", \"*(ctx->dist_current.is_updated.gpu_rd_ptr())\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\n], host = True),\nKernel(\"FirstItr_SSSP_allNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"FirstItr_SSSP_cuda(0, ctx->gg.nnodes, ctx)\"]),\n], host = True),\nKernel(\"FirstItr_SSSP_masterNodes_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"FirstItr_SSSP_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, ctx)\"]),\n], host = True),\nKernel(\"FirstItr_SSSP_nodesWithEdges_cuda\", [('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"FirstItr_SSSP_cuda(0, ctx->numNodesWithEdges, ctx)\"]),\n], host = True),\nKernel(\"SSSP_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'active_vertices'), ('unsigned int &', 'work_edges'), ('uint32_t', 'local_priority'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, 
threads)\"]),\nCDecl([(\"Shared<unsigned int>\", \"active_verticesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_active_vertices\", \"\")]),\nCBlock([\"*(active_verticesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_active_vertices.rv = active_verticesval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<unsigned int>\", \"work_edgesval\", \" = Shared<unsigned int>(1)\")]),\nCDecl([(\"HGAccumulator<unsigned int>\", \"_work_edges\", \"\")]),\nCBlock([\"*(work_edgesval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_work_edges.rv = work_edgesval.gpu_wr_ptr()\"]),\nInvoke(\"SSSP\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_priority\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"ctx->dist_old.data.gpu_wr_ptr()\", \"*(ctx->dist_current.is_updated.gpu_rd_ptr())\", \"_active_vertices\", \"_work_edges\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"active_vertices = *(active_verticesval.cpu_rd_ptr())\"]),\nCBlock([\"work_edges = *(work_edgesval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"SSSP_allNodes_cuda\", [('unsigned int &', 'active_vertices'), ('unsigned int &', 'work_edges'), ('uint32_t', 'local_priority'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSP_cuda(0, ctx->gg.nnodes, active_vertices, work_edges, local_priority, ctx)\"]),\n], host = True),\nKernel(\"SSSP_masterNodes_cuda\", [('unsigned int &', 'active_vertices'), ('unsigned int &', 'work_edges'), ('uint32_t', 'local_priority'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSP_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, active_vertices, work_edges, local_priority, ctx)\"]),\n], host = True),\nKernel(\"SSSP_nodesWithEdges_cuda\", [('unsigned int &', 'active_vertices'), ('unsigned int &', 'work_edges'), ('uint32_t', 'local_priority'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSP_cuda(0, ctx->numNodesWithEdges, active_vertices, work_edges, local_priority, ctx)\"]),\n], host = True),\nKernel(\"SSSPSanityCheck_cuda\", [('unsigned int ', '__begin'), 
('unsigned int ', '__end'), ('uint64_t &', 'DGAccumulator_sum'), ('uint64_t &', 'dg_avg'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCDecl([(\"dim3\", \"blocks\", \"\")]),\nCDecl([(\"dim3\", \"threads\", \"\")]),\nCBlock([\"kernel_sizing(blocks, threads)\"]),\nCDecl([(\"Shared<uint64_t>\", \"DGAccumulator_sumval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_DGAccumulator_sum\", \"\")]),\nCBlock([\"*(DGAccumulator_sumval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGAccumulator_sum.rv = DGAccumulator_sumval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<uint64_t>\", \"dg_avgval\", \" = Shared<uint64_t>(1)\")]),\nCDecl([(\"HGAccumulator<uint64_t>\", \"_dg_avg\", \"\")]),\nCBlock([\"*(dg_avgval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_dg_avg.rv = dg_avgval.gpu_wr_ptr()\"]),\nCDecl([(\"Shared<uint32_t>\", \"DGMaxval\", \" = Shared<uint32_t>(1)\")]),\nCDecl([(\"HGReduceMax<uint32_t>\", \"_DGMax\", \"\")]),\nCBlock([\"*(DGMaxval.cpu_wr_ptr()) = 0\"]),\nCBlock([\"_DGMax.rv = DGMaxval.gpu_wr_ptr()\"]),\nInvoke(\"SSSPSanityCheck\", (\"ctx->gg\", \"__begin\", \"__end\", \"local_infinity\", \"ctx->dist_current.data.gpu_wr_ptr()\", \"_DGAccumulator_sum\", \"_dg_avg\", \"_DGMax\")),\nCBlock([\"check_cuda_kernel\"], parse = False),\nCBlock([\"DGAccumulator_sum = *(DGAccumulator_sumval.cpu_rd_ptr())\"]),\nCBlock([\"dg_avg = *(dg_avgval.cpu_rd_ptr())\"]),\nCBlock([\"DGMax = *(DGMaxval.cpu_rd_ptr())\"]),\n], host = True),\nKernel(\"SSSPSanityCheck_allNodes_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint64_t &', 'dg_avg'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSPSanityCheck_cuda(0, ctx->gg.nnodes, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"SSSPSanityCheck_masterNodes_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint64_t &', 'dg_avg'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 
'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSPSanityCheck_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx)\"]),\n], host = True),\nKernel(\"SSSPSanityCheck_nodesWithEdges_cuda\", [('uint64_t &', 'DGAccumulator_sum'), ('uint64_t &', 'dg_avg'), ('uint32_t &', 'DGMax'), ('const uint32_t &', 'local_infinity'), ('struct CUDA_Context* ', 'ctx')],\n[\nCBlock([\"SSSPSanityCheck_cuda(0, ctx->numNodesWithEdges, DGAccumulator_sum, dg_avg, DGMax, local_infinity, ctx)\"]),\n], host = True),\n])\n"
  },
  {
    "path": "lonestar/analytics/distributed/sssp/sssp_push_sync.hh",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/runtime/SyncStructures.h\"\n\nGALOIS_SYNC_STRUCTURE_REDUCE_SET(dist_current, unsigned int);\nGALOIS_SYNC_STRUCTURE_REDUCE_MIN(dist_current, unsigned int);\nGALOIS_SYNC_STRUCTURE_BITSET(dist_current);\n"
  },
  {
    "path": "lonestar/analytics/distributed/triangle-counting/CMakeLists.txt",
    "content": "app_dist(tc triangle-counting)\nadd_test_dist(triangle-counting-dist rmat15 NO_ASYNC ${BASEINPUT}/scalefree/symmetric/rmat15.csgr -symmetricGraph)\n"
  },
  {
    "path": "lonestar/analytics/distributed/triangle-counting/README.md",
    "content": "Triangle Counting\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nCounts the number of triangles in a symmetric, clean (i.e., no self-loops and\nno multiedges) graph in a multi-GPU setting. This implementation is the\none used in the paper \"DistTC: High Performance Distributed Triangle Counting\"\nwhich appeared in the Graph Challenge 2019 competition.\n\nA CPU implementation is currently in planning and will appear here once it is\nready.\n\nINPUT\n--------------------------------------------------------------------------------\n\nTakes in symmetric Galois .gr graphs that have been cleaned.\nYou must specify the -symmetricGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/analytics/distributed/triangle-counting; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run on 1 machine with a single GPU, use the following:\n`./triangle-counting-dist <symmetric-input-graph> -symmetricGraph -pset=g -num_nodes=1`\n\nTo run on a single machine with 56 CPU threads, use the following:\n`./triangle-counting-dist <symmetric-input-graph> -symmetricGraph -t=56`\n\nTo run on 3 GPUs on a machine, use the following:\n`mpirun -n=3 ./triangle-counting-dist <symmetric-input-graph> -symmetricGraph -pset=ggg -num_nodes=1`\n\nTo run on 6 GPUs on 2 machines h1 and h2 with 3 GPUs each, use the following:\n`mpirun -n=6 -hosts=h1,h2 ./triangle-counting-dist <symmetric-input-graph> -symmetricGraph -pset=ggg -num_nodes=2`\n\nTo run on 4 GPUs and 2 CPUs on 2 machines h1 and h2 with 2 GPUs and 1 CPU each, use the following:\n`mpirun -n=6 -hosts=h1,h2 ./triangle-counting-dist <symmetric-input-graph> -symmetricGraph -pset=ggc -num_nodes=2`\n\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nThe performance analysis of distributed triangle counting can be found at [1]. The key observations from our study are as follows.\n\n* On a single GPU, we do not partition the graph, so application performance is better due to the computation phase on the GPU.\n\n* For distributed multi-GPUs, we observe that the application scales. With the increase in the number of GPUs, the time taken to compute the number of triangles decreases since our algorithm is free from synchronization except for the final aggregation.\n\n[1] Loc Hoang, Vishwesh Jatala, Xuhao Chen, Udit Agarwal, Roshan Dathathri, Gurbinder Gill, and Keshav Pingali, [DistTC: High Performance Distributed Triangle Counting](https://ieeexplore.ieee.org/document/8916438). In 2019 IEEE High Performance Extreme Computing Conference, HPEC 2019. IEEE, 2019.\n"
  },
  {
    "path": "lonestar/analytics/distributed/triangle-counting/tc.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/* This is an implementation of Distributed multi-GPU triangle counting code.\n * The single GPU code which is executed on GPU is generated using the IrGL\n * compiler. 
Currently, it does not support distributed multi-CPU code.\n *\n * TODO implement CPU kernel\n */\n\n#include \"DistBench/MiningStart.h\"\n#include \"galois/DistGalois.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/DTerminationDetector.h\"\n#include \"galois/gstl.h\"\n#include \"galois/graphs/GenericPartitioners.h\"\n#include \"galois/graphs/MiningPartitioner.h\"\n#include \"galois/runtime/Tracer.h\"\n\n#include <iostream>\n#include <limits>\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"tc_cuda.h\"\nstruct CUDA_Context* cuda_ctx;\n#else\nenum { CPU, GPU_CUDA };\nint personality = CPU;\n#endif\n\nnamespace cll = llvm::cl;\n\nconstexpr static const char* const REGION_NAME = \"TC\";\n\n/*******************************************************************************\n * Graph structure declarations + other initialization\n ******************************************************************************/\n\ntypedef galois::graphs::MiningGraph<void, void, MiningPolicyDegrees> Graph;\ntypedef typename Graph::GraphNode GNode;\n\nstd::unique_ptr<galois::graphs::GluonEdgeSubstrate<Graph>> syncSubstrate;\n\ntemplate <bool async>\nstruct TC {\n  Graph* graph;\n  using DGAccumulatorTy = galois::DGAccumulator<uint64_t>;\n  DGAccumulatorTy& numTriangles;\n\n  TC(Graph* _graph, DGAccumulatorTy& _numTriangles)\n      : graph(_graph), numTriangles(_numTriangles) {}\n\n  // use the below line once CPU code is added\n  void static go(Graph& _graph) {\n    unsigned _num_iterations = 0;\n    DGAccumulatorTy numTriangles;\n    syncSubstrate->set_num_round(_num_iterations);\n    numTriangles.reset();\n    const auto& allMasterNodes = _graph.masterNodesRange();\n\n#ifdef GALOIS_ENABLE_GPU\n    if (personality == GPU_CUDA) { ///< GPU TC.\n      std::string impl_str(syncSubstrate->get_run_identifier(\"TC\"));\n      galois::StatTimer StatTimer_cuda(impl_str.c_str(), REGION_NAME);\n      StatTimer_cuda.start();\n      uint64_t num_local_triangles = 0;\n      
TC_masterNodes_cuda(num_local_triangles, cuda_ctx);\n      numTriangles += num_local_triangles;\n      StatTimer_cuda.stop();\n    } else { ///< CPU TC.\n#endif\n      galois::do_all(\n          galois::iterate(allMasterNodes), TC(&_graph, numTriangles),\n          galois::steal(),\n          galois::loopname(syncSubstrate->get_run_identifier(\"TC\").c_str()));\n#ifdef GALOIS_ENABLE_GPU\n    }\n#endif\n\n    uint64_t total_triangles = numTriangles.reduce();\n    if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n      galois::gPrint(\"Total number of triangles \", total_triangles, \"\\n\");\n    }\n  }\n\n  void operator()(GNode v) const {\n    size_t numTriangles_local = 0;\n    for (auto vIter : graph->edges(v)) {\n      GNode w                       = graph->getEdgeDst(vIter);\n      Graph::edge_iterator vIterBeg = graph->edge_begin(v);\n      Graph::edge_iterator vIterEnd = graph->edge_end(v);\n\n      for (auto wIter : graph->edges(w)) {\n        auto x                      = graph->getEdgeDst(wIter);\n        Graph::edge_iterator vvIter = vIterBeg;\n        while (graph->getEdgeDst(vvIter) < x && vvIter < vIterEnd) {\n          vvIter++;\n        }\n        if (vvIter < vIterEnd && x == graph->getEdgeDst(vvIter)) {\n          ++numTriangles_local;\n        }\n      }\n    } ///< Finding triangles is done.\n    numTriangles += numTriangles_local;\n  } ///< CPU operator is done.\n};\n\n/*******************************************************************************\n * Main\n ******************************************************************************/\n\nconstexpr static const char* const name =\n    \"TC - Distributed Multi-GPU Triangle Counting \";\nconstexpr static const char* const desc = \"TC on Distributed GPU (D-IrGL).\";\nconstexpr static const char* const url  = nullptr;\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  DistBenchStart(argc, argv, name, desc, url);\n\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"This 
application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  const auto& net = galois::runtime::getSystemNetworkInterface();\n\n  galois::StatTimer StatTimer_total(\"TimerTotal\", REGION_NAME);\n\n  StatTimer_total.start();\n  std::unique_ptr<Graph> hg;\n#ifdef GALOIS_ENABLE_GPU\n  std::tie(hg, syncSubstrate) =\n      distGraphInitialization<void, void>(&cuda_ctx, false);\n#else\n  std::tie(hg, syncSubstrate) = distGraphInitialization<void, void>(false);\n#endif\n\n  if (personality == GPU_CUDA) {\n#ifdef GALOIS_ENABLE_GPU\n    std::string timer_str(\"SortEdgesGPU\");\n    galois::StatTimer edgeSortTime(\"SortEdgesGPU\", REGION_NAME);\n    edgeSortTime.start();\n    sortEdgesByDestination_cuda(cuda_ctx);\n    edgeSortTime.stop();\n#else\n    abort();\n#endif\n  } else if (personality == CPU) {\n    galois::StatTimer edgeSortTime(\"SortEdgesCPU\", REGION_NAME);\n    edgeSortTime.start();\n    hg->sortEdgesByDestination();\n    edgeSortTime.stop();\n  }\n  ///! accumulators for use in operators\n  galois::DGAccumulator<uint64_t> DGAccumulator_numTriangles;\n\n  for (auto run = 0; run < numRuns; ++run) {\n    galois::gPrint(\"[\", net.ID, \"] TC::go run \", run, \" called\\n\");\n    std::string timer_str(\"Timer_\" + std::to_string(run));\n    galois::StatTimer StatTimer_main(timer_str.c_str(), REGION_NAME);\n\n    StatTimer_main.start();\n    TC<false>::go(*hg);\n    StatTimer_main.stop();\n\n    syncSubstrate->set_num_run(run + 1);\n  }\n  StatTimer_total.stop();\n\n  if (output) {\n    galois::gError(\"output requested but this application doesn't support it\");\n    return 1;\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/triangle-counting/tc_cuda.cu",
    "content": "#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\n#include \"moderngpu/kernel_reduce.hxx\"\n#include \"tc_cuda.cuh\"\n#include \"moderngpu/kernel_segsort.hxx\"\n#include <cuda_profiler_api.h>\n#define WARP_SIZE 32\n\ninline __device__ unsigned long intersect(CSRGraph graph, index_type u, index_type v) {\n\tindex_type u_start = graph.getFirstEdge(u);\n\tindex_type u_end = u_start + graph.getOutDegree(u);\n\tindex_type v_start = graph.getFirstEdge(v);\n\tindex_type v_end = v_start + graph.getOutDegree(v);\n\tunsigned long count = 0;\n\tindex_type u_it = u_start;\n\tindex_type v_it = v_start;\n\tindex_type a;\n\tindex_type b;\n\twhile (u_it < u_end && v_it < v_end) {\n\t\ta = graph.getAbsDestination(u_it);\n\t\tb = graph.getAbsDestination(v_it);\n\t\tint d = a - b;\n\t\tif (d <= 0) u_it++;\n\t\tif (d >= 0) v_it++;\n\t\tif (d == 0) count++;\n\t}\n\treturn count;\n}\n\n__global__ void base(CSRGraph graph, unsigned begin, unsigned end, HGAccumulator<unsigned long> num_local_triangles) {\n\tunsigned tid = blockIdx.x * blockDim.x + threadIdx.x;\n\tunsigned long local_total = 0;\n\t__shared__ cub::BlockReduce<unsigned long, TB_SIZE>::TempStorage num_local_triangles_ts;\n\tnum_local_triangles.thread_entry();\n\tfor (index_type src = begin + tid; src < end; src += TOTAL_THREADS_1D) {\n\t\tindex_type row_begin = graph.getFirstEdge(src);\n\t\tindex_type row_end = row_begin + graph.getOutDegree(src); \n\t\tfor (index_type offset = row_begin; offset < row_end; ++ offset) {\n\t\t\tindex_type dst = graph.getAbsDestination(offset);\n\t\t\tlocal_total = intersect(graph, dst, src);\n\t\t\tif (local_total) num_local_triangles.reduce(local_total);\n\t\t}\n\t}\n\tnum_local_triangles.thread_exit<cub::BlockReduce<unsigned long, TB_SIZE> >(num_local_triangles_ts);\n}\n\ninline __device__ bool serial_search(CSRGraph graph, 
unsigned key, index_type begin, index_type end) {\n\tfor (index_type offset = begin; offset < end; ++ offset) {\n\t\tindex_type d = graph.getAbsDestination(offset);\n\t\tif (d == key) return true;\n\t\tif (d > key) return false;\n\t}\n\treturn false;\n}\n\ninline __device__ bool binary_search(CSRGraph graph, index_type key, index_type begin, index_type end) {\n\tassert(begin < end);\n\tint l = begin;\n\tint r = end-1;\n\twhile (r >= l) { \n\t\t//assert(l<graph.nedges && r<graph.nedges);\n\t\tint mid = l + (r - l) / 2; \n\t\tif (mid >= graph.nedges) printf(\"mid=%u, l=%u, r=%u, begin=%u, end=%u, key=%u\\n\", mid, l, r, begin, end, key);\n\t\tassert(mid < graph.nedges);\n\t\tindex_type value = graph.getAbsDestination(mid);\n\t\tif (value == key) return true;\n\t\tif (value < key) l = mid + 1;\n\t\telse r = mid - 1;\n\t}\n\treturn false;\n}\n\n__global__ void warp(CSRGraph graph, unsigned begin, unsigned end, HGAccumulator<unsigned long> num_local_triangles) {\n\tunsigned thread_id   = blockIdx.x * blockDim.x + threadIdx.x;\n\tunsigned thread_lane = threadIdx.x & (WARP_SIZE-1);            // thread index within the warp\n\tunsigned warp_id     = thread_id   / WARP_SIZE;                // global warp index\n\tunsigned num_warps   = (TB_SIZE / WARP_SIZE) * gridDim.x;   // total number of active warps\n\n\t__shared__ cub::BlockReduce<unsigned long, TB_SIZE>::TempStorage num_local_triangles_ts;\n\tnum_local_triangles.thread_entry();\n\t// each warp takes one vertex\n\tfor (index_type src = begin + warp_id; src < end; src += num_warps) {\n\t\tindex_type row_begin = graph.getFirstEdge(src);\n\t\tindex_type src_size = graph.getOutDegree(src);\n\t\tindex_type row_end = row_begin + src_size;\n\t\t// take one edge\n\t\tfor (index_type offset = row_begin; offset < row_end; offset ++) {\n\t\t\tindex_type dst = graph.getAbsDestination(offset);\n\t\t\tassert(src != dst);\n\t\t\tindex_type dst_size = graph.getOutDegree(dst);\n\t\t\tindex_type lookup = src;\n\t\t\tindex_type search = 
dst;\n\t\t\tif (src_size > dst_size) {\n\t\t\t\tlookup = dst;\n\t\t\t\tsearch = src;\n\t\t\t}\n\t\t\tindex_type lookup_begin = graph.getFirstEdge(lookup);\n\t\t\tindex_type lookup_size = graph.getOutDegree(lookup);\n\t\t\tindex_type search_size = graph.getOutDegree(search);\n\t\t\tif (lookup_size > 0 && search_size > 0) {\n\t\t\t\tfor (index_type i = thread_lane; i < lookup_size; i += WARP_SIZE) {\n\t\t\t\t\tindex_type index = lookup_begin + i;\n\t\t\t\t\tindex_type key = graph.getAbsDestination(index);\n\t\t\t\t\tindex_type search_begin = graph.getFirstEdge(search);\n\t\t\t\t\tif (binary_search(graph, key, search_begin, search_begin+search_size))\n\t\t\t\t\t//if (serial_search(graph, key, search_begin, search_begin+search_size))\n\t\t\t\t\t\tnum_local_triangles.reduce(1);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tnum_local_triangles.thread_exit<cub::BlockReduce<unsigned long, TB_SIZE> >(num_local_triangles_ts);\n}\n\nvoid sortEdgesByDestination_cuda(struct CUDA_Context* ctx) {\n\tmgpu::standard_context_t context;\n        mgpu::segmented_sort(ctx->gg.edge_dst, ctx->gg.nedges, (const int *) ctx->gg.row_start + 1, ctx->gg.nnodes - 1, mgpu::less_t<int>(), context);\n}\n\nvoid TC_cuda(unsigned __begin, unsigned __end, unsigned long & num_local_triangles, struct CUDA_Context* ctx) {\n\tdim3 blocks;\n\tdim3 threads;\n\tkernel_sizing(blocks, threads);\n\tHGAccumulator<unsigned long> _num_local_triangles;\n\tShared<unsigned long> num_local_trianglesval  = Shared<unsigned long>(1);\n\t*(num_local_trianglesval.cpu_wr_ptr()) = 0;\n\t_num_local_triangles.rv = num_local_trianglesval.gpu_wr_ptr();\n\t//mgc = mgpu::CreateCudaDevice(ctx->device);\n\t//mgpu::SegSortKeysFromIndices(ctx->gg.edge_dst, ctx->gg.nedges, (const int *) ctx->gg.row_start + 1, ctx->gg.nnodes - 1, *mgc);\n\t//base<<<blocks, TB_SIZE>>>(ctx->gg, __begin, __end, _num_local_triangles);\n\twarp<<<blocks, TB_SIZE>>>(ctx->gg, __begin, __end, 
_num_local_triangles);\n\tcudaDeviceSynchronize();\n\tcheck_cuda_kernel;\n\tnum_local_triangles = *(num_local_trianglesval.cpu_rd_ptr());\n\t//dump_memory_info(\"end\", ctx->id);\n\tcudaProfilerStop();\n\t//num_local_triangles = (unsigned)h_total;\n}\n\nvoid TC_masterNodes_cuda(unsigned long& num_local_triangles, struct CUDA_Context* ctx) {\n\tTC_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, num_local_triangles, ctx);\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/triangle-counting/tc_cuda.cuh",
    "content": "#pragma once\n#include <cuda.h>\n#include <stdio.h>\n#include <sys/types.h>\n#include <unistd.h>\n#include \"tc_cuda.h\"\n#include \"galois/runtime/cuda/DeviceEdgeSync.h\"\n\nvoid dump_memory_info(const char *s, int netId) {\n  size_t total, free;\n\n  if(cudaMemGetInfo(&free, &total) == cudaSuccess) {\n    printf(\"[%d] GPU_memory_total_%s %zu\\n\", netId, s, total);\n    printf(\"[%d] GPU_memory_free_%s %zu\\n\", netId, s, free);\n  }\n}\n\nstruct CUDA_Context : public CUDA_Context_Common_Edges {\n};\n\nstruct CUDA_Context* get_CUDA_context(int id) {\n\tstruct CUDA_Context* ctx;\n\tctx = (struct CUDA_Context* ) calloc(1, sizeof(struct CUDA_Context));\n\tctx->id = id;\n\treturn ctx;\n}\n\nbool init_CUDA_context(struct CUDA_Context* ctx, int device) {\n\treturn init_CUDA_context_common_edges(ctx, device);\n}\n\nvoid load_graph_CUDA(struct CUDA_Context* ctx, EdgeMarshalGraph &g, unsigned num_hosts) {\n\tdump_memory_info(\"start\", ctx->id);\n\tload_graph_CUDA_common_edges(ctx, g, num_hosts, false);\n\treset_CUDA_context(ctx);\n}\n\nvoid reset_CUDA_context(struct CUDA_Context* ctx) {\n}\n"
  },
  {
    "path": "lonestar/analytics/distributed/triangle-counting/tc_cuda.h",
    "content": "#pragma once\n\n#include \"galois/runtime/DataCommMode.h\"\n#include \"galois/cuda/EdgeHostDecls.h\"\n\nvoid sortEdgesByDestination_cuda(struct CUDA_Context* ctx);\nvoid TC_cuda(unsigned int __begin, unsigned int __end,\n             unsigned long& num_local_triangles, struct CUDA_Context* ctx);\nvoid TC_masterNodes_cuda(unsigned long& num_local_triangles,\n                         struct CUDA_Context* ctx);\n"
  },
  {
    "path": "lonestar/analytics/distributed/triangle-counting/tc_cuda.py",
    "content": "from gg.ast import *\nfrom gg.lib.graph import Graph\nfrom gg.lib.wl import Worklist\nfrom gg.ast.params import GraphParam\nimport cgen\nG = Graph(\"graph\")\nWL = Worklist()\nast = Module([\nCBlock([cgen.Include(\"kernels/reduce.cuh\", system = False)], parse = False),\nCBlock([cgen.Include(\"tc_cuda.cuh\", system = False)], parse = False),\nCBlock([cgen.Include(\"kernels/segmentedsort.cuh\", system = False)], parse = False),\nCBlock([cgen.Include(\"moderngpu.cuh\", system = False)], parse = False),\nCBlock([cgen.Include(\"util/mgpucontext.h\", system = False)], parse = False),\nCBlock([cgen.Include(\"cuda_profiler_api.h\", system = True)], parse = False),\n\nCBlock('mgpu::ContextPtr mgc', parse = False),\n\nKernel(\"intersect\", [G.param(), ('index_type', 'u'), ('index_type', 'v')],\n       [\n        CDecl([('index_type', 'u_start', '= graph.getFirstEdge(u)'),\n               ('index_type', 'u_end', '= u_start + graph.getOutDegree(u)'),\n               ('index_type', 'v_start', '= graph.getFirstEdge(v)'),\n               ('index_type', 'v_end', '= v_start + graph.getOutDegree(v)'),\n               ('int', 'count', '= 0'),\n               ('index_type', 'u_it', '= u_start'),\n               ('index_type', 'v_it', '= v_start'),\n               ('index_type', 'a', ''),\n               ('index_type', 'b', ''),                       \n               ]),\n        While('u_it < u_end && v_it < v_end',\n              [\n                CBlock('a = graph.getAbsDestination(u_it)'),\n                CBlock('b = graph.getAbsDestination(v_it)'),                        \n                CDecl(('int', 'd', '= a - b')),\n                If('d <= 0', [CBlock('u_it++')]),\n                If('d >= 0', [CBlock('v_it++')]),\n                If('d == 0', [CBlock('count++')]),\n                ]\n              ),                      \n        CBlock('return count'),\n        ],\n       device=True,\n       ret_type = 'unsigned int',\n),\n\nKernel(\"TC\", [G.param(), 
('unsigned int', '__begin'), ('unsigned int', '__end'), ('HGAccumulator<unsigned int>', 'num_local_triangles')],\n       [CDecl([(\"__shared__ cub::BlockReduce<unsigned int, TB_SIZE>::TempStorage\", \"num_local_triangles_ts\", \"\")]),\n\t\tCBlock([\"num_local_triangles.thread_entry()\"]),\n\t\tForAll(\"src\", G.nodes(\"__begin\", \"__end\"),\n               [CDecl([(\"bool\", \"pop\", \" = src < __end\")]),\n\t\t\t\tUniformConditional(If(\"!pop\", [CBlock(\"continue\")]), uniform_only = False, _only_if_np = True),\n                ClosureHint(ForAll(\"edge\", G.edges(\"src\"), \n                                   [CDecl([('index_type', 'u', '= graph.getAbsDestination(edge)'),\n                                           ('index_type', 'd_u', '= graph.getOutDegree(u)'),\n                                           ('int', 'xcount', '= 0')]),\n                                    CBlock('xcount = intersect(graph, u, src)'),\n                                    If('xcount', [CBlock([\"num_local_triangles.reduce(xcount)\"])])\n                                    ]\n                                   )\n                            ),\n                ]\n        ),\n\t\tCBlock([\"num_local_triangles.thread_exit<cub::BlockReduce<unsigned int, TB_SIZE> >(num_local_triangles_ts)\"], parse = False),\n        ],\n),\n\nKernel(\"TC_cuda\", [('unsigned int ', '__begin'), ('unsigned int ', '__end'), ('unsigned int &', 'num_local_triangles'), ('struct CUDA_Context* ', 'ctx')],\n[\n\tCDecl([(\"dim3\", \"blocks\", \"\")]),\n\tCDecl([(\"dim3\", \"threads\", \"\")]),\n\tCBlock([\"kernel_sizing(blocks, threads)\"]),\n\tCDecl([(\"Shared<unsigned int>\", \"num_local_trianglesval\", \" = Shared<unsigned int>(1)\")]),\n\tCDecl([(\"HGAccumulator<unsigned int>\", \"_num_local_triangles\", \"\")]),\n\tCBlock([\"*(num_local_trianglesval.cpu_wr_ptr()) = 0\"]),\n\tCBlock([\"_num_local_triangles.rv = num_local_trianglesval.gpu_wr_ptr()\"]),\n\tCBlock([\"mgc = 
mgpu::CreateCudaDevice(ctx->device)\"], parse=False),\n\tCBlock(\"mgpu::SegSortKeysFromIndices(ctx->gg.edge_dst, ctx->gg.nedges, (const int *) ctx->gg.row_start + 1, ctx->gg.nnodes - 1, *mgc)\", parse=False),\n\tInvoke(\"TC\", (\"ctx->gg\", \"__begin\", \"__end\",  \"_num_local_triangles\")),\n\tCBlock([\"check_cuda_kernel\"], parse = False),\n\tCBlock([\"num_local_triangles = *(num_local_trianglesval.cpu_rd_ptr())\"]),\n\tCBlock('dump_memory_info(\"end\", ctx->id)', parse=False),\n\tCBlock('cudaProfilerStop()', parse=False),\n], host = True),\n\nKernel(\"TC_masterNodes_cuda\", [('unsigned int &', 'num_local_triangles'), ('struct CUDA_Context* ', 'ctx')],\n[\n\tCBlock([\"TC_cuda(ctx->beginMaster, ctx->beginMaster + ctx->numOwned, num_local_triangles, ctx)\"]),\n], host = True),\n])\n\n"
  },
  {
    "path": "lonestar/analytics/gpu/CMakeLists.txt",
    "content": "add_subdirectory(bfs)\nadd_subdirectory(sssp)\nadd_subdirectory(connected-components)\nadd_subdirectory(pagerank)\nadd_subdirectory(triangle-counting)\nadd_subdirectory(spanningtree)\nadd_subdirectory(independentset)\nadd_subdirectory(pointstoanalysis)\nadd_subdirectory(matrixcompletion)\n\n"
  },
  {
    "path": "lonestar/analytics/gpu/README.md",
    "content": "Overview of LonestarGPU Analytic Benchmark Suite\n================================================================================\n\nThe LonestarGPU suite contains CUDA implementations of several\nirregular algorithms that exhibit amorphous data parallelism.\nCurrently, LonestarGPU suite contains the following analytic applications,\nwhich can be executed on a single-GPU.\n\n### Analytics Applications\n  * Breadth-First Search \n  * Connected Components\n  * Maximal Independent Set\n  * Minimum Spanning Tree\n  * Pagerank\n  * Points-to Analysis\n  * Single-Source Shortest Paths\n  * Stochastic Gradient Descent\n  * Triangle Counting\n\nCompiling LonestarGPU Through CMake \n================================================================================\n\nThe dependencies for LonestarGPU suite are the same as shared-memory.\nNote that  LonestarGPU requires CUDA 8.0 and above.\n\nNote that distributed Galois requires the cub and moderngpu git submodules,\nwhich can be cloned using the followed commands.\n\n```Shell\ncd $GALOIS_ROOT\ngit submodule init\ngit submodule update\n```\nThese modules will be cloned in the ${GALOIS\\_ROOT}/external directory\n\nTo build the LonestarGPU suite, first, create a build directory and\nrun CMake with -DGALOIS\\_CUDA\\_CAPABILITY=\\<insert CUDA capability here\\> flag\nin the build directory. The CUDA capability should be one that your\nGPU supports. For example, if you wanted to build for a GTX 1080 and a K80,\nthe commands would look like this:\n\n```Shell\ncd ${GALOIS_ROOT}\nmkdir build\ncd build\ncmake ${GALOIS_ROOT} -DGALOIS_CUDA_CAPABILITY=\"3.7;6.1\"\n\nAfter compiling through CMake, the system will create the 'lonestar/analytics/gpu'\nand 'lonestar/scientific/gpu' directories in ${GALOIS\\_ROOT}/build directory. 
\n\nCompiling Analytics Applications\n================================================================================\n\nOnce CMake has completed, compile the provided analytics apps by executing the \nfollowing command in the ${GALOIS\\_ROOT}/build/lonestar/analytics/gpu directory.\n\n`make -j`\n\nYou can compile a specific app by executing the following commands (shown for bfs).\n\n```Shell\ncd bfs\nmake -j\n```\n\nRunning Analytics Applications\n================================================================================\n\nTo run a specific app, follow the instructions given in the README.md\nin that app's directory. \n\nDocumentation\n================================================================================\n\nFurther documentation is available at\n[http://iss.ices.utexas.edu/?p=projects/galois/lonestargpu](http://iss.ices.utexas.edu/?p=projects/galois/lonestargpu)\n\n\n\n\n"
  },
  {
    "path": "lonestar/analytics/gpu/bfs/CMakeLists.txt",
    "content": "app_analy_gpu(bfs bfs)\nadd_test_gpu(bfs rmat15 rmat15.out bfs -s 0 -o rmat15.out ${BASEINPUT}/scalefree/rmat15.gr)\n"
  },
  {
    "path": "lonestar/analytics/gpu/bfs/README.md",
    "content": "Breadth-First Search\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nThis benchmark computes the level of each node from a source node in an unweighted graph. It starts at a node and explores all the nodes on the same level and move on to nodes at the next depth level.\n\nINPUT\n--------------------------------------------------------------------------------\n\nTake in Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/gpu/bfs; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run default algorithm, use the following:\n\n-`$ ./bfs-gpu -o <output-file> -l -s <startNode> <input-graph>`\n\n-`$ ./bfs-gpu -o outfile.txt -l -s 0 rmat15.gr`\n\nThe option -l enables thread block load balancer. Enable this option for power-law graphs to improve the performance. It is recommended to disable this option for high diameter graphs, such as road-networks.\n"
  },
  {
    "path": "lonestar/analytics/gpu/bfs/bfs.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=True $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=True $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nextern int start_node;\nbool enable_lb = false;\ntypedef int edge_data_type;\ntypedef int node_data_type;\nextern const node_data_type INF = INT_MAX;\nstatic const int __tb_bfs_kernel = TB_SIZE;\nstatic const int __tb_gg_main_pipe_1_gpu_gb = 256;\n__global__ void bfs_init(CSRGraph graph, int src)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    graph.node_data[node] = (node == src) ? 
0 : INF ;\n  }\n}\n__global__ void bfs_kernel_dev_TB_LB(CSRGraph graph, int LEVEL, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned node;\n  unsigned int offset;\n  unsigned int current_work;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n\n  __syncthreads();\n\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type edge;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      node= thread_src_wl.in_wl().dwl[src_index];\n      edge = (graph).getFirstEdge(node)+ offset;\n      {\n        index_type dst;\n        dst = graph.getAbsDestination(edge);\n        if (graph.node_data[dst] == INF)\n        {\n          index_type _start_24;\n          graph.node_data[dst] = LEVEL;\n    
      _start_24 = (out_wl).setup_push_warp_one();;\n          (out_wl).do_push(_start_24, 0, dst);\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n  }\n}\n__global__ void Inspect_bfs_kernel_dev(CSRGraph graph, int LEVEL, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type wlnode_end;\n  wlnode_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    int node;\n    bool pop;\n    int index;\n    pop = (in_wl).pop_id(wlnode, node) && ((( node < (graph).nnodes ) && ( (graph).getOutDegree(node) >= DEGREE_LIMIT)) ? true: false);\n    if (pop)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(node);\n      thread_src_wl.in_wl().dwl[index] = node;\n    }\n  }\n}\n__device__ void bfs_kernel_dev(CSRGraph graph, int LEVEL, bool enable_lb, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_bfs_kernel;\n  index_type wlnode_end;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  __shared__ npsTy nps ;\n  wlnode_end = roundup((*((volatile index_type *) (in_wl).dindex)), (blockDim.x));\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    int node;\n    bool pop;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> 
_np_mps_total;\n    pop = (in_wl).pop_id(wlnode, node) && ((( node < (graph).nnodes ) && ( (graph).getOutDegree(node) < DEGREE_LIMIT)) ? true: false);\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(node);\n      _np.start = (graph).getFirstEdge(node);\n    }\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? _np.size : 0;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    __syncthreads();\n    while (true)\n    {\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      __syncthreads();\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        __syncthreads();\n        break;\n      }\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      __syncthreads();\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type edge;\n        edge = ns +_np_j;\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(edge);\n          if (graph.node_data[dst] == INF)\n          {\n            index_type _start_24;\n            graph.node_data[dst] = LEVEL;\n            _start_24 = (out_wl).setup_push_warp_one();;\n            (out_wl).do_push(_start_24, 0, dst);\n          }\n        }\n      }\n      __syncthreads();\n    }\n\n    {\n      const int warpid = threadIdx.x / 32;\n      const int _np_laneid = cub::LaneId();\n      while (__any_sync(0xffffffff,_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n    
    if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type edge;\n          edge = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            dst = graph.getAbsDestination(edge);\n            if (graph.node_data[dst] == INF)\n            {\n              index_type _start_24;\n              graph.node_data[dst] = LEVEL;\n              _start_24 = (out_wl).setup_push_warp_one();;\n              (out_wl).do_push(_start_24, 0, dst);\n            }\n          }\n        }\n      }\n      __syncthreads();\n    }\n\n    __syncthreads();\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    while (_np.work())\n    {\n      int _np_i =0;\n      _np.inspect(nps.fg.itvalue, ITSIZE);\n      __syncthreads();\n\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type edge;\n        edge= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          dst = graph.getAbsDestination(edge);\n          if (graph.node_data[dst] == INF)\n          {\n            index_type _start_24;\n            graph.node_data[dst] = LEVEL;\n            _start_24 = (out_wl).setup_push_warp_one();;\n            (out_wl).do_push(_start_24, 0, dst);\n          }\n        }\n      }\n      _np.execute_round_done(ITSIZE);\n      __syncthreads();\n    }\n  }\n}\n__global__ void bfs_kernel(CSRGraph graph, int LEVEL, bool enable_lb, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = 
TID_1D;\n\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  bfs_kernel_dev(graph, LEVEL, enable_lb, in_wl, out_wl);\n}\nvoid gg_main_pipe_1(CSRGraph& gg, int& LEVEL, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)\n{\n  while (pipe.in_wl().nitems())\n  {\n    pipe.out_wl().will_write();\n    if (enable_lb)\n    {\n      t_work.reset_thread_work();\n      Inspect_bfs_kernel_dev <<<blocks, __tb_bfs_kernel>>>(gg, LEVEL, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb, pipe.in_wl(), pipe.out_wl());\n      cudaDeviceSynchronize();\n      int num_items = t_work.thread_work_wl.in_wl().nitems();\n      if (num_items != 0)\n      {\n        t_work.compute_prefix_sum();\n        cudaDeviceSynchronize();\n        bfs_kernel_dev_TB_LB <<<blocks, __tb_bfs_kernel>>>(gg, LEVEL, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl, pipe.in_wl(), pipe.out_wl());\n        cudaDeviceSynchronize();\n      }\n    }\n    bfs_kernel <<<blocks, __tb_bfs_kernel>>>(gg, LEVEL, enable_lb, pipe.in_wl(), pipe.out_wl());\n    cudaDeviceSynchronize();\n    pipe.in_wl().swap_slots();\n    pipe.advance2();\n    LEVEL++;\n  }\n}\n__global__ void __launch_bounds__(__tb_gg_main_pipe_1_gpu_gb) gg_main_pipe_1_gpu_gb(CSRGraph gg, int LEVEL, PipeContextT<Worklist2> pipe, int* cl_LEVEL, bool enable_lb, GlobalBarrier gb)\n{\n  unsigned tid = TID_1D;\n\n  LEVEL = *cl_LEVEL;\n  while (pipe.in_wl().nitems())\n  {\n    if (tid == 0)\n      pipe.in_wl().reset_next_slot();\n    bfs_kernel_dev (gg, LEVEL, enable_lb, pipe.in_wl(), pipe.out_wl());\n    pipe.in_wl().swap_slots();\n    gb.Sync();\n    pipe.advance2();\n    LEVEL++;\n  }\n  gb.Sync();\n  if (tid == 0)\n  {\n    *cl_LEVEL = LEVEL;\n  }\n}\nvoid gg_main_pipe_1_wrapper(CSRGraph& gg, int& LEVEL, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)\n{\n  static GlobalBarrierLifetime gg_main_pipe_1_gpu_gb_barrier;\n  static bool gg_main_pipe_1_gpu_gb_barrier_inited;\n  extern bool enable_lb;\n  static const 
size_t gg_main_pipe_1_gpu_gb_residency = maximum_residency(gg_main_pipe_1_gpu_gb, __tb_gg_main_pipe_1_gpu_gb, 0);\n  static const size_t gg_main_pipe_1_gpu_gb_blocks = GG_MIN(blocks.x, ggc_get_nSM() * gg_main_pipe_1_gpu_gb_residency);\n  if(!gg_main_pipe_1_gpu_gb_barrier_inited) { gg_main_pipe_1_gpu_gb_barrier.Setup(gg_main_pipe_1_gpu_gb_blocks); gg_main_pipe_1_gpu_gb_barrier_inited = true;};\n  if (enable_lb)\n  {\n    gg_main_pipe_1(gg,LEVEL,pipe,blocks,threads);\n  }\n  else\n  {\n    int* cl_LEVEL;\n    check_cuda(cudaMalloc(&cl_LEVEL, sizeof(int) * 1));\n    check_cuda(cudaMemcpy(cl_LEVEL, &LEVEL, sizeof(int) * 1, cudaMemcpyHostToDevice));\n\n    gg_main_pipe_1_gpu_gb<<<gg_main_pipe_1_gpu_gb_blocks, __tb_gg_main_pipe_1_gpu_gb>>>(gg,LEVEL,pipe,cl_LEVEL, enable_lb, gg_main_pipe_1_gpu_gb_barrier);\n    check_cuda(cudaMemcpy(&LEVEL, cl_LEVEL, sizeof(int) * 1, cudaMemcpyDeviceToHost));\n    check_cuda(cudaFree(cl_LEVEL));\n  }\n}\nvoid gg_main(CSRGraph& hg, CSRGraph& gg)\n{\n  dim3 blocks, threads;\n  kernel_sizing(gg, blocks, threads);\n  t_work.init_thread_work(gg.nnodes);\n  PipeContextT<Worklist2> wl;\n  bfs_init <<<blocks, threads>>>(gg, start_node);\n  cudaDeviceSynchronize();\n  int LEVEL = 1;\n  wl = PipeContextT<Worklist2>(gg.nnodes);\n  wl.in_wl().wl[0] = start_node;\n  wl.in_wl().update_gpu(1);\n  gg_main_pipe_1_wrapper(gg,LEVEL,wl,blocks,threads);\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/bfs/support.cu",
    "content": "/* -*- mode: C++ -*- */\n\n#include \"gg.h\"\n\nconst char *prog_opts = \"ls:\";\nconst char *prog_usage = \"[-l] [-s startNode]\";\nconst char *prog_args_usage = \"-l: enable thread block load balancer (by default false)\";\n\nextern const int INF;\nint start_node = 0;\nextern bool enable_lb;\n\nint process_prog_arg(int argc, char *argv[], int arg_start) {\n   return 1;\n}\n\nvoid process_prog_opt(char c, char *optarg) {\n   if(c == 's') {\n     start_node = atoi(optarg);\n     assert(start_node >= 0);\n   }\n   if(c == 'l') {\n\t   enable_lb = true;\n   }\n}\n\nvoid output(CSRGraphTy &g, const char *output_file) {\n  FILE *f;\n\n  if(!output_file)\n    return;\n\n  if(strcmp(output_file, \"-\") == 0)\n    f = stdout;\n  else\n    f = fopen(output_file, \"w\");\n\n  const uint32_t infinity = std::numeric_limits<uint32_t>::max() / 4;    \n  for(int i = 0; i < g.nnodes; i++) {\n    if(g.node_data[i] == INF) {\n      //formatting the output to be compatible with the distributed bfs ouput \n      check_fprintf(f, \"%d %d\\n\", i, infinity);\n    } else {\n      check_fprintf(f, \"%d %d\\n\", i, g.node_data[i]);\n    }    \n  }\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/connected-components/CMakeLists.txt",
    "content": "app_analy_gpu(cc connected-components)\nadd_test_gpu(connected-components rmat15 rmat15.out cc -o rmat15.out ${BASEINPUT}/scalefree/symmetric/rmat15.sgr)\n\n"
  },
  {
    "path": "lonestar/analytics/gpu/connected-components/README.md",
    "content": "Connected Components\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\n\nA connected component of an undirected graph is a subgraph in which there is a path between any two nodes. A node with no edges is itself a connected component. This benchmark computes number of connected components in an undirected graph.\n\nINPUT\n--------------------------------------------------------------------------------\n\nTake in symmetric Galois .sgr graphs. \n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/gpu/connected-components; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run default algorithm, use the following:\n\n-`$ ./connected-components-gpu -o <output-file> <symmetric-input-graph>`\n\n-`$ ./connected-components-gpu -o outfile.txt road-USA.sgr`\n\n"
  },
  {
    "path": "lonestar/analytics/gpu/connected-components/cc.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=True $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ tb_lb=True $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nextern unsigned long DISCOUNT_TIME_NS;\nbool enable_lb = true;\nstatic const int __tb_prep_edge_src = TB_SIZE;\nstatic const int __tb_gg_main_pipe_4_gpu_gb = 256;\nstatic const int __tb_gg_main_pipe_3_gpu_gb = 256;\n__global__ void init(CSRGraph graph)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    graph.node_data[node] = node;\n  }\n}\n__global__ void prep_edge_src_TB_LB(CSRGraph graph, index_type * edge_src, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned node;\n  unsigned int offset;\n  unsigned int current_work;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n\n  __syncthreads();\n\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned 
int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x == 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type edge;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      node= thread_src_wl.in_wl().dwl[src_index];\n      edge = (graph).getFirstEdge(node)+ offset;\n      {\n        edge_src[edge] = node;\n      }\n      current_work = current_work + nthreads;\n    }\n  }\n}\n__global__ void Inspect_prep_edge_src(CSRGraph graph, index_type * edge_src, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    bool pop;\n    int index;\n    pop = (((( node < (graph).nnodes ) && ( (graph).getOutDegree(node) >= DEGREE_LIMIT)) ? 
true: false)) && graph.valid_node(node);;\n    if (pop)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(node);\n      thread_src_wl.in_wl().dwl[index] = node;\n    }\n  }\n}\n__global__ void prep_edge_src(CSRGraph graph, index_type * edge_src, bool enable_lb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_prep_edge_src;\n  index_type node_end;\n  const int _NP_CROSSOVER_WP = 32;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  __shared__ npsTy nps ;\n  node_end = roundup(((graph).nnodes), (blockDim.x));\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    bool pop;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    pop = (((( node < (graph).nnodes ) && ( (graph).getOutDegree(node) < DEGREE_LIMIT)) ? true: false)) && graph.valid_node(node);;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    __shared__ struct { index_type node; } _np_closure [TB_SIZE];\n    _np_closure[threadIdx.x].node = node;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(node);\n      _np.start = (graph).getFirstEdge(node);\n    }\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    __syncthreads();\n    while (true)\n    {\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      __syncthreads();\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        __syncthreads();\n        break;\n      }\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      __syncthreads();\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      assert(nps.tb.src < __kernel_tb_size);\n      node = _np_closure[nps.tb.src].node;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type edge;\n        edge = ns +_np_j;\n        {\n          edge_src[edge] = node;\n        }\n      }\n      __syncthreads();\n    }\n\n    {\n      const int warpid = threadIdx.x / 32;\n      const int _np_laneid = cub::LaneId();\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP && _np.size < _NP_CROSSOVER_TB)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        node = _np_closure[nps.warp.src[warpid]].node;\n        
for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type edge;\n          edge = _np_w_start +_np_ii;\n          {\n            edge_src[edge] = node;\n          }\n        }\n      }\n      __syncthreads();\n    }\n\n    __syncthreads();\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    while (_np.work())\n    {\n      int _np_i =0;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      __syncthreads();\n\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type edge;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        node = _np_closure[nps.fg.src[_np_i]].node;\n        edge= nps.fg.itvalue[_np_i];\n        {\n          edge_src[edge] = node;\n        }\n      }\n      _np.execute_round_done(ITSIZE);\n      __syncthreads();\n    }\n    assert(threadIdx.x < __kernel_tb_size);\n    node = _np_closure[threadIdx.x].node;\n  }\n}\n__global__ void hook_init(CSRGraph graph, index_type * edge_src)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  int edge_end;\n  edge_end = graph.nedges;\n  for (int edge = 0 + tid; edge < edge_end; edge += nthreads)\n  {\n    index_type x = edge_src[edge];\n    index_type y = graph.getAbsDestination(edge);\n    index_type mx = x > y ? x : y;\n    index_type mn = x > y ? 
y : x;\n    graph.node_data[mx] = mn;\n  }\n}\n__global__ void hook_high_to_low(CSRGraph graph, const __restrict__ index_type * edge_src, char * marks, HGAccumulator<int> ret_val)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  typedef cub::BlockReduce<int, TB_SIZE> _br;\n  __shared__ _br::TempStorage _ts;\n  ret_val.thread_entry();\n  int edge_end;\n  edge_end = graph.nedges;\n  for (int edge = 0 + tid; edge < edge_end; edge += nthreads)\n  {\n    if (!marks[edge])\n    {\n      index_type u = edge_src[edge];\n      index_type v = graph.getAbsDestination(edge);\n      node_data_type p_u = graph.node_data[u];\n      node_data_type p_v = graph.node_data[v];\n      index_type mx = p_u > p_v ? p_u : p_v;\n      index_type mn = p_u > p_v ? p_v : p_u;\n      if (mx == mn)\n      {\n        marks[edge] = 1;\n      }\n      else\n      {\n        graph.node_data[mn] = mx;\n        ret_val.reduce(true);\n        continue;\n        continue;\n      }\n    }\n  }\n  ret_val.thread_exit<_br>(_ts);\n}\n__global__ void hook_low_to_high(CSRGraph graph, index_type * edge_src, char * marks, HGAccumulator<int> ret_val)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  typedef cub::BlockReduce<int, TB_SIZE> _br;\n  __shared__ _br::TempStorage _ts;\n  ret_val.thread_entry();\n  int edge_end;\n  edge_end = graph.nedges;\n  for (int edge = 0 + tid; edge < edge_end; edge += nthreads)\n  {\n    if (!marks[edge])\n    {\n      index_type u = edge_src[edge];\n      index_type v = graph.getAbsDestination(edge);\n      node_data_type p_u = graph.node_data[u];\n      node_data_type p_v = graph.node_data[v];\n      index_type mx = p_u > p_v ? p_u : p_v;\n      index_type mn = p_u > p_v ? 
p_v : p_u;\n      if (mx == mn)\n      {\n        marks[edge] = 1;\n      }\n      else\n      {\n        graph.node_data[mx] = mn;\n        ret_val.reduce(true);\n        continue;\n        continue;\n      }\n    }\n  }\n  ret_val.thread_exit<_br>(_ts);\n}\n__device__ void p_jump_dev(CSRGraph graph, HGAccumulator<int> ret_val)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  typedef cub::BlockReduce<int, TB_SIZE> _br;\n  __shared__ _br::TempStorage _ts;\n  ret_val.thread_entry();\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    node_data_type p_u = graph.node_data[node];\n    node_data_type p_v = graph.node_data[p_u];\n    if (p_u != p_v)\n    {\n      graph.node_data[node] = p_v;\n      ret_val.reduce(true);\n      continue;\n      continue;\n    }\n  }\n  ret_val.thread_exit<_br>(_ts);\n}\n__global__ void p_jump(CSRGraph graph, HGAccumulator<int> ret_val)\n{\n  p_jump_dev(graph, ret_val);\n}\n__global__ void identify_roots(CSRGraph graph, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    if (graph.node_data[node] == node)\n    {\n      index_type _start_73;\n      _start_73 = (out_wl).setup_push_warp_one();;\n      (out_wl).do_push(_start_73, 0, node);\n    }\n  }\n}\n__device__ void p_jump_roots_dev(CSRGraph graph, Worklist2 in_wl, Worklist2 out_wl, HGAccumulator<int> ret_val)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  typedef cub::BlockReduce<int, TB_SIZE> _br;\n  __shared__ _br::TempStorage _ts;\n  ret_val.thread_entry();\n  index_type wlnode_end;\n  wlnode_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += 
nthreads)\n  {\n    bool pop;\n    int node;\n    pop = (in_wl).pop_id(wlnode, node);\n    node_data_type p_u = graph.node_data[node];\n    node_data_type p_v = graph.node_data[p_u];\n    if (p_u != p_v)\n    {\n      graph.node_data[node] = p_v;\n      ret_val.reduce(true);\n      continue;\n      continue;\n    }\n  }\n  ret_val.thread_exit<_br>(_ts);\n}\n__global__ void p_jump_roots(CSRGraph graph, Worklist2 in_wl, Worklist2 out_wl, HGAccumulator<int> ret_val)\n{\n  unsigned tid = TID_1D;\n\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  p_jump_roots_dev(graph, in_wl, out_wl, ret_val);\n}\n__global__ void p_jump_leaves(CSRGraph graph)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    node_data_type p_u = graph.node_data[node];\n    node_data_type p_v = graph.node_data[p_u];\n    if (p_u != p_v)\n    {\n      graph.node_data[node] = p_v;\n    }\n  }\n}\n__global__ void count_components(CSRGraph graph, int * count)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    if (node == graph.node_data[node])\n    {\n      atomicAdd(count, 1);\n    }\n  }\n}\nvoid gg_main_pipe_4(CSRGraphTy& gg, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)\n{\n  bool loopc = false;\n  do\n  {\n    Shared<int> retval = Shared<int>(1);\n    HGAccumulator<int> _rv;\n    *(retval.cpu_wr_ptr()) = 0;\n    _rv.rv = retval.gpu_wr_ptr();\n    pipe.out_wl().will_write();\n    p_jump_roots <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl(), _rv);\n    cudaDeviceSynchronize();\n    loopc = *(retval.cpu_rd_ptr()) > 0;\n  }\n  while (loopc);\n}\n__global__ void __launch_bounds__(__tb_gg_main_pipe_4_gpu_gb) gg_main_pipe_4_gpu_gb(CSRGraphTy gg, PipeContextT<Worklist2> pipe, int* 
retval, bool enable_lb, GlobalBarrier gb)\n{\n  unsigned tid = TID_1D;\n\n  bool loopc = false;\n  do\n  {\n    HGAccumulator<int> _rv;\n    *retval = 0;\n    _rv.rv = retval;\n    gb.Sync();\n    if (tid == 0)\n      pipe.in_wl().reset_next_slot();\n    p_jump_roots_dev (gg, pipe.in_wl(), pipe.out_wl(), _rv);\n    _rv.local = *retval;\n    gb.Sync();\n    loopc = *retval > 0;\n    gb.Sync();\n  }\n  while (loopc);\n  gb.Sync();\n  if (tid == 0)\n  {\n    pipe.save();\n  }\n}\nvoid gg_main_pipe_4_wrapper(CSRGraphTy& gg, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)\n{\n  static GlobalBarrierLifetime gg_main_pipe_4_gpu_gb_barrier;\n  static bool gg_main_pipe_4_gpu_gb_barrier_inited;\n  extern bool enable_lb;\n  static const size_t gg_main_pipe_4_gpu_gb_residency = maximum_residency(gg_main_pipe_4_gpu_gb, __tb_gg_main_pipe_4_gpu_gb, 0);\n  static const size_t gg_main_pipe_4_gpu_gb_blocks = GG_MIN(blocks.x, ggc_get_nSM() * gg_main_pipe_4_gpu_gb_residency);\n  if(!gg_main_pipe_4_gpu_gb_barrier_inited) { gg_main_pipe_4_gpu_gb_barrier.Setup(gg_main_pipe_4_gpu_gb_blocks); gg_main_pipe_4_gpu_gb_barrier_inited = true;};\n  Shared<int> retval (1);\n  if (enable_lb)\n  {\n    gg_main_pipe_4(gg,pipe,blocks,threads);\n  }\n  else\n  {\n    pipe.prep();\n\n    gg_main_pipe_4_gpu_gb<<<gg_main_pipe_4_gpu_gb_blocks, __tb_gg_main_pipe_4_gpu_gb>>>(gg,pipe,retval.gpu_wr_ptr(), enable_lb, gg_main_pipe_4_gpu_gb_barrier);\n    pipe.restore();\n  }\n}\nvoid gg_main_pipe_3(CSRGraphTy& gg, dim3& blocks, dim3& threads)\n{\n  bool loopc = false;\n  do\n  {\n    Shared<int> retval = Shared<int>(1);\n    HGAccumulator<int> _rv;\n    *(retval.cpu_wr_ptr()) = 0;\n    _rv.rv = retval.gpu_wr_ptr();\n    p_jump <<<blocks, threads>>>(gg, _rv);\n    cudaDeviceSynchronize();\n    loopc = *(retval.cpu_rd_ptr()) > 0;\n  }\n  while (loopc);\n}\n__global__ void __launch_bounds__(__tb_gg_main_pipe_3_gpu_gb) gg_main_pipe_3_gpu_gb(CSRGraphTy gg, int* retval, bool enable_lb, GlobalBarrier 
gb)\n{\n  bool loopc = false;\n  do\n  {\n    HGAccumulator<int> _rv;\n    *retval = 0;\n    _rv.rv = retval;\n    gb.Sync();\n    p_jump_dev (gg, _rv);\n    _rv.local = *retval;\n    gb.Sync();\n    loopc = *retval > 0;\n    gb.Sync();\n  }\n  while (loopc);\n  gb.Sync();\n}\nvoid gg_main_pipe_3_wrapper(CSRGraphTy& gg, dim3& blocks, dim3& threads)\n{\n  static GlobalBarrierLifetime gg_main_pipe_3_gpu_gb_barrier;\n  static bool gg_main_pipe_3_gpu_gb_barrier_inited;\n  extern bool enable_lb;\n  static const size_t gg_main_pipe_3_gpu_gb_residency = maximum_residency(gg_main_pipe_3_gpu_gb, __tb_gg_main_pipe_3_gpu_gb, 0);\n  static const size_t gg_main_pipe_3_gpu_gb_blocks = GG_MIN(blocks.x, ggc_get_nSM() * gg_main_pipe_3_gpu_gb_residency);\n  if(!gg_main_pipe_3_gpu_gb_barrier_inited) { gg_main_pipe_3_gpu_gb_barrier.Setup(gg_main_pipe_3_gpu_gb_blocks); gg_main_pipe_3_gpu_gb_barrier_inited = true;};\n  Shared<int> retval (1);\n  if (enable_lb)\n  {\n    gg_main_pipe_3(gg,blocks,threads);\n  }\n  else\n  {\n\n    gg_main_pipe_3_gpu_gb<<<gg_main_pipe_3_gpu_gb_blocks, __tb_gg_main_pipe_3_gpu_gb>>>(gg,retval.gpu_wr_ptr(), enable_lb, gg_main_pipe_3_gpu_gb_barrier);\n  }\n}\nvoid gg_main(CSRGraphTy& hg, CSRGraphTy& gg)\n{\n  dim3 blocks, threads;\n  kernel_sizing(gg, blocks, threads);\n  t_work.init_thread_work(gg.nnodes);\n  int edge_blocks;\n  int node_blocks;\n  cudaEvent_t start;\n  cudaEvent_t stop;\n  PipeContextT<Worklist2> pipe;\n  int it_hk = 1;\n  Shared<index_type> edge_src (gg.nedges);\n  Shared<char> edge_marks (gg.nedges);\n  bool flag = false;\n  edge_blocks = hg.nedges / TB_SIZE + 1;\n  node_blocks = hg.nnodes / TB_SIZE + 1;\n  edge_marks.zero_gpu();\n  check_cuda(cudaEventCreate(&start));\n  check_cuda(cudaEventCreate(&stop));\n  check_cuda(cudaEventRecord(start));\n  if (enable_lb)\n  {\n    t_work.reset_thread_work();\n    Inspect_prep_edge_src <<<node_blocks, __tb_prep_edge_src>>>(gg, edge_src.gpu_wr_ptr(), t_work.thread_work_wl, t_work.thread_src_wl, 
enable_lb);\n    cudaDeviceSynchronize();\n    int num_items = t_work.thread_work_wl.in_wl().nitems();\n    if (num_items != 0)\n    {\n      t_work.compute_prefix_sum();\n      cudaDeviceSynchronize();\n      prep_edge_src_TB_LB <<<node_blocks, __tb_prep_edge_src>>>(gg, edge_src.gpu_wr_ptr(), t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl);\n      cudaDeviceSynchronize();\n    }\n  }\n  prep_edge_src <<<node_blocks, __tb_prep_edge_src>>>(gg, edge_src.gpu_wr_ptr(), enable_lb);\n  cudaDeviceSynchronize();\n  check_cuda(cudaEventRecord(stop));\n  init <<<node_blocks, threads>>>(gg);\n  cudaDeviceSynchronize();\n  hook_init <<<edge_blocks, threads>>>(gg, edge_src.gpu_rd_ptr());\n  cudaDeviceSynchronize();\n  gg_main_pipe_3_wrapper(gg,blocks,threads);\n  pipe = PipeContextT<Worklist2>(gg.nnodes);\n  {\n    {\n      do\n      {\n        pipe.out_wl().will_write();\n        identify_roots <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl());\n        cudaDeviceSynchronize();\n        pipe.in_wl().swap_slots();\n        pipe.advance2();\n        if (it_hk != 0)\n        {\n          Shared<int> retval = Shared<int>(1);\n          HGAccumulator<int> _rv;\n          *(retval.cpu_wr_ptr()) = 0;\n          _rv.rv = retval.gpu_wr_ptr();\n          hook_low_to_high <<<edge_blocks, threads>>>(gg, edge_src.gpu_rd_ptr(), edge_marks.gpu_wr_ptr(), _rv);\n          cudaDeviceSynchronize();\n          flag = *(retval.cpu_rd_ptr());\n          it_hk = (it_hk + 1) % 4;\n        }\n        else\n        {\n          Shared<int> retval = Shared<int>(1);\n          HGAccumulator<int> _rv;\n          *(retval.cpu_wr_ptr()) = 0;\n          _rv.rv = retval.gpu_wr_ptr();\n          hook_high_to_low <<<edge_blocks, threads>>>(gg, edge_src.gpu_rd_ptr(), edge_marks.gpu_wr_ptr(), _rv);\n          cudaDeviceSynchronize();\n          flag = *(retval.cpu_rd_ptr());\n        }\n        if (!flag)\n        {\n          break;\n        }\n        
gg_main_pipe_4_wrapper(gg,pipe,blocks,threads);\n        p_jump_leaves <<<node_blocks, threads>>>(gg);\n        cudaDeviceSynchronize();\n      }\n      while (flag);\n    }\n  }\n  printf(\"iterations: %d\\n\", it_hk);\n  Shared<int> count (1);\n  *(count.cpu_wr_ptr()) = 0;\n  count_components <<<blocks, threads>>>(gg, count.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  printf(\"components: %d\\n\", *(count.cpu_rd_ptr()));\n  float ms =0;\n  check_cuda(cudaEventElapsedTime(&ms, start, stop));\n  DISCOUNT_TIME_NS = (int) (ms * 1000000);\n  printf(\"prep_edge_src: %llu ns\\n\", DISCOUNT_TIME_NS);\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/connected-components/support.cu",
    "content": "/* -*- mode: C++ -*- */\n\n#include \"gg.h\"\n\nconst char *prog_opts = \"\";\nconst char *prog_usage = \"\";\nconst char *prog_args_usage = \"\";\n\nint process_prog_arg(int argc, char *argv[], int arg_start) {\n   return 1;\n}\n\nvoid process_prog_opt(char c, char *optarg) {\n}\n\nvoid output(CSRGraphTy &g, const char *output_file) {\n  FILE *f;\n\n  if(!output_file)\n    return;\n\n  if(strcmp(output_file, \"-\") == 0)\n    f = stdout;\n  else\n    f = fopen(output_file, \"w\");\n    \n\n  for(int i = 0; i < g.nnodes; i++) {\n    check_fprintf(f, \"%d %d\\n\", i, g.node_data[i]);\n    \n    for(int j = g.getFirstEdge(i); j < g.getFirstEdge(i+1); j++) {\n      index_type dst = g.getAbsDestination(j);\n      check_fprintf(f, \"\\te %d: %d %d %d\\n\", j, dst, g.edge_data[j], g.node_data[dst] );\n    }\n  }\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/independentset/CMakeLists.txt",
    "content": "app_analy_gpu(mis maximal-independentset)\nset_property(TARGET maximal-independentset-gpu PROPERTY CUDA_SEPARABLE_COMPILATION ON)\ntarget_link_libraries(maximal-independentset-gpu ${CUDA_cudadevrt_LIBRARY})\ntarget_link_libraries(maximal-independentset-gpu Galois::gpu -lcurand)\nadd_test_gpu(maximal-independentset rmat15 rmat15.out mis -o rmat15.out ${BASEINPUT}/scalefree/rmat15.gr)\n"
  },
  {
    "path": "lonestar/analytics/gpu/independentset/README.md",
    "content": "Maximal Independent Set\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nThis benchmark computes a maximal independent set in an unweighted graph.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake in the BUILD directory (refer to the top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/gpu/independentset; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run the default algorithm, use the following:\n\n-`$ ./maximal-independentset-gpu -o <output-file> <input-graph>`\n\n-`$ ./maximal-independentset-gpu -o outfile.txt road-USA.gr`\n"
  },
  {
    "path": "lonestar/analytics/gpu/independentset/mis.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include <curand.h>\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=False $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ hacks=set([]) $ np_factor=1 $ instrument=set([]) $ unroll=[] $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=False $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=texture $ cuda.use_worklist_slots=True $ cuda.worklist_type=texture\";\n#include <curand.h>\n#define UNMARKED 0\n#define MARKED 1\n#define NON_INDEPENDENT 2\n#define NON_MAXIMAL 3\n#define SEED1 0x12345678LL\n#define SEED2 0xabbdef12LL\n#define SEED3 0xcafe1234LL\n#define SEED4 0x09832516LL\nstatic const int __tb_one = 1;\n__global__ void gen_prio_gpu(CSRGraph graph, unsigned int * prio, unsigned int x, unsigned int y, unsigned int z, unsigned int w)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  index_type node_end;\n  x ^= tid;\n  y ^= tid;\n  z ^= tid;\n  w ^= tid;\n  assert(!(x == 0 && y == 0 && z == 0 && w == 0));\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    unsigned int t;\n    t = x ^ (x << 11);\n    x = y;\n    y = z;\n    z = w;\n    w = w ^ (w >> 19) ^ t ^ (t >> 8);\n    prio[node] = w;\n  }\n}\nvoid gen_prio(CSRGraph graph, unsigned int * prio)\n{\n  curandGenerator_t gen;\n  check_rv(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MT19937), CURAND_STATUS_SUCCESS);\n  check_rv(curandSetPseudoRandomGeneratorSeed(gen, SEED1), CURAND_STATUS_SUCCESS);\n  check_rv(curandSetGeneratorOrdering (gen, CURAND_ORDERING_PSEUDO_BEST), CURAND_STATUS_SUCCESS);\n}\n__global__ void init_wl(CSRGraph graph, WorklistT in_wl, WorklistT out_wl)\n{\n  unsigned tid = TID_1D;\n  
unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    (out_wl).push(node);\n  }\n}\n__global__ void mark_nodes(CSRGraph graph, const unsigned int * __restrict__ prio, WorklistT in_wl, WorklistT out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type wlnode_end;\n  wlnode_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    bool pop;\n    int node;\n    index_type edge_end;\n    pop = (in_wl).pop_id(wlnode, node);\n    int max_prio = prio[node];\n    int max_prio_node = node;\n    edge_end = (graph).getFirstEdge((node) + 1);\n    for (index_type edge = (graph).getFirstEdge(node) + 0; edge < edge_end; edge += 1)\n    {\n      index_type dst = graph.getAbsDestination(edge);\n      if (dst != node && graph.node_data[dst] != NON_INDEPENDENT && prio[dst] >= max_prio)\n      {\n        if ((prio[dst] > max_prio) || dst > max_prio_node)\n        {\n          max_prio = prio[dst];\n          max_prio_node = dst;\n        }\n      }\n    }\n    if (max_prio_node == node)\n    {\n      assert(graph.node_data[node] == UNMARKED);\n      graph.node_data[node] = MARKED;\n    }\n  }\n}\n__global__ void drop_marked_nodes_and_nbors(CSRGraph graph, WorklistT in_wl, WorklistT out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type wlnode_end;\n  wlnode_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    bool pop;\n    int node;\n    pop = 
(in_wl).pop_id(wlnode, node);\n    bool drop = false;\n    if (graph.node_data[node] == MARKED)\n    {\n      drop = true;\n    }\n    if (!drop)\n    {\n      index_type edge_end;\n      edge_end = (graph).getFirstEdge((node) + 1);\n      for (index_type edge = (graph).getFirstEdge(node) + 0; edge < edge_end; edge += 1)\n      {\n        index_type dst = graph.getAbsDestination(edge);\n        if (graph.node_data[dst] == MARKED)\n        {\n          drop = true;\n        }\n      }\n    }\n    if (!drop)\n    {\n      (out_wl).push(node);\n    }\n    else\n    {\n      if (graph.node_data[node] == UNMARKED)\n      {\n        graph.node_data[node] = NON_INDEPENDENT;\n      }\n    }\n  }\n}\nvoid gg_main_pipe_1(CSRGraphTy& gg, int& STEPS, Shared<unsigned int>& prio, PipeContextT<WorklistT>& pipe, dim3& blocks, dim3& threads)\n{\n  {\n    pipe.out_wl().will_write();\n    init_wl <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl());\n    pipe.in_wl().swap_slots();\n    pipe.advance2();\n    while (pipe.in_wl().nitems())\n    {\n      pipe.out_wl().will_write();\n      mark_nodes <<<blocks, threads>>>(gg, prio.gpu_rd_ptr(), pipe.in_wl(), pipe.out_wl());\n      pipe.out_wl().will_write();\n      drop_marked_nodes_and_nbors <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl());\n      pipe.in_wl().swap_slots();\n      pipe.advance2();\n      STEPS++;\n    }\n  }\n}\n\nvoid gg_main_pipe_1_wrapper(CSRGraphTy& gg, int& STEPS, Shared<unsigned int>& prio, PipeContextT<WorklistT>& pipe, dim3& blocks, dim3& threads)\n{\n    gg_main_pipe_1(gg,STEPS,prio,pipe,blocks,threads);\n}\nvoid gg_main(CSRGraphTy& hg, CSRGraphTy& gg)\n{\n  dim3 blocks, threads;\n  kernel_sizing(gg, blocks, threads);\n  PipeContextT<WorklistT> pipe;\n  Shared<unsigned int> prio (hg.nnodes);\n  int STEPS = 0;\n  ggc::Timer t (\"random\");\n  t.start();\n  gen_prio_gpu <<<blocks, threads>>>(gg, prio.gpu_wr_ptr(), SEED1, SEED2, SEED3, SEED4);\n  cudaDeviceSynchronize();\n  t.stop();\n  printf(\"Random 
number generation took %llu ns\\n\", t.duration());\n  pipe = PipeContextT<WorklistT>(gg.nnodes);\n  gg_main_pipe_1_wrapper(gg,STEPS,prio,pipe,blocks,threads);\n  printf(\"Total steps: %d\\n\", STEPS);\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/independentset/support.cu",
    "content": "/* -*- mode: C++ -*- */\n\n#include \"gg.h\"\n\nconst char *prog_opts = \"\";\nconst char *prog_usage = \"\";\nconst char *prog_args_usage = \"\";\n\nint process_prog_arg(int argc, char *argv[], int arg_start) {\n  return 1;\n}\n\nvoid process_prog_opt(char c, char *optarg) {\n  ;\n}\n\nvoid output(CSRGraphTy &g, const char *output_file) {\n  FILE *f;\n\n  if(!output_file)\n    return;\n\n  if(strcmp(output_file, \"-\") == 0)\n    f = stdout;\n  else\n    f = fopen(output_file, \"w\");\n\n  unsigned int count = 0;\n  for(int i = 0; i < g.nnodes; i++) {\n    count += (g.node_data[i] == 1);\n  }\n\n  check_fprintf(f, \"%u\\n\", count);\n  for(int i = 0; i < g.nnodes; i++) {\n    if(g.node_data[i] == 1)\n      check_fprintf(f, \"%d\\n\", i);\n  }\n\n  fclose(f);\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/matrixcompletion/CMakeLists.txt",
    "content": "app_analy_gpu(sgd matrixcompletion)\n#add_test_gpu(matrixcompletion Epinions Epinions.out sgd ${BASEINPUT}/weighted/bipartite/Epinions_dataset.gr)\nadd_test_gpu(matrixcompletion bgg bgg.out sgd)\n"
  },
  {
    "path": "lonestar/analytics/gpu/matrixcompletion/README.md",
    "content": "Matrix Completion\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nThis benchmark implements Stochastic Gradient Descent (SGD). In particular,\nthe benchmark uses SGD to complete unknown entries of a sparse matrix.\nThe sparse matrix represents a bipartite graph, with one set of nodes\nrepresenting movies and the other set representing users. An edge connecting a\nmovie node to a user node denotes that the user has rated the movie, with the\nedge label representing the rating assigned. This benchmark roughly corresponds\nto the GPU implementations described\n[in this paper](http://www.cs.utexas.edu/~rashid/public/ipdps2016.pdf).\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in directed bipartite Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake in the BUILD directory (refer to the top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/gpu/matrixcompletion; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run the default algorithm, use the following:\n\n-`$ ./matrixcompletion-gpu <input-graph>`\n\n-`$ ./matrixcompletion-gpu Epinions_dataset.gr`\n"
  },
  {
    "path": "lonestar/analytics/gpu/matrixcompletion/SGDAsyncEdgeCu.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifdef _WIN32\n#define NOMINMAX\n#include <windows.h>\n#include <process.h>\n#include <time.h>\n#include <Psapi.h>\n#else\n#include <sys/time.h>\n#endif\n#include <cuda.h>\n#include \"SGDCommonCu.h\"\n#include \"SGDGraphCu.h\"\n#include <algorithm>\n\n#define R 2\n#define C 2\n#define BLOCKSIZE 1\n#define GRANULARITY 5000\n#define _P_DATA1_(_t25) _P_DATA1[_t25]\n#define _P_DATA1__(_t25) _P_DATA1[_t25 + 1]\n#define new_ratings(_t49, _t50, _t51, _t52)                                    \\\n  new_ratings[(_t50)*R * C + (_t51)*C + (_t52)]\nstruct a_list {\n  int col_[1];\n  float ratings[R * C];\n  struct a_list* next;\n};\n\nstruct mk {\n  struct a_list* ptr;\n};\n\n#ifndef GALOISGPU_APPS_SGD_CUDA_SGDASYNCEDGECU_H_\n#define GALOISGPU_APPS_SGD_CUDA_SGDASYNCEDGECU_H_\n#define col_(_t5) col[_t5]\n#define index_(i) 
index[i]\n#define index__(i) index[i + 1]\n#define __rose_lt(x, y) ((x) < (y) ? (x) : (y))\n\n#define gpuErrchk(ans)                                                         \\\n  { gpuAssert((ans), __FILE__, __LINE__); }\ninline void gpuAssert(cudaError_t code, const char* file, int line,\n                      bool abort = true) {\n  if (code != cudaSuccess) {\n    fprintf(stderr, \"GPUassert: %s %s %d\\n\", cudaGetErrorString(code), file,\n            line);\n    if (abort)\n      exit(code);\n  }\n}\nbool out_degree_compare(std::pair<int, int> i, std::pair<int, int> j) {\n  return (i.second > j.second);\n}\n\nstruct data_list {\n  float data[R][C];\n  int col;\n  struct data_list* next;\n};\n__device__ void segreduce_warp2(float* y, float* val) {\n  int tx     = threadIdx.x;\n  float left = 0;\n\n  if (tx >= 1) {\n    left = val[tx - 1];\n    val[tx] += left;\n    left = 0;\n  }\n  __syncthreads();\n\n  if (tx >= 2) {\n    left = val[tx - 2];\n    val[tx] += left;\n    left = 0;\n  }\n  __syncthreads();\n  if (tx >= 4) {\n    left = val[tx - 4];\n    val[tx] += left;\n    left = 0;\n  }\n  __syncthreads();\n  if (tx >= 8) {\n    left = val[tx - 8];\n    val[tx] += left;\n    left = 0;\n  }\n  __syncthreads();\n\n  if (tx == SGD_FEATURE_SIZE - 1)\n    *y += val[tx];\n  __syncthreads();\n}\n\n__global__ void sgd_blk_diag_operator(float* fv, int* metadata,\n                                      float* new_ratings, int* _P_DATA2,\n                                      int* _P_DATA1, float step_size, int t2) {\n  int l;\n  int i;\n  int bx;\n  bx = blockIdx.x;\n  int tx;\n  tx = threadIdx.x;\n  int ty;\n  ty = threadIdx.y;\n  __device__ __shared__ float _P2[BLOCKSIZE];\n  __device__ __shared__ float _P3[BLOCKSIZE * SGD_FEATURE_SIZE];\n  int newVariable5;\n  float _P4[C];\n  float _P5[R];\n  // int t4;\n  // int t6;\n  // int t8;\n  int t10;\n  int t12;\n  int movie_size = metadata[2];\n  if (ty <= _P_DATA1__(t2) - _P_DATA1_(t2) - BLOCKSIZE * bx - 1)\n    newVariable5 
= _P_DATA2[_P_DATA1_(t2) + BLOCKSIZE * bx + ty];\n  if (ty <= _P_DATA1__(t2) - _P_DATA1_(t2) - BLOCKSIZE * bx - 1) {\n    for (t10 = 0; t10 <= R - 1; t10 += 1)\n      _P5[R * newVariable5 + t10 - R * newVariable5] =\n          fv[(R * newVariable5 + t10) * SGD_FEATURE_SIZE + tx];\n    for (i = 0; i <= R - 1; i += 1) {\n      for (t12 = 0; t12 <= C - 1; t12 += 1)\n        _P4[C * newVariable5 + C * t2 + t12 + 1 -\n            (C * t2 + C * newVariable5 + 1)] =\n            fv[(C * newVariable5 + C * t2 + t12 + 1) * SGD_FEATURE_SIZE + tx];\n      for (l = 0; l <= C - 1; l += 1) {\n        if (0 <= new_ratings(t2, BLOCKSIZE * bx + _P_DATA1_(t2) + ty, i, l)) {\n          _P2[ty] = -new_ratings[(BLOCKSIZE * bx + _P_DATA1_(t2) + ty) * R * C +\n                                 i * C + l];\n          _P3[tx + ty * SGD_FEATURE_SIZE] =\n              (_P5[R * newVariable5 + i - R * newVariable5] *\n               _P4[C * t2 - movie_size + 1 + C * newVariable5 + l + movie_size -\n                   (C * t2 + C * newVariable5 + 1)]);\n          segreduce_warp2(&_P2[ty], &_P3[0 + ty * SGD_FEATURE_SIZE]);\n          _P5[R * newVariable5 + i - R * newVariable5] -=\n              (step_size *\n               ((_P2[ty] * _P4[C * t2 - movie_size + 1 + C * newVariable5 + l +\n                               movie_size - (C * t2 + C * newVariable5 + 1)]) +\n                (0.05f * _P5[R * newVariable5 + i - R * newVariable5])));\n          _P4[C * t2 - movie_size + 1 + C * newVariable5 + l + movie_size -\n              (C * t2 + C * newVariable5 + 1)] -=\n              (step_size *\n               ((_P2[ty] * _P5[R * newVariable5 + i - R * newVariable5]) +\n                (0.05f * _P4[C * t2 - movie_size + 1 + C * newVariable5 + l +\n                             movie_size - (C * t2 + C * newVariable5 + 1)])));\n        } else if (new_ratings(t2, BLOCKSIZE * bx + _P_DATA1_(t2) + ty, i, l) <=\n                   -2) {\n          _P2[ty] = -new_ratings[(BLOCKSIZE * bx + _P_DATA1_(t2) + 
ty) * R * C +\n                                 i * C + l];\n          _P3[tx + ty * SGD_FEATURE_SIZE] =\n              (_P5[R * newVariable5 + i - R * newVariable5] *\n               _P4[C * t2 - movie_size + 1 + C * newVariable5 + l + movie_size -\n                   (C * t2 + C * newVariable5 + 1)]);\n          segreduce_warp2(&_P2[ty], &_P3[0 + ty * SGD_FEATURE_SIZE]);\n          _P5[R * newVariable5 + i - R * newVariable5] -=\n              (step_size *\n               ((_P2[ty] * _P4[C * t2 - movie_size + 1 + C * newVariable5 + l +\n                               movie_size - (C * t2 + C * newVariable5 + 1)]) +\n                (0.05f * _P5[R * newVariable5 + i - R * newVariable5])));\n          _P4[C * t2 - movie_size + 1 + C * newVariable5 + l + movie_size -\n              (C * t2 + C * newVariable5 + 1)] -=\n              (step_size *\n               ((_P2[ty] * _P5[R * newVariable5 + i - R * newVariable5]) +\n                (0.05f * _P4[C * t2 - movie_size + 1 + C * newVariable5 + l +\n                             movie_size - (C * t2 + C * newVariable5 + 1)])));\n        }\n      }\n      for (t12 = 0; t12 <= C - 1; t12 += 1)\n        fv[(C * newVariable5 + C * t2 + t12 + 1) * SGD_FEATURE_SIZE + tx] =\n            _P4[C * newVariable5 + C * t2 + t12 + 1 -\n                (C * t2 + C * newVariable5 + 1)];\n    }\n    for (t10 = 0; t10 <= R - 1; t10 += 1)\n      fv[(R * newVariable5 + t10) * SGD_FEATURE_SIZE + tx] =\n          _P5[R * newVariable5 + t10 - R * newVariable5];\n  }\n}\n\nstruct Timer {\n  double _start;\n  double _end;\n  Timer() : _start(0), _end(0) {}\n  void clear() { _start = _end = 0; }\n  void start() { _start = rtclock(); }\n  void stop() { _end = rtclock(); }\n  double get_time_seconds(void) { return (_end - _start); }\n\n#ifdef _WIN32\n  static double rtclock() {\n    LARGE_INTEGER tickPerSecond, tick;\n    QueryPerformanceFrequency(&tickPerSecond);\n    QueryPerformanceCounter(&tick);\n    return (tick.QuadPart * 1000000 / 
tickPerSecond.QuadPart) * 1.0e-6;\n  }\n#else\n  static double rtclock() {\n    struct timezone Tzp;\n    struct timeval Tp;\n    int stat;\n    stat = gettimeofday(&Tp, &Tzp);\n    if (stat != 0)\n      printf(\"Error return from gettimeofday: %d\", stat);\n    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);\n  }\n#endif\n};\n\n/************************************************************************\n ************************************************************************/\n///\nstruct RunStats {\n  int round;\n  int curr_step;\n  float total_time;\n  float time_per_diagonal;\n  float insp_time;\n  RunStats(int r, int s, float t, float t_p_d, float i_t) {\n    round             = r;\n    curr_step         = s;\n    total_time        = t;\n    time_per_diagonal = t_p_d;\n    insp_time         = i_t;\n  }\n  RunStats() {\n    round = curr_step = 0;\n    total_time = time_per_diagonal = insp_time = 0.0f;\n  }\n  //   fprintf(stderr, \"diag\\t%d\\t%d\\t%6.6g\\t%6.6g\\t%6.6g\\t\", round,\n  //   curr_step,total_time,total_time/(double)(m+ n -1), insp_time);\n};\nstruct StatAccumulator {\n  std::vector<RunStats> stats;\n  void push_stats(int r, int s, float t, float t_p_d, float i_t) {\n    RunStats rs(r, s, t, t_p_d, i_t);\n    stats.push_back(rs);\n    //      fprintf(stderr, \"diag#\\t%d\\t%d\\t%6.6g\\t%6.6g\\t%6.6g\\t\", r,\n    //      s,t,t_p_d, i_t);\n  }\n  void print() {\n    RunStats sum;\n    for (int i = 0; i < stats.size(); ++i) {\n      RunStats& s = stats[i];\n      sum.round += s.round;\n      sum.curr_step += s.curr_step;\n      sum.total_time += s.total_time;\n      sum.time_per_diagonal += s.time_per_diagonal;\n      sum.insp_time += s.insp_time;\n    }\n    size_t num_items = stats.size();\n    printf(\"\\nAverage time per iteration: %6.6g\\n\", sum.total_time / num_items);\n  }\n};\n///\ntemplate <typename T>\nstruct CUDAArray {\n  T* device_data;\n  T* host_data;\n  size_t _size;\n  CUDAArray(size_t s) : _size(s) {\n    host_data   = new T[_size];\n    
device_data = NULL;\n  }\n  ~CUDAArray() {\n    delete[] host_data;\n    if (device_data != NULL)\n      gpuErrchk(cudaFree(device_data));\n  }\n  void copy_to_device() {\n    gpuErrchk(cudaMemcpy(device_data, host_data, sizeof(T) * _size,\n                         cudaMemcpyHostToDevice));\n  }\n  void create_on_device() {\n    gpuErrchk(cudaMalloc(&device_data, sizeof(T) * _size));\n  }\n  void copy_to_host() {\n    gpuErrchk(cudaMemcpy(host_data, device_data, sizeof(T) * _size,\n                         cudaMemcpyDeviceToHost));\n  }\n  T* host_ptr() { return host_data; }\n  T* device_ptr() { return device_data; }\n  size_t size() { return _size; }\n};\n\n/************************************************************************\n\n ************************************************************************/\n\n// typedef float EdgeDataType;\ntypedef unsigned int EdgeDataType;\n\nstruct SGDAsynEdgeCudaFunctor {\n  typedef SGD_LC_LinearArray_Undirected_Graph<unsigned int, EdgeDataType>\n      GraphTy;\n  typedef CUDAArray<int> ArrayType;\n  typedef CUDAArray<float> FeatureArrayType;\n  ////////////////////////////////////////////////////////////\n  /************************************************\n   *\n   *************************************************/\n  StatAccumulator stats;\n  GraphTy graph;\n  std::vector<int> movies;\n  std::map<int, int> old_pos_to_new_pos;\n  std::vector<std::pair<int, int>>\n      sorted_nodes; // 1st field is the position of the node, second field is\n                    // the out_degree\n\n  std::vector<int> user_indices;\n  ArrayType* metadata;\n\n  ArrayType* index;\n  ArrayType* diag_number;\n  ArrayType* new_index;\n  ArrayType* col;\n  ArrayType* new_col;\n  FeatureArrayType* new_ratings;\n\n  // to track diagonal arrays\n  int count_of_diagonals;\n  FeatureArrayType* features;\n  FeatureArrayType* ratings;\n  float accumulated_error;\n  int round;\n  unsigned int max_rating;\n  char filename[512];\n  std::vector<int> 
user_edge_count;\n  /************************************************************************\n   *\n   *metadata (16)\n   *edge_info, worklist, ratings (2+1+1)*NE\n   *locks, features*FEATURE_SIZE, (1+FEATURE_SIZE)NN\n   ************************************************************************/\n  SGDAsynEdgeCudaFunctor(bool road, const char* p_filename) : round(0) {\n    strcpy(filename, p_filename);\n    // fprintf(stderr, \"Creating SGDAsynEdgeCudaFunctor -  features =[%d].\\n\",\n    // SGD_FEATURE_SIZE);\n    graph.read(p_filename);\n    allocate();\n    initialize();\n    printf(\"Feature size: %d\\n\", SGD_FEATURE_SIZE);\n    // printf(\"Number of movies found: %ld\\n\", movies.size());\n  }\n  /************************************************************************\n   *\n   ************************************************************************/\n  SGDAsynEdgeCudaFunctor(int num_m, int num_u) : round(0) {\n    strcpy(filename, \"generated-input\");\n    // fprintf(stderr, \"Creating SGDAsynEdgeFunctor -  features =[%d] .\\n\",\n    // SGD_FEATURE_SIZE);\n    complete_bipartitie(graph, num_m, num_u);\n    allocate();\n    initialize();\n    fprintf(stderr, \"Number of movies found :: %ld\\n\", movies.size());\n  }\n  /************************************************************************\n   *\n   ************************************************************************/\n  SGDAsynEdgeCudaFunctor(int num_m) : round(0) {\n    strcpy(filename, \"gen-diagonal-input\");\n    // fprintf(stderr, \"Creating SGDAsynEdgeFunctor -  features =[%d] .\\n\",\n    // SGD_FEATURE_SIZE);\n    diagonal_graph(graph, num_m);\n    allocate();\n    initialize();\n    fprintf(stderr, \"Number of movies found :: %ld\\n\", movies.size());\n  }\n  /************************************************************************\n   *\n   ************************************************************************/\n  void allocate() {\n    features = new FeatureArrayType(graph.num_nodes() * 
SGD_FEATURE_SIZE);\n    features->create_on_device();\n    ratings  = new FeatureArrayType(graph.num_edges());\n    metadata = new ArrayType(16);\n    metadata->create_on_device();\n    index = new ArrayType(graph.num_nodes() + 1);\n    col   = new ArrayType(graph.num_edges());\n  }\n  /************************************************************************\n   *\n   ************************************************************************/\n  void deallocate() {\n    delete features;\n    delete ratings;\n    delete metadata;\n    delete index;\n  }\n  /************************************************************************\n   *\n   ************************************************************************/\n  void copy_to_device() { features->copy_to_device(); }\n  /************************************************************************\n   *\n   ************************************************************************/\n  void copy_to_host() { features->copy_to_host(); }\n  /************************************************************************\n   *\n   ************************************************************************/\n  void initialize() {\n    {\n      int deviceCount;\n      cudaGetDeviceCount(&deviceCount);\n      int device;\n      for (device = 0; device < deviceCount; ++device) {\n        cudaDeviceProp deviceProp;\n        cudaGetDeviceProperties(&deviceProp, device);\n        // fprintf(stderr, \"Device %s (%d) : CC %d.%d, MaxThreads:%d \\n\",\n        //\t\tdeviceProp.name, device, deviceProp.major,\n        //\t\tdeviceProp.minor, deviceProp.maxThreadsPerBlock);\n      }\n    }\n    std::vector<int> all_edges;\n    initialize_features_random(graph, features, movies);\n    movies.clear();\n    unsigned int max_degree = 0;\n    // unsigned max_degree_id = 0;\n\n    for (unsigned int i = 0; i < graph.num_nodes(); ++i) {\n      for (int j = 0; j < graph.num_neighbors(i); j++) {\n        if (graph.out_neighbors(i, j) >= graph.num_nodes())\n      
    fprintf(stderr, \"error in input at %d\\n\", i);\n      }\n    }\n\n    for (unsigned int i = 0; i < graph.num_nodes(); ++i) {\n\n      sorted_nodes.push_back(std::pair<int, int>(i, graph.num_neighbors(i)));\n      if (graph.num_neighbors(i) > max_degree) {\n        max_degree = graph.num_neighbors(i);\n        // max_degree_id = i;\n      }\n      if (graph.num_neighbors(i) > 0) {\n        movies.push_back(i);\n      } else {\n        user_indices.push_back(i);\n      }\n    }\n    std::sort(sorted_nodes.begin(), sorted_nodes.end(), out_degree_compare);\n    max_rating = 0;\n    for (unsigned int i = 0; i < graph.num_edges(); ++i) {\n      max_rating = std::max(max_rating, graph.out_edge_data()[i]);\n    }\n    // fprintf(stderr, \"] , max_Rating: %d, movies: %ld, Max degree:: %d for\n    // node: %d\\n\", \t\tmax_rating, movies.size(), max_degree,\n    // max_degree_id);\n    distribute_chunks(all_edges);\n    cache_chunks(all_edges);\n  }\n  /************************************************************************\n   *\n   ************************************************************************/\n  void cache_chunks(std::vector<int>& all_edges) {\n    index->host_ptr()[0] = 0;\n    int count            = 0;\n    int user_count       = movies.size();\n\n    for (int i = 0; i < sorted_nodes.size(); i++) {\n      for (int j = 0; j < sorted_nodes[i].second; j++) {\n        int old_pos = graph.out_neighbors(sorted_nodes[i].first, j);\n\n        if (old_pos_to_new_pos.find(old_pos) != old_pos_to_new_pos.end())\n          col->host_ptr()[count] = old_pos_to_new_pos.find(old_pos)->second;\n        else {\n          col->host_ptr()[count] = user_count;\n          old_pos_to_new_pos.insert(std::pair<int, int>(old_pos, user_count));\n          user_count++;\n        }\n        ratings->host_ptr()[count++] =\n            graph.out_edge_data(sorted_nodes[i].first, j);\n      }\n      index->host_ptr()[i + 1] = count;\n    }\n\n    graph.outgoing_index()[0] = 
index->host_ptr()[0];\n    for (int i = 0; i < sorted_nodes.size(); i++) {\n      graph.outgoing_index()[i + 1] = index->host_ptr()[i + 1];\n      for (int j = index->host_ptr()[i]; j < index->host_ptr()[i + 1]; j++) {\n        graph.out_neighbors(i, j - index->host_ptr()[i]) = col->host_ptr()[j];\n        graph.out_edge_data(i, j - index->host_ptr()[i]) =\n            ratings->host_ptr()[j];\n        ratings->host_ptr()[j] /= (float)max_rating;\n      }\n    }\n  }\n  /************************************************************************\n   *\n   ************************************************************************/\n  void distribute_chunks(std::vector<int>& all_edges) {\n    std::vector<int> in_edge_wl(graph.num_edges());\n    for (size_t i = 0; i < graph.num_edges(); ++i) {\n      in_edge_wl[i] = i;\n    }\n    size_t num_edges_to_process = in_edge_wl.size();\n    int num_items               = graph.num_edges();\n    all_edges.resize(num_items);\n    memcpy(all_edges.data(), in_edge_wl.data(), num_items * sizeof(int));\n  }\n  /************************************************************************\n   *\n   ************************************************************************/\n  void operator()(int num_steps) {\n    // print_edges();\n    // print_latent();\n    // max_rating = 1;\n    copy_to_device();\n    compute_err(graph, features, max_rating);\n    for (round = 0; round < num_steps; ++round) {\n      this->gpu_operator();\n      copy_to_host();\n      float rmse = compute_err(graph, features, max_rating);\n      if (rmse < 0.1)\n        break;\n    }\n    stats.print();\n    // print_latent();\n  }\n\n  void print_latent() {\n    for (int n = 0; n < 10; n++) {\n      FeatureType* features_l = &(features->host_ptr()[n * SGD_FEATURE_SIZE]);\n      printf(\"latent(%d)[%.3f\", n, features_l[0]);\n      for (int i = 1; i < SGD_FEATURE_SIZE; i++)\n        printf(\" %.3f\", features_l[i]);\n      printf(\"]\\n\");\n    }\n  }\n  void print_edges() 
{\n    std::cout << \"edges: [\" << graph.out_edge_data()[0];\n    for (int n = 1; n < 100; n++) {\n      if (n >= graph.num_edges())\n        break;\n      std::cout << \", \" << graph.out_edge_data()[n];\n    }\n    printf(\"]\\n\");\n  }\n  /************************************************************************\n   *\n   ************************************************************************/\n  void gpu_operator() {\n    int curr_step           = 0;\n    metadata->host_ptr()[4] = graph.num_edges();\n    double total_time       = 0;\n    double insp_time        = 0;\n    metadata->host_ptr()[2] = movies.size();\n    metadata->host_ptr()[4] = 0;\n    metadata->host_ptr()[0] = user_indices.size();\n    metadata->copy_to_device();\n\n    const float step_size = SGD_STEP_SIZE(round);\n    dim3 block_size       = dim3(SGD_FEATURE_SIZE, BLOCKSIZE);\n    int num_blocks        = ceil(movies.size() / (float)BLOCKSIZE);\n    cudaError_t err;\n\n    Timer timer, timer2;\n    int iter;\n    int num_items = graph.num_edges();\n    int iter2     = 0;\n    timer2.start();\n    diag_inspector(movies.size(), user_indices.size(), index->host_ptr(),\n                   ratings->host_ptr(), col->host_ptr(), iter2);\n    timer2.stop();\n    insp_time += timer2.get_time_seconds();\n    new_col->copy_to_device();\n    new_ratings->copy_to_device();\n    new_index->copy_to_device();\n    //\t\tdiag_number->copy_to_device();\n    timer.start();\n\n    int non_zero_blk_diags = 0;\n    for (iter = 0; iter < count_of_diagonals; iter++) {\n      if (new_index->host_ptr()[iter + 1] - new_index->host_ptr()[iter] > 0) {\n        non_zero_blk_diags++;\n        num_blocks =\n            (new_index->host_ptr()[iter + 1] - new_index->host_ptr()[iter]) %\n                        (BLOCKSIZE) ==\n                    0\n                ? 
(new_index->host_ptr()[iter + 1] -\n                   new_index->host_ptr()[iter]) /\n                      (BLOCKSIZE)\n                :\n\n                (new_index->host_ptr()[iter + 1] -\n                 new_index->host_ptr()[iter]) /\n                        (BLOCKSIZE) +\n                    1;\n        // std::cout << \"num_blocks = \" << num_blocks << \"block_size = \" <<\n        // SGD_FEATURE_SIZE * BLOCKSIZE << \"\\n\";\n        sgd_blk_diag_operator<<<num_blocks, block_size>>>(\n            features->device_ptr(), metadata->device_ptr(),\n            new_ratings->device_ptr(), new_col->device_ptr(),\n            new_index->device_ptr(), step_size, iter);\n      }\n    }\n\n    cudaDeviceSynchronize();\n    timer.stop();\n\n    total_time += timer.get_time_seconds();\n    if ((err = cudaGetLastError()) != cudaSuccess) {\n      fprintf(stderr, \"aborted %s \\n\", cudaGetErrorString(err));\n      exit(-1);\n    }\n\n    metadata->copy_to_host();\n    // fprintf(stderr, \"blk_diag: round %d curr_step %d total_time %.3f\n    // per_diag_time %6.3g insp_time %.3f\\t\", round, \t\tcurr_step,\n    // total_time,\n    // total_time / (double) count_of_diagonals, insp_time);\n    printf(\"round %d: total_time %.3f\\t\", round, total_time);\n    stats.push_stats(round, curr_step, total_time,\n                     total_time / (double)count_of_diagonals, insp_time);\n\n    delete new_ratings;\n    delete new_index;\n    delete new_col;\n  }\n\n  int diag_inspector(int movies, int users, int* index, float* a, int* col,\n                     int iter) {\n\n    int t6;\n    int t4;\n    int t2;\n    int newVariable4;\n    int newVariable3;\n    int newVariable2;\n    struct a_list* _P_DATA4;\n    int newVariable1;\n    int newVariable0;\n    struct mk* _P_DATA3;\n    struct a_list** _P1;\n    int chill_count_1;\n    int* _P_DATA1;\n    int _t31;\n    int _t34;\n    /*\n    int t8;\n    int *_P_DATA2;\n    int chill_count_0;\n    int _t39;\n    int _t38;\n    int 
_t37;\n    int In_3;\n    int In_2;\n    int In_1;\n    int _t36;\n    int _t35;\n    int _t33;\n    int _t32;\n    int _t30;\n    int _t29;\n    int _t28;\n    int _t27;\n    int _t25;\n    int _t26;\n    int _t24;\n    int _t23;\n    int _t22;\n    int _t21;\n    int _t20;\n    int _t19;\n    int _t18;\n    int _t17;\n    int _t16;\n    int _t15;\n    int _t14;\n    int _t12;\n    int _t11;\n    int _t10;\n    int _t9;\n    int _t7;\n    int _t6;\n    int _t5;\n    int _t4;\n    int l;\n    int _t3;\n    int _t2;\n    int _t1;\n    int i;\n    int j;\n    int k;\n    */\n    _P_DATA1      = (int*)malloc(sizeof(int) * (users / C + movies / R));\n    _P1           = (struct a_list**)malloc(sizeof(struct a_list*) *\n                                  (users / C + movies / R - 1));\n    _P_DATA1[0]   = 0;\n    _P_DATA3      = ((\n        struct mk*)(malloc(sizeof(struct mk) * (users / C + movies / R - 1))));\n    chill_count_1 = 0;\n    _P_DATA1[0]   = 0;\n    for (_t31 = 0; _t31 <= users / C + movies / R - 2; _t31 += 1) {\n      _P1[1 * _t31]          = 0;\n      _P_DATA1[1 * _t31 + 1] = 0;\n    }\n    for (t2 = 0; t2 <= movies / R - 1; t2 += 1) {\n      for (t4 = 0; t4 <= R - 1; t4 += 1)\n        for (t6 = index_(R * t2 + t4); t6 <= index__(R * t2 + t4) - 1;\n             t6 += 1) {\n          _t31 = (col_(t6) - movies + R * (movies / R - 1)) / C - t2;\n          _P_DATA3[_t31].ptr = 0;\n        }\n      for (t4 = 0; t4 <= R - 1; t4 += 1)\n        for (t6 = index_(R * t2 + t4); t6 <= index__(R * t2 + t4) - 1;\n             t6 += 1) {\n          _t31 = (col_(t6) - movies + R * (movies / R - 1)) / C - t2;\n          _t34 = (col_(t6) - movies + R * (movies / R - 1)) % C;\n          if (_P_DATA3[_t31].ptr == 0) {\n            _P_DATA4 = ((struct a_list*)(malloc(sizeof(struct a_list) * 1)));\n            _P_DATA4->next     = _P1[_t31];\n            _P1[_t31]          = _P_DATA4;\n            _P_DATA3[_t31].ptr = _P1[_t31];\n            for (newVariable0 = 0; newVariable0 
<= R - 1; newVariable0 += 1)\n              for (newVariable1 = 0; newVariable1 <= C - 1; newVariable1 += 1)\n                _P_DATA3[_t31]\n                    .ptr->ratings[C * newVariable0 + 1 * newVariable1] = -1;\n            _P_DATA3[_t31].ptr->col_[0] = t2;\n            chill_count_1 += 1;\n            _P_DATA1[_t31 + 1] += 1;\n          }\n          _P_DATA3[_t31].ptr->ratings[C * t4 + 1 * _t34] = a[t6];\n        }\n    }\n\n    new_col     = new ArrayType(chill_count_1);\n    new_index   = new ArrayType(users / C + movies / R);\n    new_ratings = new FeatureArrayType(chill_count_1 * R * C);\n    new_col->create_on_device();\n    new_index->create_on_device();\n    new_ratings->create_on_device();\n    new_index->host_ptr()[0] = 0;\n    for (t2 = 0; t2 <= users / C + movies / R - 2; t2 += 1) {\n      for (newVariable2 = 1 - _P_DATA1[1 * t2 + 1]; newVariable2 <= 0;\n           newVariable2 += 1) {\n        new_col->host_ptr()[_P_DATA1[1 * t2] - newVariable2] =\n            _P1[1 * t2]->col_[0];\n        for (newVariable3 = 0; newVariable3 <= R - 1; newVariable3 += 1)\n          for (newVariable4 = 0; newVariable4 <= C - 1; newVariable4 += 1)\n            new_ratings->host_ptr()[R * C * (_P_DATA1[1 * t2] - newVariable2) +\n                                    C * newVariable3 + 1 * newVariable4] =\n                _P1[1 * t2]->ratings[C * newVariable3 + 1 * newVariable4];\n        _P_DATA4 = _P1[1 * t2]->next;\n        free(_P1[1 * t2]);\n        _P1[1 * t2] = _P_DATA4;\n      }\n      _P_DATA1[1 * t2 + 1] += _P_DATA1[1 * t2];\n      new_index->host_ptr()[t2 + 1] = _P_DATA1[t2 + 1];\n    }\n\n    count_of_diagonals = users / C + movies / R - 1;\n    free(_P_DATA1);\n    free(_P_DATA3);\n    free(_P1);\n    return chill_count_1;\n  }\n\n  /************************************************************************\n   *\n   ************************************************************************/\n  ~SGDAsynEdgeCudaFunctor() {\n    deallocate();\n    // 
fprintf(stderr, \"Destroying SGDAsynEdgeCudaFunctor object.\\n\");\n  }\n};\n//###################################################################//\n\n#endif /* GALOISGPU_APPS_SGD_CUDA_SGDASYNCEDGECU_H_ */\n"
  },
  {
    "path": "lonestar/analytics/gpu/matrixcompletion/SGDCommonCu.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef SGDCOMMON_CU_H_\n#define SGDCOMMON_CU_H_\n\n#include <assert.h>\n#define _SGD_USE_SHARED_MEM_ 1\n#define SGD_FEATURE_SIZE 16\n\n#ifdef _WIN32\ntemplate <typename T>\nbool isnormal(const T&) {\n  return true;\n}\n#else\nusing namespace std;\n\nconst float SGD_LAMBDA        = 0.05f;\nconst float SGD_LEARNING_RATE = 0.012f;\nconst float SGD_DECAY_RATE    = 0.015f;\nconst int SGD_MAX_ROUNDS      = 5;\ntypedef float FeatureType;\n\n/************************************************\n *\n *************************************************/\nfloat SGD_STEP_SIZE(int X) {\n  return SGD_LEARNING_RATE * 1.5f / (1.0f + SGD_DECAY_RATE * pow(X + 1, 1.5f));\n} // Purdue.\n//#define SGD_STEP_SIZE(X) (0.001f *1.5f/(1.0+0.9* pow(X+1,1.5))) //Intel.\n/************************************************\n *\n 
*************************************************/\nfloat sum_vector(const float* a) {\n  float res = 0.0f;\n  for (int i = 0; i < SGD_FEATURE_SIZE; ++i)\n    res += a[i];\n  return res;\n}\n/************************************************\n *\n *************************************************/\ntemplate <typename T>\nT dot_product(const T* a, const T* b) {\n  T res = 0.0f;\n  for (int i = 0; i < SGD_FEATURE_SIZE; i++) {\n    assert((a[i] == 0 || isnormal(a[i]) == true));\n    assert((b[i] == 0 || isnormal(b[i]) == true));\n    res += a[i] * b[i];\n  }\n  return res;\n}\n/************************************************\n *\n *************************************************/\nfloat toMB(long val) { return val / (float)(1024 * 1024); }\n/************************************************\n *\n *************************************************/\nstruct DebugData {\n  struct NodeStats {\n    int max_rating;\n    int min_rating;\n    int sum_rating;\n    int count_rating;\n    int my_degree;\n    bool is_movie;\n    NodeStats() {\n      max_rating = sum_rating = count_rating = 0;\n      min_rating                             = std::numeric_limits<int>::max();\n      my_degree                              = 0;\n      is_movie                               = false;\n    }\n    void stat(int val) {\n      max_rating = std::max(max_rating, val);\n      min_rating = std::min(min_rating, val);\n      sum_rating += (val);\n      count_rating++;\n    }\n  };\n  std::vector<std::pair<int, int>> user_degrees;\n  std::vector<std::pair<int, int>> movie_degrees;\n  std::map<int, int> user_map;\n  std::map<int, int> movie_map;\n};\n/************************************************\n *\n *************************************************/\n\ntemplate <typename GraphTy>\nstatic void write_stats_to_file(GraphTy& graph) {\n  // 0 Write graph as csv to file:\n  {\n    std::ofstream out_file(\"/workspace/rashid/bgg.csv\");\n    out_file << \"Src,Dst,Wt\\n\";\n    for (size_t i = 0; i < 
graph.num_edges(); ++i) {\n      out_file << graph.get_edge_src(i) << \",\" << graph.out_neighbors()[i]\n               << \",\" << graph.out_edge_data()[i] << \"\\n\";\n    }\n    out_file.close();\n  }\n  //      return;\n  // 3 Write average-user degree per-movie\n  {\n    int max_user_degree  = 0;\n    int max_movie_degree = 0;\n    std::vector<DebugData::NodeStats> all_nodes_stats(graph.num_nodes());\n    std::vector<int> movie_indices;\n    std::vector<int> user_indices;\n    {\n      for (size_t i = 0; i < graph.num_edges(); ++i) {\n        int src    = graph.get_edge_src(i);\n        int dst    = graph.out_neighbors()[i];\n        int rating = graph.out_edge_data()[i];\n        all_nodes_stats[src].my_degree++;\n        all_nodes_stats[dst].my_degree++;\n        all_nodes_stats[src].is_movie = true;\n        all_nodes_stats[src].stat(rating);\n        all_nodes_stats[dst].stat(rating);\n        movie_indices.push_back(src);\n        user_indices.push_back(dst);\n        max_movie_degree =\n            std::max(max_movie_degree, all_nodes_stats[src].my_degree);\n        max_user_degree =\n            std::max(max_user_degree, all_nodes_stats[dst].my_degree);\n      }\n    }\n    {\n      /*std::sort(debug_data.user_degrees.begin(),\n       debug_data.user_degrees.end(), [](const std::pair<int, int>& lhs, const\n       std::pair<int, int>& rhs) { return lhs.second > rhs.second;});\n       std::sort(debug_data.movie_degrees.begin(),\n       debug_data.movie_degrees.end(), [](const std::pair<int, int>& lhs, const\n       std::pair<int, int>& rhs) { return lhs.second > rhs.second;});\n       */\n    }\n    std::vector<DebugData::NodeStats> user_per_degree_stats(max_user_degree +\n                                                            1);\n    std::vector<DebugData::NodeStats> movie_per_degree_stats(max_movie_degree +\n                                                             1);\n    long sum_ratings = 0;\n    for (size_t i = 0; i < graph.num_edges(); 
++i) {\n      int m      = graph.get_edge_src(i);\n      int u      = graph.out_neighbors()[i];\n      int rating = graph.out_edge_data()[i];\n      int m_d    = all_nodes_stats[m].my_degree;\n      int u_d    = all_nodes_stats[u].my_degree;\n      assert(all_nodes_stats[m].is_movie == true &&\n             all_nodes_stats[u].is_movie == false);\n      user_per_degree_stats.at(u_d).stat(rating);\n      movie_per_degree_stats.at(m_d).stat(rating);\n      sum_ratings += rating;\n    }\n    std::cout << \"Sizes:: \" << movie_indices.size() << \", \"\n              << user_indices.size() << \"\\n\";\n    std::cout << \"Max-degree:: \" << max_movie_degree << \", \" << max_user_degree\n              << \"\\n\";\n    std::cout << \"Average rating:: \" << sum_ratings / (float)(graph.num_edges())\n              << \"\\n\";\n    {\n      std::ofstream out_file_u(\"/workspace/rashid/user_stats.csv\");\n      out_file_u << \"Id,Degree,NumNodes,Min,Max,Sum\\n\";\n      for (size_t i = 0; i < user_indices.size(); ++i) {\n        int index = user_indices[i];\n        assert(all_nodes_stats[index].is_movie == false);\n        out_file_u << i << \",\" << all_nodes_stats[index].count_rating << \",\"\n                   << all_nodes_stats[index].min_rating << \",\"\n                   << all_nodes_stats[index].max_rating << \",\"\n                   << all_nodes_stats[index].sum_rating << \"\\n\";\n      }\n      out_file_u.close();\n    }\n    {\n      std::ofstream out_file_u(\"/workspace/rashid/movie_stats.csv\");\n      out_file_u << \"Id,Degree,NumNodes,Min,Max,Sum\\n\";\n      for (size_t i = 0; i < movie_indices.size(); ++i) {\n        int index = movie_indices[i];\n        assert(all_nodes_stats[index].is_movie == true);\n        out_file_u << i << \",\" << all_nodes_stats[index].count_rating << \",\"\n                   << all_nodes_stats[index].min_rating << \",\"\n                   << all_nodes_stats[index].max_rating << \",\"\n                   << 
all_nodes_stats[index].sum_rating << \"\\n\";\n      }\n      out_file_u.close();\n    }\n    {\n      std::ofstream out_file_u(\"/workspace/rashid/bgg_user_average_degree.csv\");\n      out_file_u << \"Degree,NumNodes,Min,Max,Sum\\n\";\n      for (int i = 0; i < max_user_degree; ++i) {\n        if (user_per_degree_stats[i].count_rating > 0)\n          out_file_u << i << \",\" << user_per_degree_stats[i].count_rating << \",\"\n                     << user_per_degree_stats[i].min_rating << \",\"\n                     << user_per_degree_stats[i].max_rating << \",\"\n                     << user_per_degree_stats[i].sum_rating << \"\\n\";\n      }\n      out_file_u.close();\n    }\n    {\n      std::ofstream out_file_m(\n          \"/workspace/rashid/bgg_movie_average_degree.csv\");\n      out_file_m << \"Degree,NumNodes,Min,Max,Sum\\n\";\n      for (int i = 0; i < max_movie_degree; ++i) {\n        if (movie_per_degree_stats[i].count_rating > 0)\n          out_file_m << i << \",\" << movie_per_degree_stats[i].count_rating\n                     << \",\" << movie_per_degree_stats[i].min_rating << \",\"\n                     << movie_per_degree_stats[i].max_rating << \",\"\n                     << movie_per_degree_stats[i].sum_rating << \"\\n\";\n      }\n      out_file_m.close();\n    }\n  }\n  // 4 Write average-movie degree per-user\n  std::cout << \"Done writing debug info...\\n\";\n  exit(-1);\n}\n/************************************************\n *\n ************************************************/\ntemplate <typename GraphType, typename FeatureArrayType>\nfloat compute_err(GraphType& graph, FeatureArrayType* features,\n                  int max_rating) {\n  int fail_count = 0;\n  float sum      = 0;\n  for (unsigned int i = 0; i < features->size(); ++i) {\n    float f = features->host_ptr()[i];\n    sum += f;\n    if ((f != 0 && isnormal(f) == false)) {\n      fail_count++;\n    }\n  }\n  // fprintf(stderr, \"Failed:: %6.6g,Sum, %6.6g \", fail_count / (float)\n  
// (features->size()), sum);\n  float accumulated_error = 0.0f;\n  float max_err           = 0.0f;\n  for (unsigned int i = 0; i < graph.num_edges(); ++i) {\n    int src      = graph.get_edge_src(i);\n    int dst      = graph.out_neighbors()[i];\n    float rating = graph.out_edge_data()[i] / (float)max_rating;\n    // if(src <0 || src >= graph.num_nodes() || dst < 0 || dst >=\n    // graph.num_nodes())\n    //     fprintf(stderr, \" error at src %d and dst %d\\n\", src, dst);\n    float computed_rating =\n        dot_product(&features->host_ptr()[src * SGD_FEATURE_SIZE],\n                    &features->host_ptr()[dst * SGD_FEATURE_SIZE]);\n    float err = (computed_rating - rating);\n    max_err   = std::max((double)max_err, (double)fabs(err));\n    accumulated_error += err * err;\n  }\n  accumulated_error /= (float)graph.num_edges();\n  //   float rms = std::sqrt((float) accumulated_error);\n  float rms = sqrt((float)accumulated_error);\n  // fprintf(stderr, \"Average_error %.3f , max_error %.3f, RMS %.5f \\n\",\n  // accumulated_error, max_err, rms);\n  printf(\"RMS %.5f\\n\", rms);\n  return rms;\n}\n/************************************************************************\n *\n ************************************************************************/\ntemplate <typename GraphType, typename FeatureArrayType, typename LockType>\nvoid initialize_features_random(GraphType& graph, FeatureArrayType* features,\n                                LockType* locks, std::vector<int>& movies) {\n  using namespace std;\n\n  FeatureType top = 1.0 / sqrt(SGD_FEATURE_SIZE);\n  //   uniform_real_distribution<FeatureType> dist(0, top);\n  //   mt19937 gen;\n  /*      std::uniform_real_distribution<FeatureType> dist(-1.0f, 1.0f);*/\n  FeatureType feature_sum = 0.0f, min_feature = top, max_feature = -top;\n  // For each node, initialize features to random, and lock to be unlocked.\n  for (unsigned int i = 0; i < graph.num_nodes(); ++i) {\n    locks->host_ptr()[i] = -1;\n    
FeatureType* features_l =\n        &(features\n              ->host_ptr()[i *\n                           SGD_FEATURE_SIZE]); // graph.node_data()[i].features;\n    for (int j = 0; j < SGD_FEATURE_SIZE; ++j) {\n      (features_l[j] = rand() / (float)std::numeric_limits<int>::max());\n      feature_sum += (features_l[j] = features_l[j] * top);\n      max_feature = std::max(features_l[j], max_feature);\n      min_feature = std::min(features_l[j], min_feature);\n      assert(isnormal(features_l[j]) || features_l[j] == 0);\n    }\n    if (graph.num_neighbors(i) > 0)\n      movies.push_back(i);\n  }\n  // std::cout << \"initial features:: \" << feature_sum << \" , [\" << min_feature\n  // << \" , \" << max_feature;\n}\n/************************************************************************\n *\n ************************************************************************/\ntemplate <typename GraphType, typename FeatureArrayType>\nvoid initialize_features_random(GraphType& graph, FeatureArrayType* features,\n                                std::vector<int>& movies) {\n  using namespace std;\n\n  FeatureType top         = 1.0 / sqrt(SGD_FEATURE_SIZE);\n  FeatureType feature_sum = 0.0f, min_feature = top, max_feature = -top;\n  // For each node, initialize features to random, and lock to be unlocked.\n  for (unsigned int i = 0; i < graph.num_nodes(); ++i) {\n    FeatureType* features_l =\n        &(features\n              ->host_ptr()[i *\n                           SGD_FEATURE_SIZE]); // graph.node_data()[i].features;\n    for (int j = 0; j < SGD_FEATURE_SIZE; j++) {\n      (features_l[j] = rand() / (float)std::numeric_limits<int>::max());\n      feature_sum += (features_l[j] = features_l[j] * top);\n      max_feature = std::max(features_l[j], max_feature);\n      min_feature = std::min(features_l[j], min_feature);\n      assert(isnormal(features_l[j]) || features_l[j] == 0);\n    }\n    if (graph.num_neighbors(i) > 0)\n      movies.push_back(i);\n  }\n  // std::cout << 
\"initial features:: \" << feature_sum << \" , [\" << min_feature\n  // << \" , \" << max_feature; std::cout << \"initial features: feature_sum \" <<\n  // feature_sum << \" min_feature \" << min_feature << \" max_feature \" <<\n  // max_feature << \"\\n\";\n}\n/************************************************************************\n *\n ************************************************************************/\n/************************************************\n *\n *************************************************/\ntemplate <typename GraphType>\nvoid diagonal_graph(GraphType& g, int num_nodes) {\n  g.init(2 * num_nodes, num_nodes);\n  for (int i = 0; i < num_nodes; ++i) {\n    g.outgoing_index()[i] = i;\n    g.get_edge_src()[i]   = i;\n    g.out_neighbors()[i]  = i + num_nodes;\n    g.out_edge_data()[i]  = 3;\n  }\n  for (int i = num_nodes; i < 2 * num_nodes; ++i) {\n    g.outgoing_index()[i] = num_nodes;\n  }\n  g.outgoing_index()[2 * num_nodes] = num_nodes;\n\n} // End complete_bipartitie\n/************************************************\n *\n *************************************************/\ntemplate <typename GraphType>\nvoid complete_bipartitie(GraphType& g, int num_movies, int num_users) {\n  g.init(num_movies + num_users, num_users * num_movies);\n  int index = 0;\n  for (int i = 0; i < num_movies; ++i) {\n    g.outgoing_index()[i] = index;\n    for (int j = 0; j < num_users; ++j) {\n      g.get_edge_src()[index + j] = i;\n      //         g.out_neighbors()[index + j] = num_movies + ((j + i) %\n      //         num_users);\n      g.out_neighbors()[index + j] = num_movies + j;\n      g.out_edge_data()[index + j] = 3;\n    }\n    index += num_users;\n  }\n  for (int i = num_movies; i < num_movies + num_users; ++i) {\n    g.outgoing_index()[i] = index;\n  }\n  g.outgoing_index()[num_movies + num_users] = index;\n\n  if (false) {\n    std::ofstream out_file(\"gen_graph.csv\");\n    for (int i = 0; i < num_movies; ++i) {\n      for (size_t nbr_idx = 
g.outgoing_index()[i];\n           nbr_idx < g.outgoing_index()[i + 1]; ++nbr_idx) {\n        out_file << g.out_neighbors()[nbr_idx] << \",\";\n      }\n      out_file << \"\\n\";\n    }\n    out_file.close();\n  }\n} // End complete_bipartitie\n/************************************************\n *\n *************************************************/\ntemplate <typename GraphType>\nvoid compute_err(GraphType& graph) {\n  int fail_count    = 0;\n  float sum         = 0;\n  float sum_ratings = 0;\n  for (unsigned int i = 0; i < graph.num_nodes(); ++i) {\n    for (int idx = 0; idx < SGD_FEATURE_SIZE; ++idx) {\n      float f = graph.node_data()[i].features[idx];\n      sum += f;\n      if ((f != 0 && isnormal(f) == false)) {\n        fail_count++;\n      }\n    }\n  }\n  // fprintf(stderr, \"Failed:: %6.6g,Sum, %6.6g \", fail_count / (float)\n  // (graph.num_nodes()*SGD_FEATURE_SIZE), sum);\n  float accumulated_error = 0.0f;\n  float max_err           = 0.0f;\n  typedef typename GraphType::NodeDataType NodeDataType;\n  NodeDataType* features = graph.node_data();\n  for (unsigned int i = 0; i < graph.num_edges(); ++i) {\n    unsigned int src = graph.out_edge_src()[i];\n    unsigned int dst = graph.out_neighbors()[i];\n    float rating     = graph.out_edge_data()[i];\n    sum_ratings += rating;\n    float computed_rating =\n        dot_product(features[src], features[dst], graph.num_nodes());\n    float err = (computed_rating - rating);\n    max_err   = std::max((double)max_err, (double)fabs(err));\n    accumulated_error += err * err;\n  }\n  accumulated_error /= (float)graph.num_edges();\n  float rms = sqrt((float)accumulated_error);\n  // fprintf(stderr, \"Average_error, %6.6f , max_error, %6.6f, RMS, %6.6f ,\n  // RatingsSum, %6.6g\\n\", accumulated_error, max_err, rms,sum_ratings);\n}\n/************************************************\n *\n *************************************************/\n\n#endif // OpenCL.\n#endif /* SGDCOMMON_H_ */\n"
  },
  {
    "path": "lonestar/analytics/gpu/matrixcompletion/SGDGraphCu.h",
    "content": "/*\n * SGDGraphCu.h\n *\n *  Created on: Nov 12, 2014\n *      Author: rashid\n */\n#include <iostream>\n#include <fstream>\n#include <cstring>\n#include <algorithm>\n#include <string>\n#include <vector>\n#include <map>\n#include <limits>\n#include <math.h>\n#include <fstream>\n#include <string>\n#include <iostream>\n#include <limits>\n#include <stdio.h>\n#include <cassert>\n#ifdef _WIN32\n#include <fcntl.h>\n#include <sys/types.h>\n#include <sys/stat.h>\n#include <io.h>\n#include <stdio.h>\n#else\n#include <unistd.h>\n#include <sys/mman.h>\n#endif\n#include <sys/stat.h>\n#include <sys/types.h>\n#include <fcntl.h>\n\n#ifndef GALOISGPU_APPS_SGD_CUDA_SGDGRAPHCU_H_\n#define GALOISGPU_APPS_SGD_CUDA_SGDGRAPHCU_H_\n\n#ifdef __APPLE__\n#include <libkern/OSByteOrder.h>\n#define le64toh(x) (x) // OSSwapLittleToHostInt64(x)\n#define le32toh(x) (x) //  OSSwapLittleToHostInt32(x)\n#elif __FreeBSD__\n#include <sys/endian.h>\n#elif __linux__\ntypedef ulong uint64_t;\ntypedef uint uint32_t;\n#include <endian.h>\n#ifndef le64toh\n#if __BYTE_ORDER == __LITTLE_ENDIAN\n#define le64toh(x) (x)\n#define le32toh(x) (x)\n#else\n#define le64toh(x) __bswap_64(x)\n#define le32toh(x) __bswap_32(x)\n#endif\n#endif\n#else\n#endif\n\n/*\n * LC_LinearArray_Undirected_Graph.h\n *\n *  Created on: Oct 24, 2013\n *  Single array representation, has outgoing edges.\n *      Author: rashid\n */\n\ntemplate <typename NodeDataTy, typename EdgeDataTy>\nstruct SGD_LC_LinearArray_Undirected_Graph {\n  // Are you using gcc/4.7+ Error on line below for earlier versions.\n  typedef NodeDataTy NodeDataType;\n  typedef EdgeDataTy EdgeDataType;\n  typedef unsigned int NodeIDType;\n  typedef unsigned int EdgeIDType;\n  size_t _num_nodes;\n  size_t _num_edges;\n  unsigned int _max_degree;\n  const size_t SizeEdgeData;\n  const size_t SizeNodeData;\n  int* gpu_graph;\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  
/////////////////////////////////////////////////////////////////////////////////////////////\n  SGD_LC_LinearArray_Undirected_Graph()\n      : SizeEdgeData(sizeof(EdgeDataType) / sizeof(unsigned int)),\n        SizeNodeData(sizeof(NodeDataType) / sizeof(unsigned int)) {\n    _max_degree = _num_nodes = _num_edges = 0;\n    gpu_graph                             = 0;\n  }\n  void read(const char* filename) {\n    readFromGR(filename);\n    for (unsigned int i = 0; i < num_nodes(); ++i) {\n      for (unsigned int e = outgoing_index()[i]; e < outgoing_index()[i + 1];\n           ++e) {\n        get_edge_src()[e] = i;\n      }\n    }\n  }\n  unsigned inline readFromGR(const char* file) {\n    std::ifstream cfile;\n    cfile.open(file);\n\n    // copied from GaloisCpp/trunk/src/FileGraph.h\n    int masterFD = open(file, O_RDONLY);\n    if (masterFD == -1) {\n      printf(\"FileGraph::structureFromFile: unable to open %s.\\n\", file);\n      abort();\n    }\n\n    struct stat buf;\n    int f = fstat(masterFD, &buf);\n    if (f == -1) {\n      printf(\"FileGraph::structureFromFile: unable to stat %s.\\n\", file);\n      abort();\n    }\n    size_t masterLength = buf.st_size;\n\n    int _MAP_BASE = MAP_PRIVATE;\n    //#ifdef MAP_POPULATE\n    //  _MAP_BASE  |= MAP_POPULATE;\n    //#endif\n\n    void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0);\n    if (m == MAP_FAILED) {\n      m = 0;\n      printf(\"FileGraph::structureFromFile: mmap failed.\\n\");\n      abort();\n    }\n\n    // parse file\n    uint64_t* fptr                           = (uint64_t*)m;\n    __attribute__((unused)) uint64_t version = le64toh(*fptr++);\n    assert(version == 1);\n    __attribute__((unused)) uint64_t sizeEdgeTy = le64toh(*fptr++);\n    uint64_t numNodes                           = le64toh(*fptr++);\n    uint64_t numEdges                           = le64toh(*fptr++);\n    uint64_t* outIdx                            = fptr;\n    fptr += numNodes;\n    uint32_t* fptr32 = 
(uint32_t*)fptr;\n    uint32_t* outs   = fptr32;\n    fptr32 += numEdges;\n    if (numEdges % 2)\n      fptr32 += 1;\n    unsigned* edgeData = (unsigned*)fptr32;\n\n    _num_nodes = numNodes;\n    _num_edges = numEdges;\n    std::cout << \"num_nodes: \" << _num_nodes << \", num_edges: \" << _num_edges\n              << \"\\n\";\n    init(_num_nodes, _num_edges);\n    // node_data\n    memset(node_data(), 0, sizeof(unsigned int) * _num_nodes);\n    for (unsigned int i = 0; i < _num_edges; ++i) {\n      out_neighbors()[i] = le32toh(outs[i]);\n    }\n    outgoing_index()[0] = 0;\n    for (unsigned int i = 0; i < _num_nodes; ++i) {\n      outgoing_index()[i + 1] = le32toh(outIdx[i]);\n    }\n    unsigned int start        = 0;\n    unsigned int displacement = 0;\n    for (unsigned int i = 0; i < _num_nodes; ++i) {\n      unsigned int end = le32toh(outIdx[i]);\n      for (unsigned int idx = start; idx < end; ++idx) {\n        // node i's idx neighbor is to be populated here.\n        out_edge_data()[displacement] = le32toh(edgeData[idx]);\n        // out_edge_data()[displacement] = 1;\n        out_neighbors()[displacement] = le32toh(outs[idx]);\n        displacement++;\n      }\n      start = end;\n    }\n    /*   for (size_t i = 0; i < g._num_nodes; ++i)\n          g.node_data()[i] = std::numeric_limits<unsigned int>::max() / 2;*/\n    cfile.close();\n    update_in_neighbors();\n    return 0;\n  }\n\n  NodeDataType* node_data() { return (NodeDataType*)gpu_graph + 4; }\n  unsigned int* outgoing_index() {\n    return (unsigned int*)(node_data()) + _num_nodes * SizeNodeData;\n  }\n  unsigned int outgoing_index(const int idx) const {\n    return ((unsigned int*)(gpu_graph + 4) + _num_nodes * SizeNodeData)[idx];\n  }\n  unsigned int* out_neighbors() {\n    return (unsigned int*)outgoing_index() + _num_nodes + 1;\n  }\n  EdgeDataType* out_edge_data() {\n    return (EdgeDataType*)(unsigned int*)(out_neighbors()) + _num_edges;\n  }\n  EdgeDataType& out_edge_data(unsigned int 
node_id, unsigned int nbr_id) {\n    return ((EdgeDataType*)out_edge_data())[outgoing_index()[node_id] + nbr_id];\n  }\n  unsigned int& out_neighbors(unsigned int node_id, unsigned int nbr_id) {\n    return ((unsigned int*)out_neighbors())[outgoing_index()[node_id] + nbr_id];\n  }\n  unsigned int* incoming_index() { return outgoing_index(); }\n  unsigned int* in_neighbors() { return outgoing_index(); }\n  EdgeDataType* in_edge_data() { return out_edge_data(); }\n  unsigned int* get_edge_src() {\n    return (unsigned*)out_edge_data() + _num_edges;\n  }\n  unsigned int get_edge_src(int edge_index) {\n    return get_edge_src()[edge_index];\n  }\n  unsigned int* last() {\n    return (unsigned int*)in_edge_data() + _num_edges * SizeEdgeData;\n  }\n\n  size_t num_nodes() { return _num_nodes; }\n  size_t num_edges() { return _num_edges; }\n  size_t num_neighbors(const unsigned int node_id) const {\n    return outgoing_index(node_id + 1) - outgoing_index(node_id);\n  }\n  size_t max_degree() { return _max_degree; }\n  void init(size_t n_n, size_t n_e) {\n    _num_nodes = n_n;\n    _num_edges = n_e;\n    // const int arr_size = (4 + (_num_nodes * SizeNodeData) + (_num_nodes + 1)\n    // + (_num_edges) + (_num_edges * SizeEdgeData) + (_num_edges)); std::cout\n    // << \"Allocating NN: \" << _num_nodes << \"(\" << SizeNodeData << \") , NE :\"\n    // << _num_edges << \", TOTAL:: \" << arr_size << \"\\n\"; Num_nodes, num_edges,\n    // [node_data] , [outgoing_index], [out_neighbors], [edge_data] , [src\n    // indices] fprintf(stderr, \"GraphSize :: %6.6g MB\\n\", arr_size /\n    // (float(1024\n    // * 1024)));\n    gpu_graph =\n        new int[(4 + (_num_nodes * SizeNodeData) + (_num_nodes + 1) +\n                 (_num_edges) + (_num_edges * SizeEdgeData) + (_num_edges))];\n    (gpu_graph)[0] = (int)_num_nodes;\n    (gpu_graph)[1] = (int)_num_edges;\n    (gpu_graph)[2] = (int)SizeNodeData;\n    (gpu_graph)[3] = (int)SizeEdgeData;\n    // allocate_on_gpu();\n  }\n  
/////////////////////////////////////////////////////////////////////////////////////////////\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  void print_header(void) {\n    std::cout << \"Header :: [\";\n    for (unsigned int i = 0; i < 6; ++i) {\n      std::cout << gpu_graph[i] << \",\";\n    }\n    std::cout << \"\\n\";\n    return;\n  }\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  void print_node(unsigned int idx, const char* post = \"\") {\n    if (idx < _num_nodes) {\n      std::cout << \"N-\" << idx << \"(\" << (node_data())[idx] << \")\"\n                << \" :: [\";\n      for (size_t i = (outgoing_index())[idx]; i < (outgoing_index())[idx + 1];\n           ++i) {\n        std::cout << \" \" << (out_neighbors())[i] << \"(\" << (out_edge_data())[i]\n                  << \"<\" << node_data()[out_neighbors()[i]] << \">\"\n                  << \"), \";\n      }\n      std::cout << \"]\" << post;\n    }\n    return;\n  }\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  void print_graph(void) {\n    std::cout << \"\\n====Printing graph (\" << _num_nodes << \" , \" << _num_edges\n              << \")=====\\n\";\n    for (size_t i = 0; i < _num_nodes; ++i) {\n      print_node(i);\n      std::cout << \"\\n\";\n    }\n    return;\n  }\n\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  void update_in_neighbors(void) {}\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  
/////////////////////////////////////////////////////////////////////////////////////////////\n  void print_compact(void) {\n    std::cout << \"Summary:: [\" << _num_nodes << \", \" << _num_edges << \", \"\n              << outgoing_index()[_num_nodes] << \"]\";\n    std::cout << \"\\nOut-index [\";\n    for (size_t i = 0; i < _num_nodes + 1; ++i) {\n      if (i < _num_nodes && outgoing_index()[i] > outgoing_index()[i + 1])\n        std::cout << \"**ERR**\";\n      std::cout << \" \" << outgoing_index()[i] << \",\";\n    }\n    std::cout << \"]\\nNeigh[\";\n    for (size_t i = 0; i < _num_edges; ++i) {\n      if (out_neighbors()[i] > _num_nodes)\n        std::cout << \"**ERR**\";\n      std::cout << \" \" << out_neighbors()[i] << \",\";\n    }\n    std::cout << \"]\\nEData [\";\n    for (size_t i = 0; i < _num_edges; ++i) {\n      std::cout << \" \" << out_edge_data()[i] << \",\";\n    }\n    std::cout << \"]\";\n  }\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  unsigned int verify() {\n    unsigned int* t_node_data      = node_data();\n    unsigned int* t_outgoing_index = outgoing_index();\n    unsigned int* t_neighbors      = out_neighbors();\n    unsigned int* t_out_edge_data  = out_edge_data();\n    unsigned int err_count         = 0;\n    for (unsigned int node_id = 0; node_id < _num_nodes; ++node_id) {\n      unsigned int curr_distance = t_node_data[node_id];\n      // Go over all the neighbors.\n      for (unsigned int idx = t_outgoing_index[node_id];\n           idx < t_outgoing_index[node_id + 1]; ++idx) {\n        unsigned int temp = t_node_data[t_neighbors[idx]];\n        if (curr_distance + t_out_edge_data[idx] < temp) {\n          if (err_count < 10) {\n            std::cout << \"Error :: \";\n            print_node(node_id);\n            std::cout << \" With :: \";\n            
print_node(t_neighbors[idx]);\n            std::cout << \"\\n\";\n          }\n          err_count++;\n        }\n      }\n    } // End for\n    return err_count;\n  }\n  ////////////##############################################################///////////\n  ////////////##############################################################///////////\n  unsigned int verify_in() { return 0; }\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  /////////////////////////////////////////////////////////////////////////////////////////////\n  void deallocate(void) { delete gpu_graph; }\n};\n// End LC_Graph\n/////////////////////////////////////////////////////////////////////////////////////////////\n/////////////////////////////////////////////////////////////////////////////////////////////\n#endif /* GALOISGPU_APPS_SGD_CUDA_SGDGRAPHCU_H_ */\n"
  },
  {
    "path": "lonestar/analytics/gpu/matrixcompletion/sgd.cu",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n//#define _GOPT_DEBUG 1\n#include <algorithm>\n#include <cstring>\n#include <fstream>\n#include <iostream>\n#include <limits>\n#include <map>\n#include <math.h>\n#include <set>\n#include <string>\n#include <vector>\n#include <libgen.h>\n\n#include <cuda.h>\n\n#include \"SGDAsyncEdgeCu.h\"\n\nint main(int argc, char ** args) {\n   const char * fname = \"/net/ohm/export/iss/inputs/GaloisGPU/bgg.gr\";\n   if (argc == 2)\n      fname = args[1];\n   typedef SGDAsynEdgeCudaFunctor SGDFunctorTy;\n   //fprintf(stderr, \"===============================Starting- processing %s\\n===============================\", fname);\n   SGDFunctorTy func(false, fname);\n\tfunc(5);\n   //fprintf(stderr, \"====================Terminating - processed%s================================\\n\", fname);\n   //std::cout << \"Completed 
successfully!\\n\";\n   return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/matrixcompletion/support.cu",
    "content": "\n"
  },
  {
    "path": "lonestar/analytics/gpu/pagerank/CMakeLists.txt",
    "content": "app_analy_gpu(pagerank pagerank)\nadd_test_gpu(pagerank rmat15 rmat15.out pagerank -o rmat15.out -x 100 ${BASEINPUT}/scalefree/rmat15.gr)\n"
  },
  {
    "path": "lonestar/analytics/gpu/pagerank/README.md",
    "content": "Pagerank\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\n\n PageRank is a key technique in web mining to rank the importance of web pages. In PageRank, each web page is assigned a numerical weight to begin with, and the algorithm tries to estimate the importance of the web page relative to other web pages in the hyperlinked set of pages. The key assumption is that more important web pages are likely to receive more links from other websites. More details about the problem and different solutions can be found in [1, 2].\n\n[1] https://en.wikipedia.org/wiki/PageRank\n\n[2] Whang et al. Scalable Data-driven PageRank: Algorithms, System Issues, and Lessons Learned. European Conference on Parallel Processing, 2015.\n\n This benchmark computes the PageRank of the nodes for a given input graph using a push-style, residual-based algorithm. The algorithm takes as input a graph and some constant parameters that are used in the computation. The algorithmic parameters are the following:\n\n* ALPHA: ALPHA represents the damping factor, which is the probability that a web surfer will continue browsing by clicking on the linked pages. The damping factor is generally set to 0.85 in the literature.\n* TOLERANCE: It represents a bound on the error in the computation.\n* MAX_ITER: The number of iterations to repeat the PageRank computation.\n\nINPUT\n--------------------------------------------------------------------------------\n\nTake in Galois .gr graphs. \n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/gpu/pagerank; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run default algorithm, use the following:\n\n-`$ ./pagerank-gpu -o <output-file> -t <top_ranks> -x <max_iterations> <input-graph>`\n\n-`$ ./pagerank-gpu -o outfile.txt -x 1000 road-USA.gr`\n"
  },
  {
    "path": "lonestar/analytics/gpu/pagerank/pagerank.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=True $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['wp', 'fg']) $ cc_disable=set([]) $ tb_lb=False $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nbool enable_lb = false;\ntypedef int edge_data_type;\ntypedef int node_data_type;\ntypedef float* gfloat_p;\nfloat* P_CURR ;\nfloat* P_NEXT ;\nextern const float ALPHA = 0.85;\nextern const float EPSILON = 0.000001;\nextern int MAX_ITERATIONS ;\nstatic const int __tb_gg_main_pipe_1_gpu_gb = 256;\nstatic const int __tb_pagerank_main = TB_SIZE;\nstatic const int __tb_remove_dups = TB_SIZE;\n__global__ void init_1(CSRGraph graph, float * p_curr, float * residual)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    float update;\n    index_type edge_end;\n    p_curr[node] = 1.0 - ALPHA;\n    update = 1.0/graph.getOutDegree(node);\n    edge_end = (graph).getFirstEdge((node) + 1);\n    for (index_type edge = (graph).getFirstEdge(node) + 0; edge < edge_end; edge += 1)\n    {\n      index_type dst;\n      dst = graph.getAbsDestination(edge);\n      atomicAdd(residual + dst, update);\n    }\n  }\n}\n__device__ void init_2_dev(CSRGraph graph, float * residual, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type _start_22;\n  
index_type node_end;\n  _start_22 = (out_wl).push_range((tid < ((graph).nnodes)) ? ((((graph).nnodes) - 1 - tid)/nthreads + 1) : 0);;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid, node_pos = 0; node < node_end; node_pos++, node += nthreads)\n  {\n    residual[node] *= (1.0 - ALPHA) * ALPHA;\n    (out_wl).do_push(_start_22, node_pos, node);\n  }\n}\n__global__ void init_2(CSRGraph graph, float * residual, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  init_2_dev(graph, residual, in_wl, out_wl);\n}\n__global__ void remove_dups(int * marks, Worklist2 in_wl, Worklist2 out_wl, GlobalBarrier gb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type wlnode_end;\n  index_type wlnode2_end;\n  wlnode_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    int node;\n    bool pop;\n    pop = (in_wl).pop_id(wlnode, node);\n    marks[node] = wlnode;\n  }\n  gb.Sync();\n  wlnode2_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode2 = 0 + tid; wlnode2 < wlnode2_end; wlnode2 += nthreads)\n  {\n    int node;\n    bool pop;\n    pop = (in_wl).pop_id(wlnode2, node);\n    if (marks[node] == wlnode2)\n    {\n      index_type _start_37;\n      _start_37 = (out_wl).setup_push_warp_one();;\n      (out_wl).do_push(_start_37, 0, node);\n    }\n  }\n}\n__device__ void pagerank_main_dev(CSRGraph graph, float * p_curr, float * residual, float * p_diff, bool enable_lb, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_pagerank_main;\n  index_type wlnode_end;\n  const int _NP_CROSSOVER_WP = 32;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, 
BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct empty_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  __shared__ npsTy nps ;\n  wlnode_end = roundup((*((volatile index_type *) (in_wl).dindex)), (blockDim.x));\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    int sdeg;\n    float update;\n    int node;\n    bool pop;\n    float res;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    pop = (in_wl).pop_id(wlnode, node);\n    if (pop)\n    {\n      res =atomicExch(residual + node, 0);\n      p_curr[node] += res;\n      sdeg = graph.getOutDegree(node);\n      update = res * ALPHA / sdeg;\n    }\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    __shared__ struct { float update; } _np_closure [TB_SIZE];\n    _np_closure[threadIdx.x].update = update;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(node);\n      _np.start = (graph).getFirstEdge(node);\n    }\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? 
_np.size : 0;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    if (threadIdx.x == 0)\n    {\n    }\n    __syncthreads();\n    {\n      const int warpid = threadIdx.x / 32;\n      const int _np_laneid = cub::LaneId();\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        update = _np_closure[nps.warp.src[warpid]].update;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type edge;\n          edge = _np_w_start +_np_ii;\n          {\n            index_type dst;\n            float prev;\n            dst = graph.getAbsDestination(edge);\n            prev = atomicAdd(residual + dst, update);\n            if (prev + update > EPSILON && prev < EPSILON)\n            {\n              index_type _start_57;\n              _start_57 = (out_wl).setup_push_warp_one();;\n              (out_wl).do_push(_start_57, 0, dst);\n            }\n          }\n        }\n      }\n      __syncthreads();\n    }\n\n    __syncthreads();\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    while (_np.work())\n    {\n      int _np_i =0;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      __syncthreads();\n\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type edge;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        
update = _np_closure[nps.fg.src[_np_i]].update;\n        edge= nps.fg.itvalue[_np_i];\n        {\n          index_type dst;\n          float prev;\n          dst = graph.getAbsDestination(edge);\n          prev = atomicAdd(residual + dst, update);\n          if (prev + update > EPSILON && prev < EPSILON)\n          {\n            index_type _start_57;\n            _start_57 = (out_wl).setup_push_warp_one();;\n            (out_wl).do_push(_start_57, 0, dst);\n          }\n        }\n      }\n      _np.execute_round_done(ITSIZE);\n      __syncthreads();\n    }\n    assert(threadIdx.x < __kernel_tb_size);\n    update = _np_closure[threadIdx.x].update;\n  }\n}\n__global__ void __launch_bounds__(TB_SIZE, 3) pagerank_main(CSRGraph graph, float * p_curr, float * residual, float * p_diff, bool enable_lb, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  pagerank_main_dev(graph, p_curr, residual, p_diff, enable_lb, in_wl, out_wl);\n}\nvoid gg_main_pipe_1(gfloat_p p2, gfloat_p p0, gfloat_p rp, int& iter, CSRGraph& gg, CSRGraph& hg, int MAX_ITERATIONS, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)\n{\n  {\n    pipe.out_wl().will_write();\n    init_2 <<<blocks, threads>>>(gg, rp, pipe.in_wl(), pipe.out_wl());\n    cudaDeviceSynchronize();\n    pipe.in_wl().swap_slots();\n    pipe.advance2();\n    while (pipe.in_wl().nitems())\n    {\n      pipe.out_wl().will_write();\n      pagerank_main <<<blocks, __tb_pagerank_main>>>(gg, p0, rp, p2, enable_lb, pipe.in_wl(), pipe.out_wl());\n      cudaDeviceSynchronize();\n      pipe.in_wl().swap_slots();\n      pipe.advance2();\n      iter++;\n      if (iter >= MAX_ITERATIONS)\n      {\n        break;\n      }\n    }\n  }\n}\n__global__ void __launch_bounds__(__tb_gg_main_pipe_1_gpu_gb) gg_main_pipe_1_gpu_gb(gfloat_p p2, gfloat_p p0, gfloat_p rp, int iter, CSRGraph gg, CSRGraph hg, int MAX_ITERATIONS, PipeContextT<Worklist2> pipe, int* cl_iter, bool enable_lb, 
GlobalBarrier gb)\n{\n  unsigned tid = TID_1D;\n\n  iter = *cl_iter;\n  {\n    if (tid == 0)\n      pipe.in_wl().reset_next_slot();\n    init_2_dev (gg, rp, pipe.in_wl(), pipe.out_wl());\n    pipe.in_wl().swap_slots();\n    gb.Sync();\n    pipe.advance2();\n    while (pipe.in_wl().nitems())\n    {\n      if (tid == 0)\n        pipe.in_wl().reset_next_slot();\n      pagerank_main_dev (gg, p0, rp, p2, enable_lb, pipe.in_wl(), pipe.out_wl());\n      pipe.in_wl().swap_slots();\n      gb.Sync();\n      pipe.advance2();\n      iter++;\n      if (iter >= MAX_ITERATIONS)\n      {\n        break;\n      }\n    }\n  }\n  gb.Sync();\n  if (tid == 0)\n  {\n    *cl_iter = iter;\n  }\n}\nvoid gg_main_pipe_1_wrapper(gfloat_p p2, gfloat_p p0, gfloat_p rp, int& iter, CSRGraph& gg, CSRGraph& hg, int MAX_ITERATIONS, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)\n{\n  static GlobalBarrierLifetime gg_main_pipe_1_gpu_gb_barrier;\n  static bool gg_main_pipe_1_gpu_gb_barrier_inited;\n  extern bool enable_lb;\n  static const size_t gg_main_pipe_1_gpu_gb_residency = maximum_residency(gg_main_pipe_1_gpu_gb, __tb_gg_main_pipe_1_gpu_gb, 0);\n  static const size_t gg_main_pipe_1_gpu_gb_blocks = GG_MIN(blocks.x, ggc_get_nSM() * gg_main_pipe_1_gpu_gb_residency);\n  if(!gg_main_pipe_1_gpu_gb_barrier_inited) { gg_main_pipe_1_gpu_gb_barrier.Setup(gg_main_pipe_1_gpu_gb_blocks); gg_main_pipe_1_gpu_gb_barrier_inited = true;};\n  if (enable_lb)\n  {\n    gg_main_pipe_1(p2,p0,rp,iter,gg,hg,MAX_ITERATIONS,pipe,blocks,threads);\n  }\n  else\n  {\n    int* cl_iter;\n    check_cuda(cudaMalloc(&cl_iter, sizeof(int) * 1));\n    check_cuda(cudaMemcpy(cl_iter, &iter, sizeof(int) * 1, cudaMemcpyHostToDevice));\n\n    gg_main_pipe_1_gpu_gb<<<gg_main_pipe_1_gpu_gb_blocks, __tb_gg_main_pipe_1_gpu_gb>>>(p2,p0,rp,iter,gg,hg,MAX_ITERATIONS,pipe,cl_iter, enable_lb, gg_main_pipe_1_gpu_gb_barrier);\n    check_cuda(cudaMemcpy(&iter, cl_iter, sizeof(int) * 1, cudaMemcpyDeviceToHost));\n    
check_cuda(cudaFree(cl_iter));\n  }\n}\nvoid gg_main(CSRGraph& hg, CSRGraph& gg)\n{\n  dim3 blocks, threads;\n  kernel_sizing(gg, blocks, threads);\n  t_work.init_thread_work(gg.nnodes);\n  static GlobalBarrierLifetime remove_dups_barrier;\n  static bool remove_dups_barrier_inited;\n  PipeContextT<Worklist2> pipe;\n  Shared<float> p[3] = {Shared<float> (hg.nnodes), Shared<float> (hg.nnodes), Shared<float>(hg.nnodes)};\n  Shared<float> r (hg.nnodes);\n  Shared<int> marks (hg.nnodes);\n  static const size_t remove_dups_residency = maximum_residency(remove_dups, __tb_remove_dups, 0);\n  static const size_t remove_dups_blocks = GG_MIN(blocks.x, ggc_get_nSM() * remove_dups_residency);\n  if(!remove_dups_barrier_inited) { remove_dups_barrier.Setup(remove_dups_blocks); remove_dups_barrier_inited = true;};\n  int iter = 0;\n  r.zero_gpu();\n  init_1 <<<blocks, threads>>>(gg, p[0].gpu_wr_ptr(), r.gpu_wr_ptr());\n  cudaDeviceSynchronize();\n  gfloat_p p0 =p[0].gpu_wr_ptr();\n  gfloat_p p2 =p[2].gpu_wr_ptr();\n  gfloat_p rp =r.gpu_wr_ptr();\n  pipe = PipeContextT<Worklist2>(hg.nedges);\n  gg_main_pipe_1_wrapper(p2,p0,rp,iter,gg,hg,MAX_ITERATIONS,pipe,blocks,threads);\n  printf(\"PR took %d iterations\\n\", iter);\n  P_CURR = p[0].cpu_rd_ptr();\n  P_NEXT = p[0].cpu_rd_ptr();\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/pagerank/support.cu",
    "content": "/* -*- mode: C++ -*- */\n\n#include \"gg.h\"\n#include <float.h>\n#include <stdint.h>\n\nstruct pr_value {\n  index_type node;\n  float rank;\n  inline bool operator< (const pr_value& rhs) const {\n    return rank < rhs.rank;\n  }\n};\n\n/* TODO: accept ALPHA and EPSILON */\nconst char *prog_opts = \"nt:x:\";\nconst char *prog_usage = \"[-n] [-t top_ranks] [-x max_iterations]\";\nconst char *prog_args_usage = \"\";\n\nextern float *P_CURR, *P_NEXT;\nextern const float ALPHA, EPSILON;\nextern int MAX_ITERATIONS;\n\nint NO_PRINT_PAGERANK = 0;\nint PRINT_TOP = 0;\nint MAX_ITERATIONS =  INT_MAX;\n\nint process_prog_arg(int argc, char *argv[], int arg_start) {\n   return 1;\n}\n\nvoid process_prog_opt(char c, char *optarg) {\n  if(c == 'n')\n    NO_PRINT_PAGERANK = 1;\n\n  if(c == 't') {\n    PRINT_TOP = atoi(optarg);    \n  }\n\n  if(c == 'x') {\n    MAX_ITERATIONS = atoi(optarg);\n  }\n}\n\nvoid output(CSRGraphTy &g, const char *output_file) {\n  FILE *f;\n\n  struct pr_value * pr;\n\n  pr = (struct pr_value *) calloc(g.nnodes, sizeof(struct pr_value));\n\n  if(pr == NULL) {\n    fprintf(stderr, \"Failed to allocate memory\\n\");\n    exit(1);\n  }\n\n  fprintf(stderr, \"Calculating sum ...\\n\");\n  float sum = 0;\n  for(int i = 0; i < g.nnodes; i++) {\n    pr[i].node = i;\n    pr[i].rank = P_CURR[i];\n    sum += P_CURR[i];\n  }\n\n  fprintf(stdout, \"sum: %f (%d)\\n\", sum, g.nnodes);\n\n  if(!output_file)\n    return;\n\n//  fprintf(stderr, \"Sorting by rank ...\\n\");\n//  std::stable_sort(pr, pr + g.nnodes);\n//  fprintf(stderr, \"Writing to file ...\\n\");\n\n  if(strcmp(output_file, \"-\") == 0)\n    f = stdout;\n  else\n    f = fopen(output_file, \"w\");\n\n//  check_fprintf(f, \"ALPHA %*e EPSILON %*e\\n\", FLT_DIG, ALPHA, FLT_DIG, EPSILON);\n\n  if(PRINT_TOP == 0)\n    PRINT_TOP = g.nnodes;\n\n//  check_fprintf(f, \"RANKS 1--%d of %d\\n\", PRINT_TOP, g.nnodes);\n\n  /* for(int i = 1; i <= PRINT_TOP; i++) {\n    if(NO_PRINT_PAGERANK) \n      
check_fprintf(f, \"%d %d\\n\", i, pr[g.nnodes - i].node);\n    else \n      check_fprintf(f, \"%d %d %*e\\n\", i, pr[g.nnodes - i].node, FLT_DIG, pr[g.nnodes - i].rank/sum);  \n  } */\n  for(int i = 0; i < g.nnodes; i++) {\n    if(NO_PRINT_PAGERANK) \n      check_fprintf(f, \"%d\\n\", pr[i].node);\n    else \n      check_fprintf(f, \"%d %f\\n\", pr[i].node, FLT_DIG, pr[i].rank);  \n  }\n\n  free(pr);\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/pointstoanalysis/CMakeLists.txt",
    "content": "app_analy_gpu(pta pointstoanalysis)\nadd_test_gpu(pointstoanalysis tshark tshark.out pta ${BASEINPUT}/java/pta/tshark_nodes.txt ${BASEINPUT}/java/pta/tshark_constraints_after_hcd.txt ${BASEINPUT}/java/pta/tshark_hcd.txt ${BASEINPUT}/java/pta/tshark_correct_soln_001.txt)\n"
  },
  {
    "path": "lonestar/analytics/gpu/pointstoanalysis/README.md",
    "content": "Points To Analysis\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nGiven a set of points-to constraints, the problem is to compute the points-to\ninformation for each pointer, in a flow-insensitive context-insensitive manner.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in Galois .gr graphs representing constraints.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/gpu/pointstoanalysis; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run default algorithm, use the following:\n\n-`$ ./pta <nodes-file> <constraints-file> <hcd-table> <solution-file> [TRANSFER, VERIFY]`\n-`$ ./pta tshark_nodes.txt tshark_constraints_after_hcd.txt tshark_hcd.txt tshark_correct_soln_001.txt`\n\n"
  },
  {
    "path": "lonestar/analytics/gpu/pointstoanalysis/andersen.cu",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"andersen.h\"\n#include <thrust/adjacent_difference.h>\n#include <thrust/host_vector.h>\n#include <thrust/device_vector.h>\n#include <thrust/fill.h>\n#include <thrust/sort.h>\n#include <thrust/copy.h>\n#include <thrust/scan.h>\n#include <thrust/unique.h>\n#include <thrust/functional.h>\n#include <thrust/remove.h>\n#include <thrust/transform_reduce.h>\n#include <thrust/iterator/counting_iterator.h>\n#include \"cuda_launch_config.hpp\"\n\nusing namespace thrust;\n\n__constant__ uint __storeStart__;\n__constant__ uint __loadInvStart__;\n\n/**\n *  number of variables of the input program.\n */\n__constant__ uint __numVars__;\n\n__constant__ uint* __ptsConstraints__;\n__constant__ uint __numPtsConstraints__;\n\n__constant__ uint*  __copyConstraints__;\n__constant__ uint __numCopyConstraints__;\n\n__constant__ 
uint* __loadConstraints__;\n__constant__ uint __numLoadConstraints__;\n\n__constant__ uint* __storeConstraints__;\n__constant__ uint __numStoreConstraints__;\n__device__ uint __numStore__ = 0;\n\n__constant__ uint* __gepInv__;\n__constant__ uint __numGepInv__;\n\n__constant__ uint* __size__;\n\n__constant__ uint* __initialRep__;\n__constant__ uint* __initialNonRep__;\n__constant__ uint __numInitialRep__;\n\n__constant__  uint* __nextVar__;\n\n /**\n  * Table of indexes to the information inferred by HCD.\n  * Each entry is a pair (index, index + delta) that refers to __hcdTable__ \n  */\n__constant__ uint* __hcdIndex__;\n__constant__ uint __numHcdIndex__;\n/**\n * List of pairs (y, x_0, x_(delta - 2)) where pts(*y) = pts(x_0) = ... pts(x_((delta - 2))\n * The equivalences have been detected during the offline phase of HCD, executed in the CPU\n */\n__constant__ uint* __hcdTable__;\n__constant__ uint __numHcdTable__;\n\n/**\n * Representative array\n */\n__constant__ volatile uint* __rep__; // HAS to be volatile\n\n/**\n * array of elements containing all the edges in the graph.\n */\n__constant__ volatile uint* __edges__; // HAS to be volatile\n__constant__ uint* __graph__;\n\n__constant__  uint* __lock__;\n\n__constant__ uint* __key__;\n__constant__ uint* __val__;\n__constant__ uint* __keyAux__;\n__device__ uint __numKeysCounter__ = 0;\n__device__ uint __numKeys__;\n__constant__ uint* __currPtsHead__;\n\n__device__ uint __counter__ = 0;\n__device__ uint __max__ = 0;\n__device__ uint __min__ = 0;\n\n__device__ bool __done__ = true;\n__device__ uint __error__;\n\n__device__ uint __worklistIndex0__ = 0;\n__device__ uint __worklistIndex1__ = 1;\n\nuint createTime = 0;\ndouble createTime2 = 0;\n\n//////////// utility functions for the GPU /////////\n\n__device__ uint  __errorCode__ = 0;\n__device__ uint  __errorLine__ = 0;\n__device__ char* __errorMsg__;\n\n__device__ inline uint nextPowerOfTwo(uint v) {\n  return 1U << (uintSize * 8 - __clz(v - 1));\n}\n\n__device__ 
inline uint __count(int predicate) {\n  const uint ballot = __ballot_sync(0xffffffff,predicate);\n  return __popc(ballot);\n}\n\n__device__ inline uint isFirstThreadOfWarp(){\n  return !threadIdx.x;\n}\n\n__device__ inline uint getWarpIdInGrid(){\n  return (blockIdx.x * (blockDim.x * blockDim.y / WARP_SIZE) + threadIdx.y);\n}\n\n__device__ inline uint isFirstWarpOfGrid(){\n  return !(blockIdx.x || threadIdx.y);\n}\n\n__device__ inline uint isFirstWarpOfBlock(){\n  return !threadIdx.y;\n}\n\n__device__ inline uint getThreadIdInBlock(){\n  return mul32(threadIdx.y) + threadIdx.x;\n}\n\n__device__ inline uint isFirstThreadOfBlock(){\n  return !getThreadIdInBlock();\n}\n\n__device__ inline uint getThreadIdInGrid(){\n  return mul32(getWarpIdInGrid()) + threadIdx.x;\n}\n\n__device__ inline uint getThreadsPerBlock() {\n  return blockDim.x * blockDim.y;\n}\n\n__device__ inline uint isLastThreadOfBlock(){\n  return getThreadIdInBlock() == getThreadsPerBlock() - 1;\n}\n\n__device__ inline uint getWarpsPerBlock() {\n  return blockDim.y;\n}\n\n__device__ inline uint getWarpsPerGrid() {\n  return blockDim.y * gridDim.x;\n}\n\n__device__ inline uint getThreadsPerGrid() {\n  return mul32(getWarpsPerGrid());\n}\n\n__device__ inline uint getBlockIdInGrid(){\n  return blockIdx.x;\n}\n\n__device__ inline uint getBlocksPerGrid(){\n  return gridDim.x;\n}\n\n__device__ void syncAllThreads() {\n  __syncthreads();\n  uint to = getBlocksPerGrid() - 1;\n  if (isFirstThreadOfBlock()) {      \n    volatile uint* counter = &__counter__;\n    if (atomicInc((uint*) counter, to) < to) {       \n      while (*counter); // spinning...\n    }\n  }\n  __syncthreads();\n}\n\n__device__ uint getValAtThread(volatile uint* const _shared_, const uint myVal, const uint i) {\n  if (threadIdx.x == i) {\n    _shared_[threadIdx.y] = myVal;\n  }\n  return _shared_[threadIdx.y];\n}\n\n__device__ uint getValAtThread(const uint myVal, const uint i) {\n  __shared__ volatile uint _shared_[MAX_WARPS_PER_BLOCK];\n  if 
(threadIdx.x == i) {\n    _shared_[threadIdx.y] = myVal;\n  }\n  return _shared_[threadIdx.y];\n}\n\n/*\n * Forward declarations\n */\n__device__ void insertAll(const uint storeIndex, uint* _shared_, uint numFrom, bool sort = true);\n\ntemplate<uint toRel, uint fromRel>\n__device__ void unionAll(const uint to, uint* _shared_, uint numFrom, bool sort = true);\n\ntemplate<uint toRel, uint fromRel>\n__device__  void map(const uint to, const uint base, const uint myBits, uint* _shared_,\n    uint& numFrom);\n\n__device__ inline uint mul960(uint num) {\n  // 960 = 1024 - 64\n  return (num << 10) - (num << 6);\n}\n\n__device__ inline uint __graphGet__(const uint row,  const uint col) {\n  return __edges__[row + col];\n}\n\n__device__ inline uint __graphGet__(const uint pos) {\n  return __graph__[pos];\n}\n\n__device__ inline void __graphSet__(const uint row,  const uint col, const uint val) {\n  __edges__[row + col] = val;\n}\n\n__device__ inline void __graphSet__(const uint pos, const uint val) {\n  __graph__[pos] = val;\n}\n\n__device__ inline uint _sharedGet_(volatile uint* _shared_, uint index, uint offset) {\n  return _shared_[index + offset];\n}\n\n__device__ inline void _sharedSet_(volatile uint* _shared_, uint index, uint offset, uint val) {\n  _shared_[index + offset] = val;\n}\n\n__device__ inline uint getHeadIndex(uint var, uint rel){\n  if (rel == NEXT_DIFF_PTS) {\n    return NEXT_DIFF_PTS_START - mul32(var);\n  }\n  if (rel == COPY_INV) {\n    return COPY_INV_START + mul32(var);\n  }\n  if (rel == CURR_DIFF_PTS) {\n    return CURR_DIFF_PTS_START - mul32(var);\n  }\n  if (rel == PTS) {\n    return mul32(var);\n  }\n  if (rel == STORE) {\n    return __storeStart__ + mul32(var);\n  }\n  // it has to be LOAD_INV, right?\n  return __loadInvStart__ + mul32(var);\n}\n\n__device__ inline uint getNextDiffPtsHeadIndex(uint var){\n    return NEXT_DIFF_PTS_START - mul32(var);\n}\n\n__device__ inline uint getCopyInvHeadIndex(uint var){\n    return COPY_INV_START + 
mul32(var);\n}\n\n__device__ inline uint getCurrDiffPtsHeadIndex(uint var){\n    return CURR_DIFF_PTS_START - mul32(var);\n}\n\n__device__ inline uint getPtsHeadIndex(uint var){\n    return mul32(var);\n}\n\n__device__ inline uint getStoreHeadIndex(uint var){\n    return __storeStart__ + mul32(var);\n}\n\n__device__ inline uint getLoadInvHeadIndex(uint var){\n    return __loadInvStart__ + mul32(var);\n}\n\n__device__ inline int isEmpty(uint var, uint rel) {\n  const uint headIndex = getHeadIndex(var, rel);\n  return __graphGet__(headIndex, BASE) == NIL;\n}\n\n/**\n * Mask that tells whether the variables contained in an element have size > offset\n * There is one such mask per offset.\n * stored in compressed format\n */\n__constant__ uint* __offsetMask__;\n\n/**\n * Number of rows needed to represent the mask of ONE offset.\n * = ceil(numObjectVars / DST_PER_ELEMENT), since non-object pointers have size 1.\n */\n__constant__ uint __offsetMaskRowsPerOffset__; \n\n__device__ inline uint __offsetMaskGet__(const uint base, const uint col, const uint offset) {\n  return __offsetMask__[mul32((offset - 1) * __offsetMaskRowsPerOffset__ + base) + col];\n}\n\n__device__ inline void __offsetMaskSet__(const uint base, const uint col, const uint offset,\n    const uint val) {\n  __offsetMask__[mul32((offset - 1) * __offsetMaskRowsPerOffset__ + base) + col] = val;\n}\n\n/**\n * Mask that tells whether the pts-to of an element changed.\n * the BASE and NEXT words are always equal to 0\n * stored in compressed format\n */\n__constant__ uint* __diffPtsMask__;\n\n__device__ inline uint __diffPtsMaskGet__(const uint base, const uint col) {\n  return __diffPtsMask__[mul32(base) + col];\n}\n\n__device__ inline void __diffPtsMaskSet__(const uint base, const uint col, const uint val) {\n  __diffPtsMask__[mul32(base) + col] = val;\n}\n\n/**\n * Index of the next free element in the corresponding free list.\n * The index is given in words, not bytes or number of elements.\n */\n__device__ 
uint __ptsFreeList__,__nextDiffPtsFreeList__, __currDiffPtsFreeList__, __otherFreeList__;\n\n__device__ inline uint mallocPts(uint size = ELEMENT_WIDTH) {\n  __shared__ volatile uint _shared_[MAX_WARPS_PER_BLOCK];\n  if (isFirstThreadOfWarp()) {\n    _shared_[threadIdx.y] = atomicAdd(&__ptsFreeList__, size);\n  }\n  return _shared_[threadIdx.y];\n}\n\n__device__ inline uint mallocNextDiffPts() {\n  __shared__ volatile uint _shared_[MAX_WARPS_PER_BLOCK];\n  if (isFirstThreadOfWarp()) {\n    _shared_[threadIdx.y] = atomicSub(&__nextDiffPtsFreeList__, ELEMENT_WIDTH);\n  }\n  return _shared_[threadIdx.y];\n}\n\n__device__ inline uint mallocCurrDiffPts() {\n  __shared__ volatile uint _shared_[MAX_WARPS_PER_BLOCK];\n  if (isFirstThreadOfWarp()) {\n    _shared_[threadIdx.y] = atomicSub(&__currDiffPtsFreeList__, ELEMENT_WIDTH);\n  }\n  return _shared_[threadIdx.y];\n}\n\n__device__ inline uint mallocOther() {\n  __shared__ volatile uint _shared_[MAX_WARPS_PER_BLOCK]; \n  if (isFirstThreadOfWarp()) {\n    _shared_[threadIdx.y] = atomicAdd(&__otherFreeList__, ELEMENT_WIDTH);\n  }\n  return _shared_[threadIdx.y];\n}\n\n__device__ inline uint mallocIn(uint rel) {\n  if (rel == NEXT_DIFF_PTS) {\n    return mallocNextDiffPts();\n  }\n  if (rel >= COPY_INV) {\n    return mallocOther();\n  }\n  if (rel == PTS) {\n    return mallocPts();\n  }\n  if (rel == CURR_DIFF_PTS) {\n    return mallocCurrDiffPts();\n  }\n  //printf(\"WTF! (%u)\", rel);\n  return 0;\n}\n\n/**\n * Get and increment the current worklist index\n * Granularity: warp\n * @param delta Number of elements to be retrieved at once \n * @return Worklist index 'i'. 
All the work items in the [i, i + delta) interval are guaranteed\n * to be assigned to the current warp.\n */\n__device__ inline uint getAndIncrement(const uint delta) {\n  __shared__ volatile uint _shared_[MAX_WARPS_PER_BLOCK];\n  if (isFirstThreadOfWarp()) {\n    _shared_[threadIdx.y] = atomicAdd(&__worklistIndex0__, delta);\n  }\n  return _shared_[threadIdx.y];\n}\n\n__device__ inline uint getAndIncrement(uint* counter, uint delta) {\n  __shared__ volatile uint _shared_[MAX_WARPS_PER_BLOCK];\n  if (isFirstThreadOfWarp()) {\n    _shared_[threadIdx.y] = atomicAdd(counter, delta);\n  }\n  return _shared_[threadIdx.y];\n}\n\n/**\n * Lock a given variable \n * Granularity: warp\n * @param var Id of the variable\n * @return A non-zero value if the operation succeeded\n */\n__device__ inline uint lock(const uint var) {\n  uint any = __any_sync(0xffffffff,isFirstThreadOfWarp() && (atomicCAS(__lock__ + var, UNLOCKED, LOCKED) \n      == UNLOCKED));\n  return any;\n}\n\n/**\n * Unlock a variable\n * Granularity: warp or thread\n * @param var Id of the variable\n */\n__device__ inline void unlock(const uint var) {\n  __lock__[var] = UNLOCKED;\n}\n\n__device__ inline int isRep(const uint var) {\n  return __rep__[var] == var;\n}\n\n__device__ inline void setRep(const uint var, const uint rep) {\n  __rep__[var] = rep;\n}\n\n__device__ inline uint getRep(const uint var) {\n  return __rep__[var];\n}\n\n__device__ inline uint getRepRec(const uint var) {\n  uint rep = var;\n  uint repRep = __rep__[rep];\n  while (repRep != rep) {\n    rep = repRep;\n    repRep = __rep__[rep];\n  } \n  return rep;\n}\n\n__device__ ulongint recordStartTime() {\n  __shared__ volatile ulongint _ret_[MAX_WARPS_PER_BLOCK];\n  if (isFirstThreadOfWarp()) {\n    _ret_[threadIdx.y] = clock();\n  }\n  return _ret_[threadIdx.y];\n}\n\n__device__ void recordElapsedTime(ulongint start){\n  if (isFirstThreadOfWarp()) {\n    ulongint delta;\n    ulongint end = clock();\n    if (end > start) {\n      delta = end - 
start;\n    } else {\n      delta = end + (0xffffffff - start);\n    }\n    double time = TICKS_TO_MS(delta);\n    printf(\"Block %u, Warp: %u: %8.2f ms.\\n\", blockIdx.x, threadIdx.y, time);\n  }\n}\n\n__device__ inline uint decodeWord(const uint base, const uint word, const uint bits) {\n  uint ret = mul960(base) + mul32(word);\n  return (isBitActive(bits, threadIdx.x)) ? __rep__[ret + threadIdx.x] : NIL;\n}\n\n__device__ inline void swap(volatile uint* const keyA, volatile uint* const keyB, const uint dir) {\n  uint n1 = *keyA;\n  uint n2 = *keyB;\n  if ((n1 < n2) != dir) {\n    *keyA = n2;\n    *keyB = n1;\n  }\n}\n\n// Bitonic Sort, in ascending order using one WARP\n// precondition: size of _shared_ has to be a power of 2\n__device__ inline void bitonicSort(volatile uint* const _shared_, const uint to) {\n  for (int size = 2; size <= to; size <<= 1) {\n    for (int stride = size / 2; stride > 0; stride >>= 1) {\n      for (int id = threadIdx.x; id < (to / 2); id += WARP_SIZE) {\n        const uint myDir = ((id & (size / 2)) == 0);\n        uint pos = 2 * id - mod(id, stride);\n        volatile uint* start = _shared_  + pos;\n        swap(start, start + stride, myDir);\n      }\n    }\n  }\n}\n\n__device__ void blockBitonicSort(volatile uint* _shared_, uint to) {\n  uint idInBlock = getThreadIdInBlock();\n  for (int size = 2; size <= to; size <<= 1) {\n    for (int stride = size / 2; stride > 0; stride >>= 1) {\n      __syncthreads();\n      for (int id = idInBlock; id < (to / 2); id += getThreadsPerBlock()) {\n        const uint myDir = ((id & (size / 2)) == 0);\n        uint pos = 2 * id - mod(id, stride);\n        volatile uint* start = _shared_ + pos;\n        swap(start, start + stride, myDir);\n      }\n    }\n  }\n}\n\n/**\n * Sort an array in ascending order.\n * Granularity: block\n * @param _shared_ list of integers\n * @param to size of the sublist we want to process\n */\n__device__ void blockSort(volatile uint* _shared_, uint to) {\n  uint size = 
max(nextPowerOfTwo(to), 32);\n  uint id = getThreadIdInBlock();\n  for (int i = to + id; i < size; i += getThreadsPerBlock()) {\n    _shared_[i] = NIL;\n  }\n  blockBitonicSort(_shared_, size);  \n  __syncthreads();\n}\n\n/**\n * Remove duplicates on a sorted sequence, equivalent to Thrust 'unique' function but uses one warp.\n * If there are NILS, they are treated like any other number\n * precondition: the input list is sorted\n * precondition: to >= 32\n * precondition: shared_[-1] exists and is equal to NIL\n * Granularity: warp\n *\n * @param _shared_ list of integers\n * @param to size of the sublist we want to process\n * @return number of unique elements in the input.\n */\n__device__  inline uint unique(volatile uint* const _shared_, uint to) {\n  uint startPos = 0;\n  uint myMask = (1 << (threadIdx.x + 1)) - 1;\n  for (int id = threadIdx.x; id < to; id += WARP_SIZE) {\n    uint myVal = _shared_[id];\n    uint fresh = __ballot_sync(0xffffffff,myVal != _shared_[id - 1]);\n    // pos = starting position + number of 1's to my right (incl. myself) minus one\n    uint pos = startPos + __popc(fresh & myMask) - 1;\n    _shared_[pos] = myVal;\n    startPos += __popc(fresh);\n  }\n  return startPos;\n}\n\n__device__ uint removeDuplicates(volatile uint* const _shared_, const uint to) {\n  const uint size = max(nextPowerOfTwo(to), 32);\n  for (int i = to + threadIdx.x; i < size; i += WARP_SIZE) {\n    _shared_[i] = NIL;\n  }\n  bitonicSort(_shared_, size);\n  uint ret = unique(_shared_, size);\n  return (size > to) ? 
ret - 1 : ret;\n}\n\n__device__ void print(uint* m, const uint size) {\n  if (!isFirstThreadOfWarp())\n    return;\n  //printf(\"[\");\n  for (int i = 0; i < size; i++) {\n    //printf(\"%u\", m[i]);\n    if (i < size - 1) {\n      //printf(\", \");\n    }\n  }\n  //printf (\"]\");\n}\n\n__device__ void print(int* m, const uint size) {\n  if (!isFirstThreadOfWarp())\n    return;\n  //printf(\"[\");\n  for (int i = 0; i < size; i++) {\n    //printf(\"%d\", m[i]);\n    if (i < size - 1) {\n      //printf(\", \");\n    }\n  }\n  //printf (\"]\");\n}\n\n\n__device__ volatile uint __printBuffer__[PRINT_BUFFER_SIZE];\n\n // TODO: assumes we print with 1 block and 1 warp...\n__device__ void printElementAsSet(const uint base, volatile uint myBits, bool& first) {\n  for (int i = 0; i < BASE; i++) {\n    uint word = getValAtThread(myBits, i);\n    uint myDst = decodeWord(base, i, word);\n    for (int j = 0; j < WARP_SIZE; j++) {\n      uint dst = getValAtThread(myDst, j);\n      if (dst != NIL && isFirstThreadOfWarp()) {\n        if (first) {\n          //printf(\"%u\", dst);\n        } else {\n          //printf(\", %u\", dst);\n        }\n        first = false;\n      }\n    }\n  }\n}\n\n__device__ void printDiffPtsMask() {\n  uint numVars = __numVars__;\n  if (isFirstThreadOfWarp()) {\n    //printf(\"DIFF_PTS_MASK: [\");\n  }\n  bool first = true;\n  int to = ceil((float) numVars /  (float) ELEMENT_CARDINALITY);\n  for (int base = 0; base < to; base++) {\n    uint myBits = __diffPtsMaskGet__(base, threadIdx.x);\n    printElementAsSet(base, myBits, first);\n  }\n  if (isFirstThreadOfWarp())\n    ;//printf(\"]\\n\");\n}\n\n__global__ void __printDiffPtsMask() {\n  printDiffPtsMask();\n}\n\n__device__ void printOffsetMask(uint numObjectsVars, uint offset) {\n  if (isFirstThreadOfWarp()) {\n    //printf(\"MASK for offset %u: [\", offset);\n  }\n  bool first = true;\n  int to = __offsetMaskRowsPerOffset__;\n  for (int base = 0; base < to; base++) {\n    uint myBits = 
__offsetMaskGet__(base, threadIdx.x, offset);\n    printElementAsSet(base, myBits, first);\n  }\n  if (isFirstThreadOfWarp())\n    ;//printf(\"]\\n\");\n}\n\n__device__ void printOffsetMasks(uint numObjectsVars, uint maxOffset) {\n  if (!isFirstWarpOfGrid()) {\n    return;\n  }\n  for (int i = 1; i <= maxOffset; i++) {\n    printOffsetMask(numObjectsVars, i);\n  }\n}\n\n__global__ void __printOffsetMasks(uint numObjectsVars, uint maxOffset) {\n  printOffsetMasks(numObjectsVars, maxOffset);\n}\n\n__device__ void printElementRec(uint index) {\n  volatile uint myBits = __graphGet__(index, threadIdx.x);\n  uint all = __all_sync(0xffffffff,myBits == NIL);\n  if (all) {\n    return;\n  }\n  while (index != NIL) {\n    //printf(\"Thread: %u, value: %u\\n\", threadIdx.x, myBits);\n    index = __graphGet__(index, NEXT);\n    if (index != NIL) {\n      myBits = __graphGet__(index, threadIdx.x);\n    }\n  }\n}\n\n__device__ void printSharedElementRec(uint* volatile _shared_, uint index) {\n  volatile uint myBits = _sharedGet_(_shared_, index, threadIdx.x);\n  uint all = __all_sync(0xffffffff,myBits == NIL);\n  if (all) {\n    return;\n  }\n  while (index != NIL) {\n    //printf(\"Thread: %u, value: %u\\n\", threadIdx.x, myBits);\n    index = _sharedGet_(_shared_, index, NEXT);\n    if (index != NIL) {\n      myBits = _sharedGet_(_shared_, index, threadIdx.x);\n    }\n  }\n}\n\n__device__  void accumulate(const uint base, uint myBits, uint& numFrom, uint rel) {\n  uint nonEmpty = __ballot_sync(0xffffffff,myBits && threadIdx.x < BASE);\n  while (nonEmpty) {\n    uint pos = __ffs(nonEmpty) - 1;\n    nonEmpty &= (nonEmpty - 1);\n    uint bits = getValAtThread(myBits, pos);\n    uint numOnes = __popc(bits);\n    //cudaAssert(numFrom + numOnes > PRINT_BUFFER_SIZE); \n    uint var = mul960(base) + mul32(pos) + threadIdx.x;\n    // PTS edges: we do not use representatives. In all the other relations we do.\n    var = isBitActive(bits, threadIdx.x) ? (rel > CURR_DIFF_PTS ? 
__rep__[var] : var) : NIL;\n    pos = numFrom + __popc(bits & ((1 << threadIdx.x) - 1));\n    if (var != NIL) {\n      __printBuffer__[pos] = var;\n    }\n    numFrom += numOnes;\n  }\n}\n\n__device__ void printEdges(const uint src, const uint rel, const uint printEmptySets) { \n  if (isEmpty(src, rel) && !printEmptySets) {\n    return;\n  }\n  if (isFirstThreadOfWarp()) {\n    //printf(\"%d => [\", src);\n  }\n  uint index = getHeadIndex(src, rel);\n  uint numFrom = 0;\n  do {\n    uint myBits = __graphGet__(index, threadIdx.x);\n    uint base = __graphGet__(index, BASE);\n    if (base == NIL) {\n      break;\n    }\n    index = __graphGet__(index, NEXT);\n    accumulate(base, myBits, numFrom, rel);\n  } while (index != NIL);\n  if (numFrom) {\n    if (rel > CURR_DIFF_PTS) {\n      numFrom = removeDuplicates(__printBuffer__, numFrom);\n    }\n    for (int i = 0; i < numFrom; i++) {\n      uint val = __printBuffer__[i]; // has to be non-NIL\n      if (isFirstThreadOfWarp()) {\n        if (!i) {\n          //printf(\"%u\", val);\n        } else {\n          //printf(\", %u\", val);\n        }\n      }\n    }\n  }\n  if (isFirstThreadOfWarp()) {\n    //printf(\"]\\n\");\n  }\n}\n\n__device__ void printEdgesOf(const uint src, int rel) {\n  if (isFirstThreadOfWarp()) {\n    //printf(\"%s of \", getName(rel));\n  }\n  printEdges(src, rel, 1);\n}\n\n__device__ void printEdgesStartingAt(uint index, int rel) {\n  if (isFirstThreadOfWarp()) {\n    //printf(\"%s @ %u => [\", getName(rel), index);\n  }\n  uint numFrom = 0;\n  do {\n    uint myBits = __graphGet__(index, threadIdx.x);\n    uint base = __graphGet__(index, BASE);\n    if (base == NIL) {\n      break;\n    }\n    index = __graphGet__(index, NEXT);\n    accumulate(base, myBits, numFrom, rel);\n  } while (index != NIL);\n  if (numFrom) {\n    if (rel > CURR_DIFF_PTS) {\n      numFrom = removeDuplicates(__printBuffer__, numFrom);\n    }\n    for (int i = 0; i < numFrom; i++) {\n      uint val = __printBuffer__[i]; // 
has to be non-NIL\n      if (isFirstThreadOfWarp()) {\n        if (!i) {\n          //printf(\"%u\", val);\n        } else {\n          //printf(\", %u\", val);\n        }\n      }\n    }\n  }\n  if (isFirstThreadOfWarp()) {\n    //printf(\"]\\n\");\n  }\n}\n\n__device__ void printEdgesOf(uint src) {\n  for (int i = 0; i <= LAST_DYNAMIC_REL; i++) {\n    printEdgesOf(src, i);\n  }\n}\n\n__global__ void __printEdgesOf(uint src, int rel) {\n  printEdgesOf(src, rel);\n}\n\n__global__ void __printEdgesOf(uint src) {\n  printEdgesOf(src);\n}\n\n__device__ void printEdges(int rel) {\n  if (isFirstThreadOfWarp()) {\n    //printf(\"%s edges:\\n\", getName(rel));\n  }\n  for (int src = 0; src < __numVars__; src++) {\n    printEdges(src, rel, 0);\n  }\n}\n\n__global__ void __printEdges(int rel) {\n  printEdges(rel);\n}\n\n__device__ void printGepEdges() {\n  uint numVarsGepInv = __numGepInv__;\n  if (isFirstThreadOfWarp()) {\n    //printf(\"GEP_INV edges:\\n\");\n  }\n  volatile __shared__ uint _shared_[WARP_SIZE];\n  for (int i = 0; i < numVarsGepInv; i += WARP_SIZE) {\n    _shared_[threadIdx.x] = __gepInv__[i + threadIdx.x];\n    for (int j= 0; j < WARP_SIZE && _shared_[j] != NIL; j +=2) {\n      uint dst = _shared_[j];\n      uint srcOffset = _shared_[j + 1];\n      if (isFirstThreadOfWarp()) {\n        //printf(\"%u => %u (%u)\\n\", dst, id(srcOffset), offset(srcOffset));\n      }\n    }\n  }\n}\n\n__global__ void __printGepEdges() {\n  printGepEdges();\n}\n\n__device__ void printConstraints(uint* __constraints__, const uint numConstraints) { \n  volatile __shared__ uint _shared_[WARP_SIZE];\n  for (int i = 0; i < numConstraints * 2; i += WARP_SIZE) {\n    _shared_[threadIdx.x] = __constraints__[i + threadIdx.x];\n    for (int j = 0; j < WARP_SIZE; j += 2) {\n      if (i + j >= numConstraints * 2) {\n        return;\n      }\n      uint src = _shared_[j];\n      uint dst = _shared_[j + 1];\n      if (isFirstThreadOfWarp()) {\n        //printf(\"%u => %u\\n\", src, dst);\n 
     }\n    }\n  }\n}\n\n__device__ int checkForErrors(uint var, uint rel) {\n  uint index = getHeadIndex(var, rel);\n  uint lastBase = 0;\n  uint first = 1;\n\n  uint bits = __graphGet__(index, threadIdx.x);\n  uint all = __all_sync(0xffffffff,bits == NIL);\n  if (all) {\n    return 0;\n  }\n  do {\n    bits = __graphGet__(index, threadIdx.x);\n    uint all_bits = __all_sync(0xffffffff,threadIdx.x >= BASE || bits == NIL);\n    if (all_bits) {\n      if (isFirstThreadOfWarp()) {\n        //printf(\"ERROR: empty element at %s of %u \\n\", getName(rel), var);\n      }\n      //printElementRec(getHeadIndex(var, rel));\n      __error__ = 1;\n      return 1;\n    }\n    uint base = __graphGet__(index, BASE);\n    index = __graphGet__(index, NEXT);\n    if (base == NIL) {\n      if (isFirstThreadOfWarp()) {\n        //printf(\"ERROR: inconsistency at %s of %u: BASE is NIL but other word is not\\n\",\n            //getName(rel), var);\n      }\n      printElementRec(getHeadIndex(var, rel));\n      __error__ = 1;\n      return 1;\n    }\n    if (!first && base <= lastBase) {\n      if (isFirstThreadOfWarp()) {\n        //printf(\"ERROR: BASE(element) = %u <= BASE(prev(element)) = %u at %s of %u\\n\", base, \n            //lastBase, getName(rel), var);\n      }\n      //printElementRec(getHeadIndex(var, rel));\n      __error__ = 1;\n      return 1;\n    }\n    first = 0;\n    lastBase = base;\n  } while (index != NIL);\n  return 0;\n}\n\n__global__ void checkForErrors(uint rel) {\n  uint numVars = __numVars__;\n  int inc = mul32(getWarpsPerGrid());\n  int init = mul32(getWarpIdInGrid());\n  for (int initVar = init; initVar < numVars; initVar += inc) {\n    for (int i = 0; i < WARP_SIZE; i++) {\n      uint var = initVar + i;\n      if (var > numVars || checkForErrors(var, rel)) {\n        return;\n      }\n    }\n  }\n}\n\n__device__ uint hashCode(uint index) {\n  __shared__ uint _sh_[DEF_THREADS_PER_BLOCK];\n  volatile uint* _shared_ = &_sh_[threadIdx.y * WARP_SIZE];\n  
uint myRet = 0;\n  uint bits = __graphGet__(index + threadIdx.x);\n  uint base = __graphGet__(index + BASE);\n  if (base == NIL) {\n    return 0;\n  }\n  while (1) {\n    uint elementHash = base * (30 + threadIdx.x) ^ bits;\n    if (bits) {\n      myRet ^= elementHash;      \n    }\n    index = __graphGet__(index + NEXT);\n    if (index == NIL) {\n      break;\n    }\n    bits = __graphGet__(index + threadIdx.x);\n    base = __graphGet__(index + BASE);\n  } \n  _shared_[threadIdx.x] = myRet;\n  if (threadIdx.x < 14) {\n    _shared_[threadIdx.x] ^= _shared_[threadIdx.x + WARP_SIZE / 2];\n  }\n  if (threadIdx.x < 8) {\n    _shared_[threadIdx.x] ^= _shared_[threadIdx.x + WARP_SIZE / 4];\n  }\n  if (threadIdx.x < 4) {\n    _shared_[threadIdx.x] ^= _shared_[threadIdx.x + WARP_SIZE / 8];\n  }\n  return _shared_[0] ^ _shared_[1] ^ _shared_[2] ^ _shared_[3];\n}\n\n__device__ uint equal(uint index1, uint index2) {\n  uint bits1 = __graphGet__(index1 + threadIdx.x);\n  uint bits2 = __graphGet__(index2 + threadIdx.x);\n  uint all = __all_sync(0xffffffff,(threadIdx.x == NEXT) || (bits1 == bits2));\n  while (all) {\n    index1 = __graphGet__(index1 + NEXT);\n    index2 = __graphGet__(index2 + NEXT);\n    if (index1 == NIL || index2 == NIL) {\n      return index1 == index2;\n    }\n    bits1 = __graphGet__(index1 + threadIdx.x);\n    bits2 = __graphGet__(index2 + threadIdx.x);\n  }\n  return 0;\n}\n\n__device__ uint size(uint var, uint rel) {\n  __shared__ uint _sh_[DEF_THREADS_PER_BLOCK];\n  volatile uint* _shared_ = &_sh_[threadIdx.y * WARP_SIZE];\n  if (isEmpty(var, rel)) {\n    return 0;\n  }\n  uint index = getHeadIndex(var, rel);\n  uint myRet = 0;\n  do {\n    uint myBits = __graphGet__(index, threadIdx.x);\n    index = __graphGet__(index, NEXT);\n    myRet += __popc(myBits);\n  } while (index != NIL);\n  _shared_[threadIdx.x] = threadIdx.x >= BASE ? 
0 : myRet;\n  for (int stride = WARP_SIZE / 2; stride > 0; stride >>= 1) {\n    if (threadIdx.x < stride) {\n      _shared_[threadIdx.x] += _shared_[threadIdx.x + stride];\n    }\n  }\n  return _shared_[0];\n}\n\n__device__ void unionToCopyInv(const uint to, const uint fromIndex, uint* const _shared_, \n    bool applyCopy = true) {\n  uint toIndex = getCopyInvHeadIndex(to);\n  if (fromIndex == toIndex) {\n    return;\n  }\n  uint fromBits = __graphGet__(fromIndex + threadIdx.x);\n  uint fromBase = __graphGet__(fromIndex + BASE);\n  if (fromBase == NIL) {\n    return;\n  }\n  uint fromNext = __graphGet__(fromIndex + NEXT);\n  uint toBits = __graphGet__(toIndex + threadIdx.x);\n  uint toBase = __graphGet__(toIndex + BASE);\n  uint toNext = __graphGet__(toIndex + NEXT);\n  uint numFrom = 0;\n  uint newVal;\n  while (1) {\n    if (toBase > fromBase) {\n      if (toBase == NIL) {\n        newVal = fromNext == NIL ? NIL : mallocOther();\n      } else {\n        newVal = mallocOther();\n        __graphSet__(newVal + threadIdx.x, toBits);\n      }\n      fromBits = threadIdx.x == NEXT ? newVal : fromBits;\n      __graphSet__(toIndex + threadIdx.x, fromBits);\n      if (applyCopy) {\n        map<NEXT_DIFF_PTS, PTS>(to, fromBase, fromBits, _shared_, numFrom);\n      }\n      if (fromNext == NIL) {\n        break;\n      }\n      toIndex = newVal;\n      fromBits = __graphGet__(fromNext + threadIdx.x);\n      fromBase = __graphGet__(fromNext + BASE);\n      fromNext = __graphGet__(fromNext + NEXT);      \n    } else if (toBase == fromBase) {\n      uint orBits = fromBits | toBits;\n      uint diffs = __any_sync(0xffffffff,uint(orBits != toBits && threadIdx.x < NEXT));\n      bool nextWasNil = false;\n      if (toNext == NIL && fromNext != NIL) {\n        toNext = mallocOther();\n        nextWasNil = true;\n      }\n      uint newBits = threadIdx.x == NEXT ? 
toNext : orBits;\n      if (newBits != toBits) {\n        __graphSet__(toIndex + threadIdx.x, newBits);\n      }\n      // if there was any element added to COPY_INV, apply COPY_INV rule\n      if (applyCopy && diffs) {\n        uint diffBits = fromBits & ~toBits;\n        map<NEXT_DIFF_PTS, PTS > (to, fromBase, diffBits, _shared_, numFrom);\n      }\n      //advance `to` and `from`\n      if (fromNext == NIL) {\n        break;\n      }\n      toIndex = toNext;\n      if (nextWasNil) {\n        toBits = NIL;\n        toBase = NIL;\n        toNext = NIL;\n      } else {\n        toBits = __graphGet__(toIndex + threadIdx.x);\n        toBase = __graphGet__(toIndex + BASE);\n        toNext = __graphGet__(toIndex + NEXT);\n      }\n      fromBits = __graphGet__(fromNext + threadIdx.x);\n      fromBase = __graphGet__(fromNext + BASE);\n      fromNext = __graphGet__(fromNext + NEXT);      \n    } else { //toBase < fromBase\n      if (toNext == NIL) {\n        uint newNext = mallocOther();\n        __graphSet__(toIndex + NEXT, newNext);\n        toIndex = newNext;\n        toBits = NIL;\n        toBase = NIL;\n      } else {\n        toIndex = toNext;\n        toBits = __graphGet__(toNext + threadIdx.x);\n        toBase = __graphGet__(toIndex + BASE);\n        toNext = __graphGet__(toNext + NEXT);        \n      }\n    }\n  }\n  if (applyCopy && numFrom) {\n    // flush pending unions\n    unionAll<NEXT_DIFF_PTS, PTS> (to, _shared_, numFrom);\n  }\n}\n\n__device__ void clone(uint toIndex, uint fromBits, uint fromNext, const uint toRel) {  \n  while (1) {\n    uint newIndex = fromNext == NIL ? NIL : mallocIn(toRel);    \n    uint val = threadIdx.x == NEXT ? 
newIndex : fromBits;\n    __graphSet__(toIndex + threadIdx.x, val);\n    if (fromNext == NIL) {\n      break;\n    }\n    toIndex = newIndex;\n    fromBits = __graphGet__(fromNext + threadIdx.x);\n    fromNext = __graphGet__(fromNext + NEXT);        \n  } \n}\n\n// toRel = any non-static relationship\n__device__ void unionG2G(const uint to, const uint toRel, const uint fromIndex) {\n  uint toIndex = getHeadIndex(to, toRel);\n  uint fromBits = __graphGet__(fromIndex + threadIdx.x); \n  uint fromBase = __graphGet__(fromIndex + BASE);\n  if (fromBase == NIL) {\n    return;\n  }\n  uint fromNext = __graphGet__(fromIndex + NEXT);\n  uint toBits = __graphGet__(toIndex + threadIdx.x);\n  uint toBase = __graphGet__(toIndex + BASE);\n  if (toBase == NIL) {\n    clone(toIndex, fromBits, fromNext, toRel);\n    return;\n  }\n  uint toNext = __graphGet__(toIndex + NEXT);\n  while (1) {\n    if (toBase > fromBase) {\n      uint newIndex = mallocIn(toRel);\n      __graphSet__(newIndex + threadIdx.x, toBits);      \n      uint val = threadIdx.x == NEXT ? newIndex : fromBits;\n      __graphSet__(toIndex + threadIdx.x, val);\n      // advance 'from'\n      if (fromNext == NIL) {\n        return;\n      }\n      toIndex = newIndex;\n      fromBits = __graphGet__(fromNext + threadIdx.x);\n      fromBase = __graphGet__(fromNext + BASE);\n      fromNext = __graphGet__(fromNext + NEXT);        \n    } else if (toBase == fromBase) {\n      uint newToNext = (toNext == NIL && fromNext != NIL) ? mallocIn(toRel) : toNext;\n      uint orBits = fromBits | toBits;\n      uint newBits = threadIdx.x == NEXT ? 
newToNext : orBits;\n      if (newBits != toBits) {\n        __graphSet__(toIndex + threadIdx.x, newBits);\n      }\n      //advance `to` and `from`\n      if (fromNext == NIL) {\n        return;\n      }\n      fromBits = __graphGet__(fromNext + threadIdx.x);\n      fromBase = __graphGet__(fromNext + BASE);\n      fromNext = __graphGet__(fromNext + NEXT);      \n      if (toNext == NIL) {\n        clone(newToNext, fromBits, fromNext, toRel);\n        return;\n      } \n      toIndex = newToNext;\n      toBits = __graphGet__(toNext + threadIdx.x);\n      toBase = __graphGet__(toNext + BASE);\n      toNext = __graphGet__(toNext + NEXT);\n    } else { // toBase < fromBase\n      if (toNext == NIL) {\n        toNext = mallocIn(toRel);\n        __graphSet__(toIndex + NEXT, toNext);\n        clone(toNext, fromBits, fromNext, toRel);\n        return;\n      } \n      toIndex = toNext;\n      toBits = __graphGet__(toNext + threadIdx.x);\n      toBase = __graphGet__(toNext + BASE);\n      toNext = __graphGet__(toNext + NEXT);      \n    }\n  } \n}\n\n// WATCH OUT: ASSUMES fromRel==toRel\n// like unionTo, but reusing the elements of 'from' (introduces sharing of elements)\n// toRel = any non-static relationship\n__device__  void unionG2GRecycling(const uint to, const uint toRel, uint fromIndex) {\n  uint fromBits = __graphGet__(fromIndex, threadIdx.x);\n  uint fromBase = __graphGet__(fromIndex, BASE);\n  if (fromBase == NIL) {\n    return;\n  }\n  uint toIndex = getHeadIndex(to, toRel);\n  uint toBits = __graphGet__(toIndex, threadIdx.x);\n  uint toBase = __graphGet__(toIndex, BASE);\n  if (toBase == NIL) {\n    __graphSet__(toIndex, threadIdx.x, fromBits);\n    return;\n  }\n  uint toNext = __graphGet__(toIndex, NEXT);\n  uint fromNext = __graphGet__(fromIndex, NEXT);\n  uint fromHeadIndex = fromIndex;\n  do {\n    if (toBase == fromBase) {\n      uint newToNext = (toNext == NIL) ? 
fromNext : toNext;\n      uint orBits = fromBits | toBits;\n      uint newBits = threadIdx.x == NEXT ? newToNext : orBits;\n      if (newBits != toBits) {\n        __graphSet__(toIndex, threadIdx.x, newBits);\n      }\n      //advance `to` and `from`\n      if (toNext == NIL || fromNext == NIL) { // done with current elt and there is no NEXT => exit\n        return;\n      }\n      fromIndex = fromNext;\n      fromBits = __graphGet__(fromIndex, threadIdx.x);\n      fromBase = __graphGet__(fromIndex, BASE);\n      fromNext = __graphGet__(fromIndex, NEXT);\n      toIndex = toNext;\n      toBits = __graphGet__(toIndex, threadIdx.x);\n      toBase = __graphGet__(toIndex, BASE);\n      toNext = __graphGet__(toIndex, NEXT);\n    } else if (toBase < fromBase) {\n      if (toNext == NIL) {\n        if (fromIndex == fromHeadIndex) {\n          fromIndex = mallocIn(toRel);\n          __graphSet__(fromIndex, threadIdx.x, fromBits);\n        }\n        __graphSet__(toIndex, NEXT, fromIndex);\n        return;\n      }\n      // advance 'to'\n      toIndex = toNext;\n      toBits = __graphGet__(toIndex, threadIdx.x);\n      toBase = __graphGet__(toIndex, BASE);\n      toNext = __graphGet__(toIndex, NEXT);\n    } else { // toBase > fromBase\n      if (fromIndex == fromHeadIndex) {\n        fromIndex = mallocIn(toRel);      \n      }\n      __graphSet__(fromIndex, threadIdx.x, toBits);\n      int val = threadIdx.x == NEXT ? 
fromIndex : fromBits;\n      __graphSet__(toIndex, threadIdx.x, val);\n      toIndex = fromIndex; // toBits does not change\n      fromIndex = fromNext;\n      if (fromNext != NIL) {\n        //advance 'from'\n        fromBits = __graphGet__(fromIndex, threadIdx.x);\n        fromBase = __graphGet__(fromIndex, BASE);\n        fromNext = __graphGet__(fromIndex, NEXT);\n      }\n    }\n  } while (fromIndex != NIL);\n}\n\n__device__ uint addVirtualElement(uint index, const uint fromBase, const uint fromBits, \n    const uint toRel) {\n  for (;;) {\n    uint toBits = __graphGet__(index + threadIdx.x);\n    uint toBase = __graphGet__(index + BASE);\n    if (toBase == NIL) {\n      // can only happen if the adjancency list of `to` is empty\n      // cost: exactly one global write\n      __graphSet__(index + threadIdx.x, fromBits);\n      return index;\n    }\n    if (toBase == fromBase) {\n      // cost: at most one global write\n      uint orBits = toBits | fromBits;\n      if (orBits != toBits && threadIdx.x < NEXT) {\n        __graphSet__(index + threadIdx.x, orBits);\n      }\n      return index;\n    }\n    if (toBase < fromBase) {\n      uint toNext = getValAtThread(toBits, NEXT);\n      if (toNext == NIL) {\n        // appending; cost: two global writes\n        uint newIndex = mallocIn(toRel);\n        __graphSet__(newIndex + threadIdx.x, fromBits);\n        __graphSet__(index + NEXT, newIndex);\n        return newIndex;\n      }\n      index = toNext;\n    } else {\n      // cost: two global writes\n      uint newIndex = mallocIn(toRel);\n      __graphSet__(newIndex + threadIdx.x, toBits);\n      uint val = threadIdx.x == NEXT ? 
newIndex : fromBits;\n      __graphSet__(index + threadIdx.x, val);\n      return index;\n    }\n  }\n}\n\n__device__ uint insert(const uint index, const uint var, const int rel) {  \n  uint base = BASE_OF(var);\n  uint word = WORD_OF(var);\n  uint bit = BIT_OF(var);\n  uint myBits = 0;\n  if (threadIdx.x == word) {\n    myBits = 1 << bit;\n  } else if (threadIdx.x == BASE) {\n    myBits = base;\n  } else if (threadIdx.x == NEXT) {\n    myBits = NIL;\n  }  \n  return addVirtualElement(index, base, myBits, rel);\n}\n\n__device__ inline uint resetWorklistIndex() {\n  __syncthreads();\n  uint numBlocks = getBlocksPerGrid();\n  if (isFirstThreadOfBlock() && atomicInc(&__counter__, numBlocks - 1) == (numBlocks - 1)) {\n    __worklistIndex0__ = 0;\n    __counter__ = 0;\n    return 1;\n  }  \n  return 0;\n}\n\n__global__ void addEdges(uint* __key__, uint* __keyAux__, uint* __val__, const uint to,  uint rel) {\n  __shared__ uint _sh_[WARPS_PER_BLOCK(DEF_THREADS_PER_BLOCK) * WARP_SIZE];\n  uint* _shared_ = &_sh_[threadIdx.y * WARP_SIZE];\n  uint i = getAndIncrement(1);\n  while (i < to) {\n    uint src = __key__[i];\n    if (src == NIL) {\n      break;\n    }\n    uint index  = getHeadIndex(src, rel);\n    uint startIndex = __keyAux__[i];\n    uint end = __keyAux__[i + 1]; \n    uint start = roundToPrevMultipleOf(startIndex, WARP_SIZE); // to ensure alignment\n    for (int j = start; j < end; j += WARP_SIZE) {\n      uint myIndex = j + threadIdx.x;\n      _shared_[threadIdx.x] = myIndex < end ? 
__val__[myIndex] : NIL; \n      uint startK = max(((int) startIndex) - j, 0);\n      uint endK = min(end - j, WARP_SIZE);      \n      for (int k = startK; k < endK; k++) {\n        uint dst = _shared_[k];\n        index = insert(index, dst, rel);\n      }      \n    }   \n    i = getAndIncrement(1);\n  }\n  resetWorklistIndex();  \n}\n\ntemplate<uint toRel, uint fromRel>\n__device__  inline void unionAll(const uint to, uint* const _shared_, uint numFrom, bool sort) {\n  if (numFrom > 1 && sort) {\n    numFrom = removeDuplicates(_shared_, numFrom);\n  }\n  for (int i = 0; i < numFrom; i++) {\n    uint fromIndex = _shared_[i];     \n    if (fromRel != CURR_DIFF_PTS) {\n      fromIndex = getHeadIndex(fromIndex, fromRel);\n    }\n    if (toRel == COPY_INV) {\n      unionToCopyInv(to, fromIndex, _shared_ + DECODE_VECTOR_SIZE + 1);\n    } else {\n      unionG2G(to, toRel, fromIndex);\n    }\n  }\n}\n\ntemplate<uint toRel, uint fromRel>\n__device__  void map(uint to, const uint base, const uint myBits, uint* const _shared_, \n    uint& numFrom) {\n  uint ballot = __ballot_sync(0xffffffff,myBits);\n  uint nonEmpty = ballot & LT_BASE;\n  const uint threadMask = 1 << threadIdx.x;\n  const uint myMask = threadMask - 1;\n  const uint mul960base = mul960(base);\n  while (nonEmpty) {\n    uint pos = __ffs(nonEmpty) - 1;\n    nonEmpty &= (nonEmpty - 1);\n    uint bits = getValAtThread(myBits, pos);\n    uint var =  getRep(mul960base + mul32(pos) + threadIdx.x); //coalesced\n    uint bitActive = (var != I2P) && (bits & threadMask);\n    bits = __ballot_sync(0xffffffff,bitActive);\n    uint numOnes = __popc(bits);\n    if (numFrom + numOnes > DECODE_VECTOR_SIZE) {\n      numFrom = removeDuplicates(_shared_, numFrom);\n      if (numFrom + numOnes > DECODE_VECTOR_SIZE) {\n        if (toRel == STORE) {\n          insertAll(to, _shared_, numFrom, false);\n        } else {                \n          unionAll<toRel, fromRel>(to, _shared_, numFrom, false); \n        }\n        numFrom = 
0;\n      }\n    }\n    pos = numFrom + __popc(bits & myMask);\n    if (bitActive) {      \n      if (fromRel == CURR_DIFF_PTS) {\n        _shared_[pos] = __currPtsHead__[var];\n      }\n      else {\n        _shared_[pos] = var;\n      }\n    }\n    numFrom += numOnes;\n  }\n}\n\ntemplate<uint firstRel, uint secondRel, uint thirdRel>\n__device__ void apply(const uint src, uint* const _shared_) {\n  uint numFrom = 0;\n  uint index = getHeadIndex(src, firstRel);\n  do {\n    uint myBits = __graphGet__(index + threadIdx.x);\n    uint base = __graphGet__(index + BASE);\n    if (base == NIL) {\n      break;\n    }\n    index = __graphGet__(index + NEXT);\n    if (secondRel == CURR_DIFF_PTS) {\n      myBits &= __diffPtsMaskGet__(base, threadIdx.x);\n    } \n    map<thirdRel, secondRel>(src, base, myBits, _shared_, numFrom);\n  } while (index != NIL);\n  if (numFrom) {\n    unionAll<thirdRel, secondRel>(src, _shared_, numFrom);\n  }\n}\n\n__device__ void insertAll(const uint src, uint* const _shared_, uint numFrom, const bool sort) {\n  if (numFrom > 1 && sort) {\n    numFrom = removeDuplicates(_shared_, numFrom);\n  }\n  const uint storeIndex = getStoreHeadIndex(src);\n  for (int i = 0; i < numFrom; i += WARP_SIZE) {\n    uint size = min(numFrom - i, WARP_SIZE);\n    uint next = getAndIncrement(&__numKeysCounter__, size);\n    // TODO: we need to make sure that (next + threadIdx.x < MAX_HASH_SIZE)\n    if (threadIdx.x < size) {\n      __key__[next + threadIdx.x] = _shared_[i + threadIdx.x]; // at most 2 transactions\n      __val__[next + threadIdx.x] = storeIndex;    \n    }\n  }\n}\n\n__device__ void store2storeInv(const uint src, uint* const _shared_) {\n  uint currDiffPtsIndex = getCurrDiffPtsHeadIndex(src);\n  uint numFrom = 0;\n  do {\n    uint myBits = __graphGet__(currDiffPtsIndex + threadIdx.x);\n    uint base = __graphGet__(currDiffPtsIndex + BASE);\n    if (base == NIL) {\n      break;\n    }\n    currDiffPtsIndex = __graphGet__(currDiffPtsIndex + NEXT);\n    
map<STORE, STORE>(src, base, myBits, _shared_, numFrom);\n  } while (currDiffPtsIndex != NIL);\n  if (numFrom) {\n    insertAll(src, _shared_, numFrom);\n  }\n}\n\n__global__ void copyInv_loadInv_store2storeInv() {\n  __shared__ uint _sh_[WARPS_PER_BLOCK(COPY_INV_THREADS_PER_BLOCK) * (DECODE_VECTOR_SIZE * 2 + 2)];\n  uint* const _shared_ = &_sh_[threadIdx.y * (DECODE_VECTOR_SIZE * 2 + 2)];\n  _shared_[0] = NIL;\n  _shared_[DECODE_VECTOR_SIZE + 1] = NIL;\n  uint to = __numVars__;\n  uint src = getAndIncrement(&__worklistIndex1__, 1);\n  while (src < to) {\n    apply<COPY_INV, CURR_DIFF_PTS, NEXT_DIFF_PTS>(src, _shared_ + 1 + DECODE_VECTOR_SIZE + 1);\n    apply<LOAD_INV, CURR_DIFF_PTS, COPY_INV>(src, _shared_ + 1);\n    src = getAndIncrement(&__worklistIndex1__,1);\n  }\n  to = __numStore__;\n  src = getAndIncrement(1);\n  while (src < to) {\n    src = __storeConstraints__[src];\n    if (src != NIL) {\n      store2storeInv(src, _shared_ + 1);\n    }\n    src = getAndIncrement(1);\n  }\n  if (resetWorklistIndex()) {\n    __key__[__numKeysCounter__] = NIL;\n    __val__[__numKeysCounter__] = NIL;        \n    __numKeys__ = __numKeysCounter__ + 1;\n    __numKeysCounter__ = 0;\n    __worklistIndex1__ = 0;\n  }  \n}\n\n__device__ void warpStoreInv(const uint i, uint* const _pending_, uint* _numPending_) {\n  uint src = __key__[i];\n  uint startIndex = __keyAux__[i];\n  uint end = __keyAux__[i + 1]; \n  if (end - startIndex > WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK) * 4) { \n    // too big for a single warp => add to pending, so the whole block will process this variable\n    if (isFirstThreadOfWarp()) {\n      uint where = 3 * atomicAdd(_numPending_, 1);\n      _pending_[where] = src;\n      _pending_[where + 1] = startIndex;\n      _pending_[where + 2] = end;\n    }\n    return;\n  }\n  uint* const _shared_ = _pending_ + WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK) * 3 + \n      threadIdx.y * (WARP_SIZE + DECODE_VECTOR_SIZE + 1);\n  _shared_[WARP_SIZE] = NIL;\n  uint 
start = roundToPrevMultipleOf(startIndex, WARP_SIZE); // to ensure alignment\n  for (int j = start; j < end; j += WARP_SIZE) {\n    uint myIndex = j + threadIdx.x;\n    _shared_[threadIdx.x] = myIndex < end ? __val__[myIndex] : NIL; \n    uint startK = max(((int) startIndex) - j, 0);\n    uint endK = min(end - j, WARP_SIZE);      \n    for (int k = startK; k < endK; k++) {\n      uint fromIndex = _shared_[k];\n      unionToCopyInv(src, fromIndex, _shared_ + 1 + WARP_SIZE); \n    }      \n  }\n}\n\n__device__ void blockStoreInv(uint src, uint* const _dummyVars_, volatile uint* _warpInfo_, \n    uint& _numPending_) {\n  uint* _shared_ = _dummyVars_ + WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK) * 4 + \n      threadIdx.y * (WARP_SIZE + DECODE_VECTOR_SIZE + 1);\n  __shared__ uint _counter_, _start_, _end_;\n\n  _shared_[WARP_SIZE] = NIL;\n  _shared_ += WARP_SIZE + 1;\n  __syncthreads();\n  for (int i = 0; i < _numPending_; i++) {\n    if (isFirstWarpOfBlock()) {\n      uint* pending = _dummyVars_ + WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK);    \n      src =     pending[3 * i]; \n      _start_ = pending[3 * i + 1];\n      _end_ =   pending[3 * i + 2];\n      _counter_ = _start_; \n    }\n    __syncthreads();\n    if (isFirstThreadOfWarp()) {\n      _warpInfo_[threadIdx.y] = atomicAdd(&_counter_, 1);      \n    }\n    uint j = _warpInfo_[threadIdx.y];\n    while (j < _end_) {      \n      uint fromIndex = __val__[j];\n      unionToCopyInv(src, fromIndex, _shared_, isFirstWarpOfBlock());         \n      if (isFirstThreadOfWarp()) {\n        _warpInfo_[threadIdx.y] = atomicAdd(&_counter_, 1);      \n      }\n      j = _warpInfo_[threadIdx.y];\n    }\n    __syncthreads(); \n    if (isFirstWarpOfBlock()) {\n      for (int i = 1; i < WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK); i++) {\n        uint var2 = _dummyVars_[i];\n        unionToCopyInv(src, getCopyInvHeadIndex(var2), _shared_);\n      }\n    }\n    __syncthreads();\n    if (!isFirstWarpOfBlock()) { //reset fields 
so updateDiffPts doesn't work on dummy variables\n      uint index = getHeadIndex(src, COPY_INV);\n      __graphSet__(index, threadIdx.x, NIL);\n    }         \n  }\n  if (isFirstWarpOfBlock()) {\n    _numPending_ = 0;\n  }\n  __syncthreads();\n}\n\n__global__ void storeInv() {\n  __shared__ uint _sh_[WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK) * \n      (5 + WARP_SIZE + DECODE_VECTOR_SIZE + 1)];\n  __shared__ volatile uint* _warpInfo_;\n  __shared__ volatile uint _warpsWorking_;\n  __shared__ uint* _dummyVars_;\n  __shared__ uint _numPending_, _to_;\n  \n  if (isFirstWarpOfBlock()) {\n    _to_ = __numKeys__ - 1; // because the last one is NIL\n    _dummyVars_ = _sh_ + WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK);\n    if (threadIdx.x < WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK)) {\n      _dummyVars_[threadIdx.x] = __initialNonRep__[mul32(blockIdx.x) + threadIdx.x];\n    }\n    _warpInfo_ = _sh_;\n    _numPending_ = 0;\n    _warpsWorking_ = WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK);\n  } \n  __syncthreads();\n  uint counter, src;\n  if (!isFirstWarpOfBlock()) {\n    src = _dummyVars_[threadIdx.y];    \n  }\n  if (isFirstThreadOfWarp()) {\n    uint next = atomicAdd(&__worklistIndex0__, 1);\n    if (next >= _to_) {\n      atomicSub((uint*) &_warpsWorking_, 1);\n    }\n    _warpInfo_[threadIdx.y] = next;      \n  }\n  counter = _warpInfo_[threadIdx.y]; \n  while (_warpsWorking_) {\n    if (counter < _to_) {\n      warpStoreInv(counter, _sh_ + WARPS_PER_BLOCK(STORE_INV_THREADS_PER_BLOCK) * 2, &_numPending_);\n    }\n    __syncthreads();\n    if (_numPending_) {\n      blockStoreInv(src, _dummyVars_, _warpInfo_, _numPending_);\n    }\n    if (counter < _to_ ) {\n      if (isFirstThreadOfWarp()) {\n        uint next = atomicAdd(&__worklistIndex0__, 1);\n        if (next >= _to_) {\n          atomicSub((uint*) &_warpsWorking_, 1);\n        }\n        _warpInfo_[threadIdx.y] = next;      \n      }\n      counter = _warpInfo_[threadIdx.y]; \n    }\n  }\n  
resetWorklistIndex();  \n}\n\n__device__ void shift(const uint base, const uint bits, const uint offset,\n    volatile uint* _shifted_) {\n  _shifted_[threadIdx.x] = 0;\n  _shifted_[threadIdx.x + WARP_SIZE] = 0;\n  _shifted_[threadIdx.x + WARP_SIZE * 2] = 0;\n  uint delta = div32(offset);\n  uint highWidth = mod32(offset);\n  uint lowWidth = WARP_SIZE - highWidth;\n  // these memory accesses do not conflict\n  _shifted_[threadIdx.x + delta] = (bits << highWidth);\n  _shifted_[threadIdx.x + delta + 1] |= (bits >> lowWidth);\n  _shifted_[threadIdx.x + WARP_SIZE * 2] = _shifted_[threadIdx.x + BASE * 2];\n  _shifted_[threadIdx.x + WARP_SIZE] = _shifted_[threadIdx.x + BASE];\n  _shifted_[BASE] = base;\n  _shifted_[BASE + WARP_SIZE] = base + 1;\n  _shifted_[BASE + WARP_SIZE * 2] = base + 2;\n}\n\n__device__ void applyGepInvRule(uint x, const uint y, const uint offset, volatile uint* _shared_) {\n  uint yIndex = getCurrDiffPtsHeadIndex(y);\n  uint myBits = __graphGet__(yIndex, threadIdx.x);\n  uint all = __all_sync(0xffffffff,myBits == NIL);\n  if (all) {\n    return;\n  }\n  uint xIndex = getNextDiffPtsHeadIndex(x);\n  do {\n    myBits = __graphGet__(yIndex, threadIdx.x);\n    uint base = __graphGet__(yIndex, BASE);\n    yIndex = __graphGet__(yIndex, NEXT);\n    myBits &= __offsetMaskGet__(base, threadIdx.x, offset);\n    uint all = __all_sync(0xffffffff,myBits == 0);\n    if (all) {\n      continue;\n    }\n    shift(base, myBits, offset, _shared_);\n    for (int i = 0; i < 3; i++) {\n      uint myBits = threadIdx.x == NEXT ? 
NIL : _shared_[threadIdx.x + WARP_SIZE * i];\n      uint all = __any_sync(0xffffffff,myBits && threadIdx.x < BASE);\n      if (all) {\n        xIndex = addVirtualElement(xIndex, base + i, myBits, NEXT_DIFF_PTS);\n      }\n    }\n  } while (yIndex != NIL);\n}\n\n__global__ void gepInv() {\n  __shared__ uint _sh_[WARPS_PER_BLOCK(GEP_INV_THREADS_PER_BLOCK) * (WARP_SIZE * 3)];\n  volatile uint* _shared_ = &_sh_[threadIdx.y * (WARP_SIZE * 3)];\n  const uint to = __numGepInv__ * 2;\n  uint index = getAndIncrement(2);\n  while (index < to) {\n    uint x = __gepInv__[index];\n    x = getRep(x);\n    uint val1 = __gepInv__[index + 1];\n    while (!lock(x));  // busy wait, should be short\n    const uint y = getRep(id(val1));\n    applyGepInvRule(x, y, offset(val1), _shared_);\n    unlock(x);\n    index = getAndIncrement(2);\n  }\n  if (resetWorklistIndex()) {\n    __done__ = true;\n  }  \n}\n\n__device__ void cloneAndLink(const uint var, const uint ptsIndex, uint& currDiffPtsIndex, \n    const uint diffPtsBits, const uint diffPtsNext) {\n  clone(ptsIndex, diffPtsBits, diffPtsNext, PTS);\n  if (currDiffPtsIndex != NIL) {\n    __graphSet__(currDiffPtsIndex + NEXT, ptsIndex);\n  } else {\n    currDiffPtsIndex = getCurrDiffPtsHeadIndex(var);\n    uint ptsBits = __graphGet__(ptsIndex + threadIdx.x);\n    __graphSet__(currDiffPtsIndex + threadIdx.x, ptsBits);        \n  }  \n}\n\n/**\n * Update the current, next and total PTS sets of a variable. In the last iteration of the main\n * loop, points-to edges have been added to NEXT_DIFF_PTS. However, many of them might already be\n * present in PTS. 
The purpose of this function is to update PTS as PTS U NEXT_DIFF_PTS, and set \n * CURR_DIFF_PTS as the difference between the old and new PTS for the given variable.\n *  \n * @param var ID of the variable\n * @return true if new pts edges have been added to this variable\n */ \n__device__ bool updatePtsAndDiffPts(const uint var) {\n  const uint diffPtsHeadIndex = getNextDiffPtsHeadIndex(var);\n  uint diffPtsBits = __graphGet__(diffPtsHeadIndex + threadIdx.x);\n  uint diffPtsBase = __graphGet__(diffPtsHeadIndex + BASE);\n  if (diffPtsBase == NIL) {\n    return false;\n  }\n  uint diffPtsNext = __graphGet__(diffPtsHeadIndex + NEXT);\n  __graphSet__(diffPtsHeadIndex + threadIdx.x, NIL);\n  uint ptsIndex = getPtsHeadIndex(var);\n  uint ptsBits = __graphGet__(ptsIndex + threadIdx.x);\n  uint ptsBase = __graphGet__(ptsIndex + BASE);\n  if (ptsBase == NIL) { \n    //we pass ptsBase instead of NIL because it's also NIL but it can be modified\n    cloneAndLink(var, ptsIndex, ptsBase, diffPtsBits, diffPtsNext);\n    return true;    \n  }      \n  uint ptsNext = __graphGet__(ptsIndex + NEXT);\n  uint currDiffPtsIndex = NIL;\n  while (1)  {   \n    if (ptsBase > diffPtsBase) {\n      uint newIndex = mallocPts();\n      __graphSet__(newIndex + threadIdx.x, ptsBits);        \n      uint val = threadIdx.x == NEXT ? newIndex : diffPtsBits;\n      __graphSet__(ptsIndex + threadIdx.x, val);\n      ptsIndex = newIndex;\n      // update CURR_DIFF_PTS\n      newIndex = currDiffPtsIndex == NIL ? getCurrDiffPtsHeadIndex(var) : mallocCurrDiffPts();\n      val = threadIdx.x == NEXT ? 
NIL : diffPtsBits;\n      __graphSet__(newIndex + threadIdx.x, val);\n      if (currDiffPtsIndex != NIL) {\n        __graphSet__(currDiffPtsIndex + NEXT, newIndex);\n      }\n      if (diffPtsNext == NIL) {\n        return true;\n      }\n      currDiffPtsIndex = newIndex;\n      diffPtsBits = __graphGet__(diffPtsNext + threadIdx.x);\n      diffPtsBase = __graphGet__(diffPtsNext + BASE);      \n      diffPtsNext = __graphGet__(diffPtsNext + NEXT);      \n    } else if (ptsBase == diffPtsBase) {      \n      uint newPtsNext = (ptsNext == NIL && diffPtsNext != NIL) ? mallocPts() : ptsNext;\n      uint orBits = threadIdx.x == NEXT ? newPtsNext : ptsBits | diffPtsBits;\n      uint ballot = __ballot_sync(0xffffffff,orBits != ptsBits);\n      if (ballot) {\n        __graphSet__(ptsIndex + threadIdx.x, orBits);          \n        if (ballot & LT_BASE) {\n          // update CURR_DIFF_PTS\n          orBits = diffPtsBits & ~ptsBits;\n          if (threadIdx.x == BASE) {\n            orBits = ptsBase;\n          } else if (threadIdx.x == NEXT) {\n            orBits = NIL;\n          }\n          uint newIndex;\n          if (currDiffPtsIndex != NIL) {\n            newIndex = mallocCurrDiffPts();\n            __graphSet__(currDiffPtsIndex + NEXT, newIndex);\n          } else {\n            newIndex = getCurrDiffPtsHeadIndex(var);\n          }\n          __graphSet__(newIndex + threadIdx.x, orBits);\n          currDiffPtsIndex = newIndex;\n        }\n      }\n      if (diffPtsNext == NIL) {\n        return (currDiffPtsIndex != NIL);\n      }\n      diffPtsBits = __graphGet__(diffPtsNext + threadIdx.x);\n      diffPtsBase = __graphGet__(diffPtsNext + BASE);      \n      diffPtsNext = __graphGet__(diffPtsNext + NEXT);      \n      if (ptsNext == NIL) {\n        cloneAndLink(var, newPtsNext, currDiffPtsIndex, diffPtsBits, diffPtsNext);\n        return true;    \n      } \n      ptsIndex = ptsNext;\n      ptsBits = __graphGet__(ptsIndex + threadIdx.x);\n      ptsBase = 
__graphGet__(ptsIndex + BASE);\n      ptsNext = __graphGet__(ptsIndex + NEXT);         \n    } else { // ptsBase < diffPtsBase\n      if (ptsNext == NIL) {\n        uint newPtsIndex = mallocPts();\n        __graphSet__(ptsIndex + NEXT, newPtsIndex);\n        cloneAndLink(var, newPtsIndex, currDiffPtsIndex, diffPtsBits, diffPtsNext);\n        return true;\n      }\n      ptsIndex = ptsNext;\n      ptsBits = __graphGet__(ptsIndex + threadIdx.x);\n      ptsBase = __graphGet__(ptsIndex + BASE);\n      ptsNext = __graphGet__(ptsIndex + NEXT);        \n    } \n  }\n}\n\n__global__ void updatePtsInformation() {\n  bool newWork = false;\n  const uint numVars = __numVars__;\n  const uint CHUNK_SIZE = 12;\n  //ulongint start = recordStartTime();  \n  int i = getAndIncrement(CHUNK_SIZE);\n  while (i < numVars) {    \n    for (int var = i; var < min(i + CHUNK_SIZE, numVars); var++) {\n      bool newStuff = updatePtsAndDiffPts(var);\n      newWork |= newStuff;\n      if (!newStuff) {\n        const uint currPtsHeadIndex = getCurrDiffPtsHeadIndex(var);\n        __graphSet__(currPtsHeadIndex + threadIdx.x, NIL);        \n      }    \n    }\n    i = getAndIncrement(CHUNK_SIZE);\n  }\n  if (newWork) {\n    __done__ = false;\n  }\n//  if (isFirstThreadOfWarp()) {\n//    //printf(\"Warp %u: %u\\n\", getWarpIdInGrid(), getEllapsedTime(start));\n//  }  \n  uint headerSize = numVars * ELEMENT_WIDTH;\n  if (resetWorklistIndex()) {\n    __currDiffPtsFreeList__ = CURR_DIFF_PTS_START - headerSize;\n    __nextDiffPtsFreeList__ = NEXT_DIFF_PTS_START - headerSize;\n  }\n}\n\n__global__ void createOffsetMasks(int numObjectVars, uint maxOffset) {\n  __shared__ uint _sh_[DEF_THREADS_PER_BLOCK];\n  volatile uint* _mask_ =  &_sh_[threadIdx.y * WARP_SIZE];\n\n  int inc = mul960(getWarpsPerGrid());\n  int init = mul960(getWarpIdInGrid());\n  for (int i = init; i < numObjectVars; i += inc) {\n    uint base = BASE_OF(i);\n    for (int offset = 1; offset <= maxOffset; offset++) {\n      
_mask_[threadIdx.x] = 0;\n      for (int src = i; src < min(i + ELEMENT_CARDINALITY, numObjectVars); src += WARP_SIZE) {\n        uint size = __size__[src + threadIdx.x];\n        uint all = __all_sync(0xffffffff,size <= offset);\n        if (all) {\n          continue;\n        }\n        uint word = WORD_OF(src - i);\n        _mask_[word] = __ballot_sync(0xffffffff,size > offset);\n      }\n      __offsetMaskSet__(base, threadIdx.x, offset, _mask_[threadIdx.x]);\n    }\n  }\n}\n\n__device__ uint lockToVar(uint lock) {\n  if ((lock < VAR(0)) || (lock >= LOCKED)) {\n    return lock;\n  }\n  return lock - VAR(0);\n}\n\n__device__ void merge(const uint var1, const uint var2, const uint rep) {\n  //if (isFirstThreadOfWarp()) //printf(\"%u <= %u\\n\", var1, var2);\n  uint headIndex = getPtsHeadIndex(var2);\n  unionG2GRecycling(var1, PTS, headIndex);\n  __graphSet__(headIndex, threadIdx.x, NIL);\n  headIndex = getCopyInvHeadIndex(var2);\n  unionG2GRecycling(var1, COPY_INV, headIndex);\n  __graphSet__(headIndex, threadIdx.x, NIL);\n  headIndex = getStoreHeadIndex(var2);\n  unionG2GRecycling(var1, STORE, headIndex);\n  __graphSet__(headIndex, threadIdx.x, NIL);\n  headIndex = getLoadInvHeadIndex(var2);\n  unionG2GRecycling(var1, LOAD_INV, headIndex);\n  __graphSet__(headIndex, threadIdx.x, NIL);\n  // clear CURR_DIFF_PTS \n  headIndex = getCurrDiffPtsHeadIndex(var2);\n  //unionG2GRecycling(var1, CURR_DIFF_PTS, headIndex);\n  __graphSet__(headIndex, threadIdx.x, NIL);\n  setRep(var2, rep);\n  __threadfence(); \n  unlock(var2);\n}\n\n/**\n * Merge a list of pointer-equivalent variables\n * Granularity: block\n * @param _list_ Pointer-equivalent variables\n * @param _listSize_ Number of variables to be processed\n */\n__device__ void mergeCycle(const uint* const _list_, const uint _listSize_) {\n  __shared__ uint _counter_;\n  if (!_listSize_) {\n    __syncthreads();\n    return;\n  }\n  // 'ry' will be the representative of this cycle\n  uint ry = _list_[0];  \n  if 
(_listSize_ == 1) {\n    if (isFirstWarpOfBlock()) {\n      unlock(ry);\n    }    \n    __syncthreads();\n    return;\n  }\n  uint warpsPerBlock = getWarpsPerBlock();\n  if (_listSize_ > warpsPerBlock) {\n    // each warp chooses a local representative and then merges each popped worklist item with it.\n    uint var1 = _list_[threadIdx.y];\n    _counter_ = warpsPerBlock;\n    __syncthreads();\n    uint index = getAndIncrement(&_counter_, 1);\n    while (index < _listSize_) {\n      uint var2 = _list_[index];\n      merge(var1, var2, ry);\n      index = getAndIncrement(&_counter_, 1);\n    }\n  }\n  __syncthreads();\n  // the first warp merges the local representatives. This is actually faster (and simpler)\n  // than performing a reduction of the list using the entire block, due to load imbalance.\n  if (isFirstWarpOfBlock()) { \n    uint to = min(_listSize_, warpsPerBlock);\n    for (int i = 1; i < to; i++) {\n      uint var = _list_[i];\n      merge(ry, var, ry);\n    }    \n    //reset CURR_PTS of the cycle representative to be PTS\n    uint myBits = __graphGet__(getPtsHeadIndex(ry), threadIdx.x);\n    __graphSet__(getCurrDiffPtsHeadIndex(ry), threadIdx.x, myBits); \n    __threadfence();    \n    unlock(ry);\n  }\n  __syncthreads();  \n}\n\n// to be executed by one thread\n__device__ uint lockVarRep(uint& var) {\n  while (1) {\n    uint rep = getRepRec(var);\n    uint old = atomicCAS(__lock__ + rep, UNLOCKED, VAR(blockIdx.x));      \n    if (old == PTR(blockIdx.x)) {\n        // try to promote lock to type VAR\n      old = atomicCAS(__lock__ + rep, PTR(blockIdx.x), VAR(blockIdx.x));            \n    }\n    if (old != UNLOCKED && old != PTR(blockIdx.x)) {\n      var = rep;\n      return old;\n    }\n    // we locked it, but maybe is not a representative anymore\n    var = getRep(rep);\n    if (var == rep) {\n      return UNLOCKED;\n    }\n    if (old == PTR(blockIdx.x)) { // back to PTR\n        __lock__[rep] = PTR(blockIdx.x);            \n    } else {\n      
unlock(rep);\n    }\n  }\n}\n\n/**\n * Lock a list of variables\n * Granularity: block\n * @param _currVar_ List of variables to lock, sorted in ascending order\n * @param _currVarSize_ Number of variables we want to process. At the end of the function,\n * it stores the number of variables we were able to lock.\n * @param _nextVar_ List where to add all the variables we could not lock\n * @param _nextVarSize_ Number of variables we could not lock\n */\n__device__ void lockVars(uint* const _currVar_, uint& _currVarSize_, uint* const _nextVar_, \n    uint* _nextVarSize_) {\n  __shared__ uint _count_;\n  _count_ = 0;\n  __syncthreads();\n  for (int i = getThreadIdInBlock(); i < _currVarSize_; i+= getThreadsPerBlock()) {\n    uint var = _currVar_[i];  \n    // block culling to filter out some duplicates\n    if (i && var == _currVar_[i - 1]) {\n      continue;        \n    }\n    uint stat = lockVarRep(var);\n    uint pos;\n    if (stat == UNLOCKED) {\n      pos = atomicAdd(&_count_, 1);\n      _currVar_[pos] = var;\n    } else if (stat != VAR(blockIdx.x)) { \n      uint pos = atomicAdd(_nextVarSize_, 1);\n      _nextVar_[pos] = var;        \n    }       \n  }   \n  __syncthreads();  \n  _currVarSize_ = _count_; //first currVarSize positions are populated\n  __syncthreads();  \n}\n\n// to be executed by one WARP\n__device__ uint lockPtr(uint ptr) {\n  __shared__ volatile uint _shared_[MAX_WARPS_PER_BLOCK];\n  uint intended = PTR(getBlockIdInGrid());\n  if (isFirstThreadOfWarp()) {    \n    _shared_[threadIdx.y] = atomicCAS(__lock__ + ptr, UNLOCKED, intended);      \n  }\n  return _shared_[threadIdx.y];\n}\n\n/**\n * Lock every variable in the current points-to set of the input variable.\n * Granularity: warp\n * @param x A variable locked by the current block\n * @param _currVar_ List of locked variables\n * @param _currVarSize_ Number of locked variables\n * @param _nextVar_ List of variables we could not lock\n * @param _nextVarSize_ Number of variables we could not 
lock\n */\n__device__ void decodeCurrPts(const uint x, uint* const _currVar_, uint* const _currVarSize_, \n    uint* const _nextVar_, uint* const _nextVarSize_) {\n  uint index = getCurrDiffPtsHeadIndex(x);\n  do {\n    uint myBits = __graphGet__(index, threadIdx.x);\n    uint base = __graphGet__(index, BASE);\n    if (base == NIL) {\n      break;\n    }\n    index = __graphGet__(index, NEXT);\n    uint nonEmpty = __ballot_sync(0xffffffff,myBits && threadIdx.x < BASE);\n    uint lastVar = NIL;\n    while (nonEmpty) {\n      uint pos = __ffs(nonEmpty) - 1;\n      nonEmpty &= (nonEmpty - 1);\n      uint bits = getValAtThread(myBits, pos);\n      uint var = mul960(base) + mul32(pos) + threadIdx.x;\n      if (var == I2P || !isBitActive(bits, threadIdx.x)) {\n        var = NIL;\n      } else {\n        uint stat = lockVarRep(var);             \n        if (stat != UNLOCKED) {\n          if (stat != VAR(blockIdx.x) && var != lastVar) { \n            // TODO: do something so we do not lose equivalences. 
This only affects Linux, though\n            uint where = atomicInc(_nextVarSize_, HCD_DECODE_VECTOR_SIZE - 1); \n            _nextVar_[where] = var;              \n            lastVar = var;\n          }         \n          var = NIL;\n        }  \n      }\n      bits = __ballot_sync(0xffffffff,var != NIL);\n      if (!bits) {\n        continue;\n      }\n      uint numOnes = __popc(bits);\n      uint prevNumFrom = 0;\n      if (isFirstThreadOfWarp()) {\n        prevNumFrom = atomicAdd(_currVarSize_, numOnes);\n      }\n      prevNumFrom = getValAtThread(prevNumFrom, 0);\n      // TODO: make sure that (prevNumFrom + numOnes < HCD_DECODE_VECTOR_SIZE)      \n      //if (isFirstThreadOfWarp() && ((prevNumFrom + numOnes) >= HCD_DECODE_VECTOR_SIZE)) { \n      //  //printf(\"Exceeded HCD_DECODE_VECTOR_SIZE!!\\n\"); \n      //} \n      pos = prevNumFrom + __popc(bits & ((1 << threadIdx.x) - 1));\n      if (var != NIL) { \n        _currVar_[pos] = var;\n      }             \n    }\n  } while (index != NIL);\n}\n\n/**\n * Lock a list of (pointer) variables and their points-to sets\n * Granularity: block \n */\n__device__ void lockPtrs(uint* const _currPtr_, uint& _currPtrSize_, uint* const _nextPtr_, \n    uint* _nextPtrSize_, uint* const _currVar_, uint* _currVarSize_, uint* const _nextVar_, \n    uint* _nextVarSize_) {\n  const uint warpsPerBlock = getWarpsPerBlock();  \n  for (int i = threadIdx.y; i < _currPtrSize_; i += warpsPerBlock) {\n    uint ptr = _currPtr_[i];\n    uint stat = lockPtr(ptr);\n    if (stat != UNLOCKED && stat != VAR(blockIdx.x)) {       \n      _currPtr_[i] = NIL;\n      if (isFirstThreadOfWarp()) {\n        uint pos = atomicAdd(_nextPtrSize_, 1);\n        _nextPtr_[pos] = ptr;\n      }          \n    } else {\n      decodeCurrPts(ptr, _currVar_, _currVarSize_, _nextVar_, _nextVarSize_);\n    }\n  }\n  __syncthreads();   \n}\n\n__device__ void unlockPtrs(const uint* const _list_, const uint _listSize_) {\n  int init = getThreadIdInBlock();\n  int 
inc = getThreadsPerBlock();\n  for (int i = init; i < _listSize_; i += inc) {\n    uint var = _list_[i];\n    if (var != NIL) {\n      // if it is locked by VAR(blockIdx.x), keep it that way\n      atomicCAS(__lock__ + var, PTR(blockIdx.x), UNLOCKED);\n    }\n  }\n  __syncthreads();\n}\n\n/**\n * Online phase of Hybrid Cycle Detection\n * This is when things get really hairy -- but the overall performance of the algorithm is \n * dramatically improved by removing the equivalents discovered during the offline analysis, so\n * there is no way around it AFAIK.\n * The kernel takes a list of tuples (y, x_0, ..., x_N) where pts(*y) = pts(x_0) = ... = pts(x_N)\n * Each block pops a pair out of the worklist, and performs the following logic:\n *   a) lock variables y,x_0,...,x_N\n *   b) decode and lock the points-to of x_0,...,x_N\n *   c) merge all the variables that we were able to lock\n *   d) unlock the merged variables\n *   e) repeat a-d for all the variables we were not able to lock\n * Note that e) is not strictly necessary, but we would be missing some (maybe relevant) \n * equivalences that will eventually result in more work for the standard graph rules.\n */\n__global__ void hcd() {\n  __shared__ uint _counter_;\n  /**\n   * list of variables (x,...,x_N) such that all the variables in the set {pts(x),...pts(x_N)}\n   * are pointer-equivalent.\n   */\n  __shared__ uint _ptr_[HCD_TABLE_SIZE * 2];\n  /*\n   * pointer to _ptr_ indicating where the current list starts\n   */\n  __shared__ uint *_currPtr_;\n  /**\n   * pointer to _ptr_ indicating where the next list starts. 
\n   * The reason we need sublists within _ptr_ is that we might not have been able to lock\n   * all the variables in _currPtr_, so everything that is pending (=needs to be processed in the\n   * next iteration) is placed in the subarray pointed to by _nextPtr_\n   */\n  __shared__ uint *_nextPtr_;\n  /**\n   * list of variables that are pointer equivalent (thus need to be merged)\n   */\n  __shared__ uint _currVar_[HCD_DECODE_VECTOR_SIZE];\n  /**\n   * list of variables that are pointer equivalent but could not be locked in the current iteration\n   */\n  __shared__ uint *_nextVar_;\n  __shared__ uint _currPtrSize_, _nextPtrSize_, _currVarSize_, _nextVarSize_;    \n  const uint threadIdInBlock = getThreadIdInBlock();\n  const uint threadsInBlock = getThreadsPerBlock();\n  const uint to = __numHcdIndex__;\n  \n  // first thread of the block picks next hcd pair to work on\n  if (isFirstThreadOfBlock()) {\n    _counter_ = atomicAdd(&__worklistIndex0__, 1);\n    _nextVar_ = __nextVar__ + getBlockIdInGrid() * HCD_DECODE_VECTOR_SIZE;\n  }\n  __syncthreads();\n  while (_counter_ < to) {\n    uint pair = __hcdIndex__[_counter_];\n    uint start = getFirst(pair);\n    uint end = getSecond(pair);\n    // move the (x0,...,x_N) sublist to shared memory\n    for (int i = start + 1 + threadIdInBlock; i < end; i += threadsInBlock) {\n      _ptr_[i - start - 1] = __hcdTable__[i];\n    } \n    if (isFirstWarpOfBlock()) {\n      _currPtrSize_ = end - start - 1;\n      _currVar_[0] = __hcdTable__[start];\n      _currVarSize_ = 1;\n      _currPtr_ = _ptr_;\n      // we do not know how many variables we will not be able to lock, so unfortunately we have\n      // to use a statically fixed index\n      _nextPtr_ = _ptr_ + HCD_TABLE_SIZE;\n    }\n    while (1) {   \n      _nextPtrSize_ = 0;\n      _nextVarSize_ = 0;\n      __syncthreads();           \n      // lock variables in the current variable list (variables that belong to the points-to set\n      // of x_I and could not be 
locked in a previous iteration)\n      lockVars(_currVar_, _currVarSize_, _nextVar_, &_nextVarSize_);     \n      // lock variables in current pointer list, then decode their points-to sets and lock those too\n      lockPtrs(_currPtr_, _currPtrSize_, _nextPtr_, &_nextPtrSize_, _currVar_, &_currVarSize_,  _nextVar_, &_nextVarSize_);\n      // unlock variables in pointer list if they are not in the variable list\n      unlockPtrs(_currPtr_, _currPtrSize_);                        \n      blockSort(_currVar_, _currVarSize_);\n      // merge variable list!\n      mergeCycle(_currVar_, _currVarSize_); \n      // if there is any pending work -because variables or pointers could not be locked-, update\n      // the corresponding information and retry\n      if (!_nextPtrSize_ && (!_nextVarSize_ || (_currVarSize_ + _nextVarSize_ == 1))) {\n        break;\n      }\n      if (isFirstWarpOfBlock() && _currVarSize_) {\n        _currVar_[_nextVarSize_] = _currVar_[0]; // merge representative with pending\n      }\n      __syncthreads();\n      for (int i = threadIdInBlock; i < _nextVarSize_; i+= threadsInBlock) {\n        _currVar_[i] = _nextVar_[i];\n      }\n      if (isFirstWarpOfBlock()) {\n        _currVarSize_ = _nextVarSize_ + (_currVarSize_ > 0);\n        _currPtrSize_ = _nextPtrSize_;\n        uint* tmp = _nextPtr_;\n        _nextPtr_ = _currPtr_;\n        _currPtr_ = tmp;\n      }        \n      __syncthreads(); \n      blockSort(_currVar_, _currVarSize_);       \n    }\n    if (isFirstThreadOfBlock()) {\n      _counter_ = atomicAdd(&__worklistIndex0__, 1);\n    }\n    __syncthreads();    \n  }\n  resetWorklistIndex();\n}\n\n__global__ void updateInfo() {\n  int inc = getThreadsPerGrid();\n  int init = getThreadIdInGrid();\n  uint to = __numVars__;\n  // a) path compression\n  for (int var = init; var < to; var += inc) {\n    uint rep = getRepRec(var); // non-coalesced\n    if (rep != var) {\n      setRep(var, rep); //coalesced\n    }\n    uint diffPtsMask = 
__ballot_sync(0xffffffff,!isEmpty(rep, CURR_DIFF_PTS)); //non aligned\n    __diffPtsMaskSet__(BASE_OF(var), WORD_OF(var), diffPtsMask); //aligned\n  }\n  syncAllThreads();\n  // b) update store rules\n  to = __numStore__;\n  for (int index = init; index < to; index += inc) {\n    // the size of store has been rounded to a multiple of 32, so no out-of-bounds\n    uint src = __storeConstraints__[index];\n    if (src != NIL) {\n      src = getRep(src);\n      uint val = (atomicCAS(__lock__ + src, UNLOCKED, LOCKED) == UNLOCKED) ? src : NIL;\n      __storeConstraints__[index] = val;        \n    }\n  }\n  syncAllThreads();\n  // c) unlock\n  for (int index = init; index < to; index += inc) {\n    uint src = __storeConstraints__[index];\n    if (src != NIL) {\n      unlock(getRep(src));\n    }\n  }\n}\n\n__launch_bounds__ (DEF_THREADS_PER_BLOCK)\n__global__ void initialize() {\n  uint to = __numVars__;\n  uint headerSize = to * ELEMENT_WIDTH;\n  if (isFirstThreadOfBlock()) {\n    __ptsFreeList__ = headerSize;\n    __currDiffPtsFreeList__ = CURR_DIFF_PTS_START - headerSize;    \n    __nextDiffPtsFreeList__ = NEXT_DIFF_PTS_START - headerSize;\n    // after LOAD_INV, STORE and CURR_DIFF_PTS_INV  header regions\n    __otherFreeList__ = COPY_INV_START + headerSize * (LAST_DYNAMIC_REL - COPY_INV + 1);\n  }\n  __syncthreads();\n  int inc = mul32(getWarpsPerGrid());\n  int init = mul32(getWarpIdInGrid());\n  for (int var = init; var < to; var += inc) {\n    unlock(var + threadIdx.x);\n    setRep(var + threadIdx.x, var + threadIdx.x);\n    for (int i = 0; i < WARP_SIZE; i++) {\n      uint index = getHeadIndex(var + i, PTS);\n      __graphSet__(index + threadIdx.x, NIL);\n      index = getHeadIndex(var + i, NEXT_DIFF_PTS);\n      __graphSet__(index + threadIdx.x, NIL);\n      index = getHeadIndex(var + i, CURR_DIFF_PTS);\n      __graphSet__(index + threadIdx.x, NIL);\n      index = getHeadIndex(var + i, COPY_INV);\n      __graphSet__(index + threadIdx.x, NIL);\n      index = 
getHeadIndex(var + i, STORE);\n      __graphSet__(index + threadIdx.x, NIL);\n      index = getHeadIndex(var + i, LOAD_INV);\n      __graphSet__(index + threadIdx.x, NIL);\n    }\n  }\n  inc = mul960(getWarpsPerGrid());\n  init = mul960(getWarpIdInGrid());\n  for (int i = init; i < to; i += inc) {\n    uint base = BASE_OF(i);\n    __diffPtsMaskSet__(base, threadIdx.x, 0);\n  }\n  syncAllThreads();\n  to = __numInitialRep__;\n  init = getThreadIdInGrid();\n  inc = getThreadsPerGrid();\n  // the offline phase of Hybrid Cycle Detection already detected some pointer equivalent variables.\n    for (int i = init; i < to; i += inc) {\n    setRep(__initialNonRep__[i], __initialRep__[i]);    \n  }\n}\n\n__global__ void computeCurrPtsHash() {\n  const uint to = __numVars__;\n  uint src = getAndIncrement(WARP_SIZE);\n  while (src < to) {\n    for (int i = 0; i < WARP_SIZE; i++) {\n      if (!isEmpty(src + i, CURR_DIFF_PTS)) {\n        uint hash = hashCode(getHeadIndex(src + i, CURR_DIFF_PTS));\n        uint next = getAndIncrement(&__numKeysCounter__, 1);\n        __key__[next] = hash;\n        __val__[next] = src + i;\n      }\n    }\n    src = getAndIncrement(WARP_SIZE);\n  }\n  if (resetWorklistIndex()) {\n    __numKeys__ = __numKeysCounter__;\n    __numKeysCounter__ = 0;\n  }  \n}\n\n__global__ void findCurrPtsEquivalents() {\n  __shared__ uint _sh_[WARPS_PER_BLOCK(UPDATE_THREADS_PER_BLOCK) * WARP_SIZE * 2];\n  uint* _key_ = &_sh_[threadIdx.y * WARP_SIZE * 2];\n  uint* _val_ = _key_ + WARP_SIZE;\n\n  const uint to = __numKeys__;\n  uint index = getAndIncrement(WARP_SIZE);\n  while (index < to) {\n    if (index + threadIdx.x < to) {\n      _key_[threadIdx.x] = __key__[index + threadIdx.x];\n      _val_[threadIdx.x] = __val__[index + threadIdx.x];\n    }\n    for (int i = 0; i < WARP_SIZE && index + i < to; i++) {\n      uint var1 = _val_[i];\n      uint var1Head = getHeadIndex(var1, CURR_DIFF_PTS);\n      uint j = _key_[i];\n      while (j < index + i) {\n        uint var2 
= __val__[j];\n        uint var2Head = getHeadIndex(var2, CURR_DIFF_PTS);\n        if (equal(var1Head, var2Head)) {\n          __currPtsHead__[var1] = var2Head;\n          break;\n        }\n        j++;\n      }\n      if (j == index + i) {\n        __currPtsHead__[var1] = var1Head;\n      }\n    }\n    index = getAndIncrement(WARP_SIZE);\n  } \n  resetWorklistIndex();\n}\n\n__host__ void checkKernelErrors(char *msg) {\n  cudaError_t e;\n  cudaThreadSynchronize(); \n  if (cudaSuccess != (e = cudaGetLastError())) {\n    printf(\"\\n%s: %s\\n\", msg, cudaGetErrorString(e));\n    exit(-1);\n  }\n}\n\n__host__ void checkErrors(uint rel) {\n#if CHECK_SPV\n  uint error = 0;\n  checkForErrors << <getBlocks(), THREADS_PER_BLOCK >> >(rel);\n  checkKernelErrors(\"ERROR while checking for errors\");\n  cudaSafeCall(cudaMemcpyFromSymbol(&error, __error__, uintSize, 0, D2H));\n  if (error) {\n    exit(-1);\n  }\n#endif\n}\n\n__host__ void checkAllErrors() {\n  checkErrors(PTS);\n  checkErrors(NEXT_DIFF_PTS);\n  checkErrors(CURR_DIFF_PTS);\n  checkErrors(COPY_INV);\n  checkErrors(LOAD_INV);\n  checkErrors(STORE);\n}\n\n__host__ void addTimeToRule(uint& counter, clock_t& startTime) {\n  uint ellapsedTime = (int) (1000.0f * (clock() - startTime) / CLOCKS_PER_SEC);\n  counter += ellapsedTime;\n  startTime = clock();\n}\n\n__host__ void printRule(const char* msg) {\n#if PRINT_RULES\n    printf(\"%s\", msg);\n#endif\n}\n\ntemplate <typename Vector>\n__host__ void printVector(const Vector& v, uint size) {\n  std::cout << \"[\";\n  for (size_t i = 0; i < size; i++) {    \n    uint num =  v[i];\n    if (num != NIL) {\n      std::cout << num;\n      if (i < size - 1) {\n        std::cout << \", \";\n      }    \n    }\n  }\n  std::cout << \"]\";\n}\n\n__host__ void initializeEdges(uint* &constraintsName, uint &constraintNumber, uint rel) {\n  dim3 dimInitialize(WARP_SIZE, getThreadsPerBlock(DEF_THREADS_PER_BLOCK) / WARP_SIZE);\n  uint* constraints;\n  uint numConstraints;\n  
cudaSafeCall(cudaMemcpyFromSymbol(&constraints, constraintsName, sizeof(uint*)));\n  cudaSafeCall(cudaMemcpyFromSymbol(&numConstraints, constraintNumber, uintSize));\n  device_ptr<uint> src(constraints);\n  device_vector<uint> dstIndex(numConstraints);\n  sequence(dstIndex.begin(), dstIndex.begin() + numConstraints);    \n  uint numSrc = unique_by_key(src, src + numConstraints, dstIndex.begin()).first - src;    \n  addEdges<<<getBlocks() * 3, dimInitialize>>>(constraints, raw_pointer_cast(&dstIndex[0]), \n      constraints + numConstraints, numSrc, rel); \n  if (rel == STORE) {\n    cudaSafeCall(cudaMemcpyToSymbol(__numStore__, &numSrc, uintSize));    \n  } else {\n    cudaFree(constraints);\n  }  \n  checkKernelErrors(\"ERROR while adding initial edges\");\n}\n\nextern \"C\" void createGraph(const uint numObjectVars, const uint maxOffset) {\n  setbuf(stdout, NULL);\n  printf(\"[dev]  Creating graph and masks out of constraints...\");\n  const uint startTime = clock();\n  double startTime2 = rtclock();\n  dim3 dim(WARP_SIZE, getThreadsPerBlock(DEF_THREADS_PER_BLOCK)/ WARP_SIZE);\n\n  /* no need for maximum_residency here, since kernel will fail to launch otherwise */\n\n  initialize<<<getBlocks(), dim>>>();\n  checkKernelErrors(\"ERROR at initialize\");\n\n  initializeEdges(__ptsConstraints__, __numPtsConstraints__, NEXT_DIFF_PTS);\n  initializeEdges(__copyConstraints__, __numCopyConstraints__, COPY_INV);\n  initializeEdges(__loadConstraints__, __numLoadConstraints__, LOAD_INV);\n  initializeEdges(__storeConstraints__, __numStoreConstraints__, STORE);\n  // no need to add GEP_INV edges, there is only one per variable\n\n  createOffsetMasks<<<getBlocks(), dim>>>(numObjectVars, maxOffset);\n  checkKernelErrors(\"ERROR while creating the offset mask\");\n  uint* size;\n  cudaSafeCall(cudaMemcpyFromSymbol(&size, __size__, sizeof(uint*)));    \n  cudaFree(size);\n  \n  printf(\"OK.\\n\");\n  createTime = getEllapsedTime(startTime);\n  createTime2 = rtclock() - 
startTime2;\n}\n\nstruct neqAdapter : public thrust::unary_function<tuple<uint, uint>, uint>{\n  __host__ __device__\n  uint operator()(const tuple<uint, uint>& a) {\n    return get<0>(a) != get<1>(a);\n  }\n};\n\nstruct mulAdapter : public thrust::unary_function<tuple<uint, uint>, uint>{\n  __host__ __device__\n  uint operator()(const tuple<uint, uint>& a) {\n    return get<0>(a) * get<1>(a);\n  }\n};\n\n__host__ void buildHashMap(device_vector<uint>& key, device_vector<uint>& val,const uint size) {\n  sort_by_key(key.begin(), key.begin() + size, val.begin());    \n  thrust::maximum<uint> uintMax;\n  inclusive_scan(\n     make_transform_iterator(\n        make_zip_iterator(make_tuple(\n          make_transform_iterator(\n              make_zip_iterator(make_tuple(key.begin() + 1, key.begin())), \n              neqAdapter()), \n          counting_iterator<uint>(1))), \n        mulAdapter()),\n     make_transform_iterator(\n         make_zip_iterator(make_tuple(\n             make_transform_iterator(\n                 make_zip_iterator(make_tuple(key.begin() + size, key.begin() + size - 1)), \n                 neqAdapter()), \n          counting_iterator<uint>(1))), \n         mulAdapter()), key.begin() + 1, uintMax);  \n  key[0] = 0;          \n}\n\nextern \"C\" uint andersen(uint numVars) {\n  setbuf(stdout, NULL);\n  printf(\"[dev]  Solving: \");\n  const uint startTime = clock();\n  const double startTime2 = rtclock();\n\n  uint iteration = 0;\n  uint updatePtsTime = 0;\n  uint hcdTime = 0;\n  uint ptsEquivTime = 0;\n  uint copyInvTime = 0;\n  uint storeInvTime = 0;\n  uint gepInvTime = 0;\n  dim3 dim512(WARP_SIZE, getThreadsPerBlock(512) / WARP_SIZE);\n  dim3 dimDefThreads(WARP_SIZE, getThreadsPerBlock(DEF_THREADS_PER_BLOCK) / WARP_SIZE);\n  dim3 dimUpdate2(WARP_SIZE, getThreadsPerBlock(UPDATE_THREADS_PER_BLOCK) / WARP_SIZE);\n  dim3 dimHcd(WARP_SIZE, getThreadsPerBlock(HCD_THREADS_PER_BLOCK) / WARP_SIZE);\n  dim3 dimCopy(WARP_SIZE, 
getThreadsPerBlock(COPY_INV_THREADS_PER_BLOCK) / WARP_SIZE);\n  dim3 dimStore(WARP_SIZE, getThreadsPerBlock(STORE_INV_THREADS_PER_BLOCK) / WARP_SIZE);\n  dim3 dimGep(WARP_SIZE, getThreadsPerBlock(GEP_INV_THREADS_PER_BLOCK) / WARP_SIZE);\n \n  device_vector<uint> key(MAX_HASH_SIZE);\n  uint* ptr = raw_pointer_cast(&key[0]);\n  cudaSafeCall(cudaMemcpyToSymbol(__key__, &ptr, sizeof(uint*)));\n  device_vector<uint> keyAux(MAX_HASH_SIZE);\n  ptr = raw_pointer_cast(&keyAux[0]);\n  cudaSafeCall(cudaMemcpyToSymbol(__keyAux__, &ptr, sizeof(uint*)));\n  device_vector<uint> val(MAX_HASH_SIZE);\n  ptr = raw_pointer_cast(&val[0]);  \n  cudaSafeCall(cudaMemcpyToSymbol(__val__, &ptr, sizeof(uint*)));\n\n  clock_t ruleTime = clock();\n  uint blocks = getBlocks();\n  // TODO: mega-hack to avoid race condition on 'gcc' input.\n  uint hcdBlocks = getenv(\"GCC\") ? 4 : blocks;\n  \n  /**\n   * TODO (Jan'11)\n   *  \n   * a) use pointers instead of integers for the indexes, which is possible because all the \n   * inputs can be analyzed using a 4GB heap. Advantages:\n   *   a.1) when dereferencing an index, currently we assume that in reality is a delta with \n   *   respect to __edges__. Because of that, every access to an element becomes *(__edges__ + delta).\n   *   If we are using pointers, we could simply do *ptr. Note that __edges__ is in constant memory.\n   *   a.2.) we could use the malloc in the CUDA libraries. Malloc could potentially be used in two\n   *   places: OTHER and PTS edges. In practice, we currently keep the PTS edges together because they\n   *   contain the solution so we would restric malloc to allocating copy/load/store edges. Since\n   *   malloc returns a pointer, it would be compatible with the index-is-a-pointer system\n   *\n   * b) HCD is buggy when many blocks are used. 
This happens only for the gcc input, so the \n   * temporary patch (see \"hcdBlocks\" variable) is to set the limit of blocks to four.\n   * \n   * c) retrieve the amount of memory and use that as HEAP_SIZE. \n   * \n   *  d) devise a better representation scheme such that all the benchmarks fit in 3GB, so I can effectively\n   *  use an MSI GTX580 (=> much faster than the Tesla C2070 or Quadro 6000) for all the inputs.\n   */  \n\n\n  const int updateInfo_residency = maximum_residency(updateInfo, dim512.x * dim512.y * dim512.z, 0);\n  \n  uint ptsStartIndex;  \n  while (1) {\n    //printf(\"\\n\\nIteration: %u\\n\", iteration);\n    cudaSafeCall(cudaMemcpyFromSymbol(&ptsStartIndex, __ptsFreeList__, uintSize));\n  //printf(\"\\tstart = %d.\\n\", ptsStartIndex);\n    printRule(\"    updating pts...\");\n    updatePtsInformation<<<blocks, dimUpdate2>>>();\n    checkKernelErrors(\"ERROR at update pts\");\n    printRule(\"done\\n\");\n    addTimeToRule(updatePtsTime, ruleTime);\n    bool done = true;\n    cudaSafeCall(cudaMemcpyFromSymbol(&done, __done__, sizeof(bool)));\n    if (done) {\n      break;\n    }\n    // Ideally, we would use one stream to copy all the points-to edges discovered during the \n    // last iteration (resident in the interval [CURR_DIFF_PTS_START, __currDiffPtsFreeList__]) \n    // back to the host while the other stream computes the next iteration, computation that does\n    // not modify the CURR_DIFF_PTS set. 
However, Thrust does not currently support streams, and\n    // kernel invocations using the default stream add a implicit synchronization point [CUDA 4.1\n    // programming guide, 3.2.5.5.4]\n    // If you do want to implement the simultaneous copy-kernel scheme, you can always modify\n    // the Thrust source code or create your custom Thrust library with the stream hardcoded on it.\n    // To avoid going that way, I chose to publish the version of the code that does pay a penalty\n    // for the data transfer.\n       \n    printRule(\"    hcd...\");\n    hcd<<<hcdBlocks, dimHcd>>>();\n    checkKernelErrors(\"ERROR at hcd rule\");                    \n    updateInfo<<<updateInfo_residency * blocks, dim512>>>();\n    checkKernelErrors(\"ERROR while updating information after collapsing\");\n    printRule(\"done\\n\");\n    addTimeToRule(hcdTime, ruleTime);\n\n    printRule(\"    finding curr_pts equivalences...\");\n    computeCurrPtsHash<<<3 * blocks, dimDefThreads>>>();\n    checkKernelErrors(\"ERROR at compute hash\");\n    uint numKeys;\n    cudaSafeCall(cudaMemcpyFromSymbol(&numKeys, __numKeys__, uintSize));\n    buildHashMap(key, val, numKeys);\n    findCurrPtsEquivalents<<<3 * blocks, dimUpdate2>>>();\n    checkKernelErrors(\"ERROR in finding CURR_PTS equivalents\");       \n    printRule(\"done\\n\");\n    addTimeToRule(ptsEquivTime, ruleTime);\n    \n    printRule(\"    copy_inv and load_inv and store2storeInv...\");\n    copyInv_loadInv_store2storeInv<<<blocks, dimCopy>>>();\n    checkKernelErrors(\"ERROR at copy_inv/load_inv/store2storeinv rule\");        \n  \n    cudaSafeCall(cudaMemcpyFromSymbol(&numKeys, __numKeys__, uintSize));    \n    assert(numKeys <= MAX_HASH_SIZE);\n    sort_by_key(key.begin(), key.begin() + numKeys, val.begin());\n    sequence(keyAux.begin(), keyAux.begin() + numKeys);    \n    numKeys = unique_by_key(key.begin(), key.begin() + numKeys, keyAux.begin()).first - key.begin();    \n    
cudaSafeCall(cudaMemcpyToSymbol(__numKeys__, &numKeys, uintSize));   \n    printRule(\"done\\n\");\n    addTimeToRule(copyInvTime, ruleTime);\n    \n    printRule(\"    store_inv...\");\n    storeInv<<<blocks, dimStore>>>();\n    checkKernelErrors(\"ERROR at store_inv rule\");\n    printRule(\"done\\n\");\n    addTimeToRule(storeInvTime, ruleTime);\n\n    printRule(\"    gep_inv...\");\n    gepInv<<<blocks, dimGep>>>();\n    checkKernelErrors(\"ERROR at gep_inv rule\");\n    printRule(\"done\\n\");\n    addTimeToRule(gepInvTime, ruleTime);\n\n    iteration++;\n    printf(\".\");\n  }\n  printf(\"OK.\\n\");\n  printf(\"Iterations = %u.\\n\", iteration);\n  // store the last index for the PTS elements\n  uint ptsEndIndex;  \n  cudaSafeCall(cudaMemcpyFromSymbol(&ptsEndIndex, __ptsFreeList__, uintSize));\n  uint solveTime = getEllapsedTime(startTime);\n  double solveTime2 = rtclock() - startTime2;\n\n  printf(\"SOLVE runtime: %u ms.\\n\", createTime + solveTime);\n  printf(\"SOLVE runtime2: %f ms.\\n\", (createTime2 + solveTime2) * 1000.0);\n  printf(\"    create graph    : %u ms.\\n\", createTime);\n  printf(\"    rule solving    : %u ms.\\n\", solveTime);\n  printf(\"        updatePts   : %u ms.\\n\", updatePtsTime);\n  printf(\"        hcd         : %u ms.\\n\", hcdTime);\n  printf(\"        equiv       : %u ms.\\n\", ptsEquivTime);\n  printf(\"        cpLdSt2inv  : %u ms.\\n\", copyInvTime);\n  printf(\"        store       : %u ms.\\n\", storeInvTime);\n  printf(\"        gepInv      : %u ms.\\n\", gepInvTime);\n  //printf(\"amount of points-to info = %d.\\n\", ptsEndIndex - ptsStartIndex);\n  //  return ptsEndIndex - ptsStartIndex;\n  return ptsEndIndex;\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/pointstoanalysis/andersen.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef ANDERSEN_H\n#define ANDERSEN_H\n\n#include <assert.h>\n#include <cuda.h>\n#include <cuda_runtime.h>\n#include <limits.h>\n#include <math.h>\n#include <stdint.h>\n#include <stdlib.h>\n#include <stdio.h>\n#include <time.h>\n#include <sys/time.h>\n\n// production or debug mode\n// in debug mode, only one warp is active in the whole grid.\n#define DEBUG 0\n#define CHECK_SPV 0\n#define PRINT_RULES 0\n\n// Amount of memory reserved for the graph edges. 
It has to be slightly smaller\n// than the total amount of memory available in the system\n#define HEAP_SIZE_MB (3930)\n\ntypedef unsigned int uint;\ntypedef unsigned long int ulongint;\n#define uintSize (sizeof(uint))\n\n#define BASE_OF(x) ((x) / ELEMENT_CARDINALITY)\n#define WORD_OF(x) (div32((x) % ELEMENT_CARDINALITY))\n#define BIT_OF(x) (mod32(x))\n\n#define WARPS_PER_BLOCK(x) (x / WARP_SIZE)\n#define WARPS(x) (x / WARP_SIZE * BLOCKS)\n\n#define I2P (1U)\n#define NIL (UINT_MAX)\n// offset for the given field of an element\n#define BASE (30U)\n#define NEXT (31U)\n\n#define PTS 0\n#define NEXT_DIFF_PTS 1\n#define CURR_DIFF_PTS 2\n#define COPY_INV 3\n#define LOAD_INV 4\n#define STORE 5\n#define GEP_INV 7\n#define LAST_DYNAMIC_REL STORE\n\n// size (in words) of an element\n#define ELEMENT_WIDTH 32\n#define ELEMENT_CARDINALITY (30 * 32)\n\n#define HEAP_SIZE_MBf ((float)HEAP_SIZE_MB)\n// Size of the region dedicated to CURR_DIFF_PTS edges\n#define CURR__DIFF_PTS_REGION_SIZE_MB ((uint)(HEAP_SIZE_MBf * 0.1425))\n// Size of the region dedicated to copy/load/store edges\n#define OTHER_REGION_SIZE_MB ((uint)(HEAP_SIZE_MBf * 0.1475))\n// sizes are given in 32-bit words\n#define HEAP_SIZE (HEAP_SIZE_MB * 1024 * 256)\n#define MAX_HASH_SIZE (1 << 20)\n#define COPY_INV_START                                                         \\\n  (HEAP_SIZE - OTHER_REGION_SIZE_MB * 1024 * 256) // COPY region\n#define CURR_DIFF_PTS_START (COPY_INV_START - ELEMENT_WIDTH)\n#define NEXT_DIFF_PTS_START                                                    \\\n  (CURR_DIFF_PTS_START - CURR__DIFF_PTS_REGION_SIZE_MB * 1024 * 256 -          \\\n   ELEMENT_WIDTH)\n\n// profiling variables. 
No need to set them up for your system unless you are\n// timing device invocations\n#define CLOCK_FREQUENCY                                                        \\\n  (1500000.0f) // 1.15M cycles per ms for Quadro 6000/ Tesla C2070\n#define TICKS_TO_MS(x) (((double)(x)) / CLOCK_FREQUENCY)\n// bytes to megabytes\n\n#define B2MB(x) ((x) / (1024 * 1024))\n// megabytes to bytes\n#define MB2B(x) ((x)*1024 * 1024)\n\n#define MAX_NODES (1 << 22)\n#define OFFSET_BITS 10\n#define MAX_GEP_OFFSET (1 << OFFSET_BITS)\n#define OFFSET_MASK (MAX_GEP_OFFSET - 1)\n// all 1's except for bits 30 and 31\n#define LT_BASE ((1 << 30) - 1)\n// all 1's except for bit 31\n#define LT_NEXT ((((uint)(1 << 31)) - 1))\n\n// given in words\n#define DECODE_VECTOR_SIZE (128) // has to be power of two\n\n// maximum size of an HCD table\n#define HCD_TABLE_SIZE (256)\n#define HCD_DECODE_VECTOR_SIZE (8192) // maxed out for 'pine' input\n\n#define PRINT_BUFFER_SIZE (16384) // up to this many neighbors\n#define ERROR_MESSAGE_BUFFER_SIZE (512)\n\n#define WARP_SIZE 32\n#define LOG2_32 5\n\n#define MAX_WARPS_PER_BLOCK (32)\n\n// number of threads per block for each rule. 
The thread count is based on the\n// amount of shared memory available and empirical measures.\n//#define DEF_THREADS_PER_BLOCK (1024)\n//#define UPDATE_THREADS_PER_BLOCK (1024)\n//#define HCD_THREADS_PER_BLOCK (512)\n//#define COPY_INV_THREADS_PER_BLOCK (864)\n//#define STORE_INV_THREADS_PER_BLOCK (864)\n//#define GEP_INV_THREADS_PER_BLOCK (1024)\n\n#include \"pta_tuning.h\"\n\n#define UNLOCKED (UINT_MAX)\n#define LOCKED (UINT_MAX - 1)\n#define VAR(x) (((x) + (UINT_MAX >> 1)))\n#define PTR(x) ((x))\n\n#define cudaSafeCall(err)                                                      \\\n  {                                                                            \\\n    if (cudaSuccess != err) {                                                  \\\n      fprintf(stderr, \"%s(%i) : Runtime API error %d : %s.\\n\", __FILE__,       \\\n              __LINE__, (int)err, cudaGetErrorString(err));                    \\\n      exit(-1);                                                                \\\n    }                                                                          \\\n  }\n\n#define D2H cudaMemcpyDeviceToHost\n#define H2D cudaMemcpyHostToDevice\n\nextern \"C\" void createGraph(const uint numObjectVars, const uint maxOffset);\nextern \"C\" uint andersen(uint numVars);\n\n__host__ inline uint getBlocks() {\n  if (DEBUG) {\n    return 1;\n  }\n  cudaDeviceProp deviceProp;\n  cudaGetDeviceProperties(&deviceProp, 0);\n  return deviceProp.multiProcessorCount;\n}\n\n__host__ inline uint getThreadsPerBlock(uint intended) {\n  return DEBUG ? 
WARP_SIZE : intended;\n}\n\n//////////// utility functions used in both the CPU and GPU /////////\n\n__device__ __host__ inline const char* getName(uint rel) {\n  if (rel == PTS)\n    return \"PTS\";\n  if (rel == NEXT_DIFF_PTS)\n    return \"NEXT_DIFF_PTS\";\n  if (rel == CURR_DIFF_PTS)\n    return \"CURR_DIFF_PTS\";\n  if (rel == COPY_INV)\n    return \"COPY_INV\";\n  if (rel == LOAD_INV)\n    return \"LOAD_INV\";\n  if (rel == STORE)\n    return \"STORE\";\n  if (rel == GEP_INV)\n    return \"GEP_INV\";\n  return \"UNKNOWN_REL\";\n}\n\n// ellapsed time, in milliseconds\n__device__ __host__ inline uint getEllapsedTime(const clock_t& startTime) {\n  // TODO: this code should depend on whether it is executing on the GPU or the\n  // CPU\n  return (int)(1000.0f * (clock() - startTime) / CLOCKS_PER_SEC);\n}\n\n__device__ __host__ static inline int isBitActive(uint word, uint bit) {\n  return word & (1 << bit);\n}\n\n__device__ __host__ static inline uint isOdd(uint num) { return num & 1; }\n\n__device__ __host__ static inline uint mul32(uint num) {\n  return num << LOG2_32;\n}\n\n__device__ __host__ static inline uint div32(uint num) {\n  return num >> LOG2_32;\n}\n\n__device__ __host__ static inline uint mod32(uint num) { return num & 31; }\n\n// base has to be a power of two\n__device__ __host__ static inline uint mod(uint num, uint base) {\n  return num & (base - 1);\n}\n\n__device__ __host__ static inline uint getFirst(uint pair) {\n  return pair >> 16;\n}\n\n__device__ __host__ static inline uint getSecond(uint pair) {\n  return (pair & 0x0000FFFF);\n}\n\n__device__ __host__ static inline uint createPair(uint first, uint second) {\n  return (first << 16) | second;\n}\n\n// related to GEP constraints\n__device__ __host__ static inline uint offset(const uint srcOffset) {\n  return srcOffset & OFFSET_MASK;\n}\n\n// related to GEP constraints\n__device__ __host__ static inline uint id(const uint srcOffset) {\n  return srcOffset >> OFFSET_BITS;\n}\n\n__device__ 
__host__ static inline uint idOffset(const uint src,\n                                                const uint offset) {\n  return offset | (src << OFFSET_BITS);\n}\n\n// e.g. for powerOfTwo==32: 4 => 32, 32 => 32, 33 => 64\n// second parameter has to be a power of two\n__device__ __host__ static inline uint roundToNextMultipleOf(uint num,\n                                                             uint powerOfTwo) {\n  if ((num & (powerOfTwo - 1)) == 0) {\n    return num;\n  }\n  return (num / powerOfTwo + 1) * powerOfTwo;\n}\n\n// e.g. for powerOfTwo==32: 0 => 0, 4 => 0, 32 => 32, 33 => 32\n// second parameter has to be a power of two\n__device__ __host__ static inline uint roundToPrevMultipleOf(uint num,\n                                                             uint powerOfTwo) {\n  if ((num & (powerOfTwo - 1)) == 0) {\n    return num;\n  }\n  return ((num / powerOfTwo + 1) * powerOfTwo) - 32;\n}\n\n// The second parameter has to be a power of 2\n__device__ __host__ static inline int isMultipleOf(uint num, uint powerOfTwo) {\n  return !(num & (powerOfTwo - 1));\n}\n\nstatic double rtclock() {\n  struct timezone Tzp;\n  struct timeval Tp;\n  int stat;\n  stat = gettimeofday(&Tp, &Tzp);\n  if (stat != 0)\n    printf(\"Error return from gettimeofday: %d\", stat);\n  return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/analytics/gpu/pointstoanalysis/pta.cu",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n\n#include <algorithm>\n#include <fstream>\n#include <iostream>\n#include <iterator>\n#include <sstream>\n#include <vector>\n\n#include \"andersen.h\"\n//#include \"mySVN/gzstream.h\"\t// commented by,\n#define igzstream ifstream\t// added by,\n\n#include \"andersen.cu\"\t\t// added by Rupesh Nasre on Dec 27, 2012.\nusing namespace std;\n    \n// check that the obtained solution is a subset of the desired solution. 
Useful when trying to \n// detect bugs (for instance, detected the 1st iteration such that the inclusion does not hold)\n#define USE_INCLUSION (2)\n\nstatic uint transferH2dTime = 0;\nstatic uint transferD2hTime = 0;\n\n//static void printDeviceMemory() {\n  //size_t uCurAvailMemoryInBytes, uTotalMemoryInBytes;\n  //cudaMemGetInfo( &uCurAvailMemoryInBytes, &uTotalMemoryInBytes );\n  //cout << \"[host] GPU's total memory: \"<< B2MB(uTotalMemoryInBytes) << \" MB, free Memory: \"\n  //        << B2MB(uCurAvailMemoryInBytes) << \" MB\" << endl;    \n  //if (B2MB(uCurAvailMemoryInBytes) < 3930) {\n  //    cout << \"Warning: there is not enough memory in your GPU to analyze all inputs.\" << endl;\n  //}\n//}\n\nstatic void printVector(const vector<uint>& m) {\n  vector<uint>::size_type size = m.size();\n  cout << \"[\";\n  if (size) {\n    ostream_iterator<uint> out_it (cout,\", \");\n    std::copy(m.begin(), m.begin() + size - 1, out_it);\n    cout << m[size - 1];\n  }\n  cout << \"]\";\n}\n\nstatic void printVector(uint* m, const uint size) {\n  cout << \"[\";\n  if (size) {\n    ostream_iterator<uint> out_it (cout,\", \");\n    std::copy(m, m + size - 1, out_it);\n    cout << m[size - 1];\n  }\n  cout << \"]\";\n}\n\nvoid printMatrix(uint* m, const uint rows, const uint cols) {\n  printf(\"[\");\n  for (uint i = 0; i < rows; i++) {\n    if (i > 0) {\n      printf(\" \");\n    }\n    printVector(&m[i * cols], cols);\n    if (i < rows - 1) {\n      printf(\"\\n\");\n    }\n  }\n  printf(\"]\\n\");\n}\n\nvoid checkGPUConfiguration() {\n  int deviceCount;\n  cudaGetDeviceCount(&deviceCount);\n  if (deviceCount == 0) {\n    cerr << \"There is no device supporting CUDA\\n\" << endl;\n    exit(-1);\n  }\n  cudaDeviceProp deviceProp;\n  cudaGetDeviceProperties(&deviceProp, 0);\n  if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) {\n    cerr << \"There is no CUDA capable device\" << endl;\n    exit(-1);\n  }\n  if ((WARP_SIZE != 32)) {\n    cerr << \"Warp size must 
be 32\" << endl ;\n    exit(-1);\n  }\n  // Make printf buffer bigger, otherwise some printf messages are not displayed\n  size_t limit;\n  cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize); \n  cudaDeviceSetLimit(cudaLimitPrintfFifoSize, limit * 16);\n  // Make stack bigger, otherwise recursive functions will fail silently (?)\n  //cudaThreadGetLimit(&limit, cudaLimitStackSize);\n  //cudaThreadSetLimit(cudaLimitStackSize, limit * 8);\n}\n\nuint nextUint(istringstream& lineStream) {\n  string item;\n  getline(lineStream, item, ',');\n  return atoi(item.c_str());\n}\n\nstring skipBlanksAndComments(igzstream& inFile){\n  string line;  \n  for (;;) {\n    getline(inFile, line);\n    if (!line.empty() && line[0] != '#') {\n      return string(line);\n    }\n  }\n}\n\nuint readNumVars(igzstream &inFile) {\n  string line = skipBlanksAndComments(inFile);\n  istringstream linestream(line);\n  return nextUint(linestream);\n}\n\nuint readNodes(char *fileName, uint& numVars, uint& numObjectVars) {\n  cout << \"[host] Reading nodes...\" << flush;\n  igzstream inFile(fileName, igzstream::in);\n  if (!inFile) {\n    fprintf(stderr, \"Error: file %s not found.\\n\", fileName);\n    exit(-1);\n  }\n  string line = skipBlanksAndComments(inFile);\n  istringstream linestream(line);\n  // read total number of variables\n  numVars = roundToNextMultipleOf(nextUint(linestream), 32);\n    // cout << \"number of variables: \" << numVars << endl;\n  line = skipBlanksAndComments(inFile);\n  istringstream linestream2(line);\n  // for some reason, the number stored is lastObjectVar\n  numObjectVars = nextUint(linestream2) + 1; \n  // cout << \"    object variables: \" << numObjectVars << endl;\n  skipBlanksAndComments(inFile); // skip lastFunctionNode\n  uint length = roundToNextMultipleOf(numObjectVars, 32);\n  uint* size = new uint[length];\n  assert (size != NULL);\n  for (uint i = 0; i < numObjectVars; i++) {\n    line = skipBlanksAndComments(inFile);\n    istringstream 
linestream(line);\n    nextUint(linestream);  // ignore var ID\n    size[i] = nextUint(linestream);\n    nextUint(linestream);// ignore functionNode crap\n  } \n  inFile.close();\n  for (uint i = numObjectVars; i < length; i++) {\n    size[i] = 0;\n  }\n  const uint startTime = clock();\n  uint* sizeLocal;\n  cudaSafeCall(cudaMalloc((void **) &sizeLocal, length * uintSize));\n  cudaSafeCall(cudaMemcpy(sizeLocal, size, length * uintSize, H2D));\n  cudaSafeCall(cudaMemcpyToSymbol(__size__, &sizeLocal, sizeof(uint*)));\n  cudaSafeCall(cudaMemcpyToSymbol(__numVars__, &numVars, uintSize));\n  transferH2dTime += getEllapsedTime(startTime);\n  cout << \"OK.\" << endl << flush;\n  return numObjectVars;\n}\n\nuint inline padNumber(uint num) {\n  uint ret = roundToNextMultipleOf(num, 32);\n  if (ret == num) {\n    ret = roundToNextMultipleOf(num + 1, 32);\n  }\n  return ret;\n}\n\nuint* readConstraints(igzstream &inFile, uint rows) {\n  uint length = padNumber(rows);\n  uint* constraints = new uint[length * 2];\n  assert (constraints != NULL);\n  for (uint i = 0; i < rows; i++) {\n    string line = skipBlanksAndComments(inFile);\n    istringstream linestream(line);\n    nextUint(linestream);  // ignore constraint ID\n    uint src = nextUint(linestream);\n    uint dst = nextUint(linestream);\n    nextUint(linestream); // ignore type\n    uint offset = nextUint(linestream); \n    if (offset) {\n      cerr << \"Detected constraint with offset\" << endl << flush;\n      exit(-1);\n    }\n    constraints[i] = dst;\n    constraints[i + length] = src;\n  }  \n  // pad with NILs\n  for (uint i = rows; i < length; i++) {\n    constraints[i] = NIL;\n    constraints[i + length] = NIL;\n  }\n  return constraints;\n}\n\nvoid readAndTransferConstraints(igzstream &inFile, uint numConstraints, uint* &constraintsName, \n    uint &numConstraintsName) {\n  uint* constraints = readConstraints(inFile, numConstraints);\n  const uint startTime = clock();\n  uint* constraintLocal;\n  uint 
paddedSize = padNumber(numConstraints);\n  size_t size = paddedSize * uintSize * 2;\n  cudaSafeCall(cudaMalloc((void **) &constraintLocal, size));\n  cudaSafeCall(cudaMemcpyToSymbol(constraintsName, &constraintLocal, sizeof(uint*)));\n  cudaSafeCall(cudaMemcpyToSymbol(numConstraintsName, &paddedSize, uintSize));\n  cudaSafeCall(cudaMemcpy(constraintLocal, constraints, size, H2D));\n  transferH2dTime += getEllapsedTime(startTime);\n  delete [] constraints;\n}\n\nvoid readAndTransferGepConstraints(igzstream &inFile, uint numConstraints, uint& maxOffset) {\n  uint length = roundToNextMultipleOf(numConstraints * 2, 32);\n  uint* constraints = new uint[length];\n  assert (constraints != NULL);\n  for (uint i = 0; i < numConstraints; i++) {\n    string line = skipBlanksAndComments(inFile);\n    istringstream linestream(line);\n    nextUint(linestream); // ignore constraint ID\n    uint src = nextUint(linestream);\n    uint dst = nextUint(linestream);\n    nextUint(linestream); // ignore type\n    uint offset = nextUint(linestream);\n    if (offset > maxOffset) {\n      maxOffset = offset;\n    }\n    if (offset > MAX_GEP_OFFSET) {\n      cerr << \"Offset too large: \" << offset << \" (max. 
allowed: \" << MAX_GEP_OFFSET << \")\";\n      exit(-1);\n    }\n    constraints[i * 2] = dst;\n    constraints[i * 2 + 1] = idOffset(src, offset);\n  } \n  // pad with NILs\n  for (uint i = numConstraints * 2; i < length; i++) {\n    constraints[i] = NIL;\n  }\n  \n  const uint startTime = clock();\n  uint* formattedConstraintsLocal;\n  cudaSafeCall(cudaMalloc((void **) &formattedConstraintsLocal, length * uintSize));\n  cudaSafeCall(cudaMemcpy(formattedConstraintsLocal, constraints, length * uintSize, H2D));\n  cudaSafeCall(cudaMemcpyToSymbol(__gepInv__, &formattedConstraintsLocal, sizeof(uint*)));\n  cudaSafeCall(cudaMemcpyToSymbol(__numGepInv__, &numConstraints, uintSize, 0, H2D));\n  transferH2dTime += getEllapsedTime(startTime);\n  delete [] constraints;\n}\n\n// returns a pointer to __pts__\nvoid readConstraints(char *fileName, uint numVars, uint& maxOffset) {\n  cout << \"[host] Reading constraints...\" << flush;\n  igzstream inFile(fileName, igzstream::in);\n  if (!inFile) {\n    fprintf(stderr, \"Error: file %s not found.\\n\", fileName);\n    exit(-1);\n  }\n  string line = skipBlanksAndComments(inFile);\n  istringstream linestream(line);\n  uint numAddressOf = nextUint(linestream); \n  uint numCopy = nextUint(linestream);\n  uint numLoad = nextUint(linestream);\n  uint numStore = nextUint(linestream);\n  uint numGep = nextUint(linestream);\n  readAndTransferConstraints(inFile, numAddressOf, __ptsConstraints__, __numPtsConstraints__);\n  readAndTransferConstraints(inFile, numCopy, __copyConstraints__, __numCopyConstraints__);\n  readAndTransferConstraints(inFile, numLoad, __loadConstraints__, __numLoadConstraints__);\n  readAndTransferConstraints(inFile, numStore, __storeConstraints__, __numStoreConstraints__);\n  uint headerSize = numVars * ELEMENT_WIDTH;\n  uint start = COPY_INV_START + headerSize;\n  cudaSafeCall(cudaMemcpyToSymbol(__loadInvStart__, &start, sizeof(uint)));\n  start += headerSize;\n  cudaSafeCall(cudaMemcpyToSymbol(__storeStart__, 
&start, sizeof(uint)));\n  readAndTransferGepConstraints(inFile, numGep, maxOffset);\n  inFile.close();\n  cout << \"OK.\" << endl << flush;\n}\n\n// TODO: this code is too complex, simplify\nvoid readHcdInfo(char *fileName) {\n  cout << \"[host] Reading HCD table...\" << flush;\n  igzstream inFile(fileName, igzstream::in);\n  if (!inFile) {\n    fprintf(stderr, \"Error: file %s not found.\\n\", fileName);\n    exit(-1);\n  }\n  // a) read initial table of representatives\n  string line = skipBlanksAndComments(inFile);\n  istringstream linestream(line);\n  uint numMerged = nextUint(linestream);\n  uint* initialNonRep = new uint[numMerged];\n  uint* initialRep = new uint[numMerged];\n  for (uint i = 0; i < numMerged; i++) {\n    string line = skipBlanksAndComments(inFile);\n    istringstream linestream(line);\n    uint var = nextUint(linestream);\n    uint rep = nextUint(linestream);\n    initialNonRep[i] = var;\n    initialRep[i] = rep;\n  }\n  int* initRepLocal;\n  // transfer index table\n  cudaSafeCall(cudaMalloc((void **) &initRepLocal, uintSize * numMerged));\n  cudaSafeCall(cudaMemcpy(initRepLocal, initialRep, uintSize * numMerged, H2D));\n  cudaSafeCall(cudaMemcpyToSymbol(__initialRep__, &initRepLocal, sizeof(uint*)));\n  cudaSafeCall(cudaMalloc((void **) &initRepLocal, uintSize * numMerged));\n  cudaSafeCall(cudaMemcpy(initRepLocal, initialNonRep, uintSize * numMerged, H2D));\n  cudaSafeCall(cudaMemcpyToSymbol(__initialNonRep__, &initRepLocal, sizeof(uint*)));\n  cudaSafeCall(cudaMemcpyToSymbol(__numInitialRep__, &numMerged, uintSize));\n  // b) read HCD table itself\n  {\n    string line = skipBlanksAndComments(inFile);\n    istringstream linestream(line);\n    uint numKeys = nextUint(linestream);\n    uint numValues = nextUint(linestream);\n    uint hcdTableSize = numKeys + numValues;\n    uint* table = new uint[hcdTableSize];\n    uint* index = new uint[numKeys];\n    if (numKeys) {\n      uint keys = 0;\n      uint lastY = 0;\n      index[keys] = 
getFirst(0);\n      for (uint i = 0; i < numValues; i++) {\n        string line = skipBlanksAndComments(inFile);\n        istringstream linestream(line);\n        uint y = nextUint(linestream);\n        uint x = nextUint(linestream);\n        if (y != lastY) {\n          table[i + keys] = y;\n          if (keys) {\n            assert(((i + keys) - (index[keys - 1])) <= HCD_TABLE_SIZE);\n            index[keys - 1] = createPair(index[keys - 1], i + keys);\n            index[keys] = i + keys;\n          }\n          keys++;\n          lastY = y;\n        }\n        table[i + keys] = x;\n      }\n      assert(((numKeys + numValues) - (index[keys - 1])) <= HCD_TABLE_SIZE);\n      index[keys - 1] = createPair(index[keys - 1], numKeys + numValues);\n    }\n    int* hcdIndexLocal;\n    int* hcdTableLocal;\n    // transfer index table\n    cudaSafeCall(cudaMalloc((void **) &hcdIndexLocal, uintSize * numKeys));\n    cudaSafeCall(cudaMemcpy(hcdIndexLocal, index, uintSize * numKeys, H2D));\n    cudaSafeCall(cudaMemcpyToSymbol(__hcdIndex__, &hcdIndexLocal, sizeof(uint*)));\n    cudaSafeCall(cudaMemcpyToSymbol(__numHcdIndex__, &numKeys, uintSize));\n    // transfer HCD table\n    cudaSafeCall(cudaMalloc((void **) &hcdTableLocal, uintSize * (numKeys + numValues)));\n    cudaSafeCall(cudaMemcpy(hcdTableLocal, table, uintSize * (numKeys + numValues), H2D));\n    cudaSafeCall(cudaMemcpyToSymbol(__hcdTable__, &hcdTableLocal, sizeof(uint*)));\n    cudaSafeCall(cudaMemcpyToSymbol(__numHcdTable__, &hcdTableSize, uintSize));\n    }\n  cout << \"OK.\" << endl << flush;\n}\n\n// allocate memory for the graph edges\nuint* allocateElementPool() {\n  const uint startTime = clock();\n  uint* elementPoolLocal;\n  \n  size_t size =  HEAP_SIZE * sizeof(uint);\n  cudaSafeCall(cudaMalloc((void **) &elementPoolLocal, size));\n  // elements are initialized on the GPU, so we only transfer the pointers \n  cudaSafeCall(cudaMemcpyToSymbol(__graph__, &elementPoolLocal, sizeof(uint*)));\n  
cudaSafeCall(cudaMemcpyToSymbol(__edges__, &elementPoolLocal, sizeof(uint*)));\n  transferH2dTime += getEllapsedTime(startTime);\n  return elementPoolLocal;\n}\n\nuint* allocateOther(uint numVars) {\n  uint* lockLocal;\n  size_t size =  roundToNextMultipleOf(numVars, 32) * sizeof(uint);\n  cudaSafeCall(cudaMalloc((void **) &lockLocal, size));\n  cudaSafeCall(cudaMemcpyToSymbol(__lock__, &lockLocal, sizeof(uint*)));\n  cudaSafeCall(cudaMalloc((void **) &lockLocal, size));\n  cudaSafeCall(cudaMemcpyToSymbol(__currPtsHead__, &lockLocal, sizeof(uint*)));\n  cudaSafeCall(cudaMalloc((void **) &lockLocal, getBlocks() * HCD_DECODE_VECTOR_SIZE));\n  cudaSafeCall(cudaMemcpyToSymbol(__nextVar__, &lockLocal, sizeof(uint*)));\n  cudaSafeCall(cudaMalloc((void **) &lockLocal, size));\n  cudaSafeCall(cudaMemcpyToSymbol(__rep__, &lockLocal, sizeof(uint*)));\n  return lockLocal;\n}\n\nvoid allocateDiffPtsMask(uint numVars) {\n  int* maskLocal; \n  int rows = ceil((float) numVars /  (float) ELEMENT_CARDINALITY);\n  size_t size =  rows * ELEMENT_WIDTH * sizeof(uint);\n  cudaSafeCall(cudaMalloc((void **) &maskLocal, size));\n  cudaSafeCall(cudaMemcpyToSymbol(__diffPtsMask__, &maskLocal, sizeof(uint*)));\n}\n\nvoid allocateOffsetMask(uint numObjectVars, uint maxOffset) {\n  int* maskLocal;\n  int rows = ceil((float) numObjectVars /  (float) ELEMENT_CARDINALITY);\n  size_t size =  rows * ELEMENT_WIDTH * maxOffset * sizeof(uint);\n  cudaSafeCall(cudaMalloc((void **) &maskLocal, size));\n  cudaSafeCall(cudaMemcpyToSymbol(__offsetMask__, &maskLocal, sizeof(uint*)));\n  cudaSafeCall(cudaMemcpyToSymbol(__offsetMaskRowsPerOffset__, &rows, sizeof(uint)));\n}\n\nuint* allocateOthers(const uint numVars, const uint numObjectVars, const uint maxOffset) {\n  const uint startTime = clock();\n  uint* repD = allocateOther(numVars);\n  allocateDiffPtsMask(numVars);\n  allocateOffsetMask(numObjectVars, maxOffset);\n  transferH2dTime += getEllapsedTime(startTime);\n  return repD;\n}\n\nvoid 
convertCsvIntoVector(string csv, vector<uint>& ret) {\n  if (csv.empty()) {\n    return;\n  }\n  istringstream linestream(csv);\n  while (!linestream.eof()) {\n    uint next = nextUint(linestream);\n    ret.push_back(next);\n  }\n}\n\nvoid getPts(uint var, uint* ptsEdges, uint ptsSize, vector<uint>& ret) {\n  uint index = mul32(var);\n  do {\n    if (index > ptsSize) {\n      cerr << \"Error at variable \" << var << \". The NEXT field exceeds the size of PTS. Next: \"\n          << index << \", size: \" << ptsSize << endl << flush;\n      return;\n      //exit(-1);\n    }\n    uint base = ptsEdges[index + BASE];\n    // if base == NIL => empty adjancency list\n    if (base == NIL) {\n      return;\n    }\n    for (uint j = 0; j < BASE; j++) {\n      uint word = ptsEdges[index + j];\n      if (!word) {\n        continue;\n      }\n      for (uint z = 0; z < WARP_SIZE; z++) {\n        if (isBitActive(word, z)) {\n          uint num = base * ELEMENT_CARDINALITY + j * WARP_SIZE + z;\n          ret.push_back(num);\n        }\n      }\n    }\n    index = ptsEdges[index + NEXT];\n  } while (index != NIL);\n}\n\nvoid verifySolution(bool useInclusion, uint* ptsEdges, uint ptsSize, uint* rep, const vector<uint>& vars,\n    const vector<uint>& sol) {\n  for (uint i = 0; i < vars.size(); i++) {\n    uint var = vars[i];\n    vector<uint> ptsVar;\n    uint representative = rep[var];   \n    if (representative != var) {\n      // non-representative: simply make sure that the representative is included in 'vars'\n      if (std::find(vars.begin(), vars.end(), representative) == vars.end()) {\n        getPts(representative, ptsEdges, ptsSize, ptsVar);\n        cerr << \"Error at variable \" << var << \" (rep=\" << representative\n            << \"): the obtained pts (1st line) differs from the correct solution (2nd line)\" << endl;\n       printVector(ptsVar);\n       cerr << endl;\n       printVector(sol);\n       cerr << endl;      \n       exit(-1);\n      }\n    } else {\n      
getPts(representative, ptsEdges, ptsSize, ptsVar);\n      bool OK = useInclusion ? includes(sol.begin(), sol.end(), ptsVar.begin(), ptsVar.end()) : \n        (ptsVar == sol);\n      if (!OK) {\n        cerr << \"Error at representative \" << var << \": the obtained pts (1st line) \"\n             << \"differs from the correct solution (2nd line)\" << endl;\n       printVector(ptsVar);\n       cerr << endl;\n       printVector(sol);\n       cerr << endl;      \n       exit(-1);\n      }\n    }\n  }\n}\n\nvoid verifySolution(uint verify, uint* ptsEdges, uint ptsSize, uint* rep, char* solFile) {\n  if (!verify) {\n    return;\n  }\n  igzstream inFile(solFile, igzstream::in);\n  if (!inFile) {\n    fprintf(stderr, \"Error: file %s not found.\\n\", solFile);\n    exit(-1);\n  }\n  if (verify == USE_INCLUSION) {\n    cerr << \"[host] WARNING: verification uses inclusion.\" << endl << flush;\n  }\n  cerr << \"[host] Verifying against \" << solFile << \"...\" << flush;\n  string line;  \n  getline(inFile, line); // skip first line\n  while (getline(inFile, line)) {\n    size_t pos = line.find(\"] => [\");\n    string lhs = line.substr(1, pos - 1);\n    vector<uint> vars;\n    convertCsvIntoVector(lhs, vars);\n    string rhs = line.substr(pos + 6);\n    rhs = rhs.substr(0, rhs.size() - 1);\n    vector<uint> sol;   \n    convertCsvIntoVector(rhs, sol);\n    verifySolution(verify == USE_INCLUSION, ptsEdges, ptsSize, rep, vars, sol);\n  }\n  inFile.close();\n  cerr << \"OK.\" << endl << flush;\n}\n\nvoid printSolution(uint numVars, uint* ptsEdges, uint ptsSize) {\n  for (uint i = 0; i < numVars; i++) {\n    vector<uint> ptsVar;\n    getPts(i, ptsEdges, ptsSize, ptsVar);\n    if (!ptsVar.empty()) {\n      cout << i << \" => \" << flush;\n      printVector(ptsVar);\n      cout << endl << flush;\n    }\n  }\n}\n\n// transfer back PTS and representative tables\nvoid transferBackInfo(uint verify, uint numVars, uint* edgesD, uint ptsSize, uint* repD, char* solFile) {\n  cerr << 
\"[host] Tranferring back \" << B2MB(ptsSize * 4) << \" MB...\" << flush;\n  const uint startTime = clock();\n  uint* ptsEdges = NULL;\n  cudaSafeCall(cudaHostAlloc((void**) &ptsEdges, ptsSize * uintSize, 0));\n  cudaSafeCall(cudaMemcpy(ptsEdges, edgesD, ptsSize * uintSize, D2H));\n  uint* rep = NULL; \n  cudaSafeCall(cudaHostAlloc((void**) &rep, numVars * uintSize, 0));\n  cudaSafeCall(cudaMemcpy(rep, repD, numVars * uintSize, D2H));\n  //printSolution(numVars, ptsEdges, ptsSize);\n  transferD2hTime += getEllapsedTime(startTime);\n  cerr << \"OK.\" << endl << flush;\n  cout << \"TRANSFER runtime: \"  << (transferH2dTime + transferD2hTime) << \" ms.\" << endl;\n  cout << \"    h2d: \" << transferH2dTime << \" ms.\" << endl;\n  cout << \"    d2h: \" << transferD2hTime << \" ms.\" << endl;\n  verifySolution(verify, ptsEdges, ptsSize, rep, solFile);\n  cudaSafeCall(cudaFreeHost(ptsEdges));\n  cudaSafeCall(cudaFreeHost(rep));\n}\n\nint main(int argc, char** argv) {  \n  if ((argc < 5) || (argc > 7)) {\n    cerr << \"Usage : \" << argv[0] << \" NODES_FILE CONSTRAINTS_FILE HCD_TABLE SOLUTION_FILE [TRANSFER, VERIFY]\" << endl;\n    exit(-1);\n  }\n//  printDeviceMemory();\n  // TODO: a lot of checks on the arguments are missing...\n  bool transfer = false;\n  int verify = 0;\n  if (argc > 5) {\n    transfer = atoi(argv[5]);\n    verify = atoi(argv[6]);\n  }\n  checkGPUConfiguration();\n  uint maxOffset = 0; \n  uint numVars, numObjectVars;\n  string input(argv[1]);\n  size_t start = input.find_last_of('/') + 1;\n  size_t end = input.find('_');\n  cerr << \"\\n[host] Input: \" <<  input.substr(start, end - start) << endl;\n#ifdef __LP64__\n  cout << \"[host] 64-bit detected.\" << endl << flush;\n#endif\n  readNodes(argv[1], numVars, numObjectVars);   \n  readConstraints(argv[2], numVars, maxOffset);\n\tprintf(\"%d\\t%d\\n\", numObjectVars, numVars); \n  readHcdInfo(argv[3]);\n  uint* edgesD = allocateElementPool();\n  uint* repD = allocateOthers(numVars, numObjectVars, 
maxOffset);\n  createGraph(numObjectVars, maxOffset);\n  uint endIndex = andersen(numVars);\n  if (transfer) {\n    transferBackInfo(verify, numVars, edgesD, endIndex, repD, argv[4]);\n  }\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/pointstoanalysis/pta_tuning.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#pragma once\n#define GPU_NAME \"Quadro 6000\"\n#define GPU_VERSION_MAJOR 2\n#define GPU_VERSION_MINOR 0\n#define RT_VERSION 5050\n#define DRV_VERSION 6050\n\n#define DEF_THREADS_PER_BLOCK 480\n#define UPDATE_THREADS_PER_BLOCK 992\n#define HCD_THREADS_PER_BLOCK 256\n#define COPY_INV_THREADS_PER_BLOCK 512\n#define STORE_INV_THREADS_PER_BLOCK 352\n#define GEP_INV_THREADS_PER_BLOCK 512\nstatic const char* TUNING_PARAMETERS =\n    \"DEF_THREADS_PER_BLOCK 480\\nUPDATE_THREADS_PER_BLOCK \"\n    \"992\\nHCD_THREADS_PER_BLOCK 256\\nCOPY_INV_THREADS_PER_BLOCK \"\n    \"512\\nSTORE_INV_THREADS_PER_BLOCK 352\\nGEP_INV_THREADS_PER_BLOCK 512\\n\";\n"
  },
  {
    "path": "lonestar/analytics/gpu/pointstoanalysis/support.cu",
    "content": "\n"
  },
  {
    "path": "lonestar/analytics/gpu/spanningtree/CMakeLists.txt",
    "content": "app_analy_gpu(mst minimum-spanningtree)\nadd_test_gpu(minimum-spanningtree rmat15 rmat15.out mst -o rmat15.out ${BASEINPUT}/scalefree/symmetric/rmat15.sgr)\n"
  },
  {
    "path": "lonestar/analytics/gpu/spanningtree/README.md",
    "content": "Minumum Spanning Tree\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nThis benchmark computes a minimum spanning tree in a graph. This program uses worklists for better performance.\nThe algorithm is implemented by successive edge-relaxations of the minimum weight edges. However, since an explicit edge-relaxation involves modifying the graph, the implementation performs edge-relaxation indirectly. This is done by keeping track of the set of nodes that have been merged, called components, which avoids modifications to the graph. Each component's size grows in each iteration, while the number of components reduces (due to components getting merged). \n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric Galois .gr graphs.\nYou must specify the -symmetricGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/gpu/spanningtree; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run default algorithm, use the following:\n\n-`$ ./maximal-independent-gpu -o=<output-file> <input-graph> -symmetricGraph`\n-`$ ./maximal-independent-gpu -o=rmat15.out rmat15.sgr -symmetricGraph`\n"
  },
  {
    "path": "lonestar/analytics/gpu/spanningtree/mst-tex.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraphTex &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=False $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ hacks=set([]) $ np_factor=1 $ instrument=set([]) $ unroll=[] $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=False $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=texture $ cuda.use_worklist_slots=True $ cuda.worklist_type=texture\";\nAppendOnlyList el;\n#include \"mst.h\"\n#define INF UINT_MAX\nconst int DEBUG = 0;\nstatic const int __tb_union_components = TB_SIZE;\n__global__ void init_wl(CSRGraphTex graph, WorklistT in_wl, WorklistT out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    (out_wl).push(node);\n  }\n}\n__global__ void find_comp_min_elem(CSRGraphTex graph, struct comp_data comp, LockArrayTicket complocks, ComponentSpace cs, int level, WorklistT in_wl, WorklistT out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type wlnode_end;\n\n  wlnode_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    int node;\n    bool pop;\n    index_type edge_end;\n    pop = (in_wl).pop_id(wlnode, node);\n    unsigned minwt = INF;\n    unsigned minedge = INF;\n    int degree = graph.getOutDegree(node);\n    int mindstcomp  = 0;\n    
int srccomp = cs.find(node);\n    edge_end = (graph).getFirstEdge((node) + 1);\n    for (index_type edge = (graph).getFirstEdge(node) + 0; edge < edge_end; edge += 1)\n    {\n      int edgewt = graph.getAbsWeight(edge);\n      if (edgewt < minwt)\n      {\n        int dstcomp = cs.find(graph.getAbsDestination(edge));\n        if (dstcomp != srccomp)\n        {\n          minwt = edgewt;\n          minedge = edge;\n        }\n      }\n    }\n    if (minwt != INF)\n    {\n      (out_wl).push(node);\n      {\n        volatile bool done_ = false;\n\t\tint _ticket = (complocks).reserve(srccomp);\n        while (!done_)\n        {\n          if (complocks.acquire_or_fail(srccomp, _ticket))\n          {\n            if (comp.minwt[srccomp] == 0 || (comp.lvl[srccomp] < level) || (comp.minwt[srccomp] > minwt))\n            {\n              comp.minwt[srccomp] = minwt;\n              comp.lvl[srccomp] = level;\n              comp.minedge[srccomp] = minedge;\n            }\n            complocks.release(srccomp);\n            done_ = true;\n          }\n        }\n      }\n    }\n    else\n    {\n      if (cs.isBoss(node) && degree)\n      {\n        (out_wl).push(node);\n      }\n    }\n  }\n}\n__global__ void union_components(CSRGraphTex graph, ComponentSpace cs, struct comp_data compdata, int level, AppendOnlyList el, AppendOnlyList ew, WorklistT in_wl, WorklistT out_wl, GlobalBarrier gb, HGAccumulator<int> ret_val)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  typedef cub::BlockReduce<int, TB_SIZE> _br;\n  __shared__ _br::TempStorage _ts;\n  ret_val.thread_entry();\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type wlnode_end;\n  wlnode_end = roundup((*((volatile index_type *) (in_wl).dindex)), (nthreads));\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    int node;\n    bool pop;\n    pop = (in_wl).pop_id(wlnode, node);\n    int r = 0;\n    int 
dstcomp = -1;\n    int srccomp = -1;\n    if (pop && compdata.lvl[node] == level)\n    {\n      srccomp = cs.find(node);\n      dstcomp = cs.find(graph.getAbsDestination(compdata.minedge[node]));\n    }\n    gb.Sync();\n    if (srccomp != dstcomp)\n    {\n      if (!cs.unify(srccomp, dstcomp))\n      {\n        r = 1;\n      }\n      else\n      {\n        el.push(compdata.minedge[node]);\n        ew.push(compdata.minwt[node]);\n      }\n    }\n    gb.Sync();\n    if (r)\n    {\n      ret_val.reduce(true);\n      continue;\n    }\n  }\n  ret_val.thread_exit<_br>(_ts);\n}\nvoid gg_main(CSRGraphTex& hg, CSRGraphTex& gg)\n{\n  dim3 blocks, threads;\n  kernel_sizing(gg, blocks, threads);\n  static GlobalBarrierLifetime union_components_barrier;\n  static bool union_components_barrier_inited;\n  struct comp_data comp;\n  PipeContextT<WorklistT> pipe;\n  ComponentSpace cs (hg.nnodes);\n  el = AppendOnlyList(hg.nedges);\n  AppendOnlyList ew (hg.nedges);\n  static const size_t union_components_residency = maximum_residency(union_components, __tb_union_components, 0);\n  static const size_t union_components_blocks = GG_MIN(blocks.x, ggc_get_nSM() * union_components_residency);\n  if(!union_components_barrier_inited) { union_components_barrier.Setup(union_components_blocks); union_components_barrier_inited = true;};\n  comp.weight.alloc(hg.nnodes);\n  comp.edge.alloc(hg.nnodes);\n  comp.node.alloc(hg.nnodes);\n  comp.level.alloc(hg.nnodes);\n  comp.dstcomp.alloc(hg.nnodes);\n  comp.lvl = comp.level.zero_gpu();\n  comp.minwt = comp.weight.zero_gpu();\n  comp.minedge = comp.edge.gpu_wr_ptr();\n  comp.minnode = comp.node.gpu_wr_ptr();\n  comp.mindstcomp = comp.dstcomp.gpu_wr_ptr();\n  LockArrayTicket complocks (hg.nnodes);\n  int level = 1;\n  int mw = 0;\n  int last_mw = 0;\n  pipe = PipeContextT<WorklistT>(hg.nnodes);\n  {\n    {\n      pipe.out_wl().will_write();\n      init_wl <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl());\n      pipe.in_wl().swap_slots();\n      
pipe.advance2();\n      while (pipe.in_wl().nitems())\n      {\n        bool loopc = false;\n        last_mw = mw;\n        pipe.out_wl().will_write();\n        find_comp_min_elem <<<blocks, threads>>>(gg, comp, complocks, cs, level, pipe.in_wl(), pipe.out_wl());\n        pipe.in_wl().swap_slots();\n        pipe.advance2();\n        do\n        {\n          Shared<int> retval = Shared<int>(1);\n          HGAccumulator<int> _rv;\n          *(retval.cpu_wr_ptr()) = 0;\n          _rv.rv = retval.gpu_wr_ptr();\n          pipe.out_wl().will_write();\n          union_components <<<union_components_blocks, __tb_union_components>>>(gg, cs, comp, level, el, ew, pipe.in_wl(), pipe.out_wl(), union_components_barrier, _rv);\n          loopc = *(retval.cpu_rd_ptr()) > 0;\n        }\n        while (loopc);\n        mw = el.nitems();\n        level++;\n        if (last_mw == mw)\n        {\n          break;\n        }\n      }\n    }\n  }\n  unsigned long int rweight = 0;\n  size_t nmstedges ;\n  nmstedges = ew.nitems();\n  mgpu::standard_context_t context;\n  mgpu::reduce(ew.list.gpu_rd_ptr(), nmstedges, &rweight, mgpu::plus_t<long unsigned int>(), context);\n  printf(\"number of iterations: %d\\n\", level);\n  printf(\"final mstwt: %llu\\n\", rweight);\n  printf(\"total edges: %llu, total components: %llu\\n\", nmstedges, cs.numberOfComponentsHost());\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/spanningtree/mst.cu",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=False $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ hacks=set([]) $ np_factor=1 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=False $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nAppendOnlyList el;\n#include \"mst.h\"\n#define INF UINT_MAX\nconst int DEBUG = 0;\nstatic const int __tb_union_components = 
TB_SIZE;\n__global__ void init_wl(CSRGraph graph, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type node_end;\n  // FP: \"1 -> 2;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    (out_wl).push(node);\n  }\n  // FP: \"4 -> 5;\n}\n__global__ void find_comp_min_elem(CSRGraph graph, struct comp_data comp, LockArrayTicket complocks, ComponentSpace cs, int level, AppendOnlyList bosses, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type wlnode_end;\n  // FP: \"1 -> 2;\n\n  // FP: \"2 -> 3;\n  wlnode_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    int node;\n    bool pop;\n    index_type edge_end;\n    pop = (in_wl).pop_id(wlnode, node);\n    unsigned minwt = INF;\n    unsigned minedge = INF;\n    int degree = graph.getOutDegree(node);\n    int mindstcomp  = 0;\n    int srccomp = cs.find(node);\n    bool isBoss = srccomp == node;\n    edge_end = (graph).getFirstEdge((node) + 1);\n    for (index_type edge = (graph).getFirstEdge(node) + 0; edge < edge_end; edge += 1)\n    {\n      int edgewt = graph.getAbsWeight(edge);\n      if (edgewt < minwt)\n      {\n        int dstcomp = cs.find(graph.getAbsDestination(edge));\n        if (dstcomp != srccomp)\n        {\n          minwt = edgewt;\n          minedge = edge;\n        }\n      }\n    }\n    if (isBoss && degree)\n    {\n      bosses.push(node);\n    }\n    if (minwt != INF)\n    {\n      (out_wl).push(node);\n      {\n        #if __CUDACC_VER_MAJOR__ >= 7\n        volatile bool done_ = false;\n        #else\n        bool done_ = false;\n        
#endif\n        int _ticket = (complocks).reserve(srccomp);\n        while (!done_)\n        {\n          if (complocks.acquire_or_fail(srccomp, _ticket))\n          {\n            if (comp.minwt[srccomp] == 0 || (comp.lvl[srccomp] < level) || (comp.minwt[srccomp] > minwt))\n            {\n              comp.minwt[srccomp] = minwt;\n              comp.lvl[srccomp] = level;\n              comp.minedge[srccomp] = minedge;\n            }\n            complocks.release(srccomp);\n            done_ = true;\n          }\n        }\n      }\n    }\n    else\n    {\n      if (isBoss && degree)\n      {\n        (out_wl).push(node);\n      }\n    }\n  }\n  // FP: \"30 -> 31;\n}\n__global__ void union_components(CSRGraph graph, ComponentSpace cs, struct comp_data compdata, int level, AppendOnlyList el, AppendOnlyList ew, AppendOnlyList b_in, AppendOnlyList b_out, Worklist2 in_wl, Worklist2 out_wl, GlobalBarrier gb, HGAccumulator<int> ret_val)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  typedef cub::BlockReduce<int, TB_SIZE> _br;\n  __shared__ _br::TempStorage _ts;\n  ret_val.thread_entry();\n \n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type wlnode_end;\n  index_type wlnode_rup;\n  // FP: \"1 -> 2;\n  wlnode_end = *((volatile index_type *) (b_in).dindex);\n  wlnode_rup = ((0) + roundup(((*((volatile index_type *) (b_in).dindex)) - (0)), (nthreads)));\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_rup; wlnode += nthreads)\n  {\n    int node;\n    bool pop;\n    pop = (b_in).pop_id(wlnode, node);\n    int r = 0;\n    int dstcomp = -1;\n    int srccomp = -1;\n    if (pop && compdata.lvl[node] == level)\n    {\n      srccomp = cs.find(node);\n      dstcomp = cs.find(graph.getAbsDestination(compdata.minedge[node]));\n    }\n    gb.Sync();\n    if (srccomp != dstcomp)\n    {\n      if (!cs.unify(srccomp, dstcomp))\n      {\n        b_out.push(node);\n        r = 1;\n      }\n      
else\n      {\n        el.push(compdata.minedge[node]);\n        ew.push(compdata.minwt[node]);\n      }\n    }\n    gb.Sync();\n    if (r)\n    {\n      ret_val.reduce(true);\n      continue;\n    }\n  }\n  ret_val.thread_exit<_br>(_ts);\n}\nvoid gg_main(CSRGraph& hg, CSRGraph& gg)\n{\n  dim3 blocks, threads;\n  kernel_sizing(gg, blocks, threads);\n  static GlobalBarrierLifetime union_components_barrier;\n  static bool union_components_barrier_inited;\n  struct comp_data comp;\n  PipeContextT<Worklist2> pipe;\n  // FP: \"1 -> 2;\n  ComponentSpace cs (hg.nnodes);\n  // FP: \"2 -> 3;\n  el = AppendOnlyList(hg.nedges);\n  // FP: \"3 -> 4;\n  AppendOnlyList ew (hg.nedges);\n  // FP: \"4 -> 5;\n  AppendOnlyList bosses[2] = {AppendOnlyList(hg.nnodes), AppendOnlyList(hg.nnodes)};\n  int cur_boss = 0;\n  // FP: \"5 -> 6;\n  static const size_t union_components_residency = maximum_residency(union_components, __tb_union_components, 0);\n  static const size_t union_components_blocks = GG_MIN(blocks.x, ggc_get_nSM() * union_components_residency);\n  if(!union_components_barrier_inited) { union_components_barrier.Setup(union_components_blocks); union_components_barrier_inited = true;};\n  // FP: \"6 -> 7;\n  // FP: \"7 -> 8;\n  comp.weight.alloc(hg.nnodes);\n  comp.edge.alloc(hg.nnodes);\n  comp.node.alloc(hg.nnodes);\n  comp.level.alloc(hg.nnodes);\n  comp.dstcomp.alloc(hg.nnodes);\n  comp.lvl = comp.level.zero_gpu();\n  comp.minwt = comp.weight.zero_gpu();\n  comp.minedge = comp.edge.gpu_wr_ptr();\n  comp.minnode = comp.node.gpu_wr_ptr();\n  comp.mindstcomp = comp.dstcomp.gpu_wr_ptr();\n  // FP: \"8 -> 9;\n  LockArrayTicket complocks (hg.nnodes);\n  // FP: \"9 -> 10;\n  int level = 1;\n  int mw = 0;\n  int last_mw = 0;\n  // FP: \"10 -> 11;\n  pipe = PipeContextT<Worklist2>(hg.nnodes);\n  {\n    {\n      pipe.out_wl().will_write();\n      init_wl <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl());\n      pipe.in_wl().swap_slots();\n      pipe.advance2();\n      // FP: 
\"12 -> 13;\n      while (pipe.in_wl().nitems())\n      {\n        bool loopc = false;\n        last_mw = mw;\n        pipe.out_wl().will_write();\n        find_comp_min_elem <<<blocks, threads>>>(gg, comp, complocks, cs, level, bosses[cur_boss], pipe.in_wl(), pipe.out_wl());\n        pipe.in_wl().swap_slots();\n        pipe.advance2();\n        do\n        {\n          Shared<int> retval = Shared<int>(1);\n          HGAccumulator<int> _rv;\n          *(retval.cpu_wr_ptr()) = 0;\n          _rv.rv = retval.gpu_wr_ptr();\n          pipe.out_wl().will_write();\n          union_components <<<union_components_blocks, __tb_union_components>>>(gg, cs, comp, level, el, ew, bosses[cur_boss], bosses[cur_boss ^ 1], pipe.in_wl(), pipe.out_wl(), union_components_barrier, _rv);\n          loopc = *(retval.cpu_rd_ptr()) > 0;\n          cur_boss ^= 1;\n          bosses[cur_boss].reset();\n        }\n        while (loopc);\n        mw = el.nitems();\n        level++;\n        if (last_mw == mw)\n        {\n          break;\n        }\n      }\n    }\n  }\n  pipe.free();\n  unsigned long int rweight = 0;\n  size_t nmstedges ;\n  nmstedges = ew.nitems();\n  printf(\"nmstedges = %d\\n\", nmstedges);\n  int *h_list = (int *)malloc(nmstedges*sizeof(int));\n  check_cuda(cudaMemcpy(h_list, ew.list.gpu_rd_ptr(), nmstedges * sizeof(int), cudaMemcpyDeviceToHost));\n  for (size_t i = 0; i < nmstedges; i ++) rweight += h_list[i];\n  printf(\"final mstwt: %llu\\n\", rweight);\n  printf(\"total edges: %llu, total components: %llu\\n\", nmstedges, cs.numberOfComponentsHost());\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/spanningtree/mst.h",
    "content": "#pragma once\n\n#include \"component.h\"\n#include \"moderngpu/kernel_reduce.hxx\"\n\nstruct comp_data {\n  Shared<int> weight;\n  Shared<int> edge;\n  Shared<int> node;\n  Shared<int> level;\n  Shared<int> dstcomp;\n\n  int* lvl;\n  int* minwt;\n  int* minedge; // absolute\n  int* minnode;\n  int* mindstcomp;\n};\n\nstatic void dump_comp_data(struct comp_data comp, int n, int lvl);\n\nstatic void dump_comp_data(struct comp_data comp, int n, int lvl) {\n  int *level, *minwt, *minedge, *minnode, *mindstcomp;\n\n  level      = comp.level.cpu_rd_ptr();\n  minwt      = comp.weight.cpu_rd_ptr();\n  minedge    = comp.edge.cpu_rd_ptr();\n  minnode    = comp.node.cpu_rd_ptr();\n  mindstcomp = comp.dstcomp.cpu_rd_ptr();\n\n  for (int i = 0; i < n; i++) {\n    if (level[i] == lvl) {\n      fprintf(stderr, \"%d: (%d) node %d edge %d weight %d dstcomp %d\\n\", i,\n              level[i], minnode[i], minedge[i], minwt[i], mindstcomp[i]);\n    }\n  }\n\n  comp.level.gpu_wr_ptr();\n  comp.weight.gpu_wr_ptr();\n  comp.edge.gpu_wr_ptr();\n  comp.node.gpu_wr_ptr();\n  comp.dstcomp.gpu_wr_ptr();\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/spanningtree/support.cu",
    "content": "#include \"gg.h\"\nconst char *prog_opts = \"\";\nconst char *prog_usage = \"\";\nconst char *prog_args_usage = \"\";\n\nextern AppendOnlyList el;\nint process_prog_arg(int argc, char *argv[], int arg_start) {\n   return 1;\n}\n\nvoid process_prog_opt(char c, char *optarg) { }\n\nvoid output(CSRGraphTy &g, const char *output_file) {\n  FILE *f;\n  if(!output_file) return;\n  if(strcmp(output_file, \"-\") == 0) f = stdout;\n  else f = fopen(output_file, \"w\");\n  el.sort();\n  int *e = el.list.cpu_rd_ptr();\n  int edges = el.nitems();\n  for(int i = 0; i < edges; i++)\n    check_fprintf(f, \"%d %d %d %d\\n\", i, e[i], g.getAbsDestination(e[i]), g.getAbsWeight(e[i]));\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/sssp/CMakeLists.txt",
    "content": "app_analy_gpu(sssp sssp)\nadd_test_gpu(sssp rmat15 rmat15.out sssp -s 0 -o rmat15.out ${BASEINPUT}/scalefree/rmat15.gr)\n"
  },
  {
    "path": "lonestar/analytics/gpu/sssp/README.md",
    "content": "Single-Source Shortest Paths\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nThis benchmark computes the shortest path from a source node to all nodes in a directed graph with non-negative edge weights by using a modified near-far algorithm [1].\n\n[1] https://people.csail.mit.edu/jshun/6886-s18/papers/DBGO14.pdf\n\nINPUT\n--------------------------------------------------------------------------------\n\nTake in Galois .gr graphs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/gpu/sssp; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run default algorithm, use the following:\n\n-`$ ./sssp-gpu -o <output-file> -l -s startNode <input-graph>`\n\n-`$ ./sssp-gpu -o outfile.txt -l -s 0 rmat15.gr`\n\nThe option -l enables thread block load balancer. Enable this option for power-law graphs to improve the performance. It is recommended to disable this option for high diameter graphs, such as road-networks.\n"
  },
  {
    "path": "lonestar/analytics/gpu/sssp/sssp.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n#include \"cub/cub.cuh\"\n#include \"cub/util_allocator.cuh\"\n#include \"thread_work.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=True $ backoff_blocking_factor=4 $ parcomb=True $ np_schedulers=set(['fg', 'wp']) $ cc_disable=set([]) $ tb_lb=True $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\nstruct ThreadWork t_work;\nextern int DELTA;\nextern int start_node;\nbool enable_lb = false;\ntypedef int edge_data_type;\ntypedef int node_data_type;\ntypedef int * gint_p;\nextern const node_data_type INF = INT_MAX;\nstatic const int __tb_gg_main_pipe_1_gpu_gb = 256;\nstatic const int __tb_sssp_kernel = TB_SIZE;\nstatic const int __tb_remove_dups = TB_SIZE;\n__global__ void kernel(CSRGraph graph, int src)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type node_end;\n  node_end = (graph).nnodes;\n  for (index_type node = 0 + tid; node < node_end; node += nthreads)\n  {\n    graph.node_data[node] = (node == src) ? 
0 : INF ;\n  }\n}\n__device__ void remove_dups_dev(int * marks, Worklist2 in_wl, Worklist2 out_wl, GlobalBarrier gb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type wlnode_end;\n  index_type wlnode2_end;\n  wlnode_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    int node;\n    bool pop;\n    pop = (in_wl).pop_id(wlnode, node);\n    marks[node] = wlnode;\n  }\n  gb.Sync();\n  wlnode2_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode2 = 0 + tid; wlnode2 < wlnode2_end; wlnode2 += nthreads)\n  {\n    int node;\n    bool pop;\n    pop = (in_wl).pop_id(wlnode2, node);\n    if (marks[node] == wlnode2)\n    {\n      index_type _start_26;\n      _start_26 = (out_wl).setup_push_warp_one();;\n      (out_wl).do_push(_start_26, 0, node);\n    }\n  }\n}\n__global__ void remove_dups(int * marks, Worklist2 in_wl, Worklist2 out_wl, GlobalBarrier gb)\n{\n  unsigned tid = TID_1D;\n\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  remove_dups_dev(marks, in_wl, out_wl, gb);\n}\n__global__ void sssp_kernel_dev_TB_LB(CSRGraph graph, int delta, int * thread_prefix_work_wl, unsigned int num_items, PipeContextT<Worklist2> thread_src_wl, Worklist2 in_wl, Worklist2 out_wl, Worklist2 re_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  __shared__ unsigned int total_work;\n  __shared__ unsigned block_start_src_index;\n  __shared__ unsigned block_end_src_index;\n  unsigned my_work;\n  unsigned node;\n  unsigned int offset;\n  unsigned int current_work;\n  unsigned blockdim_x = BLOCK_DIM_X;\n  total_work = thread_prefix_work_wl[num_items - 1];\n  my_work = ceilf((float)(total_work) / (float) nthreads);\n\n  __syncthreads();\n\n  if (my_work != 0)\n  {\n    current_work = tid;\n  }\n  for (unsigned i =0; i < my_work; i++)\n  {\n    unsigned int block_start_work;\n    unsigned int block_end_work;\n    if (threadIdx.x 
== 0)\n    {\n      if (current_work < total_work)\n      {\n        block_start_work = current_work;\n        block_end_work=current_work + blockdim_x - 1;\n        if (block_end_work >= total_work)\n        {\n          block_end_work = total_work - 1;\n        }\n        block_start_src_index = compute_src_and_offset(0, num_items - 1,  block_start_work+1, thread_prefix_work_wl, num_items,offset);\n        block_end_src_index = compute_src_and_offset(0, num_items - 1, block_end_work+1, thread_prefix_work_wl, num_items, offset);\n      }\n    }\n    __syncthreads();\n\n    if (current_work < total_work)\n    {\n      unsigned src_index;\n      index_type edge;\n      src_index = compute_src_and_offset(block_start_src_index, block_end_src_index, current_work+1, thread_prefix_work_wl,num_items, offset);\n      node= thread_src_wl.in_wl().dwl[src_index];\n      edge = (graph).getFirstEdge(node)+ offset;\n      {\n        index_type dst = graph.getAbsDestination(edge);\n        edge_data_type wt = graph.getAbsWeight(edge);\n        if (graph.node_data[dst] > graph.node_data[node] + wt)\n        {\n          atomicMin(graph.node_data + dst, graph.node_data[node] + wt);\n          if (graph.node_data[node] + wt <= delta)\n          {\n            index_type _start_67;\n            _start_67 = (re_wl).setup_push_warp_one();;\n            (re_wl).do_push(_start_67, 0, dst);\n          }\n          else\n          {\n            (out_wl).push(dst);\n          }\n        }\n      }\n      current_work = current_work + nthreads;\n    }\n  }\n}\n__global__ void Inspect_sssp_kernel_dev(CSRGraph graph, int delta, PipeContextT<Worklist2> thread_work_wl, PipeContextT<Worklist2> thread_src_wl, bool enable_lb, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  index_type wlnode_end;\n  wlnode_end = *((volatile index_type *) (in_wl).dindex);\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n 
   int node;\n    bool pop;\n    int index;\n    pop = (in_wl).pop_id(wlnode, node) && ((( node < (graph).nnodes ) && ( (graph).getOutDegree(node) >= DEGREE_LIMIT)) ? true: false);\n    if (pop && graph.node_data[node] == INF)\n    {\n      continue;\n    }\n    if (pop)\n    {\n      index = thread_work_wl.in_wl().push_range(1) ;\n      thread_src_wl.in_wl().push_range(1);\n      thread_work_wl.in_wl().dwl[index] = (graph).getOutDegree(node);\n      thread_src_wl.in_wl().dwl[index] = node;\n    }\n  }\n}\n__device__ void sssp_kernel_dev(CSRGraph graph, int delta, bool enable_lb, Worklist2 in_wl, Worklist2 out_wl, Worklist2 re_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_sssp_kernel;\n  index_type wlnode_end;\n  const int _NP_CROSSOVER_WP = 32;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct empty_np, struct warp_np<__kernel_tb_size/32>, struct fg_np<ITSIZE> > npsTy;\n\n  __shared__ npsTy nps ;\n  wlnode_end = roundup((*((volatile index_type *) (in_wl).dindex)), (blockDim.x));\n  for (index_type wlnode = 0 + tid; wlnode < wlnode_end; wlnode += nthreads)\n  {\n    int node;\n    bool pop;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    pop = (in_wl).pop_id(wlnode, node) && ((( node < (graph).nnodes ) && ( (graph).getOutDegree(node) < DEGREE_LIMIT)) ? true: false);\n    pop = pop && !(graph.node_data[node] == INF);\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    __shared__ struct { int node; } _np_closure [TB_SIZE];\n    _np_closure[threadIdx.x].node = node;\n    if (pop)\n    {\n      _np.size = (graph).getOutDegree(node);\n      _np.start = (graph).getFirstEdge(node);\n    }\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_WP ? 
_np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_WP ? _np.size : 0;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    if (threadIdx.x == 0)\n    {\n    }\n    __syncthreads();\n    {\n      const int warpid = threadIdx.x / 32;\n      const int _np_laneid = cub::LaneId();\n      while (__any_sync(0xffffffff, _np.size >= _NP_CROSSOVER_WP))\n      {\n        if (_np.size >= _NP_CROSSOVER_WP)\n        {\n          nps.warp.owner[warpid] = _np_laneid;\n        }\n        if (nps.warp.owner[warpid] == _np_laneid)\n        {\n          nps.warp.start[warpid] = _np.start;\n          nps.warp.size[warpid] = _np.size;\n          nps.warp.src[warpid] = threadIdx.x;\n          _np.start = 0;\n          _np.size = 0;\n        }\n        index_type _np_w_start = nps.warp.start[warpid];\n        index_type _np_w_size = nps.warp.size[warpid];\n        assert(nps.warp.src[warpid] < __kernel_tb_size);\n        node = _np_closure[nps.warp.src[warpid]].node;\n        for (int _np_ii = _np_laneid; _np_ii < _np_w_size; _np_ii += 32)\n        {\n          index_type edge;\n          edge = _np_w_start +_np_ii;\n          {\n            index_type dst = graph.getAbsDestination(edge);\n            edge_data_type wt = graph.getAbsWeight(edge);\n            if (graph.node_data[dst] > graph.node_data[node] + wt)\n            {\n              atomicMin(graph.node_data + dst, graph.node_data[node] + wt);\n              if (graph.node_data[node] + wt <= delta)\n              {\n                index_type _start_67;\n                _start_67 = (re_wl).setup_push_warp_one();;\n                (re_wl).do_push(_start_67, 0, dst);\n              }\n              else\n              {\n                (out_wl).push(dst);\n              }\n            }\n          }\n        }\n      }\n      __syncthreads();\n    }\n\n    __syncthreads();\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    while (_np.work())\n    {\n      int _np_i 
=0;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      __syncthreads();\n\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type edge;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        node = _np_closure[nps.fg.src[_np_i]].node;\n        edge= nps.fg.itvalue[_np_i];\n        {\n          index_type dst = graph.getAbsDestination(edge);\n          edge_data_type wt = graph.getAbsWeight(edge);\n          if (graph.node_data[dst] > graph.node_data[node] + wt)\n          {\n            atomicMin(graph.node_data + dst, graph.node_data[node] + wt);\n            if (graph.node_data[node] + wt <= delta)\n            {\n              index_type _start_67;\n              _start_67 = (re_wl).setup_push_warp_one();;\n              (re_wl).do_push(_start_67, 0, dst);\n            }\n            else\n            {\n              (out_wl).push(dst);\n            }\n          }\n        }\n      }\n      _np.execute_round_done(ITSIZE);\n      __syncthreads();\n    }\n    assert(threadIdx.x < __kernel_tb_size);\n    node = _np_closure[threadIdx.x].node;\n  }\n}\n__global__ void __launch_bounds__(TB_SIZE, 2) sssp_kernel(CSRGraph graph, int delta, bool enable_lb, Worklist2 in_wl, Worklist2 out_wl, Worklist2 re_wl)\n{\n  unsigned tid = TID_1D;\n\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  sssp_kernel_dev(graph, delta, enable_lb, in_wl, out_wl, re_wl);\n}\nvoid gg_main_pipe_1(CSRGraph& gg, gint_p glevel, int& curdelta, int& i, int DELTA, GlobalBarrier& remove_dups_barrier, int remove_dups_blocks, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)\n{\n  while (pipe.in_wl().nitems())\n  {\n    while (pipe.in_wl().nitems())\n    {\n      pipe.out_wl().will_write();\n      pipe.re_wl().will_write();\n      if (enable_lb)\n      {\n        t_work.reset_thread_work();\n        Inspect_sssp_kernel_dev <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, t_work.thread_work_wl, 
t_work.thread_src_wl, enable_lb, pipe.in_wl(), pipe.out_wl());\n        cudaDeviceSynchronize();\n        int num_items = t_work.thread_work_wl.in_wl().nitems();\n        if (num_items != 0)\n        {\n          t_work.compute_prefix_sum();\n          cudaDeviceSynchronize();\n          sssp_kernel_dev_TB_LB <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl, pipe.in_wl(), pipe.out_wl(), pipe.re_wl());\n          cudaDeviceSynchronize();\n        }\n      }\n      sssp_kernel <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, enable_lb, pipe.in_wl(), pipe.out_wl(), pipe.re_wl());\n      cudaDeviceSynchronize();\n      pipe.in_wl().swap_slots();\n      pipe.retry2();\n    }\n    pipe.advance2();\n    pipe.out_wl().will_write();\n    remove_dups <<<remove_dups_blocks, __tb_remove_dups>>>(glevel, pipe.in_wl(), pipe.out_wl(), remove_dups_barrier);\n    cudaDeviceSynchronize();\n    pipe.in_wl().swap_slots();\n    pipe.advance2();\n    i++;\n    curdelta += DELTA;\n  }\n}\n__global__ void __launch_bounds__(__tb_gg_main_pipe_1_gpu_gb) gg_main_pipe_1_gpu_gb(CSRGraph gg, gint_p glevel, int curdelta, int i, int DELTA, GlobalBarrier remove_dups_barrier, int remove_dups_blocks, PipeContextT<Worklist2> pipe, int* cl_curdelta, int* cl_i, bool enable_lb, GlobalBarrier gb)\n{\n  unsigned tid = TID_1D;\n\n  curdelta = *cl_curdelta;\n  i = *cl_i;\n  while (pipe.in_wl().nitems())\n  {\n    while (pipe.in_wl().nitems())\n    {\n      if (tid == 0)\n        pipe.in_wl().reset_next_slot();\n      sssp_kernel_dev (gg, curdelta, enable_lb, pipe.in_wl(), pipe.out_wl(), pipe.re_wl());\n      pipe.in_wl().swap_slots();\n      gb.Sync();\n      pipe.retry2();\n    }\n    gb.Sync();\n    pipe.advance2();\n    if (tid == 0)\n      pipe.in_wl().reset_next_slot();\n    remove_dups_dev (glevel, pipe.in_wl(), pipe.out_wl(), gb);\n    pipe.in_wl().swap_slots();\n    gb.Sync();\n    pipe.advance2();\n    i++;\n    curdelta += DELTA;\n  
}\n  gb.Sync();\n  if (tid == 0)\n  {\n    *cl_curdelta = curdelta;\n    *cl_i = i;\n  }\n}\nvoid gg_main_pipe_1_wrapper(CSRGraph& gg, gint_p glevel, int& curdelta, int& i, int DELTA, GlobalBarrier& remove_dups_barrier, int remove_dups_blocks, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)\n{\n  static GlobalBarrierLifetime gg_main_pipe_1_gpu_gb_barrier;\n  static bool gg_main_pipe_1_gpu_gb_barrier_inited;\n  extern bool enable_lb;\n  static const size_t gg_main_pipe_1_gpu_gb_residency = maximum_residency(gg_main_pipe_1_gpu_gb, __tb_gg_main_pipe_1_gpu_gb, 0);\n  static const size_t gg_main_pipe_1_gpu_gb_blocks = GG_MIN(blocks.x, ggc_get_nSM() * gg_main_pipe_1_gpu_gb_residency);\n  if(!gg_main_pipe_1_gpu_gb_barrier_inited) { gg_main_pipe_1_gpu_gb_barrier.Setup(gg_main_pipe_1_gpu_gb_blocks); gg_main_pipe_1_gpu_gb_barrier_inited = true;};\n  if (enable_lb)\n  {\n    gg_main_pipe_1(gg,glevel,curdelta,i,DELTA,remove_dups_barrier,remove_dups_blocks,pipe,blocks,threads);\n  }\n  else\n  {\n    int* cl_curdelta;\n    int* cl_i;\n    check_cuda(cudaMalloc(&cl_curdelta, sizeof(int) * 1));\n    check_cuda(cudaMalloc(&cl_i, sizeof(int) * 1));\n    check_cuda(cudaMemcpy(cl_curdelta, &curdelta, sizeof(int) * 1, cudaMemcpyHostToDevice));\n    check_cuda(cudaMemcpy(cl_i, &i, sizeof(int) * 1, cudaMemcpyHostToDevice));\n\n    gg_main_pipe_1_gpu_gb<<<gg_main_pipe_1_gpu_gb_blocks, __tb_gg_main_pipe_1_gpu_gb>>>(gg,glevel,curdelta,i,DELTA,remove_dups_barrier,remove_dups_blocks,pipe,cl_curdelta,cl_i, enable_lb, gg_main_pipe_1_gpu_gb_barrier);\n    check_cuda(cudaMemcpy(&curdelta, cl_curdelta, sizeof(int) * 1, cudaMemcpyDeviceToHost));\n    check_cuda(cudaMemcpy(&i, cl_i, sizeof(int) * 1, cudaMemcpyDeviceToHost));\n    check_cuda(cudaFree(cl_curdelta));\n    check_cuda(cudaFree(cl_i));\n  }\n}\nvoid gg_main(CSRGraph& hg, CSRGraph& gg)\n{\n  dim3 blocks, threads;\n  kernel_sizing(gg, blocks, threads);\n  t_work.init_thread_work(gg.nnodes);\n  static GlobalBarrierLifetime 
remove_dups_barrier;\n  static bool remove_dups_barrier_inited;\n  gint_p glevel;\n  PipeContextT<Worklist2> pipe;\n  Shared<int> level (hg.nnodes);\n  level.cpu_wr_ptr();\n  static const size_t remove_dups_residency = maximum_residency(remove_dups, __tb_remove_dups, 0);\n  static const size_t remove_dups_blocks = GG_MIN(blocks.x, ggc_get_nSM() * remove_dups_residency);\n  if(!remove_dups_barrier_inited) { remove_dups_barrier.Setup(remove_dups_blocks); remove_dups_barrier_inited = true;};\n  kernel <<<blocks, threads>>>(gg, start_node);\n  cudaDeviceSynchronize();\n  int i = 0;\n  int curdelta = 0;\n  printf(\"delta: %d\\n\", DELTA);\n  glevel = level.gpu_wr_ptr();\n  pipe = PipeContextT<Worklist2>(gg.nedges*2);\n  pipe.in_wl().wl[0] = start_node;\n  pipe.in_wl().update_gpu(1);\n  gg_main_pipe_1_wrapper(gg,glevel,curdelta,i,DELTA,remove_dups_barrier,remove_dups_blocks,pipe,blocks,threads);\n  printf(\"iterations: %d\\n\", i);\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/sssp/support.cu",
    "content": "/* -*- mode: C++ -*- */\n\n#include \"gg.h\"\n#include <cassert>\n\nconst char *prog_opts = \"ls:d:\";\nconst char *prog_usage = \"[-l] [-d delta] [-s startNode]\";\nconst char *prog_args_usage = \"-l: enable thread block load balancer (by default false)\";\n\nint DELTA = 10000;\nextern const int INF;\nint start_node = 0;\nextern bool enable_lb;\n\nint process_prog_arg(int argc, char *argv[], int arg_start) {\n   return 1;\n}\n\nvoid process_prog_opt(char c, char *optarg) {\n  if(c == 'd') {\n    DELTA = atoi(optarg);\n    assert(DELTA > 0);\n  }\n  if(c == 'l') { \n\tenable_lb = true;\n  }\n  if(c == 's') {\n     start_node = atoi(optarg);\n     assert(start_node >= 0);\n  }\n}\n\n\nvoid output(CSRGraphTy &g, const char *output_file) {\n  FILE *f;\n\n  if(!output_file)\n    return;\n\n  if(strcmp(output_file, \"-\") == 0)\n    f = stdout;\n  else\n    f = fopen(output_file, \"w\");\n    \n  const uint32_t infinity = std::numeric_limits<uint32_t>::max() / 4;    \n  for(int i = 0; i < g.nnodes; i++) {\n    if(g.node_data[i] == INF) {\n      //formatting the output to be compatible with the distributed bfs ouput \n      check_fprintf(f, \"%d %d\\n\", i, infinity);\n    } else {\n      check_fprintf(f, \"%d %d\\n\", i, g.node_data[i]);\n    }\n  }\n\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/triangle-counting/CMakeLists.txt",
    "content": "app_analy_gpu(tc triangle-counting)\nadd_test_gpu(triangle-counting rmat15 rmat15.out tc ${BASEINPUT}/scalefree/symmetric/rmat15.csgr)\n"
  },
  {
    "path": "lonestar/analytics/gpu/triangle-counting/README.md",
    "content": "Triangle Counting\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\n\nThis benchmark counts the number of triangles in a given undirected graph. It implements the approach from Polak [1] in IrGL[2].\n\n[1] Adam Polak. Counting triangles in large graphs on GPU. In IPDPS Workshops 2016, pages 740\u2013746, 2016\n[2] https://users.ices.utexas.edu/~sreepai/sree-oopsla2016.pdf\n\nINPUT\n--------------------------------------------------------------------------------\n\nInput graphs are in Galois .csgr format, i.e., symmetric, have no self-loops, and have no duplicated edges.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/analytics/gpu/triangle-counting; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run the default algorithm, use the following:\n\n-`$ ./triangle-counting-gpu  <csgr-input-graph>`\n\n-`$ ./triangle-counting-gpu road-USA.csgr`\n"
  },
  {
    "path": "lonestar/analytics/gpu/triangle-counting/support.cu",
    "content": "/* -*- mode: C++ -*- */\n\n#include \"gg.h\"\n\nconst char *prog_opts = \"\";\nconst char *prog_usage = \"\";\nconst char *prog_args_usage = \"\";\n\nint process_prog_arg(int argc, char *argv[], int arg_start) {\n   return 1;\n}\n\nvoid process_prog_opt(char c, char *optarg) {\n  ;\n}\n\nvoid debug_output(CSRGraphTy &g, unsigned int *valid_edges) {\n  for(int i = 0; i < g.nnodes; i++) {    \n    int start = g.row_start[i];\n    for(int j = 0; j < valid_edges[i]; j++) {\n      printf(\"%d %d\\n\", i, g.edge_dst[start + j]);\n    }    \n  }\n}\n\nvoid output(CSRGraphTy &g, const char *output_file) {\n  FILE *f;\n\n  if(!output_file)\n    return;\n\n  if(strcmp(output_file, \"-\") == 0)\n    f = stdout;\n  else\n    f = fopen(output_file, \"w\");\n    \n\n  for(int i = 0; i < g.nedges; i++)\n    check_fprintf(f, \"%d %d\\n\", i, g.edge_dst[i]);\n}\n"
  },
  {
    "path": "lonestar/analytics/gpu/triangle-counting/tc.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=False $ np_schedulers=set(['tb', 'fg']) $ cc_disable=set([]) $ hacks=set([]) $ np_factor=8 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=True $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=texture $ cuda.use_worklist_slots=True $ cuda.worklist_type=texture\";\nvoid debug_output(CSRGraphTy &g, unsigned int *valid_edges);;\nstatic const int __tb_preprocess = TB_SIZE;\nstatic const int __tb_count_triangles = TB_SIZE;\n__global__ void preprocess(CSRGraph graph, unsigned int * valid_edges)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_preprocess;\n  index_type node_rup;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  const int BLKSIZE = __kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct empty_np, struct fg_np<ITSIZE> > npsTy;\n\n  __shared__ npsTy nps ;\n  node_rup = ((0) + roundup((((graph).nnodes) - (0)), (blockDim.x)));\n  for (index_type node = 0 + tid; node < node_rup; node += nthreads)\n  {\n    bool pop;\n    int degree;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    pop = graph.valid_node(node);;\n    if (pop)\n    {\n      degree = graph.getOutDegree(node);\n    }\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    __shared__ struct { index_type node; int degree; } _np_closure [TB_SIZE];\n    _np_closure[threadIdx.x].node = node;\n    _np_closure[threadIdx.x].degree = degree;\n    if (pop)\n    {\n      
_np.size = (graph).getOutDegree(node);\n      _np.start = (graph).getFirstEdge(node);\n    }\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_TB ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_TB ? _np.size : 0;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    __syncthreads();\n    while (true)\n    {\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      __syncthreads();\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        __syncthreads();\n        break;\n      }\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      __syncthreads();\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      assert(nps.tb.src < __kernel_tb_size);\n      node = _np_closure[nps.tb.src].node;\n      degree = _np_closure[nps.tb.src].degree;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type edge;\n        edge = ns +_np_j;\n        {\n          index_type dst = graph.getAbsDestination(edge);\n          int dst_degree = graph.getOutDegree(dst);\n          if ((dst_degree > degree) || (dst_degree == degree && dst > node))\n          {\n            graph.edge_data[edge] = dst;\n            atomicAdd(valid_edges + node, 1);\n          }\n          else\n          {\n            graph.edge_data[edge] = graph.nnodes;\n          }\n        }\n      }\n      __syncthreads();\n    }\n\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    while (_np.work())\n    {\n      int _np_i =0;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      __syncthreads();\n\n      
for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type edge;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        node = _np_closure[nps.fg.src[_np_i]].node;\n        degree = _np_closure[nps.fg.src[_np_i]].degree;\n        edge= nps.fg.itvalue[_np_i];\n        {\n          index_type dst = graph.getAbsDestination(edge);\n          int dst_degree = graph.getOutDegree(dst);\n          if ((dst_degree > degree) || (dst_degree == degree && dst > node))\n          {\n            graph.edge_data[edge] = dst;\n            atomicAdd(valid_edges + node, 1);\n          }\n          else\n          {\n            graph.edge_data[edge] = graph.nnodes;\n          }\n        }\n      }\n      _np.execute_round_done(ITSIZE);\n      __syncthreads();\n    }\n    assert(threadIdx.x < __kernel_tb_size);\n    node = _np_closure[threadIdx.x].node;\n    degree = _np_closure[threadIdx.x].degree;\n  }\n}\n__device__ unsigned int intersect(CSRGraph graph, index_type u, index_type v, unsigned int * valid_edges)\n{\n  index_type u_start = graph.getFirstEdge(u);\n  index_type u_end = u_start + valid_edges[u];\n  index_type v_start = graph.getFirstEdge(v);\n  index_type v_end = v_start + valid_edges[v];\n  int count = 0;\n  index_type u_it = u_start;\n  index_type v_it = v_start;\n  index_type a ;\n  index_type b ;\n  while (u_it < u_end && v_it < v_end)\n  {\n    a = graph.getAbsDestination(u_it);\n    b = graph.getAbsDestination(v_it);\n    int d = a - b;\n    if (d <= 0)\n    {\n      u_it++;\n    }\n    if (d >= 0)\n    {\n      v_it++;\n    }\n    if (d == 0)\n    {\n      count++;\n    }\n  }\n  return count;\n}\n__global__ void count_triangles(CSRGraph graph, unsigned int * valid_edges, int * count)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = __tb_count_triangles;\n  index_type v_rup;\n  const int _NP_CROSSOVER_TB = __kernel_tb_size;\n  const int BLKSIZE = 
__kernel_tb_size;\n  const int ITSIZE = BLKSIZE * 8;\n\n  typedef cub::BlockScan<multiple_sum<2, index_type>, BLKSIZE> BlockScan;\n  typedef union np_shared<BlockScan::TempStorage, index_type, struct tb_np, struct empty_np, struct fg_np<ITSIZE> > npsTy;\n\n  __shared__ npsTy nps ;\n  v_rup = ((0) + roundup((((graph).nnodes) - (0)), (blockDim.x)));\n  for (index_type v = 0 + tid; v < v_rup; v += nthreads)\n  {\n    bool pop;\n    multiple_sum<2, index_type> _np_mps;\n    multiple_sum<2, index_type> _np_mps_total;\n    pop = graph.valid_node(v);;\n    struct NPInspector1 _np = {0,0,0,0,0,0};\n    __shared__ struct { index_type v; } _np_closure [TB_SIZE];\n    _np_closure[threadIdx.x].v = v;\n    if (pop)\n    {\n      _np.size = ((graph).getFirstEdge(v)+ valid_edges[v]) - ((graph).getFirstEdge(v));\n      _np.start = (graph).getFirstEdge(v);\n    }\n    _np_mps.el[0] = _np.size >= _NP_CROSSOVER_TB ? _np.size : 0;\n    _np_mps.el[1] = _np.size < _NP_CROSSOVER_TB ? _np.size : 0;\n    BlockScan(nps.temp_storage).ExclusiveSum(_np_mps, _np_mps, _np_mps_total);\n    if (threadIdx.x == 0)\n    {\n      nps.tb.owner = MAX_TB_SIZE + 1;\n    }\n    __syncthreads();\n    while (true)\n    {\n      if (_np.size >= _NP_CROSSOVER_TB)\n      {\n        nps.tb.owner = threadIdx.x;\n      }\n      __syncthreads();\n      if (nps.tb.owner == MAX_TB_SIZE + 1)\n      {\n        __syncthreads();\n        break;\n      }\n      if (nps.tb.owner == threadIdx.x)\n      {\n        nps.tb.start = _np.start;\n        nps.tb.size = _np.size;\n        nps.tb.src = threadIdx.x;\n        _np.start = 0;\n        _np.size = 0;\n      }\n      __syncthreads();\n      int ns = nps.tb.start;\n      int ne = nps.tb.size;\n      if (nps.tb.src == threadIdx.x)\n      {\n        nps.tb.owner = MAX_TB_SIZE + 1;\n      }\n      assert(nps.tb.src < __kernel_tb_size);\n      v = _np_closure[nps.tb.src].v;\n      for (int _np_j = threadIdx.x; _np_j < ne; _np_j += BLKSIZE)\n      {\n        index_type edge;\n    
    edge = ns +_np_j;\n        {\n          index_type u = graph.getAbsDestination(edge);\n          index_type d_u = graph.getOutDegree(u);\n          int xcount = 0;\n          xcount = intersect(graph, u, v, valid_edges);\n          if (xcount)\n          {\n            atomicAdd(count, xcount);\n          }\n        }\n      }\n      __syncthreads();\n    }\n\n    _np.total = _np_mps_total.el[1];\n    _np.offset = _np_mps.el[1];\n    while (_np.work())\n    {\n      int _np_i =0;\n      _np.inspect2(nps.fg.itvalue, nps.fg.src, ITSIZE, threadIdx.x);\n      __syncthreads();\n\n      for (_np_i = threadIdx.x; _np_i < ITSIZE && _np.valid(_np_i); _np_i += BLKSIZE)\n      {\n        index_type edge;\n        assert(nps.fg.src[_np_i] < __kernel_tb_size);\n        v = _np_closure[nps.fg.src[_np_i]].v;\n        edge= nps.fg.itvalue[_np_i];\n        {\n          index_type u = graph.getAbsDestination(edge);\n          index_type d_u = graph.getOutDegree(u);\n          int xcount = 0;\n          xcount = intersect(graph, u, v, valid_edges);\n          if (xcount)\n          {\n            atomicAdd(count, xcount);\n          }\n        }\n      }\n      _np.execute_round_done(ITSIZE);\n      __syncthreads();\n    }\n    assert(threadIdx.x < __kernel_tb_size);\n    v = _np_closure[threadIdx.x].v;\n  }\n}\nvoid gg_main(CSRGraphTy& hg, CSRGraphTy& gg)\n{\n  dim3 blocks, threads;\n  kernel_sizing(gg, blocks, threads);\n  Shared<int> count (1);\n  Shared<unsigned int> valid_edges (hg.nnodes);\n  count.zero_gpu();\n  valid_edges.zero_gpu();\n  preprocess <<<blocks, __tb_preprocess>>>(gg, valid_edges.gpu_wr_ptr());\n  void     *d_temp_storage = NULL;\n  size_t   temp_storage_bytes = 0;\n  cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, gg.edge_data , gg.edge_data, gg.edge_dst, gg.edge_dst, \n                                               gg.nedges, gg.nnodes - 1,  gg.row_start, gg.row_start + 1);\n  // Allocate temporary storage\n  
cudaMalloc(&d_temp_storage, temp_storage_bytes);\n  // Run sorting operation\n  cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, gg.edge_data , gg.edge_data, gg.edge_dst, gg.edge_dst, \n                                               gg.nedges, gg.nnodes - 1,  gg.row_start, gg.row_start + 1);\n  count_triangles <<<blocks, __tb_count_triangles>>>(gg, valid_edges.gpu_rd_ptr(), count.gpu_wr_ptr());\n  printf(\"triangles: %d\\n\", *count.cpu_rd_ptr());\n}\n"
  },
  {
    "path": "lonestar/eda/CMakeLists.txt",
    "content": "add_subdirectory(cpu)\n"
  },
  {
    "path": "lonestar/eda/cpu/CMakeLists.txt",
    "content": "add_subdirectory(aig-rewriting)\nadd_subdirectory(sproute)\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/CMakeLists.txt",
    "content": "file(GLOB Sources \n  subjectgraph/aig/*.cpp\n  algorithms/*.cpp\n  parsers/*.cpp\n  writers/*.cpp\n  misc/util/*.cpp\n  functional/*.cpp\n  xxHash/xxhash.c\n)\n\nadd_executable(aig-rewriting-cpu main.cpp ${Sources})\nadd_dependencies(apps aig-rewriting-cpu)\ntarget_link_libraries(aig-rewriting-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS aig-rewriting-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\ntarget_include_directories(aig-rewriting-cpu PRIVATE\n  \"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/subjectgraph/aig>\"\n  \"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/algorithms>\"\n  \"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/parsers>\"\n  \"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/writers>\"\n  \"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/misc>\"\n  \"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/misc/util>\"\n  \"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/functional>\"\n  \"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/xxHash>\"\n)\n\nadd_test_scale(small1 aig-rewriting-cpu -AIG \"${BASEINPUT}/eda/logic-synthesis/EPFL/arithmetic/adder/aiger/adder.aig\" -v)\nadd_test_scale(small2 aig-rewriting-cpu -AIG \"${BASEINPUT}/eda/logic-synthesis/EPFL/random_control/voter/aiger/voter.aig\" -v)\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/README.md",
    "content": "Aig-Rewriting\n================================================================================\n\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program rewrites a given AIG in order to reduce the number of AIG nodes\nwhile preserving the functional equivalence of the represented circuit. For \ndetails, please refer to the following paper:\n\nVinicius Possani, Yi-Shan Lu, Alan Mishchenko, Keshav Pingali, Renato Ribas, \nAndré Reis. Unlocking Fine-Grain Parallelism for AIG Rewriting. In ICCAD 2018.\n\n\nINPUT\n--------------------------------------------------------------------------------\n\nThe program expects an AIG graph.\n\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/eda/cpu/aig-rewriting; make -j aig-rewriting-cpu`\n\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n-`$ ./aig-rewriting-cpu <path-AIG> -t 14`\n-`$ ./aig-rewriting-cpu <path-AIG> -t 28 -v`\n\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\n- Performance is sensitive to CHUNK_SIZE for the worklist, whose optimal value is input- and\n  machine-dependent.\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/ChoiceManager.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel AIG Choice Insertion December 6, 2018.\n\n*/\n\n#include \"ChoiceManager.h\"\n#include \"galois/worklists/Chunk.h\"\n\n#include <stdlib.h>\n#include <string.h>\n#include <assert.h>\n#include <chrono>\n\nusing namespace std::chrono;\n\nnamespace algorithm {\n\nChoiceManager::ChoiceManager(aig::Aig& aig, CutManager& cutMan,\n                             NPNManager& npnMan, PreCompGraphManager& pcgMan,\n                             int nGraphs, int nChoices)\n    : aig(aig), cutMan(cutMan), npnMan(npnMan), pcgMan(pcgMan),\n      perThreadDataCH(), nGraphs(nGraphs), nChoices(nChoices) {\n\n  nFuncs = (1 << 16);\n}\n\nChoiceManager::~ChoiceManager() {}\n\nvoid ChoiceManager::createNodeChoices(ThreadLocalDataCH* thData,\n                                      aig::GNode node) {\n\n  
aig::Graph& aigGraph    = this->aig.getGraph();\n  aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::WRITE);\n\n  // Get the node's cuts\n  this->cutMan.computeCutsRecursively(node);\n  Cut* cutsBegin = this->cutMan.getNodeCuts()[nodeData.id];\n  assert(cutsBegin != nullptr);\n\n  char* perm;\n  unsigned phase;\n  unsigned truth;\n  bool isOutputCompl = false;\n  //  int requiredLevel  = 0;\n  // \tint nSubgraphs;\n  int addedChoices = 0;\n  int i;\n  Cut* cut               = nullptr;\n  DecGraph* currentGraph = nullptr;\n  ForestNode* forestNode = nullptr;\n\n  /*\n// Go through the cuts to lock the fanin conee\nfor (cut = cutsBegin; cut != nullptr; cut = cut->nextCut) {\n  // Consider only 4-input cuts\n  if (cut->nLeaves != 4) {\n    continue;\n  }\n  lockFaninCone(aigGraph, node, cut);\n}\n  */\n\n  // Go through the cuts to rewrite\n  for (cut = cutsBegin; cut != nullptr; cut = cut->nextCut) {\n\n    if (addedChoices >= this->nChoices) {\n      break;\n    }\n\n    // Consider only 4-input cuts\n    if (cut->nLeaves != 4) {\n      continue;\n    }\n\n    // Get the fanin permutation\n    truth = 0xFFFF & (*(this->cutMan.readTruth(cut)));\n    perm  = this->npnMan.getPerms4()[(int)this->npnMan.getPerms()[truth]];\n    phase = this->npnMan.getPhases()[truth];\n    isOutputCompl = ((phase & (1 << 4)) > 0);\n\n    // Collect fanins with the corresponding permutation/phase\n    for (i = 0; i < cut->nLeaves; i++) {\n      aig::GNode faninNode = this->aig.getNodes()[cut->leaves[(int)perm[i]]];\n      if (faninNode == nullptr) {\n        break;\n      }\n      thData->currentFanins[i]    = faninNode;\n      thData->currentFaninsPol[i] = !((phase & (1 << i)) > 0);\n    }\n\n    if (i != cut->nLeaves) {\n      continue;\n    }\n\n    // find the matching class of subgraphs\n    std::vector<ForestNode*>& subgraphs =\n        this->pcgMan.getClasses()[this->npnMan.getMap()[truth]];\n\n    // Pruning\n    int nSubgraphs = subgraphs.size();\n    if 
(nSubgraphs > this->nGraphs) {\n      nSubgraphs = this->nGraphs;\n    }\n\n    // determine the best subgrap\n    for (i = 0; i < nSubgraphs; i++) {\n\n      forestNode   = subgraphs[i];\n      currentGraph = (DecGraph*)forestNode->pNext; // get the current graph\n\n      bool isComplemented = isOutputCompl\n                                ? (bool)currentGraph->getRootEdge().fCompl ^ 1\n                                : (bool)currentGraph->getRootEdge().fCompl;\n\n      if (isComplemented) {\n        continue;\n      }\n\n      // Preparing structure/AIG tracking for updating the AIG\n      for (int j = 0; j < 20; j++) {\n        if (j < 4) {\n          thData->decNodeFunc[j] =\n              thData->currentFanins[j]; // Link cut leaves to the decomposition\n                                        // graph\n        } else {\n          thData->decNodeFunc[j] =\n              nullptr; // Clear the link table, after leaves\n        }\n      }\n\n      bool wasAdded =\n          updateAig(thData, node, nodeData, currentGraph, isOutputCompl);\n\n      if (wasAdded) {\n        addedChoices++;\n      }\n    }\n  }\n}\n\nbool ChoiceManager::updateAig(ThreadLocalDataCH* thData, aig::GNode rootNode,\n                              aig::NodeData& rootData, DecGraph* decGraph,\n                              bool isOutputCompl) {\n\n  aig::GNode choiceNode;\n  //  aig::GNode auxNode;\n  aig::Graph& aigGraph = this->aig.getGraph();\n\n  bool isDecGraphComplemented = isOutputCompl\n                                    ? (bool)decGraph->getRootEdge().fCompl ^ 1\n                                    : (bool)decGraph->getRootEdge().fCompl;\n\n  // check for constant function\n  if (decGraph->isConst()) {\n    choiceNode = this->aig.getConstZero();\n  } else {\n    // check for a literal\n    if (decGraph->isVar()) {\n      DecNode* decNode = decGraph->getVar();\n      isDecGraphComplemented =\n          isDecGraphComplemented\n              ? 
(!thData->currentFaninsPol[decNode->id]) ^ true\n              : !thData->currentFaninsPol[decNode->id];\n      choiceNode = thData->decNodeFunc[decNode->id];\n    } else {\n      bool isFeasible = decGraphToAigTry(thData, decGraph);\n      if (isFeasible) {\n        choiceNode = decGraphToAigCreate(thData, decGraph);\n      } else {\n        return false;\n      }\n    }\n  }\n\n  if (rootNode == choiceNode) {\n    return false;\n  }\n\n  aig::NodeData& choiceNodeData =\n      aigGraph.getData(choiceNode, galois::MethodFlag::WRITE);\n  choiceNodeData.choiceList = nullptr;\n  // choiceNodeData.isCompl = isDecGraphComplemented;\n\n  aig::GNode currChoice = rootData.choiceList;\n\n  while (currChoice != nullptr) {\n    if (choiceNode == currChoice) {\n      return false;\n    }\n\n    aig::NodeData& currChoiceData =\n        aigGraph.getData(currChoice, galois::MethodFlag::WRITE);\n\n    if (currChoiceData.choiceList == nullptr) {\n      currChoiceData.choiceList = choiceNode;\n      return true;\n    } else {\n      currChoice = currChoiceData.choiceList;\n    }\n  }\n\n  rootData.choiceList = choiceNode;\n\n  // std::cout << \"Node \" << choiceNodeData.id << \" was added as choice to node\n  // \" << rootData.id << std::endl;\n\n  return true;\n}\n\n/* Transforms the decomposition graph into the AIG. Before calling this\n * procedure, AIG nodes for the fanins (cut's leaves) should be assigned to\n * thData->decNodeFun[ decNode.id ]. 
*/\nbool ChoiceManager::decGraphToAigTry(ThreadLocalDataCH* thData,\n                                     DecGraph* decGraph) {\n\n  DecNode* decNode = nullptr;\n  DecNode* lhsNode;\n  DecNode* rhsNode;\n  aig::GNode curAnd;\n  aig::GNode lhsAnd;\n  aig::GNode rhsAnd;\n  bool lhsAndPol;\n  bool rhsAndPol;\n  aig::Graph& aigGraph = this->aig.getGraph();\n\n  // build the AIG nodes corresponding to the AND gates of the graph\n  for (int i = decGraph->getLeaveNum();\n       (i < decGraph->getNodeNum()) && ((decNode = decGraph->getNode(i)), 1);\n       i++) {\n\n    // get the children of this node\n    lhsNode = decGraph->getNode(decNode->eEdge0.Node);\n    rhsNode = decGraph->getNode(decNode->eEdge1.Node);\n\n    // get the AIG nodes corresponding to the children\n    lhsAnd = thData->decNodeFunc[lhsNode->id];\n    rhsAnd = thData->decNodeFunc[rhsNode->id];\n\n    if (lhsAnd && rhsAnd) {\n      if (lhsNode->id < 4) { // If lhs is a cut leaf\n        lhsAndPol = decNode->eEdge0.fCompl\n                        ? !(thData->currentFaninsPol[lhsNode->id])\n                        : thData->currentFaninsPol[lhsNode->id];\n      } else {\n        lhsAndPol = decNode->eEdge0.fCompl ? false : true;\n      }\n\n      if (rhsNode->id < 4) { // If rhs is a cut leaf\n        rhsAndPol = decNode->eEdge1.fCompl\n                        ? !(thData->currentFaninsPol[rhsNode->id])\n                        : thData->currentFaninsPol[rhsNode->id];\n      } else {\n        rhsAndPol = decNode->eEdge1.fCompl ? 
false : true;\n      }\n\n      curAnd =\n          this->aig.lookupNodeInFanoutMap(lhsAnd, rhsAnd, lhsAndPol, rhsAndPol);\n\n      if (curAnd) {\n        aig::NodeData& curAndData =\n            aigGraph.getData(curAnd, galois::MethodFlag::READ);\n        if (curAndData.nFanout == 0) {\n          return false;\n        }\n      }\n    } else {\n      curAnd = nullptr;\n    }\n\n    thData->decNodeFunc[decNode->id] = curAnd;\n  }\n\n  aig::GNode choiceRoot = thData->decNodeFunc[decNode->id];\n\n  if (choiceRoot != nullptr) {\n    aig::NodeData& choiceRootData =\n        aigGraph.getData(choiceRoot, galois::MethodFlag::READ);\n    if (choiceRootData.nFanout > 0) {\n      return false;\n    }\n  }\n\n  return true;\n}\n\n/* Transforms the decomposition graph into the AIG. Before calling this\n * procedure, AIG nodes for the fanins (cut's leaves) should be assigned to\n * thData->decNodeFun[ decNode.id ]. */\naig::GNode ChoiceManager::decGraphToAigCreate(ThreadLocalDataCH* thData,\n                                              DecGraph* decGraph) {\n\n  DecNode* decNode = nullptr;\n  DecNode* lhsNode;\n  DecNode* rhsNode;\n  aig::GNode curAnd;\n  aig::GNode lhsAnd;\n  aig::GNode rhsAnd;\n  bool lhsAndPol;\n  bool rhsAndPol;\n  aig::Graph& aigGraph = this->aig.getGraph();\n\n  // build the AIG nodes corresponding to the AND gates of the graph\n  for (int i = decGraph->getLeaveNum();\n       (i < decGraph->getNodeNum()) && ((decNode = decGraph->getNode(i)), 1);\n       i++) {\n\n    // get the children of this node\n    lhsNode = decGraph->getNode(decNode->eEdge0.Node);\n    rhsNode = decGraph->getNode(decNode->eEdge1.Node);\n\n    // get the AIG nodes corresponding to the children\n    lhsAnd = thData->decNodeFunc[lhsNode->id];\n    rhsAnd = thData->decNodeFunc[rhsNode->id];\n\n    if (lhsNode->id < 4) { // If lhs is a cut leaf\n      lhsAndPol = decNode->eEdge0.fCompl\n                      ? 
!(thData->currentFaninsPol[lhsNode->id])\n                      : thData->currentFaninsPol[lhsNode->id];\n    } else {\n      lhsAndPol = decNode->eEdge0.fCompl ? false : true;\n    }\n\n    if (rhsNode->id < 4) { // If rhs is a cut leaf\n      rhsAndPol = decNode->eEdge1.fCompl\n                      ? !(thData->currentFaninsPol[rhsNode->id])\n                      : thData->currentFaninsPol[rhsNode->id];\n    } else {\n      rhsAndPol = decNode->eEdge1.fCompl ? false : true;\n    }\n\n    curAnd =\n        this->aig.lookupNodeInFanoutMap(lhsAnd, rhsAnd, lhsAndPol, rhsAndPol);\n\n    if (curAnd) {\n      thData->decNodeFunc[decNode->id] = curAnd;\n    } else {\n      thData->decNodeFunc[decNode->id] =\n          this->aig.createAND(lhsAnd, rhsAnd, lhsAndPol, rhsAndPol);\n      aig::NodeData& newNodeData = aigGraph.getData(\n          thData->decNodeFunc[decNode->id], galois::MethodFlag::WRITE);\n      newNodeData.counter =\n          3; // Mark as processed to avoind to insert it into the worklist.\n    }\n  }\n\n  return thData->decNodeFunc[decNode->id];\n}\n\nvoid ChoiceManager::lockFaninCone(aig::Graph& aigGraph, aig::GNode node,\n                                  Cut* cut) {\n\n  aig::NodeData& nodeData =\n      aigGraph.getData(node, galois::MethodFlag::READ); // lock\n\n  // If node is a cut leaf\n  if ((nodeData.id == cut->leaves[0]) || (nodeData.id == cut->leaves[1]) ||\n      (nodeData.id == cut->leaves[2]) || (nodeData.id == cut->leaves[3])) {\n    return;\n  }\n\n  // If node is a PI\n  if ((nodeData.type == aig::NodeType::PI) ||\n      (nodeData.type == aig::NodeType::LATCH)) {\n    return;\n  }\n\n  auto inEdgeIt      = aigGraph.in_edge_begin(node);\n  aig::GNode lhsNode = aigGraph.getEdgeDst(inEdgeIt);\n  //  aig::NodeData& lhsData = aigGraph.getData(lhsNode,\n  //  galois::MethodFlag::READ); // lock\n  aigGraph.getData(lhsNode, galois::MethodFlag::READ); // lock\n  inEdgeIt++;\n  aig::GNode rhsNode = aigGraph.getEdgeDst(inEdgeIt);\n  //  
aig::NodeData& rhsData = aigGraph.getData(rhsNode,\n  //  galois::MethodFlag::READ); // lock\n  aigGraph.getData(rhsNode, galois::MethodFlag::READ); // lock\n\n  lockFaninCone(aigGraph, lhsNode, cut);\n  lockFaninCone(aigGraph, rhsNode, cut);\n}\n\naig::Aig& ChoiceManager::getAig() { return this->aig; }\n\nCutManager& ChoiceManager::getCutMan() { return this->cutMan; }\n\nNPNManager& ChoiceManager::getNPNMan() { return this->npnMan; }\n\nPreCompGraphManager& ChoiceManager::getPcgMan() { return this->pcgMan; }\n\nPerThreadDataCH& ChoiceManager::getPerThreadDataCH() {\n  return this->perThreadDataCH;\n}\n\nstruct ChoiceOperator {\n\n  ChoiceManager& chMan;\n  CutManager& cutMan;\n\n  ChoiceOperator(ChoiceManager& chMan)\n      : chMan(chMan), cutMan(chMan.getCutMan()) {}\n\n  void operator()(aig::GNode node, galois::UserContext<aig::GNode>& ctx) {\n\n    aig::Graph& aigGraph = chMan.getAig().getGraph();\n\n    if ((node == nullptr) ||\n        !aigGraph.containsNode(node, galois::MethodFlag::WRITE)) {\n      return;\n    }\n\n    aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::WRITE);\n\n    if (nodeData.counter == 3) {\n      return;\n    }\n\n    // Touching outgoing neighobors to acquire their locks\n    aigGraph.out_edges(node);\n    //    for (auto outEdge : aigGraph.out_edges(node)) {}\n\n    if (nodeData.type == aig::NodeType::AND) {\n      ThreadLocalDataCH* thData = chMan.getPerThreadDataCH().getLocal();\n      chMan.createNodeChoices(thData, node);\n      /*\n      aig::GNode currChoice = nodeData.choiceList;\n      while ( currChoice != nullptr ) {\n          aig::NodeData& currChoiceData = aigGraph.getData( currChoice,\n      galois::MethodFlag::READ ); std::cout << \"Node \" << nodeData.id << \" ->\n      Choice Node \" << currChoiceData.id << std::endl; currChoice =\n      currChoiceData.choiceList;\n      }\n      */\n    } else {\n      if ((nodeData.type == aig::NodeType::PI) ||\n          (nodeData.type == aig::NodeType::LATCH)) 
{\n        // Set the trivial cut\n        nodeData.counter      = 3;\n        CutPool* cutPool      = cutMan.getPerThreadCutPool().getLocal();\n        Cut* trivialCut       = cutPool->getMemory();\n        trivialCut->leaves[0] = nodeData.id;\n        trivialCut->nLeaves++;\n        trivialCut->sig = (1U << (nodeData.id % 31));\n        if (cutMan.getCompTruthFlag()) {\n          unsigned* cutTruth = cutMan.readTruth(trivialCut);\n          for (int i = 0; i < cutMan.getNWords(); i++) {\n            cutTruth[i] = 0xAAAAAAAA;\n          }\n        }\n        cutMan.getNodeCuts()[nodeData.id] = trivialCut;\n      }\n    }\n\n    // Schedule fanout nodes\n    if (nodeData.counter == 2) {\n      nodeData.counter += 1;\n    }\n    if (nodeData.counter == 3) {\n      // Insert nextNodes in the worklist\n      for (auto outEdge : aigGraph.out_edges(node)) {\n        aig::GNode nextNode = aigGraph.getEdgeDst(outEdge);\n        aig::NodeData& nextNodeData =\n            aigGraph.getData(nextNode, galois::MethodFlag::WRITE);\n\n        if ((nextNodeData.type == aig::NodeType::PO) ||\n            (nextNodeData.type == aig::NodeType::LATCH)) {\n          continue;\n        }\n\n        nextNodeData.counter += 1;\n        if (nextNodeData.counter == 2) {\n          if (cutMan.getNodeCuts()[nextNodeData.id] != nullptr) {\n            cutMan.recycleNodeCuts(nextNodeData.id);\n          }\n          ctx.push(nextNode);\n        }\n      }\n    }\n  }\n};\n\nvoid runChoiceOperator(ChoiceManager& chMan) {\n\n  galois::InsertBag<aig::GNode> workList;\n  typedef galois::worklists::PerSocketChunkBag<500> DC_BAG;\n\n  for (auto pi : chMan.getAig().getInputNodes()) {\n    workList.push(pi);\n  }\n\n  // for (auto latch : chMan.getAig().getLatchNodes()) {\n  //  workList.push(latch);\n  //}\n\n  // Galois Parallel Foreach\n  galois::for_each(galois::iterate(workList.begin(), workList.end()),\n                   ChoiceOperator(chMan), galois::wl<DC_BAG>(),\n                   
galois::loopname(\"ChoiceOperator\"),\n                   galois::per_iter_alloc());\n}\n\n} /* namespace algorithm */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/ChoiceManager.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel AIG Choice Insertion December 6, 2018.\n\n*/\n\n#ifndef CHOICEMANAGER_H_\n#define CHOICEMANAGER_H_\n\n#include \"Aig.h\"\n#include \"CutManager.h\"\n#include \"NPNManager.h\"\n#include \"PreCompGraphManager.h\"\n#include \"galois/worklists/Chunk.h\"\n\n#include <vector>\n\nnamespace algorithm {\n\ntypedef struct ThreadLocalDataCH_ {\n  std::vector<bool> currentFaninsPol;\n  std::vector<aig::GNode> currentFanins;\n  std::vector<aig::GNode> decNodeFunc;\n\n  ThreadLocalDataCH_()\n      : currentFaninsPol(4), currentFanins(4), decNodeFunc(20) {}\n\n} ThreadLocalDataCH;\n\ntypedef galois::substrate::PerThreadStorage<ThreadLocalDataCH> PerThreadDataCH;\n\nclass ChoiceManager {\n\nprivate:\n  aig::Aig& aig;\n  CutManager& cutMan;\n  NPNManager& npnMan;\n  PreCompGraphManager& pcgMan;\n  
PerThreadDataCH perThreadDataCH;\n  int nFuncs;\n  int nGraphs;\n  int nChoices;\n\n  bool updateAig(ThreadLocalDataCH* thData, aig::GNode rootNode,\n                 aig::NodeData& rootData, DecGraph* decGraph,\n                 bool isOutputCompl);\n  bool decGraphToAigTry(ThreadLocalDataCH* thData, DecGraph* decGraph);\n  aig::GNode decGraphToAigCreate(ThreadLocalDataCH* thData, DecGraph* decGraph);\n\n  void lockFaninCone(aig::Graph& aigGraph, aig::GNode node, Cut* cut);\n\npublic:\n  ChoiceManager(aig::Aig& aig, CutManager& cutMan, NPNManager& npnMan,\n                PreCompGraphManager& pcgMan, int nGraphs, int nChoinces);\n\n  ~ChoiceManager();\n\n  void createNodeChoices(ThreadLocalDataCH* thData, aig::GNode node);\n\n  aig::Aig& getAig();\n  CutManager& getCutMan();\n  NPNManager& getNPNMan();\n  PreCompGraphManager& getPcgMan();\n  PerThreadDataCH& getPerThreadDataCH();\n};\n\nvoid runChoiceOperator(ChoiceManager& chMan);\n\n} /* namespace algorithm */\n\n#endif /* CHOICEMANAGER_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/CutManager.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#include \"CutManager.h\"\n#include \"galois/Galois.h\"\n#include \"galois/Bag.h\"\n\n#include <iostream>\n#include <cstdlib>\n#include <chrono>\n#include <assert.h>\n\nusing namespace std::chrono;\n\nnamespace algorithm {\n\nCutManager::CutManager(aig::Aig& aig, int K, int C, int nThreads,\n                       bool compTruth)\n    :\n\n      aig(aig), K(K), C(C), nWords(Functional32::wordNum(K)),\n      nNodes(std::distance(aig.getGraph().begin(), aig.getGraph().end()) -\n             aig.getNumOutputs()),\n      nThreads(nThreads), cutPoolSize(nNodes / nThreads), compTruth(compTruth),\n      perThreadCutPool(cutPoolSize, K, compTruth), perThreadCutList(K),\n      perThreadAuxTruth(nWords) {\n\n  kcutTime 
= 0;\n  nodeCuts = new Cut*[nNodes + 1];\n  for (int i = 0; i < nNodes + 1; i++) {\n    nodeCuts[i] = nullptr;\n  }\n}\n\nCutManager::~CutManager() { delete[] nodeCuts; }\n\nvoid CutManager::computeCuts(CutPool* cutPool, CutList* cutList,\n                             AuxTruth* auxTruth, int nodeId, int lhsId,\n                             int rhsId, bool lhsPolarity, bool rhsPolarity) {\n\n  int i;\n  int currentNumCuts = 0;\n\n  // start with the elementary cut\n  Cut* trivialCut       = cutPool->getMemory();\n  trivialCut->leaves[0] = nodeId;\n  trivialCut->nLeaves++;\n  trivialCut->sig = (1U << (nodeId % 31));\n  if (this->compTruth) {\n    unsigned* cutTruth = readTruth(trivialCut);\n    for (int i = 0; i < this->nWords; i++) {\n      cutTruth[i] = 0xAAAAAAAA;\n    }\n  }\n  cutList->head[1] = trivialCut;\n  cutList->tail[1] = trivialCut;\n  currentNumCuts++;\n  nCuts += 1;\n  nTriv += 1;\n\n  // std::chrono::high_resolution_clock::time_point t1 =\n  // std::chrono::high_resolution_clock::now();\n\n  Cut* lhsLargeCutsBegin;\n  for (lhsLargeCutsBegin = this->nodeCuts[lhsId]; lhsLargeCutsBegin != nullptr;\n       lhsLargeCutsBegin = lhsLargeCutsBegin->nextCut) {\n    if (lhsLargeCutsBegin->nLeaves == this->K) {\n      break;\n    }\n  }\n\n  Cut* rhsLargeCutsBegin;\n  for (rhsLargeCutsBegin = this->nodeCuts[rhsId]; rhsLargeCutsBegin != nullptr;\n       rhsLargeCutsBegin = rhsLargeCutsBegin->nextCut) {\n    if (rhsLargeCutsBegin->nLeaves == this->K) {\n      break;\n    }\n  }\n\n  // small by small\n  for (Cut* lhsCut = this->nodeCuts[lhsId]; lhsCut != lhsLargeCutsBegin;\n       lhsCut      = lhsCut->nextCut) {\n    for (Cut* rhsCut = this->nodeCuts[rhsId]; rhsCut != rhsLargeCutsBegin;\n         rhsCut      = rhsCut->nextCut) {\n      if (processTwoCuts(cutPool, cutList, auxTruth, lhsCut, rhsCut,\n                         lhsPolarity, rhsPolarity, currentNumCuts)) {\n        commitCuts(nodeId, cutList);\n        return; // The Maximum number of cuts per node was 
reached\n      }\n    }\n  }\n\n  // small by large\n  for (Cut* lhsCut = this->nodeCuts[lhsId]; lhsCut != lhsLargeCutsBegin;\n       lhsCut      = lhsCut->nextCut) {\n    for (Cut* rhsCut = rhsLargeCutsBegin; rhsCut != nullptr;\n         rhsCut      = rhsCut->nextCut) {\n      if ((lhsCut->sig & rhsCut->sig) != lhsCut->sig) {\n        continue;\n      }\n      if (processTwoCuts(cutPool, cutList, auxTruth, lhsCut, rhsCut,\n                         lhsPolarity, rhsPolarity, currentNumCuts)) {\n        commitCuts(nodeId, cutList);\n        return; // The Maximum number of cuts per node was reached\n      }\n    }\n  }\n\n  // small by large\n  for (Cut* rhsCut = this->nodeCuts[rhsId]; rhsCut != rhsLargeCutsBegin;\n       rhsCut      = rhsCut->nextCut) {\n    for (Cut* lhsCut = lhsLargeCutsBegin; lhsCut != nullptr;\n         lhsCut      = lhsCut->nextCut) {\n      if ((lhsCut->sig & rhsCut->sig) != rhsCut->sig) {\n        continue;\n      }\n      if (processTwoCuts(cutPool, cutList, auxTruth, lhsCut, rhsCut,\n                         lhsPolarity, rhsPolarity, currentNumCuts)) {\n        commitCuts(nodeId, cutList);\n        return; // The Maximum number of cuts per node was reached\n      }\n    }\n  }\n\n  // large by large\n  for (Cut* lhsCut = lhsLargeCutsBegin; lhsCut != nullptr;\n       lhsCut      = lhsCut->nextCut) {\n    for (Cut* rhsCut = rhsLargeCutsBegin; rhsCut != nullptr;\n         rhsCut      = rhsCut->nextCut) {\n      if (lhsCut->sig != rhsCut->sig) {\n        continue;\n      }\n      for (i = 0; i < this->K; i++) {\n        if (lhsCut->leaves[i] != rhsCut->leaves[i]) {\n          break;\n        }\n      }\n      if (i < this->K) {\n        continue;\n      }\n      if (processTwoCuts(cutPool, cutList, auxTruth, lhsCut, rhsCut,\n                         lhsPolarity, rhsPolarity, currentNumCuts)) {\n        commitCuts(nodeId, cutList);\n        return; // The Maximum number of cuts per node was reached\n      }\n    }\n  }\n\n  // Copy from 
currentCutList to the nodeCuts\n  commitCuts(nodeId, cutList);\n\n  // std::chrono::high_resolution_clock::time_point t2 =\n  // std::chrono::high_resolution_clock::now(); compTime +=\n  // std::chrono::duration_cast<std::chrono::microseconds>( t2 - t1 ).count();\n}\n\nvoid CutManager::computeCutsRecursively(aig::GNode node) {\n\n  aig::Graph& aigGraph    = this->aig.getGraph();\n  aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::READ);\n\n  if (this->nodeCuts[nodeData.id] == nullptr) {\n\n    auto inEdgeIt          = aigGraph.in_edge_begin(node);\n    aig::GNode lhsNode     = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& lhsData = aigGraph.getData(lhsNode);\n    bool lhsPolarity       = aigGraph.getEdgeData(inEdgeIt);\n\n    inEdgeIt++;\n    aig::GNode rhsNode     = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& rhsData = aigGraph.getData(rhsNode);\n    bool rhsPolarity       = aigGraph.getEdgeData(inEdgeIt);\n\n    CutPool* cutPool   = this->perThreadCutPool.getLocal();\n    CutList* cutList   = this->perThreadCutList.getLocal();\n    AuxTruth* auxTruth = this->perThreadAuxTruth.getLocal();\n\n    computeCutsRec(lhsNode, cutPool, cutList, auxTruth);\n    computeCutsRec(rhsNode, cutPool, cutList, auxTruth);\n\n    computeCuts(cutPool, cutList, auxTruth, nodeData.id, lhsData.id, rhsData.id,\n                lhsPolarity, rhsPolarity);\n  }\n}\n\nvoid CutManager::computeCutsRec(aig::GNode node, CutPool* cutPool,\n                                CutList* cutList, AuxTruth* auxTruth) {\n\n  aig::Graph& aigGraph    = this->aig.getGraph();\n  aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::READ);\n\n  if (this->nodeCuts[nodeData.id] == nullptr) {\n\n    auto inEdgeIt          = aigGraph.in_edge_begin(node);\n    aig::GNode lhsNode     = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& lhsData = aigGraph.getData(lhsNode);\n    bool lhsPolarity       = aigGraph.getEdgeData(inEdgeIt);\n\n    inEdgeIt++;\n    aig::GNode 
rhsNode     = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& rhsData = aigGraph.getData(rhsNode);\n    bool rhsPolarity       = aigGraph.getEdgeData(inEdgeIt);\n\n    computeCutsRec(lhsNode, cutPool, cutList, auxTruth);\n    computeCutsRec(rhsNode, cutPool, cutList, auxTruth);\n\n    computeCuts(cutPool, cutList, auxTruth, nodeData.id, lhsData.id, rhsData.id,\n                lhsPolarity, rhsPolarity);\n  }\n}\n\ninline bool CutManager::processTwoCuts(CutPool* cutPool, CutList* cutList,\n                                       AuxTruth* auxTruth, Cut* lhsCut,\n                                       Cut* rhsCut, bool lhsPolarity,\n                                       bool rhsPolarity, int& currentNumCuts) {\n\n  // std::chrono::high_resolution_clock::time_point t0 =\n  // std::chrono::high_resolution_clock::now();\n\n  Cut* resCut;\n\n  // merge the cuts\n  // std::chrono::high_resolution_clock::time_point t1 =\n  // std::chrono::high_resolution_clock::now();\n  if (lhsCut->nLeaves >= rhsCut->nLeaves) {\n    resCut = mergeCuts(cutPool, lhsCut, rhsCut);\n  } else {\n    resCut = mergeCuts(cutPool, rhsCut, lhsCut);\n  }\n  // std::chrono::high_resolution_clock::time_point t2 =\n  // std::chrono::high_resolution_clock::now(); mergeTime +=\n  // std::chrono::duration_cast<std::chrono::microseconds>( t2 - t1 ).count();\n\n  if (resCut == nullptr) {\n    return false;\n  }\n\n  // assert( (resCut->nLeaves > 1) && (resCut->nLeaves <= K) );\n\n  // set the signature\n  resCut->sig = lhsCut->sig | rhsCut->sig;\n\n  // std::chrono::high_resolution_clock::time_point t3 =\n  // std::chrono::high_resolution_clock::now();\n  // check containment\n  if (cutFilter(cutPool, cutList, resCut, currentNumCuts)) {\n    return false;\n  }\n  // std::chrono::high_resolution_clock::time_point t4 =\n  // std::chrono::high_resolution_clock::now(); filterTime +=\n  // std::chrono::duration_cast<std::chrono::microseconds>( t4 - t3 ).count();\n\n  if (this->compTruth) {\n    
computeTruth(auxTruth, resCut, lhsCut, rhsCut, lhsPolarity, rhsPolarity);\n    // printf( \"%x\\n\", (*readTruth( resCut )) );\n  }\n\n  // add to the list\n  if (cutList->head[resCut->nLeaves] == nullptr) {\n    cutList->head[resCut->nLeaves] = resCut;\n  } else {\n    cutList->tail[resCut->nLeaves]->nextCut = resCut;\n  }\n  cutList->tail[resCut->nLeaves] = resCut;\n  currentNumCuts++;\n  nCuts += 1;\n\n  // std::chrono::high_resolution_clock::time_point t5 =\n  // std::chrono::high_resolution_clock::now(); procTwoTime +=\n  // std::chrono::duration_cast<std::chrono::microseconds>( t5 - t0 ).count();\n\n  // return status (0 if okay; 1 if exceeded the limit)\n\n  if (currentNumCuts >= this->C) {\n    nSatu += 1;\n    return true;\n  }\n\n  return false;\n}\n\nCut* CutManager::mergeCuts(CutPool* cutPool, Cut* lhsCut, Cut* rhsCut) {\n\n  int i, j, l;\n\n  // assert( lhsCut->nLeaves >= rhsCut->nLeaves );\n\n  Cut* resCut;\n\n  // the case of the largest cut sizes\n  if (lhsCut->nLeaves == this->K && rhsCut->nLeaves == this->K) {\n    for (i = 0; i < lhsCut->nLeaves; i++) {\n      if (lhsCut->leaves[i] != rhsCut->leaves[i]) {\n        return nullptr;\n      }\n    }\n    resCut = cutPool->getMemory();\n    for (i = 0; i < lhsCut->nLeaves; i++) {\n      resCut->leaves[i] = lhsCut->leaves[i];\n    }\n    resCut->nLeaves = lhsCut->nLeaves;\n    return resCut;\n  }\n\n  // the case when one of the cuts is the largest\n  if (lhsCut->nLeaves == this->K) {\n    for (i = 0; i < rhsCut->nLeaves; i++) {\n      for (j = lhsCut->nLeaves - 1; j >= 0; j--) {\n        if (lhsCut->leaves[j] == rhsCut->leaves[i]) {\n          break;\n        }\n      }\n      if (j == -1) { // did not find\n        return nullptr;\n      }\n    }\n    resCut = cutPool->getMemory();\n    for (i = 0; i < lhsCut->nLeaves; i++) {\n      resCut->leaves[i] = lhsCut->leaves[i];\n    }\n    resCut->nLeaves = lhsCut->nLeaves;\n    return resCut;\n  }\n\n  // compare two cuts with different numbers\n  resCut = 
cutPool->getMemory();\n  i      = 0;\n  j      = 0;\n  for (l = 0; l < this->K; l++) {\n    if (j == rhsCut->nLeaves) {\n      if (i == lhsCut->nLeaves) {\n        resCut->nLeaves = l;\n        return resCut;\n      }\n      resCut->leaves[l] = lhsCut->leaves[i++];\n      continue;\n    }\n\n    if (i == lhsCut->nLeaves) {\n      if (j == rhsCut->nLeaves) {\n        resCut->nLeaves = l;\n        return resCut;\n      }\n      resCut->leaves[l] = rhsCut->leaves[j++];\n      continue;\n    }\n\n    if (lhsCut->leaves[i] < rhsCut->leaves[j]) {\n      resCut->leaves[l] = lhsCut->leaves[i++];\n      continue;\n    }\n\n    if (lhsCut->leaves[i] > rhsCut->leaves[j]) {\n      resCut->leaves[l] = rhsCut->leaves[j++];\n      continue;\n    }\n\n    resCut->leaves[l] = lhsCut->leaves[i++];\n    j++;\n  }\n\n  if (i < lhsCut->nLeaves || j < rhsCut->nLeaves) {\n    cutPool->giveBackMemory(resCut);\n    return nullptr;\n  }\n\n  resCut->nLeaves = l;\n  return resCut;\n}\n\ninline bool CutManager::cutFilter(CutPool* cutPool, CutList* cutList,\n                                  Cut* resCut, int& currentNumCuts) {\n\n  // check if this cut is filtered out by smaller cuts\n  for (int i = 2; i <= resCut->nLeaves; i++) {\n\n    for (Cut* cut = cutList->head[i]; cut != nullptr; cut = cut->nextCut) {\n\n      // skip the non-contained cuts\n      if ((cut->sig & resCut->sig) != cut->sig) {\n        continue;\n      }\n      // check containment seriously\n      if (checkCutDominance(cut, resCut)) {\n        // Recycle Cut\n        cutPool->giveBackMemory(resCut);\n        nFilt += 1;\n        return true; // resCut is dominated\n      }\n    }\n  }\n\n  // filter out other cuts using this one\n  for (int i = resCut->nLeaves + 1; i <= this->K; i++) {\n\n    Cut* prevCut  = nullptr;\n    Cut* toRemove = nullptr;\n    Cut* cut      = cutList->head[i];\n\n    while (cut != nullptr) {\n\n      // sKip the non-contained cuts\n      if ((cut->sig & resCut->sig) != resCut->sig) {\n        
prevCut = cut;\n        cut     = cut->nextCut;\n        continue;\n      }\n      // check containment seriously\n      if (checkCutDominance(resCut, cut)) {\n\n        currentNumCuts--;\n        nCuts -= 1;\n        nFilt += 1;\n\n        // when the cut to be removed is the first of the list\n        if (cut == cutList->head[i]) {\n          cutList->head[i] = cut->nextCut;\n          toRemove         = cut;\n          cut              = cut->nextCut;\n          // Recycle Cut\n          cutPool->giveBackMemory(toRemove);\n          continue;\n        }\n\n        // when the cut to be removed is in the middle or in the end of the list\n        if (prevCut != nullptr) {\n          prevCut->nextCut = cut->nextCut;\n          toRemove         = cut;\n          cut              = cut->nextCut;\n          // Recycle Cut\n          cutPool->giveBackMemory(toRemove);\n        } else {\n          std::cout << \"Bug cut removal!\" << std::endl;\n          exit(1);\n        }\n      } else {\n        prevCut = cut;\n        cut     = cut->nextCut;\n      }\n    }\n\n    cutList->tail[i] = prevCut;\n  }\n\n  return false;\n}\n\ninline bool CutManager::checkCutDominance(Cut* smallerCut, Cut* largerCut) {\n\n  int i, j;\n\n  for (i = 0; i < smallerCut->nLeaves; i++) {\n    for (j = 0; j < largerCut->nLeaves; j++) {\n      if (smallerCut->leaves[i] == largerCut->leaves[j]) {\n        break;\n      }\n    }\n    if (j ==\n        largerCut\n            ->nLeaves) { // node i in smallerCut is not contained in largerCut\n      return false;\n    }\n  }\n  // every node in smallerCut is contained in largerCut\n  return true;\n}\n\nvoid CutManager::commitCuts(int nodeId, CutList* cutList) {\n\n  // Copy from currentCutList to the nodeCuts\n  this->nodeCuts[nodeId] = cutList->head[1];\n  Cut* lastCut           = cutList->head[1];\n  cutList->head[1]       = nullptr;\n  for (int i = 2; i < this->K + 1; i++) {\n    if (cutList->head[i] == nullptr) {\n      continue;\n    }\n    
lastCut->nextCut = cutList->head[i];\n    lastCut          = cutList->tail[i];\n    cutList->head[i] = nullptr;\n    cutList->tail[i] = nullptr;\n  }\n}\n\nvoid CutManager::computeTruth(AuxTruth* auxTruth, Cut* resCut, Cut* lhsCut,\n                              Cut* rhsCut, bool lhsPolarity, bool rhsPolarity) {\n\n  // permute the first table\n  if (lhsPolarity) {\n    Functional32::copy(auxTruth->truth[0], readTruth(lhsCut), this->nWords);\n  } else {\n    Functional32::NOT(auxTruth->truth[0], readTruth(lhsCut), this->nWords);\n  }\n  Functional32::truthStretch(auxTruth->truth[2], auxTruth->truth[0],\n                             lhsCut->nLeaves, this->K,\n                             truthPhase(resCut, lhsCut));\n\n  // permute the second table\n  if (rhsPolarity) {\n    Functional32::copy(auxTruth->truth[1], readTruth(rhsCut), this->nWords);\n  } else {\n    Functional32::NOT(auxTruth->truth[1], readTruth(rhsCut), this->nWords);\n  }\n  Functional32::truthStretch(auxTruth->truth[3], auxTruth->truth[1],\n                             rhsCut->nLeaves, this->K,\n                             truthPhase(resCut, rhsCut));\n\n  // produce the resulting table. In this first version we are not considering\n  // the cut->fCompl flag. 
It may be considerer in further versions according to\n  // the demand.\n  // if ( cut->fCompl ) {\n  //\tFunctional32::NAND( readTruth( cut ) , auxTruth[2], auxTruth[3], K );\n  //}\n  // else {\n  Functional32::AND(readTruth(resCut), auxTruth->truth[2], auxTruth->truth[3],\n                    this->nWords);\n  //}\n}\n\ninline unsigned CutManager::truthPhase(Cut* resCut, Cut* inCut) {\n\n  unsigned phase = 0;\n  int i, j;\n  for (i = j = 0; i < resCut->nLeaves; i++) {\n\n    if (j == inCut->nLeaves) {\n      break;\n    }\n    if (resCut->leaves[i] < inCut->leaves[j]) {\n      continue;\n    }\n\n    assert(resCut->leaves[i] == inCut->leaves[j]);\n    phase |= (1 << i);\n    j++;\n  }\n\n  return phase;\n}\n\nunsigned int* CutManager::readTruth(Cut* cut) {\n  return (unsigned*)(cut->leaves + this->K);\n}\n\n/*\n *     This method gives the cut's memory back to current thread cutPool.\n *     However, the memory can be allocated by the cutPool of one thread\n *     and returned to cutPool of another thread.\n */\nvoid CutManager::recycleNodeCuts(int nodeId) {\n\n  CutPool* cutPool = this->perThreadCutPool.getLocal();\n  Cut* cut         = this->nodeCuts[nodeId];\n\n  while (cut != nullptr) {\n    Cut* nextCut = cut->nextCut;\n    cutPool->giveBackMemory(cut);\n    cut = nextCut;\n  }\n\n  this->nodeCuts[nodeId] = nullptr;\n}\n\nvoid CutManager::printNodeCuts(int nodeId, long int& counter) {\n\n  std::cout << \"Node \" << nodeId << \": { \";\n  for (Cut* currentCut = this->nodeCuts[nodeId]; currentCut != nullptr;\n       currentCut      = currentCut->nextCut) {\n    counter++;\n    std::cout << \"{ \";\n    for (int i = 0; i < currentCut->nLeaves; i++) {\n      std::cout << currentCut->leaves[i] << \" \";\n    }\n    std::cout << \"} \";\n  }\n  std::cout << \"}\" << std::endl;\n}\n\nvoid CutManager::printAllCuts() {\n\n  long int counter     = 0;\n  aig::Graph& aigGraph = this->aig.getGraph();\n\n  std::cout << std::endl << \"########## All K-Cuts ###########\" 
<< std::endl;\n  for (aig::GNode node : aigGraph) {\n    aig::NodeData& nodeData =\n        aigGraph.getData(node, galois::MethodFlag::UNPROTECTED);\n    if ((nodeData.type == aig::NodeType::AND) ||\n        (nodeData.type == aig::NodeType::PI)) {\n      printNodeCuts(nodeData.id, counter);\n    }\n  }\n  std::cout << \"#################################\" << std::endl;\n}\n\nvoid CutManager::printCutStatistics() {\n\n  long int nCutsRed = nCuts.reduce();\n  nCutsRed += this->aig.getNumInputs();\n\n  long int nTrivRed = nTriv.reduce();\n  nTrivRed += this->aig.getNumInputs();\n\n  long int nFiltRed = nFilt.reduce();\n\n  long int nSatuRed = nSatu.reduce();\n\n  std::cout << std::endl\n            << \"############## Cut Statistics #############\" << std::endl;\n  std::cout << \"nCuts: \" << nCutsRed << std::endl;\n  std::cout << \"nTriv: \" << nTrivRed << std::endl;\n  std::cout << \"nFilt: \" << nFiltRed << std::endl;\n  std::cout << \"nSatu: \" << nSatuRed << std::endl;\n  std::cout << \"nCutPerNode: \" << (((double)nCutsRed) / this->nNodes)\n            << std::endl;\n  std::cout << \"###########################################\" << std::endl;\n}\n\nvoid CutManager::printRuntimes() {\n\n  std::cout << std::endl << \"#### Runtimes in microsecond ####\" << std::endl;\n  // std::cout << \"Merge: \" << mergeTime << std::endl;\n  // std::cout << \"Filter: \" << filterTime << std::endl;\n  // std::cout << \"ProcTwo: \" << procTwoTime << std::endl;\n  // std::cout << \"Compute: \" << compTime << std::endl;\n  // std::cout << \"Schedule: \" << scheduleTime << std::endl;\n  std::cout << \"Total: \" << this->kcutTime << std::endl;\n  std::cout << \"#################################\" << std::endl;\n}\n\naig::Aig& CutManager::getAig() { return this->aig; }\n\nint CutManager::getK() { return this->K; }\n\nint CutManager::getC() { return this->C; }\n\nint CutManager::getNWords() { return this->nWords; }\n\nint CutManager::getNThreads() { return this->nThreads; }\n\nbool 
CutManager::getCompTruthFlag() { return this->compTruth; }\n\nlong double CutManager::getKcutTime() { return this->kcutTime; }\n\nvoid CutManager::setKcutTime(long double time) { this->kcutTime = time; }\n\nPerThreadCutPool& CutManager::getPerThreadCutPool() {\n  return this->perThreadCutPool;\n}\n\nPerThreadCutList& CutManager::getPerThreadCutList() {\n  return this->perThreadCutList;\n}\n\nPerThreadAuxTruth& CutManager::getPerThreadAuxTruth() {\n  return this->perThreadAuxTruth;\n}\n\nCut** CutManager::getNodeCuts() { return this->nodeCuts; }\n\n// ######################## BEGIN OPERATOR ######################## //\nstruct KCutOperator {\n\n  CutManager& cutMan;\n\n  KCutOperator(CutManager& cutMan) : cutMan(cutMan) {}\n\n  void operator()(aig::GNode node, galois::UserContext<aig::GNode>& ctx) {\n\n    aig::Aig& aig        = cutMan.getAig();\n    aig::Graph& aigGraph = aig.getGraph();\n\n    aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::READ);\n\n    if (nodeData.type == aig::NodeType::AND) {\n\n      // Touching outgoing neighobors to acquire their locks\n      aigGraph.out_edges(node);\n\n      // Combine Cuts\n      auto inEdgeIt          = aigGraph.in_edge_begin(node);\n      aig::GNode lhsNode     = aigGraph.getEdgeDst(inEdgeIt);\n      aig::NodeData& lhsData = aigGraph.getData(lhsNode);\n      bool lhsPolarity       = aigGraph.getEdgeData(inEdgeIt);\n\n      inEdgeIt++;\n      aig::GNode rhsNode     = aigGraph.getEdgeDst(inEdgeIt);\n      aig::NodeData& rhsData = aigGraph.getData(rhsNode);\n      bool rhsPolarity       = aigGraph.getEdgeData(inEdgeIt);\n\n      CutPool* cutPool   = cutMan.getPerThreadCutPool().getLocal();\n      CutList* cutList   = cutMan.getPerThreadCutList().getLocal();\n      AuxTruth* auxTruth = cutMan.getPerThreadAuxTruth().getLocal();\n\n      // ctx.cautiousPoint();\n\n      cutMan.computeCuts(cutPool, cutList, auxTruth, nodeData.id, lhsData.id,\n                         rhsData.id, lhsPolarity, 
rhsPolarity);\n\n      // NextNodes\n      for (auto edge : aigGraph.out_edges(node)) {\n        aig::GNode nextNode = aigGraph.getEdgeDst(edge);\n        aig::NodeData& nextNodeData =\n            aigGraph.getData(nextNode, galois::MethodFlag::WRITE);\n        nextNodeData.counter += 1;\n        if (nextNodeData.counter == 2) {\n          ctx.push(nextNode);\n        }\n      }\n    } else {\n      if (nodeData.type == aig::NodeType::PI) {\n        // Touching outgoing neighobors to acquire their locks and their fanin\n        // node's locks.\n        aigGraph.out_edges(node);\n\n        // ctx.cautiousPoint();\n\n        // Set the trivial cut\n        nodeData.counter      = 3;\n        CutPool* cutPool      = cutMan.getPerThreadCutPool().getLocal();\n        Cut* trivialCut       = cutPool->getMemory();\n        trivialCut->leaves[0] = nodeData.id;\n        trivialCut->nLeaves++;\n        trivialCut->sig = (1U << (nodeData.id % 31));\n        if (cutMan.getCompTruthFlag()) {\n          unsigned* cutTruth = cutMan.readTruth(trivialCut);\n          for (int i = 0; i < cutMan.getNWords(); i++) {\n            cutTruth[i] = 0xAAAAAAAA;\n          }\n        }\n        cutMan.getNodeCuts()[nodeData.id] = trivialCut;\n\n        // Schedule next nodes\n        for (auto edge : aigGraph.out_edges(node)) {\n          aig::GNode nextNode = aigGraph.getEdgeDst(edge);\n          aig::NodeData& nextNodeData =\n              aigGraph.getData(nextNode, galois::MethodFlag::WRITE);\n          nextNodeData.counter += 1;\n          if (nextNodeData.counter == 2) {\n            ctx.push(nextNode);\n          }\n        }\n      }\n    }\n  }\n};\n\nvoid runKCutOperator(CutManager& cutMan) {\n\n  galois::InsertBag<aig::GNode> workList;\n  typedef galois::worklists::PerSocketChunkBag<500> DC_BAG;\n  // typedef galois::worklists::PerSocketChunkFIFO< 200 > DC_FIFO;\n  // typedef galois::worklists::PerSocketChunkLIFO< 200 > DC_LIFO;\n  // typedef galois::worklists::PerThreadChunkFIFO< 
200 > AC_FIFO;\n\n  for (auto pi : cutMan.getAig().getInputNodes()) {\n    workList.push(pi);\n  }\n\n  // Galois Parallel Foreach\n  galois::for_each(galois::iterate(workList.begin(), workList.end()),\n                   KCutOperator(cutMan), galois::wl<DC_BAG>(),\n                   galois::loopname(\"KCutOperator\"));\n\n  // galois::wl<galois::worklists::Deterministic<>>(),\n  // galois::wl<DC_BAG>(),\n}\n// ######################## END OPERATOR ######################## //\n\n} /* namespace algorithm */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/CutManager.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#ifndef CUTMANAGER_H_\n#define CUTMANAGER_H_\n\n#include \"Aig.h\"\n#include \"CutPool.h\"\n#include \"../functional/FunctionHandler32.h\"\n#include \"galois/Reduction.h\"\n\nnamespace algorithm {\n\ntypedef struct cutList_ {\n\n  Cut** head;\n  Cut** tail;\n\n  cutList_(int K) {\n    head = new Cut*[K + 1];\n    for (int i = 0; i < K + 1; i++) {\n      head[i] = nullptr;\n    }\n\n    tail = new Cut*[K + 1];\n    for (int i = 0; i < K + 1; i++) {\n      tail[i] = nullptr;\n    }\n  }\n\n  ~cutList_() {\n    delete[] head;\n    delete[] tail;\n  }\n\n} CutList;\n\ntypedef struct auxTruth_ {\n\n  unsigned int* truth[4];\n\n  auxTruth_(int nWords) {\n    for (int i = 0; i < 4; i++) {\n      truth[i] = new 
unsigned int[nWords];\n    }\n  }\n\n  ~auxTruth_() {\n    for (int i = 0; i < 4; i++) {\n      delete[] truth[i];\n    }\n  }\n\n} AuxTruth;\n\ntypedef galois::substrate::PerThreadStorage<CutPool> PerThreadCutPool;\ntypedef galois::substrate::PerThreadStorage<CutList> PerThreadCutList;\ntypedef galois::substrate::PerThreadStorage<AuxTruth> PerThreadAuxTruth;\n\nclass CutManager {\n\nprivate:\n  aig::Aig& aig;\n  int K;\n  int C;\n  int nWords;\n  int nNodes;\n  int nThreads;\n  long int cutPoolSize;\n  bool compTruth;\n  long double kcutTime;\n\n  PerThreadCutPool perThreadCutPool;\n  PerThreadCutList perThreadCutList;\n  PerThreadAuxTruth perThreadAuxTruth;\n  Cut** nodeCuts;\n\n  // Cuts Statistics //\n  galois::GAccumulator<long int> nCuts;\n  galois::GAccumulator<long int> nTriv;\n  galois::GAccumulator<long int> nFilt;\n  galois::GAccumulator<long int> nSatu;\n\n  // Runtime Statistics //\n  galois::GAccumulator<long int> mergeTime;\n  galois::GAccumulator<long int> filterTime;\n  galois::GAccumulator<long int> procTwoTime;\n  galois::GAccumulator<long int> compTime;\n  galois::GAccumulator<long int> scheduleTime;\n\n  void computeCutsRec(aig::GNode node, CutPool* cutPool, CutList* cutList,\n                      AuxTruth* auxTruth);\n\n  inline bool processTwoCuts(CutPool* cutPool, CutList* cutList,\n                             AuxTruth* auxTruth, Cut* lhsCut, Cut* rhsCut,\n                             bool lhsPolarity, bool rhsPolarity,\n                             int& currentNumCuts);\n\n  Cut* mergeCuts(CutPool* cutPool, Cut* lhsCut, Cut* rhsCut);\n\n  inline bool cutFilter(CutPool* cutPool, CutList* cutList, Cut* resCut,\n                        int& currentNumCuts);\n\n  inline bool checkCutDominance(Cut* smallerCut, Cut* largerCut);\n\n  inline void commitCuts(int nodeId, CutList* cutList);\n\n  void computeTruth(AuxTruth* auxTruth, Cut* resCut, Cut* lhsCut, Cut* rhsCut,\n                    bool lhsPolarity, bool rhsPolarity);\n\n  inline unsigned 
truthPhase(Cut* resCut, Cut* inCut);\n\npublic:\n  CutManager(aig::Aig& aig, int K, int C, int nThreads, bool compTruth);\n\n  ~CutManager();\n\n  void computeCuts(CutPool* cutPool, CutList* cutList, AuxTruth* auxTruth,\n                   int nodeId, int lhsId, int rhsId, bool lhsPolarity,\n                   bool rhsPolarity);\n\n  void computeCutsRecursively(aig::GNode node);\n\n  unsigned int* readTruth(Cut* cut);\n  void recycleNodeCuts(int nodeId);\n  void printNodeCuts(int nodeId, long int& counter);\n  void printAllCuts();\n  void printCutStatistics();\n  void printRuntimes();\n\n  aig::Aig& getAig();\n  int getK();\n  int getC();\n  int getNWords();\n  int getNThreads();\n  bool getCompTruthFlag();\n  long double getKcutTime();\n  void setKcutTime(long double time);\n  PerThreadCutPool& getPerThreadCutPool();\n  PerThreadCutList& getPerThreadCutList();\n  PerThreadAuxTruth& getPerThreadAuxTruth();\n  Cut** getNodeCuts();\n};\n\n// Function that runs the KCut operator defined at the end of CutManager.cpp\n// //\nvoid runKCutOperator(CutManager& cutMan);\n\n} /* namespace algorithm */\n\n#endif /* CUTMANAGER_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/CutPool.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#include \"CutPool.h\"\n#include \"../functional/FunctionHandler32.h\"\n\n#include <iostream>\n#include <cstdlib>\n#include <cstring>\n#include <cassert>\n\nnamespace algorithm {\n\nCutPool::CutPool(long int initialSize, int k, bool compTruth) {\n  this->blockSize              = initialSize;\n  this->k                      = k;\n  std::size_t initialEntrySize = sizeof(Cut) + (k * sizeof(int));\n  if (compTruth) {\n    initialEntrySize += (Functional32::wordNum(k) * sizeof(unsigned int));\n  }\n  // We need to pad the allocation to make sure alignment constraints\n  // are still followed, however the computation used to do that\n  // assumes the conditions in this assertion.\n  static_assert(alignof(Cut) >= 
alignof(int) &&\n                alignof(Cut) >= alignof(unsigned));\n  constexpr std::size_t alignment = alignof(Cut);\n  std::size_t remainder           = initialEntrySize % alignment;\n  std::size_t padding             = remainder ? (alignment - remainder) : 0;\n  this->entrySize                 = initialEntrySize + padding;\n  this->entriesUsed               = 0;\n  this->entriesAlloc              = 0;\n  this->entriesFree               = nullptr;\n}\n\nCutPool::~CutPool() {\n  for (char* ptr : this->blocks) {\n    free(ptr);\n  }\n}\n\ninline void CutPool::alloc() {\n\n  this->entriesFree =\n      (char*)malloc((long int)(this->entrySize * this->blockSize));\n\n  if (this->entriesFree == nullptr) {\n    std::cout << \"Error: memory could not be allocated by CutPool!\"\n              << std::endl;\n    exit(1);\n  }\n\n  char* pTemp = this->entriesFree;\n\n  for (int i = 1; i < this->blockSize; i++) {\n    *((char**)pTemp) = pTemp + this->entrySize;\n    pTemp += this->entrySize;\n  }\n\n  *((char**)pTemp) = nullptr;\n\n  this->entriesAlloc += this->blockSize;\n  this->blocks.push_back(this->entriesFree);\n}\n\nCut* CutPool::getMemory() {\n\n  if (this->entriesUsed == this->entriesAlloc) {\n    assert(this->entriesFree == nullptr);\n    alloc();\n  }\n\n  this->entriesUsed++;\n  char* pTemp       = this->entriesFree;\n  this->entriesFree = *((char**)pTemp);\n\n  Cut* cut = (Cut*)pTemp;\n  memset(cut, 0, this->entrySize);\n  cut->nextCut = nullptr;\n\n  return cut;\n}\n\nvoid CutPool::giveBackMemory(Cut* cut) {\n\n  this->entriesUsed--;\n  char* pTemp       = (char*)cut;\n  *((char**)pTemp)  = this->entriesFree;\n  this->entriesFree = pTemp;\n}\n\nint CutPool::getNumBlocks() { return this->blocks.size(); }\n\nint CutPool::getBlockSize() { return this->blockSize; }\n\n} /* namespace algorithm */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/CutPool.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#ifndef CUTPOOL_H_\n#define CUTPOOL_H_\n\n#include <vector>\n\nnamespace algorithm {\n\n// The size of the leaves is defined acording the parameter Kk, during the\n// memory allocation in the CutPool.cpp\ntypedef struct cut_ {\n  unsigned int sig;\n  short int nLeaves;\n  struct cut_* nextCut;\n  int leaves[0];\n} Cut;\n\nclass CutPool {\n\nprivate:\n  long int blockSize;\n  int k;\n  int entrySize;\n  long int entriesUsed;\n  long int entriesAlloc;\n  char* entriesFree;\n  std::vector<char*> blocks;\n\n  void alloc();\n\npublic:\n  CutPool(long int initialSize, int k, bool compTruth);\n\n  ~CutPool();\n\n  Cut* getMemory();\n\n  void giveBackMemory(Cut* cut);\n\n  int getNumBlocks();\n\n  int 
getBlockSize();\n};\n\n} /* namespace algorithm */\n\n#endif /* CUTPOOL_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/NPNManager.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#include \"NPNManager.h\"\n\n#include <stdlib.h>\n#include <string.h>\n#include <assert.h>\n\nnamespace algorithm {\n\n/*\n * Static attribute with precomputed data\n * the following 135 practical NPN classes of 4-variable functions were computed\n * by considering all 4-input cuts appearing in IWLS, MCNC, and ISCAS benchmarks\n *  */\nconst unsigned short NPNManager::rewritePracticalClasses[136] = {\n    0x0000, 0x0001, 0x0003, 0x0006, 0x0007, 0x000f, 0x0016, 0x0017, 0x0018,\n    0x0019, 0x001b, 0x001e, 0x001f, 0x003c, 0x003d, 0x003f, 0x0069, 0x006b,\n    0x006f, 0x007e, 0x007f, 0x00ff, 0x0116, 0x0118, 0x0119, 0x011a, 0x011b,\n    0x011e, 0x011f, 0x012c, 0x012d, 0x012f, 0x013c, 0x013d, 0x013e, 0x013f,\n    
0x0168, 0x0169, 0x016f, 0x017f, 0x0180, 0x0181, 0x0182, 0x0183, 0x0186,\n    0x0189, 0x018b, 0x018f, 0x0198, 0x0199, 0x019b, 0x01a8, 0x01a9, 0x01aa,\n    0x01ab, 0x01ac, 0x01ad, 0x01ae, 0x01af, 0x01bf, 0x01e9, 0x01ea, 0x01eb,\n    0x01ee, 0x01ef, 0x01fe, 0x033c, 0x033d, 0x033f, 0x0356, 0x0357, 0x0358,\n    0x0359, 0x035a, 0x035b, 0x035f, 0x0368, 0x0369, 0x036c, 0x036e, 0x037d,\n    0x03c0, 0x03c1, 0x03c3, 0x03c7, 0x03cf, 0x03d4, 0x03d5, 0x03d7, 0x03d8,\n    0x03d9, 0x03dc, 0x03dd, 0x03de, 0x03fc, 0x0660, 0x0661, 0x0666, 0x0669,\n    0x066f, 0x0676, 0x067e, 0x0690, 0x0696, 0x0697, 0x069f, 0x06b1, 0x06b6,\n    0x06f0, 0x06f2, 0x06f6, 0x06f9, 0x0776, 0x0778, 0x07b0, 0x07b1, 0x07b4,\n    0x07bc, 0x07f0, 0x07f2, 0x07f8, 0x0ff0, 0x1683, 0x1696, 0x1698, 0x169e,\n    0x16e9, 0x178e, 0x17e8, 0x18e7, 0x19e6, 0x1be4, 0x1ee1, 0x3cc3, 0x6996,\n    0x0000};\n\n/* Computes NPN canonical forms for 4-variable functions */\nNPNManager::NPNManager() {\n\n  unsigned uTruth, phase, uPerm;\n  int nFuncsAux, nClasses;\n  int i, k;\n\n  nFuncs    = (1 << 16);\n  nFuncsAux = nFuncs;\n\n  canons = (unsigned short*)malloc(sizeof(unsigned short) * nFuncsAux);\n  memset(canons, 0, sizeof(unsigned short) * nFuncsAux);\n\n  phases = (char*)malloc(sizeof(char) * nFuncsAux);\n  memset(phases, 0, sizeof(char) * nFuncsAux);\n\n  perms = (char*)malloc(sizeof(char) * nFuncsAux);\n  memset(perms, 0, sizeof(char) * nFuncsAux);\n\n  map = (unsigned char*)malloc(sizeof(unsigned char) * nFuncsAux);\n  memset(map, 0, sizeof(unsigned char) * nFuncsAux);\n\n  // mapInt will filled during the processing of precomputed graphs\n  mapInv = (unsigned short*)malloc(sizeof(unsigned short) * 222);\n  memset(mapInv, 0, sizeof(unsigned short) * 222);\n\n  perms4 = getPermutations(4);\n\n  practical = (char*)malloc(sizeof(char) * nFuncsAux);\n  memset(practical, 0, sizeof(char) * nFuncsAux);\n  initializePractical();\n\n  nClasses  = 1;\n  nFuncsAux = (1 << 15);\n\n  for (uTruth = 1; uTruth < (unsigned)nFuncsAux; 
uTruth++) {\n\n    // skip already assigned\n    if (canons[uTruth]) {\n      assert(uTruth > canons[uTruth]);\n      map[~uTruth & 0xFFFF] = map[uTruth] = map[canons[uTruth]];\n      continue;\n    }\n\n    map[uTruth] = nClasses++;\n\n    for (i = 0; i < 16; i++) {\n      phase = truthPolarize(uTruth, i, 4);\n      for (k = 0; k < 24; k++) {\n        uPerm = truthPermute(phase, perms4[k], 4, 0);\n        if (canons[uPerm] == 0) {\n          canons[uPerm] = uTruth;\n          phases[uPerm] = i;\n          perms[uPerm]  = k;\n\n          uPerm         = ~uPerm & 0xFFFF;\n          canons[uPerm] = uTruth;\n          phases[uPerm] = i | 16;\n          perms[uPerm]  = k;\n        } else {\n          assert(canons[uPerm] == uTruth);\n        }\n      }\n      phase = truthPolarize(~uTruth & 0xFFFF, i, 4);\n      for (k = 0; k < 24; k++) {\n        uPerm = truthPermute(phase, perms4[k], 4, 0);\n        if (canons[uPerm] == 0) {\n          canons[uPerm] = uTruth;\n          phases[uPerm] = i;\n          perms[uPerm]  = k;\n\n          uPerm         = ~uPerm & 0xFFFF;\n          canons[uPerm] = uTruth;\n          phases[uPerm] = i | 16;\n          perms[uPerm]  = k;\n        } else {\n          assert(canons[uPerm] == uTruth);\n        }\n      }\n    }\n  }\n\n  phases[(1 << 16) - 1] = 16;\n  assert(nClasses == 222);\n}\n\nNPNManager::~NPNManager() {\n  free(phases);\n  free(perms);\n  free(map);\n  free(mapInv);\n  free(canons);\n  free(perms4);\n  free(practical);\n}\n\nchar** NPNManager::getPermutations(int n) {\n\n  char Array[50];\n  char** pRes;\n  int nFact, i;\n  // allocate memory\n  nFact = factorial(n);\n  pRes  = (char**)arrayAlloc(nFact, n, sizeof(char));\n  // fill in the permutations\n  for (i = 0; i < n; i++) {\n    Array[i] = i;\n  }\n  getPermutationsRec(pRes, nFact, n, Array);\n\n  return pRes;\n}\n\n/* Fills in the array of permutations */\nvoid NPNManager::getPermutationsRec(char** pRes, int nFact, int n,\n                                    char 
Array[]) {\n\n  char** pNext;\n  int nFactNext;\n  int iTemp, iCur, iLast, k;\n\n  if (n == 1) {\n    pRes[0][0] = Array[0];\n    return;\n  }\n\n  // get the next factorial\n  nFactNext = nFact / n;\n  // get the last entry\n  iLast = n - 1;\n\n  for (iCur = 0; iCur < n; iCur++) {\n    // swap Cur and Last\n    iTemp        = Array[iCur];\n    Array[iCur]  = Array[iLast];\n    Array[iLast] = iTemp;\n\n    // get the pointer to the current section\n    pNext = pRes + (n - 1 - iCur) * nFactNext;\n\n    // set the last entry\n    for (k = 0; k < nFactNext; k++)\n      pNext[k][iLast] = Array[iLast];\n\n    // call recursively for this part\n    getPermutationsRec(pNext, nFactNext, n - 1, Array);\n\n    // swap them back\n    iTemp        = Array[iCur];\n    Array[iCur]  = Array[iLast];\n    Array[iLast] = iTemp;\n  }\n}\n\n/* Permutes the given vector of minterms. */\nvoid NPNManager::truthPermuteInt(int* pMints, int nMints, char* pPerm,\n                                 int nVars, int* pMintsP) {\n\n  int m, v;\n  // clean the storage for minterms\n  memset(pMintsP, 0, sizeof(int) * nMints);\n  // go through minterms and add the variables\n  for (m = 0; m < nMints; m++)\n    for (v = 0; v < nVars; v++)\n      if (pMints[m] & (1 << v))\n        pMintsP[m] |= (1 << pPerm[v]);\n}\n\n/* Permutes the function. 
*/\nunsigned NPNManager::truthPermute(unsigned Truth, char* pPerms, int nVars,\n                                  int fReverse) {\n\n  unsigned Result;\n  int* pMints;\n  int* pMintsP;\n  int nMints;\n  int i, m;\n\n  assert(nVars < 6);\n  nMints  = (1 << nVars);\n  pMints  = (int*)malloc(sizeof(int) * nMints);\n  pMintsP = (int*)malloc(sizeof(int) * nMints);\n  for (i = 0; i < nMints; i++)\n    pMints[i] = i;\n\n  truthPermuteInt(pMints, nMints, pPerms, nVars, pMintsP);\n\n  Result = 0;\n  if (fReverse) {\n    for (m = 0; m < nMints; m++) {\n      if (Truth & (1 << pMintsP[m])) {\n        Result |= (1 << m);\n      }\n    }\n  } else {\n    for (m = 0; m < nMints; m++) {\n      if (Truth & (1 << m)) {\n        Result |= (1 << pMintsP[m]);\n      }\n    }\n  }\n\n  free(pMints);\n  free(pMintsP);\n\n  return Result;\n}\n\n/* Changes the phase of the function. */\nunsigned NPNManager::truthPolarize(unsigned uTruth, int Polarity, int nVars) {\n\n  // elementary truth tables\n  static unsigned Signs[5] = {\n      0xAAAAAAAA, // 1010 1010 1010 1010 1010 1010 1010 1010\n      0xCCCCCCCC, // 1100 1100 1100 1100 1100 1100 1100 1100\n      0xF0F0F0F0, // 1111 0000 1111 0000 1111 0000 1111 0000\n      0xFF00FF00, // 1111 1111 0000 0000 1111 1111 0000 0000\n      0xFFFF0000  // 1111 1111 1111 1111 0000 0000 0000 0000\n  };\n\n  unsigned uCof0, uCof1;\n  int Shift, v;\n  assert(nVars < 6);\n\n  for (v = 0; v < nVars; v++) {\n    if (Polarity & (1 << v)) {\n      uCof0 = uTruth & ~Signs[v];\n      uCof1 = uTruth & Signs[v];\n      Shift = (1 << v);\n      uCof0 <<= Shift;\n      uCof1 >>= Shift;\n      uTruth = uCof0 | uCof1;\n    }\n  }\n  return uTruth;\n}\n\nvoid NPNManager::initializePractical() {\n\n  int i;\n  this->practical[0] = 1;\n  for (i = 1;; i++) {\n    if (rewritePracticalClasses[i] == 0) {\n      break;\n    }\n    this->practical[rewritePracticalClasses[i]] = 1;\n  }\n}\n\n/* Allocated one-memory-chunk array. 
*/\nvoid** NPNManager::arrayAlloc(int nCols, int nRows, int Size) {\n\n  void** pRes;\n  char* pBuffer;\n  int i;\n  assert(nCols > 0 && nRows > 0 && Size > 0);\n  pBuffer =\n      (char*)malloc(sizeof(char) * (nCols * (sizeof(void*) + nRows * Size)));\n  pRes    = (void**)pBuffer;\n  pRes[0] = pBuffer + nCols * sizeof(void*);\n  for (i = 1; i < nCols; i++) {\n    pRes[i] = (void*)((char*)pRes[0] + i * nRows * Size);\n  }\n  return pRes;\n}\n\nint NPNManager::factorial(int n) {\n\n  int res = 1;\n  for (int i = 1; i <= n; i++) {\n    res *= i;\n  }\n  return res;\n}\n\nint NPNManager::getNFuncs() { return this->nFuncs; }\n\nunsigned short* NPNManager::getCanons() { return this->canons; }\n\nchar* NPNManager::getPhases() { return this->phases; }\n\nchar* NPNManager::getPerms() { return this->perms; }\n\nchar* NPNManager::getPractical() { return this->practical; }\n\nunsigned char* NPNManager::getMap() { return this->map; }\n\nunsigned short* NPNManager::getMapInv() { return this->mapInv; }\n\nchar** NPNManager::getPerms4() { return this->perms4; }\n\n} /* namespace algorithm */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/NPNManager.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#ifndef NPNMANAGER_H_\n#define NPNMANAGER_H_\n\n#include <stdlib.h>\n#include <string.h>\n#include <assert.h>\n\nnamespace algorithm {\n\nclass NPNManager {\n\nprivate:\n  int nFuncs;\n  char* phases;           // canonical phases\n  char* perms;            // canonical permutations\n  unsigned char* map;     // mapping of functions into class numbers\n  unsigned short* mapInv; // mapping of classes into functions\n  unsigned short* canons; // canonical forms\n  char** perms4;          // four-var permutations\n  char* practical;        // practical NPN classes\n  static const unsigned short rewritePracticalClasses[136];\n\n  char** getPermutations(int n);\n  void getPermutationsRec(char** pRes, int nFact, int 
n, char Array[]);\n  void truthPermuteInt(int* pMints, int nMints, char* pPerm, int nVars,\n                       int* pMintsP);\n  unsigned truthPermute(unsigned Truth, char* pPerms, int nVars, int fReverse);\n  unsigned truthPolarize(unsigned uTruth, int Polarity, int nVars);\n  void initializePractical();\n  void** arrayAlloc(int nCols, int nRows, int Size);\n  int factorial(int n);\n\npublic:\n  NPNManager();\n\n  ~NPNManager();\n\n  int getNFuncs();\n  unsigned short* getCanons();\n  char* getPhases();\n  char* getPerms();\n  char* getPractical();\n  unsigned char* getMap();\n  unsigned short* getMapInv();\n  char** getPerms4();\n};\n\n} /* namespace algorithm */\n\n#endif /* NPNMANAGER_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/PreCompGraphManager.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#include \"PreCompGraphManager.h\"\n#include <iostream>\n#include <assert.h>\n#include <stdlib.h>\n#include <string.h>\n\nnamespace algorithm {\n\nconst unsigned short PreCompGraphManager::aigSubgraphs[3562] = {\n    0x0008, 0x0002, 0x000a, 0x0002, 0x0008, 0x0003, 0x000a, 0x0003, 0x0009,\n    0x0002, 0x000c, 0x0002, 0x000e, 0x0002, 0x000c, 0x0003, 0x000e, 0x0003,\n    0x000d, 0x0002, 0x000c, 0x0004, 0x000e, 0x0004, 0x000c, 0x0005, 0x000e,\n    0x0005, 0x000d, 0x0004, 0x0010, 0x0002, 0x0012, 0x0002, 0x0010, 0x0003,\n    0x0012, 0x0003, 0x0011, 0x0002, 0x0010, 0x0004, 0x0012, 0x0004, 0x0010,\n    0x0005, 0x0012, 0x0005, 0x0011, 0x0004, 0x0010, 0x0006, 0x0012, 0x0006,\n    0x0010, 0x0007, 0x0012, 0x0007, 0x0011, 
0x0006, 0x0016, 0x0005, 0x0014,\n    0x0006, 0x0016, 0x0006, 0x0014, 0x0007, 0x0016, 0x0007, 0x0015, 0x0006,\n    0x0014, 0x0008, 0x0016, 0x0008, 0x0014, 0x0009, 0x0016, 0x0009, 0x0015,\n    0x0008, 0x0018, 0x0006, 0x001a, 0x0006, 0x0018, 0x0007, 0x001a, 0x0007,\n    0x0019, 0x0006, 0x0018, 0x0009, 0x001a, 0x0009, 0x0019, 0x0008, 0x001e,\n    0x0005, 0x001c, 0x0006, 0x001e, 0x0006, 0x001c, 0x0007, 0x001e, 0x0007,\n    0x001d, 0x0006, 0x001c, 0x0008, 0x001e, 0x0008, 0x001c, 0x0009, 0x001e,\n    0x0009, 0x001d, 0x0008, 0x0020, 0x0006, 0x0022, 0x0006, 0x0020, 0x0007,\n    0x0022, 0x0007, 0x0021, 0x0006, 0x0020, 0x0008, 0x0022, 0x0008, 0x0020,\n    0x0009, 0x0022, 0x0009, 0x0021, 0x0008, 0x0024, 0x0006, 0x0026, 0x0006,\n    0x0024, 0x0007, 0x0026, 0x0007, 0x0025, 0x0006, 0x0026, 0x0008, 0x0024,\n    0x0009, 0x0026, 0x0009, 0x0025, 0x0008, 0x0028, 0x0004, 0x002a, 0x0004,\n    0x0028, 0x0005, 0x002a, 0x0007, 0x0028, 0x0008, 0x002a, 0x0009, 0x0029,\n    0x0008, 0x002a, 0x000b, 0x0029, 0x000a, 0x002a, 0x000f, 0x0029, 0x000e,\n    0x002a, 0x0011, 0x002a, 0x0013, 0x002c, 0x0004, 0x002e, 0x0004, 0x002c,\n    0x0005, 0x002c, 0x0009, 0x002e, 0x0009, 0x002d, 0x0008, 0x002d, 0x000c,\n    0x002e, 0x000f, 0x002e, 0x0011, 0x002e, 0x0012, 0x0030, 0x0004, 0x0032,\n    0x0007, 0x0032, 0x0009, 0x0031, 0x0008, 0x0032, 0x000b, 0x0032, 0x000d,\n    0x0032, 0x000f, 0x0031, 0x000e, 0x0032, 0x0013, 0x0034, 0x0004, 0x0036,\n    0x0004, 0x0034, 0x0005, 0x0036, 0x0005, 0x0035, 0x0004, 0x0036, 0x0008,\n    0x0034, 0x0009, 0x0036, 0x0009, 0x0035, 0x0008, 0x0036, 0x000b, 0x0036,\n    0x000d, 0x0036, 0x0011, 0x0035, 0x0010, 0x0036, 0x0013, 0x0038, 0x0004,\n    0x0039, 0x0004, 0x0038, 0x0009, 0x003a, 0x0009, 0x0039, 0x0008, 0x0038,\n    0x000b, 0x003a, 0x000b, 0x003a, 0x000d, 0x003a, 0x0011, 0x003a, 0x0012,\n    0x0038, 0x0013, 0x003a, 0x0013, 0x003c, 0x0002, 0x003e, 0x0002, 0x003c,\n    0x0003, 0x003e, 0x0005, 0x003e, 0x0007, 0x003c, 0x0008, 0x003e, 0x0008,\n    0x003c, 0x0009, 0x003e, 0x0009, 
0x003d, 0x0008, 0x003e, 0x000d, 0x003e,\n    0x0011, 0x003e, 0x0013, 0x003e, 0x0017, 0x003e, 0x001b, 0x003e, 0x001d,\n    0x0040, 0x0002, 0x0042, 0x0002, 0x0042, 0x0005, 0x0041, 0x0006, 0x0042,\n    0x0008, 0x0041, 0x0008, 0x0042, 0x000d, 0x0042, 0x0011, 0x0042, 0x0015,\n    0x0042, 0x0019, 0x0042, 0x001b, 0x0042, 0x001c, 0x0041, 0x001c, 0x0044,\n    0x0002, 0x0046, 0x0003, 0x0045, 0x0004, 0x0046, 0x0007, 0x0045, 0x0008,\n    0x0046, 0x000b, 0x0046, 0x000f, 0x0046, 0x0013, 0x0045, 0x0012, 0x0046,\n    0x0017, 0x0046, 0x001b, 0x0046, 0x0021, 0x0048, 0x0002, 0x004a, 0x0002,\n    0x0048, 0x0003, 0x004a, 0x0003, 0x0049, 0x0002, 0x0048, 0x0008, 0x004a,\n    0x0008, 0x0048, 0x0009, 0x004a, 0x0009, 0x0049, 0x0008, 0x004a, 0x000b,\n    0x004a, 0x000f, 0x004a, 0x0011, 0x004a, 0x0012, 0x004a, 0x0013, 0x004a,\n    0x0015, 0x004a, 0x0019, 0x004a, 0x001b, 0x004a, 0x001d, 0x004c, 0x0002,\n    0x004c, 0x0003, 0x004d, 0x0002, 0x004c, 0x0008, 0x004e, 0x0008, 0x004c,\n    0x0009, 0x004e, 0x0009, 0x004d, 0x0008, 0x004c, 0x000b, 0x004e, 0x000b,\n    0x004c, 0x000f, 0x004e, 0x000f, 0x004e, 0x0011, 0x004c, 0x0012, 0x004c,\n    0x0013, 0x004e, 0x0013, 0x004e, 0x0015, 0x004c, 0x0017, 0x004e, 0x0019,\n    0x004c, 0x001b, 0x004e, 0x001b, 0x004c, 0x001c, 0x004c, 0x001d, 0x004e,\n    0x001d, 0x0050, 0x0004, 0x0052, 0x0004, 0x0050, 0x0006, 0x0052, 0x0009,\n    0x0052, 0x000d, 0x0052, 0x000f, 0x0052, 0x0013, 0x0052, 0x0017, 0x0052,\n    0x0019, 0x0052, 0x001d, 0x0052, 0x001f, 0x0052, 0x0021, 0x0052, 0x0023,\n    0x0052, 0x0024, 0x0052, 0x0025, 0x0051, 0x0024, 0x0052, 0x0027, 0x0054,\n    0x0004, 0x0056, 0x0004, 0x0054, 0x0005, 0x0056, 0x0006, 0x0054, 0x0007,\n    0x0056, 0x0011, 0x0056, 0x001b, 0x0056, 0x001e, 0x0054, 0x001f, 0x0056,\n    0x001f, 0x0056, 0x0020, 0x0054, 0x0021, 0x0055, 0x0020, 0x0056, 0x0024,\n    0x0054, 0x0025, 0x0056, 0x0025, 0x0055, 0x0024, 0x0054, 0x0027, 0x0056,\n    0x0027, 0x0055, 0x0026, 0x005a, 0x0007, 0x005a, 0x0009, 0x005a, 0x000b,\n    0x005a, 0x0015, 0x005a, 
0x001f, 0x0059, 0x0020, 0x0058, 0x0024, 0x005a,\n    0x0024, 0x005a, 0x0027, 0x0059, 0x0026, 0x005c, 0x0004, 0x005e, 0x0004,\n    0x005c, 0x0005, 0x005e, 0x0006, 0x005c, 0x0007, 0x005d, 0x0006, 0x005e,\n    0x000d, 0x005e, 0x0013, 0x005e, 0x0017, 0x005c, 0x001f, 0x005d, 0x001e,\n    0x005e, 0x0020, 0x005e, 0x0021, 0x005e, 0x0022, 0x005e, 0x0023, 0x005c,\n    0x0024, 0x005e, 0x0024, 0x005c, 0x0025, 0x005e, 0x0025, 0x005d, 0x0024,\n    0x005e, 0x0026, 0x005e, 0x0027, 0x0062, 0x0004, 0x0061, 0x0004, 0x0062,\n    0x0006, 0x0061, 0x0006, 0x0060, 0x000f, 0x0060, 0x0013, 0x0062, 0x0013,\n    0x0060, 0x0019, 0x0062, 0x001c, 0x0060, 0x001d, 0x0062, 0x001d, 0x0062,\n    0x001f, 0x0060, 0x0021, 0x0060, 0x0023, 0x0062, 0x0024, 0x0060, 0x0027,\n    0x0061, 0x0026, 0x0064, 0x0002, 0x0066, 0x0002, 0x0064, 0x0006, 0x0066,\n    0x0007, 0x0066, 0x0009, 0x0066, 0x000d, 0x0066, 0x0013, 0x0066, 0x0015,\n    0x0066, 0x0017, 0x0066, 0x0019, 0x0066, 0x001a, 0x0065, 0x001a, 0x0066,\n    0x001f, 0x0066, 0x0023, 0x0066, 0x0027, 0x0066, 0x002f, 0x0066, 0x0030,\n    0x006a, 0x0002, 0x0068, 0x0003, 0x0068, 0x0006, 0x006a, 0x0006, 0x006a,\n    0x0011, 0x0068, 0x0016, 0x0068, 0x0017, 0x006a, 0x0017, 0x006a, 0x001a,\n    0x006a, 0x001b, 0x006a, 0x0025, 0x006a, 0x002d, 0x006e, 0x0003, 0x006e,\n    0x0007, 0x006e, 0x0009, 0x006e, 0x000b, 0x006e, 0x0015, 0x006e, 0x0016,\n    0x006e, 0x0017, 0x006c, 0x001a, 0x006e, 0x001a, 0x006e, 0x001f, 0x006e,\n    0x002b, 0x006e, 0x0035, 0x0070, 0x0002, 0x0070, 0x0003, 0x0072, 0x0006,\n    0x0070, 0x0007, 0x0071, 0x0006, 0x0072, 0x000b, 0x0072, 0x000f, 0x0072,\n    0x0013, 0x0070, 0x0015, 0x0071, 0x0014, 0x0072, 0x0017, 0x0072, 0x0018,\n    0x0070, 0x0019, 0x0072, 0x0019, 0x0070, 0x001a, 0x0070, 0x001b, 0x0072,\n    0x001b, 0x0071, 0x001a, 0x0072, 0x0021, 0x0072, 0x0029, 0x0076, 0x0002,\n    0x0076, 0x0003, 0x0075, 0x0002, 0x0076, 0x0006, 0x0074, 0x0007, 0x0076,\n    0x0007, 0x0075, 0x0006, 0x0076, 0x000d, 0x0076, 0x0011, 0x0076, 0x0013,\n    0x0075, 0x0014, 
0x0076, 0x0019, 0x0076, 0x001a, 0x0076, 0x001b, 0x0075,\n    0x001c, 0x0074, 0x0023, 0x0075, 0x0022, 0x0074, 0x0026, 0x0076, 0x0026,\n    0x0074, 0x0027, 0x0076, 0x002b, 0x0076, 0x002f, 0x0078, 0x0002, 0x0078,\n    0x0004, 0x007a, 0x0004, 0x007a, 0x0005, 0x0079, 0x0004, 0x007a, 0x0009,\n    0x007a, 0x000a, 0x007a, 0x000b, 0x007a, 0x000d, 0x007a, 0x000f, 0x007a,\n    0x0010, 0x007a, 0x0011, 0x007a, 0x0012, 0x007a, 0x0013, 0x007a, 0x0017,\n    0x007a, 0x001b, 0x007a, 0x0021, 0x007a, 0x0027, 0x007a, 0x002b, 0x007a,\n    0x002f, 0x007a, 0x0030, 0x0079, 0x0034, 0x007a, 0x0039, 0x007a, 0x003a,\n    0x007e, 0x0002, 0x007c, 0x0004, 0x007e, 0x0004, 0x007e, 0x000c, 0x007c,\n    0x000d, 0x007e, 0x0011, 0x007e, 0x0013, 0x007e, 0x001b, 0x007e, 0x0025,\n    0x007e, 0x002d, 0x007e, 0x0037, 0x0082, 0x0003, 0x0082, 0x0005, 0x0082,\n    0x0009, 0x0082, 0x000b, 0x0080, 0x0010, 0x0082, 0x0010, 0x0082, 0x0012,\n    0x0082, 0x0015, 0x0082, 0x001f, 0x0082, 0x002b, 0x0082, 0x0035, 0x0082,\n    0x0039, 0x0082, 0x003f, 0x0084, 0x0002, 0x0086, 0x0002, 0x0084, 0x0003,\n    0x0086, 0x0003, 0x0085, 0x0002, 0x0086, 0x0004, 0x0084, 0x0005, 0x0085,\n    0x0004, 0x0086, 0x000a, 0x0084, 0x000b, 0x0085, 0x000a, 0x0086, 0x000d,\n    0x0086, 0x000e, 0x0086, 0x000f, 0x0084, 0x0010, 0x0084, 0x0011, 0x0086,\n    0x0011, 0x0085, 0x0010, 0x0084, 0x0012, 0x0084, 0x0013, 0x0086, 0x0013,\n    0x0085, 0x0012, 0x0086, 0x0019, 0x0086, 0x0023, 0x0086, 0x0029, 0x0086,\n    0x0033, 0x0086, 0x0039, 0x008a, 0x0003, 0x0089, 0x0002, 0x0088, 0x0004,\n    0x008a, 0x0004, 0x0088, 0x0005, 0x0089, 0x0004, 0x008a, 0x000b, 0x008a,\n    0x0010, 0x0088, 0x0011, 0x008a, 0x0011, 0x0089, 0x0010, 0x0088, 0x0012,\n    0x008a, 0x0012, 0x0089, 0x0012, 0x008a, 0x0017, 0x008a, 0x001b, 0x0089,\n    0x0020, 0x008a, 0x0025, 0x0088, 0x0027, 0x008a, 0x002b, 0x008a, 0x002f,\n    0x008a, 0x0039, 0x0088, 0x003a, 0x008d, 0x0044, 0x0092, 0x0009, 0x0092,\n    0x0025, 0x0092, 0x0029, 0x0092, 0x002d, 0x0092, 0x0033, 0x0092, 0x0037,\n    0x0092, 
0x003d, 0x0092, 0x0041, 0x0095, 0x0002, 0x0095, 0x0004, 0x0095,\n    0x0010, 0x0095, 0x0012, 0x0096, 0x0021, 0x0096, 0x0029, 0x0095, 0x002e,\n    0x0096, 0x0030, 0x0096, 0x0033, 0x0096, 0x003a, 0x0096, 0x0043, 0x009a,\n    0x0008, 0x009a, 0x0009, 0x0099, 0x0008, 0x009a, 0x0011, 0x009a, 0x0023,\n    0x009a, 0x0033, 0x009a, 0x003d, 0x009a, 0x0044, 0x009a, 0x0045, 0x0099,\n    0x0044, 0x009d, 0x0002, 0x009e, 0x0008, 0x009c, 0x0009, 0x009e, 0x0009,\n    0x009d, 0x0008, 0x009e, 0x0011, 0x009d, 0x0010, 0x009e, 0x001f, 0x009e,\n    0x003f, 0x00a0, 0x0009, 0x00a0, 0x0011, 0x00a2, 0x0030, 0x00a2, 0x0033,\n    0x00a6, 0x0006, 0x00a6, 0x0007, 0x00a6, 0x0011, 0x00a6, 0x0044, 0x00a6,\n    0x004b, 0x00aa, 0x0007, 0x00aa, 0x0015, 0x00ae, 0x0006, 0x00ae, 0x0011,\n    0x00ae, 0x001b, 0x00ae, 0x0025, 0x00ae, 0x003d, 0x00ae, 0x0041, 0x00ae,\n    0x0043, 0x00ae, 0x0045, 0x00b2, 0x0006, 0x00b0, 0x0007, 0x00b1, 0x0006,\n    0x00b2, 0x0017, 0x00b1, 0x0016, 0x00b0, 0x0019, 0x00b2, 0x0021, 0x00b2,\n    0x003d, 0x00b5, 0x004a, 0x00ba, 0x0009, 0x00ba, 0x000f, 0x00bc, 0x0009,\n    0x00be, 0x0009, 0x00be, 0x000f, 0x00bd, 0x000e, 0x00be, 0x0017, 0x00c2,\n    0x0009, 0x00c2, 0x0019, 0x00c2, 0x001f, 0x00c2, 0x0033, 0x00c6, 0x0009,\n    0x00c5, 0x000e, 0x00c6, 0x0015, 0x00c6, 0x0023, 0x00c4, 0x002d, 0x00c6,\n    0x002f, 0x00c5, 0x002e, 0x00c6, 0x0045, 0x00ce, 0x0007, 0x00ce, 0x0021,\n    0x00ce, 0x0023, 0x00ce, 0x0025, 0x00ce, 0x0027, 0x00ce, 0x0033, 0x00ce,\n    0x003d, 0x00d2, 0x0006, 0x00d0, 0x0015, 0x00d0, 0x001b, 0x00d2, 0x001b,\n    0x00d1, 0x001a, 0x00d0, 0x001f, 0x00d2, 0x0025, 0x00d1, 0x0024, 0x00d2,\n    0x0037, 0x00d2, 0x0041, 0x00d2, 0x0045, 0x00d9, 0x0044, 0x00e1, 0x0004,\n    0x00e2, 0x000d, 0x00e2, 0x0021, 0x00e0, 0x003a, 0x00e6, 0x003d, 0x00e6,\n    0x0061, 0x00e6, 0x0067, 0x00e9, 0x0004, 0x00ea, 0x0008, 0x00ea, 0x0009,\n    0x00ea, 0x0039, 0x00e9, 0x0038, 0x00ea, 0x003f, 0x00ec, 0x000d, 0x00ee,\n    0x000d, 0x00ee, 0x0037, 0x00f2, 0x003d, 0x00f2, 0x0062, 0x00f5, 0x0002,\n    
0x00fa, 0x0017, 0x00fa, 0x003d, 0x00fe, 0x0006, 0x00fd, 0x0006, 0x00fc,\n    0x0015, 0x00fe, 0x001b, 0x00fc, 0x0025, 0x00fe, 0x0025, 0x00fd, 0x0024,\n    0x00fe, 0x0041, 0x00fe, 0x004d, 0x00fd, 0x004e, 0x0101, 0x0014, 0x0106,\n    0x004d, 0x010a, 0x0009, 0x010a, 0x000b, 0x0109, 0x000a, 0x010a, 0x004f,\n    0x010a, 0x0058, 0x010e, 0x0008, 0x010c, 0x0009, 0x010e, 0x0009, 0x010d,\n    0x0008, 0x010e, 0x000b, 0x010e, 0x002b, 0x010d, 0x002a, 0x010e, 0x0035,\n    0x010e, 0x003d, 0x010e, 0x003f, 0x010e, 0x0049, 0x010e, 0x0057, 0x010d,\n    0x0056, 0x010d, 0x0058, 0x0111, 0x0004, 0x0111, 0x0006, 0x0110, 0x0009,\n    0x0112, 0x0009, 0x0111, 0x0008, 0x0112, 0x002f, 0x0110, 0x0035, 0x0110,\n    0x0037, 0x0112, 0x0039, 0x0112, 0x003d, 0x0112, 0x003f, 0x0112, 0x0045,\n    0x0111, 0x0044, 0x0112, 0x004b, 0x0112, 0x0059, 0x0112, 0x0069, 0x0112,\n    0x007f, 0x0116, 0x0009, 0x0115, 0x0008, 0x0114, 0x000b, 0x0116, 0x000b,\n    0x0116, 0x0058, 0x011a, 0x0015, 0x011a, 0x001f, 0x011a, 0x002b, 0x011a,\n    0x003f, 0x011a, 0x0049, 0x011a, 0x0085, 0x011e, 0x0007, 0x011e, 0x0019,\n    0x011e, 0x001b, 0x011e, 0x0023, 0x011e, 0x0027, 0x011e, 0x002f, 0x011e,\n    0x0043, 0x011e, 0x004b, 0x011e, 0x004e, 0x011e, 0x004f, 0x011e, 0x005f,\n    0x011e, 0x0061, 0x011e, 0x0065, 0x011e, 0x0083, 0x0122, 0x0006, 0x0120,\n    0x0007, 0x0122, 0x0007, 0x0121, 0x0006, 0x0122, 0x0049, 0x0121, 0x004e,\n    0x0122, 0x008f, 0x0125, 0x0004, 0x0124, 0x0007, 0x0125, 0x0006, 0x0124,\n    0x001b, 0x0126, 0x001b, 0x0126, 0x0045, 0x0126, 0x0087, 0x0128, 0x0007,\n    0x0129, 0x0006, 0x012a, 0x0019, 0x012a, 0x003d, 0x012a, 0x0051, 0x012a,\n    0x0065, 0x012a, 0x0083, 0x012d, 0x005a, 0x0132, 0x0009, 0x0132, 0x008f,\n    0x0134, 0x0009, 0x0135, 0x003e, 0x013a, 0x003d, 0x013a, 0x0044, 0x0139,\n    0x0044, 0x013e, 0x0009, 0x013d, 0x0008, 0x013c, 0x003d, 0x013c, 0x0044,\n    0x013c, 0x0053, 0x013e, 0x008f, 0x013e, 0x0095, 0x0142, 0x0044, 0x0142,\n    0x0097, 0x0142, 0x009e, 0x0144, 0x0007, 0x0148, 0x0015, 0x0148, 0x001c,\n  
  0x0148, 0x001f, 0x0148, 0x0026, 0x0149, 0x0086, 0x014d, 0x0006, 0x014e,\n    0x0044, 0x014d, 0x0048, 0x014e, 0x009e, 0x0152, 0x0009, 0x0151, 0x00a6,\n    0x0155, 0x0030, 0x015d, 0x003a, 0x0162, 0x009e, 0x0164, 0x000f, 0x0164,\n    0x0013, 0x0169, 0x000e, 0x0174, 0x0009, 0x0179, 0x0008, 0x0180, 0x0009,\n    0x0181, 0x0044, 0x0186, 0x0044, 0x0185, 0x0044, 0x018a, 0x0068, 0x0195,\n    0x004e, 0x01a6, 0x0009, 0x01a5, 0x0008, 0x01b1, 0x003a, 0x01c4, 0x0029,\n    0x01c4, 0x0030, 0x01ca, 0x008f, 0x01ca, 0x0095, 0x01cc, 0x0029, 0x01cc,\n    0x0033, 0x01ce, 0x003d, 0x01d6, 0x00b2, 0x01d8, 0x0009, 0x01d9, 0x002a,\n    0x01d9, 0x0056, 0x01d9, 0x00a4, 0x01dd, 0x003a, 0x01e2, 0x00b2, 0x01e6,\n    0x0013, 0x01e6, 0x009f, 0x01e6, 0x00ba, 0x01e6, 0x00c0, 0x01e6, 0x00d3,\n    0x01e6, 0x00d5, 0x01e6, 0x00e5, 0x01e8, 0x0005, 0x01f2, 0x0013, 0x01f2,\n    0x0095, 0x01f2, 0x009f, 0x01f2, 0x00ba, 0x01f2, 0x00c0, 0x01f2, 0x00d3,\n    0x0202, 0x008f, 0x0202, 0x0095, 0x0202, 0x00f3, 0x0202, 0x00f9, 0x020a,\n    0x0044, 0x0209, 0x00b4, 0x020e, 0x0009, 0x020d, 0x0008, 0x020c, 0x003d,\n    0x020c, 0x0044, 0x020c, 0x0053, 0x020e, 0x008f, 0x020e, 0x0095, 0x020c,\n    0x00b1, 0x020e, 0x00f3, 0x020e, 0x00f9, 0x0210, 0x0013, 0x0211, 0x0024,\n    0x0210, 0x0026, 0x0219, 0x0004, 0x021e, 0x008f, 0x021e, 0x0095, 0x0221,\n    0x003a, 0x0230, 0x0009, 0x0236, 0x0009, 0x0234, 0x0029, 0x0234, 0x0030,\n    0x0234, 0x0033, 0x0234, 0x003a, 0x0234, 0x003d, 0x0234, 0x0044, 0x0235,\n    0x00a6, 0x023a, 0x0009, 0x023d, 0x003a, 0x0245, 0x0044, 0x0249, 0x003a,\n    0x024e, 0x009e, 0x024e, 0x0106, 0x0251, 0x0026, 0x0258, 0x0013, 0x0259,\n    0x0024, 0x0258, 0x0061, 0x0259, 0x0086, 0x0258, 0x00c7, 0x0258, 0x00df,\n    0x0259, 0x00ec, 0x0258, 0x00fc, 0x025d, 0x0024, 0x025d, 0x00de, 0x0260,\n    0x00f6, 0x0268, 0x0009, 0x0269, 0x0044, 0x0268, 0x00f3, 0x0268, 0x00f9,\n    0x026d, 0x003a, 0x0270, 0x0068, 0x0275, 0x003a, 0x027a, 0x0044, 0x0279,\n    0x0044, 0x027e, 0x007e, 0x0281, 0x0044, 0x0285, 0x0008, 0x028d, 
0x0006,\n    0x028d, 0x00d2, 0x0295, 0x00cc, 0x0296, 0x00f6, 0x0295, 0x00f8, 0x0299,\n    0x0030, 0x029e, 0x007e, 0x029d, 0x0080, 0x02a6, 0x008f, 0x02a6, 0x0095,\n    0x02aa, 0x0029, 0x02aa, 0x0030, 0x02b5, 0x0008, 0x02b9, 0x003a, 0x02bd,\n    0x0004, 0x02bd, 0x00fc, 0x02c2, 0x00b2, 0x02c1, 0x00b4, 0x02c4, 0x0029,\n    0x02c8, 0x0029, 0x02c8, 0x0033, 0x02ca, 0x003d, 0x02ce, 0x0029, 0x02ce,\n    0x0030, 0x02d2, 0x0068, 0x02d1, 0x006a, 0x02d5, 0x006a, 0x02d9, 0x0008,\n    0x02de, 0x012c, 0x02e2, 0x012c, 0x02e4, 0x0009, 0x02e5, 0x002a, 0x02e5,\n    0x0056, 0x02e5, 0x012c, 0x02ea, 0x0029, 0x02ea, 0x0030, 0x02e9, 0x0030,\n    0x02ec, 0x0029, 0x02ec, 0x0030, 0x02ee, 0x012c, 0x02f1, 0x0068, 0x02f1,\n    0x00b2, 0x02f1, 0x0108, 0x02f1, 0x012c, 0x02f6, 0x0013, 0x02f6, 0x0015,\n    0x02f6, 0x001f, 0x02f6, 0x0030, 0x02f6, 0x0065, 0x02f6, 0x0067, 0x02f6,\n    0x009f, 0x02f6, 0x00b6, 0x02f6, 0x00b9, 0x02f6, 0x00c0, 0x02f6, 0x00cf,\n    0x02f6, 0x0107, 0x02f6, 0x010b, 0x02f6, 0x010f, 0x02f6, 0x0115, 0x02f6,\n    0x012d, 0x02f6, 0x0134, 0x02f6, 0x0153, 0x02f6, 0x0171, 0x02f6, 0x0176,\n    0x02f8, 0x0003, 0x02fa, 0x017b, 0x02fc, 0x00ba, 0x02fc, 0x00d3, 0x0302,\n    0x0013, 0x0302, 0x001f, 0x0302, 0x0030, 0x0302, 0x005d, 0x0302, 0x0065,\n    0x0302, 0x0067, 0x0302, 0x0099, 0x0302, 0x009f, 0x0302, 0x00ad, 0x0302,\n    0x00b9, 0x0302, 0x00c0, 0x0302, 0x00cf, 0x0301, 0x00d2, 0x0301, 0x00fe,\n    0x0302, 0x0107, 0x0302, 0x010b, 0x0302, 0x010f, 0x0302, 0x0117, 0x0302,\n    0x0134, 0x0302, 0x0153, 0x0302, 0x0157, 0x0302, 0x0176, 0x0306, 0x0029,\n    0x0308, 0x00b2, 0x0309, 0x00dc, 0x030d, 0x00f8, 0x0312, 0x00f3, 0x0318,\n    0x007e, 0x031d, 0x0080, 0x0321, 0x0008, 0x0321, 0x0094, 0x0326, 0x017b,\n    0x0326, 0x0181, 0x0329, 0x012e, 0x032a, 0x017b, 0x032a, 0x0181, 0x032e,\n    0x008f, 0x032e, 0x0095, 0x032e, 0x00f3, 0x032e, 0x00f9, 0x0332, 0x0009,\n    0x0331, 0x0008, 0x0330, 0x003d, 0x0330, 0x0044, 0x0330, 0x0053, 0x0332,\n    0x008f, 0x0332, 0x0095, 0x0330, 0x00b1, 0x0332, 0x00f3, 
0x0332, 0x00f9,\n    0x0330, 0x0127, 0x0332, 0x017b, 0x0332, 0x0181, 0x033c, 0x0013, 0x033c,\n    0x001c, 0x033d, 0x0086, 0x033d, 0x00ec, 0x033d, 0x0172, 0x033e, 0x019d,\n    0x0345, 0x0002, 0x0344, 0x008f, 0x0344, 0x00f3, 0x034d, 0x0030, 0x0352,\n    0x0033, 0x0354, 0x0029, 0x0354, 0x0030, 0x035a, 0x0009, 0x035a, 0x017b,\n    0x035a, 0x019b, 0x035a, 0x01a2, 0x035e, 0x0181, 0x0360, 0x0009, 0x0366,\n    0x0009, 0x0364, 0x0029, 0x0364, 0x0030, 0x0364, 0x0033, 0x0364, 0x003a,\n    0x0364, 0x003d, 0x0364, 0x0044, 0x0369, 0x0030, 0x0370, 0x0029, 0x0370,\n    0x0030, 0x0376, 0x0033, 0x037a, 0x0009, 0x037a, 0x019b, 0x037a, 0x01a2,\n    0x037c, 0x0009, 0x0382, 0x0181, 0x0386, 0x0009, 0x0384, 0x0029, 0x0384,\n    0x0030, 0x0384, 0x0033, 0x0384, 0x003a, 0x0384, 0x003d, 0x0384, 0x0044,\n    0x038a, 0x0044, 0x038a, 0x009e, 0x038a, 0x0106, 0x038a, 0x0198, 0x038d,\n    0x010e, 0x038d, 0x0152, 0x038d, 0x0158, 0x0392, 0x009e, 0x0392, 0x0106,\n    0x0392, 0x0198, 0x0395, 0x0086, 0x0395, 0x009a, 0x0395, 0x00ec, 0x0395,\n    0x0172, 0x0398, 0x014e, 0x0398, 0x0175, 0x0398, 0x018d, 0x039c, 0x0023,\n    0x039c, 0x0027, 0x039c, 0x00ef, 0x039c, 0x0139, 0x039c, 0x0168, 0x03a0,\n    0x0019, 0x03a0, 0x001d, 0x03a0, 0x0023, 0x03a0, 0x0027, 0x03a1, 0x004e,\n    0x03a4, 0x0162, 0x03a4, 0x0183, 0x03a8, 0x0013, 0x03a8, 0x0027, 0x03a8,\n    0x0133, 0x03a8, 0x0148, 0x03a8, 0x0181, 0x03ac, 0x0013, 0x03ac, 0x0027,\n    0x03b0, 0x017b, 0x03b0, 0x0181, 0x03b4, 0x004b, 0x03b4, 0x00e0, 0x03b4,\n    0x00fb, 0x03b8, 0x000f, 0x03b8, 0x0013, 0x03b8, 0x00ab, 0x03b8, 0x00bf,\n    0x03b8, 0x00d0, 0x03bd, 0x00da, 0x03bd, 0x012c, 0x03c8, 0x000f, 0x03c8,\n    0x0013, 0x03c8, 0x0019, 0x03c8, 0x001d, 0x03cd, 0x0086, 0x03cd, 0x00ec,\n    0x03cd, 0x0172, 0x03d2, 0x00e0, 0x03d2, 0x00ef, 0x03d2, 0x0112, 0x03d2,\n    0x0139, 0x03d2, 0x0168, 0x03d6, 0x017b, 0x03d6, 0x0181, 0x03da, 0x0133,\n    0x03da, 0x0148, 0x03e2, 0x0023, 0x03e2, 0x0027, 0x03e6, 0x0027, 0x03e6,\n    0x0181, 0x03ee, 0x017b, 0x03ee, 0x0181, 0x03fe, 
0x003d, 0x0401, 0x012a,\n    0x0401, 0x019e, 0x0405, 0x01a0, 0x040a, 0x000d, 0x040a, 0x011f, 0x040a,\n    0x016f, 0x040d, 0x012a, 0x0412, 0x017b, 0x041a, 0x0033, 0x041a, 0x003d,\n    0x041a, 0x0181, 0x0421, 0x0086, 0x0421, 0x009a, 0x0421, 0x00ec, 0x0421,\n    0x0172, 0x042e, 0x0205, 0x043a, 0x0205, 0x043e, 0x017b, 0x0442, 0x01f5,\n    0x044c, 0x0007, 0x0452, 0x0033, 0x0452, 0x01ce, 0x0452, 0x01d0, 0x0452,\n    0x01f1, 0x0452, 0x01fb, 0x0452, 0x0225, 0x0454, 0x0005, 0x045a, 0x0033,\n    0x045a, 0x0181, 0x045a, 0x01ce, 0x045a, 0x01d0, 0x045a, 0x01f1, 0x0469,\n    0x01de, 0x046e, 0x0181, 0x047a, 0x01ce, 0x047a, 0x01f1, 0x0485, 0x012c,\n    0x0489, 0x012c, 0x0490, 0x01d8, 0x0496, 0x0033, 0x0496, 0x003d, 0x0498,\n    0x008f, 0x0498, 0x00f3, 0x049e, 0x0044, 0x049e, 0x0221, 0x04a1, 0x0006,\n    0x04a2, 0x0044, 0x04a6, 0x0221, 0x04a9, 0x0004, 0x04ac, 0x0027, 0x04b1,\n    0x009a, 0x04b6, 0x0097, 0x04b8, 0x0027, 0x04c6, 0x0219, 0x04ca, 0x017b,\n    0x04cc, 0x004b, 0x04d0, 0x00ab, 0x04d6, 0x017b, 0x04d8, 0x000f, 0x04d8,\n    0x0019, 0x04d8, 0x0033, 0x04d8, 0x003d, 0x04de, 0x003d, 0x04de, 0x0103,\n    0x04de, 0x018b, 0x04de, 0x0231, 0x04e2, 0x0044, 0x04e2, 0x009e, 0x04e2,\n    0x0106, 0x04e2, 0x0198, 0x04e5, 0x01a4, 0x04e5, 0x01b6, 0x04ea, 0x009e,\n    0x04ea, 0x0106, 0x04ea, 0x0198, 0x04ed, 0x002e, 0x04ed, 0x0038, 0x04ed,\n    0x00a2, 0x04f1, 0x0086, 0x04f1, 0x009a, 0x04f1, 0x00ec, 0x04f1, 0x0172,\n    0x04f9, 0x004e, 0x04f8, 0x0229, 0x04f8, 0x022d, 0x0500, 0x023e, 0x0504,\n    0x0217, 0x0510, 0x00f3, 0x0514, 0x0043, 0x0514, 0x004d, 0x0514, 0x00c3,\n    0x0514, 0x013d, 0x0514, 0x0215, 0x0514, 0x0232, 0x0515, 0x0260, 0x0519,\n    0x002a, 0x0518, 0x0030, 0x0518, 0x0067, 0x0518, 0x00c9, 0x0518, 0x01eb,\n    0x0518, 0x01ef, 0x051c, 0x0139, 0x051c, 0x0168, 0x0520, 0x0027, 0x0526,\n    0x014e, 0x0526, 0x0175, 0x0526, 0x018d, 0x052d, 0x0200, 0x0532, 0x0021,\n    0x0532, 0x00bf, 0x0532, 0x00d0, 0x0532, 0x0239, 0x0532, 0x0266, 0x053d,\n    0x0024, 0x053d, 0x00da, 0x054a, 0x000f, 
0x054a, 0x00ab, 0x054a, 0x023a,\n    0x054e, 0x0043, 0x054e, 0x004d, 0x054e, 0x00c3, 0x054e, 0x013d, 0x054e,\n    0x0215, 0x054e, 0x0232, 0x054e, 0x029d, 0x0552, 0x014e, 0x0552, 0x018d,\n    0x0556, 0x00f3, 0x0556, 0x01e4, 0x055a, 0x0299, 0x055d, 0x0086, 0x055d,\n    0x009a, 0x055d, 0x00ec, 0x055d, 0x0172, 0x0566, 0x01dc, 0x0566, 0x02a5,\n    0x056d, 0x020a, 0x057a, 0x003d, 0x057a, 0x01d4, 0x057a, 0x01f3, 0x0579,\n    0x025e, 0x057e, 0x0139, 0x057e, 0x0168, 0x0581, 0x0006, 0x0586, 0x017b,\n    0x0586, 0x0181, 0x0586, 0x028c, 0x0588, 0x0007, 0x058e, 0x0033, 0x058e,\n    0x008f, 0x058e, 0x01d0, 0x058e, 0x027c, 0x0590, 0x0003, 0x0596, 0x0033,\n    0x0596, 0x008f, 0x0596, 0x0095, 0x0596, 0x01d0, 0x0596, 0x027c, 0x05a2,\n    0x026f, 0x05a5, 0x0284, 0x05aa, 0x017b, 0x05ac, 0x0205, 0x05b2, 0x008f,\n    0x05b6, 0x017b, 0x05b8, 0x01da, 0x05c1, 0x0276, 0x05c6, 0x0248, 0x05c8,\n    0x0247, 0x05c8, 0x027e, 0x05cc, 0x003d, 0x05cc, 0x01d4, 0x05cc, 0x01f3,\n    0x05d0, 0x014e, 0x05d0, 0x018d, 0x05da, 0x00f9, 0x05dd, 0x0006, 0x05de,\n    0x0044, 0x05e5, 0x002e, 0x05e6, 0x02f1, 0x05ea, 0x01d4, 0x05ea, 0x01f3,\n    0x05ea, 0x022d, 0x05ed, 0x0002, 0x05f6, 0x0027, 0x05fa, 0x0097, 0x05fc,\n    0x003d, 0x0602, 0x003d, 0x0606, 0x00f3, 0x060a, 0x0027, 0x060e, 0x003d,\n    0x060e, 0x0103, 0x060e, 0x018b, 0x060e, 0x0231, 0x060e, 0x02d1, 0x0611,\n    0x01fc, 0x0611, 0x0234, 0x061a, 0x0287, 0x061d, 0x0214, 0x0621, 0x01d4,\n    0x062a, 0x0027, 0x062a, 0x022d, 0x062e, 0x009e, 0x062e, 0x0106, 0x062e,\n    0x0198, 0x0632, 0x009e, 0x0632, 0x0106, 0x0632, 0x0198, 0x0639, 0x0042,\n    0x0639, 0x00b2, 0x0639, 0x0108, 0x063d, 0x01f8, 0x0641, 0x0086, 0x0641,\n    0x009a, 0x0641, 0x00ec, 0x0641, 0x0172, 0x0645, 0x0044, 0x0649, 0x0042,\n    0x0648, 0x0087, 0x0648, 0x00ed, 0x0648, 0x0173, 0x0649, 0x01a0, 0x0648,\n    0x0241, 0x0648, 0x026f, 0x0648, 0x02df, 0x0648, 0x0307, 0x064c, 0x023a,\n    0x064c, 0x02b3, 0x0651, 0x0062, 0x0650, 0x0217, 0x0651, 0x02ac, 0x0650,\n    0x02d6, 0x0655, 0x0042, 0x065d, 
0x0042, 0x0664, 0x02b1, 0x0664, 0x02ce,\n    0x0669, 0x0238, 0x066d, 0x002a, 0x066c, 0x0039, 0x066d, 0x01f6, 0x066c,\n    0x0213, 0x066c, 0x022e, 0x066d, 0x02a2, 0x066c, 0x02e1, 0x0671, 0x002a,\n    0x0670, 0x0030, 0x0670, 0x0067, 0x0670, 0x00c9, 0x0670, 0x01eb, 0x0670,\n    0x01ef, 0x0670, 0x02c3, 0x0675, 0x0020, 0x0678, 0x0133, 0x0678, 0x0148,\n    0x067c, 0x0027, 0x0681, 0x023a, 0x0684, 0x0021, 0x0684, 0x00bf, 0x0684,\n    0x00d0, 0x0689, 0x01fc, 0x068e, 0x0162, 0x068e, 0x0183, 0x0691, 0x0200,\n    0x0696, 0x0023, 0x0696, 0x00e0, 0x0696, 0x00fb, 0x0696, 0x0268, 0x069a,\n    0x0282, 0x069d, 0x007e, 0x06a2, 0x004b, 0x06a2, 0x023e, 0x06a2, 0x02dc,\n    0x06a6, 0x0097, 0x06aa, 0x02b1, 0x06aa, 0x02ce, 0x06ae, 0x0039, 0x06ae,\n    0x0213, 0x06ae, 0x022e, 0x06ae, 0x02e1, 0x06b2, 0x0162, 0x06b2, 0x0183,\n    0x06b6, 0x0023, 0x06b6, 0x00e0, 0x06b6, 0x00fb, 0x06ba, 0x008f, 0x06ba,\n    0x01e4, 0x06be, 0x034b, 0x06c1, 0x0086, 0x06c1, 0x009a, 0x06c1, 0x00ec,\n    0x06c1, 0x0172, 0x06c6, 0x01da, 0x06c6, 0x0280, 0x06c6, 0x0351, 0x06ce,\n    0x008f, 0x06d2, 0x01e3, 0x06d2, 0x0287, 0x06d2, 0x0353, 0x06d6, 0x027a,\n    0x06d6, 0x029b, 0x06da, 0x0033, 0x06da, 0x01ce, 0x06da, 0x01f1, 0x06de,\n    0x0133, 0x06de, 0x0148, 0x06e2, 0x0021, 0x06e2, 0x00bf, 0x06e2, 0x00d0,\n    0x06e5, 0x023a, 0x06e9, 0x0004, 0x06ee, 0x028c, 0x06ee, 0x0338, 0x06f2,\n    0x0328, 0x06f2, 0x0330, 0x06f4, 0x0005, 0x06f9, 0x01e0, 0x06fe, 0x0328,\n    0x06fe, 0x0330, 0x0702, 0x003d, 0x0702, 0x00f3, 0x0702, 0x0330, 0x0704,\n    0x0003, 0x070a, 0x003d, 0x070a, 0x00f3, 0x070a, 0x01d4, 0x070a, 0x01f3,\n    0x070a, 0x0330, 0x0711, 0x032a, 0x0711, 0x032e, 0x0716, 0x003d, 0x0718,\n    0x0205, 0x0718, 0x0282, 0x071e, 0x00f3, 0x0720, 0x01dc, 0x0720, 0x02a5,\n    0x0726, 0x0324, 0x072a, 0x028a, 0x072a, 0x02a7, 0x0729, 0x031c, 0x0729,\n    0x032a, 0x072e, 0x003d, 0x072e, 0x00f9, 0x072e, 0x022d, 0x072e, 0x0248,\n    0x072e, 0x02e4, 0x0730, 0x003d, 0x0730, 0x0247, 0x0730, 0x02e3, 0x0730,\n    0x0324, 0x0732, 0x0324, 
0x0739, 0x032e, 0x073e, 0x003d, 0x0740, 0x003d,\n    0x0744, 0x027a, 0x0744, 0x029b, 0x0748, 0x0033, 0x0748, 0x01ce, 0x0748,\n    0x01f1, 0x074c, 0x0162, 0x074c, 0x0183, 0x0750, 0x0023, 0x0750, 0x00e0,\n    0x0750, 0x00fb, 0x0755, 0x0246, 0x075a, 0x0095, 0x075a, 0x0397, 0x075d,\n    0x0004, 0x076a, 0x03b3, 0x076d, 0x0002, 0x0772, 0x02fb, 0x0772, 0x0301,\n    0x0772, 0x0315, 0x0772, 0x0397, 0x0776, 0x008f, 0x077e, 0x0027, 0x078a,\n    0x00a1, 0x0792, 0x009d, 0x0792, 0x00c3, 0x0792, 0x02fb, 0x0792, 0x0301,\n    0x0792, 0x0315, 0x0792, 0x03bd, 0x0796, 0x0027, 0x0796, 0x024f, 0x079e,\n    0x009d, 0x07a6, 0x009d, 0x07a6, 0x02fb, 0x07a6, 0x0301, 0x07a6, 0x0315,\n    0x07a6, 0x03bd, 0x07aa, 0x0027, 0x07aa, 0x024f, 0x07ae, 0x009d, 0x07b9,\n    0x004e, 0x07b8, 0x0087, 0x07b8, 0x00ed, 0x07b8, 0x0173, 0x07b8, 0x0197,\n    0x07b9, 0x021a, 0x07b9, 0x02b8, 0x07b9, 0x0364, 0x07be, 0x0029, 0x07be,\n    0x0030, 0x07c0, 0x017b, 0x07c6, 0x017b, 0x07c8, 0x00f3, 0x07ce, 0x00f3,\n    0x07d0, 0x008f, 0x07d6, 0x008f, 0x07d9, 0x01e8, 0x07dd, 0x0292, 0x07e2,\n    0x0053, 0x07e6, 0x008f, 0x07e6, 0x00f3, 0x07e6, 0x017b, 0x07e8, 0x0029,\n    0x07e8, 0x0030, 0x07ec, 0x0021, 0x07ec, 0x02ad, 0x07f2, 0x0181, 0x07f2,\n    0x0315, 0x07f4, 0x0021, 0x07f8, 0x020f, 0x07fd, 0x002e, 0x0800, 0x008f,\n    0x0805, 0x0006, 0x0809, 0x03c2, 0x080d, 0x0084, 0x0812, 0x0009, 0x0811,\n    0x0008, 0x0812, 0x00f3, 0x0812, 0x00f9, 0x0812, 0x017b, 0x0812, 0x0181,\n    0x0814, 0x0033, 0x0818, 0x0023, 0x081c, 0x0285, 0x0826, 0x03bd, 0x082c,\n    0x008f, 0x082c, 0x017b, 0x0832, 0x0043, 0x0832, 0x011b, 0x0832, 0x01b3,\n    0x0832, 0x01c3, 0x0835, 0x032a, 0x0838, 0x0085, 0x0839, 0x032a, 0x083e,\n    0x0049, 0x083d, 0x0084, 0x083e, 0x02fb, 0x083e, 0x0301, 0x083e, 0x0315,\n    0x083e, 0x0397, 0x0842, 0x0009, 0x0841, 0x0008, 0x0844, 0x0009, 0x0846,\n    0x008f, 0x084a, 0x0033, 0x084e, 0x0285, 0x0851, 0x009a, 0x0856, 0x00a1,\n    0x0859, 0x031c, 0x085d, 0x00b2, 0x0861, 0x0012, 0x0861, 0x02cc, 0x0865,\n    0x0058, 0x0865, 
0x007e, 0x0869, 0x004a, 0x0871, 0x0010, 0x0876, 0x003d,\n    0x0879, 0x032c, 0x087e, 0x0089, 0x0882, 0x0229, 0x0882, 0x022d, 0x0882,\n    0x02c7, 0x0882, 0x02cb, 0x0886, 0x0021, 0x0886, 0x02ad, 0x0885, 0x0356,\n    0x088a, 0x0017, 0x088a, 0x020f, 0x0889, 0x0354, 0x088d, 0x009c, 0x0892,\n    0x0089, 0x0895, 0x0246, 0x089a, 0x03bd, 0x089e, 0x008f, 0x089e, 0x02f9,\n    0x089e, 0x0313, 0x08a1, 0x032a, 0x08a6, 0x0053, 0x08a6, 0x0095, 0x08a6,\n    0x0397, 0x08a8, 0x017b, 0x08ad, 0x031a, 0x08b2, 0x017b, 0x08b4, 0x00f3,\n    0x08b5, 0x02a0, 0x08b8, 0x0089, 0x08c1, 0x0024, 0x08c4, 0x00f3, 0x08c9,\n    0x007e, 0x08cd, 0x007c, 0x08cd, 0x0222, 0x08cd, 0x0294, 0x08d1, 0x003a,\n    0x08d6, 0x0009, 0x08d9, 0x003a, 0x08dc, 0x001f, 0x08e0, 0x008f, 0x08e0,\n    0x017b, 0x08e4, 0x0009, 0x08e8, 0x01ed, 0x08ed, 0x031c, 0x08f2, 0x003d,\n    0x08f6, 0x008f, 0x08f6, 0x017b, 0x08fa, 0x0009, 0x08fe, 0x003d, 0x0902,\n    0x01e9, 0x0904, 0x01e9, 0x0904, 0x0381, 0x090a, 0x03b1, 0x090d, 0x031a,\n    0x0910, 0x0299, 0x0914, 0x034b, 0x0919, 0x0008, 0x091c, 0x0033, 0x091c,\n    0x003d, 0x0920, 0x0027, 0x0924, 0x0027, 0x0924, 0x01fb, 0x092a, 0x01ce,\n    0x092a, 0x01f1, 0x092d, 0x031c, 0x0930, 0x001f, 0x0936, 0x00c5, 0x0938,\n    0x00c5, 0x0938, 0x0381, 0x093c, 0x001b, 0x0942, 0x017d, 0x094a, 0x0027,\n    0x094e, 0x0027, 0x094e, 0x01fb, 0x0952, 0x03b1, 0x095a, 0x0029, 0x095a,\n    0x0030, 0x095d, 0x0030, 0x0961, 0x0030, 0x0966, 0x02f9, 0x0966, 0x0313,\n    0x0968, 0x02eb, 0x096d, 0x0008, 0x0970, 0x017b, 0x0974, 0x0033, 0x0979,\n    0x0150, 0x097d, 0x009a, 0x0982, 0x0293, 0x0984, 0x0293, 0x0984, 0x0379,\n    0x098a, 0x02eb, 0x098e, 0x0009, 0x0992, 0x003d, 0x0996, 0x003d, 0x0999,\n    0x0062, 0x099e, 0x003d, 0x09a0, 0x0027, 0x09a5, 0x0144, 0x09a8, 0x02b5,\n    0x09ae, 0x008f, 0x09ae, 0x009d, 0x09b2, 0x004d, 0x09b2, 0x0053, 0x09b2,\n    0x00c3, 0x09b2, 0x013d, 0x09b2, 0x01c5, 0x09b2, 0x0271, 0x09b4, 0x0025,\n    0x09ba, 0x0033, 0x09ba, 0x0079, 0x09bc, 0x0015, 0x09c2, 0x013f, 0x09c4,\n    0x013f, 
0x09c4, 0x0379, 0x09ca, 0x02b5, 0x09cd, 0x0006, 0x09da, 0x0009,\n    0x09d9, 0x0008, 0x09dc, 0x000b, 0x09dc, 0x004f, 0x09dd, 0x0086, 0x09e0,\n    0x0009, 0x09e6, 0x00a1, 0x09e8, 0x0009, 0x09ed, 0x0086, 0x09f2, 0x001f,\n    0x09f2, 0x002f, 0x09f2, 0x0049, 0x09f2, 0x006f, 0x09f2, 0x0085, 0x09f2,\n    0x0091, 0x09f2, 0x00a9, 0x09f2, 0x00d3, 0x09f2, 0x00d7, 0x09f2, 0x011d,\n    0x09f2, 0x0121, 0x09f2, 0x0235, 0x09f2, 0x0393, 0x09f6, 0x0324, 0x09f8,\n    0x0049, 0x09f8, 0x00a9, 0x09f8, 0x011d, 0x09fe, 0x001f, 0x09fe, 0x0029,\n    0x09fe, 0x0033, 0x09fe, 0x003d, 0x09fe, 0x0085, 0x09fe, 0x008f, 0x09fe,\n    0x00d3, 0x0a00, 0x003d, 0x0a06, 0x012d, 0x0a0e, 0x00b3, 0x0a10, 0x000b,\n    0x0a10, 0x0387, 0x0a16, 0x0059, 0x0a18, 0x0009, 0x0a1e, 0x0043, 0x0a24,\n    0x0085, 0x0a2a, 0x0009, 0x0a2d, 0x0008, 0x0a32, 0x028a, 0x0a32, 0x02a7,\n    0x0a31, 0x031c, 0x0a35, 0x032e, 0x0a39, 0x0006, 0x0a3a, 0x0105, 0x0a3a,\n    0x024f, 0x0a3c, 0x0299, 0x0a42, 0x01ed, 0x0a46, 0x0299, 0x0a48, 0x01ed,\n    0x0a4c, 0x0059, 0x0a52, 0x000b, 0x0a52, 0x0387, 0x0a56, 0x000b, 0x0a5e,\n    0x0009, 0x0a60, 0x003d, 0x0a66, 0x0105, 0x0a6a, 0x0195, 0x0a6c, 0x000b,\n    0x0a76, 0x0053, 0x0a78, 0x0009, 0x0a7a, 0x008f, 0x0a82, 0x0299, 0x0a86,\n    0x01ed, 0x0a8a, 0x0027, 0x0a8e, 0x004b, 0x0a92, 0x003d, 0x0a95, 0x0322,\n    0x0a99, 0x0038, 0x0a99, 0x0090, 0x0a9c, 0x0061, 0x0a9c, 0x00c7, 0x0a9c,\n    0x012d, 0x0a9c, 0x016f, 0x0a9c, 0x017d, 0x0a9c, 0x02c9, 0x0a9c, 0x0383,\n    0x0aa1, 0x0010, 0x0aa4, 0x00b3, 0x0aa8, 0x002f, 0x0aac, 0x0027, 0x0ab0,\n    0x004b, 0x0ab4, 0x0043, 0x0ab9, 0x0090, 0x0abd, 0x0010, 0x0ac4, 0x0019,\n    0x0acc, 0x00f5, 0x0acc, 0x022b, 0x0acc, 0x037b, 0x0ad2, 0x008f, 0x0ad2,\n    0x01f1, 0x0ad6, 0x0324, 0x0ad9, 0x0330, 0x0ade, 0x008f, 0x0ade, 0x01f1,\n    0x0ae0, 0x017b, 0x0ae4, 0x008f, 0x0ae9, 0x004e, 0x0aee, 0x0027, 0x0af2,\n    0x028a, 0x0af2, 0x02a7, 0x0af1, 0x031c, 0x0af6, 0x0027, 0x0af9, 0x031c,\n    0x0afe, 0x00e9, 0x0afe, 0x02bb, 0x0b02, 0x000b, 0x0b06, 0x00f5, 0x0b06,\n    
0x022b, 0x0b06, 0x037b, 0x0b0a, 0x003d, 0x0000, 0x0000};\n\nPreCompGraphManager::PreCompGraphManager(NPNManager& npnManager)\n    : npnManager(npnManager), table(npnManager.getNFuncs()), classes(222) {\n\n  this->nTravIds   = 0;\n  this->forestSize = 0;\n  this->forest     = (ForestNode*)malloc(\n      sizeof(ForestNode) *\n      1800); // Value based on the execution of ABC rewrite command.\n  memset(this->forest, 0, sizeof(ForestNode));\n\n  addForestVar(0x0000); // constant 0\n  addForestVar(0xAAAA); // var A\n  addForestVar(0xCCCC); // var B\n  addForestVar(0xF0F0); // var C\n  addForestVar(0xFF00); // var D\n}\n\nPreCompGraphManager::~PreCompGraphManager() {\n\n  if (!this->classes.empty()) {\n    ForestNode* node;\n    DecGraph* decGraph;\n    for (size_t i = 0; i < classes.size(); i++) {\n      for (size_t j = 0; j < classes[i].size(); j++) {\n        node     = classes[i][j];\n        decGraph = (DecGraph*)node->pNext;\n        delete decGraph;\n      }\n    }\n  }\n\n  free(this->forest);\n}\n\nvoid PreCompGraphManager::loadPreCompGraphFromArray() {\n\n  ForestNode *p0, *p1;\n  unsigned Entry0, Entry1;\n  int Level, Volume, fExor;\n  int i;\n\n  // reconstruct the forest\n  for (i = 0;; i++) {\n\n    Entry0 = aigSubgraphs[2 * i + 0];\n    Entry1 = aigSubgraphs[2 * i + 1];\n    if (Entry0 == 0 && Entry1 == 0) {\n      break;\n    }\n    // get EXOR flag\n    fExor = (Entry0 & 1);\n    Entry0 >>= 1;\n    // get the nodes\n    p0 = &(this->forest[Entry0 >> 1]);\n    p1 = &(this->forest[Entry1 >> 1]);\n    // compute the level and volume of the new nodes\n    Level  = 1 + std::max(p0->Level, p1->Level);\n    Volume = 1 + getForestNodeVolume(p0, p1);\n    // set the complemented attributes\n    p0 = forestNodeComplementCond(p0, (Entry0 & 1));\n    p1 = forestNodeComplementCond(p1, (Entry1 & 1));\n    // add the node\n    addForestNode(p0, p1, fExor, Level, Volume + fExor);\n  }\n}\n\nForestNode* PreCompGraphManager::addForestNode(ForestNode* p0, ForestNode* 
p1,\n                                               int fExor, int Level,\n                                               int Volume) {\n\n  unsigned uTruth;\n  // compute truth table, leve, volume\n  if (fExor) {\n    uTruth = (p0->uTruth ^ p1->uTruth);\n  } else {\n    uTruth = (isForestNodeComplement(p0) ? ~forestNodeRegular(p0)->uTruth\n                                         : forestNodeRegular(p0)->uTruth) &\n             (isForestNodeComplement(p1) ? ~forestNodeRegular(p1)->uTruth\n                                         : forestNodeRegular(p1)->uTruth) &\n             0xFFFF;\n  }\n\n  // create the new node\n  ForestNode* pNew = &(this->forest[forestSize]);\n  pNew->Id         = this->forestSize++;\n  pNew->TravId     = 0;\n  pNew->uTruth     = uTruth;\n  pNew->Level      = Level;\n  pNew->Volume     = Volume;\n  pNew->fUsed      = 0;\n  pNew->fExor      = fExor;\n  pNew->p0         = p0;\n  pNew->p1         = p1;\n  pNew->pNext      = NULL;\n\n  // do not add if the node is not essential\n  if (uTruth != this->npnManager.getCanons()[uTruth]) {\n    return pNew;\n  }\n\n  // add to the list\n  addForestNodeToTable(uTruth, pNew);\n\n  return pNew;\n}\n\nForestNode* PreCompGraphManager::addForestVar(unsigned uTruth) {\n\n  ForestNode* pNew = &(this->forest[forestSize]);\n  pNew->Id         = this->forestSize++;\n  pNew->TravId     = 0;\n  pNew->uTruth     = uTruth;\n  pNew->Level      = 0;\n  pNew->Volume     = 0;\n  pNew->fUsed      = 1;\n  pNew->fExor      = 0;\n  pNew->p0         = NULL;\n  pNew->p1         = NULL;\n  pNew->pNext      = NULL;\n  return pNew;\n}\n\nvoid PreCompGraphManager::addForestNodeToTable(unsigned uTruth,\n                                               ForestNode* node) {\n\n  ForestNode** position = &(this->table[uTruth]);\n  ForestNode* temp;\n  // find the last one\n  for (temp = *position; temp; temp = temp->pNext)\n    position = &temp->pNext;\n  // attach at the end\n  *position = node;\n}\n\nint 
PreCompGraphManager::getForestNodeVolume(ForestNode* p0, ForestNode* p1) {\n\n  int volume = 0;\n  incTravId();\n  getVolumeRec(p0, &volume);\n  getVolumeRec(p1, &volume);\n  return volume;\n}\n\nvoid PreCompGraphManager::getVolumeRec(ForestNode* node, int* volume) {\n\n  if (node->fUsed || node->TravId == this->nTravIds) {\n    return;\n  }\n  node->TravId = this->nTravIds;\n  (*volume)++;\n  if (node->fExor) {\n    (*volume)++;\n  }\n  getVolumeRec(forestNodeRegular(node->p0), volume);\n  getVolumeRec(forestNodeRegular(node->p1), volume);\n}\n\nvoid PreCompGraphManager::incTravId() {\n  // no overflow\n  auto result = this->nTravIds++;\n  if (this->nTravIds > result) {\n    return;\n  }\n\n  // overflow detected; reset the counters\n  for (int i = 0; i < this->forestSize; i++) {\n    forest[i].TravId = 0;\n  }\n  this->nTravIds = 1;\n}\n\nbool PreCompGraphManager::isForestNodeComplement(ForestNode* node) {\n  return (bool)(((unsigned long int)node) & 0x1ul);\n}\n\nForestNode* PreCompGraphManager::forestNodeRegular(ForestNode* node) {\n  return (ForestNode*)((unsigned long int)(node) & ~0x1ul);\n}\n\nForestNode* PreCompGraphManager::forestNodeComplement(ForestNode* node) {\n  return (ForestNode*)((unsigned long int)(node) ^ 0x1ul);\n}\n\nForestNode* PreCompGraphManager::forestNodeComplementCond(ForestNode* node,\n                                                          int c) {\n  return (ForestNode*)((unsigned long int)(node) ^ (c));\n}\n\nvoid PreCompGraphManager::processDecompositionGraphs() {\n\n  DecGraph* decGraph;\n  ForestNode* node;\n  unsigned char* map     = this->npnManager.getMap();\n  unsigned short* mapInv = this->npnManager.getMapInv();\n  unsigned short* canons = this->npnManager.getCanons();\n\n  // put the nodes into the structure\n  for (int i = 0; i < this->npnManager.getNFuncs(); i++) {\n\n    if (this->table[i] == NULL) {\n      continue;\n    }\n    // consider all implementations of this function\n    for (node = this->table[i]; node; 
node = node->pNext) {\n      assert(node->uTruth == this->table[i]->uTruth);\n      assert(map[node->uTruth] < 222); // Guaranteed to be >=0 b/c unsigned\n      this->classes[map[node->uTruth]].push_back(node);\n      mapInv[map[node->uTruth]] = canons[node->uTruth];\n    }\n  }\n  // compute decomposition forms for each node and verify them\n  for (size_t i = 0; i < classes.size(); i++) {\n\n    // Print the number of precomputed structures for each of the 222 functions\n    // std::cout << i << \" \" << classes[i].size() << std::endl;\n\n    for (size_t j = 0; j < classes[i].size(); j++) {\n      node        = classes[i][j];\n      decGraph    = processNode(node);\n      node->pNext = (ForestNode*)decGraph;\n      assert(node->uTruth == (decGraph->deriveTruth() & 0xFFFF));\n    }\n  }\n}\n\nDecGraph* PreCompGraphManager::processNode(ForestNode* node) {\n\n  DecGraph* decGraph;\n  DecEdge eRoot;\n  assert(!isForestNodeComplement(node));\n  // consider constant\n  if (node->uTruth == 0) {\n    decGraph = new DecGraph(); // Constant-Zero Graph Constructor\n    return decGraph;\n  }\n  // consider the case of elementary var\n  if (node->uTruth == 0x00FF) {\n    decGraph = new DecGraph(3, 4, 1); // Leaf Graph Constructor\n    return decGraph;\n  }\n  // start the subgraphs\n  decGraph = new DecGraph(4); // 4-Leaves Graph Constructor\n\n  // collect the nodes\n  incTravId();\n  eRoot = processNodeRec(node, decGraph);\n  decGraph->setRootEdge(eRoot);\n\n  return decGraph;\n}\n\nDecEdge PreCompGraphManager::processNodeRec(ForestNode* node,\n                                            DecGraph* decGraph) {\n\n  DecEdge eNode0, eNode1, eNode;\n  // elementary variable\n  if (node->fUsed) {\n    return decGraph->createEdge(node->Id - 1, 0);\n  }\n  // previously visited node\n  if (node->TravId == this->nTravIds) {\n    return decGraph->intToEdge(node->Volume);\n  }\n  node->TravId = this->nTravIds;\n  // solve for children\n  eNode0 = 
processNodeRec(forestNodeRegular(node->p0), decGraph);\n  if (isForestNodeComplement(node->p0)) {\n    eNode0.fCompl = !eNode0.fCompl;\n  }\n  eNode1 = processNodeRec(forestNodeRegular(node->p1), decGraph);\n  if (isForestNodeComplement(node->p1)) {\n    eNode1.fCompl = !eNode1.fCompl;\n  }\n  // create the decomposition node(s)\n  if (node->fExor) {\n    eNode = decGraph->addXorNode(eNode0, eNode1, 0);\n  } else {\n    eNode = decGraph->addAndNode(eNode0, eNode1);\n  }\n  // save the result\n  node->Volume = decGraph->edgeToInt(eNode);\n\n  return eNode;\n}\n\nForestNode* PreCompGraphManager::getForest() { return this->forest; }\n\nstd::vector<ForestNode*>& PreCompGraphManager::getTable() {\n  return this->table;\n}\n\nstd::vector<std::vector<ForestNode*>>& PreCompGraphManager::getClasses() {\n  return this->classes;\n}\n\n// ########################### DECOMPOSITION GRAPH METHODS\n// ########################### //\n\n// Create a Const graph\nDecGraph::DecGraph() {\n\n  this->fConst       = true;\n  this->nLeaves      = 0;\n  this->nSize        = 0;\n  this->nCap         = 0;\n  this->idCounter    = 0;\n  this->pNodes       = nullptr;\n  this->eRoot.fCompl = 1;\n  this->eRoot.Node   = 0;\n}\n\n// Create a graph with nLeaves\nDecGraph::DecGraph(int nLeaves) {\n\n  this->fConst    = false;\n  this->nLeaves   = nLeaves;\n  this->nSize     = nLeaves;\n  this->nCap      = 20; // Original ABC = 2 * nLeaves + 50;\n  this->idCounter = 0;\n  this->pNodes    = (DecNode*)malloc(sizeof(DecNode) * this->nCap);\n  memset(this->pNodes, 0, sizeof(DecNode) * this->nSize);\n  this->eRoot.fCompl = 0;\n  this->eRoot.Node   = 0;\n  // Initialize the id for leaves\n  for (int i = 0; i < this->nLeaves; i++) {\n    this->pNodes[i].id = this->idCounter++;\n  }\n}\n\n// Create a leaf graph\nDecGraph::DecGraph(int iLeaf, int nLeaves, int fCompl) {\n\n  assert(0 <= iLeaf && iLeaf < nLeaves);\n  this->fConst    = false;\n  this->nLeaves   = nLeaves;\n  this->nSize     = nLeaves;\n  this->nCap 
     = 20; // Original ABC = 2 * nLeaves + 50;\n  this->idCounter = 0;\n  this->pNodes    = (DecNode*)malloc(sizeof(DecNode) * this->nCap);\n  memset(this->pNodes, 0, sizeof(DecNode) * this->nSize);\n  this->eRoot.fCompl = fCompl;\n  this->eRoot.Node   = iLeaf;\n  // Initialize the id for leaves\n  for (int i = 0; i < this->nLeaves; i++) {\n    this->pNodes[i].id = this->idCounter++;\n  }\n}\n\nDecGraph::~DecGraph() { free(this->pNodes); }\n\nDecEdge DecGraph::addAndNode(DecEdge eEdge0, DecEdge eEdge1) {\n\n  // get the new node\n  DecNode* node = this->appendNode();\n  // set the inputs and other info\n  node->id      = this->idCounter++;\n  node->eEdge0  = eEdge0;\n  node->eEdge1  = eEdge1;\n  node->fCompl0 = eEdge0.fCompl;\n  node->fCompl1 = eEdge1.fCompl;\n  return this->createEdge(this->nSize - 1, 0);\n}\n\nDecEdge DecGraph::addOrNode(DecEdge eEdge0, DecEdge eEdge1) {\n\n  // get the new node\n  DecNode* node = this->appendNode();\n  // set the inputs and other info\n  node->id      = this->idCounter++;\n  node->eEdge0  = eEdge0;\n  node->eEdge1  = eEdge1;\n  node->fCompl0 = eEdge0.fCompl;\n  node->fCompl1 = eEdge1.fCompl;\n  // make adjustments for the OR gate\n  node->fNodeOr       = 1;\n  node->eEdge0.fCompl = !node->eEdge0.fCompl;\n  node->eEdge1.fCompl = !node->eEdge1.fCompl;\n  return this->createEdge(this->nSize - 1, 1);\n}\n\nDecEdge DecGraph::addXorNode(DecEdge eEdge0, DecEdge eEdge1, int Type) {\n\n  DecEdge eNode0, eNode1, eNode;\n  if (Type == 0) {\n    // derive the first AND\n    eEdge0.fCompl ^= 1;\n    eNode0 = this->addAndNode(eEdge0, eEdge1);\n    eEdge0.fCompl ^= 1;\n    // derive the second AND\n    eEdge1.fCompl ^= 1;\n    eNode1 = this->addAndNode(eEdge0, eEdge1);\n    // derive the final OR\n    eNode = this->addOrNode(eNode0, eNode1);\n  } else {\n    // derive the first AND\n    eNode0 = this->addAndNode(eEdge0, eEdge1);\n    // derive the second AND\n    eEdge0.fCompl ^= 1;\n    eEdge1.fCompl ^= 1;\n    eNode1 = 
this->addAndNode(eEdge0, eEdge1);\n    // derive the final OR\n    eNode = this->addOrNode(eNode0, eNode1);\n    eNode.fCompl ^= 1;\n  }\n  return eNode;\n}\n\nDecEdge DecGraph::createEdge(unsigned Node, unsigned fCompl) {\n  DecEdge eEdge = {fCompl, Node};\n  return eEdge;\n}\n\nDecNode* DecGraph::appendNode() {\n\n  DecNode* node;\n  if (this->nSize == this->nCap) {\n    this->pNodes = (DecNode*)realloc(this->pNodes, 2 * this->nCap);\n    this->nCap   = 2 * this->nCap;\n  }\n  node = this->pNodes + this->nSize++;\n  memset(node, 0, sizeof(DecNode));\n  return node;\n}\n\nDecNode* DecGraph::getNodes() { return this->pNodes; }\n\nDecNode* DecGraph::getNode(int i) { return this->pNodes + i; }\n\nDecNode* DecGraph::getVar() {\n  assert(this->isVar());\n  return this->getNode(this->eRoot.Node);\n}\n\nvoid DecGraph::setRootEdge(DecEdge eRoot) { this->eRoot = eRoot; }\n\nDecEdge DecGraph::getRootEdge() { return this->eRoot; }\n\nint DecGraph::getLeaveNum() { return this->nLeaves; }\n\nint DecGraph::getNodeNum() { return this->nSize; }\n\nbool DecGraph::isConst() { return this->fConst; }\n\nbool DecGraph::isVar() { return this->eRoot.Node < (unsigned)this->nLeaves; }\n\nunsigned DecGraph::isComplement() { return this->eRoot.fCompl; }\n\nint DecGraph::nodeToInt(DecNode* node) { return node - this->pNodes; }\n\nint DecGraph::varToInt() {\n  assert(this->isVar());\n  return this->nodeToInt(this->getVar());\n}\n\nDecEdge DecGraph::intToEdge(unsigned Edge) {\n  return this->createEdge(Edge >> 1, Edge & 1);\n}\n\nunsigned DecGraph::edgeToInt(DecEdge eEdge) {\n  return (eEdge.Node << 1) | eEdge.fCompl;\n}\n\nunsigned DecGraph::deriveTruth() {\n\n  unsigned uTruths[5] = {0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00,\n                         0xFFFF0000};\n  unsigned uTruth     = 0; // Suppress \"might be used uninitialized\"\n  unsigned uTruth0, uTruth1;\n  DecNode* node;\n\n  // sanity checks\n  assert(this->nLeaves >= 0);\n  assert(this->nLeaves <= this->nSize);\n  
assert(this->nLeaves <= 5);\n\n  // check for constant function\n  if (this->isConst()) {\n    return this->isComplement() ? 0 : ~((unsigned)0);\n  }\n  // check for a literal\n  if (this->isVar()) {\n    return this->isComplement() ? ~uTruths[this->varToInt()]\n                                : uTruths[this->varToInt()];\n  }\n\n  // assign the elementary variables\n  for (int i = 0; (i < this->nLeaves) && ((node = this->getNode(i)), 1); i++) {\n    node->pFunc = (void*)(unsigned long)uTruths[i];\n  }\n\n  // compute the function for each internal node\n  for (int i = this->nLeaves;\n       (i < this->nSize) && ((node = this->getNode(i)), 1); i++) {\n    uTruth0 = (unsigned)(unsigned long)this->getNode(node->eEdge0.Node)->pFunc;\n    uTruth1 = (unsigned)(unsigned long)this->getNode(node->eEdge1.Node)->pFunc;\n    uTruth0 = node->eEdge0.fCompl ? ~uTruth0 : uTruth0;\n    uTruth1 = node->eEdge1.fCompl ? ~uTruth1 : uTruth1;\n    uTruth  = uTruth0 & uTruth1;\n    node->pFunc = (void*)(unsigned long)uTruth;\n  }\n\n  // complement the result if necessary\n  return this->isComplement() ? ~uTruth : uTruth;\n}\n\n} /* namespace algorithm */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/PreCompGraphManager.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#ifndef PRECOMPGRAPHMANAGER_h_\n#define PRECOMPGRAPHMANAGER_h_\n\n#include \"NPNManager.h\"\n#include <vector>\n\nnamespace algorithm {\n\ntypedef struct DecEdge_ {\n\n  unsigned fCompl : 1; // the complemented bit\n  unsigned Node : 30;  // the decomposition node pointed by the edge\n\n} DecEdge;\n\ntypedef struct DecNode_ {\n\n  DecEdge eEdge0; // the left child of the node\n  DecEdge eEdge1; // the right child of the node\n  // other info\n  union {\n    int iFunc;   // the literal of the node (AIG)\n    void* pFunc; // the function of the node (BDD or AIG)\n  };\n  int id;\n  unsigned Level : 14; // the level of this node in the global AIG\n  // printing info\n  unsigned fNodeOr : 1; // marks the original 
OR node\n  unsigned fCompl0 : 1; // marks the original complemented edge\n  unsigned fCompl1 : 1; // marks the original complemented edge\n  // latch info\n  unsigned nLat0 : 5; // the number of latches on the first edge\n  unsigned nLat1 : 5; // the number of latches on the second edge\n  unsigned nLat2 : 5; // the number of latches on the output edge\n\n} DecNode;\n\nclass DecGraph {\n\nprivate:\n  bool fConst; // marks the constant 1 graph\n  int nLeaves; // the number of leaves\n  int nSize;   // the number of nodes (including the leaves)\n  int nCap;    // the number of allocated nodes\n  int idCounter;\n  DecNode* pNodes; // the array of leaves and internal nodes\n  DecEdge eRoot;   // the pointer to the topmost node\n\n  DecNode* appendNode();\n\npublic:\n  DecGraph();                                   // Create a Const graph\n  DecGraph(int nLeaves);                        // Create a graph with nLeaves\n  DecGraph(int iLeaf, int nLeaves, int fCompl); // Create a leaf graph\n  ~DecGraph();\n\n  DecEdge addAndNode(DecEdge eEdge0, DecEdge eEdge1);\n  DecEdge addOrNode(DecEdge eEdge0, DecEdge eEdge1);\n  DecEdge addXorNode(DecEdge eEdge0, DecEdge eEdge1, int Type);\n  DecEdge createEdge(unsigned Node, unsigned fCompl);\n\n  DecNode* getNodes();\n  DecNode* getNode(int i);\n  DecNode* getVar();\n  DecEdge getRootEdge();\n  void setRootEdge(DecEdge eRoot);\n  int getLeaveNum();\n  int getNodeNum();\n  bool isConst();\n  bool isVar();\n  unsigned isComplement();\n\n  int nodeToInt(DecNode* node);\n  int varToInt();\n  DecEdge intToEdge(unsigned Edge);\n  unsigned edgeToInt(DecEdge eEdge);\n\n  unsigned deriveTruth();\n};\n\ntypedef struct ForestNode_ ForestNode;\nstruct ForestNode_ {\n\n  int Id;     // ID\n  int TravId; // traversal ID\n  short nScore;\n  short nGain;\n  short nAdded;\n  unsigned uTruth : 16; // truth table\n  unsigned Volume : 8;  // volume\n  unsigned Level : 6;   // level\n  unsigned fUsed : 1;   // mark\n  unsigned fExor : 1;   // mark\n  
ForestNode* p0;       // first child\n  ForestNode* p1;       // second child\n  ForestNode* pNext;    // next in the table\n};\n\nclass PreCompGraphManager {\n\nprivate:\n  static const unsigned short aigSubgraphs[3562];\n\n  NPNManager& npnManager;\n  ForestNode* forest; // all the nodes\n  std::vector<ForestNode*>\n      table; // the hash table of nodes by their canonical form\n  std::vector<std::vector<ForestNode*>>\n      classes; // the nodes of the equivalence classes\n  int forestSize;\n  int nTravIds;\n\n  ForestNode* addForestNode(ForestNode* p0, ForestNode* p1, int fExor,\n                            int Level, int Volume);\n  ForestNode* addForestVar(unsigned uTruth);\n  void addForestNodeToTable(unsigned uTruth, ForestNode* node);\n\n  int getForestNodeVolume(ForestNode* p0, ForestNode* p1);\n  void getVolumeRec(ForestNode* node, int* volume);\n  void incTravId();\n\n  bool isForestNodeComplement(ForestNode* node);\n  ForestNode* forestNodeRegular(ForestNode* node);\n  ForestNode* forestNodeComplement(ForestNode* node);\n  ForestNode* forestNodeComplementCond(ForestNode* node, int c);\n\n  DecGraph* processNode(ForestNode* node);\n  DecEdge processNodeRec(ForestNode* node, DecGraph* decGraph);\n\npublic:\n  PreCompGraphManager(NPNManager& npnManager);\n  ~PreCompGraphManager();\n\n  void loadPreCompGraphFromArray();\n  void processDecompositionGraphs();\n\n  ForestNode* getForest();\n  std::vector<ForestNode*>& getTable();\n  std::vector<std::vector<ForestNode*>>& getClasses();\n};\n\n} /* namespace algorithm */\n\n#endif /* PRECOMPGRAPHMANAGER_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/PriorityCutManager.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Parallel LUT-Based Tech Mapping October 16, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#include \"PriorityCutManager.h\"\n#include \"galois/Galois.h\"\n#include \"galois/Bag.h\"\n\n#include <iostream>\n#include <cstdlib>\n#include <chrono>\n#include <assert.h>\n\nusing namespace std::chrono;\n\nnamespace algorithm {\n\nPriCutManager::PriCutManager(aig::Aig& aig, int K, int C, int nThreads,\n                             bool compTruth, bool deterministic, bool verbose)\n    : aig(aig), aigGraph(aig.getGraph()), K(K), C(C),\n      nWords(Functional32::wordNum(K)),\n      nNodes(std::distance(aig.getGraph().begin(), aig.getGraph().end()) -\n             aig.getNumOutputs()),\n      nThreads(nThreads), cutPoolSize(nNodes / nThreads), compTruth(compTruth),\n      
deterministic(deterministic), verbose(verbose),\n      perThreadData(cutPoolSize, K, compTruth, C, nWords) {\n\n  nLUTs   = 0;\n  nLevels = 0;\n  passId  = 0;\n\n  sortMode = SortMode::DELAY;\n  costMode = CostMode::AREA_FLOW;\n\n  if (deterministic) {\n    refMode = RefMode::MAP;\n  } else {\n    refMode = RefMode::STANDARD;\n  }\n\n  fPower   = false;\n  fEpsilon = (float)0.005;\n  kcutTime = 0;\n\n  nodePriCuts = new PriCut*[nNodes + 1];\n  for (int i = 0; i < nNodes + 1; i++) {\n    nodePriCuts[i] = nullptr;\n  }\n\n  // iterating from 0 to N is reverse topological order\n  // iterating from N to 0 is topological order\n  aig.computeGenericTopologicalSortForAnds(this->sortedNodes);\n}\n\nPriCutManager::~PriCutManager() { delete[] nodePriCuts; }\n\nvoid PriCutManager::computePriCutsRecursively(aig::GNode node, RefMap& refMap) {\n\n  aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::READ);\n\n  if (this->nodePriCuts[nodeData.id] == nullptr) {\n\n    auto inEdgeIt      = aigGraph.in_edge_begin(node);\n    aig::GNode lhsNode = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& lhsData =\n        aigGraph.getData(lhsNode, galois::MethodFlag::READ);\n    bool lhsPolarity = aigGraph.getEdgeData(inEdgeIt);\n\n    inEdgeIt++;\n    aig::GNode rhsNode = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& rhsData =\n        aigGraph.getData(rhsNode, galois::MethodFlag::READ);\n    bool rhsPolarity = aigGraph.getEdgeData(inEdgeIt);\n\n    ThreadLocalData* thData = this->perThreadData.getLocal();\n\n    computePriCutsRec(lhsNode, thData, refMap);\n    computePriCutsRec(rhsNode, thData, refMap);\n\n    computePriCuts(thData, refMap, nodeData, lhsData.id, rhsData.id,\n                   lhsPolarity, rhsPolarity);\n  }\n}\n\nvoid PriCutManager::computePriCutsRec(aig::GNode node, ThreadLocalData* thData,\n                                      RefMap& refMap) {\n\n  aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::READ);\n\n  if 
(this->nodePriCuts[nodeData.id] == nullptr) {\n\n    auto inEdgeIt      = aigGraph.in_edge_begin(node);\n    aig::GNode lhsNode = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& lhsData =\n        aigGraph.getData(lhsNode, galois::MethodFlag::READ);\n    bool lhsPolarity = aigGraph.getEdgeData(inEdgeIt);\n\n    inEdgeIt++;\n    aig::GNode rhsNode = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& rhsData =\n        aigGraph.getData(rhsNode, galois::MethodFlag::READ);\n    bool rhsPolarity = aigGraph.getEdgeData(inEdgeIt);\n\n    computePriCutsRec(lhsNode, thData, refMap);\n    computePriCutsRec(rhsNode, thData, refMap);\n\n    computePriCuts(thData, refMap, nodeData, lhsData.id, rhsData.id,\n                   lhsPolarity, rhsPolarity);\n  }\n}\n\nvoid PriCutManager::computePriCuts(ThreadLocalData* thData, RefMap& refMap,\n                                   aig::NodeData& nodeData, int lhsId,\n                                   int rhsId, bool lhsPolarity,\n                                   bool rhsPolarity) {\n\n  PriCut *trivialCut, *resCut, *oldBestCut = nullptr;\n\n  cleanupCutList(thData->cutPool,\n                 thData->cutList); // Ensure that the cutList is empty\n\n  if ((this->passId > 1) && (this->nodePriCuts[nodeData.id] != nullptr)) {\n    // Save a copy of the previous bestCut and recompute the cut's costs\n    oldBestCut = thData->cutPool.getMemory();\n    copyCut(oldBestCut, this->nodePriCuts[nodeData.id]);\n\n    if (this->costMode == CostMode::AREA_FLOW) {\n      cutFlowCosts(oldBestCut);\n    } else {\n      if (this->refMode == RefMode::MAP) {\n        cutDerefedCosts(oldBestCut, refMap);\n      } else {\n        cutDerefedCosts(oldBestCut);\n      }\n    }\n\n    if (this->deterministic == false) {\n      decreaseCutReferences(oldBestCut);\n    }\n  }\n\n  for (PriCut* lhsCut = this->nodePriCuts[lhsId]; lhsCut != nullptr;\n       lhsCut         = lhsCut->nextCut) {\n    for (PriCut* rhsCut = this->nodePriCuts[rhsId]; rhsCut != 
nullptr;\n         rhsCut         = rhsCut->nextCut) {\n\n      if (Functional32::countOnes(lhsCut->sig | rhsCut->sig) > this->K) {\n        continue;\n      }\n\n      // merge the cuts\n      if (lhsCut->nLeaves >= rhsCut->nLeaves) {\n        resCut = mergeCuts(thData->cutPool, lhsCut, rhsCut);\n      } else {\n        resCut = mergeCuts(thData->cutPool, rhsCut, lhsCut);\n      }\n\n      if (resCut == nullptr) {\n        continue;\n      }\n\n      // check containment\n      if (cutFilter(thData->cutPool, thData->cutList, resCut)) {\n        continue;\n      }\n\n      if (this->compTruth) {\n        computeTruth(thData->auxTruth, resCut, lhsCut, rhsCut, lhsPolarity,\n                     rhsPolarity);\n        // std::cout << Functional32::toHex( readTruth( resCut ), getNWords() )\n        // << std::endl;\n      }\n\n      if (this->costMode == CostMode::AREA_FLOW) {\n        cutFlowCosts(resCut);\n      } else {\n        if (this->refMode == RefMode::MAP) {\n          cutDerefedCosts(resCut, refMap);\n        } else {\n          cutDerefedCosts(resCut);\n        }\n      }\n\n      // add to the sorted list\n      cutSort(thData->cutPool, thData->cutList, resCut);\n    }\n  }\n\n  if ((nodeData.nFanout > 0) && (nodeData.choiceList != nullptr)) {\n    mapChoices(thData, refMap, nodeData);\n  }\n\n  // start with the elementary cut\n  trivialCut            = thData->cutPool.getMemory();\n  trivialCut->leaves[0] = nodeData.id;\n  trivialCut->nLeaves++;\n  trivialCut->sig = (1U << (nodeData.id % 31));\n  if (this->compTruth) {\n    unsigned* cutTruth = readTruth(trivialCut);\n    for (int i = 0; i < this->nWords; i++) {\n      cutTruth[i] = 0xAAAAAAAA;\n    }\n  }\n  thData->cutList.array[thData->cutList.nCuts++] = trivialCut;\n  nCuts += 1;\n  nTriv += 1;\n\n  // Copy from currentCutList to the nodeCuts\n  commitCuts(thData->cutPool, thData->cutList, nodeData.id);\n\n  if (oldBestCut != nullptr) {\n    if (this->nodePriCuts[nodeData.id]->delay > 
nodeData.reqTime) {\n      oldBestCut->nextCut =\n          this->nodePriCuts[nodeData.id]; // Keep the oldBestCut as the best one\n      this->nodePriCuts[nodeData.id] = oldBestCut;\n    } else {\n      thData->cutPool.giveBackMemory(oldBestCut);\n    }\n  }\n\n  if (this->deterministic == false) {\n    increaseCutReferences(this->nodePriCuts[nodeData.id]);\n  }\n}\n\nvoid PriCutManager::mapChoices(ThreadLocalData* thData, RefMap& refMap,\n                               aig::NodeData& nodeData) {\n\n  aig::GNode nextChoice = nullptr;\n\n  for (aig::GNode currChoice = nodeData.choiceList; currChoice != nullptr;\n       currChoice            = nextChoice) {\n\n    aig::NodeData& currChoiceData =\n        aigGraph.getData(currChoice, galois::MethodFlag::READ);\n    nextChoice = currChoiceData.choiceList;\n\n    // std::cout << \"Node \" << currChoiceData.id << \" has fanout \" <<\n    // currChoiceData.nFanout << std::endl;\n\n    for (PriCut* chCut = this->nodePriCuts[currChoiceData.id]; chCut != nullptr;\n         chCut         = chCut->nextCut) {\n\n      // Discard trivial cuts\n      if (chCut->nLeaves == 1) {\n        continue;\n      }\n\n      PriCut* chCutCopy = thData->cutPool.getMemory();\n      copyCut(chCutCopy, chCut);\n\n      /*\n      if (this->compTruth) { // FIXME treat complemented choices\n          if(currChoiceData.isCompl) {\n              Functional32::NOT(readTruth(chCutCopy), readTruth(chCutCopy),\n      this->nWords);\n          }\n      }\n      */\n\n      // check containment\n      if (cutFilter(thData->cutPool, thData->cutList, chCutCopy)) {\n        continue;\n      }\n\n      if (this->costMode == CostMode::AREA_FLOW) {\n        cutFlowCosts(chCutCopy);\n      } else {\n        if (this->refMode == RefMode::MAP) {\n          cutDerefedCosts(chCutCopy, refMap);\n          // cutDerefedCosts(chCutCopy, thData->refMap);\n        } else {\n          cutDerefedCosts(chCutCopy);\n        }\n      }\n\n      // add to the sorted list\n     
 cutSort(thData->cutPool, thData->cutList, chCutCopy);\n    }\n  }\n}\n\nPriCut* PriCutManager::mergeCuts(PriCutPool& cutPool, PriCut* lhsCut,\n                                 PriCut* rhsCut) {\n\n  // assert( lhsCut->nLeaves >= rhsCut->nLeaves );\n  int i, j, l;\n  PriCut* resCut;\n\n  // the case of the largest cut sizes\n  if (lhsCut->nLeaves == this->K && rhsCut->nLeaves == this->K) {\n    for (i = 0; i < lhsCut->nLeaves; i++) {\n      if (lhsCut->leaves[i] != rhsCut->leaves[i]) {\n        return nullptr;\n      }\n    }\n    resCut = cutPool.getMemory();\n    for (i = 0; i < lhsCut->nLeaves; i++) {\n      resCut->leaves[i] = lhsCut->leaves[i];\n    }\n    resCut->nLeaves = lhsCut->nLeaves;\n    resCut->sig     = lhsCut->sig | rhsCut->sig; // set the signature\n    return resCut;\n  }\n\n  // the case when one of the cuts is the largest\n  if (lhsCut->nLeaves == this->K) {\n    for (i = 0; i < rhsCut->nLeaves; i++) {\n      for (j = lhsCut->nLeaves - 1; j >= 0; j--) {\n        if (lhsCut->leaves[j] == rhsCut->leaves[i]) {\n          break;\n        }\n      }\n      if (j == -1) { // did not find\n        return nullptr;\n      }\n    }\n    resCut = cutPool.getMemory();\n    for (i = 0; i < lhsCut->nLeaves; i++) {\n      resCut->leaves[i] = lhsCut->leaves[i];\n    }\n    resCut->nLeaves = lhsCut->nLeaves;\n    resCut->sig     = lhsCut->sig | rhsCut->sig; // set the signature\n    return resCut;\n  }\n\n  // compare two cuts with different numbers\n  resCut = cutPool.getMemory();\n  i      = 0;\n  j      = 0;\n  for (l = 0; l < this->K; l++) {\n    if (j == rhsCut->nLeaves) {\n      if (i == lhsCut->nLeaves) {\n        resCut->nLeaves = l;\n        resCut->sig     = lhsCut->sig | rhsCut->sig; // set the signature\n        return resCut;\n      }\n      resCut->leaves[l] = lhsCut->leaves[i++];\n      continue;\n    }\n\n    if (i == lhsCut->nLeaves) {\n      if (j == rhsCut->nLeaves) {\n        resCut->nLeaves = l;\n        resCut->sig     = lhsCut->sig | 
rhsCut->sig; // set the signature\n        return resCut;\n      }\n      resCut->leaves[l] = rhsCut->leaves[j++];\n      continue;\n    }\n\n    if (lhsCut->leaves[i] < rhsCut->leaves[j]) {\n      resCut->leaves[l] = lhsCut->leaves[i++];\n      continue;\n    }\n\n    if (lhsCut->leaves[i] > rhsCut->leaves[j]) {\n      resCut->leaves[l] = rhsCut->leaves[j++];\n      continue;\n    }\n\n    resCut->leaves[l] = lhsCut->leaves[i++];\n    j++;\n  }\n\n  if (i < lhsCut->nLeaves || j < rhsCut->nLeaves) {\n    cutPool.giveBackMemory(resCut);\n    return nullptr;\n  }\n\n  resCut->nLeaves = l;\n  resCut->sig     = lhsCut->sig | rhsCut->sig; // set the signature\n  return resCut;\n}\n\ninline bool PriCutManager::cutFilter(PriCutPool& cutPool, PriCutList& cutList,\n                                     PriCut* resCut) {\n\n  PriCut* cut;\n\n  for (int i = 0; i < cutList.nCuts; i++) {\n\n    cut = cutList.array[i];\n\n    if (cut->nLeaves <= resCut->nLeaves) {\n      // skip the non-contained cuts\n      if ((cut->sig & resCut->sig) != cut->sig) {\n        continue;\n      }\n      // check containment seriously\n      if (checkCutDominance(cut, resCut)) {\n        cutPool.giveBackMemory(resCut); // Recycle Cut\n        nFilt += 1;\n        return true; // resCut is dominated\n      }\n    } else {\n      // skip the non-contained cuts\n      if ((cut->sig & resCut->sig) != resCut->sig) {\n        continue;\n      }\n      // check containment seriously\n      if (checkCutDominance(resCut, cut)) {\n        nCuts -= 1;\n        nFilt += 1;\n        cutList.nCuts--;\n        cutPool.giveBackMemory(cut); // Recycle Cut\n        for (int j = i; j < cutList.nCuts; j++) {\n          cutList.array[j] = cutList.array[j + 1];\n        }\n      }\n    }\n  }\n  return false;\n}\n\ninline bool PriCutManager::checkCutDominance(PriCut* smallerCut,\n                                             PriCut* largerCut) {\n\n  int i, j;\n  for (i = 0; i < smallerCut->nLeaves; i++) {\n    for (j = 
0; j < largerCut->nLeaves; j++) {\n      if (smallerCut->leaves[i] == largerCut->leaves[j]) {\n        break;\n      }\n    }\n    if (j ==\n        largerCut\n            ->nLeaves) { // node i in smallerCut is not contained in largerCut\n      return false;\n    }\n  }\n  // every node in smallerCut is contained in largerCut\n  return true;\n}\n\ninline void PriCutManager::cutSort(PriCutPool& cutPool, PriCutList& cutList,\n                                   PriCut* resCut) {\n\n  // cut structure is empty\n  if (cutList.nCuts == 0) {\n    cutList.array[cutList.nCuts++] = resCut;\n    nCuts += 1;\n    return;\n  }\n\n  // the cut will be added - find its place\n  cutList.array[cutList.nCuts++] = resCut;\n\n  for (int i = cutList.nCuts - 2; i >= 0; i--) {\n    if (sortCompare(cutList.array[i], resCut) <= 0) {\n      break;\n    }\n    cutList.array[i + 1] = cutList.array[i];\n    cutList.array[i]     = resCut;\n  }\n\n  if (cutList.nCuts > this->C) {\n    cutPool.giveBackMemory(cutList.array[--cutList.nCuts]);\n  } else {\n    nCuts += 1;\n  }\n}\n\nint PriCutManager::sortCompare(PriCut* lhsCut, PriCut* rhsCut) {\n\n  if (this->fPower) {\n    if (this->sortMode == SortMode::AREA) { // area flow\n      if (lhsCut->area < rhsCut->area - this->fEpsilon)\n        return -1;\n      if (lhsCut->area > rhsCut->area + this->fEpsilon)\n        return 1;\n      if (lhsCut->power < rhsCut->power - this->fEpsilon)\n        return -1;\n      if (lhsCut->power > rhsCut->power + this->fEpsilon)\n        return 1;\n      if (lhsCut->edge < rhsCut->edge - this->fEpsilon)\n        return -1;\n      if (lhsCut->edge > rhsCut->edge + this->fEpsilon)\n        return 1;\n      if (lhsCut->nLeaves < rhsCut->nLeaves)\n        return -1;\n      if (lhsCut->nLeaves > rhsCut->nLeaves)\n        return 1;\n      if (lhsCut->delay < rhsCut->delay - this->fEpsilon)\n        return -1;\n      if (lhsCut->delay > rhsCut->delay + this->fEpsilon)\n        return 1;\n      return 0;\n    }\n    if 
(this->sortMode == SortMode::DELAY) { // delay\n      if (lhsCut->delay < rhsCut->delay - this->fEpsilon)\n        return -1;\n      if (lhsCut->delay > rhsCut->delay + this->fEpsilon)\n        return 1;\n      if (lhsCut->nLeaves < rhsCut->nLeaves)\n        return -1;\n      if (lhsCut->nLeaves > rhsCut->nLeaves)\n        return 1;\n      if (lhsCut->area < rhsCut->area - this->fEpsilon)\n        return -1;\n      if (lhsCut->area > rhsCut->area + this->fEpsilon)\n        return 1;\n      if (lhsCut->power < rhsCut->power - this->fEpsilon)\n        return -1;\n      if (lhsCut->power > rhsCut->power + this->fEpsilon)\n        return 1;\n      if (lhsCut->edge < rhsCut->edge - this->fEpsilon)\n        return -1;\n      if (lhsCut->edge > rhsCut->edge + this->fEpsilon)\n        return 1;\n      return 0;\n    }\n    assert(this->sortMode == SortMode::DELAY_OLD); // delay old, exact area\n    if (lhsCut->delay < rhsCut->delay - this->fEpsilon)\n      return -1;\n    if (lhsCut->delay > rhsCut->delay + this->fEpsilon)\n      return 1;\n    if (lhsCut->power < rhsCut->power - this->fEpsilon)\n      return -1;\n    if (lhsCut->power > rhsCut->power + this->fEpsilon)\n      return 1;\n    if (lhsCut->edge < rhsCut->edge - this->fEpsilon)\n      return -1;\n    if (lhsCut->edge > rhsCut->edge + this->fEpsilon)\n      return 1;\n    if (lhsCut->area < rhsCut->area - this->fEpsilon)\n      return -1;\n    if (lhsCut->area > rhsCut->area + this->fEpsilon)\n      return 1;\n    if (lhsCut->nLeaves < rhsCut->nLeaves)\n      return -1;\n    if (lhsCut->nLeaves > rhsCut->nLeaves)\n      return 1;\n    return 0;\n  } else {                                  // regular\n    if (this->sortMode == SortMode::AREA) { // area\n      if (lhsCut->area < rhsCut->area - this->fEpsilon)\n        return -1;\n      if (lhsCut->area > rhsCut->area + this->fEpsilon)\n        return 1;\n      if (lhsCut->edge < rhsCut->edge - this->fEpsilon)\n        return -1;\n      if (lhsCut->edge > 
rhsCut->edge + this->fEpsilon)\n        return 1;\n      if (lhsCut->power < rhsCut->power - this->fEpsilon)\n        return -1;\n      if (lhsCut->power > rhsCut->power + this->fEpsilon)\n        return 1;\n      if (lhsCut->nLeaves < rhsCut->nLeaves)\n        return -1;\n      if (lhsCut->nLeaves > rhsCut->nLeaves)\n        return 1;\n      if (lhsCut->delay < rhsCut->delay - this->fEpsilon)\n        return -1;\n      if (lhsCut->delay > rhsCut->delay + this->fEpsilon)\n        return 1;\n      return 0;\n    }\n    if (this->sortMode == SortMode::DELAY) { // delay\n      if (lhsCut->delay < rhsCut->delay - this->fEpsilon)\n        return -1;\n      if (lhsCut->delay > rhsCut->delay + this->fEpsilon)\n        return 1;\n      if (lhsCut->nLeaves < rhsCut->nLeaves)\n        return -1;\n      if (lhsCut->nLeaves > rhsCut->nLeaves)\n        return 1;\n      if (lhsCut->area < rhsCut->area - this->fEpsilon)\n        return -1;\n      if (lhsCut->area > rhsCut->area + this->fEpsilon)\n        return 1;\n      if (lhsCut->edge < rhsCut->edge - this->fEpsilon)\n        return -1;\n      if (lhsCut->edge > rhsCut->edge + this->fEpsilon)\n        return 1;\n      if (lhsCut->power < rhsCut->power - this->fEpsilon)\n        return -1;\n      if (lhsCut->power > rhsCut->power + this->fEpsilon)\n        return 1;\n      return 0;\n    }\n    assert(this->sortMode == SortMode::DELAY_OLD);\n    if (lhsCut->delay < rhsCut->delay - this->fEpsilon)\n      return -1;\n    if (lhsCut->delay > rhsCut->delay + this->fEpsilon)\n      return 1;\n    if (lhsCut->area < rhsCut->area - this->fEpsilon)\n      return -1;\n    if (lhsCut->area > rhsCut->area + this->fEpsilon)\n      return 1;\n    if (lhsCut->edge < rhsCut->edge - this->fEpsilon)\n      return -1;\n    if (lhsCut->edge > rhsCut->edge + this->fEpsilon)\n      return 1;\n    if (lhsCut->power < rhsCut->power - this->fEpsilon)\n      return -1;\n    if (lhsCut->power > rhsCut->power + this->fEpsilon)\n      return 1;\n    if 
(lhsCut->nLeaves < rhsCut->nLeaves)\n      return -1;\n    if (lhsCut->nLeaves > rhsCut->nLeaves)\n      return 1;\n    return 0;\n  }\n}\n\ninline void PriCutManager::commitCuts(PriCutPool& cutPool, PriCutList& cutList,\n                                      int nodeId) {\n\n  assert(cutList.nCuts != 0);\n\n  recycleNodeCuts(cutPool, nodeId);\n\n  // Copy from current CutList to the nodePriCuts and clean up the cutList\n  this->nodePriCuts[nodeId] = cutList.array[0];\n\n  int i;\n  for (i = 0; i < cutList.nCuts - 1; i++) {\n    cutList.array[i]->nextCut = cutList.array[i + 1];\n    cutList.array[i]          = nullptr;\n  }\n  cutList.array[i]->nextCut = nullptr;\n  cutList.array[i]          = nullptr;\n  cutList.nCuts             = 0;\n\n  assert(this->nodePriCuts[nodeId] != nullptr);\n}\n\n/*\n *     This method gives the cut's memory back to the current thread's cutPool.\n *     However, the memory can be allocated by the cutPool of one thread\n *     and reused by a cutPool of another thread. 
But, the original cutPool\n *     will be responsible for deallocating the memory.\n */\ninline void PriCutManager::recycleNodeCuts(PriCutPool& cutPool, int nodeId) {\n\n  PriCut* cut = this->nodePriCuts[nodeId];\n\n  while (cut != nullptr) {\n    PriCut* nextCut = cut->nextCut;\n    cutPool.giveBackMemory(cut);\n    cut = nextCut;\n  }\n\n  this->nodePriCuts[nodeId] = nullptr;\n}\n\ninline void PriCutManager::cleanupCutList(PriCutPool& cutPool,\n                                          PriCutList& cutList) {\n\n  for (int i = 0; i < cutList.nCuts; i++) {\n    cutPool.giveBackMemory(cutList.array[i]);\n    cutList.array[i] = nullptr;\n  }\n\n  cutList.nCuts = 0;\n}\n\ninline void PriCutManager::copyCut(PriCut* dest, PriCut* source) {\n\n  dest->area    = source->area;\n  dest->edge    = source->edge;\n  dest->power   = source->power;\n  dest->delay   = source->delay;\n  dest->sig     = source->sig;\n  dest->nLeaves = source->nLeaves;\n  dest->nextCut = nullptr;\n  for (int i = 0; i < source->nLeaves; i++) {\n    dest->leaves[i] = source->leaves[i];\n  }\n  if (this->compTruth) {\n    unsigned int* destTruth   = readTruth(dest);\n    unsigned int* sourceTruth = readTruth(source);\n    Functional32::copy(destTruth, sourceTruth, this->nWords);\n  }\n}\n\nvoid PriCutManager::computeTruth(AuxTruth& auxTruth, PriCut* resCut,\n                                 PriCut* lhsCut, PriCut* rhsCut,\n                                 bool lhsPolarity, bool rhsPolarity) {\n\n  // permute the first table\n  if (lhsPolarity) {\n    Functional32::copy(auxTruth.truth[0], readTruth(lhsCut), this->nWords);\n  } else {\n    Functional32::NOT(auxTruth.truth[0], readTruth(lhsCut), this->nWords);\n  }\n  Functional32::truthStretch(auxTruth.truth[2], auxTruth.truth[0],\n                             lhsCut->nLeaves, this->K,\n                             truthPhase(resCut, lhsCut));\n\n  // permute the second table\n  if (rhsPolarity) {\n    Functional32::copy(auxTruth.truth[1], 
readTruth(rhsCut), this->nWords);\n  } else {\n    Functional32::NOT(auxTruth.truth[1], readTruth(rhsCut), this->nWords);\n  }\n  Functional32::truthStretch(auxTruth.truth[3], auxTruth.truth[1],\n                             rhsCut->nLeaves, this->K,\n                             truthPhase(resCut, rhsCut));\n\n  // produce the resulting table. In this first version we are not considering\n  // the cut->fCompl flag. It may be considered in further versions according to\n  // the demand.\n  // if ( cut->fCompl ) {\n  //\tFunctional32::NAND( readTruth( cut ) , auxTruth[2], auxTruth[3], K );\n  //}\n  // else {\n  Functional32::AND(readTruth(resCut), auxTruth.truth[2], auxTruth.truth[3],\n                    this->nWords);\n  //}\n}\n\ninline unsigned PriCutManager::truthPhase(PriCut* resCut, PriCut* inCut) {\n\n  unsigned phase = 0;\n  int i, j;\n  for (i = j = 0; i < resCut->nLeaves; i++) {\n    if (j == inCut->nLeaves) {\n      break;\n    }\n    if (resCut->leaves[i] < inCut->leaves[j]) {\n      continue;\n    }\n    assert(resCut->leaves[i] == inCut->leaves[j]);\n    phase |= (1 << i);\n    j++;\n  }\n  return phase;\n}\n\nunsigned int* PriCutManager::readTruth(PriCut* cut) {\n  return (unsigned*)(cut->leaves + this->K);\n}\n\nvoid PriCutManager::increaseCutReferences(PriCut* cut) {\n\n  int leafId;\n\n  for (int i = 0; i < cut->nLeaves; i++) {\n    leafId                  = cut->leaves[i];\n    aig::GNode leaf         = this->aig.getNodes()[leafId];\n    aig::NodeData& leafData = aigGraph.getData(leaf, galois::MethodFlag::WRITE);\n    assert(leafData.nRefs >= 0);\n    leafData.nRefs++;\n  }\n}\n\nvoid PriCutManager::decreaseCutReferences(PriCut* cut) {\n\n  int leafId;\n\n  for (int i = 0; i < cut->nLeaves; i++) {\n    leafId                  = cut->leaves[i];\n    aig::GNode leaf         = this->aig.getNodes()[leafId];\n    aig::NodeData& leafData = aigGraph.getData(leaf, galois::MethodFlag::WRITE);\n    assert(leafData.nRefs > 0);\n    --leafData.nRefs;\n  
}\n}\n\n// ################### Start of the New Cut's Cost Functions\n// ###################### //\ninline float PriCutManager::cutDelay(PriCut* cut) {\n\n  int leafId;\n  float currDelay, delay = std::numeric_limits<float>::min();\n\n  for (int i = 0; i < cut->nLeaves; i++) {\n    leafId    = cut->leaves[i];\n    currDelay = getBestCut(leafId)->delay + 1.0;\n    delay     = std::max(delay, currDelay);\n  }\n  return delay;\n}\n\nvoid PriCutManager::cutFlowCosts(PriCut* cut) {\n\n  int leafId;\n  float areaFlow = 1.0;\n  float edgeFlow = cut->nLeaves;\n  float currDelay, delay = std::numeric_limits<float>::min();\n  PriCut* bestCut = nullptr;\n\n  for (int i = 0; i < cut->nLeaves; i++) {\n\n    leafId          = cut->leaves[i];\n    aig::GNode leaf = this->aig.getNodes()[leafId];\n    aig::NodeData& leafData =\n        aigGraph.getData(leaf, galois::MethodFlag::UNPROTECTED);\n    bestCut = getBestCut(leafId);\n\n    if ((leafData.nRefs == 0) || (leafData.type == aig::NodeType::CONSTZERO)) {\n      areaFlow += bestCut->area;\n      edgeFlow += bestCut->edge;\n    } else {\n      assert(leafData.nRefs > this->fEpsilon);\n      areaFlow += bestCut->area / leafData.nRefs;\n      edgeFlow += bestCut->edge / leafData.nRefs;\n    }\n\n    currDelay = bestCut->delay + 1.0;\n    delay     = std::max(delay, currDelay);\n  }\n\n  cut->area  = areaFlow;\n  cut->edge  = edgeFlow;\n  cut->delay = delay;\n}\n\n// STANDARD VERSION\nvoid PriCutManager::cutDerefedCosts(PriCut* cut) {\n\n  float area1 = 0, area2 = 0, edge1 = 0, edge2 = 0;\n\n  if (cut->nLeaves < 2) {\n    cut->area = 0;\n    cut->edge = cut->nLeaves;\n    return;\n  }\n\n  cutRefCosts(cut, area1, edge1);\n  cutDerefCosts(cut, area2, edge2);\n\n  assert(area2 > area1 - this->fEpsilon);\n  assert(area2 < area1 + this->fEpsilon);\n  assert(edge2 > edge1 - this->fEpsilon);\n  assert(edge2 < edge1 + this->fEpsilon);\n\n  cut->area  = area2;\n  cut->edge  = edge2;\n  cut->delay = cutDelay(cut);\n}\n\nvoid 
PriCutManager::cutRefCosts(PriCut* cut, float& area, float& edge) {\n\n  int leafId;\n  area += 1.0;\n  edge += cut->nLeaves;\n\n  for (int i = 0; i < cut->nLeaves; i++) {\n\n    leafId                  = cut->leaves[i];\n    aig::GNode leaf         = this->aig.getNodes()[leafId];\n    aig::NodeData& leafData = aigGraph.getData(leaf, galois::MethodFlag::WRITE);\n\n    assert(leafData.nRefs >= 0);\n    if ((leafData.nRefs++ > 0) || (leafData.type != aig::NodeType::AND))\n      continue;\n\n    cutRefCosts(getBestCut(leafId), area, edge);\n  }\n}\n\nvoid PriCutManager::cutDerefCosts(PriCut* cut, float& area, float& edge) {\n\n  int leafId;\n  area += 1.0;\n  edge += cut->nLeaves;\n\n  for (int i = 0; i < cut->nLeaves; i++) {\n\n    leafId                  = cut->leaves[i];\n    aig::GNode leaf         = this->aig.getNodes()[leafId];\n    aig::NodeData& leafData = aigGraph.getData(leaf, galois::MethodFlag::WRITE);\n\n    assert(leafData.nRefs > 0);\n    if (--leafData.nRefs > 0 || (leafData.type != aig::NodeType::AND))\n      continue;\n\n    cutDerefCosts(getBestCut(leafId), area, edge);\n  }\n}\n\n// REFMAP VERSION\n\nvoid PriCutManager::cutDerefedCosts(PriCut* cut, RefMap& refMap) {\n\n  float area1 = 0, area2 = 0, edge1 = 0, edge2 = 0;\n\n  if (cut->nLeaves < 2) {\n    cut->area = 0;\n    cut->edge = cut->nLeaves;\n    return;\n  }\n\n  cutRefCosts(cut, area1, edge1, refMap);\n  cutDerefCosts(cut, area2, edge2, refMap);\n\n  assert(area2 > area1 - this->fEpsilon);\n  assert(area2 < area1 + this->fEpsilon);\n  assert(edge2 > edge1 - this->fEpsilon);\n  assert(edge2 < edge1 + this->fEpsilon);\n\n  cut->area  = area2;\n  cut->edge  = edge2;\n  cut->delay = cutDelay(cut);\n}\n\nvoid PriCutManager::cutRefCosts(PriCut* cut, float& area, float& edge,\n                                RefMap& refMap) {\n\n  int leafId;\n  area += 1.0;\n  edge += cut->nLeaves;\n\n  for (int i = 0; i < cut->nLeaves; i++) {\n\n    leafId          = cut->leaves[i];\n    aig::GNode leaf = 
this->aig.getNodes()[leafId];\n    aig::NodeData& leafData =\n        aigGraph.getData(leaf, galois::MethodFlag::UNPROTECTED);\n\n    // Experimental\n    auto it = refMap.find(leafId);\n    if (it != refMap.end()) {\n      assert(it->second >= 0);\n      if ((it->second++ > 0) || (leafData.type != aig::NodeType::AND))\n        continue;\n    } else {\n      assert(leafData.nRefs >= 0);\n      refMap.insert({leafId, leafData.nRefs + 1});\n      if ((leafData.nRefs > 0) || (leafData.type != aig::NodeType::AND))\n        continue;\n    }\n\n    cutRefCosts(getBestCut(leafId), area, edge, refMap);\n  }\n}\n\nvoid PriCutManager::cutDerefCosts(PriCut* cut, float& area, float& edge,\n                                  RefMap& refMap) {\n\n  int leafId;\n  area += 1.0;\n  edge += cut->nLeaves;\n\n  for (int i = 0; i < cut->nLeaves; i++) {\n\n    leafId          = cut->leaves[i];\n    aig::GNode leaf = this->aig.getNodes()[leafId];\n    aig::NodeData& leafData =\n        aigGraph.getData(leaf, galois::MethodFlag::UNPROTECTED);\n\n    // Experimental\n    auto it = refMap.find(leafId);\n    if (it != refMap.end()) {\n      assert(it->second > 0);\n      if (--it->second > 0 || (leafData.type != aig::NodeType::AND))\n        continue;\n    } else {\n      assert(leafData.nRefs > 0);\n      refMap.insert({leafId, leafData.nRefs - 1});\n      if ((leafData.nRefs - 1) > 0 || (leafData.type != aig::NodeType::AND))\n        continue;\n    }\n\n    cutDerefCosts(getBestCut(leafId), area, edge, refMap);\n  }\n}\n// ################### End of the NewCuts Cost Functions ######################\n// //\n\nvoid PriCutManager::resetNodeCountersFanout() {\n\n  const float FLOAT_MAX = std::numeric_limits<float>::max();\n\n  for (aig::GNode node : aigGraph) {\n    aig::NodeData& nodeData =\n        aigGraph.getData(node, galois::MethodFlag::UNPROTECTED);\n    nodeData.counter = 0;\n    nodeData.nRefs   = nodeData.nFanout;\n    nodeData.reqTime = FLOAT_MAX;\n  }\n\n  // galois::do_all( 
galois::iterate( aigGraph ), ResetNodeCountersFanout{\n  // aigGraph }, galois::loopname(\"ResetOperatorFanout\"), galois::steal() );\n}\n\nvoid PriCutManager::resetNodeCountersZero() {\n\n  const float FLOAT_MAX = std::numeric_limits<float>::max();\n\n  for (aig::GNode node : aigGraph) {\n    aig::NodeData& nodeData =\n        aigGraph.getData(node, galois::MethodFlag::UNPROTECTED);\n    nodeData.counter = 0;\n    nodeData.nRefs   = 0;\n    nodeData.reqTime = FLOAT_MAX;\n  }\n\n  // galois::do_all( galois::iterate( aigGraph ), ResetNodeCountersZero{\n  // aigGraph }, galois::loopname(\"ResetOperatorZero\"), galois::steal() );\n}\n\nvoid PriCutManager::resetNodeCountersOnly() {\n\n  const float FLOAT_MAX = std::numeric_limits<float>::max();\n\n  for (aig::GNode node : aigGraph) {\n    aig::NodeData& nodeData =\n        aigGraph.getData(node, galois::MethodFlag::UNPROTECTED);\n    nodeData.counter = 0;\n    nodeData.reqTime = FLOAT_MAX;\n  }\n\n  // galois::do_all( galois::iterate( aigGraph ), ResetNodeCountersOnly{\n  // aigGraph }, galois::loopname(\"ResetOperatorOnly\"), galois::steal() );\n}\n\nvoid PriCutManager::computeReferenceCounters() {\n\n  PriCut* bestCut;\n  int size = this->nNodes + 1;\n\n  for (int i = 0; i < size; i++) {\n    bestCut = this->nodePriCuts[i];\n    if (bestCut == nullptr) {\n      continue;\n    }\n    if (bestCut->nLeaves == 1) {\n      continue; // skip trivial cuts\n    }\n    for (int j = 0; j < bestCut->nLeaves; j++) {\n      aig::GNode leaf = this->aig.getNodes()[bestCut->leaves[j]];\n      aig::NodeData& leafData =\n          aigGraph.getData(leaf, galois::MethodFlag::UNPROTECTED);\n      leafData.nRefs++;\n    }\n  }\n}\n\nvoid PriCutManager::computeCoveringReferenceCounters() {\n\n  computeCovering();\n\n  PriCut* bestCut;\n\n  for (auto entry : this->covering) {\n    bestCut = entry.second;\n    if (bestCut == nullptr) {\n      continue;\n    }\n    if (bestCut->nLeaves == 1) {\n      continue; // skip trivial cuts\n    }\n    
for (int j = 0; j < bestCut->nLeaves; j++) {\n      aig::GNode leaf = this->aig.getNodes()[bestCut->leaves[j]];\n      aig::NodeData& leafData =\n          aigGraph.getData(leaf, galois::MethodFlag::UNPROTECTED);\n      leafData.nRefs++;\n    }\n  }\n}\n\nvoid PriCutManager::computeRequiredTimes() {\n\n  float maxDelay = 0;\n  PriCut* bestCut;\n\n  for (auto po : this->aig.getOutputNodes()) {\n    auto inEdgeIt     = aigGraph.in_edge_begin(po);\n    aig::GNode inNode = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& inNodeData =\n        aigGraph.getData(inNode, galois::MethodFlag::UNPROTECTED);\n    if (inNodeData.type == aig::NodeType::CONSTZERO) {\n      continue;\n    }\n    bestCut = getBestCut(inNodeData.id);\n    if (maxDelay < bestCut->delay - this->fEpsilon) {\n      maxDelay = bestCut->delay;\n    }\n  }\n\n  for (auto po : this->aig.getOutputNodes()) {\n    auto inEdgeIt     = aigGraph.in_edge_begin(po);\n    aig::GNode inNode = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& inNodeData =\n        aigGraph.getData(inNode, galois::MethodFlag::UNPROTECTED);\n    inNodeData.reqTime = maxDelay;\n  }\n\n  // iterating from 0 to N is reverse topological order\n  // iterating from N to 0 is topological order\n  for (auto node : this->sortedNodes) {\n    aig::NodeData& nodeData =\n        aigGraph.getData(node, galois::MethodFlag::UNPROTECTED);\n\n    // Reset node data to prepare for the next mapping pass\n    nodeData.counter = 0;\n    if (this->deterministic) {\n      nodeData.nRefs = 0;\n    }\n\n    bestCut = getBestCut(nodeData.id);\n    for (int i = 0; i < bestCut->nLeaves; i++) {\n      aig::GNode leaf = this->aig.getNodes()[bestCut->leaves[i]];\n      aig::NodeData& leafData =\n          aigGraph.getData(leaf, galois::MethodFlag::UNPROTECTED);\n      leafData.reqTime = std::min((nodeData.reqTime - 1), leafData.reqTime);\n    }\n  }\n}\n\nvoid PriCutManager::computeCovering() {\n\n  PriCut* bestCut;\n  aig::GNode leaf;\n  int leafId, nodeId;\n  
std::vector<int> S;\n\n  this->covering.clear();\n\n  this->nLevels = -1;\n  for (auto po : this->aig.getOutputNodes()) {\n    auto inEdgeIt     = aigGraph.in_edge_begin(po);\n    aig::GNode inNode = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& inNodeData =\n        aigGraph.getData(inNode, galois::MethodFlag::UNPROTECTED);\n\n    if ((inNodeData.type != aig::NodeType::PI) &&\n        (inNodeData.type != aig::NodeType::LATCH) &&\n        (inNodeData.type != aig::NodeType::CONSTZERO)) {\n      S.push_back(inNodeData.id);\n      bestCut = getBestCut(inNodeData.id);\n      if (this->nLevels < bestCut->delay) {\n        this->nLevels = bestCut->delay;\n      }\n    }\n  }\n\n  while (!S.empty()) {\n    nodeId = S.back();\n    S.pop_back();\n\n    auto it = this->covering.find(nodeId);\n    if (it != this->covering.end()) {\n      continue;\n    }\n\n    bestCut = getBestCut(nodeId);\n    this->covering.insert({nodeId, bestCut});\n\n    for (int i = 0; i < bestCut->nLeaves; i++) {\n      leafId = bestCut->leaves[i];\n      leaf   = this->aig.getNodes()[leafId];\n      aig::NodeData& leafData =\n          aigGraph.getData(leaf, galois::MethodFlag::UNPROTECTED);\n      leafData.nRefs++; // Update reference counters\n      auto it = this->covering.find(leafId);\n      if (it == this->covering.end()) {\n        if ((leafData.type != aig::NodeType::PI) &&\n            (leafData.type != aig::NodeType::LATCH) &&\n            (leafData.type != aig::NodeType::CONSTZERO)) {\n          S.push_back(leafId);\n        }\n      }\n    }\n  }\n}\n\ninline void PriCutManager::switchToFirstDelayMode() {\n  this->passId++;\n  this->sortMode = SortMode::DELAY;\n  this->costMode = CostMode::AREA_FLOW;\n}\n\ninline void PriCutManager::switchToSecondDelayMode() {\n  this->passId++;\n  this->sortMode = SortMode::DELAY_OLD;\n  this->costMode = CostMode::AREA_FLOW;\n}\n\ninline void PriCutManager::switchToAreaFlowMode() {\n  this->passId++;\n  this->sortMode = SortMode::AREA;\n  
this->costMode = CostMode::AREA_FLOW;\n}\n\ninline void PriCutManager::switchToLocalAreaMode() {\n  this->passId++;\n  this->sortMode = SortMode::AREA;\n  this->costMode = CostMode::LOCAL_AREA;\n}\n\ninline aig::Aig& PriCutManager::getAig() { return this->aig; }\n\ninline PriCut* PriCutManager::getBestCut(int nodeId) {\n  return this->nodePriCuts[nodeId]; // the first cut is the best cut\n}\n\nint PriCutManager::getNumLUTs() {\n  this->nLUTs = this->covering.size();\n  return this->nLUTs;\n}\n\nint PriCutManager::getNumLevels() { return this->nLevels; }\n\nint PriCutManager::getK() { return this->K; }\n\nint PriCutManager::getC() { return this->C; }\n\nint PriCutManager::getNWords() { return this->nWords; }\n\nint PriCutManager::getNThreads() { return this->nThreads; }\n\nbool PriCutManager::isDeterministic() { return this->deterministic; }\n\nbool PriCutManager::getCompTruthFlag() { return this->compTruth; }\n\nbool PriCutManager::getVerboseFlag() { return this->verbose; }\n\nlong double PriCutManager::getKcutTime() { return this->kcutTime; }\n\nvoid PriCutManager::setKcutTime(long double time) { this->kcutTime = time; }\n\nPerThreadData& PriCutManager::getPerThreadData() { return this->perThreadData; }\n\nPriCut** PriCutManager::getNodePriCuts() { return this->nodePriCuts; }\n\nCovering& PriCutManager::getCovering() { return this->covering; }\n\nvoid PriCutManager::printCovering() {\n\n  std::cout << std::endl\n            << \"########## Mapping Covering ###############\" << std::endl;\n  PriCut* bestCut;\n  for (auto entry : this->covering) {\n    std::cout << \"Node \" << entry.first << \": { \";\n    bestCut = entry.second;\n    for (int i = 0; i < bestCut->nLeaves; i++) {\n      std::cout << bestCut->leaves[i] << \" \";\n    }\n    std::cout << \"}\" << std::endl;\n    // std::cout << \"}[\" << Functional32::toHex( readTruth( bestCut ),\n    // this->nWords )  << \"] \" << std::endl;\n  }\n  std::cout << std::endl\n            << 
\"###########################################\" << std::endl;\n}\n\nvoid PriCutManager::printNodeCuts(int nodeId, long int& counter) {\n\n  std::cout << \"Node \" << nodeId << \": { \";\n  for (PriCut* currentCut = this->nodePriCuts[nodeId]; currentCut != nullptr;\n       currentCut         = currentCut->nextCut) {\n    counter++;\n    std::cout << \"{ \";\n    for (int i = 0; i < currentCut->nLeaves; i++) {\n      std::cout << currentCut->leaves[i] << \" \";\n    }\n    // std::cout << \"} \";\n    std::cout << \"}(a\" << currentCut->area << \") \";\n    // std::cout << \"}(a\" << currentCut->area << \", e\" << currentCut->edge << \")\n    // \"; std::cout << \"}[\" << Functional32::toHex( readTruth( currentCut ),\n    // this->nWords )  << \"] \";\n  }\n  std::cout << \"}\" << std::endl;\n}\n\nvoid PriCutManager::printAllCuts() {\n\n  long int counter = 0;\n\n  std::cout << std::endl << \"########## All K-Cuts ###########\" << std::endl;\n  for (aig::GNode node : aigGraph) {\n    aig::NodeData& nodeData =\n        aigGraph.getData(node, galois::MethodFlag::UNPROTECTED);\n    if ((nodeData.type == aig::NodeType::AND) ||\n        (nodeData.type == aig::NodeType::PI)) {\n      printNodeCuts(nodeData.id, counter);\n    }\n  }\n  std::cout << \"#################################\" << std::endl;\n}\n\nvoid PriCutManager::printNodeBestCut(int nodeId) {\n\n  PriCut* bestCut = getBestCut(nodeId);\n  std::cout << \"Node \" << nodeId << \": { \";\n  for (int i = 0; i < bestCut->nLeaves; i++) {\n    std::cout << bestCut->leaves[i] << \" \";\n  }\n  std::cout << \"}(a\" << bestCut->area << \")\" << std::endl;\n  // std::cout << \"}(a\" << bestCut->area << \", e\" << bestCut->edge << \")\" <<\n  // std::endl; std::cout << \"}[\" << Functional32::toHex( readTruth( bestCut ),\n  // this->nWords )  << \"] \" << std::endl;\n}\n\nvoid PriCutManager::printBestCuts() {\n\n  std::cout << std::endl << \"########## Best K-Cuts ###########\" << std::endl;\n  for (aig::GNode node : 
aigGraph) {\n    aig::NodeData& nodeData =\n        aigGraph.getData(node, galois::MethodFlag::UNPROTECTED);\n    if ((nodeData.type == aig::NodeType::AND) ||\n        (nodeData.type == aig::NodeType::PI)) {\n      printNodeBestCut(nodeData.id);\n    }\n  }\n  std::cout << \"#################################\" << std::endl;\n}\n\nvoid PriCutManager::printCutStatistics() {\n\n  long int nCutsRed = nCuts.reduce();\n  nCutsRed += this->aig.getNumInputs();\n\n  long int nTrivRed = nTriv.reduce();\n  nTrivRed += this->aig.getNumInputs();\n\n  long int nFiltRed = nFilt.reduce();\n\n  long int nSatuRed = nSatu.reduce();\n\n  std::cout << std::endl\n            << \"############## Cut Statistics #############\" << std::endl;\n  std::cout << \"nCuts: \" << nCutsRed << std::endl;\n  std::cout << \"nTriv: \" << nTrivRed << std::endl;\n  std::cout << \"nFilt: \" << nFiltRed << std::endl;\n  std::cout << \"nSatu: \" << nSatuRed << std::endl;\n  std::cout << \"nCutPerNode: \" << (((double)nCutsRed) / this->nNodes)\n            << std::endl;\n  std::cout << \"###########################################\" << std::endl;\n}\n\nvoid PriCutManager::printRuntimes() {\n\n  std::cout << std::endl << \"#### Runtimes in microsecond ####\" << std::endl;\n  // std::cout << \"Merge: \" << mergeTime << std::endl;\n  // std::cout << \"Filter: \" << filterTime << std::endl;\n  // std::cout << \"ProcTwo: \" << procTwoTime << std::endl;\n  // std::cout << \"Compute: \" << compTime << std::endl;\n  // std::cout << \"Schedule: \" << scheduleTime << std::endl;\n  std::cout << \"Total: \" << this->kcutTime << std::endl;\n  std::cout << \"#################################\" << std::endl;\n}\n\n// ######################## BEGIN OPERATOR ######################## //\nstruct KPriCutOperator {\n\n  const float FLOAT_MAX = std::numeric_limits<float>::max();\n  PriCutManager& cutMan;\n\n  KPriCutOperator(PriCutManager& cutMan) : cutMan(cutMan) {}\n\n  void operator()(aig::GNode node, 
galois::UserContext<aig::GNode>& ctx) {\n\n    aig::Aig& aig        = cutMan.getAig();\n    aig::Graph& aigGraph = aig.getGraph();\n\n    aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::READ);\n\n    if (nodeData.type == aig::NodeType::AND) {\n\n      // Touching outgoing neighbors to acquire their locks\n      aigGraph.out_edges(node);\n\n      // Combine Cuts\n      auto inEdgeIt      = aigGraph.in_edge_begin(node);\n      aig::GNode lhsNode = aigGraph.getEdgeDst(inEdgeIt);\n      aig::NodeData& lhsData =\n          aigGraph.getData(lhsNode, galois::MethodFlag::READ);\n      bool lhsPolarity = aigGraph.getEdgeData(inEdgeIt);\n\n      inEdgeIt++;\n      aig::GNode rhsNode = aigGraph.getEdgeDst(inEdgeIt);\n      aig::NodeData& rhsData =\n          aigGraph.getData(rhsNode, galois::MethodFlag::READ);\n      bool rhsPolarity = aigGraph.getEdgeData(inEdgeIt);\n\n      ThreadLocalData* thData = cutMan.getPerThreadData().getLocal();\n\n      RefMap refMap(ctx.getPerIterAlloc());\n\n      cutMan.computePriCuts(thData, refMap, nodeData, lhsData.id, rhsData.id,\n                            lhsPolarity, rhsPolarity);\n\n      // Mark node as processed\n      nodeData.counter = nodeData.nFanout;\n      nodeData.reqTime = FLOAT_MAX;\n\n      // Schedule next nodes\n      for (auto edge : aigGraph.out_edges(node)) {\n        aig::GNode nextNode = aigGraph.getEdgeDst(edge);\n        aig::NodeData& nextNodeData =\n            aigGraph.getData(nextNode, galois::MethodFlag::WRITE);\n        nextNodeData.counter += 1;\n        if (nextNodeData.counter == 2) {\n          ctx.push(nextNode);\n        }\n      }\n\n      // Delete cuts of previous nodes if possible\n      if (lhsData.type == aig::NodeType::AND) {\n        if (--lhsData.counter == 0) {\n          PriCut* cut =\n              cutMan.getNodePriCuts()[lhsData.id]\n                  ->nextCut; // skip the first cut which is the best cut\n          cutMan.getNodePriCuts()[lhsData.id]->nextCut = 
nullptr;\n          while (cut != nullptr) {\n            PriCut* nextCut = cut->nextCut;\n            thData->cutPool.giveBackMemory(cut);\n            cut = nextCut;\n          }\n        }\n      }\n      if (rhsData.type == aig::NodeType::AND) {\n        if (--rhsData.counter == 0) {\n          PriCut* cut =\n              cutMan.getNodePriCuts()[rhsData.id]\n                  ->nextCut; // skipe the first cut which is the best cut\n          cutMan.getNodePriCuts()[rhsData.id]->nextCut = nullptr;\n          while (cut != nullptr) {\n            PriCut* nextCut = cut->nextCut;\n            thData->cutPool.giveBackMemory(cut);\n            cut = nextCut;\n          }\n        }\n      }\n    } else {\n      if (nodeData.type == aig::NodeType::PI) {\n        // Touching outgoing neighobors to acquire their locks and their fanin\n        // node's locks.\n        aigGraph.out_edges(node);\n\n        if (cutMan.getNodePriCuts()[nodeData.id] == nullptr) {\n          // Set the trivial cut\n          nodeData.counter        = 3;\n          ThreadLocalData* thData = cutMan.getPerThreadData().getLocal();\n          PriCut* trivialCut      = thData->cutPool.getMemory();\n          trivialCut->leaves[0]   = nodeData.id;\n          trivialCut->nLeaves++;\n          trivialCut->sig = (1U << (nodeData.id % 31));\n          if (cutMan.getCompTruthFlag()) {\n            unsigned* cutTruth = cutMan.readTruth(trivialCut);\n            for (int i = 0; i < cutMan.getNWords(); i++) {\n              cutTruth[i] = 0xAAAAAAAA;\n            }\n          }\n          cutMan.getNodePriCuts()[nodeData.id] = trivialCut;\n        }\n\n        nodeData.counter = 0;\n        nodeData.reqTime = FLOAT_MAX;\n\n        // Schedule next nodes\n        for (auto edge : aigGraph.out_edges(node)) {\n          aig::GNode nextNode = aigGraph.getEdgeDst(edge);\n          aig::NodeData& nextNodeData =\n              aigGraph.getData(nextNode, galois::MethodFlag::WRITE);\n          nextNodeData.counter 
+= 1;\n          if (nextNodeData.counter == 2) {\n            ctx.push(nextNode);\n          }\n        }\n      }\n    }\n  }\n};\n// ######################## END OPERATOR ######################## //\n\nvoid runKPriCutOperator(PriCutManager& cutMan) {\n\n  typedef galois::worklists::PerSocketChunkBag<1000> DC_BAG;\n  bool verbose      = cutMan.getVerboseFlag();\n  int nAreaRecovery = 2, nAreaFlow = 1, nLocalArea = 1;\n  aig::Aig& aig = cutMan.getAig();\n  //\taig::Graph & aigGraph = aig.getGraph();\n\n  if (verbose) {\n    std::cout << std::endl << \"########## LUT Mapping ###########\" << std::endl;\n    std::cout << \"Mapping in First Delay Mode\" << std::endl;\n  }\n  cutMan.switchToFirstDelayMode();\n  cutMan.resetNodeCountersFanout();\n  // Galois Parallel Foreach\n  galois::for_each(galois::iterate(aig.getInputNodes()),\n                   KPriCutOperator(cutMan), galois::wl<DC_BAG>(),\n                   galois::loopname(\"KPriCutOperator\"),\n                   galois::per_iter_alloc());\n\n  if (verbose) {\n    std::cout << \"Mapping in Second Delay Mode\" << std::endl;\n  }\n  cutMan.switchToSecondDelayMode();\n  cutMan.computeRequiredTimes();\n  if (cutMan.isDeterministic()) {\n    cutMan.computeCovering();\n  }\n  // Galois Parallel Foreach\n  galois::for_each(galois::iterate(aig.getInputNodes()),\n                   KPriCutOperator(cutMan), galois::wl<DC_BAG>(),\n                   galois::loopname(\"KPriCutOperator\"),\n                   galois::per_iter_alloc());\n\n  for (int i = 1; i <= nAreaRecovery; i++) {\n\n    for (int j = 1; j <= nLocalArea; j++) {\n      if (verbose) {\n        std::cout << \"Mapping in Local Area Mode\" << std::endl;\n      }\n      cutMan.switchToLocalAreaMode();\n      cutMan.computeRequiredTimes();\n      if (cutMan.isDeterministic()) {\n        cutMan.computeCovering();\n      }\n      // Galois Parallel Foreach\n      galois::for_each(galois::iterate(aig.getInputNodes()),\n                       
KPriCutOperator(cutMan), galois::wl<DC_BAG>(),\n                       galois::loopname(\"KPriCutOperator\"),\n                       galois::per_iter_alloc());\n    }\n\n    for (int j = 1; j <= nAreaFlow; j++) {\n      if (verbose) {\n        std::cout << \"Mapping in Area Flow Mode\" << std::endl;\n      }\n      cutMan.switchToAreaFlowMode();\n      cutMan.computeRequiredTimes();\n      if (cutMan.isDeterministic()) {\n        cutMan.computeCovering();\n      }\n      // Galois Parallel Foreach\n      galois::for_each(galois::iterate(aig.getInputNodes()),\n                       KPriCutOperator(cutMan), galois::wl<DC_BAG>(),\n                       galois::loopname(\"KPriCutOperator\"),\n                       galois::per_iter_alloc());\n    }\n  }\n\n  if (verbose) {\n    std::cout << \"Covering ...\" << std::endl;\n  }\n  cutMan.computeCovering();\n}\n\n} /* namespace algorithm */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/PriorityCutManager.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Parallel LUT-Based Tech Mapping October 16, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#ifndef PRIORITYCUTMANAGER_H_\n#define PRIORITYCUTMANAGER_H_\n\n#include \"Aig.h\"\n#include \"PriorityCutPool.h\"\n#include \"CutManager.h\"\n#include \"../functional/FunctionHandler32.h\"\n#include \"galois/Reduction.h\"\n#include <unordered_map>\n#include <unordered_set>\n\nnamespace algorithm {\n\ntypedef std::unordered_map<int, PriCut*> Covering;\n\n// #### THREAD LOCAL #### //\ntypedef struct pricutList_ {\n\n  PriCut** array;\n  int nCuts;\n\n  pricutList_(int C) {\n    array = new PriCut*[C + 2];\n    nCuts = 0;\n  }\n\n  ~pricutList_() { delete array; }\n\n} PriCutList;\n\ntypedef galois::PerIterAllocTy::rebind<std::pair<const int, int>>::other\n    MapAlloc;\ntypedef 
std::unordered_map<int, int, std::hash<int>, std::equal_to<int>,\n                           MapAlloc>\n    RefMap;\n\ntypedef struct threadLocalData_ {\n\n  PriCutPool cutPool;\n  PriCutList cutList;\n  AuxTruth auxTruth;\n\n  threadLocalData_(int cutPoolSize, int K, bool compTruth, int C, int nWords)\n      : cutPool(cutPoolSize, K, compTruth), cutList(C), auxTruth(nWords) {}\n\n} ThreadLocalData;\n\ntypedef galois::substrate::PerThreadStorage<ThreadLocalData> PerThreadData;\n\n// #### CONTROL TYPES #### //\nenum SortMode { DELAY, DELAY_OLD, AREA };\nenum CostMode { AREA_FLOW, LOCAL_AREA };\nenum RefMode { STANDARD, MAP };\n\nclass PriCutManager {\n\nprivate:\n  aig::Aig& aig;\n  aig::Graph& aigGraph;\n  int K;\n  int C;\n  int nWords;\n  int nNodes;\n  int nThreads;\n  long int cutPoolSize;\n  bool compTruth;\n  bool deterministic;\n  bool verbose;\n  int passId;\n  int sortMode;\n  int costMode;\n  int refMode;\n  int nLUTs;\n  int nLevels;\n  bool fPower;\n  float fEpsilon;\n  long double kcutTime;\n\n  PerThreadData perThreadData;\n\n  PriCut** nodePriCuts;\n  Covering covering;\n  std::vector<aig::GNode> sortedNodes;\n\n  // Cuts Statistics //\n  galois::GAccumulator<long int> nCuts;\n  galois::GAccumulator<long int> nTriv;\n  galois::GAccumulator<long int> nFilt;\n  galois::GAccumulator<long int> nSatu;\n\n  // Runtime Statistics //\n  galois::GAccumulator<long int> mergeTime;\n  galois::GAccumulator<long int> filterTime;\n  galois::GAccumulator<long int> procTwoTime;\n  galois::GAccumulator<long int> compTime;\n  galois::GAccumulator<long int> scheduleTime;\n\n  void computePriCutsRec(aig::GNode node, ThreadLocalData* thData,\n                         RefMap& refMap);\n\n  PriCut* mergeCuts(PriCutPool& cutPool, PriCut* lhsCut, PriCut* rhsCut);\n  inline bool cutFilter(PriCutPool& cutPool, PriCutList& cutList,\n                        PriCut* resCut);\n  inline bool checkCutDominance(PriCut* smallerCut, PriCut* largerCut);\n\n  inline void 
commitCuts(PriCutPool& cutPool, PriCutList& cutList, int nodeId);\n\n  inline void recycleNodeCuts(PriCutPool& cutPool, int nodeId);\n  inline void cleanupCutList(PriCutPool& cutPool, PriCutList& cutList);\n\n  void computeTruth(AuxTruth& auxTruth, PriCut* resCut, PriCut* lhsCut,\n                    PriCut* rhsCut, bool lhsPolarity, bool rhsPolarity);\n  inline unsigned truthPhase(PriCut* resCut, PriCut* inCut);\n\n  void cutSort(PriCutPool& cutPool, PriCutList& cutList, PriCut* resCut);\n  int sortCompare(PriCut* lhsCut, PriCut* rhsCut);\n\n  void increaseCutReferences(PriCut* cut);\n  void decreaseCutReferences(PriCut* cut);\n\n  // ################### Start of the NewCuts Cost Functions\n  // ###################### //\n  inline float cutDelay(PriCut* cut);\n  // STANDARD VERSIONS\n  void cutFlowCosts(PriCut* cut);\n  void cutDerefedCosts(PriCut* cut);\n  void cutRefCosts(PriCut* cut, float& area, float& edge);\n  void cutDerefCosts(PriCut* cut, float& area, float& edge);\n  // REFMAP RERSIONS\n  void cutDerefedCosts(PriCut* cut, RefMap& refMap);\n  void cutRefCosts(PriCut* cut, float& area, float& edge, RefMap& refMap);\n  void cutDerefCosts(PriCut* cut, float& area, float& edge, RefMap& refMap);\n  // ################### End of the NewCuts Cost Functions\n  // ###################### //\n\n  inline void copyCut(PriCut* dest, PriCut* source);\n\npublic:\n  PriCutManager(aig::Aig& aig, int K, int C, int nThreads, bool compTruth,\n                bool deterministic, bool verbose);\n\n  ~PriCutManager();\n\n  void computePriCuts(ThreadLocalData* thData, RefMap& refMap,\n                      aig::NodeData& nodeData, int lhsId, int rhsId,\n                      bool lhsPolarity, bool rhsPolarity);\n\n  void mapChoices(ThreadLocalData* thData, RefMap& refMap,\n                  aig::NodeData& nodeData);\n\n  void computePriCutsRecursively(aig::GNode node, RefMap& refMap);\n\n  unsigned int* readTruth(PriCut* cut);\n  inline void switchToFirstDelayMode();\n  inline 
void switchToSecondDelayMode();\n  inline void switchToAreaFlowMode();\n  inline void switchToLocalAreaMode();\n\n  void resetNodeCountersFanout();\n  void resetNodeCountersZero();\n  void resetNodeCountersOnly();\n  void computeReferenceCounters();\n  void computeCoveringReferenceCounters();\n  void computeRequiredTimes();\n  void computeCovering();\n\n  void printCovering();\n  void printNodeCuts(int nodeId, long int& counter);\n  void printAllCuts();\n  void printNodeBestCut(int nodeId);\n  void printBestCuts();\n  void printCutStatistics();\n  void printRuntimes();\n\n  int getNumLUTs();\n  int getNumLevels();\n  int getK();\n  int getC();\n  int getNWords();\n  int getNThreads();\n  bool isDeterministic();\n  bool getCompTruthFlag();\n  bool getVerboseFlag();\n  long double getKcutTime();\n  void setKcutTime(long double time);\n\n  inline aig::Aig& getAig();\n  inline PriCut* getBestCut(int nodeId);\n  PriCut** getNodePriCuts();\n  Covering& getCovering();\n\n  PerThreadData& getPerThreadData();\n};\n\n// Function that runs the KCut operator define in the end of file CutManager.cpp\n// //\nvoid runKPriCutOperator(PriCutManager& cutMan);\n\n} /* namespace algorithm */\n\n#endif /* PRIORITYCUTMANAGERC_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/PriorityCutPool.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#include \"PriorityCutPool.h\"\n#include \"../functional/FunctionHandler32.h\"\n\n#include <iostream>\n#include <cstdlib>\n#include <cstring>\n#include <cassert>\n\nnamespace algorithm {\n\nPriCutPool::PriCutPool(long int initialSize, int k, bool compTruth) {\n  this->blockSize = initialSize;\n  this->k         = k;\n  if (compTruth) {\n    this->entrySize = sizeof(PriCut) + (k * sizeof(int)) +\n                      (Functional32::wordNum(k) * sizeof(unsigned int));\n  } else {\n    this->entrySize = sizeof(PriCut) + (k * sizeof(int));\n  }\n  this->entriesUsed  = 0;\n  this->entriesAlloc = 0;\n  this->entriesFree  = nullptr;\n}\n\nPriCutPool::~PriCutPool() {\n  for (char* ptr : this->blocks) {\n    
free(ptr);\n  }\n}\n\ninline void PriCutPool::alloc() {\n\n  this->entriesFree =\n      (char*)malloc((long int)(this->entrySize * this->blockSize));\n\n  if (this->entriesFree == nullptr) {\n    std::cout << \"Error: memory could not be allocated by CutPool!\"\n              << std::endl;\n    exit(1);\n  }\n\n  char* pTemp = this->entriesFree;\n\n  for (int i = 1; i < this->blockSize; i++) {\n    *((char**)pTemp) = pTemp + this->entrySize;\n    pTemp += this->entrySize;\n  }\n\n  *((char**)pTemp) = nullptr;\n\n  this->entriesAlloc += this->blockSize;\n  this->blocks.push_back(this->entriesFree);\n}\n\nPriCut* PriCutPool::getMemory() {\n\n  if (this->entriesUsed == this->entriesAlloc) {\n    assert(this->entriesFree == nullptr);\n    alloc();\n  }\n\n  this->entriesUsed++;\n  char* pTemp       = this->entriesFree;\n  this->entriesFree = *((char**)pTemp);\n\n  PriCut* cut = (PriCut*)pTemp;\n  memset(cut, 0, this->entrySize);\n  cut->nextCut = nullptr;\n\n  return cut;\n}\n\nvoid PriCutPool::giveBackMemory(PriCut* cut) {\n\n  this->entriesUsed--;\n  char* pTemp       = (char*)cut;\n  *((char**)pTemp)  = this->entriesFree;\n  this->entriesFree = pTemp;\n}\n\nint PriCutPool::getNumBlocks() { return this->blocks.size(); }\n\nint PriCutPool::getBlockSize() { return this->blockSize; }\n\n// void PriCutPool::copyCut(PriCut* dest, PriCut* source) {\n//\tmemcpy(dest, source, this->entrySize);\n//}\n\n} /* namespace algorithm */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/PriorityCutPool.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#ifndef PRIORITYCUTPOOL_H_\n#define PRIORITYCUTPOOL_H_\n\n#include <vector>\n\nnamespace algorithm {\n\n// The size of the leaves is defined acording the parameter K, during the\n// memory allocation in the CutPool.cpp\ntypedef struct priCut_ {\n  float area;  // area (or area-flow) of the cut\n  float edge;  // the edge flow\n  float power; // the power flow\n  float delay; // delay of the cut\n\n  unsigned int sig;\n  short int nLeaves;\n  struct priCut_* nextCut;\n  int leaves[0];\n} PriCut;\n\nclass PriCutPool {\n\nprivate:\n  long int blockSize;\n  int k;\n  int entrySize;\n  long int entriesUsed;\n  long int entriesAlloc;\n  char* entriesFree;\n  std::vector<char*> blocks;\n\n  void alloc();\n\npublic:\n 
 PriCutPool(long int initialSize, int k, bool compTruth);\n\n  ~PriCutPool();\n\n  PriCut* getMemory();\n\n  void giveBackMemory(PriCut* cut);\n\n  int getNumBlocks();\n\n  int getBlockSize();\n\n  // void copyCut(PriCut* dest, PriCut* source);\n};\n\n} /* namespace algorithm */\n\n#endif /* PRIORITYCUTPOOL_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/ReconvDrivenCut.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#include \"ReconvDrivenCut.h\"\n\n#include <iostream>\n#include <unordered_set>\n\nnamespace algorithm {\n\ntypedef galois::PerIterAllocTy Alloc;\ntypedef std::unordered_set<aig::GNode, std::hash<aig::GNode>,\n                           std::equal_to<aig::GNode>,\n                           galois::PerIterAllocTy::rebind<aig::GNode>::other>\n    GNodeSet;\n\nReconvDrivenCut::ReconvDrivenCut(aig::Aig& aig) : aig(aig) {}\n\nReconvDrivenCut::~ReconvDrivenCut() {}\n\nstruct Preprocess {\n\n  aig::Graph& aigGraph;\n  galois::InsertBag<aig::GNode>& workList;\n\n  Preprocess(aig::Graph& aigGraph, galois::InsertBag<aig::GNode>& workList)\n      : aigGraph(aigGraph), workList(workList) {}\n\n  void 
operator()(aig::GNode node) const {\n\n    aig::NodeData& nodeData =\n        aigGraph.getData(node, galois::MethodFlag::UNPROTECTED);\n\n    if ((nodeData.type == aig::NodeType::AND) && (nodeData.counter == 0) &&\n        (nodeData.nFanout < 1000)) {\n      workList.push(node);\n    }\n  }\n};\n\nstruct ReconvergenceDrivenCut {\n\n  // typedef int tt_does_not_need_aborts;\n  // typedef int tt_needs_per_iter_alloc;\n  // typedef int tt_does_not_need_push;\n\n  aig::Graph& aigGraph;\n  PerThreadRDCutData& perThreadRDCutData;\n  size_t cutSizeLimit;\n\n  ReconvergenceDrivenCut(aig::Graph& aigGraph,\n                         PerThreadRDCutData& perThreadRDCutData,\n                         size_t cutSizeLimit)\n      : aigGraph(aigGraph), perThreadRDCutData(perThreadRDCutData),\n        cutSizeLimit(cutSizeLimit) {}\n\n  void operator()(aig::GNode node, galois::UserContext<aig::GNode>& ctx) const {\n    // void operator()( aig::GNode node ) const {\n\n    aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::READ);\n\n    // if ( nodeData.type == aig::NodeType::AND ) {\n    if ((nodeData.type == aig::NodeType::AND) && (nodeData.counter == 0) &&\n        (nodeData.nFanout < 1000)) {\n\n      // galois::PerIterAllocTy & allocator = ctx.getPerIterAlloc();\n\n      // GNodeSet leaves( allocator );\n      // GNodeSet visited( allocator );\n\n      // leaves.insert( node );\n      // visited.insert( node );\n\n      RDCutData* rdCutData = perThreadRDCutData.getLocal();\n\n      rdCutData->visited.clear();\n      rdCutData->leaves.clear();\n\n      rdCutData->visited.insert(node);\n      rdCutData->leaves.insert(node);\n\n      // constructCut( leaves, visited );\n      constructCut_iter(rdCutData->leaves, rdCutData->visited);\n\n      /*\n      std::cout << \"Leaves = { \";\n      for ( auto leaf : rdCutData->leaves ) {\n          aig::NodeData & leafData = aigGraph.getData( leaf,\n      galois::MethodFlag::READ ); std::cout << leafData.id << \" \";\n      
}\n      std::cout << \"} \" << std::endl;\n\n      std::cout << \"Visited = { \";\n      for ( auto vis : rdCutData->visited ) {\n          aig::NodeData & visData = aigGraph.getData( vis,\n      galois::MethodFlag::READ ); std::cout << visData.id << \" \";\n      }\n      std::cout << \"} \" << std::endl;\n      */\n    }\n\n    nodeData.counter = 1;\n\n    for (auto inEdge : aigGraph.in_edges(node)) {\n\n      aig::GNode inNode = aigGraph.getEdgeDst(inEdge);\n      aig::NodeData& inNodeData =\n          aigGraph.getData(inNode, galois::MethodFlag::WRITE);\n\n      if ((inNodeData.type == aig::NodeType::AND) &&\n          (inNodeData.counter == 0)) {\n        ctx.push(inNode);\n      }\n    }\n  }\n\n  /*\n      void constructCut( GNodeSet & leaves, GNodeSet & visited ) const {\n\n          aig::GNode minCostNode = nullptr;\n          int minCost = std::numeric_limits<int>::max();\n          bool onlyPIs = true;\n          for ( aig::GNode node : leaves ) {\n              aig::NodeData & nodeData = aigGraph.getData( node,\n     galois::MethodFlag::READ ); if ( nodeData.type != aig::NodeType::PI ) { int\n     cost = leafCost( node, visited ); if ( minCost > cost ) { minCost = cost;\n                      minCostNode = node;\n                      onlyPIs = false;\n                  }\n              }\n          }\n          if ( onlyPIs || (leaves.size() + minCost) > cutSizeLimit ) {\n              return;\n          }\n\n          if( minCostNode == nullptr ) {\n              std::cout << \"MinCostNode is null\" << std::endl;\n              exit( 1 );\n          }\n\n          leaves.erase( minCostNode );\n          for ( auto edge : aigGraph.in_edges( minCostNode ) ) {\n              aig::GNode currentNode = aigGraph.getEdgeDst( edge );\n              leaves.insert( currentNode );\n              visited.insert( currentNode );\n          }\n\n          constructCut( leaves, visited );\n      }\n  */\n\n  // ITER\n  // void constructCut_iter( GNodeSet & leaves, 
GNodeSet & visited ) const {\n  void constructCut_iter(std::unordered_set<aig::GNode>& leaves,\n                         std::unordered_set<aig::GNode>& visited) const {\n\n    while (true) {\n      aig::GNode minCostNode = nullptr;\n      int minCost            = std::numeric_limits<int>::max();\n      bool onlyPIs           = true;\n      for (aig::GNode node : leaves) {\n        aig::NodeData& nodeData =\n            aigGraph.getData(node, galois::MethodFlag::READ);\n        if (nodeData.type != aig::NodeType::PI) {\n          int cost = leafCost(node, visited);\n          if (minCost > cost) {\n            minCost     = cost;\n            minCostNode = node;\n            onlyPIs     = false;\n          }\n        }\n      }\n\n      if (onlyPIs || (leaves.size() + minCost) > cutSizeLimit) {\n        break;\n      }\n\n      if (minCostNode == nullptr) {\n        std::cout << \"MinCostNode is null\" << std::endl;\n        exit(1);\n      }\n\n      leaves.erase(minCostNode);\n      for (auto edge : aigGraph.in_edges(minCostNode)) {\n        aig::GNode currentNode = aigGraph.getEdgeDst(edge);\n        leaves.insert(currentNode);\n        visited.insert(currentNode);\n      }\n    }\n  }\n\n  // int leafCost( aig::GNode & node, GNodeSet & visited ) const {\n  int leafCost(aig::GNode& node,\n               std::unordered_set<aig::GNode>& visited) const {\n\n    int cost = -1;\n    for (auto edge : aigGraph.in_edges(node)) {\n      aig::GNode currentNode = aigGraph.getEdgeDst(edge);\n      auto it                = visited.find(currentNode);\n      if (it == visited.end()) {\n        cost++;\n      }\n    }\n    return cost;\n  }\n};\n\nvoid ReconvDrivenCut::run(size_t cutSizeLimit) {\n\n  aig::Graph& aigGraph = this->aig.getGraph();\n\n  galois::InsertBag<aig::GNode> workList;\n  typedef galois::worklists::PerSocketChunkFIFO<5000> DC_FIFO;\n\n  // typedef galois::worklists::PerSocketChunkBag<5000> DC_BAG;\n  // galois::do_all_local( aigGraph, Preprocess( aigGraph, 
workList ) );\n  // galois::for_each_local( workList, ReconvergenceDrivenCut( aigGraph,\n  // cutSizeLimit ), galois::wl< DC_BAG >() );\n\n  // galois::for_each( aigGraph.begin(), aigGraph.end(), ReconvergenceDrivenCut(\n  // aigGraph, cutSizeLimit ) );\n\n  /*\n      for ( aig::GNode po : this->aig.getOutputNodes() ) {\n          auto inEdge = aigGraph.in_edge_begin( po );\n          aig::GNode inNode = aigGraph.getEdgeDst( inEdge );\n          workList.push( inNode );\n      }\n\n  */\n\n  /*\n      typedef struct FanoutComparator_ {\n\n          aig::Graph & aigGraph;\n\n          FanoutComparator_( aig::Graph & aigGraph ) : aigGraph( aigGraph ) { }\n\n          bool operator()( aig::GNode lhs, aig::GNode rhs ) const {\n              aig::NodeData & lhsData = aigGraph.getData( lhs,\n     galois::MethodFlag::UNPROTECTED ); aig::NodeData & rhsData =\n     aigGraph.getData( rhs, galois::MethodFlag::UNPROTECTED ); return\n     lhsData.nFanout > rhsData.nFanout;\n          }\n\n      } FanoutComparator;\n\n      std::vector< aig::GNode > nodes = aig.getNodes();\n\n      std::sort( nodes.begin(), nodes.end(), FanoutComparator( aigGraph ) );\n\n      for ( aig::GNode node : nodes ) {\n          aig::NodeData & nodeData = aigGraph.getData( node,\n     galois::MethodFlag::UNPROTECTED );\n\n          if ( (nodeData.type == aig::NodeType::AND) ) {\n              workList.push( node );\n          }\n      }\n  */\n\n  galois::for_each(\n      galois::iterate(workList.begin(), workList.end()),\n      ReconvergenceDrivenCut(aigGraph, perThreadRDCutData, cutSizeLimit),\n      galois::wl<DC_FIFO>(), galois::loopname(\"ReconvergenceDrivenCut\"));\n}\n\n} /* namespace algorithm */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/ReconvDrivenCut.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#ifndef RECONVDRIVENCUT_H_\n#define RECONVDRIVENCUT_H_\n\n#include \"Aig.h\"\n\n#include <unordered_set>\n\nnamespace algorithm {\n\ntypedef struct RDCutData_ {\n\n  std::unordered_set<aig::GNode> visited;\n  std::unordered_set<aig::GNode> leaves;\n\n} RDCutData;\n\ntypedef galois::substrate::PerThreadStorage<RDCutData> PerThreadRDCutData;\n\nclass ReconvDrivenCut {\n\nprivate:\n  aig::Aig& aig;\n  PerThreadRDCutData perThreadRDCutData;\n\npublic:\n  ReconvDrivenCut(aig::Aig& aig);\n\n  virtual ~ReconvDrivenCut();\n\n  void run(size_t cutSizeLimit);\n};\n\n} /* namespace algorithm */\n\nnamespace alg = algorithm;\n\n#endif /* RECONVDRIVENCUT_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/RewriteManager.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#include \"RewriteManager.h\"\n\n#include \"galois/worklists/Chunk.h\"\n\n//#include \"galois/runtime/profile.h\"\n\n#include <stdlib.h>\n#include <string.h>\n#include <assert.h>\n#include <chrono>\n\nusing namespace std::chrono;\n\nnamespace algorithm {\n\nRewriteManager::RewriteManager(aig::Aig& aig, CutManager& cutMan,\n                               NPNManager& npnMan, PreCompGraphManager& pcgMan,\n                               int triesNGraphs, bool useZeros,\n                               bool updateLevel)\n    : aig(aig), cutMan(cutMan), npnMan(npnMan), pcgMan(pcgMan),\n      perThreadContextData(), triesNGraphs(triesNGraphs), useZeros(useZeros),\n      updateLevel(updateLevel) {\n\n  nFuncs = (1 << 
16);\n\n  for (int i = 0; i < cutMan.getNThreads(); i++) {\n    ThreadContextData* threadCtx = perThreadContextData.getRemote(i);\n    threadCtx->threadId          = i;\n  }\n\n  rewriteTime = 0;\n}\n\nRewriteManager::~RewriteManager() {\n  // TODO\n}\n\naig::GNode RewriteManager::rewriteNode(ThreadContextData* threadCtx,\n                                       aig::GNode node,\n                                       GNodeVector& fanoutNodes) {\n\n  aig::Graph& aigGraph    = this->aig.getGraph();\n  aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::WRITE);\n\n  // Get the node's cuts\n  this->cutMan.computeCutsRecursively(node);\n  Cut* cutsBegin = this->cutMan.getNodeCuts()[nodeData.id];\n  assert(cutsBegin != nullptr);\n\n  threadCtx->bestCutMFFCIds.clear();\n  threadCtx->bestCutMFFCPreservedIds.clear();\n\n  Cut* cut;\n  char* perm;\n  unsigned phase;\n  unsigned truth;\n  // unsigned bestTruth = 0;\n  bool isOutputCompl = false;\n  int requiredLevel  = 0;\n  int nodesSaved;\n  // int bestNodesSaved;\n  int currentGain = -1, bestGain = -1;\n  int i;\n  DecGraph* currentGraph = nullptr;\n  DecGraph* bestGraph    = nullptr;\n\n  // Go through the cuts to lock the fanin conee\n  for (cut = cutsBegin; cut != nullptr; cut = cut->nextCut) {\n    // Consider only 4-input cuts\n    if (cut->nLeaves != 4) {\n      continue;\n    }\n    lockFaninCone(aigGraph, node, cut);\n  }\n\n  // Go through the cuts to rewrite\n  for (cut = cutsBegin; cut != nullptr; cut = cut->nextCut) {\n\n    // Consider only 4-input cuts\n    if (cut->nLeaves != 4) {\n      continue;\n    }\n\n    // Get the fanin permutation\n    truth = 0xFFFF & (*(this->cutMan.readTruth(cut)));\n    perm  = this->npnMan.getPerms4()[(int)this->npnMan.getPerms()[truth]];\n    phase = this->npnMan.getPhases()[truth];\n\n    // Collect fanins with the corresponding permutation/phase\n    for (i = 0; i < cut->nLeaves; i++) {\n      aig::GNode faninNode = 
this->aig.getNodes()[cut->leaves[(int)perm[i]]];\n      if (faninNode == nullptr) {\n        break;\n      }\n      threadCtx->currentFanins[i]    = faninNode;\n      threadCtx->currentFaninsPol[i] = !((phase & (1 << i)) > 0);\n    }\n\n    if (i != cut->nLeaves) {\n      continue;\n    }\n\n    int counter = 0;\n    for (aig::GNode faninNode : threadCtx->currentFanins) {\n      aig::NodeData& faninNodeData =\n          aigGraph.getData(faninNode, galois::MethodFlag::READ);\n      if (faninNodeData.nFanout == 1) {\n        counter++;\n      }\n    }\n\n    if (counter > 2) {\n      continue;\n    }\n\n    // lockFaninCone( aigGraph, node, cut );\n\n    // mark the fanin boundary\n    for (aig::GNode faninNode : threadCtx->currentFanins) {\n      aig::NodeData& faninNodeData =\n          aigGraph.getData(faninNode, galois::MethodFlag::WRITE);\n      faninNodeData.nFanout++;\n    }\n\n    // label MFFC with current ThreadId and the ThreadTravId\n    threadCtx->travId += 1;\n    nodesSaved =\n        labelMFFC(threadCtx, node, threadCtx->threadId, threadCtx->travId);\n\n    // unmark the fanin boundary\n    for (aig::GNode faninNode : threadCtx->currentFanins) {\n      aig::NodeData& faninNodeData =\n          aigGraph.getData(faninNode, galois::MethodFlag::WRITE);\n      faninNodeData.nFanout--;\n    }\n\n    // evaluate the cut\n    currentGraph = evaluateCut(threadCtx, node, cut, nodesSaved, requiredLevel,\n                               currentGain);\n\n    // check if the cut is better than the current best one\n    if ((currentGraph != nullptr) && (bestGain < currentGain)) {\n      bestGain      = currentGain;\n      bestGraph     = currentGraph;\n      isOutputCompl = ((phase & (1 << 4)) > 0);\n      // bestTruth = 0xFFFF & *this->cutMan.readTruth( cut );\n      // bestNodesSaved = nodesSaved;\n      // collect the fanins of the new best cut\n      for (size_t i = 0; i < threadCtx->currentFanins.size(); i++) {\n        threadCtx->bestFanins[i]    = 
threadCtx->currentFanins[i];\n        threadCtx->bestFaninsPol[i] = threadCtx->currentFaninsPol[i];\n      }\n      threadCtx->bestCutMFFCIds = threadCtx->currentCutMFFCIds;\n      threadCtx->bestCutMFFCPreservedIds =\n          threadCtx->currentCutMFFCPreservedIds;\n    }\n  }\n\n  if (!(bestGain > 0 || (bestGain == 0 && useZeros))) {\n    return nullptr;\n  }\n\n  assert(bestGraph != nullptr);\n\n  // Preparing structure/AIG tracking for updating the AIG\n  for (int j = 0; j < 20; j++) {\n    if (j < 4) {\n      threadCtx->decNodeFunc[j] =\n          threadCtx->bestFanins[j]; // Link cut leaves with the best\n                                    // decomposition graph\n    } else {\n      threadCtx->decNodeFunc[j] = nullptr; // Clear the link table, after leaves\n    }\n  }\n\n  // Define the MFFC available IDs to be reused\n  for (int id : threadCtx->bestCutMFFCPreservedIds) {\n    threadCtx->bestCutMFFCIds.erase(id);\n  }\n\n  // std::cout << threadCtx->threadId << \" - Updating AIG with gain \" <<\n  // bestGain << std::endl;\n  aig::GNode newRoot =\n      updateAig(threadCtx, node, bestGraph, fanoutNodes, isOutputCompl);\n  // std::cout << threadCtx->threadId << \" - Update done \" << std::endl;\n\n  return newRoot;\n}\n\nvoid RewriteManager::lockFaninCone(aig::Graph& aigGraph, aig::GNode node,\n                                   Cut* cut) {\n\n  aig::NodeData& nodeData =\n      aigGraph.getData(node, galois::MethodFlag::READ); // lock\n\n  // If node is a cut leaf\n  if ((nodeData.id == cut->leaves[0]) || (nodeData.id == cut->leaves[1]) ||\n      (nodeData.id == cut->leaves[2]) || (nodeData.id == cut->leaves[3])) {\n    return;\n  }\n\n  // If node is a PI\n  if ((nodeData.type == aig::NodeType::PI) ||\n      (nodeData.type == aig::NodeType::LATCH)) {\n    return;\n  }\n\n  auto inEdgeIt      = aigGraph.in_edge_begin(node);\n  aig::GNode lhsNode = aigGraph.getEdgeDst(inEdgeIt);\n  //  aig::NodeData& lhsData =\n  aigGraph.getData(lhsNode, 
galois::MethodFlag::READ); // lock\n  inEdgeIt++;\n  aig::GNode rhsNode = aigGraph.getEdgeDst(inEdgeIt);\n  //  aig::NodeData& rhsData =\n  aigGraph.getData(rhsNode, galois::MethodFlag::READ); // lock\n\n  lockFaninCone(aigGraph, lhsNode, cut);\n  lockFaninCone(aigGraph, rhsNode, cut);\n}\n\nint RewriteManager::labelMFFC(ThreadContextData* threadCtx, aig::GNode node,\n                              int threadId, int travId) {\n\n  aig::Graph& aigGraph = this->aig.getGraph();\n\n  threadCtx->currentCutMFFCIds.clear();\n\n  aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::READ);\n  if ((nodeData.type == aig::NodeType::PI) ||\n      (nodeData.type == aig::NodeType::LATCH)) {\n    return 0;\n  }\n\n  threadCtx->currentCutMFFCIds.insert(nodeData.id);\n\n  int nConeSize1 = refDerefMFFCNodes(threadCtx, node, threadId, travId, false,\n                                     true); // dereference\n  int GALOIS_USED_ONLY_IN_DEBUG(nConeSize2) =\n      refDerefMFFCNodes(threadCtx, node, threadId, travId, true,\n                        false); // reference\n\n  assert(nConeSize1 == nConeSize2);\n  assert(nConeSize1 > 0);\n\n  return nConeSize1;\n}\n\nint RewriteManager::refDerefMFFCNodes(ThreadContextData* threadCtx,\n                                      aig::GNode node, int threadId, int travId,\n                                      bool reference, bool label) {\n\n  aig::Graph& aigGraph    = this->aig.getGraph();\n  aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::READ);\n\n  // label visited nodes\n  if (label) {\n    this->aig.registerTravId(nodeData.id, threadId, travId);\n  }\n  // skip the CI\n  if ((nodeData.type == aig::NodeType::PI) ||\n      (nodeData.type == aig::NodeType::LATCH)) {\n    return 0;\n  }\n\n  // process the internal node\n  auto inEdgeIt          = aigGraph.in_edge_begin(node);\n  aig::GNode lhsNode     = aigGraph.getEdgeDst(inEdgeIt);\n  aig::NodeData& lhsData = aigGraph.getData(lhsNode, 
galois::MethodFlag::WRITE);\n\n  inEdgeIt++;\n  aig::GNode rhsNode     = aigGraph.getEdgeDst(inEdgeIt);\n  aig::NodeData& rhsData = aigGraph.getData(rhsNode, galois::MethodFlag::WRITE);\n\n  int counter = 1;\n\n  if (reference) {\n    if (lhsData.nFanout++ == 0) {\n      counter += refDerefMFFCNodes(threadCtx, lhsNode, threadId, travId,\n                                   reference, label);\n    }\n    if (rhsData.nFanout++ == 0) {\n      counter += refDerefMFFCNodes(threadCtx, rhsNode, threadId, travId,\n                                   reference, label);\n    }\n  } else {\n    assert(lhsData.nFanout > 0);\n    assert(rhsData.nFanout > 0);\n    if (--lhsData.nFanout == 0) {\n      threadCtx->currentCutMFFCIds.insert(lhsData.id);\n      counter += refDerefMFFCNodes(threadCtx, lhsNode, threadId, travId,\n                                   reference, label);\n    }\n    if (--rhsData.nFanout == 0) {\n      threadCtx->currentCutMFFCIds.insert(rhsData.id);\n      counter += refDerefMFFCNodes(threadCtx, rhsNode, threadId, travId,\n                                   reference, label);\n    }\n  }\n\n  return counter;\n}\n\nDecGraph* RewriteManager::evaluateCut(ThreadContextData* threadCtx,\n                                      aig::GNode root, Cut* cut,\n                                      int nNodesSaved, int maxLevel,\n                                      int& bestGain) {\n\n  DecGraph* bestGraph = NULL;\n  DecGraph* currentGraph;\n  ForestNode* node;\n  int nNodesAdded;\n  unsigned uTruth;\n  bestGain = -1;\n\n  threadCtx->currentCutMFFCPreservedIds.clear();\n\n  // find the matching class of subgraphs\n  uTruth = 0xFFFF & *this->cutMan.readTruth(cut);\n  std::vector<ForestNode*>& subgraphs =\n      this->pcgMan.getClasses()[this->npnMan.getMap()[uTruth]];\n\n  aig::Graph& aigGraph = aig.getGraph();\n\n  // copy the leaves\n  for (int i = 0; i < 4; i++) { // each decGraph has exactly 4 inputs (vars).\n    aig::GNode fanin = threadCtx->currentFanins[i];\n    //    
aig::NodeData& faninData =\n    aigGraph.getData(fanin, galois::MethodFlag::READ);\n    threadCtx->decNodeFunc[i] = fanin;\n    // threadCtx->decNodeLevel[i] = faninData.level;\n  }\n\n  // Pruning\n  int nSubgraphs = subgraphs.size();\n  if (nSubgraphs > this->triesNGraphs) {\n    nSubgraphs = this->triesNGraphs;\n  }\n\n  // determine the best subgraph\n  for (int i = 0; i < nSubgraphs; i++) {\n    node = subgraphs[i];\n    // get the current graph\n    currentGraph = (DecGraph*)node->pNext;\n\n    // clear link table, after leaves\n    for (int j = 4; j < 20; j++) { // each decGraph has at most 20 nodes.\n      threadCtx->decNodeFunc[j] = NULL;\n    }\n\n    // detect how many unlabeled nodes will be reused\n    nNodesAdded = decGraphToAigCount(threadCtx, root, currentGraph, nNodesSaved,\n                                     maxLevel);\n\n    if (nNodesAdded == -1) {\n      continue;\n    }\n\n    assert(nNodesSaved >= nNodesAdded);\n\n    // count the gain at this node\n    if (bestGain < nNodesSaved - nNodesAdded) {\n      bestGain  = nNodesSaved - nNodesAdded;\n      bestGraph = currentGraph;\n      threadCtx->currentCutMFFCPreservedIds =\n          threadCtx->currentGraphMFFCPreservedIds;\n    }\n  }\n\n  if (bestGain == -1) {\n    return NULL;\n  }\n\n  return bestGraph;\n}\n\n/*\n *   Before calling this procedure, AIG nodes should be assigned to DecNodes by\n *   using the threadCtx->decNodeFunc[ DecNode.id ] for each leaf of the\n * decGraph. 
Returns -1 if the number of nodes and levels exceeded the given\n * limit or the number of levels exceeded the maximum allowed level.\n */\nint RewriteManager::decGraphToAigCount(ThreadContextData* threadCtx,\n                                       aig::GNode root, DecGraph* decGraph,\n                                       int maxNode,\n                                       int GALOIS_UNUSED(maxLevel)) {\n\n  DecNode* node;\n  DecNode* lhsNode;\n  DecNode* rhsNode;\n  aig::GNode curAnd;\n  aig::GNode lhsAnd;\n  aig::GNode rhsAnd;\n  bool lhsPol, rhsPol;\n  int counter = 0;\n  // int newLevel, oldLevel;\n\n  aig::Graph& aigGraph = this->aig.getGraph();\n\n  threadCtx->currentGraphMFFCPreservedIds.clear();\n\n  // check for constant function or a literal\n  if (decGraph->isConst() || decGraph->isVar()) {\n    return counter;\n  }\n\n  // compute the AIG size after adding the internal nodes\n  for (int i = decGraph->getLeaveNum();\n       (i < decGraph->getNodeNum()) && ((node = decGraph->getNode(i)), 1);\n       i++) {\n\n    // get the children of this node\n    lhsNode = decGraph->getNode(node->eEdge0.Node);\n    rhsNode = decGraph->getNode(node->eEdge1.Node);\n\n    // get the AIG nodes corresponding to the children\n    lhsAnd = threadCtx->decNodeFunc[lhsNode->id];\n    rhsAnd = threadCtx->decNodeFunc[rhsNode->id];\n\n    // if they are both present, find the resulting node\n    if (lhsAnd && rhsAnd) {\n      if (lhsNode->id < 4) { // If lhs is a cut leaf\n        lhsPol = node->eEdge0.fCompl\n                     ? !(threadCtx->currentFaninsPol[lhsNode->id])\n                     : threadCtx->currentFaninsPol[lhsNode->id];\n      } else {\n        lhsPol = node->eEdge0.fCompl ? false : true;\n      }\n\n      if (rhsNode->id < 4) { // If rhs is a cut leaf\n        rhsPol = node->eEdge1.fCompl\n                     ? 
!(threadCtx->currentFaninsPol[rhsNode->id])\n                     : threadCtx->currentFaninsPol[rhsNode->id];\n      } else {\n        rhsPol = node->eEdge1.fCompl ? false : true;\n      }\n\n      curAnd = this->aig.lookupNodeInFanoutMap(lhsAnd, rhsAnd, lhsPol, rhsPol);\n\n      // return -1 if the node is the same as the original root\n      if (curAnd == root) {\n        return -1;\n      }\n    } else {\n      curAnd = nullptr;\n    }\n\n    // count the number of new levels\n    // newLevel = 1 + std::max( threadCtx->decNodeLevel[ lhsNode->id ],\n    // threadCtx->decNodeLevel[ rhsNode->id ] );\n\n    if (curAnd) {\n      aig::NodeData& curAndData =\n          aigGraph.getData(curAnd, galois::MethodFlag::READ);\n      bool isMFFC = this->aig.lookupTravId(curAndData.id, threadCtx->threadId,\n                                           threadCtx->travId);\n\n      if (isMFFC) {\n        threadCtx->currentGraphMFFCPreservedIds.insert(curAndData.id);\n        // count the number of added nodes\n        if (++counter > maxNode) {\n          return -1;\n        }\n      }\n\n      // TODO Implement an Heuristic for levels preservation\n      /*\n      if ( curAnd == aig.getConstZero() ) {\n          newLevel = 0;\n      }\n      else {\n          if ( curAnd == lhsAnd ) {\n              aig::NodeData & lhsAndData = aigGraph.getData( lhsAnd,\n      galois::MethodFlag::READ ); newLevel = lhsAndData.level;\n          }\n          else {\n              if ( curAnd == rhsAnd ) {\n                  aig::NodeData & rhsAndData = aigGraph.getData( rhsAnd,\n      galois::MethodFlag::READ ); newLevel = rhsAndData.level;\n              }\n          }\n      }\n\n      oldLevel = curAndData.level;\n      //assert( LevelNew == LevelOld );\n      */\n    } else {\n      // count the number of added nodes\n      if (++counter > maxNode) {\n        return -1;\n      }\n    }\n\n    // if ( newLevel > maxLevel ) {\n    //    return -1;\n    //}\n\n    threadCtx->decNodeFunc[node->id] 
= curAnd;\n    // threadCtx->decNodeLevel[ node->id ] = newLevel;\n  }\n\n  return counter;\n}\n\naig::GNode RewriteManager::updateAig(ThreadContextData* threadCtx,\n                                     aig::GNode oldRoot, DecGraph* decGraph,\n                                     GNodeVector& fanoutNodes,\n                                     bool isOutputCompl) {\n\n  aig::Graph& aigGraph = this->aig.getGraph();\n\n  // Prepare to delete nodes in the MFFC\n  for (int id : threadCtx->bestCutMFFCIds) {\n    aig::GNode mffcNode = this->aig.getNodes()[id];\n    auto inEdge         = aigGraph.in_edge_begin(mffcNode);\n\n    aig::GNode lhsNode = aigGraph.getEdgeDst(inEdge);\n    //    aig::NodeData& lhsNodeData =\n    aigGraph.getData(lhsNode, galois::MethodFlag::WRITE);\n    bool lhsNodePol = aigGraph.getEdgeData(inEdge);\n    inEdge++;\n    aig::GNode rhsNode = aigGraph.getEdgeDst(inEdge);\n    //    aig::NodeData& rhsNodeData =\n    aigGraph.getData(rhsNode, galois::MethodFlag::WRITE);\n    bool rhsNodePol = aigGraph.getEdgeData(inEdge);\n\n    this->aig.removeNodeInFanoutMap(mffcNode, lhsNode, rhsNode, lhsNodePol,\n                                    rhsNodePol);\n    this->aig.getNodes()[id] = nullptr;\n    this->aig.getFanoutMap(id).clear();\n    this->cutMan.recycleNodeCuts(id);\n  }\n\n  bool isDecGraphComplement = isOutputCompl\n                                  ? (bool)decGraph->getRootEdge().fCompl ^ 1\n                                  : (bool)decGraph->getRootEdge().fCompl;\n  aig::GNode newRoot;\n\n  // check for constant function\n  if (decGraph->isConst()) {\n    newRoot = this->aig.getConstZero();\n  } else {\n    // check for a literal\n    if (decGraph->isVar()) {\n      DecNode* decNode = decGraph->getVar();\n      isDecGraphComplement =\n          isDecGraphComplement ? 
(!threadCtx->bestFaninsPol[decNode->id]) ^ true\n                               : !threadCtx->bestFaninsPol[decNode->id];\n      newRoot = threadCtx->decNodeFunc[decNode->id];\n    } else {\n      newRoot = decGraphToAig(threadCtx, decGraph);\n    }\n  }\n\n  addNewSubgraph(oldRoot, newRoot, fanoutNodes, isDecGraphComplement);\n\n  deleteOldMFFC(aigGraph, oldRoot);\n\n  return newRoot;\n}\n\n/*\n *   Transforms the decomposition graph into the AIG.\n *   Before calling this procedure, AIG nodes for the fanins\n *   should be assigned to threadCtx->decNodeFunc[ decNode.id ].\n */\naig::GNode RewriteManager::decGraphToAig(ThreadContextData* threadCtx,\n                                         DecGraph* decGraph) {\n\n  DecNode* decNode = nullptr;\n  DecNode* lhsNode;\n  DecNode* rhsNode;\n  aig::GNode curAnd;\n  aig::GNode lhsAnd;\n  aig::GNode rhsAnd;\n  bool lhsAndPol;\n  bool rhsAndPol;\n\n  // build the AIG nodes corresponding to the AND gates of the graph\n  for (int i = decGraph->getLeaveNum();\n       (i < decGraph->getNodeNum()) && ((decNode = decGraph->getNode(i)), 1);\n       i++) {\n\n    // get the children of this node\n    lhsNode = decGraph->getNode(decNode->eEdge0.Node);\n    rhsNode = decGraph->getNode(decNode->eEdge1.Node);\n\n    // get the AIG nodes corresponding to the children\n    lhsAnd = threadCtx->decNodeFunc[lhsNode->id];\n    rhsAnd = threadCtx->decNodeFunc[rhsNode->id];\n\n    if (lhsNode->id < 4) { // If lhs is a cut leaf\n      lhsAndPol = decNode->eEdge0.fCompl\n                      ? !(threadCtx->bestFaninsPol[lhsNode->id])\n                      : threadCtx->bestFaninsPol[lhsNode->id];\n    } else {\n      lhsAndPol = decNode->eEdge0.fCompl ? false : true;\n    }\n\n    if (rhsNode->id < 4) { // If rhs is a cut leaf\n      rhsAndPol = decNode->eEdge1.fCompl\n                      ? 
!(threadCtx->bestFaninsPol[rhsNode->id])\n                      : threadCtx->bestFaninsPol[rhsNode->id];\n    } else {\n      rhsAndPol = decNode->eEdge1.fCompl ? false : true;\n    }\n\n    curAnd =\n        this->aig.lookupNodeInFanoutMap(lhsAnd, rhsAnd, lhsAndPol, rhsAndPol);\n\n    if (curAnd) {\n      threadCtx->decNodeFunc[decNode->id] = curAnd;\n    } else {\n      threadCtx->decNodeFunc[decNode->id] =\n          createAndNode(threadCtx, lhsAnd, rhsAnd, lhsAndPol, rhsAndPol);\n    }\n  }\n\n  return threadCtx->decNodeFunc[decNode->id];\n}\n\naig::GNode RewriteManager::createAndNode(ThreadContextData* threadCtx,\n                                         aig::GNode lhsAnd, aig::GNode rhsAnd,\n                                         bool lhsAndPol, bool rhsAndPol) {\n\n  aig::Graph& aigGraph = this->aig.getGraph();\n  aig::NodeData& lhsAndData =\n      aigGraph.getData(lhsAnd, galois::MethodFlag::READ);\n  aig::NodeData& rhsAndData =\n      aigGraph.getData(rhsAnd, galois::MethodFlag::READ);\n\n  aig::NodeData newAndData;\n\n  auto idIt =\n      threadCtx->bestCutMFFCIds.begin(); // reuse an ID from deleted MFFC\n  auto id = (*idIt);\n  threadCtx->bestCutMFFCIds.erase(\n      idIt); // remove the reused ID from the available IDs set\n  assert(id < int(this->aig.getNodes().size()));\n\n  newAndData.id      = id;\n  newAndData.type    = aig::NodeType::AND;\n  newAndData.level   = 1 + std::max(lhsAndData.level, rhsAndData.level);\n  newAndData.counter = 0;\n\n  if (lhsAndData.counter == 3) {\n    newAndData.counter += 1;\n  }\n\n  if (rhsAndData.counter == 3) {\n    newAndData.counter += 1;\n  }\n\n  if (newAndData.counter == 2) {\n    newAndData.counter += 1;\n  }\n\n  aig::GNode newAnd = aigGraph.createNode(newAndData);\n  aigGraph.addNode(newAnd);\n\n  aigGraph.getEdgeData(aigGraph.addMultiEdge(\n      lhsAnd, newAnd, galois::MethodFlag::WRITE)) = lhsAndPol;\n  aigGraph.getEdgeData(aigGraph.addMultiEdge(\n      rhsAnd, newAnd, galois::MethodFlag::WRITE)) = 
rhsAndPol;\n  lhsAndData.nFanout++;\n  rhsAndData.nFanout++;\n\n  // int faninSize = std::distance( aigGraph.in_edge_begin( newAnd ),\n  // aigGraph.in_edge_end( newAnd ) ); assert( faninSize == 2 );\n\n  this->aig.getNodes()[id] = newAnd;\n  this->aig.insertNodeInFanoutMap(newAnd, lhsAnd, rhsAnd, lhsAndPol, rhsAndPol);\n\n  return newAnd;\n}\n\nvoid RewriteManager::addNewSubgraph(aig::GNode oldNode, aig::GNode newNode,\n                                    GNodeVector& fanoutNodes,\n                                    bool isNewRootComplement) {\n\n  int fanoutNodesSize = fanoutNodes.size();\n\n  aig::GNode fanoutNode;\n  aig::GNode otherNode;\n  bool otherNodePol;\n  bool newNodePol;\n  bool oldNodePol;\n\n  aig::Graph& aigGraph = this->aig.getGraph();\n  aig::NodeData& newNodeData =\n      aigGraph.getData(newNode, galois::MethodFlag::READ);\n  aig::NodeData& oldNodeData =\n      aigGraph.getData(oldNode, galois::MethodFlag::READ);\n  assert(oldNodeData.nFanout == fanoutNodesSize);\n\n  // look at the fanouts of old node\n  for (int i = 0; i < fanoutNodesSize; i++) {\n\n    fanoutNode = fanoutNodes[i];\n    aig::NodeData& fanoutNodeData =\n        aigGraph.getData(fanoutNode, galois::MethodFlag::READ);\n\n    // auto outEdge = aigGraph.findEdge( oldNode, fanoutNode );\n    auto fanoutNodeInEdge = aigGraph.findInEdge(oldNode, fanoutNode);\n\n    if (fanoutNodeInEdge == aigGraph.in_edge_end(fanoutNode)) {\n      std::cout << \"Adding new subgraph, fanoutNode inEdge not found!\"\n                << std::endl;\n    }\n\n    oldNodePol = aigGraph.getEdgeData(fanoutNodeInEdge);\n    // newNodePol = isNewRootComplement ? !(false ^ oldNodePol) : !(true ^\n    // oldNodePol);\n    newNodePol = isNewRootComplement ? 
!(oldNodePol) : oldNodePol;\n\n    if ((fanoutNodeData.type == aig::NodeType::PO) ||\n        (fanoutNodeData.type == aig::NodeType::LATCH)) {\n      // remove the oldNode from the fanoutNode's fanin\n      // aigGraph.removeEdge( oldNode, fanoutEdge );\n      aigGraph.removeInEdge(fanoutNode, fanoutNodeInEdge);\n      oldNodeData.nFanout--;\n      // add newNode to the fanoutNode's fanin\n      aigGraph.getEdgeData(aigGraph.addMultiEdge(\n          newNode, fanoutNode, galois::MethodFlag::WRITE)) = newNodePol;\n      newNodeData.nFanout++;\n      fanoutNodeData.level = newNodeData.level;\n      continue;\n    }\n\n    // find the otherNode different from oldNode as a fanin of the fanoutNode\n    auto inEdge  = aigGraph.in_edge_begin(fanoutNode);\n    otherNode    = aigGraph.getEdgeDst(inEdge);\n    otherNodePol = aigGraph.getEdgeData(inEdge);\n\n    if (otherNode == oldNode) {\n      inEdge++;\n      otherNode    = aigGraph.getEdgeDst(inEdge);\n      otherNodePol = aigGraph.getEdgeData(inEdge);\n    }\n\n    assert(newNode != otherNode);\n\n    // Remove fanoutNode from the fanoutMap from otherNode\n    this->aig.removeNodeInFanoutMap(fanoutNode, otherNode, oldNode,\n                                    otherNodePol, oldNodePol);\n\n    // remove the oldNode from the fanoutNode fanin\n    // aigGraph.removeEdge( oldNode, fanoutEdge );\n    aigGraph.removeInEdge(fanoutNode, fanoutNodeInEdge);\n    oldNodeData.nFanout--;\n\n    // add newNode to the fanoutNode fanins\n    aigGraph.getEdgeData(aigGraph.addMultiEdge(\n        newNode, fanoutNode, galois::MethodFlag::WRITE)) = newNodePol;\n    newNodeData.nFanout++;\n\n    aig::NodeData& otherNodeData =\n        aigGraph.getData(otherNode, galois::MethodFlag::READ);\n    fanoutNodeData.level = 1 + std::max(newNodeData.level, otherNodeData.level);\n\n    // Insert fanoutNode in the fanoutMap from other Node with new inEdge\n    this->aig.insertNodeInFanoutMap(fanoutNode, otherNode, newNode,\n                                
    otherNodePol, newNodePol);\n  }\n}\n\nvoid RewriteManager::deleteOldMFFC(aig::Graph& aigGraph, aig::GNode oldNode) {\n\n  // assert( oldNode != nullptr );\n\n  aig::NodeData& oldNodeData =\n      aigGraph.getData(oldNode, galois::MethodFlag::READ);\n\n  if ((oldNodeData.type == aig::NodeType::AND) && (oldNodeData.nFanout == 0) &&\n      aigGraph.containsNode(oldNode, galois::MethodFlag::WRITE)) {\n    deleteOldMFFCRec(aigGraph, oldNode);\n  }\n}\n\nvoid RewriteManager::deleteOldMFFCRec(aig::Graph& aigGraph,\n                                      aig::GNode oldNode) {\n\n  auto inEdge        = aigGraph.in_edge_begin(oldNode);\n  aig::GNode lhsNode = aigGraph.getEdgeDst(inEdge);\n  aig::NodeData& lhsNodeData =\n      aigGraph.getData(lhsNode, galois::MethodFlag::WRITE);\n  inEdge++;\n  aig::GNode rhsNode = aigGraph.getEdgeDst(inEdge);\n  aig::NodeData& rhsNodeData =\n      aigGraph.getData(rhsNode, galois::MethodFlag::WRITE);\n\n  // assert( (lhsNode != nullptr) && (rhsNode != nullptr) );\n\n  aigGraph.removeNode(oldNode);\n  lhsNodeData.nFanout--;\n  rhsNodeData.nFanout--;\n\n  if ((lhsNodeData.type == aig::NodeType::AND) && (lhsNodeData.nFanout == 0) &&\n      aigGraph.containsNode(lhsNode, galois::MethodFlag::WRITE)) {\n    deleteOldMFFCRec(aigGraph, lhsNode);\n  }\n\n  if ((rhsNodeData.type == aig::NodeType::AND) && (rhsNodeData.nFanout == 0) &&\n      aigGraph.containsNode(rhsNode, galois::MethodFlag::WRITE)) {\n    deleteOldMFFCRec(aigGraph, rhsNode);\n  }\n}\n\naig::Aig& RewriteManager::getAig() { return this->aig; }\n\nCutManager& RewriteManager::getCutMan() { return this->cutMan; }\n\nNPNManager& RewriteManager::getNPNMan() { return this->npnMan; }\n\nPreCompGraphManager& RewriteManager::getPcgMan() { return this->pcgMan; }\n\nPerThreadContextData& RewriteManager::getPerThreadContextData() {\n  return this->perThreadContextData;\n}\n\nbool RewriteManager::getUseZerosFlag() { return this->useZeros; }\n\nbool RewriteManager::getUpdateLevelFlag() { return 
this->updateLevel; }\n\nlong double RewriteManager::getRewriteTime() { return this->rewriteTime; }\n\nvoid RewriteManager::setRewriteTime(long double time) {\n  this->rewriteTime = time;\n}\n\nstruct RewriteOperator {\n\n  RewriteManager& rwtMan;\n  CutManager& cutMan;\n\n  RewriteOperator(RewriteManager& rwtMan)\n      : rwtMan(rwtMan), cutMan(rwtMan.getCutMan()) {}\n\n  void operator()(aig::GNode node, galois::UserContext<aig::GNode>& ctx) {\n\n    aig::Graph& aigGraph = rwtMan.getAig().getGraph();\n\n    if ((node == nullptr) ||\n        !aigGraph.containsNode(node, galois::MethodFlag::WRITE)) {\n      return;\n    }\n\n    aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::WRITE);\n\n    if (nodeData.type == aig::NodeType::AND) {\n\n      if ((nodeData.nFanout < 1000)) {\n\n        Alloc& alloc = ctx.getPerIterAlloc();\n        GNodeVector fanoutNodes(alloc);\n\n        // Touching outgoing neighbors to acquire their locks and their fanin\n        // node's locks.\n        for (auto outEdge : aigGraph.out_edges(node)) {\n          aig::GNode fanoutNode = aigGraph.getEdgeDst(outEdge);\n          fanoutNodes.push_back(fanoutNode);\n          aigGraph.in_edges(fanoutNode);\n        }\n\n        ThreadContextData* threadCtx =\n            rwtMan.getPerThreadContextData().getLocal();\n\n        // Try to rewrite the node\n        aig::GNode newNode = rwtMan.rewriteNode(threadCtx, node, fanoutNodes);\n\n        bool scheduleFanoutNodes = false;\n\n        if (newNode == nullptr) { // it means that node was not rewritten\n          if (nodeData.counter == 2) {\n            nodeData.counter += 1;\n          }\n\n          if (nodeData.counter == 3) {\n            scheduleFanoutNodes = true;\n          }\n        } else {\n          aig::NodeData& newNodeData =\n              aigGraph.getData(newNode, galois::MethodFlag::READ);\n          if (newNodeData.counter == 3) {\n            scheduleFanoutNodes = true;\n          }\n        }\n\n        if 
(scheduleFanoutNodes) {\n          for (aig::GNode nextNode : fanoutNodes) {\n            aig::NodeData& nextNodeData =\n                aigGraph.getData(nextNode, galois::MethodFlag::WRITE);\n\n            if ((nextNodeData.type == aig::NodeType::PO) ||\n                (nextNodeData.type == aig::NodeType::LATCH)) {\n              continue;\n            }\n\n            nextNodeData.counter += 1;\n            if (nextNodeData.counter == 2) {\n              if (cutMan.getNodeCuts()[nextNodeData.id] != nullptr) {\n                cutMan.recycleNodeCuts(nextNodeData.id);\n              }\n              // rwtMan.nPushes += 1;\n              ctx.push(nextNode);\n            }\n          }\n        }\n\n      } else {\n\n        // Touching outgoing neighbors to acquire their locks and their fanin\n        // node's locks.\n        aigGraph.out_edges(node);\n\n        if (nodeData.counter == 2) {\n          nodeData.counter += 1;\n        }\n\n        if (nodeData.counter == 3) {\n          // Insert nextNodes in the worklist\n          for (auto outEdge : aigGraph.out_edges(node)) {\n            aig::GNode nextNode = aigGraph.getEdgeDst(outEdge);\n            aig::NodeData& nextNodeData =\n                aigGraph.getData(nextNode, galois::MethodFlag::WRITE);\n\n            if ((nextNodeData.type == aig::NodeType::PO) ||\n                (nextNodeData.type == aig::NodeType::LATCH)) {\n              continue;\n            }\n\n            nextNodeData.counter += 1;\n            if (nextNodeData.counter == 2) {\n              if (cutMan.getNodeCuts()[nextNodeData.id] != nullptr) {\n                cutMan.recycleNodeCuts(nextNodeData.id);\n              }\n              // rwtMan.nPushes += 1;\n              ctx.push(nextNode);\n            }\n          }\n        }\n      }\n    } else {\n      if ((nodeData.type == aig::NodeType::PI) ||\n          (nodeData.type == aig::NodeType::LATCH)) {\n\n        // Touching outgoing neighbors to acquire their locks and their 
fanin\n        // node's locks.\n        aigGraph.out_edges(node);\n\n        // Set the trivial cut\n        nodeData.counter      = 3;\n        CutPool* cutPool      = cutMan.getPerThreadCutPool().getLocal();\n        Cut* trivialCut       = cutPool->getMemory();\n        trivialCut->leaves[0] = nodeData.id;\n        trivialCut->nLeaves++;\n        trivialCut->sig = (1U << (nodeData.id % 31));\n        if (cutMan.getCompTruthFlag()) {\n          unsigned* cutTruth = cutMan.readTruth(trivialCut);\n          for (int i = 0; i < cutMan.getNWords(); i++) {\n            cutTruth[i] = 0xAAAAAAAA;\n          }\n        }\n        cutMan.getNodeCuts()[nodeData.id] = trivialCut;\n\n        // Schedule next nodes\n        for (auto edge : aigGraph.out_edges(node)) {\n          aig::GNode nextNode = aigGraph.getEdgeDst(edge);\n          aig::NodeData& nextNodeData =\n              aigGraph.getData(nextNode, galois::MethodFlag::WRITE);\n\n          if ((nextNodeData.type == aig::NodeType::PO) ||\n              (nextNodeData.type == aig::NodeType::LATCH)) {\n            continue;\n          }\n\n          nextNodeData.counter += 1;\n          if (nextNodeData.counter == 2) {\n            // rwtMan.nPushes += 1;\n            ctx.push(nextNode);\n          }\n        }\n      }\n    }\n  }\n};\n\nvoid runRewriteOperator(RewriteManager& rwtMan) {\n\n  // galois::runtime::profileVtune(\n\n  galois::InsertBag<aig::GNode> workList;\n  typedef galois::worklists::PerSocketChunkBag<500> DC_BAG;\n  // typedef galois::worklists::PerSocketChunkFIFO< 5000 > DC_FIFO;\n  // typedef galois::worklists::PerSocketChunkLIFO< 5000 > DC_LIFO;\n  // typedef galois::worklists::PerThreadChunkFIFO< 5000 > AC_FIFO;\n\n  for (auto pi : rwtMan.getAig().getInputNodes()) {\n    workList.push(pi);\n  }\n\n  for (auto latch : rwtMan.getAig().getLatchNodes()) {\n    workList.push(latch);\n  }\n\n  // Galois Parallel Foreach\n  galois::for_each(galois::iterate(workList.begin(), workList.end()),\n               
    RewriteOperator(rwtMan), galois::wl<DC_BAG>(),\n                   galois::loopname(\"RewriteOperator\"),\n                   galois::per_iter_alloc());\n\n  // galois::wl<galois::worklists::Deterministic<>>(),\n  // galois::wl<DC_BAG>(),\n\n  //,\"REWRITING\" );\n}\n\n} /* namespace algorithm */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/algorithms/RewriteManager.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Parallel Rewriting January 5, 2018.\n ABC-based implementation on Galois.\n\n*/\n\n#ifndef REWRITEMANAGER_H_\n#define REWRITEMANAGER_H_\n\n#include \"Aig.h\"\n#include \"CutManager.h\"\n#include \"NPNManager.h\"\n#include \"PreCompGraphManager.h\"\n\n#include \"galois/worklists/Chunk.h\"\n\n#include <vector>\n#include <unordered_set>\n\nnamespace algorithm {\n\ntypedef struct ThreadContextData_ {\n  // Labels\n  int threadId;\n  int travId;\n  // Cut under evaluation data\n  std::vector<bool> currentFaninsPol;\n  std::vector<bool> bestFaninsPol;\n  std::vector<aig::GNode> currentFanins;\n  std::vector<aig::GNode> bestFanins;\n  // Decomposition graphs data\n  std::vector<aig::GNode> decNodeFunc;\n  // std::vector< int > decNodeLevel;\n  // MFFC IDs to be reused\n  std::unordered_set<int> 
currentCutMFFCIds;\n  std::unordered_set<int> bestCutMFFCIds;\n  std::unordered_set<int> currentGraphMFFCPreservedIds;\n  std::unordered_set<int> currentCutMFFCPreservedIds;\n  std::unordered_set<int> bestCutMFFCPreservedIds;\n\n  ThreadContextData_()\n      : threadId(0), travId(0), currentFaninsPol(4), bestFaninsPol(4),\n        currentFanins(4), bestFanins(4), decNodeFunc(20) {\n  } //, decNodeLevel( 20 ) { }\n\n} ThreadContextData;\n\ntypedef galois::PerIterAllocTy Alloc;\ntypedef std::vector<int, galois::PerIterAllocTy::rebind<int>::other> IntVector;\ntypedef std::vector<aig::GNode,\n                    galois::PerIterAllocTy::rebind<aig::GNode>::other>\n    GNodeVector;\ntypedef std::unordered_set<int, std::hash<int>, std::equal_to<int>,\n                           galois::PerIterAllocTy::rebind<int>::other>\n    IntSet;\n\ntypedef galois::substrate::PerThreadStorage<ThreadContextData>\n    PerThreadContextData;\n\nclass RewriteManager {\n\nprivate:\n  aig::Aig& aig;\n  CutManager& cutMan;\n  NPNManager& npnMan;\n  PreCompGraphManager& pcgMan;\n\n  PerThreadContextData perThreadContextData;\n\n  int nFuncs;\n  int triesNGraphs;\n  bool useZeros;\n  bool updateLevel;\n\n  long double rewriteTime;\n\n  void lockFaninCone(aig::Graph& aigGraph, aig::GNode node, Cut* cut);\n  int labelMFFC(ThreadContextData* threadCtx, aig::GNode node, int threadId,\n                int travId);\n  int refDerefMFFCNodes(ThreadContextData* threadCtx, aig::GNode node,\n                        int threadId, int travId, bool reference, bool label);\n\n  DecGraph* evaluateCut(ThreadContextData* threadCtx, aig::GNode root, Cut* cut,\n                        int nNodesSaved, int maxLevel, int& bestGain);\n  int decGraphToAigCount(ThreadContextData* threadCtx, aig::GNode root,\n                         DecGraph* decGraph, int maxNode, int maxLevel);\n  aig::GNode updateAig(ThreadContextData* threadCtx, aig::GNode oldRoot,\n                       DecGraph* decGraph, GNodeVector& 
fanoutNodes,\n                       bool isOutputCompl);\n  aig::GNode decGraphToAig(ThreadContextData* threadCtx, DecGraph* decGraph);\n  aig::GNode createAndNode(ThreadContextData* threadCtx, aig::GNode lhsAnd,\n                           aig::GNode rhsAnd, bool lhsAndPol, bool rhsAndPol);\n  void addNewSubgraph(aig::GNode oldNode, aig::GNode newNode,\n                      GNodeVector& fanoutNodes, bool isNewRootComplement);\n  void deleteOldMFFC(aig::Graph& aigGraph, aig::GNode oldNode);\n  void deleteOldMFFCRec(aig::Graph& aigGraph, aig::GNode oldNode);\n\n  // void recycleIDsAndCuts( ThreadContextData * threadCtx, IntVector &\n  // availableIDs ); aig::GNode searchNode( aig::GNode lhsNode, aig::GNode\n  // rhsNode, bool lhsPol, bool rhsPol );\n\n  /*\n  void buildLocalStrash( ThreadContextData * threadCtx, Cut * cut, IntSet &\n  visited ); void addLocalStrash( ThreadContextData * threadCtx, aig::GNode node\n  ); aig::GNode lookupLocalStrash( ThreadContextData * threadCtx, aig::GNode\n  lhsNode, aig::GNode rhsNode, bool lhsPol, bool rhsPol ); int makeAndHashKey(\n  aig::GNode lhsNode, aig::GNode rhsNode, bool lhsPol, bool rhsPol ); void\n  showLocalStrash( std::vector< aig::GNode > & strashMap );\n  */\n\npublic:\n  galois::GAccumulator<long int> nPushes;\n\n  RewriteManager(aig::Aig& aig, CutManager& cutMan, NPNManager& npnMan,\n                 PreCompGraphManager& pcgMan, int triesNGraphs, bool useZeros,\n                 bool updateLevel);\n\n  ~RewriteManager();\n\n  aig::GNode rewriteNode(ThreadContextData* threadCtx, aig::GNode node,\n                         GNodeVector& fanoutNodes);\n\n  aig::Aig& getAig();\n  CutManager& getCutMan();\n  NPNManager& getNPNMan();\n  PreCompGraphManager& getPcgMan();\n  PerThreadContextData& getPerThreadContextData();\n  bool getUseZerosFlag();\n  bool getUpdateLevelFlag();\n  long double getRewriteTime();\n  void setRewriteTime(long double time);\n};\n\nvoid runRewriteOperator(RewriteManager& rwtMan);\n\n} /* 
namespace algorithm */\n\n#endif /* REWRITEMANAGER_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/functional/BitVectorPool.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * FunctionPool.cpp\n *\n *  Created on: 27/03/2017\n *      Author: viniciuspossani\n */\n\n#include <iostream>\n#include \"BitVectorPool.h\"\n\nnamespace Functional {\n\nBitVectorPool::BitVectorPool(int nElements, int nWords) {\n  this->blockSize    = nElements;\n  this->nWords       = nWords;\n  this->index        = 0;\n  this->currentBlock = nullptr;\n  alloc();\n}\n\nBitVectorPool::~BitVectorPool() {\n  //\tstd::cout << \"Deleting Blocks...\" << std::endl;\n  for (auto ptr : this->blocks) {\n    free(ptr[0]);\n    free(ptr);\n  }\n}\n\nvoid BitVectorPool::alloc() {\n\n  word* tmp = (word*)malloc(sizeof(word) * (this->blockSize * this->nWords));\n  if (tmp == nullptr) {\n    std::cout << \"Error: memory could not be allocated by BitVectorPool!\"\n              << std::endl;\n    exit(1);\n  }\n\n  
this->currentBlock = (word**)malloc(sizeof(word*) * this->blockSize);\n  if (this->currentBlock == nullptr) {\n    std::cout << \"Error: memory could not be allocated by BitVectorPool!\"\n              << std::endl;\n    exit(1);\n  }\n\n  int i, j = 0;\n  for (i = 0; i < this->blockSize; i++) {\n    this->currentBlock[i] = &tmp[j];\n    j += this->nWords;\n  }\n\n  this->blocks.push_back(this->currentBlock);\n}\n\nword* BitVectorPool::getMemory() {\n\n  if (index >= blockSize) {\n    alloc();\n    this->index = 0;\n  }\n\n  word* ptr = this->currentBlock[this->index];\n  this->index++;\n  return ptr;\n}\n\nword* BitVectorPool::getCleanMemory() {\n\n  if (index >= blockSize) {\n    alloc();\n    this->index = 0;\n  }\n\n  for (int i = 0; i < this->nWords; i++) {\n    this->currentBlock[this->index][i] = 0;\n  }\n\n  word* ptr = this->currentBlock[this->index];\n  this->index++;\n  return ptr;\n}\n\nvoid BitVectorPool::giveBackMemory() { this->index--; }\n\nint BitVectorPool::getNumBlocks() { return this->blocks.size(); }\n\n} /* namespace Functional */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/functional/BitVectorPool.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * FunctionPool.h\n *\n *  Created on: 27/03/2017\n *      Author: viniciuspossani\n */\n\n#ifndef SRC_MAIN_FUNCTIONPOOL_H_\n#define SRC_MAIN_FUNCTIONPOOL_H_\n\n#include <vector>\n\nnamespace Functional {\n\ntypedef unsigned long word;\n\nclass BitVectorPool {\n\n  int blockSize;\n  int nWords;\n  int index;\n  word** currentBlock;\n  std::vector<word**> blocks;\n\n  void alloc();\n\npublic:\n  BitVectorPool(int nElements, int nWords);\n\n  virtual ~BitVectorPool();\n\n  word* getMemory();\n\n  word* getCleanMemory();\n\n  void giveBackMemory();\n\n  int getNumBlocks();\n};\n\n} /* namespace Functional */\n\n#endif /* SRC_MAIN_FUNCTIONPOOL_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/functional/FunctionHandler.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * This class represents the Function basic data structure. 
A Function is a\n * vector of unsigned long integers that represents the truth table of a\n * boolean function.\n *\n * @author Marcos Henrique Backes - mhbackes@inf.ufrgs.br.\n *\n * @see InputNode, ChoiceNode.\n *\n * Modified by Vinicius Possani\n * Last modification in July 28, 2017.\n */\n\n#ifndef FUNCTIONAL_H_\n#define FUNCTIONAL_H_\n\n#include \"../xxHash/xxhash.h\"\n#include <cmath>\n#include <cstring>\n#include <string>\n#include <sstream>\n#include <stdexcept>\n#include <iomanip>\n#include <iostream>\n#include <bitset>\n#include <cassert>\n#include <algorithm>\n#include <vector>\n#include <unordered_map>\n#include <unordered_set>\n\n#include \"BitVectorPool.h\"\n\nnamespace Functional {\n\ntypedef unsigned long word;\n\nenum Order { SMALLER, LARGER, NOTCOMPARABLE, EQUAL };\n\ninline void createLiterals(\n    std::vector<std::string>& varSet,\n    std::unordered_map<std::string, std::pair<word*, unsigned int>>& literals,\n    BitVectorPool& functionPool);\ninline bool less(word* lhs, word* rhs, int nWords);\ninline bool equals(word* lhs, word* rhs, int nWords);\ninline bool diff(word* lhs, word* rhs, int nWords);\ninline void copy(word* result, word* original, int nWords);\ninline void NOT(word* result, word* original, int nWords);\ninline void AND(word* result, word* lhs, word* rhs, int nWords);\ninline void OR(word* result, word* lhs, word* rhs, int nWords);\ninline void XOR(word* result, word* lhs, word* rhs, int nWords);\ninline void MUX(word* result, word* zero, word* one, word* sel, int nWords);\ninline void cofactor0(word* result, word* original, int nWords, int iVar);\ninline void cofactor1(word* result, word* original, int nWords, int iVar);\ninline int getSupport(word* function, int nVars);\ninline int getPolarizedSupport(word* function, int nVars);\ninline bool hasVar(word* functin, int nVars, int iVar);\ninline bool hasVarTruth6(word* function, int iVar);\ninline bool posVar6(word t, int iVar);\ninline bool negVar6(word t, int iVar);\ninline 
bool posVar(word* function, int nVars, int iVar);\ninline bool negVar(word* function, int nVars, int iVar);\ninline bool isUnate(word* function, int nVars);\ninline bool isPosUnate(word* function, int nVars);\ninline bool isConstZero(word* function, int nVars);\ninline bool isConstOne(word* function, int nVars);\ninline Order order(word* sub, word* target, int nWords);\ninline int getHammingDist(word* f1, word* f2, int nWords);\ninline int oneCounter(unsigned long int word);\ninline int wordNum(int nVars);\ninline bool isOdd(word* function);\ninline std::string toBin(word* function, int nWords);\ninline std::string toHex(word* function, int nWords);\ninline std::string supportToBin(unsigned int support);\n\ninline constexpr word truths6[6] = {0xAAAAAAAAAAAAAAAA, 0xCCCCCCCCCCCCCCCC,\n                                    0xF0F0F0F0F0F0F0F0, 0xFF00FF00FF00FF00,\n                                    0xFFFF0000FFFF0000, 0xFFFFFFFF00000000};\n\ninline constexpr word truths6Neg[6] = {0x5555555555555555, 0x3333333333333333,\n                                       0x0F0F0F0F0F0F0F0F, 0x00FF00FF00FF00FF,\n                                       0x0000FFFF0000FFFF, 0x00000000FFFFFFFF};\n\nclass FunctionHasher {\n  int nWords;\n  size_t TRUTH_WORDS_BYTE_COUNT;\n\npublic:\n  FunctionHasher(int nWords) : nWords(nWords) {\n    TRUTH_WORDS_BYTE_COUNT = sizeof(Functional::word) * nWords;\n  }\n\n  size_t operator()(const Functional::word* function) const {\n\n    if (nWords == 1) {\n      return function[0];\n    } else {\n      return XXH64(function, TRUTH_WORDS_BYTE_COUNT, 0);\n    }\n  }\n};\n\nclass FunctionComparator {\n  int nWords;\n\npublic:\n  FunctionComparator(int nWords) : nWords(nWords) {}\n\n  bool operator()(Functional::word* f1, Functional::word* f2) const {\n    return Functional::equals(f1, f2, nWords);\n  }\n};\n\ntypedef struct functionData {\n  unsigned int support;\n  unsigned int occurrences;\n} FunctionData;\n\nusing FunctionSet =\n    
std::unordered_set<Functional::word*, FunctionHasher, FunctionComparator>;\nusing FunctionDataMap =\n    std::unordered_map<word*, FunctionData, FunctionHasher, FunctionComparator>;\n\ninline void computeAllCubeCofactors(BitVectorPool& functionPool,\n                                    FunctionSet& cubeCofactors, word* function,\n                                    int nVars);\ninline void computeAllCubeCofactorsRec(BitVectorPool& functionPool,\n                                       FunctionSet& cubeCofactors,\n                                       word* function, int nVars, int nWords,\n                                       int iVar, bool isPrevVarCofactored);\ninline void\ncomputeAllCubeCofactorsWithSupport(BitVectorPool& functionPool,\n                                   FunctionDataMap& cubeCofactorData,\n                                   word* function, int nVars);\ninline void computeAllCubeCofactorsWithSupportRec(\n    BitVectorPool& functionPool, FunctionDataMap& cubeCofactorData,\n    word* function, int nVars, int nWords, int iVar);\ninline void registerFunction(FunctionDataMap& cubeCofactorData, word* function,\n                             int nVars, unsigned int occurencesInc);\n\ninline void createLiterals(\n    std::vector<std::string>& varSet,\n    std::unordered_map<std::string, std::pair<word*, unsigned int>>& literals,\n    BitVectorPool& functionPool) {\n\n  int nVars  = varSet.size();\n  int nWords = wordNum(nVars);\n\n  for (int iVar = 0; iVar < nVars; iVar++) {\n\n    word* currentFunction = functionPool.getMemory();\n\n    if (iVar < 6) {\n      for (int k = 0; k < nWords; k++) {\n        currentFunction[k] = truths6[iVar];\n      }\n    } else {\n      for (int k = 0; k < nWords; k++) {\n        currentFunction[k] = (k & (1 << (iVar - 6))) ? 
~(word)0 : 0;\n      }\n    }\n\n    unsigned int support = getPolarizedSupport(currentFunction, nVars);\n    literals.insert(\n        std::make_pair(varSet[iVar], std::make_pair(currentFunction, support)));\n\n    // Create negative literals\n    std::string negLit = \"!\" + varSet[iVar];\n    word* negFunction  = functionPool.getMemory();\n    NOT(negFunction, currentFunction, nWords);\n    unsigned int negSupport = (support >> 1);\n    literals.insert(\n        std::make_pair(negLit, std::make_pair(negFunction, negSupport)));\n  }\n\n  // Create constant zero and constant one\n  word* constZero = functionPool.getMemory();\n  word* constOne  = functionPool.getMemory();\n  for (int k = 0; k < nWords; k++) {\n    constZero[k] = 0UL;\n    constOne[k]  = ~0UL;\n  }\n  literals.insert(std::make_pair(\"0\", std::make_pair(constZero, 0)));\n  literals.insert(std::make_pair(\"1\", std::make_pair(constOne, 0)));\n\n  //\tstd::cout << std::endl << \"############################## Literals\n  //##############################\" << std::endl; \tfor ( auto lit : literals )\n  //{ \t\tstd::cout << lit.first << \" = \" << toHex( lit.second.first, nWords )\n  //<< \" | \" << supportToBin( lit.second.second ) << std::endl;\n  //\t}\n  //\tstd::cout << std::endl;\n}\n\ninline bool less(word* lhs, word* rhs, int nWords) {\n\n  if ((lhs == nullptr) || (rhs == nullptr)) {\n    return false;\n  }\n\n  for (int i = nWords - 1; i >= 0; --i) {\n    if (lhs[i] < rhs[i]) {\n      return true;\n    }\n    if (lhs[i] > rhs[i]) {\n      return false;\n    }\n  }\n\n  return false;\n}\n\ninline bool equals(word* lhs, word* rhs, int nWords) {\n\n  if ((lhs == nullptr) || (rhs == nullptr)) {\n    return false;\n  }\n\n  for (int i = 0; i < nWords; i++) {\n    if (lhs[i] != rhs[i]) {\n      return false;\n    }\n  }\n\n  return true;\n}\n\ninline bool diff(word* lhs, word* rhs, int nWords) {\n\n  if ((lhs == nullptr) || (rhs == nullptr)) {\n    return false;\n  }\n\n  for (int i = 0; i < nWords; 
i++) {\n    if (lhs[i] != rhs[i]) {\n      return true;\n    }\n  }\n\n  return false;\n}\n\ninline void copy(word* result, word* original, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = original[i];\n  }\n}\n\ninline void NOT(word* result, word* original, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = ~(original[i]);\n  }\n}\n\ninline void AND(word* result, word* lhs, word* rhs, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = lhs[i] & rhs[i];\n  }\n}\n\ninline void OR(word* result, word* lhs, word* rhs, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = lhs[i] | rhs[i];\n  }\n}\n\ninline void XOR(word* result, word* lhs, word* rhs, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = lhs[i] ^ rhs[i];\n  }\n}\n\ninline void MUX(word* result, word* zero, word* one, word* sel, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = (zero[i] & ~sel[i]) | (one[i] & sel[i]);\n  }\n}\n\ninline void cofactor0(word* result, word* original, int nWords, int iVar) {\n\n  if (nWords == 1) {\n    result[0] = ((original[0] & truths6Neg[iVar]) << (1 << iVar)) |\n                (original[0] & truths6Neg[iVar]);\n  } else {\n    if (iVar <= 5) {\n      int w, shift = (1 << iVar);\n      for (w = 0; w < nWords; w++) {\n        result[w] = ((original[w] & truths6Neg[iVar]) << shift) |\n                    (original[w] & truths6Neg[iVar]);\n      }\n    } else { // if ( iVar > 5 )\n      word* pOriginal = original;\n      word* pResult   = result;\n\n      word* pLimit = pOriginal + nWords;\n      int i, iStep = wordNum(iVar);\n      for (; pOriginal < pLimit; pOriginal += 2 * iStep, pResult += 2 * iStep) {\n        for (i = 0; i < iStep; i++) {\n          pResult[i]         = pOriginal[i];\n          pResult[i + iStep] = pOriginal[i];\n        }\n      }\n    }\n  }\n}\n\ninline void cofactor1(word* result, word* original, int nWords, int iVar) {\n\n  if 
(nWords == 1) {\n    result[0] = (original[0] & truths6[iVar]) |\n                ((original[0] & truths6[iVar]) >> (1 << iVar));\n  } else {\n    if (iVar <= 5) {\n      int w, shift = (1 << iVar);\n      for (w = 0; w < nWords; w++) {\n        result[w] = (original[w] & truths6[iVar]) |\n                    ((original[w] & truths6[iVar]) >> shift);\n      }\n    } else { // if ( iVar > 5 )\n      word* pOriginal = original;\n      word* pResult   = result;\n\n      word* pLimit = pOriginal + nWords;\n      int i, iStep = wordNum(iVar);\n      for (; pOriginal < pLimit; pOriginal += 2 * iStep, pResult += 2 * iStep) {\n        for (i = 0; i < iStep; i++) {\n          pResult[i]         = pOriginal[i + iStep];\n          pResult[i + iStep] = pOriginal[i + iStep];\n        }\n      }\n    }\n  }\n}\n\ninline void computeAllCubeCofactors(BitVectorPool& functionPool,\n                                    FunctionSet& cubeCofactors, word* function,\n                                    int nVars) {\n  bool isPrevVarCofactored = true;\n  int nWords               = wordNum(nVars);\n  int iVar                 = nVars - 1;\n  computeAllCubeCofactorsRec(functionPool, cubeCofactors, function, nVars,\n                             nWords, iVar, isPrevVarCofactored);\n}\n\ninline void computeAllCubeCofactorsRec(BitVectorPool& functionPool,\n                                       FunctionSet& cubeCofactors,\n                                       word* function, int nVars, int nWords,\n                                       int iVar, bool isPrevVarCofactored) {\n\n  // ignoring constants\n  if (isConstZero(function, nVars) || isConstOne(function, nVars)) {\n    return;\n  }\n\n  if (isPrevVarCofactored) {\n    // inserting the current function to the unique set\n    auto status = cubeCofactors.insert(function);\n    // When the function was already visited\n    if (status.second == false) {\n      functionPool.giveBackMemory();\n      return;\n    }\n  }\n\n  // When the terminal 
case is found\n  if (iVar < 0) {\n    return;\n  }\n\n  // Calling recursing with iVar as dont care\n  computeAllCubeCofactorsRec(functionPool, cubeCofactors, function, nVars,\n                             nWords, iVar - 1, false);\n\n  // When iVar is dont care\n  if (hasVar(function, nVars, iVar) == false) {\n    return;\n  }\n\n  // Calling recursing with iVar = 0\n  word* negCof = functionPool.getMemory();\n  cofactor0(negCof, function, nWords, iVar);\n  computeAllCubeCofactorsRec(functionPool, cubeCofactors, negCof, nVars, nWords,\n                             iVar - 1, true);\n\n  // Calling recursing with iVar = 1\n  word* posCof = functionPool.getMemory();\n  cofactor1(posCof, function, nWords, iVar);\n  computeAllCubeCofactorsRec(functionPool, cubeCofactors, posCof, nVars, nWords,\n                             iVar - 1, true);\n}\n\ninline void\ncomputeAllCubeCofactorsWithSupport(BitVectorPool& functionPool,\n                                   FunctionDataMap& cubeCofactorData,\n                                   word* function, int nVars) {\n  int nWords = wordNum(nVars);\n  int iVar   = nVars - 1;\n  computeAllCubeCofactorsWithSupportRec(functionPool, cubeCofactorData,\n                                        function, nVars, nWords, iVar);\n}\n\ninline void computeAllCubeCofactorsWithSupportRec(\n    BitVectorPool& functionPool, FunctionDataMap& cubeCofactorData,\n    word* function, int nVars, int nWords, int iVar) {\n\n  // When the constants are found\n  if (isConstZero(function, nVars) || isConstOne(function, nVars)) {\n    unsigned int occurrencesInc = (unsigned int)pow(3, iVar + 1);\n    registerFunction(cubeCofactorData, function, nVars, occurrencesInc);\n    return;\n  }\n\n  // When the terminal case is found\n  if (iVar < 0) {\n    unsigned int occurrencesInc = 1;\n    registerFunction(cubeCofactorData, function, nVars, occurrencesInc);\n    return;\n  }\n\n  // Calling recursing with iVar as dont care\n  
computeAllCubeCofactorsWithSupportRec(functionPool, cubeCofactorData,\n                                        function, nVars, nWords, iVar - 1);\n\n  // Calling recursing with iVar = 0\n  word* negCof = functionPool.getMemory();\n  cofactor0(negCof, function, nWords, iVar);\n  computeAllCubeCofactorsWithSupportRec(functionPool, cubeCofactorData, negCof,\n                                        nVars, nWords, iVar - 1);\n\n  // Calling recursing with iVar = 1\n  word* posCof = functionPool.getMemory();\n  cofactor1(posCof, function, nWords, iVar);\n  computeAllCubeCofactorsWithSupportRec(functionPool, cubeCofactorData, posCof,\n                                        nVars, nWords, iVar - 1);\n}\n\ninline void registerFunction(FunctionDataMap& cubeCofactorData, word* function,\n                             int nVars, unsigned int occurencesInc) {\n\n  auto it = cubeCofactorData.find(function);\n  if (it == cubeCofactorData.end()) {\n    FunctionData functionData;\n    // functionData.support = getSupport( function, nVars );\n    functionData.support     = getPolarizedSupport(function, nVars);\n    functionData.occurrences = occurencesInc;\n    cubeCofactorData.insert(std::make_pair(function, functionData));\n  } else {\n    it->second.occurrences = it->second.occurrences + occurencesInc;\n  }\n}\n\ninline bool hasVar(word* function, int nVars, int iVar) {\n\n  word* t    = function;\n  int nWords = wordNum(nVars);\n\n  if (nWords == 1) {\n    return hasVarTruth6(function, iVar);\n  }\n  if (iVar < 6) {\n    int i, Shift = (1 << iVar);\n    for (i = 0; i < nWords; i++) {\n      if (((t[i] >> Shift) & truths6Neg[iVar]) != (t[i] & truths6Neg[iVar])) {\n        return true;\n      }\n    }\n    return false;\n  } else {\n    int i, Step = (1 << (iVar - 6));\n    word* tLimit = t + nWords;\n    for (; t < tLimit; t += 2 * Step) {\n      for (i = 0; i < Step; i++) {\n        if (t[i] != t[Step + i]) {\n          return true;\n        }\n      }\n    }\n    return 
false;\n  }\n}\n\ninline bool hasVarTruth6(word* function, int iVar) {\n  word t = function[0];\n  return ((t >> (1 << iVar)) & truths6Neg[iVar]) != (t & truths6Neg[iVar]);\n}\n\ninline bool posVar6(word t, int iVar) {\n  return ((t >> (1 << iVar)) & t & truths6Neg[iVar]) == (t & truths6Neg[iVar]);\n}\n\ninline bool negVar6(word t, int iVar) {\n  return ((t << (1 << iVar)) & t & truths6[iVar]) == (t & truths6[iVar]);\n}\n\ninline bool posVar(word* function, int nVars, int iVar) {\n\n  assert(iVar < nVars);\n\n  word* t = function;\n\n  if (nVars <= 6) {\n    return posVar6(t[0], iVar);\n  }\n  if (iVar < 6) {\n    int i, shift = (1 << iVar);\n    int nWords = wordNum(nVars);\n    for (i = 0; i < nWords; i++) {\n      if (((t[i] >> shift) & t[i] & truths6Neg[iVar]) !=\n          (t[i] & truths6Neg[iVar])) {\n        return false;\n      }\n    }\n    return true;\n  } else {\n    int i, step = (1 << (iVar - 6));\n    word* tLimit = t + wordNum(nVars);\n    for (; t < tLimit; t += 2 * step) {\n      for (i = 0; i < step; i++) {\n        if (t[i] != (t[i] & t[step + i])) {\n          return false;\n        }\n      }\n    }\n    return true;\n  }\n}\n\ninline bool negVar(word* function, int nVars, int iVar) {\n\n  assert(iVar < nVars);\n\n  word* t = function;\n\n  if (nVars <= 6) {\n    return negVar6(t[0], iVar);\n  }\n  if (iVar < 6) {\n    int i, shift = (1 << iVar);\n    int nWords = wordNum(nVars);\n    for (i = 0; i < nWords; i++) {\n      if (((t[i] << shift) & t[i] & truths6[iVar]) != (t[i] & truths6[iVar])) {\n        return false;\n      }\n    }\n    return true;\n  } else {\n    int i, step = (1 << (iVar - 6));\n    word* tLimit = t + wordNum(nVars);\n    for (; t < tLimit; t += 2 * step) {\n      for (i = 0; i < step; i++) {\n        if ((t[i] & t[step + i]) != t[step + i]) {\n          return false;\n        }\n      }\n    }\n    return true;\n  }\n}\n\ninline bool isUnate(word* function, int nVars) {\n\n  for (int i = 0; i < nVars; i++) {\n    if 
(!negVar(function, nVars, i) && !posVar(function, nVars, i)) {\n      return false;\n    }\n  }\n  return true;\n}\n\ninline bool isPosUnate(word* function, int nVars) {\n\n  for (int i = 0; i < nVars; i++) {\n    if (!posVar(function, nVars, i)) {\n      return false;\n    }\n  }\n  return true;\n}\n\ninline int getSupport(word* function, int nVars) {\n\n  int v, Supp = 0;\n  for (v = 0; v < nVars; v++) {\n    if (hasVar(function, nVars, v)) {\n      Supp |= (1 << v);\n    }\n  }\n  return Supp;\n}\n\ninline int getPolarizedSupport(word* function, int nVars) {\n\n  int v, Supp = 0;\n  for (v = 0; v < nVars; v++) {\n    if (!posVar(function, nVars, v)) {\n      Supp |= (1 << (v * 2));\n    }\n    if (!negVar(function, nVars, v)) {\n      Supp |= (1 << ((v * 2) + 1));\n    }\n  }\n  return Supp;\n}\n\ninline bool isConstZero(word* function, int nVars) {\n\n  word* pFunction = function;\n  word* pLimit    = pFunction + wordNum(nVars);\n\n  while (pFunction != pLimit) {\n    if (*pFunction++ != 0ULL) {\n      return false;\n    }\n  }\n\n  return true;\n}\n\ninline bool isConstOne(word* function, int nVars) {\n\n  word* pFunction = function;\n  word* pLimit    = pFunction + wordNum(nVars);\n  const word ONE  = ~0ULL;\n\n  while (pFunction != pLimit) {\n    if (*pFunction++ != ONE) {\n      return false;\n    }\n  }\n\n  return true;\n}\n\ninline Order order(word* sub, word* target, int nWords) {\n\n  if (equals(sub, target, nWords)) {\n    return Order::EQUAL;\n  }\n\n  bool smaller = true;\n  bool larger  = true;\n  unsigned long int partialResult;\n\n  for (int i = 0; i < nWords; i++) {\n\n    partialResult = sub[i] & target[i];\n\n    if (partialResult != sub[i]) {\n      smaller = false;\n    }\n    if (partialResult != target[i]) {\n      larger = false;\n    }\n\n    if (!smaller && !larger) {\n      return Order::NOTCOMPARABLE;\n    }\n  }\n\n  if (smaller)\n    return Order::SMALLER;\n\n  if (larger)\n    return Order::LARGER;\n\n  assert(false); // Should 
never happen\n  return Order::NOTCOMPARABLE;\n}\n\ninline int getHammingDist(word* f1, word* f2, int nWords) {\n\n  unsigned long int currentWord;\n  int count = 0;\n\n  for (int i = 0; i < nWords; i++) {\n    currentWord = f1[i] ^ f2[i];\n    count += oneCounter(currentWord);\n  }\n\n  return count;\n}\n\n// This is better when most bits in word are 0. It uses 3 arithmetic operations\n// and one comparison/branch per \"1\" bit in word.\ninline int oneCounter(unsigned long int word) {\n\n  int count;\n  for (count = 0; word; count++) {\n    word &= word - 1;\n  }\n  return count;\n}\n\ninline int wordNum(int nVars) { return nVars <= 6 ? 1 : 1 << (nVars - 6); }\n\ninline bool isOdd(word* function) { return (function[0] & 1) != 0; }\n\ninline std::string toBin(word* function, int nWords) {\n\n  if (function != nullptr) {\n\n    std::stringstream result;\n\n    result << \"\";\n\n    for (int i = nWords - 1; i >= 0; i--) {\n      for (int j = 63; j >= 0; j--) {\n        if ((function[i] >> j) & 1) {\n          result << (\"1\");\n        } else {\n          result << (\"0\");\n        }\n      }\n    }\n\n    return result.str();\n  } else {\n    return \"nullptr\";\n  }\n}\n\ninline std::string toHex(word* function, int nWords) {\n\n  std::stringstream result;\n\n  result << \"0x\";\n\n  for (int i = nWords - 1; i >= 0; i--) {\n    result << std::setw(16) << std::setfill('0') << std::hex << function[i];\n  }\n\n  return result.str();\n}\n\ninline std::string supportToBin(unsigned int support) {\n  word ptr[1];\n  ptr[0] = support;\n  return Functional::toBin(ptr, 1);\n}\n\n} /* namespace Functional */\n\n#endif /* FUNCTIONAL_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/functional/FunctionHandler32.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * This class represents the Function basic data structure. 
A Function is a\n * vector of unsigned 32-bit integers that represents the truth table of a\n * boolean function.\n *\n * @author Marcos Henrique Backes - mhbackes@inf.ufrgs.br.\n *\n * @see InputNode, ChoiceNode.\n *\n * Modified by Vinicius Possani\n * Last modification in July 28, 2017.\n */\n\n#ifndef FUNCTIONAL32_H_\n#define FUNCTIONAL32_H_\n\n#include \"../xxHash/xxhash.h\"\n#include <cmath>\n#include <cassert>\n#include <cstring>\n#include <string>\n#include <sstream>\n#include <iostream>\n#include <iomanip>\n#include <stdexcept>\n\nnamespace Functional32 {\n\ntypedef unsigned int word;\n\ninline void copy(word* result, word* original, int nWords);\ninline void NOT(word* result, word* original, int nWords);\ninline void AND(word* result, word* lhs, word* rhs, int nWords);\ninline void NAND(word* result, word* lhs, word* rhs, int nWords);\ninline void OR(word* result, word* lhs, word* rhs, int nWords);\ninline void XOR(word* result, word* lhs, word* rhs, int nWords);\ninline bool isConstZero(word* function, int nVars);\ninline bool isConstOne(word* function, int nVars);\ninline int countOnes(unsigned uWord);\ninline int wordNum(int nVars);\ninline void truthStretch(word* result, word* input, int inVars, int nVars,\n                         unsigned phase);\ninline void swapAdjacentVars(word* result, word* input, int nVars, int iVar);\ninline std::string toCubeString(word* function, int nWords, int nVars);\ninline std::string toHex(word* function, int nWords);\ninline std::string toBin(word* function, int nWords);\n\ninline void copy(word* result, word* original, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = original[i];\n  }\n}\n\ninline void NOT(word* result, word* original, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = ~(original[i]);\n  }\n}\n\ninline void AND(word* result, word* lhs, word* rhs, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = lhs[i] & rhs[i];\n  }\n}\n\ninline void 
NAND(word* result, word* lhs, word* rhs, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = ~(lhs[i] & rhs[i]);\n  }\n}\n\ninline void OR(word* result, word* lhs, word* rhs, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = lhs[i] | rhs[i];\n  }\n}\n\ninline void XOR(word* result, word* lhs, word* rhs, int nWords) {\n\n  for (int i = 0; i < nWords; i++) {\n    result[i] = lhs[i] ^ rhs[i];\n  }\n}\n\ninline bool isConstZero(word* function, int nVars) {\n\n  word* pFunction = function;\n  word* pLimit    = pFunction + wordNum(nVars);\n\n  while (pFunction != pLimit) {\n    if (*pFunction++ != 0U) {\n      return false;\n    }\n  }\n\n  return true;\n}\n\ninline bool isConstOne(word* function, int nVars) {\n\n  word* pFunction = function;\n  word* pLimit    = pFunction + wordNum(nVars);\n  const word ONE  = ~0U;\n\n  while (pFunction != pLimit) {\n    if (*pFunction++ != ONE) {\n      return false;\n    }\n  }\n\n  return true;\n}\n\ninline int countOnes(unsigned uWord) {\n\n  uWord = (uWord & 0x55555555) + ((uWord >> 1) & 0x55555555);\n  uWord = (uWord & 0x33333333) + ((uWord >> 2) & 0x33333333);\n  uWord = (uWord & 0x0F0F0F0F) + ((uWord >> 4) & 0x0F0F0F0F);\n  uWord = (uWord & 0x00FF00FF) + ((uWord >> 8) & 0x00FF00FF);\n  return (uWord & 0x0000FFFF) + (uWord >> 16);\n}\n\ninline int wordNum(int nVars) { return nVars <= 5 ? 
1 : 1 << (nVars - 5); }\n\ninline void truthStretch(word* result, word* input, int inVars, int nVars,\n                         unsigned phase) {\n\n  unsigned* pTemp;\n  int var = inVars - 1, counter = 0;\n\n  for (int i = nVars - 1; i >= 0; i--) {\n\n    if (phase & (1 << i)) {\n\n      for (int j = var; j < i; j++) {\n\n        swapAdjacentVars(result, input, nVars, j);\n        pTemp  = input;\n        input  = result;\n        result = pTemp;\n        counter++;\n      }\n      var--;\n    }\n  }\n\n  assert(var == -1);\n\n  // copy into result if the buffers were swapped an even number of times\n  int nWords = wordNum(nVars);\n  if (!(counter & 1)) {\n    copy(result, input, nWords);\n  }\n}\n\ninline void swapAdjacentVars(word* result, word* input, int nVars, int iVar) {\n\n  static unsigned PMasks[4][3] = {{0x99999999, 0x22222222, 0x44444444},\n                                  {0xC3C3C3C3, 0x0C0C0C0C, 0x30303030},\n                                  {0xF00FF00F, 0x00F000F0, 0x0F000F00},\n                                  {0xFF0000FF, 0x0000FF00, 0x00FF0000}};\n\n  int nWords = wordNum(nVars);\n  int i, k, step, shift;\n\n  assert(iVar < nVars - 1);\n\n  if (iVar < 4) {\n    shift = (1 << iVar);\n    for (i = 0; i < nWords; i++) {\n      result[i] = (input[i] & PMasks[iVar][0]) |\n                  ((input[i] & PMasks[iVar][1]) << shift) |\n                  ((input[i] & PMasks[iVar][2]) >> shift);\n    }\n  } else {\n    if (iVar > 4) {\n      step = (1 << (iVar - 5));\n      for (k = 0; k < nWords; k += 4 * step) {\n\n        for (i = 0; i < step; i++) {\n          result[i] = input[i];\n        }\n\n        for (i = 0; i < step; i++) {\n          result[step + i] = input[2 * step + i];\n        }\n\n        for (i = 0; i < step; i++) {\n          result[2 * step + i] = input[step + i];\n        }\n\n        for (i = 0; i < step; i++) {\n          result[3 * step + i] = input[3 * step + i];\n        }\n\n        input += 4 * step;\n        result += 4 * step;\n      }\n    } else 
{ // if ( iVar == 4 )\n      for (i = 0; i < nWords; i += 2) {\n        result[i] =\n            (input[i] & 0x0000FFFF) | ((input[i + 1] & 0x0000FFFF) << 16);\n        result[i + 1] =\n            (input[i + 1] & 0xFFFF0000) | ((input[i] & 0xFFFF0000) >> 16);\n      }\n    }\n  }\n}\n\ninline std::string toBin(word* function, int nWords) {\n\n  if (function != nullptr) {\n\n    std::stringstream result;\n\n    result << \"\";\n\n    for (int i = nWords - 1; i >= 0; i--) {\n      for (int j = 31; j >= 0; j--) {\n        if ((function[i] >> j) & 1) {\n          result << (\"1\");\n        } else {\n          result << (\"0\");\n        }\n      }\n    }\n\n    return result.str();\n  } else {\n    return \"nullptr\";\n  }\n}\ninline std::string toHex(word* function, int nWords) {\n\n  std::stringstream result;\n\n  result << \"0x\";\n\n  for (int i = nWords - 1; i >= 0; i--) {\n    result << std::setw(16) << std::setfill('0') << std::hex << function[i];\n  }\n\n  return result.str();\n}\n\ninline std::string toCubeString(word* function, int nWords, int nVars) {\n\n  std::stringstream cubes;\n  word mask, cube;\n  int nRows;\n\n  if (nWords == 1) {\n    nRows = 2 << (nVars - 1);\n    mask  = 1;\n    for (int j = 0; j < nRows; j++) {\n      if (function[0] & mask) {\n        cube = j;\n        for (int k = 0; k < nVars; k++) {\n          if ((cube >> k) & 1) {\n            cubes << (\"1\");\n          } else {\n            cubes << (\"0\");\n          }\n        }\n        cubes << \" 1\" << std::endl;\n      }\n      mask = mask << 1;\n    }\n  } else {\n    for (int i = 0; i < nWords; i++) {\n      mask = 1;\n      for (int j = 0; j < 32; j++) {\n        if (function[i] & mask) {\n          cube = (i * 32) + j;\n          for (int k = 0; k < nVars; k++) {\n            if ((cube >> k) & 1) {\n              cubes << (\"1\");\n            } else {\n              cubes << (\"0\");\n            }\n          }\n          cubes << \" 1\" << std::endl;\n        }\n        
mask = mask << 1;\n      }\n    }\n  }\n  return cubes.str();\n}\n\n} /* namespace Functional32 */\n\n#endif /* FUNCTIONAL32_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/functional/FunctionUtil.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * FunctionUtil.cpp\n *\n *  Created on: 14/02/2017\n *      Author: possani\n */\n\n#include \"../functional/FunctionUtil.h\"\n\n#include \"../functional/FunctionHandler.h\"\n\nnamespace Functional {\n\nFunctionUtil::FunctionUtil(StringFunctionMap& entries,\n                           BitVectorPool& functionPool, int nVars, int nWords)\n    : literals(entries), functionPool(functionPool) {\n  this->nVars        = nVars;\n  this->nWords       = nWords;\n  this->currentToken = EMPTY;\n}\n\nFunctionUtil::~FunctionUtil() {}\n\nword* FunctionUtil::parseExpression(std::string expression) {\n\n  std::istringstream functionality(expression);\n  return expr1(functionality);\n}\n\nword* FunctionUtil::expr1(std::istringstream& expression) {\n\n  word* lhs = expr2(expression);\n\n  for (;;) {\n    switch (this->currentToken) 
{\n    case ORop: {\n      word* result = this->functionPool.getMemory();\n      word* rhs    = expr2(expression);\n      Functional::OR(result, lhs, rhs, this->nWords);\n      lhs = result;\n      break;\n    }\n\n    default:\n      return lhs;\n    }\n  }\n}\n\nword* FunctionUtil::expr2(std::istringstream& expression) {\n\n  word* lhs = term(expression);\n\n  for (;;) {\n    switch (this->currentToken) {\n    case XORop: {\n      word* result = this->functionPool.getMemory();\n      word* rhs    = term(expression);\n      Functional::XOR(result, lhs, rhs, this->nWords);\n      lhs = result;\n      break;\n    }\n\n    default:\n      return lhs;\n    }\n  }\n}\n\nword* FunctionUtil::term(std::istringstream& expression) {\n\n  word* lhs = prim(expression);\n\n  for (;;) {\n    switch (this->currentToken) {\n    case ANDop: {\n      word* result = this->functionPool.getMemory();\n      word* rhs    = prim(expression);\n      Functional::AND(result, lhs, rhs, this->nWords);\n      lhs = result;\n      break;\n    }\n\n    default:\n      return lhs;\n    }\n  }\n}\n\nword* FunctionUtil::prim(std::istringstream& expression) {\n\n  getToken(expression);\n\n  switch (this->currentToken) {\n\n  case LIT: {\n    getToken(expression);\n    StringFunctionMap::iterator it = literals.find(this->tokenValue);\n    if (it != literals.end()) {\n      word* var     = it->second.first;\n      word* literal = this->functionPool.getMemory();\n      Functional::copy(literal, var, this->nWords);\n      return literal;\n    } else {\n      std::cout << \"ERROR: Literal ( \" << tokenValue << \" ) not found!\"\n                << std::endl;\n      exit(1);\n    }\n  }\n\n  case NOTop: {\n    word* function = prim(expression);\n    Functional::NOT(function, function, this->nWords);\n    return function;\n  }\n\n  case LP: {\n    word* function = expr1(expression);\n    if (currentToken != RP) {\n      std::cout << \"ERROR: current token = \" << currentToken << std::endl;\n      
exit(1);\n    }\n    getToken(expression); // eat )\n    return function;\n  }\n\n  default:\n    return nullptr;\n    break;\n  }\n}\n\nToken FunctionUtil::getToken(std::istringstream& expression) {\n\n  char ch = 0;\n  expression >> ch;\n\n  switch (ch) {\n  case 0: {\n    return this->currentToken = END;\n  }\n\n  case ';':\n  case '*':\n  case '+':\n  case '^':\n  case '!':\n  case '(':\n  case ')':\n  case '=':\n    return this->currentToken = Token(ch);\n\n  default: {\n    if (isalpha(ch)) {\n      this->tokenValue = \"\";\n      for (; isalnum(ch) && !expression.eof(); expression >> ch) {\n        tokenValue += ch; //; needed at the end of string\n      }\n      expression.putback(ch);\n      return this->currentToken = LIT;\n    }\n    return this->currentToken = END;\n  }\n  }\n}\n\nword* FunctionUtil::parseHexa(std::string hexa) {\n\n  if ((hexa.at(0) == '0') && (hexa.at(1) == 'x')) {\n    hexa = hexa.substr(2);\n  }\n\n  word* function = this->functionPool.getMemory();\n  unsigned long int value;\n  std::stringstream ss;\n\n  if (this->nVars < 6) {\n    ss << std::hex << hexa;\n    ss >> value;\n    function[0] = static_cast<unsigned long int>(value);\n    return function;\n  } else {\n    int lhs = hexa.size() - 16;\n    int i   = 0;\n    while (lhs >= 0) {\n      std::string currentHexa = hexa.substr(lhs, 16);\n      ss.clear();\n      ss.str(\"\");\n      ss << std::hex << currentHexa;\n      ss >> value;\n      function[i++] = static_cast<unsigned long int>(value);\n      lhs -= 16;\n    }\n    return function;\n  }\n}\n\n} // namespace Functional\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/functional/FunctionUtil.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * FunctionUtil.h\n *\n *  Created on: 14/02/2017\n *      Author: possani\n */\n\n#ifndef SRC_FUNCTION_FUNCTIONUTIL_H_\n#define SRC_FUNCTION_FUNCTIONUTIL_H_\n\n#include \"BitVectorPool.h\"\n\n#include <cstdlib>\n#include <string>\n#include <iterator>\n#include <algorithm>\n#include <iostream>\n#include <fstream>\n#include <cmath>\n#include <sstream>\n#include <vector>\n#include <unordered_map>\n\nnamespace Functional {\n\nenum Token {\n  ANDop = '*',\n  ORop  = '+',\n  XORop = '^',\n  LP    = '(',\n  RP    = ')',\n  NOTop = '!',\n  LIT,\n  END = ';',\n  EMPTY\n};\n\ntypedef unsigned long int word;\ntypedef std::unordered_map<std::string, std::pair<word*, unsigned int>>\n    StringFunctionMap;\n\nclass FunctionUtil {\n\n  Token currentToken;\n  std::string tokenValue;\n  StringFunctionMap& literals;\n  
BitVectorPool& functionPool;\n  int nVars;\n  int nWords;\n\npublic:\n  FunctionUtil(StringFunctionMap& entries, BitVectorPool& functionPool,\n               int nVars, int nWords);\n\n  virtual ~FunctionUtil();\n\n  word* parseExpression(std::string expression);\n  word* prim(std::istringstream& expression);\n  word* term(std::istringstream& expression);\n  word* expr2(std::istringstream& expression);\n  word* expr1(std::istringstream& expression);\n  Token getToken(std::istringstream& expression);\n  word* parseHexa(std::string hexa);\n};\n\n} // namespace Functional\n\n#endif /* SRC_FUNCTION_FUNCTIONUTIL_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/main.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"parsers/AigParser.h\"\n#include \"writers/AigWriter.h\"\n#include \"writers/BlifWriter.h\"\n#include \"subjectgraph/aig/Aig.h\"\n#include \"algorithms/CutManager.h\"\n#include \"algorithms/PriorityCutManager.h\"\n#include \"algorithms/NPNManager.h\"\n#include \"algorithms/RewriteManager.h\"\n#include \"algorithms/PreCompGraphManager.h\"\n#include \"algorithms/ChoiceManager.h\"\n#include \"algorithms/ReconvDrivenCut.h\"\n#include \"galois/Galois.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include <chrono>\n#include <iostream>\n#include <sstream>\n\nstatic const char* name = \"AIG Rewriting\";\n\nstatic const char* desc =\n    \"Optimization in logic synthesis through rewriting AND-Inverter Graphs\";\n\nstatic const char* url = \"aigRewriting\";\n\nnamespace cll = llvm::cl;\nstatic cll::opt<std::string>\n    
inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\n\nstatic cll::opt<bool>\n    outputVerbose(\"v\", cll::desc(\"verbose output (default: false)\"),\n                  cll::init(false));\n\n//! Flag that forces user to be aware that they should be passing in an\n//! AIG format.\nstatic cll::opt<bool> AIG(\n    \"AIG\",\n    cll::desc(\"Specify that the input graph is a AND-Inverter Graph format\"),\n    cll::init(false));\n\nusing namespace std::chrono;\n\nvoid aigRewriting(aig::Aig& aig, std::string& fileName, int nThreads,\n                  bool verbose);\nvoid kcut(aig::Aig& aig, std::string& fileName, int nThreads, bool verbose);\nvoid prioritycut(aig::Aig& aig, std::string& fileName, int nThreads,\n                 bool deterministic, bool verbose);\nvoid addChoices(aig::Aig& aig, std::string& fileName, int nThreads,\n                bool verbose);\nvoid rdCut(aig::Aig& aig, std::string& fileName, int nThreads, bool verbose);\nstd::string getFileName(std::string path);\n\nint main(int argc, char* argv[]) {\n  // shared-memory system object initializes global variables for galois\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  if (!AIG) {\n    GALOIS_DIE(\"This application requires an AND-Inverter Graph (AIG) format;\"\n               \" please use the -AIG flag \"\n               \" to indicate the input is a AIG format.\");\n  }\n\n  int nThreads         = numThreads;\n  std::string path     = inputFile;\n  std::string fileName = getFileName(path);\n\n  aig::Aig aig;\n  AigParser aigParser(path, aig);\n  aigParser.parseAig();\n  // aigParser.parseAag();\n\n  if (outputVerbose) {\n    std::cout << \"############## AIG REWRITING ##############\" << std::endl;\n    std::cout << \"Design Name: \" << fileName << std::endl;\n    std::cout << \"|Nodes|: \" << aig.getGraph().size() << std::endl;\n    std::cout << \"|I|: \" << aigParser.getI() << std::endl;\n    std::cout << \"|L|: \" << aigParser.getL() 
<< std::endl;\n    std::cout << \"|O|: \" << aigParser.getO() << std::endl;\n    std::cout << \"|A|: \" << aigParser.getA() << std::endl;\n    std::cout << \"|E|: \" << aigParser.getE() << \" (outgoing edges)\"\n              << std::endl;\n  }\n\n  aigRewriting(aig, fileName, nThreads, outputVerbose);\n\n  return 0;\n}\n\nvoid aigRewriting(aig::Aig& aig, std::string& fileName, int nThreads,\n                  bool verbose) {\n\n  int numThreads = galois::setActiveThreads(nThreads);\n\n  int K = 4, C = 500;\n  int triesNGraphs = 500;\n  bool compTruth   = true;\n  bool useZeros    = false;\n  bool updateLevel = false;\n\n  if (verbose) {\n    std::cout << \"############# Configurations ############## \" << std::endl;\n    std::cout << \"K: \" << K << std::endl;\n    std::cout << \"C: \" << C << std::endl;\n    std::cout << \"TriesNGraphs: \" << triesNGraphs << std::endl;\n    std::cout << \"CompTruth: \" << (compTruth ? \"yes\" : \"no\") << std::endl;\n    std::cout << \"UseZeroCost: \" << (useZeros ? \"yes\" : \"no\") << std::endl;\n    std::cout << \"UpdateLevel: \" << (updateLevel ? 
\"yes\" : \"no\") << std::endl;\n    std::cout << \"nThreads: \" << numThreads << std::endl;\n  }\n\n  high_resolution_clock::time_point t1 = high_resolution_clock::now();\n\n  // CutMan\n  algorithm::CutManager cutMan(aig, K, C, numThreads, compTruth);\n\n  // NPNMan\n  algorithm::NPNManager npnMan;\n\n  // StrMan\n  algorithm::PreCompGraphManager pcgMan(npnMan);\n  pcgMan.loadPreCompGraphFromArray();\n  pcgMan.processDecompositionGraphs();\n\n  // RWMan\n  algorithm::RewriteManager rwtMan(aig, cutMan, npnMan, pcgMan, triesNGraphs,\n                                   useZeros, updateLevel);\n\n  algorithm::runRewriteOperator(rwtMan);\n\n  high_resolution_clock::time_point t2 = high_resolution_clock::now();\n  long double rewriteTime = duration_cast<microseconds>(t2 - t1).count();\n\n  if (verbose) {\n    std::cout << \"################ Results ################## \" << std::endl;\n    std::cout << \"Size: \" << aig.getNumAnds() << std::endl;\n    std::cout << \"Depth: \" << aig.getDepth() << std::endl;\n    std::cout << \"Runtime (us): \" << rewriteTime << std::endl;\n  } else {\n    std::cout << fileName << \";\" << C << \";\" << triesNGraphs << \";\" << useZeros\n              << \";\" << aig.getNumAnds() << \";\" << aig.getDepth() << \";\"\n              << numThreads << \";\" << rewriteTime << std::endl;\n  }\n\n  // WRITE AIG //\n  AigWriter aigWriter(fileName + \"_rewritten.aig\");\n  aigWriter.writeAig(aig);\n\n  // WRITE DOT //\n  // aig.writeDot( fileName + \"_rewritten.dot\", aig.toDot() );\n}\n\nvoid prioritycut(aig::Aig& aig, std::string& fileName, int nThreads,\n                 bool deterministic, bool verbose) {\n\n  int numThreads = galois::setActiveThreads(nThreads);\n\n  int K = 6, C = 8;\n  bool compTruth = true;\n\n  if (verbose) {\n    std::cout << \"############# Configurations ############## \" << std::endl;\n    std::cout << \"K: \" << K << std::endl;\n    std::cout << \"C: \" << C << std::endl;\n    std::cout << \"CompTruth: \" << 
(compTruth ? \"yes\" : \"no\") << std::endl;\n    std::cout << \"Deterministic: \" << (deterministic ? \"yes\" : \"no\")\n              << std::endl;\n    std::cout << \"nThreads: \" << numThreads << std::endl;\n  }\n\n  long double kcutTime                 = 0;\n  high_resolution_clock::time_point t1 = high_resolution_clock::now();\n\n  algorithm::PriCutManager cutMan(aig, K, C, numThreads, compTruth,\n                                  deterministic, verbose);\n  algorithm::runKPriCutOperator(cutMan);\n\n  high_resolution_clock::time_point t2 = high_resolution_clock::now();\n  kcutTime = duration_cast<microseconds>(t2 - t1).count();\n\n  BlifWriter blifWriter(fileName + \"_mapped.blif\");\n  blifWriter.writeNetlist(aig, cutMan);\n\n  // WRITE DOT //\n  // aig.writeDot( fileName + \".dot\", aig.toDot() );\n\n  if (verbose) {\n    std::cout << \"################ Results ################## \" << std::endl;\n    // cutMan.printCovering();\n    // cutMan.printAllCuts();\n    // cutMan.printBestCuts();\n    // cutMan.printRuntimes();\n    cutMan.printCutStatistics();\n    std::cout << \"AIG Size: \" << aig.getNumAnds() << std::endl;\n    std::cout << \"AIG Depth: \" << aig.getDepth() << std::endl;\n    std::cout << \"LUT Size: \" << cutMan.getNumLUTs() << std::endl;\n    std::cout << \"LUT Depth: \" << cutMan.getNumLevels() << std::endl;\n    std::cout << \"Runtime (us): \" << kcutTime << std::endl;\n  } else {\n    std::cout << fileName << \";\" << K << \";\" << C << \";\" << compTruth << \";\"\n              << aig.getNumAnds() << \";\" << aig.getDepth() << \";\"\n              << cutMan.getNumLUTs() << \";\" << cutMan.getNumLevels() << \";\"\n              << numThreads << \";\" << kcutTime << std::endl;\n  }\n}\n\nvoid addChoices(aig::Aig& aig, std::string& GALOIS_UNUSED(fileName),\n                int GALOIS_UNUSED(nThreads), bool verbose) {\n\n  // FIXME\n  // In the current implementation the creation of choices must be serial.\n  // It is due to the management 
of AIG node IDs when creating new nodes.\n  int numThreads = galois::setActiveThreads(1);\n  // int numThreads = galois::setActiveThreads(nThreads);\n\n  int K = 4, C = 500;\n  int nGraphs      = 4;\n  int nChoices     = 4;\n  bool compTruth   = true;\n  bool useZeros    = false;\n  bool updateLevel = false;\n\n  if (verbose) {\n    std::cout << \"############# Configurations ############## \" << std::endl;\n    std::cout << \"K: \" << K << std::endl;\n    std::cout << \"C: \" << C << std::endl;\n    std::cout << \"nGraphs: \" << nGraphs << std::endl;\n    std::cout << \"nChoices: \" << nChoices << std::endl;\n    std::cout << \"CompTruth: \" << (compTruth ? \"yes\" : \"no\") << std::endl;\n    std::cout << \"UseZeroCost: \" << (useZeros ? \"yes\" : \"no\") << std::endl;\n    std::cout << \"UpdateLevel: \" << (updateLevel ? \"yes\" : \"no\") << std::endl;\n    std::cout << \"nThreads: \" << numThreads << std::endl;\n  }\n\n  high_resolution_clock::time_point t1 = high_resolution_clock::now();\n\n  // CutMan\n  algorithm::CutManager cutMan(aig, K, C, numThreads, compTruth);\n\n  // NPNMan\n  algorithm::NPNManager npnMan;\n\n  // StrMan\n  algorithm::PreCompGraphManager pcgMan(npnMan);\n  pcgMan.loadPreCompGraphFromArray();\n  pcgMan.processDecompositionGraphs();\n\n  // ChMan\n  algorithm::ChoiceManager chMan(aig, cutMan, npnMan, pcgMan, nGraphs,\n                                 nChoices);\n\n  algorithm::runChoiceOperator(chMan);\n  aig.resetAllIds();\n\n  high_resolution_clock::time_point t2 = high_resolution_clock::now();\n  long double runtime = duration_cast<microseconds>(t2 - t1).count();\n\n  // WRITE DOT //\n  // aig.writeDot( fileName + \"_choices.dot\", aig.toDot() );\n\n  if (verbose) {\n    std::cout << \"################ Results ################## \" << std::endl;\n    std::cout << \"Size: \" << aig.getNumAnds() << std::endl;\n    std::cout << \"Depth: \" << aig.getDepth() << std::endl;\n    std::cout << \"Runtime (us): \" << runtime << std::endl;\n 
 } else {\n    /*\n     * std::cout << fileName << \";\" << C << \";\" << nGraphs << \";\" << useZeros\n     * << \";\" << aig.getNumAnds() << \";\" << aig.getDepth() << \";\"\n     * << numThreads << \";\" << runtime << std::endl;\n     */\n  }\n}\n\nvoid kcut(aig::Aig& aig, std::string& fileName, int nThreads, bool verbose) {\n\n  int numThreads = galois::setActiveThreads(nThreads);\n\n  int K = 4, C = 500;\n  bool compTruth = false;\n\n  if (verbose) {\n    std::cout << \"############# Configurations ############## \" << std::endl;\n    std::cout << \"K: \" << K << std::endl;\n    std::cout << \"C: \" << C << std::endl;\n    std::cout << \"CompTruth: \" << (compTruth ? \"yes\" : \"no\") << std::endl;\n    std::cout << \"nThreads: \" << numThreads << std::endl;\n  }\n\n  long double kcutTime                 = 0;\n  high_resolution_clock::time_point t1 = high_resolution_clock::now();\n\n  algorithm::CutManager cutMan(aig, K, C, numThreads, compTruth);\n  algorithm::runKCutOperator(cutMan);\n\n  high_resolution_clock::time_point t2 = high_resolution_clock::now();\n  kcutTime = duration_cast<microseconds>(t2 - t1).count();\n\n  if (verbose) {\n    std::cout << \"################ Results ################## \" << std::endl;\n    // cutMan.printAllCuts();\n    // cutMan.printRuntimes();\n    cutMan.printCutStatistics();\n    std::cout << \"Size: \" << aig.getNumAnds() << std::endl;\n    std::cout << \"Depth: \" << aig.getDepth() << std::endl;\n    std::cout << \"Runtime (us): \" << kcutTime << std::endl;\n  } else {\n    std::cout << fileName << \";\" << K << \";\" << C << \";\" << compTruth << \";\"\n              << aig.getNumAnds() << \";\" << aig.getDepth() << \";\" << numThreads\n              << \";\" << kcutTime << std::endl;\n  }\n}\n\nvoid rdCut(aig::Aig& aig, std::string& fileName, int nThreads, bool verbose) {\n\n  int numThreads = galois::setActiveThreads(nThreads);\n\n  size_t K = 4;\n\n  if (verbose) {\n    std::cout << \"############# Configurations 
############## \" << std::endl;\n    std::cout << \"K: \" << K << std::endl;\n    std::cout << \"nThreads: \" << numThreads << std::endl;\n  }\n\n  high_resolution_clock::time_point t1 = high_resolution_clock::now();\n\n  algorithm::ReconvDrivenCut rdcMan(aig);\n  rdcMan.run(K);\n\n  high_resolution_clock::time_point t2 = high_resolution_clock::now();\n  long double rdCutTime = duration_cast<microseconds>(t2 - t1).count();\n\n  if (verbose) {\n    std::cout << \"################ Results ################## \" << std::endl;\n    std::cout << \"Size: \" << aig.getNumAnds() << std::endl;\n    std::cout << \"Depth: \" << aig.getDepth() << std::endl;\n    std::cout << \"Runtime (us): \" << rdCutTime << std::endl;\n  } else {\n    std::cout << fileName << \";\" << K << \";\" << aig.getNumAnds() << \";\"\n              << aig.getDepth() << \";\" << numThreads << \";\" << rdCutTime\n              << std::endl;\n  }\n}\n\nstd::string getFileName(std::string path) {\n  std::size_t slash    = path.find_last_of(\"/\") + 1;\n  std::size_t dot      = path.find_last_of(\".\");\n  std::string fileName = path.substr(slash, (dot - slash));\n  return fileName;\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/misc/util/utilString.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * utilString.cpp\n *\n *  Created on: 22/07/2014\n *      Author: jodymaick\n */\n\n#include <cstdio>\n#include <stdarg.h>\n#include <regex>\n#include \"utilString.h\"\n\nvoid split(const std::string& str, const std::string& delim,\n           std::vector<std::string>& parts) {\n  size_t start, end = 0;\n  while (end < str.size()) {\n    start = end;\n    while (start < str.size() &&\n           (delim.find(str[start]) != std::string::npos)) {\n      start++; // skip initial whitespace\n    }\n    end = start;\n    while (end < str.size() && (delim.find(str[end]) == std::string::npos)) {\n      end++; // skip to end of word\n    }\n    if (end - start != 0) { // just ignore zero-length strings.\n      parts.push_back(std::string(str, start, end - start));\n    }\n  }\n}\n\n/*\nstd::vector<std::string> 
regex_split(const std::string & s, std::string rgx_str)\n{ std::vector<std::string> elems;\n\n    std::regex rgx (rgx_str);\n\n    std::sregex_token_iterator iter(s.begin(), s.end(), rgx, -1);\n    std::sregex_token_iterator end;\n\n    while (iter != end)  {\n        //std::cout << \"S43:\" << *iter << std::endl;\n        elems.push_back(*iter);\n        ++iter;\n    }\n\n    return elems;\n}\n*/\n\nbool startsWith(std::string str, std::string part) {\n  for (unsigned int i = 0; i < part.size(); i++)\n    if (str.at(i) != part.at(i))\n      return false;\n  return true;\n}\n\nbool endsWith(std::string str, std::string part) {\n  if (str.size() < part.size())\n    return false;\n  for (unsigned int i = str.size() - part.size(), j = 0; j < part.size();\n       i++, j++)\n    if (str.at(i) != part.at(j))\n      return false;\n  return true;\n}\n\nstd::string format(const std::string fmt, ...) {\n  int size = 100;\n  std::string str;\n  va_list ap;\n  while (1) {\n    str.resize(size);\n    va_start(ap, fmt);\n    int n = vsnprintf((char*)str.c_str(), size, fmt.c_str(), ap);\n    va_end(ap);\n    if (n > -1 && n < size) {\n      str.resize(n);\n      return str;\n    }\n    if (n > -1)\n      size = n + 1;\n    else\n      size *= 2;\n  }\n  return str;\n}\n\nvoid find_and_replace(std::string& source, std::string const& find,\n                      std::string const& replace) {\n  for (std::string::size_type i = 0;\n       (i = source.find(find, i)) != std::string::npos;) {\n    source.replace(i, find.length(), replace);\n    i += replace.length();\n  }\n}\n\nstd::string get_clean_string(std::string string) {\n  find_and_replace(string, \"/\", \"_\");\n  find_and_replace(string, \"\\\\\", \"_\");\n  find_and_replace(string, \".\", \"_\");\n  find_and_replace(string, \"(\", \"_\");\n  find_and_replace(string, \")\", \"_\");\n  find_and_replace(string, \"[\", \"_\");\n  find_and_replace(string, \"]\", \"_\");\n  return string;\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/misc/util/utilString.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * utilstd::string.h\n *\n *  Created on: 22/07/2014\n *      Author: jodymaick\n */\n\n#ifndef UTILSTRING_H_\n#define UTILSTRING_H_\n\n#include <string>\n#include <vector>\n\n/**\n * Splits the given std::string in a vector of std::strings at each occurrence\n * of a given delimiter.\n *\n * @param str The std::string to be splitted.\n * @param delim The delimiter.\n * @param parts The vector comprising the parts of the split std::string.\n */\nvoid split(const std::string& str, const std::string& delim,\n           std::vector<std::string>& parts);\n\n/**\n * Splits the given std::string at each occurrence of a given regex-based\n * delimiter and returns a vector of std::strings.\n *\n * @param s The std::string to be splitted.\n * @param rgx_str The regex-based delimiter.\n * @return The vector comprising the 
parts of the split std::string.\n */\n// std::vector<std::string> regex_split(const std::string & s, std::string\n// rgx_str = \"\\\\s+\");\n\n/**\n * Checks if a given std::string starts with another given std::string.\n *\n * @param str The std::string to be checked in.\n * @param part The part to be searched for in the std::string.\n * @return True if str starts with part. False otherwise.\n */\nbool startsWith(std::string str, std::string part);\n\n/**\n * Checks if a given std::string ends with another given std::string.\n *\n * @param str The std::string to be checked in.\n * @param part The part to be searched for in the std::string.\n * @return True if str ends with part. False otherwise.\n */\nbool endsWith(std::string str, std::string part);\n\n/**\n * Returns a formatted std::string (just like it would be printed with printf).\n *\n * @param fmt The desired format of the std::string.\n * @return The formatted std::string.\n */\nstd::string format(const std::string fmt, ...);\n\nvoid find_and_replace(std::string& source, std::string const& find,\n                      std::string const& replace);\n\nstd::string get_clean_string(std::string string);\n\n#endif /* UTILSTRING_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/AigParser.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include <AigParser.h>\n#include <iostream>\n#include <algorithm>\n\nAigParser::AigParser(aig::Aig& aig) : aig(aig) {\n  currLine = 0;\n  currChar = 0;\n}\n\nAigParser::AigParser(std::string fileName, aig::Aig& aig) : aig(aig) {\n  currLine = 1;\n  currChar = 0;\n  open(fileName);\n}\n\nAigParser::~AigParser() { close(); }\n\nvoid AigParser::open(std::string fileName) {\n  close();\n  currLine = 1;\n  currChar = 0;\n  file.open(fileName.c_str());\n}\n\nbool AigParser::isOpen() const { return file.is_open(); }\n\nvoid AigParser::close() {\n  if (isOpen()) {\n    file.close();\n    currLine = 0;\n    currChar = 0;\n  }\n}\n\nunsigned AigParser::decode() {\n  unsigned x = 0, in = 0;\n  unsigned char c;\n  while ((c = parseByte()) & 0x80)\n    x |= (c & 0x7f) << 
(7 * in++);\n  return x | (c << (7 * in));\n}\n\nchar AigParser::parseByte() {\n  char byte;\n  file.read(&byte, 1);\n  if (file.eof()) {\n    throw unexpected_eof(currLine, currChar);\n  }\n  if (byte == '\\n') {\n    currLine++;\n    currChar = 0;\n  } else {\n    currChar++;\n  }\n  return byte;\n}\n\nbool AigParser::parseBool(std::string delimChar) {\n  bool result;\n  int c = parseChar();\n  switch (c) {\n  case '0':\n    result = false;\n    break;\n  case '1':\n    result = true;\n    break;\n  default:\n    throw syntax_error(\n        currLine, currChar,\n        format(\"Expected Boolean (0 or 1) but found: ASCII %d\", c));\n  }\n  c = parseChar();\n  if (delimChar.find(c) == std::string::npos) {\n    throw syntax_error(\n        currLine, currChar,\n        format(\"Expected Boolean (0 or 1) but found: ASCII %d\", c));\n  }\n  return result;\n}\n\nunsigned char AigParser::parseChar() {\n  int result = file.get();\n  if (file.eof()) {\n    throw unexpected_eof(currLine, currChar);\n  }\n  if (result == '\\r') {\n    result = file.get();\n    if (file.eof()) {\n      throw unexpected_eof(currLine, currChar);\n    }\n  }\n  if (result == '\\n') {\n    currLine++;\n    currChar = 0;\n  } else {\n    currChar++;\n  }\n  return result;\n}\n\nint AigParser::parseInt(std::string delimChar) {\n  unsigned char c;\n  int result;\n  bool done = false;\n  std::stringstream buffer;\n  c = parseChar();\n  if (!isdigit(c))\n    throw syntax_error(currLine, currChar,\n                       format(\"Expected integer but found: ASCII %d\", c));\n  buffer << c;\n  while (!done) {\n    c = parseChar();\n    if (isdigit(c)) {\n      buffer << c;\n    } else if (delimChar.find(c) != std::string::npos) {\n      buffer >> result;\n      done = true;\n    } else {\n      throw syntax_error(currLine, currChar,\n                         format(\"Expected integer but found: ASCII %d\", c));\n    }\n  }\n  return result;\n}\n\nstd::string AigParser::parseString(std::string 
delimChar) {\n  bool done = false;\n  unsigned char c;\n  std::stringstream buffer;\n  c = parseChar();\n  if (delimChar.find(c) != std::string::npos || c == '\\0') {\n    throw syntax_error(currLine, currChar,\n                       format(\"Expected integer but found: ASCII %d\", c));\n  }\n  buffer << c;\n  while (!done) {\n    c = parseChar();\n    if (delimChar.find(c) != std::string::npos || c == '\\0') {\n      done = true;\n    } else {\n      buffer << c;\n    }\n  }\n  return buffer.str();\n}\n\nvoid AigParser::resize() {\n  inputs.resize(i);\n  latches.resize(l);\n  outputs.resize(o);\n  ands.resize(a);\n\n  // The symbols may be empty\n  // inputNames.resize(i);\n  // latchNames.resize(l);\n  // outputNames.resize(o);\n\n  int nNodes = m + o + 1;\n  aig.resizeNodeVectors(nNodes);\n  // aig.getNodes().resize(nNodes);\n  // aig.getNodesTravId().resize(nNodes);\n  // aig.getNodesFanoutMap().resize(nNodes);\n}\n\nvoid AigParser::parseAagHeader() {\n  std::string aag = parseString(\" \");\n  if (aag.compare(\"aag\") != 0) {\n    throw syntax_error(1, 0, \"Expected aag header\");\n  }\n  m = parseInt(\" \");\n  i = parseInt(\" \");\n  l = parseInt(\" \");\n  o = parseInt(\" \");\n  a = parseInt(\"\\n\");\n\n  if (m != (i + l + a)) {\n    throw semantic_error(1, 4, \"Incorrect value for M\");\n  }\n  resize();\n}\n\nvoid AigParser::parseAigHeader() {\n  std::string aig = parseString(\" \");\n  if (aig.compare(\"aig\") != 0) {\n    throw syntax_error(1, 0, \"Expected aig header\");\n  }\n  m = parseInt(\" \");\n  i = parseInt(\" \");\n  l = parseInt(\" \");\n  o = parseInt(\" \");\n  a = parseInt(\"\\n\");\n\n  if (m != (i + l + a)) {\n    throw semantic_error(1, 4, \"Incorrect value for M\");\n  }\n  resize();\n}\n\nvoid AigParser::parseAagInputs() {\n  for (int in = 0; in < i; in++) {\n    inputs[in] = parseInt(\"\\n\");\n  }\n}\n\nvoid AigParser::parseAigInputs() {\n  int x = 2;\n  for (int in = 0; in < i; in++) {\n    inputs[in] = x;\n    x          = x + 
2;\n  }\n}\n\nvoid AigParser::parseAagLatches() {\n  unsigned line;\n  for (int in = 0; in < l; in++) {\n    int lhs = parseInt(\" \");\n    line    = currLine;\n    int rhs = parseInt(\"\\n\");\n    bool init;\n    if (line == currLine) {\n      init = parseBool(\"\\n\");\n    } else {\n      init = false;\n    }\n    latches[in] = std::make_tuple(lhs, rhs, init);\n  }\n}\n\nvoid AigParser::parseAigLatches() {\n  unsigned line;\n  int lhs = i * 2 + 2;\n  for (int in = 0; in < l; in++) {\n    line    = currLine;\n    int rhs = parseInt(\"\\n\");\n    bool init;\n    if (line == currLine) {\n      init = parseBool(\"\\n\");\n    } else {\n      init = false;\n    }\n    latches[in] = std::make_tuple(lhs, rhs, init);\n    lhs         = lhs + 2;\n  }\n}\n\nvoid AigParser::parseOutputs() {\n  for (int in = 0; in < o; in++) {\n    outputs[in] = parseInt(\"\\n\");\n  }\n}\n\nvoid AigParser::parseAagAnds() {\n  for (int in = 0; in < a; in++) {\n    int lhs  = parseInt(\" \");\n    int rhs0 = parseInt(\" \");\n    int rhs1 = parseInt(\"\\n\");\n    ands[in] = std::make_tuple(lhs, rhs0, rhs1);\n  }\n}\n\nvoid AigParser::parseAigAnds() {\n  int delta0, delta1;\n  int lhs = (i + l) * 2 + 2;\n  for (int in = 0; in < a; in++) {\n    delta0   = decode();\n    delta1   = decode();\n    int rhs0 = lhs - delta0;\n    if (rhs0 < 0) {\n      throw semantic_error(currLine, currChar,\n                           format(\"Negative rhs0: %d\", rhs0));\n    }\n    int rhs1 = rhs0 - delta1;\n    if (rhs1 < 0) {\n      throw semantic_error(currLine, currChar,\n                           format(\"Negative rhs0: %d\", rhs1));\n    }\n    ands[in] = std::make_tuple(lhs, rhs0, rhs1);\n    lhs      = lhs + 2;\n  }\n}\n\nvoid AigParser::parseSymbolTable() {\n  int c, n;\n  while (true) {\n    try {\n      c = parseChar();\n    } catch (unexpected_eof& e) {\n      return;\n    }\n    switch (c) {\n    case 'i':\n      n = parseInt(\" \");\n      if (n >= i)\n        throw semantic_error(currLine, 
currChar,\n                             \"Input number greater than number of inputs\");\n      if (inputNames.empty()) {\n        inputNames.resize(i);\n      }\n      inputNames[n] = parseString(\"\\n\");\n      break;\n    case 'l':\n      n = parseInt(\" \");\n      if (n >= l)\n        throw semantic_error(currLine, currChar,\n                             \"Latch number greater than number of latches\");\n      if (latchNames.empty()) {\n        latchNames.resize(l);\n      }\n      latchNames[n] = parseString(\"\\n\");\n      break;\n    case 'o':\n      n = parseInt(\" \");\n      if (n >= o)\n        throw semantic_error(currLine, currChar,\n                             \"Output number greater than number of outputs\");\n      if (outputNames.empty()) {\n        outputNames.resize(o);\n      }\n      outputNames[n] = parseString(\"\\n\");\n      break;\n    case 'c':\n      c = parseChar();\n      if (c != '\\n' && c != '\\r')\n        throw syntax_error(currLine, currChar);\n      if (file.peek() != 0)\n        designName = parseString(\"\\n\\0\");\n      else\n        designName = \"Unnamed\";\n      return;\n    }\n  }\n}\n\nvoid AigParser::parseAag() {\n  parseAagHeader();\n  parseAagInputs();\n  parseAagLatches();\n  parseOutputs();\n  parseAagAnds();\n  parseSymbolTable();\n  createAig();\n}\n\nvoid AigParser::parseAig() {\n  parseAigHeader();\n  parseAigInputs();\n  parseAigLatches();\n  parseOutputs();\n  parseAigAnds();\n  parseSymbolTable();\n  createAig();\n}\n\nvoid AigParser::createAig() {\n  aig.setDesignName(this->designName);\n  createConstant();\n  createInputs();\n  createLatches();\n  createOutputs();\n  createAnds();\n  // connectAnds();\n  connectAndsWithFanoutMap();\n  connectLatches();\n  connectOutputs();\n}\n\nvoid AigParser::createConstant() {\n  // Node Data\n  aig::NodeData nodeData;\n  nodeData.id      = 0;\n  nodeData.counter = 0;\n  nodeData.type    = aig::NodeType::CONSTZERO;\n  nodeData.level   = 0;\n  // AIG Node\n  
aig::Graph& aigGraph = aig.getGraph();\n  aig::GNode constNode;\n  constNode = aigGraph.createNode(nodeData);\n  aigGraph.addNode(constNode);\n  aig.getNodes()[nodeData.id] = constNode;\n}\n\nvoid AigParser::createInputs() {\n  aig::Graph& aigGraph = aig.getGraph();\n  for (int in = 0; in < i; in++) {\n    // Node Data\n    aig::NodeData nodeData;\n    nodeData.id      = inputs[in] / 2;\n    nodeData.counter = 0;\n    nodeData.type    = aig::NodeType::PI;\n    nodeData.level   = 0;\n    // AIG Node\n    aig::GNode inputNode;\n    inputNode = aigGraph.createNode(nodeData);\n    aigGraph.addNode(inputNode);\n    aig.getInputNodes().push_back(inputNode);\n    aig.getNodes()[nodeData.id] = inputNode;\n  }\n\n  aig.setInputNames(this->inputNames);\n}\n\nvoid AigParser::createLatches() {\n  aig::Graph& aigGraph = aig.getGraph();\n  for (int in = 0; in < l; in++) {\n    // Node Data\n    aig::NodeData nodeData;\n    nodeData.id      = (std::get<0>(latches[in]) / 2);\n    nodeData.counter = 0;\n    nodeData.type    = aig::NodeType::LATCH;\n    // NodeData.initialValue = std::get<3>( latches[in] ); // FIXME\n    // AIG Node\n    aig::GNode latchNode;\n    latchNode = aigGraph.createNode(nodeData);\n    aigGraph.addNode(latchNode);\n    aig.getLatchNodes().push_back(latchNode);\n    aig.getNodes()[nodeData.id] = latchNode;\n  }\n\n  aig.setLatchNames(this->latchNames);\n}\n\nvoid AigParser::createOutputs() {\n  aig::Graph& aigGraph = aig.getGraph();\n  for (int in = 0; in < o; in++) {\n    // Node Data\n    aig::NodeData nodeData;\n    nodeData.id      = m + in + 1;\n    nodeData.counter = 0;\n    nodeData.type    = aig::NodeType::PO;\n    // AIG Node\n    aig::GNode outputNode;\n    outputNode = aigGraph.createNode(nodeData);\n    aigGraph.addNode(outputNode);\n    aig.getOutputNodes().push_back(outputNode);\n    aig.getNodes()[nodeData.id] = outputNode;\n  }\n\n  aig.setOutputNames(this->outputNames);\n}\n\nvoid AigParser::createAnds() {\n  aig::Graph& aigGraph = 
aig.getGraph();\n  for (int in = 0; in < a; in++) {\n    // Node Data\n    aig::NodeData nodeData;\n    nodeData.id = (std::get<0>(ands[in]) / 2);\n    std::stringstream sName;\n    nodeData.counter = 0;\n    nodeData.type    = aig::NodeType::AND;\n    // AIG Node\n    aig::GNode andNode;\n    andNode = aigGraph.createNode(nodeData);\n    aigGraph.addNode(andNode);\n    aig.getNodes()[nodeData.id] = andNode;\n  }\n}\n\nvoid AigParser::connectLatches() {\n  aig::Graph& aigGraph = aig.getGraph();\n  for (int in = 0; in < l; in++) {\n    int lhs                      = std::get<0>(latches[in]);\n    aig::GNode latchNode         = aig.getNodes()[lhs / 2];\n    aig::NodeData& latchNodeData = aigGraph.getData(latchNode);\n\n    int rhs                      = std::get<1>(latches[in]);\n    aig::GNode inputNode         = aig.getNodes()[rhs / 2];\n    aig::NodeData& inputNodeData = aigGraph.getData(inputNode);\n    inputNodeData.nFanout += 1;\n\n    aigGraph.getEdgeData(aigGraph.addEdge(inputNode, latchNode)) = !(rhs % 2);\n    latchNodeData.level = 1 + inputNodeData.level;\n  }\n}\n\nvoid AigParser::connectOutputs() {\n  aig::Graph& aigGraph = aig.getGraph();\n  for (int in = 0; in < o; in++) {\n    aig::GNode outputNode = aig.getNodes()[m + in + 1];\n    aig::NodeData& outputNodeData =\n        aigGraph.getData(outputNode, galois::MethodFlag::WRITE);\n    // outputNodeData.nFanin = 1;\n\n    int rhs              = outputs[in];\n    aig::GNode inputNode = aig.getNodes()[rhs / 2];\n    aig::NodeData& inputNodeData =\n        aigGraph.getData(inputNode, galois::MethodFlag::WRITE);\n    inputNodeData.nFanout += 1;\n\n    aigGraph.getEdgeData(aigGraph.addEdge(inputNode, outputNode)) = !(rhs % 2);\n    outputNodeData.level = 1 + inputNodeData.level;\n  }\n}\n\nvoid AigParser::connectAnds() {\n\n  aig::Graph& aigGraph = aig.getGraph();\n\n  // Each andDef is composed by three nodes A B C, A is the AND itself, B and C\n  // are the two input nodes.\n  for (auto andDef : 
this->ands) {\n\n    int A              = std::get<0>(andDef);\n    aig::GNode andNode = aig.getNodes()[A / 2];\n    aig::NodeData& andData =\n        aigGraph.getData(andNode, galois::MethodFlag::WRITE);\n    // andData.nFanin = 2;\n\n    int B              = std::get<1>(andDef);\n    aig::GNode lhsNode = aig.getNodes()[B / 2];\n    aig::NodeData& lhsData =\n        aigGraph.getData(lhsNode, galois::MethodFlag::WRITE);\n    lhsData.nFanout += 1;\n\n    int C              = std::get<2>(andDef);\n    aig::GNode rhsNode = aig.getNodes()[C / 2];\n    aig::NodeData& rhsData =\n        aigGraph.getData(rhsNode, galois::MethodFlag::WRITE);\n    rhsData.nFanout += 1;\n\n    aigGraph.getEdgeData(aigGraph.addMultiEdge(\n        lhsNode, andNode, galois::MethodFlag::UNPROTECTED)) = !(B % 2);\n\n    aigGraph.getEdgeData(aigGraph.addMultiEdge(\n        rhsNode, andNode, galois::MethodFlag::UNPROTECTED)) = !(C % 2);\n\n    andData.level = 1 + std::max(lhsData.level, rhsData.level);\n  }\n}\n\nvoid AigParser::connectAndsWithFanoutMap() {\n\n  aig::Graph& aigGraph = aig.getGraph();\n\n  this->levelHistogram.resize(50000, 0); // FIXME\n  this->levelHistogram[0] = this->i;\n\n  // Each andDef is composed by three nodes A B C, A is the AND itself, B and C\n  // are the two input nodes.\n  for (auto andDef : this->ands) {\n\n    int A              = std::get<0>(andDef);\n    aig::GNode andNode = aig.getNodes()[A / 2];\n    aig::NodeData& andData =\n        aigGraph.getData(andNode, galois::MethodFlag::WRITE);\n    // andData.nFanin = 2;\n\n    int B              = std::get<1>(andDef);\n    aig::GNode lhsNode = aig.getNodes()[B / 2];\n    aig::NodeData& lhsData =\n        aigGraph.getData(lhsNode, galois::MethodFlag::WRITE);\n    bool lhsPol = !(B % 2);\n    lhsData.nFanout += 1;\n\n    int C              = std::get<2>(andDef);\n    aig::GNode rhsNode = aig.getNodes()[C / 2];\n    aig::NodeData& rhsData =\n        aigGraph.getData(rhsNode, galois::MethodFlag::WRITE);\n    bool rhsPol 
= !(C % 2);\n    rhsData.nFanout += 1;\n\n    aigGraph.getEdgeData(aigGraph.addMultiEdge(\n        lhsNode, andNode, galois::MethodFlag::UNPROTECTED)) = lhsPol;\n\n    aigGraph.getEdgeData(aigGraph.addMultiEdge(\n        rhsNode, andNode, galois::MethodFlag::UNPROTECTED)) = rhsPol;\n\n    aig.insertNodeInFanoutMap(andNode, lhsNode, rhsNode, lhsPol, rhsPol);\n\n    andData.level = 1 + std::max(lhsData.level, rhsData.level);\n    this->levelHistogram[andData.level] += 1;\n  }\n\n  int i = 0;\n  while (i < 50000) {\n    if (this->levelHistogram[i] == 0) {\n      break;\n    }\n    i++;\n  }\n  this->levelHistogram.resize(i);\n}\n\nint AigParser::getI() { return i; }\n\nint AigParser::getL() { return l; }\n\nint AigParser::getO() { return o; }\n\nint AigParser::getA() { return a; }\n\nint AigParser::getE() {\n\n  aig::Graph& aigGraph = aig.getGraph();\n  int nEdges           = 0;\n\n  for (auto node : aigGraph) {\n    nEdges += std::distance(aigGraph.edge_begin(node), aigGraph.edge_end(node));\n  }\n\n  return nEdges;\n}\n\nstd::vector<int>& AigParser::getLevelHistogram() {\n  return this->levelHistogram;\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/AigParser.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef AIGPARSER_H_\n#define AIGPARSER_H_\n#include <string>\n#include <fstream>\n#include <sstream>\n\n#include \"semantic_error.h\"\n#include \"syntax_error.h\"\n#include \"unexpected_eof.h\"\n#include \"../misc/util/utilString.h\"\n#include \"../subjectgraph/aig/Aig.h\"\n\nclass AigParser {\n\nprivate:\n  unsigned currLine;\n  unsigned currChar;\n  std::ifstream file;\n  int m, i, l, o, a;\n  std::vector<int> inputs, outputs;\n  std::vector<std::tuple<int, int, bool>> latches;\n  std::vector<std::tuple<int, int, int>> ands;\n  std::vector<std::string> inputNames, latchNames, outputNames;\n  std::vector<aig::GNode> nodes;\n  std::vector<int> levelHistogram;\n\n  aig::Aig& aig;\n  std::string designName;\n\n  unsigned decode();\n  bool parseBool(std::string delimChar);\n  char parseByte();\n  unsigned char 
parseChar();\n  int parseInt(std::string delimChar);\n  std::string parseString(std::string delimChar);\n\n  void resize();\n\n  void parseAagHeader();\n  void parseAigHeader();\n  void parseAagInputs();\n  void parseAigInputs();\n  void parseAagLatches();\n  void parseAigLatches();\n  void parseOutputs();\n  void parseAagAnds();\n  void parseAigAnds();\n  void parseSymbolTable();\n\n  void createAig();\n  void createConstant();\n  void createInputs();\n  void createLatches();\n  void createOutputs();\n  void createAnds();\n  void connectAndsWithFanoutMap();\n\n  void connectLatches();\n  void connectOutputs();\n  void connectAnds();\n\npublic:\n  AigParser(aig::Aig& aig);\n  AigParser(std::string fileName, aig::Aig& aig);\n  virtual ~AigParser();\n\n  void open(std::string fileName);\n  bool isOpen() const;\n  void close();\n  void parseAag();\n  void parseAig();\n\n  int getI();\n  int getL();\n  int getO();\n  int getA();\n  int getE();\n\n  std::vector<int>& getLevelHistogram();\n};\n\n#endif /* AIGPARSER_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/LookupTableParser.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"LookupTableParser.h\"\n\n#include <fstream>\n#include <sstream>\n\nnamespace lookuptables {\n\nLookupTableParser::LookupTableParser() {}\n\nLookupTableParser::~LookupTableParser() {}\n\nvoid LookupTableParser::parseFile(std::string fileName,\n                                  LookupTableElement** lookupTable) {\n\n  std::ifstream file(fileName);\n  std::string line;\n  std::string token, exp;\n  char lit, lev;\n\n  int i = 0;\n  int j = 0;\n\n  while (std::getline(file, line)) {\n\n    if (line.at(0) == '#') {\n      i++;\n      j = 0;\n      continue;\n    }\n\n    std::stringstream tokenizer;\n    tokenizer << line;\n\n    std::getline(tokenizer, exp, ';');\n\n    std::getline(tokenizer, token, ';');\n    lit = std::stoi(token);\n\n    std::getline(tokenizer, token, ';');\n    lev = std::stoi(token);\n\n  
  lookupTable[i][j] = LookupTableElement(exp, lit, lev);\n\n    j++;\n  }\n}\n\n} /* namespace lookuptables */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/LookupTableParser.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef LOOKUPTABLEPARSER_H_\n#define LOOKUPTABLEPARSER_H_\n\n#include <iostream>\n\nnamespace lookuptables {\n\ntypedef struct lookupTableElement {\n  std::string expression;\n  char literals;\n  char levels;\n\n  lookupTableElement() {\n    expression = \"\";\n    literals   = 0;\n    levels     = 0;\n  }\n\n  lookupTableElement(std::string& expression, char literals, char levels)\n      : expression(expression), literals(literals), levels(levels) {}\n\n} LookupTableElement;\n\nclass LookupTableParser {\n\nprivate:\npublic:\n  LookupTableParser();\n\n  ~LookupTableParser();\n\n  void parseFile(std::string fileName, LookupTableElement** lookupTable);\n};\n\n} /* namespace lookuptables */\n\n#endif /* LOOKUPTABLEPARSER_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/semantic_error.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * semantic_error.cpp\n *\n *  Created on: Aug 25, 2014\n *      Author: marcos\n */\n\n#include \"semantic_error.h\"\n\nsemantic_error::semantic_error(unsigned l, unsigned c, std::string msg)\n    : exception() {\n  this->l = l;\n  this->c = c;\n  std::stringstream ret;\n  ret << \"Semantic error in line-\" << l << \" char-\" << c << \": \" << msg\n      << std::endl;\n  this->full_msg = ret.str();\n}\n\nconst char* semantic_error::what() const throw() {\n  return this->full_msg.c_str();\n}\n\nsemantic_error::~semantic_error() throw() {}\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/semantic_error.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * semantic_error.h\n *\n *  Created on: Aug 25, 2014\n *      Author: marcos\n */\n\n#ifndef SEMANTIC_ERROR_H_\n#define SEMANTIC_ERROR_H_\n\n#include <exception>\n#include <sstream>\n#include <string>\n\nclass semantic_error : public std::exception {\n  unsigned l, c;\n  std::string full_msg;\n\npublic:\n  semantic_error(unsigned l, unsigned c, std::string msg = \"\");\n  virtual const char* what() const throw();\n  virtual ~semantic_error() throw();\n};\n\n#endif /* SEMANTIC_ERROR_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/syntax_error.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * SyntaxExeption.cpp\n *\n *  Created on: Aug 25, 2014\n *      Author: marcos\n */\n\n#include \"syntax_error.h\"\n\nsyntax_error::syntax_error(unsigned l, unsigned c, std::string msg)\n    : std::exception() {\n  this->l = l;\n  this->c = c;\n  std::stringstream ret;\n  ret << \"Syntax error in line-\" << l << \" char-\" << c << \": \" << msg\n      << std::endl;\n  this->full_msg = ret.str();\n}\n\nconst char* syntax_error::what() const throw() {\n  return this->full_msg.c_str();\n}\n\nsyntax_error::~syntax_error() throw() {}\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/syntax_error.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * syntax_error.h\n *\n *  Created on: Aug 25, 2014\n *      Author: marcos\n */\n\n#ifndef SYNTAXEXEPTION_H_\n#define SYNTAXEXEPTION_H_\n\n#include <exception>\n#include <sstream>\n#include <string>\n\nclass syntax_error : public std::exception {\n  unsigned l, c;\n  std::string full_msg;\n\npublic:\n  syntax_error(unsigned l, unsigned c, std::string msg = \"\");\n  virtual const char* what() const throw();\n  virtual ~syntax_error() throw();\n};\n\n#endif /* SYNTAXEXEPTION_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/unexpected_eof.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * unexpected_eof.cpp\n *\n *  Created on: Aug 25, 2014\n *      Author: marcos\n */\n\n#include \"unexpected_eof.h\"\n\nunexpected_eof::unexpected_eof(unsigned l, unsigned c) : std::exception() {\n  this->l = l;\n  this->c = c;\n  std::stringstream ret;\n  ret << \"Unexpected eof in line-\" << l << \" char-\" << c << std::endl;\n  this->full_msg = ret.str();\n}\n\nunexpected_eof::unexpected_eof(unsigned l, unsigned c, std::string msg)\n    : std::exception() {\n  this->l = l;\n  this->c = c;\n  std::stringstream ret;\n  ret << \"Unexpected eof in line-\" << l << \" char-\" << c\n      << \". Last token: \" << msg << std::endl;\n  this->full_msg = ret.str();\n}\n\nconst char* unexpected_eof::what() const throw() {\n  return this->full_msg.c_str();\n}\n\nunexpected_eof::~unexpected_eof() throw() {}\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/parsers/unexpected_eof.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n * unexpected_eof.h\n *\n *  Created on: Aug 25, 2014\n *      Author: marcos\n */\n\n#ifndef UNEXPECTED_EOF_H_\n#define UNEXPECTED_EOF_H_\n\n#include <exception>\n#include <sstream>\n#include <string>\n\nclass unexpected_eof : public std::exception {\n  unsigned l, c;\n  std::string full_msg;\n\npublic:\n  unexpected_eof(unsigned l, unsigned c);\n  unexpected_eof(unsigned l, unsigned c, std::string msg);\n  virtual const char* what() const throw();\n  virtual ~unexpected_eof() throw();\n};\n\n#endif /* UNEXPECTED_EOF_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/subjectgraph/aig/Aig.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Aig.h\"\n\n#include <fstream>\n#include <unordered_map>\n\nnamespace andInverterGraph {\n\nAig::Aig() {\n  this->idCounter     = 0;\n  this->expansionRate = 0.2;\n}\n\nAig::Aig(float expansionRate) {\n  this->idCounter     = 0;\n  this->expansionRate = expansionRate;\n}\n\nAig::~Aig() {}\n\nvoid Aig::resize(int m, int i, int l, int o, bool hasSymbols) {\n  this->inputNodes.resize(i);\n  this->latchNodes.resize(l);\n  this->outputNodes.resize(o);\n\n  if (hasSymbols) {\n    this->inputNames.resize(i);\n    this->latchNames.resize(l);\n    this->outputNames.resize(o);\n  }\n\n  int nNodes = m + o + 1;\n  this->nodes.resize(nNodes);\n  this->nodesTravId.resize(nNodes);\n  this->nodesFanoutMap.resize(nNodes);\n  this->idCounter = nNodes;\n}\n\nvoid Aig::resizeNodeVectors(int size) {\n  this->idCounter = size;\n 
 this->nodes.resize(size);\n  this->nodesTravId.resize(size);\n  this->nodesFanoutMap.resize(size);\n}\n\nvoid Aig::expandNodeVectors(size_t extraSize) {\n  this->idCounter = this->nodes.size();\n  size_t nNodes   = this->idCounter + extraSize;\n  this->nodes.resize(nNodes);\n  this->nodesTravId.resize(nNodes);\n  this->nodesFanoutMap.resize(nNodes);\n}\n\nsize_t Aig::getNextId() {\n  if (this->idCounter == nodes.size()) {\n    size_t extraSize =\n        (size_t)(float(this->nodes.size()) * this->expansionRate);\n    expandNodeVectors(extraSize);\n  }\n  size_t nextId = this->idCounter++;\n  return nextId;\n}\n\nGNode Aig::createAND(GNode lhsAnd, GNode rhsAnd, bool lhsAndPol,\n                     bool rhsAndPol) {\n\n  NodeData& lhsAndData = this->graph.getData(lhsAnd, galois::MethodFlag::READ);\n  NodeData& rhsAndData = this->graph.getData(rhsAnd, galois::MethodFlag::READ);\n  NodeData newAndData;\n\n  newAndData.id      = getNextId();\n  newAndData.type    = aig::NodeType::AND;\n  newAndData.level   = 1 + std::max(lhsAndData.level, rhsAndData.level);\n  newAndData.counter = 0;\n  newAndData.nFanout = 0;\n\n  GNode newAnd = this->graph.createNode(newAndData);\n  this->graph.addNode(newAnd);\n\n  this->graph.getEdgeData(graph.addMultiEdge(\n      lhsAnd, newAnd, galois::MethodFlag::WRITE)) = lhsAndPol;\n  this->graph.getEdgeData(graph.addMultiEdge(\n      rhsAnd, newAnd, galois::MethodFlag::WRITE)) = rhsAndPol;\n  lhsAndData.nFanout++;\n  rhsAndData.nFanout++;\n\n  // int faninSize = std::distance( aigGraph.in_edge_begin( newAnd ),\n  // aigGraph.in_edge_begin( newAnd ) ); assert( faninSize == 2 );\n\n  this->nodes[newAndData.id] = newAnd;\n  this->insertNodeInFanoutMap(newAnd, lhsAnd, rhsAnd, lhsAndPol, rhsAndPol);\n\n  return newAnd;\n}\n\nvoid Aig::insertNodeInFanoutMap(GNode andNode, GNode lhsNode, GNode rhsNode,\n                                bool lhsPol, bool rhsPol) {\n\n  NodeData& lhsNodeData =\n      this->graph.getData(lhsNode, 
galois::MethodFlag::READ);\n  NodeData& rhsNodeData =\n      this->graph.getData(rhsNode, galois::MethodFlag::READ);\n\n  unsigned key = makeAndHashKey(lhsNodeData.id, rhsNodeData.id, lhsPol, rhsPol);\n\n  if (lhsNodeData.id < rhsNodeData.id) {\n    this->nodesFanoutMap[lhsNodeData.id].emplace(key, andNode);\n  } else {\n    this->nodesFanoutMap[rhsNodeData.id].emplace(key, andNode);\n  }\n}\n\nvoid Aig::removeNodeInFanoutMap(GNode removedNode, GNode lhsNode, GNode rhsNode,\n                                bool lhsPol, bool rhsPol) {\n\n  GNode lhsInNode;\n  GNode rhsInNode;\n  bool lhsInNodePol;\n  bool rhsInNodePol;\n  int smallestId;\n\n  NodeData& lhsNodeData =\n      this->graph.getData(lhsNode, galois::MethodFlag::READ);\n  NodeData& rhsNodeData =\n      this->graph.getData(rhsNode, galois::MethodFlag::READ);\n\n  unsigned key = makeAndHashKey(lhsNodeData.id, rhsNodeData.id, lhsPol, rhsPol);\n\n  if (lhsNodeData.id < rhsNodeData.id) {\n    smallestId = lhsNodeData.id;\n  } else {\n    smallestId = rhsNodeData.id;\n  }\n\n  std::unordered_multimap<unsigned, GNode>& fanoutMap =\n      this->nodesFanoutMap[smallestId];\n  auto range = fanoutMap.equal_range(key);\n\n  for (auto it = range.first; it != range.second;) {\n\n    GNode fanoutNode = it->second;\n    NodeData& fanoutNodeData =\n        this->graph.getData(fanoutNode, galois::MethodFlag::READ);\n\n    if (fanoutNodeData.type != NodeType::AND) {\n      it++;\n      continue;\n    }\n\n    auto inEdge  = this->graph.in_edge_begin(fanoutNode);\n    lhsInNode    = this->graph.getEdgeDst(inEdge);\n    lhsInNodePol = this->graph.getEdgeData(inEdge);\n\n    if (lhsInNode == lhsNode) {\n      inEdge++;\n      rhsInNode    = this->graph.getEdgeDst(inEdge);\n      rhsInNodePol = this->graph.getEdgeData(inEdge);\n    } else {\n      rhsInNode    = lhsInNode;\n      rhsInNodePol = lhsInNodePol;\n      inEdge++;\n      lhsInNode    = this->graph.getEdgeDst(inEdge);\n      lhsInNodePol = 
this->graph.getEdgeData(inEdge);\n    }\n\n    if ((lhsInNode == lhsNode) && (lhsInNodePol == lhsPol) &&\n        (rhsInNode == rhsNode) && (rhsInNodePol == rhsPol) &&\n        (fanoutNode == removedNode)) {\n      it = fanoutMap.erase(it);\n    } else {\n      it++;\n    }\n  }\n}\n\nGNode Aig::lookupNodeInFanoutMap(GNode lhsNode, GNode rhsNode, bool lhsPol,\n                                 bool rhsPol) {\n\n  GNode lhsInNode;\n  GNode rhsInNode;\n  bool lhsInNodePol;\n  bool rhsInNodePol;\n  int smallestId;\n\n  NodeData& lhsNodeData =\n      this->graph.getData(lhsNode, galois::MethodFlag::READ);\n  NodeData& rhsNodeData =\n      this->graph.getData(rhsNode, galois::MethodFlag::READ);\n\n  unsigned key = makeAndHashKey(lhsNodeData.id, rhsNodeData.id, lhsPol, rhsPol);\n\n  if (lhsNodeData.id < rhsNodeData.id) {\n    smallestId = lhsNodeData.id;\n  } else {\n    smallestId = rhsNodeData.id;\n  }\n\n  std::unordered_multimap<unsigned, GNode>& fanoutMap =\n      this->nodesFanoutMap[smallestId];\n  auto range = fanoutMap.equal_range(key);\n\n  for (auto it = range.first; it != range.second; it++) {\n\n    GNode fanoutNode = it->second;\n    NodeData& fanoutNodeData =\n        this->graph.getData(fanoutNode, galois::MethodFlag::READ);\n\n    if (fanoutNodeData.type != NodeType::AND) {\n      continue;\n    }\n\n    auto inEdge  = this->graph.in_edge_begin(fanoutNode);\n    lhsInNode    = this->graph.getEdgeDst(inEdge);\n    lhsInNodePol = this->graph.getEdgeData(inEdge);\n\n    if (lhsInNode == lhsNode) {\n      inEdge++;\n      rhsInNode    = this->graph.getEdgeDst(inEdge);\n      rhsInNodePol = this->graph.getEdgeData(inEdge);\n    } else {\n      rhsInNode    = lhsInNode;\n      rhsInNodePol = lhsInNodePol;\n      inEdge++;\n      lhsInNode    = this->graph.getEdgeDst(inEdge);\n      lhsInNodePol = this->graph.getEdgeData(inEdge);\n      assert(lhsInNode == lhsNode);\n    }\n    if ((lhsInNode == lhsNode) && (lhsInNodePol == lhsPol) &&\n        (rhsInNode == 
rhsNode) && (rhsInNodePol == rhsPol)) {\n      return fanoutNode;\n    }\n  }\n\n  return nullptr;\n}\n\nunsigned Aig::makeAndHashKey(int lhsId, int rhsId, bool lhsPol, bool rhsPol) {\n\n  unsigned key = 0;\n\n  if (lhsId < rhsId) {\n    key ^= lhsId * 7937;\n    key ^= rhsId * 2971;\n    key ^= lhsPol ? 911 : 0;\n    key ^= rhsPol ? 353 : 0;\n  } else {\n    key ^= rhsId * 7937;\n    key ^= lhsId * 2971;\n    key ^= rhsPol ? 911 : 0;\n    key ^= lhsPol ? 353 : 0;\n  }\n\n  return key;\n}\n\nvoid Aig::registerTravId(int nodeId, int threadId, int travId) {\n  this->nodesTravId[nodeId].first  = threadId;\n  this->nodesTravId[nodeId].second = travId;\n}\n\nbool Aig::lookupTravId(int nodeId, int threadId, int travId) {\n  if ((this->nodesTravId[nodeId].first == threadId) &&\n      (this->nodesTravId[nodeId].second == travId)) {\n    return true;\n  } else {\n    return false;\n  }\n}\n\nstd::vector<std::pair<int, int>>& Aig::getNodesTravId() {\n  return this->nodesTravId;\n}\n\nstd::unordered_multimap<unsigned, GNode>& Aig::getFanoutMap(int nodeId) {\n  return this->nodesFanoutMap[nodeId];\n}\n\nstd::vector<std::unordered_multimap<unsigned, GNode>>&\nAig::getNodesFanoutMap() {\n  return this->nodesFanoutMap;\n}\n\nGraph& Aig::getGraph() { return this->graph; }\n\nstd::vector<GNode>& Aig::getNodes() { return this->nodes; }\n\nstd::vector<GNode>& Aig::getInputNodes() { return this->inputNodes; }\n\nstd::vector<GNode>& Aig::getLatchNodes() { return this->latchNodes; }\n\nstd::vector<GNode>& Aig::getOutputNodes() { return this->outputNodes; }\n\nGNode Aig::getConstZero() { return this->nodes[0]; }\n\nint Aig::getNumInputs() { return this->inputNodes.size(); }\n\nint Aig::getNumLatches() { return this->latchNodes.size(); }\n\nint Aig::getNumOutputs() { return this->outputNodes.size(); }\n\nint Aig::getNumAnds() {\n  int nNodes = std::distance(this->graph.begin(), this->graph.end());\n  return (nNodes - (getNumInputs() + getNumLatches() + getNumOutputs() + 1));\n  // +1 is 
to exclude the constant node.\n}\n\nint Aig::getDepth() {\n\n  resetAllIds();\n\n  int max = -1;\n\n  for (auto po : this->outputNodes) {\n    NodeData& poData = this->graph.getData(po, galois::MethodFlag::READ);\n    if (max < poData.level) {\n      max = poData.level;\n    }\n  }\n\n  assert(max > -1);\n\n  return max;\n}\n\nstd::vector<std::string>& Aig::getInputNames() { return this->inputNames; }\n\nvoid Aig::setInputNames(std::vector<std::string> inputNames) {\n  this->inputNames = inputNames;\n}\n\nstd::vector<std::string>& Aig::getLatchNames() { return this->latchNames; }\n\nvoid Aig::setLatchNames(std::vector<std::string> latchNames) {\n  this->latchNames = latchNames;\n}\n\nstd::vector<std::string>& Aig::getOutputNames() { return this->outputNames; }\n\nvoid Aig::setOutputNames(std::vector<std::string> outputNames) {\n  this->outputNames = outputNames;\n}\n\nstd::string Aig::getDesignName() { return this->designName; }\n\nvoid Aig::setDesignName(std::string designName) {\n  this->designName = designName;\n}\n\n/*\nbool Aig::isGNodeComplemented(GNode node) {\n  return (bool)(((unsigned long int)node) & 01u);\n}\n\nGNode Aig::makeGNodeRegular(GNode node) {\n  return (GNode)((unsigned long int)(node) & ~01u);\n}\n\nGNode Aig::makeGNodeComplemented(GNode node) {\n  return (GNode)((unsigned long int)(node) ^ 01u);\n}\n*/\n\n// ########## ALGORITHMS ######## ///\n\nstruct ResetNodeCounters {\n  aig::Graph& aigGraph;\n\n  ResetNodeCounters(aig::Graph& aigGraph) : aigGraph(aigGraph) {}\n\n  void operator()(aig::GNode node) {\n    aig::NodeData& nodeData = aigGraph.getData(node, galois::MethodFlag::WRITE);\n    nodeData.counter        = 0;\n  }\n};\n\nvoid Aig::resetAllNodeCounters() {\n  galois::do_all(galois::iterate(graph), ResetNodeCounters{graph},\n                 galois::steal());\n}\n\nvoid Aig::resetAndIds() {\n\n  std::stack<GNode> stack;\n\n  computeTopologicalSortForAnds(stack);\n\n  int currentId = this->getNumInputs() + this->getNumLatches() + 
1;\n\n  while (!stack.empty()) {\n\n    GNode node = stack.top();\n    stack.pop();\n    NodeData& nodeData       = graph.getData(node, galois::MethodFlag::WRITE);\n    nodeData.id              = currentId++;\n    nodeData.counter         = 0;\n    this->nodes[nodeData.id] = node;\n  }\n\n  // std::cout << std::endl << \"All AND node IDs were reseted!\" << std::endl;\n}\n\nvoid Aig::resetAndPIsIds() {\n\n  std::stack<GNode> stack;\n\n  computeTopologicalSortForAnds(stack);\n\n  int currentId = 1;\n\n  for (GNode pi : this->inputNodes) {\n    NodeData& piData       = this->graph.getData(pi, galois::MethodFlag::WRITE);\n    piData.id              = currentId++;\n    this->nodes[piData.id] = pi;\n  }\n\n  while (!stack.empty()) {\n    GNode node = stack.top();\n    stack.pop();\n    NodeData& nodeData       = graph.getData(node, galois::MethodFlag::WRITE);\n    nodeData.id              = currentId++;\n    this->nodes[nodeData.id] = node;\n  }\n\n  // std::cout << std::endl << \"All AND node IDs were reseted!\" << std::endl;\n}\n\nvoid Aig::resetAndPOsIds() {\n\n  std::stack<GNode> stack;\n\n  computeTopologicalSortForAnds(stack);\n\n  int currentId = this->getNumInputs() + this->getNumLatches() + 1;\n\n  while (!stack.empty()) {\n    GNode node = stack.top();\n    stack.pop();\n    NodeData& nodeData       = graph.getData(node, galois::MethodFlag::WRITE);\n    nodeData.id              = currentId++;\n    this->nodes[nodeData.id] = node;\n  }\n\n  for (GNode po : this->outputNodes) {\n    NodeData& poData       = this->graph.getData(po, galois::MethodFlag::WRITE);\n    poData.id              = currentId++;\n    this->nodes[poData.id] = po;\n  }\n\n  // std::cout << std::endl << \"All AND node IDs were reseted!\" << std::endl;\n}\n\nvoid Aig::resetAllIds() {\n\n  std::stack<GNode> stack;\n\n  computeTopologicalSortForAnds(stack);\n\n  int currentId = 1;\n\n  for (GNode pi : this->inputNodes) {\n    NodeData& piData       = this->graph.getData(pi, 
galois::MethodFlag::WRITE);\n    piData.id              = currentId++;\n    piData.level           = 0;\n    this->nodes[piData.id] = pi;\n  }\n\n  for (GNode latch : this->latchNodes) {\n    NodeData& latchData = this->graph.getData(latch, galois::MethodFlag::WRITE);\n    latchData.id        = currentId++;\n    latchData.level     = 0; // FIXME\n    this->nodes[latchData.id] = latch;\n  }\n\n  while (!stack.empty()) {\n    GNode node = stack.top();\n    stack.pop();\n    NodeData& nodeData = graph.getData(node, galois::MethodFlag::WRITE);\n    nodeData.id        = currentId++;\n\n    auto inEdge      = this->graph.in_edge_begin(node);\n    GNode lhsNode    = this->graph.getEdgeDst(inEdge);\n    NodeData lhsData = this->graph.getData(lhsNode, galois::MethodFlag::READ);\n    inEdge++;\n    GNode rhsNode    = this->graph.getEdgeDst(inEdge);\n    NodeData rhsData = this->graph.getData(rhsNode, galois::MethodFlag::READ);\n\n    nodeData.level = 1 + std::max(lhsData.level, rhsData.level);\n\n    this->nodes[nodeData.id] = node;\n  }\n\n  for (GNode po : this->outputNodes) {\n    NodeData& poData = this->graph.getData(po, galois::MethodFlag::WRITE);\n    poData.id        = currentId++;\n\n    auto inEdge     = this->graph.in_edge_begin(po);\n    GNode inNode    = this->graph.getEdgeDst(inEdge);\n    NodeData inData = this->graph.getData(inNode, galois::MethodFlag::READ);\n\n    poData.level           = inData.level;\n    this->nodes[poData.id] = po;\n  }\n\n  // std::cout << std::endl << \"All AND node IDs were reseted!\" << std::endl;\n}\n\nvoid Aig::computeTopologicalSortForAll(std::stack<GNode>& stack) {\n\n  int size = this->nodes.size();\n  std::vector<bool> visited(size, false);\n\n  for (GNode pi : this->inputNodes) {\n    for (auto outEdge : this->graph.out_edges(pi)) {\n\n      GNode node         = this->graph.getEdgeDst(outEdge);\n      NodeData& nodeData = this->graph.getData(node, galois::MethodFlag::READ);\n\n      if (!visited[nodeData.id]) {\n        
topologicalSortAll(node, visited, stack);\n      }\n    }\n\n    stack.push(pi);\n  }\n\n  for (GNode latch : this->latchNodes) {\n    for (auto outEdge : this->graph.out_edges(latch)) {\n\n      GNode node         = this->graph.getEdgeDst(outEdge);\n      NodeData& nodeData = this->graph.getData(node, galois::MethodFlag::READ);\n\n      if (!visited[nodeData.id]) {\n        topologicalSortAll(node, visited, stack);\n      }\n    }\n\n    stack.push(latch);\n  }\n}\n\nvoid Aig::topologicalSortAll(GNode node, std::vector<bool>& visited,\n                             std::stack<GNode>& stack) {\n\n  NodeData& nodeData   = graph.getData(node, galois::MethodFlag::READ);\n  visited[nodeData.id] = true;\n\n  for (auto outEdge : this->graph.out_edges(node)) {\n\n    GNode nextNode = this->graph.getEdgeDst(outEdge);\n    NodeData& nextNodeData =\n        this->graph.getData(nextNode, galois::MethodFlag::READ);\n\n    if (!visited[nextNodeData.id]) {\n      topologicalSortAll(nextNode, visited, stack);\n    }\n  }\n\n  stack.push(node);\n}\n\nvoid Aig::computeTopologicalSortForAnds(std::stack<GNode>& stack) {\n\n  int size = this->nodes.size();\n  std::vector<bool> visited(size, false);\n\n  for (GNode pi : this->inputNodes) {\n    for (auto outEdge : this->graph.out_edges(pi)) {\n\n      GNode node         = this->graph.getEdgeDst(outEdge);\n      NodeData& nodeData = this->graph.getData(node, galois::MethodFlag::READ);\n\n      if ((!visited[nodeData.id]) && (nodeData.type == NodeType::AND)) {\n        topologicalSortAnds(node, visited, stack);\n      }\n    }\n  }\n\n  for (GNode latch : this->latchNodes) {\n    for (auto outEdge : this->graph.out_edges(latch)) {\n\n      GNode node         = this->graph.getEdgeDst(outEdge);\n      NodeData& nodeData = this->graph.getData(node, galois::MethodFlag::READ);\n\n      if ((!visited[nodeData.id]) && (nodeData.type == NodeType::AND)) {\n        topologicalSortAnds(node, visited, stack);\n      }\n    }\n  }\n}\n\nvoid 
Aig::topologicalSortAnds(GNode node, std::vector<bool>& visited,\n                              std::stack<GNode>& stack) {\n\n  NodeData& nodeData   = graph.getData(node, galois::MethodFlag::READ);\n  visited[nodeData.id] = true;\n\n  for (auto outEdge : this->graph.out_edges(node)) {\n\n    GNode nextNode = this->graph.getEdgeDst(outEdge);\n    NodeData& nextNodeData =\n        this->graph.getData(nextNode, galois::MethodFlag::READ);\n\n    if ((!visited[nextNodeData.id]) && (nextNodeData.type == NodeType::AND)) {\n      topologicalSortAnds(nextNode, visited, stack);\n    }\n  }\n\n  stack.push(node);\n}\n\nvoid Aig::computeGenericTopologicalSortForAnds(\n    std::vector<GNode>& sortedNodes) {\n\n  int size = this->nodes.size();\n  std::vector<bool> visited(size, false);\n\n  for (GNode pi : this->inputNodes) {\n    for (auto outEdge : this->graph.out_edges(pi)) {\n\n      GNode node = this->graph.getEdgeDst(outEdge);\n      NodeData& nodeData =\n          this->graph.getData(node, galois::MethodFlag::UNPROTECTED);\n\n      if ((!visited[nodeData.id]) && (nodeData.type == NodeType::AND)) {\n        genericTopologicalSortAnds(node, visited, sortedNodes);\n      }\n    }\n  }\n\n  for (GNode latch : this->latchNodes) {\n    for (auto outEdge : this->graph.out_edges(latch)) {\n\n      GNode node = this->graph.getEdgeDst(outEdge);\n      NodeData& nodeData =\n          this->graph.getData(node, galois::MethodFlag::UNPROTECTED);\n\n      if ((!visited[nodeData.id]) && (nodeData.type == NodeType::AND)) {\n        genericTopologicalSortAnds(node, visited, sortedNodes);\n      }\n    }\n  }\n}\n\nvoid Aig::genericTopologicalSortAnds(GNode node, std::vector<bool>& visited,\n                                     std::vector<GNode>& sortedNodes) {\n\n  NodeData& nodeData   = graph.getData(node, galois::MethodFlag::UNPROTECTED);\n  visited[nodeData.id] = true;\n\n  for (auto outEdge : this->graph.out_edges(node)) {\n\n    GNode nextNode = this->graph.getEdgeDst(outEdge);\n    
NodeData& nextNodeData =\n        this->graph.getData(nextNode, galois::MethodFlag::UNPROTECTED);\n\n    if ((!visited[nextNodeData.id]) && (nextNodeData.type == NodeType::AND)) {\n      genericTopologicalSortAnds(nextNode, visited, sortedNodes);\n    }\n  }\n\n  sortedNodes.push_back(node);\n}\n\nstd::string Aig::toDot() {\n\n  // Preprocess PI, LATCH and PO names\n  std::unordered_map<int, std::string> piNames;\n  for (size_t i = 0; i < this->inputNodes.size(); i++) {\n    aig::NodeData& nodeData =\n        graph.getData(this->inputNodes[i], galois::MethodFlag::READ);\n    piNames.insert(std::make_pair(nodeData.id, this->inputNames[i]));\n  }\n\n  std::unordered_map<int, std::string> latchNames;\n  for (size_t i = 0; i < this->latchNodes.size(); i++) {\n    aig::NodeData& nodeData =\n        graph.getData(this->latchNodes[i], galois::MethodFlag::READ);\n    latchNames.insert(std::make_pair(nodeData.id, this->latchNames[i]));\n  }\n\n  std::unordered_map<int, std::string> poNames;\n  for (size_t i = 0; i < this->outputNodes.size(); i++) {\n    aig::NodeData& nodeData =\n        graph.getData(this->outputNodes[i], galois::MethodFlag::READ);\n    poNames.insert(std::make_pair(nodeData.id, this->outputNames[i]));\n  }\n\n  std::stringstream dot, inputs, latches, outputs, ands, edges;\n\n  for (auto node : this->graph) {\n\n    aig::NodeData& nodeData = graph.getData(node, galois::MethodFlag::READ);\n\n    // Write Edges\n    for (auto edge : graph.in_edges(node)) {\n      aig::GNode dstNode     = graph.getEdgeDst(edge);\n      aig::NodeData& dstData = graph.getData(dstNode, galois::MethodFlag::READ);\n      bool polarity = graph.getEdgeData(edge, galois::MethodFlag::READ);\n\n      std::string nodeName, dstName;\n\n      if (nodeData.type == NodeType::PI) {\n        nodeName = piNames[nodeData.id];\n      } else {\n        if (nodeData.type == NodeType::LATCH) {\n          nodeName = latchNames[nodeData.id];\n        } else {\n          if (nodeData.type == 
NodeType::PO) {\n            nodeName = poNames[nodeData.id];\n          } else {\n            nodeName = std::to_string(nodeData.id);\n          }\n        }\n      }\n\n      if (dstData.type == NodeType::PI) {\n        dstName = piNames[dstData.id];\n      } else {\n        if (dstData.type == NodeType::LATCH) {\n          dstName = latchNames[dstData.id];\n        } else {\n          if (dstData.type == NodeType::PO) {\n            dstName = poNames[dstData.id];\n          } else {\n            dstName = std::to_string(dstData.id);\n          }\n        }\n      }\n\n      edges << \"\\\"\" << dstName << \"\\\" -> \\\"\" << nodeName << \"\\\"\";\n\n      if (polarity) {\n        edges << \" [penwidth = 3, color=blue]\" << std::endl;\n      } else {\n        edges << \" [penwidth = 3, color=red, style=dashed]\" << std::endl;\n      }\n    }\n\n    if (nodeData.type == NodeType::PI) {\n      inputs << \"\\\"\" << piNames[nodeData.id] << \"\\\"\";\n      inputs << \" [shape=circle, height=1, width=1, penwidth=5 style=filled, \"\n                \"fillcolor=\\\"#ff8080\\\", fontsize=20]\"\n             << std::endl;\n      continue;\n    }\n\n    if (nodeData.type == NodeType::LATCH) {\n      latches << \"\\\"\" << latchNames[nodeData.id] << \"\\\"\";\n      latches << \" [shape=square, height=1, width=1, penwidth=5 style=filled, \"\n                 \"fillcolor=\\\"#ff8080\\\", fontsize=20]\"\n              << std::endl;\n      continue;\n    }\n\n    if (nodeData.type == NodeType::PO) {\n      outputs << \"\\\"\" << poNames[nodeData.id] << \"\\\"\";\n      outputs << \" [shape=circle, height=1, width=1, penwidth=5 style=filled, \"\n                 \"fillcolor=\\\"#008080\\\", fontsize=20]\"\n              << std::endl;\n      continue;\n    }\n\n    if (nodeData.type == NodeType::AND) {\n      ands << \"\\\"\" << nodeData.id << \"\\\"\";\n      ands << \" [shape=circle, height=1, width=1, penwidth=5 style=filled, \"\n              \"fillcolor=\\\"#ffffff\\\", 
fontsize=20]\"\n           << std::endl;\n    }\n  }\n\n  dot << \"digraph aig {\" << std::endl;\n  dot << \"ranksep=1.5;\" << std::endl;\n  dot << \"nodesep=1.5;\" << std::endl;\n  dot << inputs.str();\n  dot << latches.str();\n  dot << ands.str();\n  dot << outputs.str();\n  dot << edges.str();\n  dot << \"{ rank=source;\";\n  for (GNode node : this->inputNodes) {\n    aig::NodeData& nodeData = graph.getData(node, galois::MethodFlag::READ);\n    dot << \" \\\"\" << piNames[nodeData.id] << \"\\\"\";\n  }\n\n  for (GNode node : this->latchNodes) {\n    aig::NodeData& nodeData = graph.getData(node, galois::MethodFlag::READ);\n    dot << \" \\\"\" << latchNames[nodeData.id] << \"\\\"\";\n  }\n  dot << \" }\" << std::endl;\n\n  dot << \"{ rank=sink;\";\n  for (GNode node : this->outputNodes) {\n    aig::NodeData& nodeData = graph.getData(node, galois::MethodFlag::READ);\n    dot << \" \\\"\" << poNames[nodeData.id] << \"\\\"\";\n  }\n  dot << \" }\" << std::endl;\n\n  dot << \"rankdir=\\\"BT\\\"\" << std::endl;\n  dot << \"}\" << std::endl;\n\n  return dot.str();\n}\n\nvoid Aig::writeDot(std::string path, std::string dotText) {\n\n  std::ofstream dotFile;\n  dotFile.open(path);\n  dotFile << dotText;\n  dotFile.close();\n}\n\n} // namespace andInverterGraph\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/subjectgraph/aig/Aig.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef AIG_AIG_H_\n#define AIG_AIG_H_\n\n#include \"galois/Galois.h\"\n#include \"galois/runtime/Statistics.h\"\n#include \"galois/graphs/Morph_SepInOut_Graph.h\"\n\n#include <iostream>\n#include <vector>\n#include <stack>\n#include <set>\n#include <utility>\n#include <unordered_map>\n\nnamespace andInverterGraph {\n\nstruct NodeData;\n\n// Nodes hold a NodeData structure, edges hold a boolean value and are\n// directional with InOut distinction\ntypedef galois::graphs::Morph_SepInOut_Graph<NodeData, bool, true, true> Graph;\n\ntypedef Graph::GraphNode GNode;\n\nenum NodeType { AND, PI, PO, LATCH, CONSTZERO, CHOICE };\n\nstruct NodeData {\n  NodeType type;    // AIG node type acording to the NodeType enum\n  int id;           // AIG node identifier\n  int level;        // AIG node level\n  int counter;      // 
Counter used for controlling graph traversal\n  int nFanout;      // AIG node fanout counter\n  int nRefs;        // AIG node reference counter for tech mapping\n  float reqTime;    // AIG node required time for tech mapping\n  GNode choiceList; // Pointer to the first choice node, if it exists\n  // bool isCompl;\t\t\t// Mark is the output is complemented. It is used in\n  // choice nodes.\n\n  NodeData()\n      : level(0), counter(0), nFanout(0), nRefs(0),\n        reqTime(std::numeric_limits<float>::max()), choiceList(nullptr) {\n  } //, isCompl(false) {}\n};\n\nclass Aig {\n\nprivate:\n  Graph graph;\n  std::string designName;\n  std::vector<GNode> inputNodes;\n  std::vector<GNode> latchNodes;\n  std::vector<GNode> outputNodes;\n  std::vector<std::string> inputNames;\n  std::vector<std::string> latchNames;\n  std::vector<std::string> outputNames;\n  std::vector<GNode> nodes;\n  std::vector<std::pair<int, int>> nodesTravId;\n  std::vector<std::unordered_multimap<unsigned, GNode>> nodesFanoutMap;\n  size_t idCounter;\n  float expansionRate;\n\n  void topologicalSortAll(GNode node, std::vector<bool>& visited,\n                          std::stack<GNode>& stack);\n  void topologicalSortAnds(GNode node, std::vector<bool>& visited,\n                           std::stack<GNode>& stack);\n  void genericTopologicalSortAnds(GNode node, std::vector<bool>& visited,\n                                  std::vector<GNode>& sortedNodes);\n\npublic:\n  Aig();\n  Aig(float expansionRate);\n  virtual ~Aig();\n\n  void resize(int m, int i, int l, int o, bool hasSymbols);\n  void resizeNodeVectors(int size);\n  void expandNodeVectors(size_t extraSize);\n  size_t getNextId();\n  GNode createAND(GNode lhsAnd, GNode rhsAnd, bool lhsAndPol, bool rhsAndPol);\n\n  void insertNodeInFanoutMap(GNode andNode, GNode lhsNode, GNode rhsNode,\n                             bool lhsPol, bool rhsPol);\n  void removeNodeInFanoutMap(GNode removedNode, GNode lhsNode, GNode rhsNode,\n                    
         bool lhsPol, bool rhsPol);\n  GNode lookupNodeInFanoutMap(GNode lhsNode, GNode rhsNode, bool lhsPol,\n                              bool rhsPol);\n  unsigned makeAndHashKey(int lhsId, int rhsId, bool lhsPol, bool rhsPol);\n\n  void registerTravId(int nodeId, int threadId, int travId);\n  bool lookupTravId(int nodeId, int threadId, int travId);\n\n  std::vector<std::pair<int, int>>& getNodesTravId();\n  std::unordered_multimap<unsigned, GNode>& getFanoutMap(int nodeId);\n  std::vector<std::unordered_multimap<unsigned, GNode>>& getNodesFanoutMap();\n  Graph& getGraph();\n  std::vector<GNode>& getNodes();\n  std::vector<GNode>& getInputNodes();\n  std::vector<GNode>& getLatchNodes();\n  std::vector<GNode>& getOutputNodes();\n  std::vector<std::string>& getInputNames();\n  void setInputNames(std::vector<std::string> inputNames);\n  std::vector<std::string>& getLatchNames();\n  void setLatchNames(std::vector<std::string> latchNames);\n  std::vector<std::string>& getOutputNames();\n  void setOutputNames(std::vector<std::string> outputNames);\n  GNode getConstZero();\n  int getNumInputs();\n  int getNumLatches();\n  int getNumOutputs();\n  int getNumAnds();\n  int getDepth();\n  std::string getDesignName();\n  void setDesignName(std::string designName);\n\n  // bool isGNodeComplemented(GNode node);\n  // GNode makeGNodeRegular(GNode node);\n  // GNode makeGNodeComplemented(GNode node);\n\n  void resetAndIds();\n  void resetAndPIsIds();\n  void resetAndPOsIds();\n  void resetAllIds();\n\n  void resetAllNodeCounters();\n\n  void computeTopologicalSortForAll(std::stack<GNode>& stack);\n  void computeTopologicalSortForAnds(std::stack<GNode>& stack);\n  void computeGenericTopologicalSortForAnds(std::vector<GNode>& sortedNodes);\n\n  void writeDot(std::string path, std::string dotText);\n  std::string toDot();\n};\n\n} // namespace andInverterGraph\n\nnamespace aig = andInverterGraph;\n\n#endif /* AIG_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/writers/AigWriter.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"AigWriter.h\"\n#include \"../util/utilString.h\"\n\nAigWriter::AigWriter() {}\n\nAigWriter::AigWriter(std::string path) { setFile(path); }\n\nAigWriter::~AigWriter() { aigerFile.close(); }\n\nvoid AigWriter::setFile(std::string path) {\n  this->path = path;\n  aigerFile.close();\n  aigerFile.open(path.c_str(), std::ios::trunc);\n}\n\nbool AigWriter::isOpen() { return aigerFile.is_open(); }\n\nvoid AigWriter::writeAag(Aig& aig) {\n  aig.resetAndIds();\n  writeAagHeader(aig);\n  writeInputs(aig);\n  writeLatchesAag(aig);\n  writeOutputs(aig);\n  writeAndsAag(aig);\n  writeSymbolTable(aig);\n}\n\nvoid AigWriter::writeAagHeader(Aig& aig) {\n  int i = aig.getNumInputs();\n  int l = aig.getNumLatches();\n  int o = aig.getNumOutputs();\n  int a = aig.getNumAnds();\n  int m = i + l + a;\n  aigerFile << \"aag \" << 
m << \" \" << i << \" \" << l << \" \" << o << \" \" << a\n            << std::endl;\n}\n\nvoid AigWriter::writeInputs(Aig& aig) {\n\n  aig::Graph& graph = aig.getGraph();\n\n  for (auto input : aig.getInputNodes()) {\n    aig::NodeData& inputData = graph.getData(input, galois::MethodFlag::READ);\n    aigerFile << inputData.id * 2 << std::endl;\n  }\n}\n\nvoid AigWriter::writeLatchesAag(Aig& aig) {\n\n  aig::Graph& aigGraph = aig.getGraph();\n\n  for (aig::GNode latchNode : aig.getLatchNodes()) {\n    aig::NodeData& latchNodeData =\n        aigGraph.getData(latchNode, galois::MethodFlag::READ);\n    aigerFile << latchNodeData.id * 2 << \" \";\n    auto inEdge         = aigGraph.in_edge_begin(latchNode);\n    bool inEdgePolarity = aigGraph.getEdgeData(inEdge);\n    aig::GNode inNode   = aigGraph.getEdgeDst(inEdge);\n    aig::NodeData& inNodeData =\n        aigGraph.getData(inNode, galois::MethodFlag::READ);\n    // bool initState = latchNode->getInitialValue(); // FIXME;\n    if (inEdgePolarity) {\n      aigerFile << inNodeData.id * 2 << std::endl;\n      // aigerFile << inNodeData.id << \" \" << initState << std::endl;\n    } else {\n      aigerFile << (inNodeData.id * 2) + 1 << std::endl;\n      // aigerFile << inNodeData.id + 1 << \" \" << initState << std::endl;\n    }\n  }\n}\n\nvoid AigWriter::writeOutputs(Aig& aig) {\n\n  aig::Graph& graph = aig.getGraph();\n\n  for (auto output : aig.getOutputNodes()) {\n\n    auto inEdge         = graph.in_edge_begin(output);\n    bool inEdgePolarity = graph.getEdgeData(inEdge, galois::MethodFlag::READ);\n    aig::GNode inNode   = graph.getEdgeDst(inEdge);\n    aig::NodeData& inNodeData = graph.getData(inNode, galois::MethodFlag::READ);\n    if (inEdgePolarity) {\n      aigerFile << inNodeData.id * 2 << std::endl;\n    } else {\n      aigerFile << (inNodeData.id * 2) + 1 << std::endl;\n    }\n  }\n}\n\nvoid AigWriter::writeAndsAag(Aig& aig) {\n\n  std::stack<aig::GNode> stack;\n\n  
aig.computeTopologicalSortForAnds(stack);\n\n  // std::cout << \"size: \" << aig.getNumAnds() << std::endl;\n  // std::cout << \"stack size: \" << stack.size() << std::endl;\n\n  aig::Graph& graph = aig.getGraph();\n\n  // unsigned int currentID = aig.getNumInputs() + aig.getNumLatches() + 1;\n\n  while (!stack.empty()) {\n\n    aig::GNode node = stack.top();\n    stack.pop();\n\n    aig::NodeData& nodeData = graph.getData(node, galois::MethodFlag::WRITE);\n    // std::cout << nodeData.id << \" -> \";\n    // nodeData.id = currentID++; // Redefines the AND IDs according to the\n    // topological sorting. std::cout << nodeData.id << std::endl;\n\n    unsigned int andIndex = nodeData.id * 2;\n\n    auto inEdge        = graph.in_edge_begin(node);\n    bool lhsPolarity   = graph.getEdgeData(inEdge, galois::MethodFlag::READ);\n    aig::GNode lhsNode = graph.getEdgeDst(inEdge);\n    aig::NodeData& lhsNodeData =\n        graph.getData(lhsNode, galois::MethodFlag::READ);\n    unsigned int lhsIndex = lhsNodeData.id * 2;\n    lhsIndex              = lhsPolarity ? lhsIndex : (lhsIndex + 1);\n\n    inEdge++;\n    bool rhsPolarity   = graph.getEdgeData(inEdge, galois::MethodFlag::READ);\n    aig::GNode rhsNode = graph.getEdgeDst(inEdge);\n    aig::NodeData& rhsNodeData =\n        graph.getData(rhsNode, galois::MethodFlag::READ);\n    unsigned int rhsIndex = rhsNodeData.id * 2;\n    rhsIndex              = rhsPolarity ? 
rhsIndex : (rhsIndex + 1);\n\n    if (lhsIndex < rhsIndex) {\n      std::swap(lhsIndex, rhsIndex);\n    }\n\n    aigerFile << andIndex << \" \" << lhsIndex << \" \" << rhsIndex << std::endl;\n  }\n}\n\nvoid AigWriter::writeAig(Aig& aig) {\n  aig.resetAndIds();\n  writeAigHeader(aig);\n  writeLatchesAig(aig);\n  writeOutputs(aig);\n  writeAndsAig(aig);\n  writeSymbolTable(aig);\n}\n\nvoid AigWriter::writeAigHeader(Aig& aig) {\n  int i = aig.getNumInputs();\n  int l = aig.getNumLatches();\n  int o = aig.getNumOutputs();\n  int a = aig.getNumAnds();\n  int m = i + l + a;\n  aigerFile << \"aig \" << m << \" \" << i << \" \" << l << \" \" << o << \" \" << a\n            << std::endl;\n}\n\nvoid AigWriter::writeLatchesAig(Aig& aig) {\n\n  aig::Graph& aigGraph = aig.getGraph();\n\n  for (aig::GNode latchNode : aig.getLatchNodes()) {\n    auto inEdge         = aigGraph.in_edge_begin(latchNode);\n    bool inEdgePolarity = aigGraph.getEdgeData(inEdge);\n    aig::GNode inNode   = aigGraph.getEdgeDst(inEdge);\n    aig::NodeData& inNodeData =\n        aigGraph.getData(inNode, galois::MethodFlag::READ);\n    // bool initState = latchNode->getInitialValue(); // FIXME;\n    if (inEdgePolarity) {\n      aigerFile << inNodeData.id * 2 << std::endl;\n      // aigerFile << inNodeData.id + 1 << \" \" << initState << std::endl;\n    } else {\n      aigerFile << (inNodeData.id * 2) + 1 << std::endl;\n      // aigerFile << inNodeData.id + 1 << \" \" << initState << std::endl;\n    }\n  }\n}\n\nvoid AigWriter::writeAndsAig(Aig& aig) {\n\n  std::stack<aig::GNode> stack;\n\n  aig.computeTopologicalSortForAnds(stack);\n\n  aig::Graph& graph = aig.getGraph();\n\n  // unsigned int currentID = aig.getNumInputs() + aig.getNumLatches() + 1;\n\n  while (!stack.empty()) {\n\n    aig::GNode node = stack.top();\n    stack.pop();\n\n    aig::NodeData& nodeData = graph.getData(node, galois::MethodFlag::WRITE);\n    // std::cout << nodeData.id << \" -> \";\n    // nodeData.id = currentID++; // Redefines 
the AND IDs according to the\n    // topological sorting. std::cout << nodeData.id << std::endl;\n\n    unsigned int andIndex = nodeData.id * 2;\n\n    auto inEdge        = graph.in_edge_begin(node);\n    bool lhsPolarity   = graph.getEdgeData(inEdge, galois::MethodFlag::READ);\n    aig::GNode lhsNode = graph.getEdgeDst(inEdge);\n    aig::NodeData& lhsNodeData =\n        graph.getData(lhsNode, galois::MethodFlag::READ);\n    unsigned int lhsIndex = lhsNodeData.id * 2;\n    lhsIndex              = lhsPolarity ? lhsIndex : (lhsIndex + 1);\n\n    inEdge++;\n    bool rhsPolarity   = graph.getEdgeData(inEdge, galois::MethodFlag::READ);\n    aig::GNode rhsNode = graph.getEdgeDst(inEdge);\n    aig::NodeData& rhsNodeData =\n        graph.getData(rhsNode, galois::MethodFlag::READ);\n    unsigned int rhsIndex = rhsNodeData.id * 2;\n    rhsIndex              = rhsPolarity ? rhsIndex : (rhsIndex + 1);\n\n    if (lhsIndex < rhsIndex) {\n      std::swap(lhsIndex, rhsIndex);\n    }\n\n    encode(andIndex - lhsIndex);\n    encode(lhsIndex - rhsIndex);\n  }\n}\n\nvoid AigWriter::writeSymbolTable(Aig& aig) {\n\n  int i = 0;\n  for (auto inputName : aig.getInputNames()) {\n    aigerFile << \"i\" << i++ << \" \" << inputName << std::endl;\n  }\n\n  i = 0;\n  for (auto latchName : aig.getLatchNames()) {\n    aigerFile << \"l\" << i++ << \" \" << latchName << std::endl;\n  }\n\n  i = 0;\n  for (auto outputName : aig.getOutputNames()) {\n    aigerFile << \"o\" << i++ << \" \" << outputName << std::endl;\n  }\n\n  aigerFile << \"c\" << std::endl << aig.getDesignName() << std::endl;\n}\n\nvoid AigWriter::encode(unsigned x) {\n  unsigned char ch;\n  while (x & ~0x7f) {\n    ch = (x & 0x7f) | 0x80;\n    aigerFile.put(ch);\n    x >>= 7;\n  }\n  ch = x;\n  aigerFile.put(ch);\n}\n\nvoid AigWriter::close() { aigerFile.close(); }\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/writers/AigWriter.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef AIGWRITER_H_\n#define AIGWRITER_H_\n\n#include <fstream>\n#include <iostream>\n#include <string>\n\n#include \"../subjectgraph/aig/Aig.h\"\n#include \"galois/Galois.h\"\n\ntypedef aig::Aig Aig;\n\nclass AigWriter {\n\nprivate:\n  std::ofstream aigerFile;\n  std::string path;\n\n  void writeAagHeader(Aig& aig);\n  void writeLatchesAag(Aig& aig);\n  void writeAndsAag(Aig& aig);\n\n  void writeAigHeader(Aig& aig);\n  void writeLatchesAig(Aig& aig);\n  void writeAndsAig(Aig& aig);\n\n  void writeInputs(Aig& aig);\n  void writeOutputs(Aig& aig);\n  void writeSymbolTable(Aig& aig);\n\n  void encode(unsigned x);\n\npublic:\n  AigWriter();\n  AigWriter(std::string path);\n  virtual ~AigWriter();\n\n  void setFile(std::string path);\n  bool isOpen();\n  void close();\n\n  void writeAag(Aig& aig);\n  void writeAig(Aig& 
aig);\n};\n\n#endif /* AIGWRITER_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/writers/BlifWriter.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Blif format writer, October 17, 2018.\n\n*/\n\n#include \"BlifWriter.h\"\n#include \"../util/utilString.h\"\n#include <unordered_set>\n\nBlifWriter::BlifWriter() {}\n\nBlifWriter::BlifWriter(std::string path) { setFile(path); }\n\nBlifWriter::~BlifWriter() { blifFile.close(); }\n\nvoid BlifWriter::setFile(std::string path) {\n  this->path = path;\n  blifFile.close();\n  blifFile.open(path.c_str(), std::ios::trunc);\n}\n\nbool BlifWriter::isOpen() { return blifFile.is_open(); }\n\nvoid BlifWriter::close() { blifFile.close(); }\n\nvoid BlifWriter::writeNetlist(aig::Aig& aig, algorithm::PriCutManager& cutMan) {\n\n  aig::GNode leaf;\n  aig::Graph& aigGraph   = aig.getGraph();\n  std::string designName = aig.getDesignName();\n  find_and_replace(designName, \" \", \"_\");\n  int nDigitsPIs = 
countDigits(aig.getInputNodes().size() - 1);\n  int nDigitsPOs = countDigits(aig.getOutputNodes().size() - 1);\n\n  this->blifFile << \".model \" << designName << std::endl;\n\n  this->blifFile << \".inputs \";\n  for (size_t i = 0; i < aig.getInputNodes().size(); i++) {\n    this->blifFile << \"pi\" << std::setfill('0') << std::setw(nDigitsPIs) << i\n                   << \" \";\n  }\n  this->blifFile << std::endl;\n\n  this->blifFile << \".outputs \";\n  for (size_t i = 0; i < aig.getOutputNodes().size(); i++) {\n    this->blifFile << \"po\" << std::setfill('0') << std::setw(nDigitsPOs) << i\n                   << \" \";\n  }\n  this->blifFile << std::endl;\n\n  for (auto entry : cutMan.getCovering()) {\n    this->blifFile << \".names\";\n\n    if (Functional32::isConstZero(cutMan.readTruth(entry.second),\n                                  cutMan.getK())) {\n      // Output\n      this->blifFile << \" n\" << entry.first << std::endl << \"0\" << std::endl;\n      continue;\n    }\n\n    if (Functional32::isConstOne(cutMan.readTruth(entry.second),\n                                 cutMan.getK())) {\n      // Output\n      this->blifFile << \" n\" << entry.first << std::endl << \"1\" << std::endl;\n      continue;\n    }\n\n    // Inputs\n    for (int i = 0; i < entry.second->nLeaves; i++) {\n\n      leaf                    = aig.getNodes()[entry.second->leaves[i]];\n      aig::NodeData& leafData = aigGraph.getData(leaf);\n\n      if (leafData.type == aig::NodeType::PI) {\n        this->blifFile << \" pi\" << std::setfill('0') << std::setw(nDigitsPIs)\n                       << (leafData.id - 1);\n      } else {\n        this->blifFile << \" n\" << leafData.id;\n      }\n    }\n    // Output\n    this->blifFile << \" n\" << entry.first << std::endl;\n    // Cubes\n    this->blifFile << Functional32::toCubeString(cutMan.readTruth(entry.second),\n                                                 cutMan.getNWords(),\n                                                 
entry.second->nLeaves);\n  }\n\n  // Define PO polarities\n  for (size_t i = 0; i < aig.getOutputNodes().size(); i++) {\n\n    auto inEdgeIt = aigGraph.in_edge_begin(aig.getOutputNodes()[i]);\n    bool outEdgePolarity =\n        aigGraph.getEdgeData(inEdgeIt, galois::MethodFlag::READ);\n    aig::GNode inNode         = aigGraph.getEdgeDst(inEdgeIt);\n    aig::NodeData& inNodeData = aigGraph.getData(inNode);\n\n    if (inNodeData.type == aig::NodeType::PI) {\n      this->blifFile << \".names pi\" << std::setfill('0')\n                     << std::setw(nDigitsPIs) << (inNodeData.id - 1);\n      this->blifFile << \" po\" << std::setfill('0') << std::setw(nDigitsPOs) << i\n                     << std::endl;\n      this->blifFile << ((outEdgePolarity == true) ? \"1 1\" : \"0 1\")\n                     << std::endl;\n    } else {\n      if (inNodeData.type == aig::NodeType::CONSTZERO) {\n        this->blifFile << \".names \"\n                       << \" po\" << std::setfill('0');\n        this->blifFile << std::setw(nDigitsPOs) << i << std::endl;\n        this->blifFile << ((outEdgePolarity == true) ? \"0\" : \"1\") << std::endl;\n      } else {\n        this->blifFile << \".names n\" << inNodeData.id;\n        this->blifFile << \" po\" << std::setfill('0') << std::setw(nDigitsPOs)\n                       << i << std::endl;\n        this->blifFile << ((outEdgePolarity == true) ? \"1 1\" : \"0 1\")\n                       << std::endl;\n      }\n    }\n  }\n\n  this->blifFile << \".end\" << std::endl;\n}\n\nint BlifWriter::countDigits(int n) {\n\n  int nDigits = 0;\n  while (n) {\n    n = n / 10;\n    nDigits++;\n  }\n  return nDigits;\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/writers/BlifWriter.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n\n @Vinicius Possani\n Blif format writer, October 17, 2018.\n\n*/\n\n#ifndef BLIFWRITER_H_\n#define BLIFWRITER_H_\n\n#include <fstream>\n#include <iostream>\n#include <string>\n#include <unordered_map>\n\n#include \"../subjectgraph/aig/Aig.h\"\n#include \"../algorithms/PriorityCutManager.h\"\n#include \"galois/Galois.h\"\n\nclass BlifWriter {\n\nprivate:\n  std::ofstream blifFile;\n  std::string path;\n\npublic:\n  BlifWriter();\n  BlifWriter(std::string path);\n  ~BlifWriter();\n\n  void setFile(std::string path);\n  bool isOpen();\n  void close();\n  int countDigits(int n);\n\n  void writeNetlist(aig::Aig& aig, algorithm::PriCutManager& cutMan);\n};\n\n#endif /* BLIFWRITER_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/xxHash/xxhash.c",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n *  xxHash - Fast Hash algorithm\n *  Copyright (C) 2012-2016, Yann Collet\n *\n *  BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)\n *\n *  Redistribution and use in source and binary forms, with or without\n *  modification, are permitted provided that the following conditions are\n *  met:\n *\n *  * Redistributions of source code must retain the above copyright\n *  notice, this list of conditions and the following disclaimer.\n *  * Redistributions in binary form must reproduce the above\n *  copyright notice, this list of conditions and the following disclaimer\n *  in the documentation and/or other materials provided with the\n *  distribution.\n *\n *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n *  \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, 
INCLUDING, BUT NOT\n *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n *\n *  You can contact the author at :\n *  - xxHash homepage: http://www.xxhash.com\n *  - xxHash source repository : https://github.com/Cyan4973/xxHash\n */\n\n/* *************************************\n *  Tuning parameters\n ***************************************/\n/*!XXH_FORCE_MEMORY_ACCESS :\n * By default, access to unaligned memory is controlled by `memcpy()`, which is\n * safe and portable. Unfortunately, on some target/compiler combinations, the\n * generated assembly is sub-optimal. The below switch allow to select different\n * access method for improved performance. Method 0 (default) : use `memcpy()`.\n * Safe and portable. Method 1 : `__packed` statement. It depends on compiler\n * extension (ie, not portable). This method is safe if your compiler supports\n * it, and *generally* as fast or faster than `memcpy`. Method 2 : direct\n * access. This method doesn't depend on compiler but violate C standard. It can\n * generate buggy code on targets which do not support unaligned memory\n * accesses. But in some circumstances, it's the only known way to get the most\n * performance (ie GCC + ARMv6) See http://stackoverflow.com/a/32095106/646947\n * for details. 
Prefer these methods in priority order (0 > 1 > 2)\n */\n#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line  \\\n                                   for example */\n#if defined(__GNUC__) &&                                                       \\\n    (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) ||                    \\\n     defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) ||                   \\\n     defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__))\n#define XXH_FORCE_MEMORY_ACCESS 2\n#elif defined(__INTEL_COMPILER) ||                                             \\\n    (defined(__GNUC__) &&                                                      \\\n     (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) ||                   \\\n      defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) ||                  \\\n      defined(__ARM_ARCH_7S__)))\n#define XXH_FORCE_MEMORY_ACCESS 1\n#endif\n#endif\n\n/*!XXH_ACCEPT_NULL_INPUT_POINTER :\n * If the input pointer is a null pointer, xxHash default behavior is to trigger\n * a memory access error, since it is a bad pointer. When this option is\n * enabled, xxHash output for null input pointers will be the same as a\n * null-length input. By default, this option is disabled. To enable it,\n * uncomment below define :\n */\n/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */\n\n/*!XXH_FORCE_NATIVE_FORMAT :\n * By default, xxHash library provides endian-independent Hash values, based on\n * little-endian convention. Results are therefore identical for little-endian\n * and big-endian CPU. This comes at a performance cost for big-endian CPU,\n * since some swapping is required to emulate little-endian format. Should\n * endian-independence be of no importance for your application, you may set the\n * #define below to 1, to improve speed for Big-endian CPU. 
This option has no\n * impact on Little_Endian CPU.\n */\n#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */\n#define XXH_FORCE_NATIVE_FORMAT 0\n#endif\n\n/*!XXH_FORCE_ALIGN_CHECK :\n * This is a minor performance trick, only useful with lots of very small keys.\n * It means : check for aligned/unaligned input.\n * The check costs one initial branch per hash;\n * set it to 0 when the input is guaranteed to be aligned,\n * or when alignment doesn't matter for performance.\n */\n#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */\n#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) ||              \\\n    defined(_M_X64)\n#define XXH_FORCE_ALIGN_CHECK 0\n#else\n#define XXH_FORCE_ALIGN_CHECK 1\n#endif\n#endif\n\n/* *************************************\n *  Includes & Memory related functions\n ***************************************/\n/*! Modify the local functions below should you wish to use some other memory\n * routines for malloc(), free() */\n#include <stdlib.h>\nstatic void* XXH_malloc(size_t s) { return malloc(s); }\nstatic void XXH_free(void* p) { free(p); }\n/*! 
and for memcpy() */\n#include <string.h>\nstatic void* XXH_memcpy(void* dest, const void* src, size_t size) {\n  return memcpy(dest, src, size);\n}\n\n#define XXH_STATIC_LINKING_ONLY\n#include \"xxhash.h\"\n\n/* *************************************\n *  Compiler Specific Options\n ***************************************/\n#ifdef _MSC_VER /* Visual Studio */\n#pragma warning(                                                               \\\n    disable : 4127) /* disable: C4127: conditional expression is constant */\n#define FORCE_INLINE static __forceinline\n#else\n#if defined(__cplusplus) ||                                                    \\\n    defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */\n#ifdef __GNUC__\n#define FORCE_INLINE static inline __attribute__((always_inline))\n#else\n#define FORCE_INLINE static inline\n#endif\n#else\n#define FORCE_INLINE static\n#endif /* __STDC_VERSION__ */\n#endif\n\n/* *************************************\n *  Basic Types\n ***************************************/\n#ifndef MEM_MODULE\n#if !defined(__VMS) &&                                                         \\\n    (defined(__cplusplus) ||                                                   \\\n     (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))\n#include <stdint.h>\ntypedef uint8_t BYTE;\ntypedef uint16_t U16;\ntypedef uint32_t U32;\n#else\ntypedef unsigned char BYTE;\ntypedef unsigned short U16;\ntypedef unsigned int U32;\n#endif\n#endif\n\n#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))\n\n/* Force direct memory access. 
Only works on CPU which support unaligned memory\n * access in hardware */\nstatic U32 XXH_read32(const void* memPtr) { return *(const U32*)memPtr; }\n\n#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))\n\n/* __pack instructions are safer, but compiler specific, hence potentially\n * problematic for some compilers */\n/* currently only defined for gcc and icc */\ntypedef union {\n  U32 u32;\n} __attribute__((packed)) unalign;\nstatic U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }\n\n#else\n\n/* portable and safe solution. Generally efficient.\n * see : http://stackoverflow.com/a/32095106/646947\n */\nstatic U32 XXH_read32(const void* memPtr) {\n  U32 val;\n  memcpy(&val, memPtr, sizeof(val));\n  return val;\n}\n\n#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */\n\n/* ****************************************\n *  Compiler-specific Functions and Macros\n ******************************************/\n#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)\n\n/* Note : although _rotl exists for minGW (GCC under windows), performance seems\n * poor */\n#if defined(_MSC_VER)\n#define XXH_rotl32(x, r) _rotl(x, r)\n#define XXH_rotl64(x, r) _rotl64(x, r)\n#else\n#define XXH_rotl32(x, r) ((x << r) | (x >> (32 - r)))\n#define XXH_rotl64(x, r) ((x << r) | (x >> (64 - r)))\n#endif\n\n#if defined(_MSC_VER) /* Visual Studio */\n#define XXH_swap32 _byteswap_ulong\n#elif XXH_GCC_VERSION >= 403\n#define XXH_swap32 __builtin_bswap32\n#else\nstatic U32 XXH_swap32(U32 x) {\n  return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) |\n         ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff);\n}\n#endif\n\n/* *************************************\n *  Architecture Macros\n ***************************************/\ntypedef enum { XXH_bigEndian = 0, XXH_littleEndian = 1 } XXH_endianess;\n\n/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler\n * command line */\n#ifndef XXH_CPU_LITTLE_ENDIAN\nstatic const int 
g_one = 1;\n#define XXH_CPU_LITTLE_ENDIAN (*(const char*)(&g_one))\n#endif\n\n/* ***************************\n *  Memory reads\n *****************************/\ntypedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;\n\nFORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian,\n                                    XXH_alignment align) {\n  if (align == XXH_unaligned)\n    return endian == XXH_littleEndian ? XXH_read32(ptr)\n                                      : XXH_swap32(XXH_read32(ptr));\n  else\n    return endian == XXH_littleEndian ? *(const U32*)ptr\n                                      : XXH_swap32(*(const U32*)ptr);\n}\n\nFORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) {\n  return XXH_readLE32_align(ptr, endian, XXH_unaligned);\n}\n\nstatic U32 XXH_readBE32(const void* ptr) {\n  return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);\n}\n\n/* *************************************\n *  Macros\n ***************************************/\n#define XXH_STATIC_ASSERT(c)                                                   \\\n  {                                                                            \\\n    enum { XXH_static_assert = 1 / (int)(!!(c)) };                             \\\n  } /* use only *after* variable declarations */\nXXH_PUBLIC_API unsigned XXH_versionNumber(void) { return XXH_VERSION_NUMBER; }\n\n/* *******************************************************************\n *  32-bits hash functions\n *********************************************************************/\nstatic const U32 PRIME32_1 = 2654435761U;\nstatic const U32 PRIME32_2 = 2246822519U;\nstatic const U32 PRIME32_3 = 3266489917U;\nstatic const U32 PRIME32_4 = 668265263U;\nstatic const U32 PRIME32_5 = 374761393U;\n\nstatic U32 XXH32_round(U32 seed, U32 input) {\n  seed += input * PRIME32_2;\n  seed = XXH_rotl32(seed, 13);\n  seed *= PRIME32_1;\n  return seed;\n}\n\nFORCE_INLINE U32 XXH32_endian_align(const void* input, 
size_t len, U32 seed,\n                                    XXH_endianess endian, XXH_alignment align) {\n  const BYTE* p    = (const BYTE*)input;\n  const BYTE* bEnd = p + len;\n  U32 h32;\n#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)\n\n#ifdef XXH_ACCEPT_NULL_INPUT_POINTER\n  if (p == NULL) {\n    len  = 0;\n    bEnd = p = (const BYTE*)(size_t)16;\n  }\n#endif\n\n  if (len >= 16) {\n    const BYTE* const limit = bEnd - 16;\n    U32 v1                  = seed + PRIME32_1 + PRIME32_2;\n    U32 v2                  = seed + PRIME32_2;\n    U32 v3                  = seed + 0;\n    U32 v4                  = seed - PRIME32_1;\n\n    do {\n      v1 = XXH32_round(v1, XXH_get32bits(p));\n      p += 4;\n      v2 = XXH32_round(v2, XXH_get32bits(p));\n      p += 4;\n      v3 = XXH32_round(v3, XXH_get32bits(p));\n      p += 4;\n      v4 = XXH32_round(v4, XXH_get32bits(p));\n      p += 4;\n    } while (p <= limit);\n\n    h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) +\n          XXH_rotl32(v4, 18);\n  } else {\n    h32 = seed + PRIME32_5;\n  }\n\n  h32 += (U32)len;\n\n  while (p + 4 <= bEnd) {\n    h32 += XXH_get32bits(p) * PRIME32_3;\n    h32 = XXH_rotl32(h32, 17) * PRIME32_4;\n    p += 4;\n  }\n\n  while (p < bEnd) {\n    h32 += (*p) * PRIME32_5;\n    h32 = XXH_rotl32(h32, 11) * PRIME32_1;\n    p++;\n  }\n\n  h32 ^= h32 >> 15;\n  h32 *= PRIME32_2;\n  h32 ^= h32 >> 13;\n  h32 *= PRIME32_3;\n  h32 ^= h32 >> 16;\n\n  return h32;\n}\n\nXXH_PUBLIC_API unsigned int XXH32(const void* input, size_t len,\n                                  unsigned int seed) {\n#if 0\n    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */\n    XXH32_state_t state;\n    XXH32_reset(&state, seed);\n    XXH32_update(&state, input, len);\n    return XXH32_digest(&state);\n#else\n  XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;\n\n  if (XXH_FORCE_ALIGN_CHECK) {\n    if ((((size_t)input) & 3) ==\n        0) { /* 
Input is 4-bytes aligned, leverage the speed benefit */\n      if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)\n        return XXH32_endian_align(input, len, seed, XXH_littleEndian,\n                                  XXH_aligned);\n      else\n        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);\n    }\n  }\n\n  if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)\n    return XXH32_endian_align(input, len, seed, XXH_littleEndian,\n                              XXH_unaligned);\n  else\n    return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);\n#endif\n}\n\n/*======   Hash streaming   ======*/\n\nXXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) {\n  return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));\n}\nXXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) {\n  XXH_free(statePtr);\n  return XXH_OK;\n}\n\nXXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState,\n                                    const XXH32_state_t* srcState) {\n  memcpy(dstState, srcState, sizeof(*dstState));\n}\n\nXXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr,\n                                         unsigned int seed) {\n  XXH32_state_t state; /* using a local state to memcpy() in order to avoid\n                          strict-aliasing warnings */\n  memset(&state, 0,\n         sizeof(state) -\n             4); /* do not write into reserved, for future removal */\n  state.v1 = seed + PRIME32_1 + PRIME32_2;\n  state.v2 = seed + PRIME32_2;\n  state.v3 = seed + 0;\n  state.v4 = seed - PRIME32_1;\n  memcpy(statePtr, &state, sizeof(state));\n  return XXH_OK;\n}\n\nFORCE_INLINE XXH_errorcode XXH32_update_endian(XXH32_state_t* state,\n                                               const void* input, size_t len,\n                                               XXH_endianess endian) {\n  const BYTE* p          = (const BYTE*)input;\n  const BYTE* const bEnd = p + 
len;\n\n#ifdef XXH_ACCEPT_NULL_INPUT_POINTER\n  if (input == NULL)\n    return XXH_ERROR;\n#endif\n\n  state->total_len_32 += (unsigned)len;\n  state->large_len |= (len >= 16) | (state->total_len_32 >= 16);\n\n  if (state->memsize + len < 16) { /* fill in tmp buffer */\n    XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);\n    state->memsize += (unsigned)len;\n    return XXH_OK;\n  }\n\n  if (state->memsize) { /* some data left from previous update */\n    XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input,\n               16 - state->memsize);\n    {\n      const U32* p32 = state->mem32;\n      state->v1      = XXH32_round(state->v1, XXH_readLE32(p32, endian));\n      p32++;\n      state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian));\n      p32++;\n      state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian));\n      p32++;\n      state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian));\n    }\n    p += 16 - state->memsize;\n    state->memsize = 0;\n  }\n\n  if (p <= bEnd - 16) {\n    const BYTE* const limit = bEnd - 16;\n    U32 v1                  = state->v1;\n    U32 v2                  = state->v2;\n    U32 v3                  = state->v3;\n    U32 v4                  = state->v4;\n\n    do {\n      v1 = XXH32_round(v1, XXH_readLE32(p, endian));\n      p += 4;\n      v2 = XXH32_round(v2, XXH_readLE32(p, endian));\n      p += 4;\n      v3 = XXH32_round(v3, XXH_readLE32(p, endian));\n      p += 4;\n      v4 = XXH32_round(v4, XXH_readLE32(p, endian));\n      p += 4;\n    } while (p <= limit);\n\n    state->v1 = v1;\n    state->v2 = v2;\n    state->v3 = v3;\n    state->v4 = v4;\n  }\n\n  if (p < bEnd) {\n    XXH_memcpy(state->mem32, p, (size_t)(bEnd - p));\n    state->memsize = (unsigned)(bEnd - p);\n  }\n\n  return XXH_OK;\n}\n\nXXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* state_in,\n                                          const void* input, size_t len) {\n  XXH_endianess endian_detected = 
(XXH_endianess)XXH_CPU_LITTLE_ENDIAN;\n\n  if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)\n    return XXH32_update_endian(state_in, input, len, XXH_littleEndian);\n  else\n    return XXH32_update_endian(state_in, input, len, XXH_bigEndian);\n}\n\nFORCE_INLINE U32 XXH32_digest_endian(const XXH32_state_t* state,\n                                     XXH_endianess endian) {\n  const BYTE* p          = (const BYTE*)state->mem32;\n  const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;\n  U32 h32;\n\n  if (state->large_len) {\n    h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) +\n          XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);\n  } else {\n    h32 = state->v3 /* == seed */ + PRIME32_5;\n  }\n\n  h32 += state->total_len_32;\n\n  while (p + 4 <= bEnd) {\n    h32 += XXH_readLE32(p, endian) * PRIME32_3;\n    h32 = XXH_rotl32(h32, 17) * PRIME32_4;\n    p += 4;\n  }\n\n  while (p < bEnd) {\n    h32 += (*p) * PRIME32_5;\n    h32 = XXH_rotl32(h32, 11) * PRIME32_1;\n    p++;\n  }\n\n  h32 ^= h32 >> 15;\n  h32 *= PRIME32_2;\n  h32 ^= h32 >> 13;\n  h32 *= PRIME32_3;\n  h32 ^= h32 >> 16;\n\n  return h32;\n}\n\nXXH_PUBLIC_API unsigned int XXH32_digest(const XXH32_state_t* state_in) {\n  XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;\n\n  if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)\n    return XXH32_digest_endian(state_in, XXH_littleEndian);\n  else\n    return XXH32_digest_endian(state_in, XXH_bigEndian);\n}\n\n/*======   Canonical representation   ======*/\n\n/*! Default XXH result types are basic unsigned 32 and 64 bits.\n *   The canonical representation follows human-readable write convention, aka\n * big-endian (large digits first). These functions allow transformation of hash\n * result into and from its canonical format. 
This way, hash values can be\n * written into a file or buffer, and remain comparable across different systems\n * and programs.\n */\n\nXXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst,\n                                            XXH32_hash_t hash) {\n  XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));\n  if (XXH_CPU_LITTLE_ENDIAN)\n    hash = XXH_swap32(hash);\n  memcpy(dst, &hash, sizeof(*dst));\n}\n\nXXH_PUBLIC_API XXH32_hash_t\nXXH32_hashFromCanonical(const XXH32_canonical_t* src) {\n  return XXH_readBE32(src);\n}\n\n#ifndef XXH_NO_LONG_LONG\n\n/* *******************************************************************\n *  64-bits hash functions\n *********************************************************************/\n\n/*======   Memory access   ======*/\n\n#ifndef MEM_MODULE\n#define MEM_MODULE\n#if !defined(__VMS) &&                                                         \\\n    (defined(__cplusplus) ||                                                   \\\n     (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */))\n#include <stdint.h>\ntypedef uint64_t U64;\n#else\ntypedef unsigned long long\n    U64; /* if your compiler doesn't support unsigned long long, replace by\n            another 64-bit type here. Note that xxhash.h will also need to be\n            updated. */\n#endif\n#endif\n\n#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))\n\n/* Force direct memory access. 
Only works on CPU which support unaligned memory\n * access in hardware */\nstatic U64 XXH_read64(const void* memPtr) { return *(const U64*)memPtr; }\n\n#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))\n\n/* __pack instructions are safer, but compiler specific, hence potentially\n * problematic for some compilers */\n/* currently only defined for gcc and icc */\ntypedef union {\n  U32 u32;\n  U64 u64;\n} __attribute__((packed)) unalign64;\nstatic U64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; }\n\n#else\n\n/* portable and safe solution. Generally efficient.\n * see : http://stackoverflow.com/a/32095106/646947\n */\n\nstatic U64 XXH_read64(const void* memPtr) {\n  U64 val;\n  memcpy(&val, memPtr, sizeof(val));\n  return val;\n}\n\n#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */\n\n#if defined(_MSC_VER) /* Visual Studio */\n#define XXH_swap64 _byteswap_uint64\n#elif XXH_GCC_VERSION >= 403\n#define XXH_swap64 __builtin_bswap64\n#else\nstatic U64 XXH_swap64(U64 x) {\n  return ((x << 56) & 0xff00000000000000ULL) |\n         ((x << 40) & 0x00ff000000000000ULL) |\n         ((x << 24) & 0x0000ff0000000000ULL) |\n         ((x << 8) & 0x000000ff00000000ULL) |\n         ((x >> 8) & 0x00000000ff000000ULL) |\n         ((x >> 24) & 0x0000000000ff0000ULL) |\n         ((x >> 40) & 0x000000000000ff00ULL) |\n         ((x >> 56) & 0x00000000000000ffULL);\n}\n#endif\n\nFORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian,\n                                    XXH_alignment align) {\n  if (align == XXH_unaligned)\n    return endian == XXH_littleEndian ? XXH_read64(ptr)\n                                      : XXH_swap64(XXH_read64(ptr));\n  else\n    return endian == XXH_littleEndian ? 
*(const U64*)ptr\n                                      : XXH_swap64(*(const U64*)ptr);\n}\n\nFORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) {\n  return XXH_readLE64_align(ptr, endian, XXH_unaligned);\n}\n\nstatic U64 XXH_readBE64(const void* ptr) {\n  return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);\n}\n\n/*======   xxh64   ======*/\n\nstatic const U64 PRIME64_1 = 11400714785074694791ULL;\nstatic const U64 PRIME64_2 = 14029467366897019727ULL;\nstatic const U64 PRIME64_3 = 1609587929392839161ULL;\nstatic const U64 PRIME64_4 = 9650029242287828579ULL;\nstatic const U64 PRIME64_5 = 2870177450012600261ULL;\n\nstatic U64 XXH64_round(U64 acc, U64 input) {\n  acc += input * PRIME64_2;\n  acc = XXH_rotl64(acc, 31);\n  acc *= PRIME64_1;\n  return acc;\n}\n\nstatic U64 XXH64_mergeRound(U64 acc, U64 val) {\n  val = XXH64_round(0, val);\n  acc ^= val;\n  acc = acc * PRIME64_1 + PRIME64_4;\n  return acc;\n}\n\nFORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed,\n                                    XXH_endianess endian, XXH_alignment align) {\n  const BYTE* p    = (const BYTE*)input;\n  const BYTE* bEnd = p + len;\n  U64 h64;\n#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)\n\n#ifdef XXH_ACCEPT_NULL_INPUT_POINTER\n  if (p == NULL) {\n    len  = 0;\n    bEnd = p = (const BYTE*)(size_t)32;\n  }\n#endif\n\n  if (len >= 32) {\n    const BYTE* const limit = bEnd - 32;\n    U64 v1                  = seed + PRIME64_1 + PRIME64_2;\n    U64 v2                  = seed + PRIME64_2;\n    U64 v3                  = seed + 0;\n    U64 v4                  = seed - PRIME64_1;\n\n    do {\n      v1 = XXH64_round(v1, XXH_get64bits(p));\n      p += 8;\n      v2 = XXH64_round(v2, XXH_get64bits(p));\n      p += 8;\n      v3 = XXH64_round(v3, XXH_get64bits(p));\n      p += 8;\n      v4 = XXH64_round(v4, XXH_get64bits(p));\n      p += 8;\n    } while (p <= limit);\n\n    h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) 
+ XXH_rotl64(v3, 12) +\n          XXH_rotl64(v4, 18);\n    h64 = XXH64_mergeRound(h64, v1);\n    h64 = XXH64_mergeRound(h64, v2);\n    h64 = XXH64_mergeRound(h64, v3);\n    h64 = XXH64_mergeRound(h64, v4);\n\n  } else {\n    h64 = seed + PRIME64_5;\n  }\n\n  h64 += (U64)len;\n\n  while (p + 8 <= bEnd) {\n    U64 const k1 = XXH64_round(0, XXH_get64bits(p));\n    h64 ^= k1;\n    h64 = XXH_rotl64(h64, 27) * PRIME64_1 + PRIME64_4;\n    p += 8;\n  }\n\n  if (p + 4 <= bEnd) {\n    h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;\n    h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;\n    p += 4;\n  }\n\n  while (p < bEnd) {\n    h64 ^= (*p) * PRIME64_5;\n    h64 = XXH_rotl64(h64, 11) * PRIME64_1;\n    p++;\n  }\n\n  h64 ^= h64 >> 33;\n  h64 *= PRIME64_2;\n  h64 ^= h64 >> 29;\n  h64 *= PRIME64_3;\n  h64 ^= h64 >> 32;\n\n  return h64;\n}\n\nXXH_PUBLIC_API unsigned long long XXH64(const void* input, size_t len,\n                                        unsigned long long seed) {\n#if 0\n    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */\n    XXH64_state_t state;\n    XXH64_reset(&state, seed);\n    XXH64_update(&state, input, len);\n    return XXH64_digest(&state);\n#else\n  XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;\n\n  if (XXH_FORCE_ALIGN_CHECK) {\n    if ((((size_t)input) & 7) ==\n        0) { /* Input is aligned, let's leverage the speed advantage */\n      if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)\n        return XXH64_endian_align(input, len, seed, XXH_littleEndian,\n                                  XXH_aligned);\n      else\n        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);\n    }\n  }\n\n  if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)\n    return XXH64_endian_align(input, len, seed, XXH_littleEndian,\n                              XXH_unaligned);\n  else\n    return XXH64_endian_align(input, len, seed, XXH_bigEndian, 
XXH_unaligned);\n#endif\n}\n\n/*======   Hash Streaming   ======*/\n\nXXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) {\n  return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));\n}\nXXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) {\n  XXH_free(statePtr);\n  return XXH_OK;\n}\n\nXXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState,\n                                    const XXH64_state_t* srcState) {\n  memcpy(dstState, srcState, sizeof(*dstState));\n}\n\nXXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr,\n                                         unsigned long long seed) {\n  XXH64_state_t state; /* using a local state to memcpy() in order to avoid\n                          strict-aliasing warnings */\n  memset(&state, 0,\n         sizeof(state) -\n             8); /* do not write into reserved, for future removal */\n  state.v1 = seed + PRIME64_1 + PRIME64_2;\n  state.v2 = seed + PRIME64_2;\n  state.v3 = seed + 0;\n  state.v4 = seed - PRIME64_1;\n  memcpy(statePtr, &state, sizeof(state));\n  return XXH_OK;\n}\n\nFORCE_INLINE XXH_errorcode XXH64_update_endian(XXH64_state_t* state,\n                                               const void* input, size_t len,\n                                               XXH_endianess endian) {\n  const BYTE* p          = (const BYTE*)input;\n  const BYTE* const bEnd = p + len;\n\n#ifdef XXH_ACCEPT_NULL_INPUT_POINTER\n  if (input == NULL)\n    return XXH_ERROR;\n#endif\n\n  state->total_len += len;\n\n  if (state->memsize + len < 32) { /* fill in tmp buffer */\n    XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);\n    state->memsize += (U32)len;\n    return XXH_OK;\n  }\n\n  if (state->memsize) { /* tmp buffer is full */\n    XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input,\n               32 - state->memsize);\n    state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64 + 0, endian));\n    state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64 
+ 1, endian));\n    state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64 + 2, endian));\n    state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64 + 3, endian));\n    p += 32 - state->memsize;\n    state->memsize = 0;\n  }\n\n  if (p + 32 <= bEnd) {\n    const BYTE* const limit = bEnd - 32;\n    U64 v1                  = state->v1;\n    U64 v2                  = state->v2;\n    U64 v3                  = state->v3;\n    U64 v4                  = state->v4;\n\n    do {\n      v1 = XXH64_round(v1, XXH_readLE64(p, endian));\n      p += 8;\n      v2 = XXH64_round(v2, XXH_readLE64(p, endian));\n      p += 8;\n      v3 = XXH64_round(v3, XXH_readLE64(p, endian));\n      p += 8;\n      v4 = XXH64_round(v4, XXH_readLE64(p, endian));\n      p += 8;\n    } while (p <= limit);\n\n    state->v1 = v1;\n    state->v2 = v2;\n    state->v3 = v3;\n    state->v4 = v4;\n  }\n\n  if (p < bEnd) {\n    XXH_memcpy(state->mem64, p, (size_t)(bEnd - p));\n    state->memsize = (unsigned)(bEnd - p);\n  }\n\n  return XXH_OK;\n}\n\nXXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t* state_in,\n                                          const void* input, size_t len) {\n  XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;\n\n  if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)\n    return XXH64_update_endian(state_in, input, len, XXH_littleEndian);\n  else\n    return XXH64_update_endian(state_in, input, len, XXH_bigEndian);\n}\n\nFORCE_INLINE U64 XXH64_digest_endian(const XXH64_state_t* state,\n                                     XXH_endianess endian) {\n  const BYTE* p          = (const BYTE*)state->mem64;\n  const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;\n  U64 h64;\n\n  if (state->total_len >= 32) {\n    U64 const v1 = state->v1;\n    U64 const v2 = state->v2;\n    U64 const v3 = state->v3;\n    U64 const v4 = state->v4;\n\n    h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) +\n          
XXH_rotl64(v4, 18);\n    h64 = XXH64_mergeRound(h64, v1);\n    h64 = XXH64_mergeRound(h64, v2);\n    h64 = XXH64_mergeRound(h64, v3);\n    h64 = XXH64_mergeRound(h64, v4);\n  } else {\n    h64 = state->v3 + PRIME64_5;\n  }\n\n  h64 += (U64)state->total_len;\n\n  while (p + 8 <= bEnd) {\n    U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));\n    h64 ^= k1;\n    h64 = XXH_rotl64(h64, 27) * PRIME64_1 + PRIME64_4;\n    p += 8;\n  }\n\n  if (p + 4 <= bEnd) {\n    h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;\n    h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;\n    p += 4;\n  }\n\n  while (p < bEnd) {\n    h64 ^= (*p) * PRIME64_5;\n    h64 = XXH_rotl64(h64, 11) * PRIME64_1;\n    p++;\n  }\n\n  h64 ^= h64 >> 33;\n  h64 *= PRIME64_2;\n  h64 ^= h64 >> 29;\n  h64 *= PRIME64_3;\n  h64 ^= h64 >> 32;\n\n  return h64;\n}\n\nXXH_PUBLIC_API unsigned long long XXH64_digest(const XXH64_state_t* state_in) {\n  XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;\n\n  if ((endian_detected == XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)\n    return XXH64_digest_endian(state_in, XXH_littleEndian);\n  else\n    return XXH64_digest_endian(state_in, XXH_bigEndian);\n}\n\n/*====== Canonical representation   ======*/\n\nXXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst,\n                                            XXH64_hash_t hash) {\n  XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));\n  if (XXH_CPU_LITTLE_ENDIAN)\n    hash = XXH_swap64(hash);\n  memcpy(dst, &hash, sizeof(*dst));\n}\n\nXXH_PUBLIC_API XXH64_hash_t\nXXH64_hashFromCanonical(const XXH64_canonical_t* src) {\n  return XXH_readBE64(src);\n}\n\n#endif /* XXH_NO_LONG_LONG */\n"
  },
  {
    "path": "lonestar/eda/cpu/aig-rewriting/xxHash/xxhash.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/*\n   xxHash - Extremely Fast Hash algorithm\n   Header File\n   Copyright (C) 2012-2016, Yann Collet.\n\n   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)\n\n   Redistribution and use in source and binary forms, with or without\n   modification, are permitted provided that the following conditions are\n   met:\n\n       * Redistributions of source code must retain the above copyright\n   notice, this list of conditions and the following disclaimer.\n       * Redistributions in binary form must reproduce the above\n   copyright notice, this list of conditions and the following disclaimer\n   in the documentation and/or other materials provided with the\n   distribution.\n\n   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n   \"AS IS\" AND ANY EXPRESS OR IMPLIED 
WARRANTIES, INCLUDING, BUT NOT\n   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n   You can contact the author at :\n   - xxHash source repository : https://github.com/Cyan4973/xxHash\n*/\n\n/* Notice extracted from xxHash homepage :\n\nxxHash is an extremely fast Hash algorithm, running at RAM speed limits.\nIt also successfully passes all tests from the SMHasher suite.\n\nComparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo\n@3GHz)\n\nName            Speed       Q.Score   Author\nxxHash          5.4 GB/s     10\nCrapWow         3.2 GB/s      2       Andrew\nMumurHash 3a    2.7 GB/s     10       Austin Appleby\nSpookyHash      2.0 GB/s     10       Bob Jenkins\nSBox            1.4 GB/s      9       Bret Mulvey\nLookup3         1.2 GB/s      9       Bob Jenkins\nSuperFastHash   1.2 GB/s      1       Paul Hsieh\nCityHash64      1.05 GB/s    10       Pike & Alakuijala\nFNV             0.55 GB/s     5       Fowler, Noll, Vo\nCRC32           0.43 GB/s     9\nMD5-32          0.33 GB/s    10       Ronald L. 
Rivest\nSHA1-32         0.28 GB/s    10\n\nQ.Score is a measure of quality of the hash function.\nIt depends on successfully passing SMHasher test set.\n10 is a perfect score.\n\nA 64-bits version, named XXH64, is available since r35.\nIt offers much better speed, but for 64-bits applications only.\nName     Speed on 64 bits    Speed on 32 bits\nXXH64       13.8 GB/s            1.9 GB/s\nXXH32        6.8 GB/s            6.0 GB/s\n*/\n\n#ifndef XXHASH_H_5627135585666179\n#define XXHASH_H_5627135585666179 1\n\n#if defined(__cplusplus)\nextern \"C\" {\n#endif\n\n/* ****************************\n *  Definitions\n ******************************/\n#include <stddef.h> /* size_t */\ntypedef enum { XXH_OK = 0, XXH_ERROR } XXH_errorcode;\n\n/* ****************************\n *  API modifier\n ******************************/\n/** XXH_PRIVATE_API\n *   This is useful to include xxhash functions in `static` mode\n *   in order to inline them, and remove their symbol from the public list.\n *   Methodology :\n *     #define XXH_PRIVATE_API\n *     #include \"xxhash.h\"\n *   `xxhash.c` is automatically included.\n *   It's not useful to compile and link it as a separate module.\n */\n#ifdef XXH_PRIVATE_API\n#ifndef XXH_STATIC_LINKING_ONLY\n#define XXH_STATIC_LINKING_ONLY\n#endif\n#if defined(__GNUC__)\n#define XXH_PUBLIC_API static __inline __attribute__((unused))\n#elif defined(__cplusplus) ||                                                  \\\n    (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)\n#define XXH_PUBLIC_API static inline\n#elif defined(_MSC_VER)\n#define XXH_PUBLIC_API static __inline\n#else\n#define XXH_PUBLIC_API                                                         \\\n  static /* this version may generate warnings for unused static functions;    \\\n            disable the relevant warning */\n#endif\n#else\n#define XXH_PUBLIC_API /* do nothing */\n#endif                 /* XXH_PRIVATE_API */\n\n/*!XXH_NAMESPACE, aka Namespace Emulation 
:\n\nIf you want to include _and expose_ xxHash functions from within your own\nlibrary, but also want to avoid symbol collisions with other libraries which may\nalso include xxHash,\n\nyou can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash\nlibrary with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric\nvalues).\n\nNote that no change is required within the calling program as long as it\nincludes `xxhash.h` : regular symbol name will be automatically translated by\nthis header.\n*/\n#ifdef XXH_NAMESPACE\n#define XXH_CAT(A, B) A##B\n#define XXH_NAME2(A, B) XXH_CAT(A, B)\n#define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)\n#define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)\n#define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)\n#define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)\n#define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)\n#define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)\n#define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)\n#define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)\n#define XXH32_canonicalFromHash                                                \\\n  XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)\n#define XXH32_hashFromCanonical                                                \\\n  XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)\n#define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)\n#define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)\n#define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)\n#define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)\n#define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)\n#define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)\n#define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)\n#define XXH64_canonicalFromHash                                                \\\n  XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)\n#define XXH64_hashFromCanonical        
                                        \\\n  XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)\n#endif\n\n/* *************************************\n *  Version\n ***************************************/\n#define XXH_VERSION_MAJOR 0\n#define XXH_VERSION_MINOR 6\n#define XXH_VERSION_RELEASE 2\n#define XXH_VERSION_NUMBER                                                     \\\n  (XXH_VERSION_MAJOR * 100 * 100 + XXH_VERSION_MINOR * 100 +                   \\\n   XXH_VERSION_RELEASE)\nXXH_PUBLIC_API unsigned XXH_versionNumber(void);\n\n/*-**********************************************************************\n *  32-bits hash\n ************************************************************************/\ntypedef unsigned int XXH32_hash_t;\n\n/*! XXH32() :\n    Calculate the 32-bits hash of sequence \"length\" bytes stored at memory\n   address \"input\". The memory between input & input+length must be valid\n   (allocated and read-accessible). \"seed\" can be used to alter the result\n   predictably.\n    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s\n */\nXXH_PUBLIC_API XXH32_hash_t XXH32(const void* input, size_t length,\n                                  unsigned int seed);\n\n/*======   Streaming   ======*/\ntypedef struct XXH32_state_s XXH32_state_t; /* incomplete type */\nXXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);\nXXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);\nXXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state,\n                                    const XXH32_state_t* src_state);\n\nXXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr,\n                                         unsigned int seed);\nXXH_PUBLIC_API XXH_errorcode XXH32_update(XXH32_state_t* statePtr,\n                                          const void* input, size_t length);\nXXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* statePtr);\n\n/*\nThese functions generate the xxHash of an input provided in 
multiple segments.\nNote that, for small input, they are slower than single-call functions, due to\nstate management. For small input, prefer `XXH32()` and `XXH64()` .\n\nXXH state must first be allocated, using XXH*_createState() .\n\nStart a new hash by initializing state with a seed, using XXH*_reset().\n\nThen, feed the hash state by calling XXH*_update() as many times as necessary.\nObviously, input must be allocated and read accessible.\nThe function returns an error code, with 0 meaning OK, and any other value\nmeaning there is an error.\n\nFinally, a hash value can be produced anytime, by using XXH*_digest().\nThis function returns the nn-bits hash as an int or long long.\n\nIt's still possible to continue inserting input into the hash state after a\ndigest, and generate some new hashes later on, by calling again XXH*_digest().\n\nWhen done, free XXH state space if it was allocated dynamically.\n*/\n\n/*======   Canonical representation   ======*/\n\ntypedef struct {\n  unsigned char digest[4];\n} XXH32_canonical_t;\nXXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst,\n                                            XXH32_hash_t hash);\nXXH_PUBLIC_API XXH32_hash_t\nXXH32_hashFromCanonical(const XXH32_canonical_t* src);\n\n/* Default result type for XXH functions are primitive unsigned 32 and 64 bits.\n *  The canonical representation uses human-readable write convention, aka\n * big-endian (large digits first). These functions allow transformation of hash\n * result into and from its canonical format. This way, hash values can be\n * written into a file / memory, and remain comparable on different systems and\n * programs.\n */\n\n#ifndef XXH_NO_LONG_LONG\n/*-**********************************************************************\n *  64-bits hash\n ************************************************************************/\ntypedef unsigned long long XXH64_hash_t;\n\n/*! 
XXH64() :\n    Calculate the 64-bits hash of sequence of length \"len\" stored at memory\n   address \"input\". \"seed\" can be used to alter the result predictably. This\n   function runs faster on 64-bits systems, but slower on 32-bits systems (see\n   benchmark).\n*/\nXXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length,\n                                  unsigned long long seed);\n\n/*======   Streaming   ======*/\ntypedef struct XXH64_state_s XXH64_state_t; /* incomplete type */\nXXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);\nXXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);\nXXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state,\n                                    const XXH64_state_t* src_state);\n\nXXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr,\n                                         unsigned long long seed);\nXXH_PUBLIC_API XXH_errorcode XXH64_update(XXH64_state_t* statePtr,\n                                          const void* input, size_t length);\nXXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* statePtr);\n\n/*======   Canonical representation   ======*/\ntypedef struct {\n  unsigned char digest[8];\n} XXH64_canonical_t;\nXXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst,\n                                            XXH64_hash_t hash);\nXXH_PUBLIC_API XXH64_hash_t\nXXH64_hashFromCanonical(const XXH64_canonical_t* src);\n#endif /* XXH_NO_LONG_LONG */\n\n#ifdef XXH_STATIC_LINKING_ONLY\n\n/* ================================================================================================\n   This section contains definitions which are not guaranteed to remain stable.\n   They may change in future versions, becoming incompatible with a different\nversion of the library. They shall only be used with static linking. 
Never use\nthese definitions in association with dynamic linking !\n===================================================================================================\n*/\n\n/* These definitions are only meant to make possible\n   static allocation of XXH state, on stack or in a struct for example.\n   Never use members directly. */\n\nstruct XXH32_state_s {\n  unsigned total_len_32;\n  unsigned large_len;\n  unsigned v1;\n  unsigned v2;\n  unsigned v3;\n  unsigned v4;\n  unsigned mem32[4]; /* buffer defined as U32 for alignment */\n  unsigned memsize;\n  unsigned\n      reserved; /* never read nor write, will be removed in a future version */\n};              /* typedef'd to XXH32_state_t */\n\n#ifndef XXH_NO_LONG_LONG /* remove 64-bits support */\nstruct XXH64_state_s {\n  unsigned long long total_len;\n  unsigned long long v1;\n  unsigned long long v2;\n  unsigned long long v3;\n  unsigned long long v4;\n  unsigned long long mem64[4]; /* buffer defined as U64 for alignment */\n  unsigned memsize;\n  unsigned reserved[2]; /* never read nor write, will be removed in a future\n                           version */\n};                      /* typedef'd to XXH64_state_t */\n#endif\n\n#ifdef XXH_PRIVATE_API\n#include \"xxhash.c\" /* include xxhash function bodies as `static`, for inlining */\n#endif\n\n#endif /* XXH_STATIC_LINKING_ONLY */\n\n#if defined(__cplusplus)\n}\n#endif\n\n#endif /* XXHASH_H_5627135585666179 */\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/BoilerPlate.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include <sstream>\n#include \"galois/Galois.h\"\n#include \"galois/Version.h\"\n#include \"llvm/Support/CommandLine.h\"\n\nllvm::cl::opt<int>\n    numThreads(\"t\", llvm::cl::desc(\"Number of threads (default value 1)\"),\n               llvm::cl::init(1));\n\nstatic void LonestarPrintVersion(llvm::raw_ostream& out) {\n  out << \"LoneStar Benchmark Suite v\" << galois::getVersion() << \" (\"\n      << galois::getRevision() << \")\\n\";\n  out.flush();\n}\n\n//! 
initialize lonestar benchmark\nvoid LonestarStart(int argc, char** argv, const char* app, const char* desc,\n                   const char* url, llvm::cl::opt<std::string>* input) {\n  llvm::cl::SetVersionPrinter(LonestarPrintVersion);\n  llvm::cl::ParseCommandLineOptions(argc, argv);\n  numThreads = galois::setActiveThreads(numThreads);\n\n  LonestarPrintVersion(llvm::outs());\n  llvm::outs() << \"Copyright (C) \" << galois::getCopyrightYear()\n               << \" The University of Texas at Austin\\n\";\n  llvm::outs() << \"http://iss.ices.utexas.edu/galois/\\n\\n\";\n  llvm::outs() << \"application: \" << (app ? app : \"unspecified\") << \"\\n\";\n  if (desc)\n    llvm::outs() << desc << \"\\n\";\n  if (url)\n    llvm::outs() << \"http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/\"\n                 << url << \"\\n\";\n  llvm::outs() << \"\\n\";\n  llvm::outs().flush();\n\n  std::ostringstream cmdout;\n  for (int i = 0; i < argc; ++i) {\n    cmdout << argv[i];\n    if (i != argc - 1)\n      cmdout << \" \";\n  }\n\n  galois::runtime::reportParam(\"(NULL)\", \"CommandLine\", cmdout.str());\n  galois::runtime::reportParam(\"(NULL)\", \"Threads\", numThreads);\n  galois::runtime::reportParam(\"(NULL)\", \"Hosts\", 1);\n  if (input) {\n    galois::runtime::reportParam(\"(NULL)\", \"Input\", input->getValue());\n  }\n\n  char name[256];\n  gethostname(name, 256);\n  galois::runtime::reportParam(\"(NULL)\", \"Hostname\", name);\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/CMakeLists.txt",
    "content": "add_executable(sproute-cpu main.cpp dist.c dl.c err.c heap.c mst2.c neighbors.c bookshelf_IO.c memAlloc.c)\nadd_dependencies(apps sproute-cpu)\ntarget_link_libraries(sproute-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS sproute-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_scale(small1 sproute-cpu -ISPD2008Graph \"${BASEINPUT}/eda/routing/test.gr\" --flute \"${BASEINPUT}/eda/routing\")\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/DataProc.h",
    "content": "#ifndef _DATAPROC_H_\n#define _DATAPROC_H_\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include \"DataType.h\"\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/PriorityQueue.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/runtime/Profile.h\"\n#include \"galois/LargeArray.h\"\n\n#include \"DataType.h\"\n#include \"flute.h\"\n\nusing namespace std;\n\n#define BUFFERSIZE 800\n#define STRINGLEN 100\n#define MAXNETDEG 1000\n#define MAXEDGES 10000000\n\n#define MAXLEN 20000\n\n#define XRANGE 2536\n#define YRANGE 2536\n\n// Global variables\nfloat round_avg_dist;\nint round_avg_length;\nint xGrid, yGrid, numGrids, numNets, vCapacity, hCapacity;\nfloat vCapacity_lb, hCapacity_lb, vCapacity_ub, hCapacity_ub;\nint MaxDegree;\nint MinWidth[MAXLAYER], MinSpacing[MAXLAYER], ViaSpacing[MAXLAYER];\nint xcorner, ycorner, wTile, hTile;\nint enlarge, costheight, ripup_threshold, ahTH;\nint numValidNets,\n    numInvalidNets; // # nets need to be routed (having pins in different grids)\nint numLayers;\nint totalNumSeg;   // total # segments\nint totalOverflow; // total # overflow\nint mazeThreshold; // the wirelen threshold to do maze routing\nNet** nets;\nNet** invalid_nets;\nEdge *h_edges, *v_edges;\nfloat d1[YRANGE][XRANGE];\nfloat d2[YRANGE][XRANGE];\n\n/*Bool HV[YRANGE][XRANGE];\nBool hyperV[YRANGE][XRANGE];\nBool hyperH[YRANGE][XRANGE];\n\nint corrEdge[YRANGE][XRANGE];*/ //Michael\nint SLOPE;\nint vCapacity3D[MAXLAYER], hCapacity3D[MAXLAYER];\n\nfloat LB;\nfloat UB;\nint THRESH_M;\ndouble LOGIS_COF;\nint ENLARGE;\nint STEP;\nint COSHEIGHT;\nint STOP;\nint VCA;\nfloat L;\nint VIA, slope, max_adj;\n\nchar benchFile[STRINGLEN];\n\nSegment* seglist;\nint* seglistIndex; // the index for the segments for each net\nint* seglistCnt;   // the number of 
segements for each net\nint* segOrder;     // the order of segments for routing\nTree* trees;       // the tree topologies\nStTree* sttrees;   // the Steiner trees\nDTYPE** gxs;       // the copy of xs for nets, used for second FLUTE\nDTYPE** gys;       // the copy of xs for nets, used for second FLUTE\nDTYPE** gs; // the copy of vertical sequence for nets, used for second FLUTE\nint MD = 0, TD = 0;\n\nEdge3D* h_edges3D;\nEdge3D* v_edges3D;\n\nOrderNetPin* treeOrderPV;\nOrderTree* treeOrderCong;\nint numTreeedges;\nint viacost;\n\nint layerGrid[MAXLAYER][MAXLEN];\nint gridD[MAXLAYER][MAXLEN];\nint viaLink[MAXLAYER][MAXLEN];\n\nint d13D[MAXLAYER][YRANGE][XRANGE];\nshort d23D[MAXLAYER][YRANGE][XRANGE];\n\ndirctionT*** directions3D;\nint*** corrEdge3D;\nparent3D*** pr3D;\n\nint mazeedge_Threshold;\nBool inRegion[YRANGE][XRANGE];\n\nBool heapVisited[MAXNETDEG];\n// int heapQueue[MAXNETDEG]; //Michael\n\nint gridHV, gridH, gridV, gridHs[MAXLAYER], gridVs[MAXLAYER];\n\nint** heap13D;\nshort** heap23D;\n\nfloat *h_costTable, *v_costTable;\nBool stopDEC, errorPRONE;\n// OrderNetEdge netEO[2000]; //Michael\nint xcor[2000], ycor[2000], dcor[2000];\n\nStTree* sttreesBK;\n\nshort **parentX1, **parentY1, **parentX3, **parentY3;\n\n/*float **heap2,**heap1; //Michael\n\nBool *pop_heap2;*/\n\n// Michael:\nint LOCK;\n\nvoid readFile(const char* benchFile) {\n  FILE* fp;\n  int i, j, k;\n  int pinX, pinY, pinL, netID, numPins, pinInd = 0, grid, newnetID,\n                                        invalid_netID, segcount, minwidth;\n  float pinX_in, pinY_in;\n  int maxDeg = 0;\n  int pinXarray[MAXNETDEG], pinYarray[MAXNETDEG], pinLarray[MAXNETDEG];\n  char netName[STRINGLEN];\n  Bool remove;\n  int numAdjust, x1, x2, y1, y2, l1, l2, cap, reduce, reducedCap;\n  int net;\n  int TC;\n\n  net = 0;\n\n  fp = fopen(benchFile, \"r\");\n  if (fp == NULL) {\n    printf(\"Error in opening %s\\n\", benchFile);\n    exit(1);\n  }\n\n  if (fscanf(fp, \"grid\t%d %d %d\\n\", &xGrid, &yGrid, 
&numLayers) != 3)\n    abort_with_message(\"Failed to read required info from benchfile.\");\n\n  vCapacity = hCapacity = 0;\n\n  if (fscanf(fp, \"vertical capacity\t\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n  for (i = 0; i < numLayers; i++) {\n    if (fscanf(fp, \"%d\", &vCapacity3D[i]) != 1)\n      abort_with_message(\"Failed to read required info from benchfile.\");\n    if (fscanf(fp, \" \"))\n      abort_with_message(\"Failed to read required info from benchfile.\");\n    vCapacity3D[i] = vCapacity3D[i] / 2;\n    vCapacity += vCapacity3D[i];\n  }\n  if (fscanf(fp, \"\\n\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n\n  if (fscanf(fp, \"horizontal capacity\t\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n  for (i = 0; i < numLayers; i++) {\n    if (fscanf(fp, \"%d\", &hCapacity3D[i]) != 1)\n      abort_with_message(\"Failed to read required info from benchfile.\");\n    if (fscanf(fp, \" \"))\n      abort_with_message(\"Failed to read required info from benchfile.\");\n    hCapacity3D[i] = hCapacity3D[i] / 2;\n    hCapacity += hCapacity3D[i];\n  }\n  if (fscanf(fp, \"\\n\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n\n  if (fscanf(fp, \"minimum width\t\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n  for (i = 0; i < numLayers; i++) {\n    if (fscanf(fp, \"%d\", &(MinWidth[i])) != 1)\n      abort_with_message(\"Failed to read required info from benchfile.\");\n    if (fscanf(fp, \" \"))\n      abort_with_message(\"Failed to read required info from benchfile.\");\n  }\n  if (fscanf(fp, \"\\n\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n\n  if (fscanf(fp, \"minimum spacing\t\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n  for (i = 0; i < numLayers; i++) {\n    if (fscanf(fp, \"%d\", &(MinSpacing[i])) != 1)\n      
abort_with_message(\"Failed to read required info from benchfile.\");\n    if (fscanf(fp, \" \"))\n      abort_with_message(\"Failed to read required info from benchfile.\");\n  }\n  if (fscanf(fp, \"\\n\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n\n  if (fscanf(fp, \"via spacing\t\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n  for (i = 0; i < numLayers; i++) {\n    if (fscanf(fp, \"%d\", &(ViaSpacing[i])) != 1)\n      abort_with_message(\"Failed to read required info from benchfile.\");\n    if (fscanf(fp, \" \"))\n      abort_with_message(\"Failed to read required info from benchfile.\");\n  }\n  if (fscanf(fp, \"\\n\"))\n    abort_with_message(\"Failed to read required info from benchfile.\");\n\n  if (fscanf(fp, \"%d %d %d %d\\n\\n\", &xcorner, &ycorner, &wTile, &hTile) != 4)\n    abort_with_message(\"Failed to read required info from benchfile.\");\n\n  if (fscanf(fp, \"num net %d\\n\", &numNets) != 1)\n    abort_with_message(\"Failed to read required info from benchfile.\");\n\n  numGrids = xGrid * yGrid;\n\n  vCapacity_lb = LB * vCapacity;\n  hCapacity_lb = LB * hCapacity;\n  vCapacity_ub = UB * vCapacity;\n  hCapacity_ub = UB * hCapacity;\n\n  printf(\"\\n\");\n  printf(\"grid %d %d %d\\n\", xGrid, yGrid, numLayers);\n  for (i = 0; i < numLayers; i++) {\n    printf(\"Layer %d vertical capacity %d, horizontal capacity %d\\n\", i,\n           vCapacity3D[i], hCapacity3D[i]);\n  }\n\n  printf(\"total vertical capacity %d\\n\", vCapacity);\n  printf(\"total horizontal capacity %d\\n\", hCapacity);\n  printf(\"num net %d\\n\", numNets);\n\n  // allocate memory for nets\n  nets         = (Net**)malloc(numNets * sizeof(Net*));\n  invalid_nets = (Net**)malloc(numNets * sizeof(Net*));\n  for (i = 0; i < numNets; i++) {\n    nets[i]         = (Net*)malloc(sizeof(Net));\n    invalid_nets[i] = (Net*)malloc(sizeof(Net));\n  }\n  seglistIndex = (int*)malloc((numNets + 1) * sizeof(int));\n\n  // read 
nets information from the input file\n  segcount      = 0;\n  newnetID      = 0;\n  invalid_netID = 0;\n  for (i = 0; i < numNets; i++) {\n    net++;\n    if (fscanf(fp, \"%s %d %d %d\\n\", netName, &netID, &numPins, &minwidth) != 4)\n      abort_with_message(\"Failed to read required info from benchfile.\");\n    if (numPins < 1000) {\n      pinInd = 0;\n      for (j = 0; j < numPins; j++) {\n        if (fscanf(fp, \"%f\t%f\t%d\\n\", &pinX_in, &pinY_in, &pinL) != 3)\n          abort_with_message(\"Failed to read required info from benchfile.\");\n        pinX = (int)((pinX_in - xcorner) / wTile);\n        pinY = (int)((pinY_in - ycorner) / hTile);\n        if (!(pinX < 0 || pinX >= xGrid || pinY < -1 || pinY >= yGrid ||\n              pinL >= numLayers || pinL < 0)) {\n          remove = FALSE;\n          for (k = 0; k < pinInd; k++) {\n            if (pinX == pinXarray[k] && pinY == pinYarray[k] &&\n                pinL == pinLarray[k]) {\n              remove = TRUE;\n              break;\n            }\n          }\n          if (!remove) // the pin is in different grid from other pins\n          {\n            pinXarray[pinInd] = pinX;\n            pinYarray[pinInd] = pinY;\n            pinLarray[pinInd] = pinL;\n            pinInd++;\n          }\n        }\n      }\n\n      if (pinInd > 1) // valid net\n      {\n        MD = max(MD, pinInd);\n        TD += pinInd;\n        strcpy(nets[newnetID]->name, netName);\n        nets[newnetID]->netIDorg = netID;\n        nets[newnetID]->numPins  = numPins;\n        nets[newnetID]->deg      = pinInd;\n        nets[newnetID]->pinX     = (short*)malloc(pinInd * sizeof(short));\n        nets[newnetID]->pinY     = (short*)malloc(pinInd * sizeof(short));\n        nets[newnetID]->pinL     = (short*)malloc(pinInd * sizeof(short));\n\n        for (j = 0; j < pinInd; j++) {\n          nets[newnetID]->pinX[j] = pinXarray[j];\n          nets[newnetID]->pinY[j] = pinYarray[j];\n          nets[newnetID]->pinL[j] = pinLarray[j];\n  
      }\n        maxDeg                 = pinInd > maxDeg ? pinInd : maxDeg;\n        seglistIndex[newnetID] = segcount;\n        newnetID++;\n        segcount +=\n            2 * pinInd -\n            3; // at most (2*numPins-2) nodes, (2*numPins-3) nets for a net\n      }        // if valid net\n      else {\n        strcpy(invalid_nets[invalid_netID]->name, netName);\n        invalid_nets[invalid_netID]->netIDorg = netID;\n        invalid_nets[invalid_netID]->numPins  = numPins;\n        invalid_nets[invalid_netID]->deg      = pinInd;\n        invalid_nets[invalid_netID]->pinX =\n            (short*)malloc(pinInd * sizeof(short));\n        invalid_nets[invalid_netID]->pinY =\n            (short*)malloc(pinInd * sizeof(short));\n        invalid_nets[invalid_netID]->pinL =\n            (short*)malloc(pinInd * sizeof(short));\n\n        for (j = 0; j < pinInd; j++) {\n          invalid_nets[invalid_netID]->pinX[j] = pinXarray[j];\n          invalid_nets[invalid_netID]->pinY[j] = pinYarray[j];\n          invalid_nets[invalid_netID]->pinL[j] = pinLarray[j];\n        }\n        invalid_netID++;\n      }\n\n    } // if\n\n    else {\n      for (j = 0; j < numPins; j++)\n        if (fscanf(fp, \"%f\t%f\t%d\\n\", &pinX_in, &pinY_in, &pinL) != 3)\n          abort_with_message(\"Failed to read required info from benchfile.\");\n    }\n\n  } // loop i\n  printf(\"the total net number is %d\\n\\n\", net);\n\n  if ((pinInd > 1) && (pinInd < 1000)) {\n    seglistIndex[newnetID] = segcount; // the end pointer of the seglist\n  }\n  numValidNets   = newnetID;\n  numInvalidNets = invalid_netID;\n\n  // allocate memory and initialize for edges\n\n  h_edges = (Edge*)calloc(((xGrid - 1) * yGrid), sizeof(Edge));\n  v_edges = (Edge*)calloc((xGrid * (yGrid - 1)), sizeof(Edge));\n\n  v_edges3D = (Edge3D*)calloc((numLayers * xGrid * yGrid), sizeof(Edge3D));\n  h_edges3D = (Edge3D*)calloc((numLayers * xGrid * yGrid), sizeof(Edge3D));\n\n  // 2D edge innitialization\n  TC = 0;\n  for (i = 
0; i < yGrid; i++) {\n    for (j = 0; j < xGrid - 1; j++) {\n      grid              = i * (xGrid - 1) + j;\n      h_edges[grid].cap = hCapacity;\n      TC += hCapacity;\n      h_edges[grid].usage      = 0;\n      h_edges[grid].est_usage  = 0;\n      h_edges[grid].red        = 0;\n      h_edges[grid].last_usage = 0;\n\n      h_edges[grid].max_ripups         = 0;\n      h_edges[grid].max_have_rippedups = 0;\n      h_edges[grid].ripups_cur_round   = false;\n    }\n  }\n  for (i = 0; i < yGrid - 1; i++) {\n    for (j = 0; j < xGrid; j++) {\n      grid              = i * xGrid + j;\n      v_edges[grid].cap = vCapacity;\n      TC += vCapacity;\n      v_edges[grid].usage      = 0;\n      v_edges[grid].est_usage  = 0;\n      v_edges[grid].red        = 0;\n      v_edges[grid].last_usage = 0;\n\n      v_edges[grid].max_ripups         = 0;\n      v_edges[grid].max_have_rippedups = 0;\n      v_edges[grid].ripups_cur_round   = false;\n    }\n  }\n\n  for (k = 0; k < numLayers; k++) {\n    for (i = 0; i < yGrid; i++) {\n      for (j = 0; j < xGrid - 1; j++) {\n        grid                  = i * (xGrid - 1) + j + k * (xGrid - 1) * yGrid;\n        h_edges3D[grid].cap   = hCapacity3D[k];\n        h_edges3D[grid].usage = 0;\n        h_edges3D[grid].red   = 0;\n      }\n    }\n    for (i = 0; i < yGrid - 1; i++) {\n      for (j = 0; j < xGrid; j++) {\n        grid                  = i * xGrid + j + k * xGrid * (yGrid - 1);\n        v_edges3D[grid].cap   = vCapacity3D[k];\n        v_edges3D[grid].usage = 0;\n        v_edges3D[grid].red   = 0;\n      }\n    }\n  }\n\n  // modify the capacity of edges according to the input file\n\n  if (fscanf(fp, \"%d\\n\", &numAdjust) != 1)\n    abort_with_message(\"Failed to read required info from benchfile.\");\n  printf(\"num of Adjust is %d\\n\", numAdjust);\n  while (numAdjust > 0) {\n    if (fscanf(fp, \"%d %d %d %d %d %d %d\\n\", &x1, &y1, &l1, &x2, &y2, &l2,\n               &reducedCap) != 7)\n      abort_with_message(\"Failed to read 
required info from benchfile.\");\n    reducedCap = reducedCap / 2;\n\n    k = l1 - 1;\n\n    if (y1 == y2) // horizontal edge\n    {\n      grid                = y1 * (xGrid - 1) + x1 + k * (xGrid - 1) * yGrid;\n      cap                 = h_edges3D[grid].cap;\n      reduce              = cap - reducedCap;\n      h_edges3D[grid].cap = reducedCap;\n      h_edges3D[grid].red = reduce;\n      grid                = y1 * (xGrid - 1) + x1;\n      h_edges[grid].cap -= reduce;\n      h_edges[grid].red += reduce;\n\n    } else if (x1 == x2) // vertical edge\n    {\n      grid                = y1 * xGrid + x1 + k * xGrid * (yGrid - 1);\n      cap                 = v_edges3D[grid].cap;\n      reduce              = cap - reducedCap;\n      v_edges3D[grid].cap = reducedCap;\n      v_edges3D[grid].red = reduce;\n      grid                = y1 * xGrid + x1;\n      v_edges[grid].cap -= reduce;\n      v_edges[grid].red += reduce;\n    }\n    numAdjust--;\n  }\n\n  treeOrderCong = NULL;\n  stopDEC       = FALSE;\n\n  seglistCnt = (int*)malloc(numValidNets * sizeof(int));\n  seglist    = (Segment*)malloc(segcount * sizeof(Segment));\n  trees      = (Tree*)malloc(numValidNets * sizeof(Tree));\n  sttrees    = (StTree*)malloc(numValidNets * sizeof(StTree));\n  gxs        = (DTYPE**)malloc(numValidNets * sizeof(DTYPE*));\n  gys        = (DTYPE**)malloc(numValidNets * sizeof(DTYPE*));\n  gs         = (DTYPE**)malloc(numValidNets * sizeof(DTYPE*));\n\n  gridHV = XRANGE * YRANGE;\n  gridH  = (xGrid - 1) * yGrid;\n  gridV  = xGrid * (yGrid - 1);\n  for (k = 0; k < numLayers; k++) {\n    gridHs[k] = k * gridH;\n    gridVs[k] = k * gridV;\n  }\n\n  MaxDegree = MD;\n\n  printf(\"# valid nets: %d\\n\", numValidNets);\n  printf(\"# segments: %d\\n\", segcount);\n  printf(\"maxDeg:     %d\\n\", maxDeg);\n  printf(\"\\nDone getting input\\n\");\n  printf(\"MD: %d, AD: %.2f, #nets: %d, #routed nets: %d\\n\", MD,\n         (float)TD / newnetID, numNets, newnetID);\n  printf(\"TC is %d\\n\", TC);\n  
fclose(fp);\n\n  /*parentX1 = (short**)calloc(yGrid, sizeof(short*));\n  parentY1 = (short**)calloc(yGrid, sizeof(short*));\n  parentX3 = (short**)calloc(yGrid, sizeof(short*));\n  parentY3 = (short**)calloc(yGrid, sizeof(short*));\n\n\n  for(i=0; i<yGrid; i++)\n  {\n      parentX1[i] = (short*)calloc(xGrid, sizeof(short));\n      parentY1[i] = (short*)calloc(xGrid, sizeof(short));\n      parentX3[i] = (short*)calloc(xGrid, sizeof(short));\n      parentY3[i] = (short*)calloc(xGrid, sizeof(short));\n  }*/\n\n  /*pop_heap2 = (Bool*)calloc(yGrid*XRANGE, sizeof(Bool));\n\n  // allocate memory for priority queue\n  heap1 = (float**)calloc((yGrid*xGrid), sizeof(float*));\n  heap2 = (float**)calloc((yGrid*xGrid), sizeof(float*));*/\n\n  sttreesBK = NULL;\n}\n\nvoid init_usage() {\n  int i;\n\n  for (i = 0; i < yGrid * (xGrid - 1); i++)\n    h_edges[i].usage = 0;\n  for (i = 0; i < (yGrid - 1) * xGrid; i++)\n    v_edges[i].usage = 0;\n}\n\nvoid freeAllMemory() {\n  int deg, numEdges, edgeID;\n  TreeEdge* treeedge;\n\n  for (int i = 0; i < numValidNets; i++) {\n    free(nets[i]->pinX);\n    free(nets[i]->pinY);\n    free(nets[i]->pinL);\n    free(nets[i]);\n  }\n  free(seglistIndex);\n  free(seglistCnt);\n  free(seglist);\n  free(h_edges);\n  free(v_edges);\n  free(h_edges3D);\n  free(v_edges3D);\n  free(segOrder);\n\n  for (int i = 0; i < numValidNets; i++)\n    free(trees[i].branch);\n  free(trees);\n\n  for (int i = 0; i < numValidNets; i++) {\n    deg      = sttrees[i].deg;\n    numEdges = 2 * deg - 3;\n    for (edgeID = 0; edgeID < numEdges; edgeID++) {\n      treeedge = &(sttrees[i].edges[edgeID]);\n      if (treeedge->len > 0) {\n        free(treeedge->route.gridsX);\n        free(treeedge->route.gridsY);\n        free(treeedge->route.gridsL);\n      }\n    }\n    free(sttrees[i].nodes);\n    free(sttrees[i].edges);\n  }\n  free(sttrees);\n\n  /*for(i=0; i<yGrid; i++)\n  {\n      free(parentX1[i]);\n      free(parentY1[i]);\n      free(parentX3[i]);\n      
free(parentY3[i]);\n  }\n  free(parentX1);\n  free(parentY1);\n  free(parentX3);\n  free(parentY3);\n  free(pop_heap2);\n  free(heap1);\n  free(heap2); //Michael*/\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/DataType.h",
    "content": "#ifndef _DATATYPE_H_\n#define _DATATYPE_H_\n\n#define MAXDEMAND 500  // MAX # Segments over an edge\n#define MAXLAYER 20    // MAX # Layer of a routing\n#define FILESTRLEN 100 // MAX length of file name\n#define BIG_INT 1e7    // big integer used as infinity\n\n#define TRUE 1\n#define FALSE 0\ntypedef char Bool;\n\ntypedef struct {\n  Bool xFirst; // route x-direction first (only for L route)\n  Bool HVH;    // TRUE = HVH or FALSE = VHV (only for Z route)\n  Bool maze;   // Whether this segment is routed by maze\n\n  short x1, y1, x2, y2; // coordinates of two endpoints\n  int netID;            // the netID of the net this segment belonging to\n  short Zpoint;         // The coordinates of Z point (x for HVH and y for VHV)\n  short* route;         // array of H and V Edges to implement this Segment\n  int numEdges;         // number of H and V Edges to implement this Segment\n} Segment;              // A Segment is a 2-pin connection\n\ntypedef struct {\n  char name[18]; // net name\n  int netIDorg;  // orginal net ID in the input file\n  short numPins; // number of pins in the net\n  short deg; // net degree (number of MazePoints connecting by the net, pins in\n             // same MazePoints count only 1)\n  short* pinX; // array of X coordinates of pins\n  short* pinY; // array of Y coordinates of pins\n  short* pinL; // array of L coordinates of pins\n  short minwidth;\n} Net; // A Net is a set of connected MazePoints\n\ntypedef struct edge_t : public galois::runtime::Lockable {\n  // galois::substrate::SimpleLock lock;\n  int congCNT;\n  int cap;                // the capacity of the edge\n  std::atomic<int> usage; // the usage of the edge\n  int red;\n  int last_usage;\n  float est_usage; // the estimated usage of the edge\n\n  std::atomic<int> max_ripups;\n  int max_have_rippedups;\n  bool ripups_cur_round;\n  /*inline void acquireLock()\n  {\n      lock.lock();\n  }\n  inline void releaseLock()\n  {\n      lock.unlock();\n  } */\n} Edge; // 
An Edge is the routing track holder between two adjacent MazePoints\n\ntypedef struct {\n  unsigned short cap;   // the capacity of the edge\n  unsigned short usage; // the usage of the edge\n  unsigned short red;\n\n} Edge3D;\n\ntypedef struct {\n  Bool assigned;\n\n  short status;\n  short conCNT;\n  short botL, topL;\n  short heights[6];\n\n  short x, y;    // position in the grid graph\n  short nbr[3];  // three neighbors\n  short edge[3]; // three adjacent edges\n  int hID;\n  int lID;\n  int eID[6];\n  int stackAlias;\n\n} TreeNode;\n\n#define NOROUTE 0\n#define LROUTE 1\n#define ZROUTE 2\n#define MAZEROUTE 3\n\ntypedef char RouteType;\n\ntypedef struct {\n  RouteType type; // type of route: LROUTE, ZROUTE, MAZEROUTE\n  Bool\n      xFirst; // valid for LROUTE, TRUE - the route is horizontal first (x1, y1)\n              // - (x2, y1) - (x2, y2), FALSE (x1, y1) - (x1, y2) - (x2, y2)\n  Bool\n      HVH; // valid for ZROUTE, TRUE - the route is HVH shape, FALSE - VHV shape\n  short Zpoint; // valid for ZROUTE, the position of turn point for Z-shape\n  short*\n      gridsX; // valid for MAZEROUTE, a list of grids (n=routelen+1) the route\n              // passes, (x1, y1) is the first one, but (x2, y2) is the lastone\n  short*\n      gridsY; // valid for MAZEROUTE, a list of grids (n=routelen+1) the route\n              // passes, (x1, y1) is the first one, but (x2, y2) is the lastone\n  short* gridsL; // n\n  int routelen;  // valid for MAZEROUTE, the number of edges in the route\n  // Edge3D *edge;       // list of 3D edges the route go through;\n\n} Route;\n\ntypedef struct {\n  Bool assigned;\n\n  int len; // the Manhanttan Distance for two end nodes\n  int n1, n1a;\n  int n2, n2a;\n  Route route;\n\n  int n_ripups;\n  bool ripup;\n} TreeEdge;\n\ntypedef struct {\n  int deg;\n  TreeNode* nodes; // the nodes (pin and Steiner nodes) in the tree\n  TreeEdge* edges; // the tree edges\n} StTree;\n\ntypedef struct {\n\n  int treeIndex;\n  int minX;\n  float npv; // 
net length over pin\n} OrderNetPin;\n\ntypedef struct {\n  int length;\n  int treeIndex;\n  int xmin;\n} OrderTree;\n\ntypedef struct {\n  short l;\n  int x, y;\n} parent3D;\n\ntypedef struct {\n  int length;\n  int edgeID;\n} OrderNetEdge;\n\ntypedef enum { NORTH, EAST, SOUTH, WEST, ORIGIN, UP, DOWN } dirctionT;\ntypedef enum { NONE, HORI, VERT, BID } viaST;\n\n#endif /* _DATATYPE_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/EdgeShift.h",
    "content": "#ifndef _EDGESHIFT_H_\n#define _EDGESHIFT_H_\n\n#include <stdio.h>\n#include <stdlib.h>\n#include \"DataType.h\"\n#include \"flute.h\"\n#include \"DataProc.h\"\n#include \"route.h\"\n#include \"RipUp.h\"\n\n#define HORIZONTAL 1\n#define VERTICAL 0\n\nint edgeShift(Tree* t) {\n  int i, j, k, l, m, deg, root = 0, x, y, n, n1, n2, n3;\n  int maxX, minX, maxY, minY, maxX1, minX1, maxY1, minY1, maxX2, minX2, maxY2,\n      minY2, bigX, smallX, bigY, smallY, grid, grid1, grid2;\n  int nbr[MAXNETDEG * 2][3], nbrCnt[MAXNETDEG * 2];\n  int pairCnt, pairN1[MAXNETDEG], pairN2[MAXNETDEG];\n  int benefit, bestBenefit, bestCost;\n  int cost1, cost2, *costH, *costV, bestPair, Pos, bestPos, numShift = 0;\n\n  costH = (int*)malloc(yGrid * sizeof(int));\n  costV = (int*)malloc(xGrid * sizeof(int));\n\n  deg = t->deg;\n  // find root of the tree\n  for (i = deg; i < 2 * deg - 2; i++) {\n    if (t->branch[i].n == i) {\n      root = i;\n      break;\n    }\n  }\n  // printf(\"root=%d\\n\", root);\n\n  // find all neighbors for steiner nodes\n  for (i = deg; i < 2 * deg - 2; i++)\n    nbrCnt[i] = 0;\n  // edges from pin to steiner\n  for (i = 0; i < deg; i++) {\n    n                 = t->branch[i].n;\n    nbr[n][nbrCnt[n]] = i;\n    nbrCnt[n]++;\n  }\n  // edges from steiner to steiner\n  for (i = deg; i < 2 * deg - 2; i++) {\n    if (i != root) // not the removed steiner nodes and root\n    {\n      n                 = t->branch[i].n;\n      nbr[i][nbrCnt[i]] = n;\n      nbrCnt[i]++;\n      nbr[n][nbrCnt[n]] = i;\n      nbrCnt[n]++;\n    }\n  }\n\n  // for(i=deg; i<2*deg-2; i++)\n  //   if(nbrCnt[i]!=3) printf(\"nbrCnt[%d]=%d(!=3)\\n\", i, nbrCnt[i]);\n\n  bestBenefit = BIG_INT;  // used to enter while loop\n  while (bestBenefit > 0) // && numShift<60)\n  {\n    // find all H or V edges (steiner pairs)\n    pairCnt = 0;\n    for (i = deg; i < 2 * deg - 2; i++) {\n      n = t->branch[i].n;\n      if (t->branch[i].x == t->branch[n].x) {\n        if (t->branch[i].y < 
t->branch[n].y) {\n          pairN1[pairCnt] = i;\n          pairN2[pairCnt] = n;\n          pairCnt++;\n        } else if (t->branch[i].y > t->branch[n].y) {\n          pairN1[pairCnt] = n;\n          pairN2[pairCnt] = i;\n          pairCnt++;\n        }\n      } else if (t->branch[i].y == t->branch[n].y) {\n        if (t->branch[i].x < t->branch[n].x) {\n          pairN1[pairCnt] = i;\n          pairN2[pairCnt] = n;\n          pairCnt++;\n        } else if (t->branch[i].x > t->branch[n].x) {\n          pairN1[pairCnt] = n;\n          pairN2[pairCnt] = i;\n          pairCnt++;\n        }\n      }\n    }\n\n    bestPair    = -1;\n    bestBenefit = -1;\n    // for each H or V edge, find the best benefit by shifting it\n    for (i = 0; i < pairCnt; i++) {\n      // find the range of shifting for this pair\n      n1 = pairN1[i];\n      n2 = pairN2[i];\n      if (t->branch[n1].y == t->branch[n2].y) // a horizontal edge\n      {\n        // find the shifting range for the edge (minY~maxY)\n        maxY1 = minY1 = t->branch[n1].y;\n        for (j = 0; j < 3; j++) {\n          y = t->branch[nbr[n1][j]].y;\n          if (y > maxY1)\n            maxY1 = y;\n          else if (y < minY1)\n            minY1 = y;\n        }\n        maxY2 = minY2 = t->branch[n2].y;\n        for (j = 0; j < 3; j++) {\n          y = t->branch[nbr[n2][j]].y;\n          if (y > maxY2)\n            maxY2 = y;\n          else if (y < minY2)\n            minY2 = y;\n        }\n        minY = max(minY1, minY2);\n        maxY = min(maxY1, maxY2);\n        // printf(\"(%d-%d) minY1=%d, maxY1=%d, minY2=%d, maxY2=%d, minY = %d,\n        // maxY = %d\\n\", n1, n2, minY1, maxY1, minY2, maxY2, minY, maxY);\n\n        // find the best position (least total usage) to shift\n        if (minY < maxY) // more than 1 possible positions\n        {\n          for (j = minY; j <= maxY; j++) {\n            costH[j] = 0;\n            grid     = j * (xGrid - 1);\n            for (k = t->branch[n1].x; k < 
t->branch[n2].x; k++) {\n              costH[j] += h_edges[grid + k].est_usage;\n            }\n            // add the cost of all edges adjacent to the two steiner nodes\n            for (l = 0; l < nbrCnt[n1]; l++) {\n              n3 = nbr[n1][l];\n              if (n3 != n2) // exclude current edge n1-n2\n              {\n                cost1 = cost2 = 0;\n                if (t->branch[n1].x < t->branch[n3].x) {\n                  smallX = t->branch[n1].x;\n                  bigX   = t->branch[n3].x;\n                } else {\n                  smallX = t->branch[n3].x;\n                  bigX   = t->branch[n1].x;\n                }\n                if (j < t->branch[n3].y) {\n                  smallY = j;\n                  bigY   = t->branch[n3].y;\n                } else {\n                  smallY = t->branch[n3].y;\n                  bigY   = j;\n                }\n                grid1 = smallY * (xGrid - 1);\n                grid2 = bigY * (xGrid - 1);\n                for (m = smallX; m < bigX; m++) {\n                  cost1 += h_edges[grid1 + m].est_usage;\n                  cost2 += h_edges[grid2 + m].est_usage;\n                }\n                grid1 = smallY * xGrid;\n                for (m = smallY; m < bigY; m++) {\n                  cost1 += v_edges[grid1 + bigX].est_usage;\n                  cost2 += v_edges[grid1 + smallX].est_usage;\n                  grid1 += xGrid;\n                }\n                costH[j] += min(cost1, cost2);\n              } // if(n3!=n2)\n            }   // loop l\n            for (l = 0; l < nbrCnt[n2]; l++) {\n              n3 = nbr[n2][l];\n              if (n3 != n1) // exclude current edge n1-n2\n              {\n                cost1 = cost2 = 0;\n                if (t->branch[n2].x < t->branch[n3].x) {\n                  smallX = t->branch[n2].x;\n                  bigX   = t->branch[n3].x;\n                } else {\n                  smallX = t->branch[n3].x;\n                  bigX   = t->branch[n2].x;\n  
              }\n                if (j < t->branch[n3].y) {\n                  smallY = j;\n                  bigY   = t->branch[n3].y;\n                } else {\n                  smallY = t->branch[n3].y;\n                  bigY   = j;\n                }\n                grid1 = smallY * (xGrid - 1);\n                grid2 = bigY * (xGrid - 1);\n                for (m = smallX; m < bigX; m++) {\n                  cost1 += h_edges[grid1 + m].est_usage;\n                  cost2 += h_edges[grid2 + m].est_usage;\n                }\n                grid1 = smallY * xGrid;\n                for (m = smallY; m < bigY; m++) {\n                  cost1 += v_edges[grid1 + bigX].est_usage;\n                  cost2 += v_edges[grid1 + smallX].est_usage;\n                  grid1 += xGrid;\n                }\n                costH[j] += min(cost1, cost2);\n              } // if(n3!=n1)\n            }   // loop l\n          }     // loop j\n          bestCost = BIG_INT;\n          Pos      = t->branch[n1].y;\n          for (j = minY; j <= maxY; j++) {\n            if (costH[j] < bestCost) {\n              bestCost = costH[j];\n              Pos      = j;\n            }\n          }\n          if (Pos != t->branch[n1].y) // find a better position than current\n          {\n            benefit = costH[t->branch[n1].y] - bestCost;\n            if (benefit > bestBenefit) {\n              bestBenefit = benefit;\n              bestPair    = i;\n              bestPos     = Pos;\n            }\n          }\n        }\n\n      } else // a vertical edge\n      {\n        // find the shifting range for the edge (minX~maxX)\n        maxX1 = minX1 = t->branch[n1].x;\n        for (j = 0; j < 3; j++) {\n          x = t->branch[nbr[n1][j]].x;\n          if (x > maxX1)\n            maxX1 = x;\n          else if (x < minX1)\n            minX1 = x;\n        }\n        maxX2 = minX2 = t->branch[n2].x;\n        for (j = 0; j < 3; j++) {\n          x = t->branch[nbr[n2][j]].x;\n          if (x > 
maxX2)\n            maxX2 = x;\n          else if (x < minX2)\n            minX2 = x;\n        }\n        minX = max(minX1, minX2);\n        maxX = min(maxX1, maxX2);\n\n        // find the best position (least total usage) to shift\n        if (minX < maxX) // more than 1 possible positions\n        {\n          for (j = minX; j <= maxX; j++) {\n            costV[j] = 0;\n            for (k = t->branch[n1].y; k < t->branch[n2].y; k++) {\n              costV[j] += v_edges[k * xGrid + j].est_usage;\n            }\n            // add the cost of all edges adjacent to the two steiner nodes\n            for (l = 0; l < nbrCnt[n1]; l++) {\n              n3 = nbr[n1][l];\n              if (n3 != n2) // exclude current edge n1-n2\n              {\n                cost1 = cost2 = 0;\n                if (j < t->branch[n3].x) {\n                  smallX = j;\n                  bigX   = t->branch[n3].x;\n                } else {\n                  smallX = t->branch[n3].x;\n                  bigX   = j;\n                }\n                if (t->branch[n1].y < t->branch[n3].y) {\n                  smallY = t->branch[n1].y;\n                  bigY   = t->branch[n3].y;\n                } else {\n                  smallY = t->branch[n3].y;\n                  bigY   = t->branch[n1].y;\n                }\n                grid1 = smallY * (xGrid - 1);\n                grid2 = bigY * (xGrid - 1);\n                for (m = smallX; m < bigX; m++) {\n                  cost1 += h_edges[grid1 + m].est_usage;\n                  cost2 += h_edges[grid2 + m].est_usage;\n                }\n                grid1 = smallY * xGrid;\n                for (m = smallY; m < bigY; m++) {\n                  cost1 += v_edges[grid1 + bigX].est_usage;\n                  cost2 += v_edges[grid1 + smallX].est_usage;\n                  grid1 += xGrid;\n                }\n                costV[j] += min(cost1, cost2);\n              } // if(n3!=n2)\n            }   // loop l\n            for (l = 0; l < 
nbrCnt[n2]; l++) {\n              n3 = nbr[n2][l];\n              if (n3 != n1) // exclude current edge n1-n2\n              {\n                cost1 = cost2 = 0;\n                if (j < t->branch[n3].x) {\n                  smallX = j;\n                  bigX   = t->branch[n3].x;\n                } else {\n                  smallX = t->branch[n3].x;\n                  bigX   = j;\n                }\n                if (t->branch[n2].y < t->branch[n3].y) {\n                  smallY = t->branch[n2].y;\n                  bigY   = t->branch[n3].y;\n                } else {\n                  smallY = t->branch[n3].y;\n                  bigY   = t->branch[n2].y;\n                }\n                grid1 = smallY * (xGrid - 1);\n                grid2 = bigY * (xGrid - 1);\n                for (m = smallX; m < bigX; m++) {\n                  cost1 += h_edges[grid1 + m].est_usage;\n                  cost2 += h_edges[grid2 + m].est_usage;\n                }\n                grid1 = smallY * xGrid;\n                for (m = smallY; m < bigY; m++) {\n                  cost1 += v_edges[grid1 + bigX].est_usage;\n                  cost2 += v_edges[grid1 + smallX].est_usage;\n                  grid1 += xGrid;\n                }\n                costV[j] += min(cost1, cost2);\n              } // if(n3!=n1)\n            }   // loop l\n          }     // loop j\n          bestCost = BIG_INT;\n          Pos      = t->branch[n1].x;\n          for (j = minX; j <= maxX; j++) {\n            if (costV[j] < bestCost) {\n              bestCost = costV[j];\n              Pos      = j;\n            }\n          }\n          if (Pos != t->branch[n1].x) // find a better position than current\n          {\n            benefit = costV[t->branch[n1].x] - bestCost;\n            if (benefit > bestBenefit) {\n              bestBenefit = benefit;\n              bestPair    = i;\n              bestPos     = Pos;\n            }\n          }\n        }\n\n      } // else (a vertical edge)\n\n    } // 
loop i\n\n    if (bestBenefit > 0) {\n      n1 = pairN1[bestPair];\n      n2 = pairN2[bestPair];\n\n      if (t->branch[n1].y == t->branch[n2].y) // horizontal edge\n      {\n        t->branch[n1].y = bestPos;\n        t->branch[n2].y = bestPos;\n      } // vertical edge\n      else {\n        t->branch[n1].x = bestPos;\n        t->branch[n2].x = bestPos;\n      }\n      numShift++;\n    }\n  } // while(bestBenefit>0)\n\n  free(costH);\n  free(costV);\n\n  return (numShift);\n}\n\n// exchange Steiner nodes at the same position, then call edgeShift()\nint edgeShiftNew(Tree* t) {\n  int i, j, n;\n  int deg, pairCnt, pairN1[MAXNETDEG], pairN2[MAXNETDEG], cur_pairN1,\n      cur_pairN2;\n  int N1nbrH, N1nbrV, N2nbrH, N2nbrV, iter;\n  int numShift;\n  Bool isPair;\n  // printf(\"net[%d]\\n\", net); getchar();\n  numShift = edgeShift(t);\n  deg      = t->deg;\n\n  // if(net==3){printtree(*t);getchar();}\n  iter       = 0;\n  cur_pairN1 = cur_pairN2 = -1;\n  while (iter < 3) {\n    iter++;\n\n    // find all pairs of steiner node at the same position (steiner pairs)\n    pairCnt = 0;\n    for (i = deg; i < 2 * deg - 2; i++) {\n      n = t->branch[i].n;\n      if (n != i && n != t->branch[n].n && t->branch[i].x == t->branch[n].x &&\n          t->branch[i].y == t->branch[n].y) {\n        pairN1[pairCnt] = i;\n        pairN2[pairCnt] = n;\n        pairCnt++;\n      }\n    }\n    // if(net==3){printf(\"# pairs: %d, N1=%d, N2=%d\\n\", pairCnt, pairN1[0],\n    // pairN2[0]);getchar();}\n    if (pairCnt > 0) {\n      if (pairN1[0] != cur_pairN1 ||\n          pairN2[0] != cur_pairN2) // don't try the same as last one\n      {\n        cur_pairN1 = pairN1[0];\n        cur_pairN2 = pairN2[0];\n        isPair     = TRUE;\n      } else if (pairN1[0] == cur_pairN1 && pairN2[0] == cur_pairN2 &&\n                 pairCnt > 1) {\n        cur_pairN1 = pairN1[1];\n        cur_pairN2 = pairN2[1];\n        isPair     = TRUE;\n      } else\n        isPair = FALSE;\n\n      // 
if(net==3){printf(\"isPair: %d, N1=%d, N2=%d\\n\", isPair, cur_pairN1,\n      // cur_pairN2);getchar();}\n      if (isPair) // find a new pair to swap\n      {\n        N1nbrH = N1nbrV = N2nbrH = N2nbrV = -1;\n        // find the nodes directed to cur_pairN1(2 nodes) and cur_pairN2(1\n        // nodes)\n        for (j = 0; j < 2 * deg - 2; j++) {\n          n = t->branch[j].n;\n          if (n == cur_pairN1) {\n            if (t->branch[j].x == t->branch[cur_pairN1].x &&\n                t->branch[j].y != t->branch[cur_pairN1].y)\n              N1nbrV = j;\n            else if (t->branch[j].y == t->branch[cur_pairN1].y &&\n                     t->branch[j].x != t->branch[cur_pairN1].x)\n              N1nbrH = j;\n          } else if (n == cur_pairN2) {\n            if (t->branch[j].x == t->branch[cur_pairN2].x &&\n                t->branch[j].y != t->branch[cur_pairN2].y)\n              N2nbrV = j;\n            else if (t->branch[j].y == t->branch[cur_pairN2].y &&\n                     t->branch[j].x != t->branch[cur_pairN2].x)\n              N2nbrH = j;\n          }\n        }\n        // find the node cur_pairN2 directed to\n        n = t->branch[cur_pairN2].n;\n        if (t->branch[n].x == t->branch[cur_pairN2].x &&\n            t->branch[n].y != t->branch[cur_pairN2].y)\n          N2nbrV = n;\n        else if (t->branch[n].y == t->branch[cur_pairN2].y &&\n                 t->branch[n].x != t->branch[cur_pairN2].x)\n          N2nbrH = n;\n\n        // if(net==3){printf(\"N1=%d, N2=%d, N1nbrH=%d, N1nbrV=%d, N2nbrH=%d,\n        // N2nbrV=%d\\n\", cur_pairN1, cur_pairN2, N1nbrH, N1nbrV, N2nbrH,\n        // N2nbrV);getchar();}\n        if (N1nbrH >= 0 && N2nbrH >= 0) {\n          if (N2nbrH == t->branch[cur_pairN2].n) {\n            t->branch[N1nbrH].n     = cur_pairN2;\n            t->branch[cur_pairN1].n = N2nbrH;\n            t->branch[cur_pairN2].n = cur_pairN1;\n          } else {\n            t->branch[N1nbrH].n = cur_pairN2;\n            t->branch[N2nbrH].n 
= cur_pairN1;\n          }\n          numShift += edgeShift(t);\n        } else if (N1nbrV >= 0 && N2nbrV >= 0) {\n          if (N2nbrV == t->branch[cur_pairN2].n) {\n            t->branch[N1nbrV].n     = cur_pairN2;\n            t->branch[cur_pairN1].n = N2nbrV;\n            t->branch[cur_pairN2].n = cur_pairN1;\n          } else {\n            t->branch[N1nbrV].n = cur_pairN2;\n            t->branch[N2nbrV].n = cur_pairN1;\n          }\n          numShift += edgeShift(t);\n        }\n        // if(net==3){printtree(*t);getchar();}\n      } // if(isPair)\n\n    } // if(pairCnt>0)\n    else\n      iter = 3;\n\n  } // while\n\n  return (numShift);\n}\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/LICENSE",
    "content": "-------------- FLUTE - Version 3.0 -----------------\n                       by\n                Chris C.-N. Chu\n       Dept. of ECpE, Iowa State University\n             Copyright (c) - 2005\n  Iowa State University Research Foundation, Inc.\n----------------------------------------------------\n\n\nBSD 3-Clause License\n\nCopyright (c) 2018, Iowa State University\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n\n* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n\n* Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/README.md",
    "content": "SPRoute\n================================================================================\n\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program performs global routing on a circuit. Please find our ICCAD 2019 paper \"He, Jiayuan, et al. \"SPRoute: A Scalable Parallel Negotiation-based Global Router.\" 2019 IEEE/ACM International Conference on Computer-Aided Design (ICCAD). IEEE, 2019.\" for details.\n\nSPRoute is based on FastRoute 4.1 and consists of four stages: tree decomposition, pattern routing, maze routing and layer assignment. SRoute parallelizes the most time-consuming maze routing stage in a novel hybrid parallel scheme which combines net-level parallelism and fine-grain parallelism. \n\nINPUT\n--------------------------------------------------------------------------------\n\nInput circuit is ISPD2008 contest format. For more information please visit http://www.ispd.cc/contests/08/ispd08rc.html\n\nInput also requires FLUTE files. Please download flute-3.1.tgz from http://home.eng.iastate.edu/~cnchu/flute.html.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/eda/cpu/sproute; make -j sproute-cpu`\n\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n-`$ ./sproute-cpu -ISPD2008Graph <path-to-circuit> --flute <path-to-flute-directory> -t 40`\n\n\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\nPlease find more details in the SPRoute paper.\n\nOn a 28-core machine, SPRoute achieves an average speedup of 11X on overflow-free cases and 3.1X on hard-to-route cases in ISPD2008 benchmarks. \n\n\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/RSMT.h",
    "content": "#ifndef _RSMT_H\n#define _RSMT_H\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <math.h>\n#include \"DataType.h\"\n#include \"flute.h\"\n#include \"DataProc.h\"\n#include \"EdgeShift.h\"\n#include \"utility.h\"\n\n#define FLUTEACCURACY 2\n\nstruct pnt {\n  DTYPE x, y;\n  int o;\n};\n\n// global variable\n// ** V_table;\n// int ** H_table;\n\n/*static int ordery1(const void *a,  const void *b)\n{\n    struct wire *pa, *pb;\n\n    pa = *(struct wire**)a;\n    pb = *(struct wire**)b;\n\n    if (pa->y1 < pb->y1) return -1;\n    if (pa->y1 > pb->y1) return 1;\n    return 0;\n   // return ((struct Segment*)a->x1-(struct Segment*)b->x1);\n\n}\n\nstatic int ordery2(const void *a,  const void *b)\n{\n    struct wire *pa, *pb;\n\n    pa = *(struct wire**)a;\n    pb = *(struct wire**)b;\n\n    if (pa->y2 < pb->y2) return -1;\n    if (pa->y2 > pb->y2) return 1;\n    return 0;\n   // return ((struct Segment*)a->x1-(struct Segment*)b->x1);\n\n}\n\nstatic int orderx1(const void *a,  const void *b)\n{\n    struct wire *pa, *pb;\n\n    pa = *(struct wire**)a;\n    pb = *(struct wire**)b;\n\n    if (pa->x1 < pb->x1) return -1;\n    if (pa->x1 > pb->x1) return 1;\n    return 0;\n   // return ((struct Segment*)a->x1-(struct Segment*)b->x1);\n\n}\n\nstatic int orderx2(const void *a,  const void *b)\n{\n    struct wire *pa, *pb;\n\n    pa = *(struct wire**)a;\n    pb = *(struct wire**)b;\n\n    if (pa->x2 < pb->x2) return -1;\n    if (pa->x2 > pb->x2) return 1;\n    return 0;\n   // return ((struct Segment*)a->x1-(struct Segment*)b->x1);\n\n}*/\n\n/*int orderx(const void *a, const void *b)\n{\n    struct pnt *pa, *pb;\n\n    pa = *(struct pnt**)a;\n    pb = *(struct pnt**)b;\n\n    if (pa->x < pb->x) return -1;\n    if (pa->x > pb->x) return 1;\n    return 0;\n    //return Y(*(struct Segment*)a.x1-*(struct Segment*)b.x1);\n}\n\n\nstatic int ordery(const void *a, const void *b)\n{\n    struct pnt *pa, *pb;\n\n    pa = *(struct pnt**)a;\n    pb = *(struct 
pnt**)b;\n\n    if (pa->y < pb->y) return -1;\n    if (pa->y > pb->y) return 1;\n    return 0;\n}*/\n\n// binary search to map the new coordinates to original coordinates\nint mapxy(int nx, int xs[], int nxs[], int d) {\n  int max, min, mid;\n\n  min = 0;\n  max = d - 1;\n\n  while (min <= max) {\n    mid = (min + max) / 2;\n    if (nx == nxs[mid])\n      return (xs[mid]);\n    if (nx < nxs[mid])\n      max = mid - 1;\n    else\n      min = mid + 1;\n  }\n\n  if (min > max)\n    printf(\"mapping error\\n\");\n  return 0;\n}\n\nvoid copyStTree(int ind, Tree rsmt) {\n  int i, d, numnodes, numedges;\n  int n, x1, y1, x2, y2, edgecnt, nbrcnt[2 * MAXNETDEG];\n  TreeEdge* treeedges;\n  TreeNode* treenodes;\n\n  d                  = rsmt.deg;\n  sttrees[ind].deg   = d;\n  numnodes           = 2 * d - 2;\n  numedges           = 2 * d - 3;\n  sttrees[ind].nodes = (TreeNode*)malloc(numnodes * sizeof(TreeNode));\n  sttrees[ind].edges = (TreeEdge*)malloc(numedges * sizeof(TreeEdge));\n\n  treenodes = sttrees[ind].nodes;\n  treeedges = sttrees[ind].edges;\n\n  // initialize the nbrcnt for treenodes\n  for (i = 0; i < numnodes; i++)\n    nbrcnt[i] = 0;\n\n  // printf(\"tree ind %d\\n\",ind);\n\n  edgecnt = 0;\n  // original rsmt has 2*d-2 branch (one is a loop for root), in StTree 2*d-3\n  // edges (no original loop)\n  for (i = 0; i < numnodes; i++) {\n    x1             = rsmt.branch[i].x;\n    y1             = rsmt.branch[i].y;\n    n              = rsmt.branch[i].n;\n    x2             = rsmt.branch[n].x;\n    y2             = rsmt.branch[n].y;\n    treenodes[i].x = x1;\n    treenodes[i].y = y1;\n    if (i < d) {\n      treenodes[i].status = 2;\n    } else {\n      treenodes[i].status = 0;\n    }\n    if (n != i) // not root\n    {\n      treeedges[edgecnt].len      = ADIFF(x1, x2) + ADIFF(y1, y2);\n      treeedges[edgecnt].n_ripups = 0;\n      // make x1 always less than x2\n      if (x1 < x2) {\n        treeedges[edgecnt].n1 = i;\n        treeedges[edgecnt].n2 = n;\n      
} else {\n        treeedges[edgecnt].n1 = n;\n        treeedges[edgecnt].n2 = i;\n      }\n      treenodes[i].nbr[nbrcnt[i]]  = n;\n      treenodes[i].edge[nbrcnt[i]] = edgecnt;\n      treenodes[n].nbr[nbrcnt[n]]  = i;\n      treenodes[n].edge[nbrcnt[n]] = edgecnt;\n\n      nbrcnt[i]++;\n      nbrcnt[n]++;\n      edgecnt++;\n    }\n    if (nbrcnt[i] > 3 || nbrcnt[n] > 3)\n      printf(\"wrong\\n\");\n  }\n  if (edgecnt != numnodes - 1) {\n    printf(\"copy tree wrong\\n\");\n    printf(\"num edges %d, num nodes %d\\n\", edgecnt, numnodes);\n    exit(0);\n  }\n}\n\nvoid fluteNormal(int netID, int d, DTYPE x[], DTYPE y[], int acc, float coeffV,\n                 Tree* t) {\n  DTYPE *xs, *ys, minval, x_max, x_min, x_mid, y_max, y_min, y_mid, *tmp_xs,\n      *tmp_ys;\n  int* s;\n  int i, j, k, minidx;\n  struct pnt *pt, **ptp, *tmpp;\n\n  if (d == 2) {\n    t->deg         = 2;\n    t->length      = ADIFF(x[0], x[1]) + ADIFF(y[0], y[1]);\n    t->branch      = (Branch*)malloc(2 * sizeof(Branch));\n    t->branch[0].x = x[0];\n    t->branch[0].y = y[0];\n    t->branch[0].n = 1;\n    t->branch[1].x = x[1];\n    t->branch[1].y = y[1];\n    t->branch[1].n = 1;\n  } else if (d == 3) {\n    t->deg = 3;\n    if (x[0] < x[1]) {\n      if (x[0] < x[2]) {\n        x_min = x[0];\n        x_mid = min(x[1], x[2]);\n        x_max = max(x[1], x[2]);\n      } else {\n        x_min = x[2];\n        x_mid = x[0];\n        x_max = x[1];\n      }\n    } else {\n      if (x[0] < x[2]) {\n        x_min = x[1];\n        x_mid = x[0];\n        x_max = x[2];\n      } else {\n        x_min = min(x[1], x[2]);\n        x_mid = max(x[1], x[2]);\n        x_max = x[0];\n      }\n    }\n    if (y[0] < y[1]) {\n      if (y[0] < y[2]) {\n        y_min = y[0];\n        y_mid = min(y[1], y[2]);\n        y_max = max(y[1], y[2]);\n      } else {\n        y_min = y[2];\n        y_mid = y[0];\n        y_max = y[1];\n      }\n    } else {\n      if (y[0] < y[2]) {\n        y_min = y[1];\n        y_mid = y[0];\n  
      y_max = y[2];\n      } else {\n        y_min = min(y[1], y[2]);\n        y_mid = max(y[1], y[2]);\n        y_max = y[0];\n      }\n    }\n\n    t->length      = ADIFF(x_max, x_min) + ADIFF(y_max, y_min);\n    t->branch      = (Branch*)malloc(4 * sizeof(Branch));\n    t->branch[0].x = x[0];\n    t->branch[0].y = y[0];\n    t->branch[0].n = 3;\n    t->branch[1].x = x[1];\n    t->branch[1].y = y[1];\n    t->branch[1].n = 3;\n    t->branch[2].x = x[2];\n    t->branch[2].y = y[2];\n    t->branch[2].n = 3;\n    t->branch[3].x = x_mid;\n    t->branch[3].y = y_mid;\n    t->branch[3].n = 3;\n  } else {\n    xs = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n    ys = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n\n    tmp_xs = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n    tmp_ys = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n\n    s   = (int*)malloc(sizeof(int) * (d));\n    pt  = (struct pnt*)malloc(sizeof(struct pnt) * (d + 1));\n    ptp = (struct pnt**)malloc(sizeof(struct pnt*) * (d + 1));\n\n    for (i = 0; i < d; i++) {\n      pt[i].x = x[i];\n      pt[i].y = y[i];\n      ptp[i]  = &pt[i];\n    }\n    // printf(\"OK here\\n\");\n    // sort x\n\n    if (d < 1000) {\n      for (i = 0; i < d - 1; i++) {\n        minval = ptp[i]->x;\n        minidx = i;\n        for (j = i + 1; j < d; j++) {\n          if (minval > ptp[j]->x) {\n            minval = ptp[j]->x;\n            minidx = j;\n          }\n        }\n        tmpp        = ptp[i];\n        ptp[i]      = ptp[minidx];\n        ptp[minidx] = tmpp;\n      }\n    } else {\n      qsort(ptp, d, sizeof(struct point*), orderx);\n    }\n\n#if REMOVE_DUPLICATE_PIN == 1\n    ptp[d]    = &pt[d];\n    ptp[d]->x = ptp[d]->y = -999999;\n    j                     = 0;\n    for (i = 0; i < d; i++) {\n      for (k = i + 1; ptp[k]->x == ptp[i]->x; k++)\n        if (ptp[k]->y == ptp[i]->y) // pins k and i are the same\n          break;\n      if (ptp[k]->x != ptp[i]->x)\n        ptp[j++] = ptp[i];\n    }\n    d = j;\n#endif\n\n    for (i = 0; i < d; i++) 
{\n      xs[i]     = ptp[i]->x;\n      ptp[i]->o = i;\n    }\n\n    // sort y to find s[]\n    if (d < 1000) {\n      for (i = 0; i < d - 1; i++) {\n        minval = ptp[i]->y;\n        minidx = i;\n        for (j = i + 1; j < d; j++) {\n          if (minval > ptp[j]->y) {\n            minval = ptp[j]->y;\n            minidx = j;\n          }\n        }\n        ys[i]       = ptp[minidx]->y;\n        s[i]        = ptp[minidx]->o;\n        ptp[minidx] = ptp[i];\n      }\n      ys[d - 1] = ptp[d - 1]->y;\n      s[d - 1]  = ptp[d - 1]->o;\n    } else {\n      qsort(ptp, d, sizeof(struct point*), ordery);\n      for (i = 0; i < d; i++) {\n        ys[i] = ptp[i]->y;\n        s[i]  = ptp[i]->o;\n      }\n    }\n\n    gxs[netID] = (DTYPE*)malloc(d * sizeof(DTYPE));\n    gys[netID] = (DTYPE*)malloc(d * sizeof(DTYPE));\n    gs[netID]  = (DTYPE*)malloc(d * sizeof(DTYPE));\n\n    for (i = 0; i < d; i++) {\n      gxs[netID][i] = xs[i];\n      gys[netID][i] = ys[i];\n      gs[netID][i]  = s[i];\n\n      tmp_xs[i] = xs[i] * 100;\n      tmp_ys[i] = ys[i] * ((int)(100 * coeffV));\n    }\n\n    *t = flutes(d, tmp_xs, tmp_ys, s, acc);\n\n    for (i = 0; i < 2 * d - 2; i++) {\n      t->branch[i].x = t->branch[i].x / 100;\n      t->branch[i].y = t->branch[i].y / ((int)(100 * coeffV));\n    }\n\n    free(xs);\n    free(ys);\n    free(tmp_xs);\n    free(tmp_ys);\n    free(s);\n    free(pt);\n    free(ptp);\n  }\n}\n\nvoid fluteCongest(int netID, int d, DTYPE x[], DTYPE y[], int acc, float coeffV,\n                  Tree* t) {\n  DTYPE *xs, *ys, *nxs, *nys, *x_seg, *y_seg, x_max, x_min, x_mid, y_max, y_min,\n      y_mid;\n  int* s;\n  int i, j, k, grid;\n  DTYPE height, width;\n  int usageH, usageV;\n  float coeffH = 1;\n  //\tfloat coeffV = 2;//1.36;//hCapacity/vCapacity;//1;//\n\n  if (d == 2) {\n    t->deg         = 2;\n    t->length      = ADIFF(x[0], x[1]) + ADIFF(y[0], y[1]);\n    t->branch      = (Branch*)malloc(2 * sizeof(Branch));\n    t->branch[0].x = x[0];\n    t->branch[0].y 
= y[0];\n    t->branch[0].n = 1;\n    t->branch[1].x = x[1];\n    t->branch[1].y = y[1];\n    t->branch[1].n = 1;\n  } else if (d == 3) {\n    t->deg = 3;\n    if (x[0] < x[1]) {\n      if (x[0] < x[2]) {\n        x_min = x[0];\n        x_mid = min(x[1], x[2]);\n        x_max = max(x[1], x[2]);\n      } else {\n        x_min = x[2];\n        x_mid = x[0];\n        x_max = x[1];\n      }\n    } else {\n      if (x[0] < x[2]) {\n        x_min = x[1];\n        x_mid = x[0];\n        x_max = x[2];\n      } else {\n        x_min = min(x[1], x[2]);\n        x_mid = max(x[1], x[2]);\n        x_max = x[0];\n      }\n    }\n    if (y[0] < y[1]) {\n      if (y[0] < y[2]) {\n        y_min = y[0];\n        y_mid = min(y[1], y[2]);\n        y_max = max(y[1], y[2]);\n      } else {\n        y_min = y[2];\n        y_mid = y[0];\n        y_max = y[1];\n      }\n    } else {\n      if (y[0] < y[2]) {\n        y_min = y[1];\n        y_mid = y[0];\n        y_max = y[2];\n      } else {\n        y_min = min(y[1], y[2]);\n        y_mid = max(y[1], y[2]);\n        y_max = y[0];\n      }\n    }\n\n    t->length      = ADIFF(x_max, x_min) + ADIFF(y_max, y_min);\n    t->branch      = (Branch*)malloc(4 * sizeof(Branch));\n    t->branch[0].x = x[0];\n    t->branch[0].y = y[0];\n    t->branch[0].n = 3;\n    t->branch[1].x = x[1];\n    t->branch[1].y = y[1];\n    t->branch[1].n = 3;\n    t->branch[2].x = x[2];\n    t->branch[2].y = y[2];\n    t->branch[2].n = 3;\n    t->branch[3].x = x_mid;\n    t->branch[3].y = y_mid;\n    t->branch[3].n = 3;\n  } else {\n    xs    = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n    ys    = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n    nxs   = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n    nys   = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n    x_seg = (DTYPE*)malloc(sizeof(DTYPE) * (d - 1));\n    y_seg = (DTYPE*)malloc(sizeof(DTYPE) * (d - 1));\n    s     = (int*)malloc(sizeof(int) * (d));\n\n    for (i = 0; i < d; i++) {\n      xs[i] = gxs[netID][i];\n      ys[i] = 
gys[netID][i];\n      s[i]  = gs[netID][i];\n    }\n\n    // get the new coordinates considering congestion\n    for (i = 0; i < d - 1; i++) {\n      x_seg[i] = (xs[i + 1] - xs[i]) * 100;\n      y_seg[i] = (ys[i + 1] - ys[i]) * 100;\n    }\n\n    height = ys[d - 1] - ys[0] + 1; // # vertical grids the net span\n    width  = xs[d - 1] - xs[0] + 1; // # horizontal grids the net span\n\n    for (i = 0; i < d - 1; i++) {\n      usageH = 0;\n      for (k = ys[0]; k <= ys[d - 1]; k++) // all grids in the column\n      {\n        grid = k * (xGrid - 1);\n        for (j = xs[i]; j < xs[i + 1]; j++)\n          usageH += (h_edges[grid + j].est_usage + h_edges[grid + j].red);\n      }\n      if (x_seg[i] != 0 && usageH != 0) {\n        x_seg[i] *=\n            coeffH * usageH / ((xs[i + 1] - xs[i]) * height * hCapacity);\n        x_seg[i] = max(1, x_seg[i]); // the segment len is at least 1 if\n                                     // original segment len > 0\n      }\n      usageV = 0;\n      for (j = ys[i]; j < ys[i + 1]; j++) {\n        grid = j * xGrid;\n        for (k = xs[0]; k <= xs[d - 1]; k++) // all grids in the row\n          usageV += (v_edges[grid + k].est_usage + v_edges[grid + k].red);\n      }\n      if (y_seg[i] != 0 && usageV != 0) {\n        y_seg[i] *= coeffV * usageV / ((ys[i + 1] - ys[i]) * width * vCapacity);\n        y_seg[i] = max(1, y_seg[i]); // the segment len is at least 1 if\n                                     // original segment len > 0\n      }\n    }\n\n    nxs[0] = xs[0];\n    nys[0] = ys[0];\n    for (i = 0; i < d - 1; i++) {\n      nxs[i + 1] = nxs[i] + x_seg[i];\n      nys[i + 1] = nys[i] + y_seg[i];\n    }\n\n    (*t) = flutes(d, nxs, nys, s, acc);\n\n    // map the new coordinates back to original coordinates\n    for (i = 0; i < 2 * d - 2; i++) {\n      t->branch[i].x = mapxy(t->branch[i].x, xs, nxs, d);\n      t->branch[i].y = mapxy(t->branch[i].y, ys, nys, d);\n    }\n\n    free(xs);\n    free(ys);\n    free(nxs);\n    free(nys);\n   
 free(x_seg);\n    free(y_seg);\n    free(s);\n  }\n\n  // return t;\n}\n\nBool netCongestion(int netID) {\n  int i, j;\n  int grid, ymin, ymax;\n  //\tBool Congested;\n  Segment* seg;\n\n  for (j = seglistIndex[netID]; j < seglistIndex[netID] + seglistCnt[netID];\n       j++) {\n    seg = &seglist[j];\n\n    if (seg->y1 < seg->y2) {\n      ymin = seg->y1;\n      ymax = seg->y2;\n    } else {\n      ymin = seg->y2;\n      ymax = seg->y1;\n    }\n\n    // remove L routing\n    if (seg->xFirst) {\n      grid = seg->y1 * (xGrid - 1);\n      for (i = seg->x1; i < seg->x2; i++) {\n        if (h_edges[grid + i].est_usage >= h_edges[grid + i].cap) {\n          return (TRUE);\n        }\n      }\n      for (i = ymin; i < ymax; i++) {\n        if (v_edges[i * xGrid + seg->x2].est_usage >=\n            v_edges[i * xGrid + seg->x2].cap) {\n          return (TRUE);\n        }\n      }\n    } else {\n      for (i = ymin; i < ymax; i++) {\n        if (v_edges[i * xGrid + seg->x1].est_usage >=\n            v_edges[i * xGrid + seg->x1].cap) {\n          return (TRUE);\n        }\n      }\n      grid = seg->y2 * (xGrid - 1);\n      for (i = seg->x1; i < seg->x2; i++) {\n        if (h_edges[grid + i].est_usage >= h_edges[grid + i].cap) {\n          return (TRUE);\n        }\n      }\n    }\n  }\n  return (FALSE);\n}\n\nBool VTreeSuite(int netID) {\n  int xmin, xmax, ymin, ymax;\n\n  int i, deg;\n\n  deg  = nets[netID]->deg;\n  xmax = ymax = 0;\n  xmin = ymin = BIG_INT;\n\n  for (i = 0; i < deg; i++) {\n    if (xmin > nets[netID]->pinX[i]) {\n      xmin = nets[netID]->pinX[i];\n    }\n    if (xmax < nets[netID]->pinX[i]) {\n      xmax = nets[netID]->pinX[i];\n    }\n    if (ymin > nets[netID]->pinY[i]) {\n      ymin = nets[netID]->pinY[i];\n    }\n    if (ymax < nets[netID]->pinY[i]) {\n      ymax = nets[netID]->pinY[i];\n    }\n  }\n\n  if ((ymax - ymin) > 3 * (xmax - xmin)) {\n    return (TRUE);\n  } else {\n    return (FALSE);\n  }\n}\n\nBool HTreeSuite(int netID) {\n  int xmin, 
xmax, ymin, ymax;\n\n  int i, deg;\n\n  deg  = nets[netID]->deg;\n  xmax = ymax = 0;\n  xmin = ymin = BIG_INT;\n\n  //\tprintf(\"d %d\\n\",deg);\n\n  for (i = 0; i < deg; i++) {\n    if (xmin > nets[netID]->pinX[i]) {\n      xmin = nets[netID]->pinX[i];\n    }\n    if (xmax < nets[netID]->pinX[i]) {\n      xmax = nets[netID]->pinX[i];\n    }\n    if (ymin > nets[netID]->pinY[i]) {\n      ymin = nets[netID]->pinY[i];\n    }\n    if (ymax < nets[netID]->pinY[i]) {\n      ymax = nets[netID]->pinY[i];\n    }\n  }\n\n  if (5 * (ymax - ymin) < (xmax - xmin)) {\n    return (TRUE);\n  } else {\n    return (FALSE);\n  }\n}\n\nfloat coeffADJ(int netID) {\n  int xmin, xmax, ymin, ymax, Hcap, Vcap;\n  float Husage, Vusage, coef;\n\n  int i, j, deg, grid;\n\n  deg  = nets[netID]->deg;\n  xmax = ymax = 0;\n  xmin = ymin = BIG_INT;\n  Hcap = Vcap = 0;\n  Husage = Vusage = 0;\n\n  //\tprintf(\"d %d\\n\",deg);\n\n  for (i = 0; i < deg; i++) {\n    if (xmin > nets[netID]->pinX[i]) {\n      xmin = nets[netID]->pinX[i];\n    }\n    if (xmax < nets[netID]->pinX[i]) {\n      xmax = nets[netID]->pinX[i];\n    }\n    if (ymin > nets[netID]->pinY[i]) {\n      ymin = nets[netID]->pinY[i];\n    }\n    if (ymax < nets[netID]->pinY[i]) {\n      ymax = nets[netID]->pinY[i];\n    }\n  }\n\n  if (xmin == xmax) {\n    for (j = ymin; j < ymax; j++) {\n      grid = j * xGrid + xmin;\n      Vcap += v_edges[grid].cap;\n      Vusage += v_edges[grid].est_usage;\n    }\n    coef = 1;\n  } else if (ymin == ymax) {\n    for (i = xmin; i < xmax; i++) {\n      grid = ymin * (xGrid - 1) + i;\n      Hcap += h_edges[grid].cap;\n      Husage += h_edges[grid].est_usage;\n    }\n    coef = 1;\n  } else {\n    for (j = ymin; j <= ymax; j++) {\n      for (i = xmin; i < xmax; i++) {\n        grid = j * (xGrid - 1) + i;\n        Hcap += h_edges[grid].cap;\n        Husage += h_edges[grid].est_usage;\n      }\n    }\n    for (j = ymin; j < ymax; j++) {\n      for (i = xmin; i <= xmax; i++) {\n        grid = j * xGrid + 
i;\n        Vcap += v_edges[grid].cap;\n        Vusage += v_edges[grid].est_usage;\n      }\n    }\n    // coef  = (Husage*Vcap)/ (Hcap*Vusage);\n    coef = (Hcap * Vusage) / (Husage * Vcap);\n  }\n\n  if (coef < 1.2) {\n    coef = 1.2;\n  }\n\n  return (coef);\n}\n\nvoid gen_brk_RSMT(Bool congestionDriven, Bool reRoute, Bool genTree,\n                  Bool newType, Bool noADJ) {\n  int i, j, d, n, netID, n1, n2;\n  int x1, y1, x2, y2;\n  int x[MAXNETDEG], y[MAXNETDEG];\n  int segPos, segcnt;\n  Tree rsmt;\n  int wl, wl1, numShift = 0, cnt1, cnt2, cnt3;\n  float coeffV, coefMax, coefMin;\n\n  coefMax = 0;\n  coefMin = BIG_INT;\n\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  int totalnon = 0;\n  Bool cong;\n\n  wl = wl1    = 0;\n  totalNumSeg = 0;\n\n  cnt1 = cnt2 = cnt3 = 0;\n\n  /*if (congestionDriven) {\n      netpinOrderInc();\n  } */\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    i      = netID;\n    coeffV = 1.36;\n\n    if (congestionDriven) {\n\n      coeffV = coeffADJ(i);\n      cong   = netCongestion(i);\n\n    } else {\n      if (HTreeSuite(i)) {\n        coeffV = 1.2;\n      }\n    }\n\n    d = nets[i]->deg;\n    for (j = 0; j < d; j++) {\n      x[j] = nets[i]->pinX[j];\n      y[j] = nets[i]->pinY[j];\n    }\n\n    if (reRoute) {\n      if (newType) {\n        treeedges = sttrees[i].edges;\n        treenodes = sttrees[i].nodes;\n        for (j = 0; j < 2 * d - 3; j++) {\n          if (sttrees[i].edges[j].len >\n              0) // only route the non-degraded edges (len>0)\n          {\n            treeedge = &(treeedges[j]);\n            n1       = treeedge->n1;\n            n2       = treeedge->n2;\n            x1       = treenodes[n1].x;\n            y1       = treenodes[n1].y;\n            x2       = treenodes[n2].x;\n            y2       = treenodes[n2].y;\n            newRipup(treeedge, x1, y1, x2, y2);\n          }\n        }\n      } else {\n        // remove the est_usage due to the segments in this net\n        for 
(j = seglistIndex[i]; j < seglistIndex[i] + seglistCnt[i]; j++) {\n          ripupSegL(&seglist[j]);\n        }\n      }\n    }\n\n    if (noADJ) {\n      coeffV = 1.2;\n    }\n    if (congestionDriven) {\n      // call congestion driven flute to generate RSMT\n      if (cong) {\n        fluteCongest(i, d, x, y, FLUTEACCURACY, coeffV, &rsmt);\n      } else {\n        fluteNormal(i, d, x, y, FLUTEACCURACY, coeffV, &rsmt);\n      }\n      if (d > 3) {\n        numShift += edgeShiftNew(&rsmt);\n      }\n    } else {\n      // call FLUTE to generate RSMT for each net\n      fluteNormal(i, d, x, y, FLUTEACCURACY, coeffV, &rsmt);\n    }\n\n    if (genTree) {\n      copyStTree(i, rsmt);\n    }\n\n    if (congestionDriven) {\n      for (j = 0; j < 2 * d - 3; j++)\n        wl1 += sttrees[i].edges[j].len;\n    }\n\n    segcnt = 0;\n    d      = rsmt.deg;\n    for (j = 0; j < 2 * d - 2; j++) {\n      x1 = rsmt.branch[j].x;\n      y1 = rsmt.branch[j].y;\n      n  = rsmt.branch[j].n;\n      x2 = rsmt.branch[n].x;\n      y2 = rsmt.branch[n].y;\n\n      wl += ADIFF(x1, x2) + ADIFF(y1, y2);\n\n      if (x1 != x2 || y1 != y2) // the branch is not degraded (a point)\n      {\n        segPos =\n            seglistIndex[i] + segcnt; // the position of this segment in seglist\n        if (x1 < x2) {\n          seglist[segPos].x1 = x1;\n          seglist[segPos].x2 = x2;\n          seglist[segPos].y1 = y1;\n          seglist[segPos].y2 = y2;\n        } else {\n          seglist[segPos].x1 = x2;\n          seglist[segPos].x2 = x1;\n          seglist[segPos].y1 = y2;\n          seglist[segPos].y2 = y1;\n        }\n\n        seglist[segPos].netID = i;\n        segcnt++;\n      }\n    } // loop j\n\n    free(rsmt.branch);\n\n    seglistCnt[i] = segcnt; // the number of segments for net i\n    totalNumSeg += segcnt;\n\n    if (reRoute) {\n      // update the est_usage due to the segments in this net\n      newrouteL(\n          i, NOROUTE,\n          TRUE); // route the net with no previous 
route for each tree edge\n    }\n  } // loop i\n\n  printf(\"WIRELEN : %d, WIRELEN1 : %d\\n\", wl, wl1);\n  printf(\"NumSeg  : %d\\n\", totalNumSeg);\n  printf(\"NumShift: %d\\n\", numShift);\n  printf(\"totalnon %d\\n\", totalnon);\n  printf(\"Max %f, Min %f\\n\", coefMax, coefMin);\n  printf(\"cnt1 %d, cnt2 %d, cnt3 %d\\n\", cnt1, cnt2, cnt3);\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/RipUp.h",
    "content": "#ifndef _RIPUP_H_\n#define _RIPUP_H_\n\n#include <stdio.h>\n#include <stdlib.h>\n#include \"DataType.h\"\n#include \"flute.h\"\n#include \"DataProc.h\"\n#include \"route.h\"\n#include \"utility.h\"\n\n// rip-up a L segment\nvoid ripupSegL(Segment* seg) {\n  int i, grid;\n  int ymin, ymax;\n\n  if (seg->y1 < seg->y2) {\n    ymin = seg->y1;\n    ymax = seg->y2;\n  } else {\n    ymin = seg->y2;\n    ymax = seg->y1;\n  }\n\n  // remove L routing\n  if (seg->xFirst) {\n    grid = seg->y1 * (xGrid - 1);\n    for (i = seg->x1; i < seg->x2; i++)\n      h_edges[grid + i].est_usage -= 1;\n    for (i = ymin; i < ymax; i++)\n      v_edges[i * xGrid + seg->x2].est_usage -= 1;\n  } else {\n    for (i = ymin; i < ymax; i++)\n      v_edges[i * xGrid + seg->x1].est_usage -= 1;\n    grid = seg->y2 * (xGrid - 1);\n    for (i = seg->x1; i < seg->x2; i++)\n      h_edges[grid + i].est_usage -= 1;\n  }\n}\n\nvoid ripupSegZ(Segment* seg) {\n  int i, grid;\n  int ymin, ymax;\n\n  if (seg->y1 < seg->y2) {\n    ymin = seg->y1;\n    ymax = seg->y2;\n  } else {\n    ymin = seg->y2;\n    ymax = seg->y1;\n  }\n\n  if (seg->x1 == seg->x2) {\n    // remove V routing\n    for (i = ymin; i < ymax; i++)\n      v_edges[i * xGrid + seg->x1].est_usage -= 1;\n  } else if (seg->y1 == seg->y2) {\n    // remove H routing\n    grid = seg->y1 * (xGrid - 1);\n    for (i = seg->x1; i < seg->x2; i++)\n      h_edges[grid + i].est_usage -= 1;\n  } else {\n    // remove Z routing\n    if (seg->HVH) {\n      grid = seg->y1 * (xGrid - 1);\n      for (i = seg->x1; i < seg->Zpoint; i++)\n        h_edges[grid + i].est_usage -= 1;\n      grid = seg->y2 * (xGrid - 1);\n      for (i = seg->Zpoint; i < seg->x2; i++)\n        h_edges[grid + i].est_usage -= 1;\n      for (i = ymin; i < ymax; i++)\n        v_edges[i * xGrid + seg->Zpoint].est_usage -= 1;\n    } else {\n      if (seg->y1 < seg->y2) {\n        for (i = seg->y1; i < seg->Zpoint; i++)\n          v_edges[i * xGrid + seg->x1].est_usage -= 1;\n        
for (i = seg->Zpoint; i < seg->y2; i++)\n          v_edges[i * xGrid + seg->x2].est_usage -= 1;\n        grid = seg->Zpoint * (xGrid - 1);\n        for (i = seg->x1; i < seg->x2; i++)\n          h_edges[grid + i].est_usage -= 1;\n      } else {\n        for (i = seg->y2; i < seg->Zpoint; i++)\n          v_edges[i * xGrid + seg->x2].est_usage -= 1;\n        for (i = seg->Zpoint; i < seg->y1; i++)\n          v_edges[i * xGrid + seg->x1].est_usage -= 1;\n        grid = seg->Zpoint * (xGrid - 1);\n        for (i = seg->x1; i < seg->x2; i++)\n          h_edges[grid + i].est_usage -= 1;\n      }\n    }\n  }\n}\n\nvoid newRipup(TreeEdge* treeedge, int x1, int y1, int x2, int y2) {\n  short *gridsX, *gridsY;\n  int i, j, grid, Zpoint, ymin, ymax, xmin;\n  RouteType ripuptype;\n\n  if (treeedge->len == 0) {\n    return; // not ripup for degraded edge\n  }\n\n  ripuptype = treeedge->route.type;\n  if (y1 < y2) {\n    ymin = y1;\n    ymax = y2;\n  } else {\n    ymin = y2;\n    ymax = y1;\n  }\n\n  if (ripuptype == LROUTE) // remove L routing\n  {\n    if (treeedge->route.xFirst) {\n      grid = y1 * (xGrid - 1);\n      for (i = x1; i < x2; i++)\n        h_edges[grid + i].est_usage -= 1;\n      for (i = ymin; i < ymax; i++)\n        v_edges[i * xGrid + x2].est_usage -= 1;\n    } else {\n      for (i = ymin; i < ymax; i++)\n        v_edges[i * xGrid + x1].est_usage -= 1;\n      grid = y2 * (xGrid - 1);\n      for (i = x1; i < x2; i++)\n        h_edges[grid + i].est_usage -= 1;\n    }\n  } else if (ripuptype == ZROUTE) {\n    // remove Z routing\n    Zpoint = treeedge->route.Zpoint;\n    if (treeedge->route.HVH) {\n      grid = y1 * (xGrid - 1);\n      for (i = x1; i < Zpoint; i++)\n        h_edges[grid + i].est_usage -= 1;\n      grid = y2 * (xGrid - 1);\n      for (i = Zpoint; i < x2; i++)\n        h_edges[grid + i].est_usage -= 1;\n      for (i = ymin; i < ymax; i++)\n        v_edges[i * xGrid + Zpoint].est_usage -= 1;\n    } else {\n      if (y1 < y2) {\n        for (i = y1; 
i < Zpoint; i++)\n          v_edges[i * xGrid + x1].est_usage -= 1;\n        for (i = Zpoint; i < y2; i++)\n          v_edges[i * xGrid + x2].est_usage -= 1;\n        grid = Zpoint * (xGrid - 1);\n        for (i = x1; i < x2; i++)\n          h_edges[grid + i].est_usage -= 1;\n      } else {\n        for (i = y2; i < Zpoint; i++)\n          v_edges[i * xGrid + x2].est_usage -= 1;\n        for (i = Zpoint; i < y1; i++)\n          v_edges[i * xGrid + x1].est_usage -= 1;\n        grid = Zpoint * (xGrid - 1);\n        for (i = x1; i < x2; i++)\n          h_edges[grid + i].est_usage -= 1;\n      }\n    }\n  } else if (ripuptype == MAZEROUTE) {\n    gridsX = treeedge->route.gridsX;\n    gridsY = treeedge->route.gridsY;\n    for (i = 0; i < treeedge->route.routelen; i++) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin = min(gridsY[i], gridsY[i + 1]);\n        v_edges[ymin * xGrid + gridsX[i]].est_usage -= 1;\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin = min(gridsX[i], gridsX[i + 1]);\n        h_edges[gridsY[i] * (xGrid - 1) + xmin].est_usage -= 1;\n      } else {\n        printf(\"MAZE RIPUP WRONG\\n\");\n        for (j = 0; j < treeedge->route.routelen; j++) {\n          printf(\"x %d y %d\\n\", gridsX[j], gridsY[j]);\n        }\n        exit(1);\n      }\n    }\n  }\n}\n\nBool newRipupType2(TreeEdge* treeedge, TreeNode* treenodes, int x1, int y1,\n                   int x2, int y2, int deg) {\n  int i, grid, ymin, ymax, n1, n2;\n  RouteType ripuptype;\n  Bool needRipup = FALSE;\n\n  if (treeedge->len == 0) {\n    return (FALSE); // not ripup for degraded edge\n  }\n\n  ripuptype = treeedge->route.type;\n  if (y1 < y2) {\n    ymin = y1;\n    ymax = y2;\n  } else {\n    ymin = y2;\n    ymax = y1;\n  }\n\n  if (ripuptype == LROUTE) // remove L routing\n  {\n    if (treeedge->route.xFirst) {\n      grid = y1 * (xGrid - 1);\n      for (i = x1; i < x2; i++) {\n        if (h_edges[grid + 
i].est_usage > h_edges[grid + i].cap) {\n          needRipup = TRUE;\n          break;\n        }\n      }\n\n      for (i = ymin; i < ymax; i++) {\n        if (v_edges[i * xGrid + x2].est_usage > v_edges[i * xGrid + x2].cap) {\n          needRipup = TRUE;\n          break;\n        }\n      }\n    } else {\n      for (i = ymin; i < ymax; i++) {\n        if (v_edges[i * xGrid + x1].est_usage > v_edges[i * xGrid + x1].cap) {\n          needRipup = TRUE;\n          break;\n        }\n      }\n      grid = y2 * (xGrid - 1);\n      for (i = x1; i < x2; i++) {\n        if (h_edges[grid + i].est_usage > v_edges[grid + i].cap) {\n          needRipup = TRUE;\n          break;\n        }\n      }\n    }\n\n    if (needRipup) {\n      n1 = treeedge->n1;\n      n2 = treeedge->n2;\n\n      if (treeedge->route.xFirst) {\n        if (n1 >= deg) {\n          treenodes[n1].status -= 2;\n        }\n        treenodes[n2].status -= 1;\n\n        grid = y1 * (xGrid - 1);\n        for (i = x1; i < x2; i++)\n          h_edges[grid + i].est_usage -= 1;\n        for (i = ymin; i < ymax; i++)\n          v_edges[i * xGrid + x2].est_usage -= 1;\n      } else {\n        if (n2 >= deg) {\n          treenodes[n2].status -= 2;\n        }\n        treenodes[n1].status -= 1;\n\n        for (i = ymin; i < ymax; i++)\n          v_edges[i * xGrid + x1].est_usage -= 1;\n        grid = y2 * (xGrid - 1);\n        for (i = x1; i < x2; i++)\n          h_edges[grid + i].est_usage -= 1;\n      }\n    }\n    return (needRipup);\n\n  } else {\n    printf(\"type2 ripup not type L\\n\");\n    exit(0);\n  }\n}\n\nvoid printEdgeVEC(TreeEdge* treeedge) {\n  int i;\n\n  for (i = 0; i <= treeedge->route.routelen; i++) {\n    printf(\"(%d, %d) \", treeedge->route.gridsX[i], treeedge->route.gridsY[i]);\n  }\n  printf(\"\\n\");\n}\n\nBool newRipupCheckProb(TreeEdge* treeedge, int ripup_threshold, int netID,\n                       int edgeID) {\n  short *gridsX, *gridsY;\n  int i, grid, ymin, xmin;\n  Bool needRipup = 
FALSE;\n\n  if (treeedge->len == 0) {\n    return (FALSE);\n  } // not ripup for degraded edge\n\n  // std::random_device rd;\n  // std::mt19937 g(rd());\n\n  if (treeedge->route.type == MAZEROUTE) {\n    gridsX = treeedge->route.gridsX;\n    gridsY = treeedge->route.gridsY;\n    for (i = 0; i < treeedge->route.routelen; i++) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin         = min(gridsY[i], gridsY[i + 1]);\n        grid         = ymin * xGrid + gridsX[i];\n        int cap      = vCapacity - ripup_threshold - v_edges[grid].red;\n        int overflow = v_edges[grid].usage + v_edges[grid].red - vCapacity -\n                       ripup_threshold;\n        int r = rand();\n        // if(overflow >= 0) printf(\"red %d r %d cap %d %d overflow %d ripup:\n        // %d\\n\", v_edges[grid].red, r, cap, r%cap, overflow, (int)(r%cap <=\n        // overflow));\n        if (overflow >= 0 && (r % cap <= overflow)) {\n          needRipup = TRUE;\n          break;\n        }\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin         = min(gridsX[i], gridsX[i + 1]);\n        grid         = gridsY[i] * (xGrid - 1) + xmin;\n        int cap      = hCapacity - ripup_threshold - h_edges[grid].red;\n        int overflow = h_edges[grid].usage + h_edges[grid].red - hCapacity -\n                       ripup_threshold;\n        int r = rand();\n        // if(overflow >= 0) printf(\"red %d r %d cap %d %d overflow %d ripup:\n        // %d\\n\", h_edges[grid].red, r, cap, r%cap, overflow, (int)(r%cap <=\n        // overflow));\n        if (overflow >= 0 && (r % cap <= overflow)) {\n          needRipup = TRUE;\n          break;\n        }\n      }\n    }\n\n    if (needRipup) {\n\n      for (i = 0; i < treeedge->route.routelen; i++) {\n        if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n        {\n          ymin = min(gridsY[i], gridsY[i + 1]);\n          // v_edges[ymin*xGrid+gridsX[i]].usage -= 1;\n        
  v_edges[ymin * xGrid + gridsX[i]].usage.fetch_sub(\n              (short unsigned)1, std::memory_order_relaxed);\n        } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n        {\n          xmin = min(gridsX[i], gridsX[i + 1]);\n          // h_edges[gridsY[i]*(xGrid-1)+xmin].usage -= 1;\n          h_edges[gridsY[i] * (xGrid - 1) + xmin].usage.fetch_sub(\n              (short unsigned)1, std::memory_order_relaxed);\n        }\n      }\n\n      return (TRUE);\n    } else {\n      return (FALSE);\n    }\n  } else {\n    printf(\"route type is not maze, netID %d\\n\", netID);\n    fflush(stdout);\n    printEdge(netID, edgeID);\n\n    exit(0);\n  }\n}\n\nBool newRipupCheck(TreeEdge* treeedge, int ripup_threshold, int netID,\n                   int edgeID) {\n  short *gridsX, *gridsY;\n  int i, grid, ymin, xmin;\n  Bool needRipup = FALSE;\n\n  if (treeedge->len == 0) {\n    return (FALSE);\n  } // not ripup for degraded edge\n\n  if (treeedge->route.type == MAZEROUTE) {\n    gridsX = treeedge->route.gridsX;\n    gridsY = treeedge->route.gridsY;\n    for (i = 0; i < treeedge->route.routelen; i++) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin = min(gridsY[i], gridsY[i + 1]);\n        grid = ymin * xGrid + gridsX[i];\n        if (v_edges[grid].usage + v_edges[grid].red >=\n            vCapacity - ripup_threshold) {\n          needRipup = TRUE;\n          break;\n        }\n\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin = min(gridsX[i], gridsX[i + 1]);\n        grid = gridsY[i] * (xGrid - 1) + xmin;\n        if (h_edges[grid].usage + h_edges[grid].red >=\n            hCapacity - ripup_threshold) {\n          needRipup = TRUE;\n          break;\n        }\n      }\n    }\n\n    if (needRipup) {\n      for (i = 0; i < treeedge->route.routelen; i++) {\n        if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n        {\n          ymin = min(gridsY[i], gridsY[i + 1]);\n          // 
v_edges[ymin*xGrid+gridsX[i]].usage -= 1;\n          v_edges[ymin * xGrid + gridsX[i]].usage.fetch_sub(\n              (short unsigned)1, std::memory_order_relaxed);\n        } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n        {\n          xmin = min(gridsX[i], gridsX[i + 1]);\n          // h_edges[gridsY[i]*(xGrid-1)+xmin].usage -= 1;\n          h_edges[gridsY[i] * (xGrid - 1) + xmin].usage.fetch_sub(\n              (short unsigned)1, std::memory_order_relaxed);\n        }\n      }\n\n      return (TRUE);\n    } else {\n      return (FALSE);\n    }\n  } else {\n    printf(\"route type is not maze, netID %d\\n\", netID);\n    fflush(stdout);\n    printEdge(netID, edgeID);\n\n    exit(0);\n  }\n}\n\nBool newRipupCheck_M1M2(TreeEdge* treeedge, int ripup_threshold, int netID,\n                        int edgeID) {\n  short *gridsX, *gridsY;\n  int i, grid, ymin, xmin;\n  Bool needRipup = FALSE;\n\n  if (treeedge->len == 0) {\n    return (FALSE);\n  } // not ripup for degraded edge\n\n  if (treeedge->route.type == MAZEROUTE) {\n    gridsX = treeedge->route.gridsX;\n    gridsY = treeedge->route.gridsY;\n    for (i = 0; i < treeedge->route.routelen; i++) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin = min(gridsY[i], gridsY[i + 1]);\n        grid = ymin * xGrid + gridsX[i];\n        if (v_edges[grid].usage + v_edges[grid].red >=\n                vCapacity - ripup_threshold &&\n            treeedge->n_ripups <= v_edges[grid].max_have_rippedups) {\n          v_edges[grid].ripups_cur_round = true;\n          needRipup                      = TRUE;\n          break;\n        }\n\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin = min(gridsX[i], gridsX[i + 1]);\n        grid = gridsY[i] * (xGrid - 1) + xmin;\n        if (h_edges[grid].usage + h_edges[grid].red >=\n                hCapacity - ripup_threshold &&\n            treeedge->n_ripups <= h_edges[grid].max_have_rippedups) {\n       
   h_edges[grid].ripups_cur_round = true;\n          needRipup                      = TRUE;\n          break;\n        }\n      }\n    }\n\n    if (needRipup) {\n      for (i = 0; i < treeedge->route.routelen; i++) {\n        if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n        {\n          ymin = min(gridsY[i], gridsY[i + 1]);\n          // v_edges[ymin*xGrid+gridsX[i]].usage -= 1;\n          v_edges[ymin * xGrid + gridsX[i]].usage.fetch_sub(\n              (short unsigned)1, std::memory_order_relaxed);\n        } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n        {\n          xmin = min(gridsX[i], gridsX[i + 1]);\n          // h_edges[gridsY[i]*(xGrid-1)+xmin].usage -= 1;\n          h_edges[gridsY[i] * (xGrid - 1) + xmin].usage.fetch_sub(\n              (short unsigned)1, std::memory_order_relaxed);\n        }\n      }\n\n      return (TRUE);\n    } else {\n      return (FALSE);\n    }\n  } else {\n    printf(\"route type is not maze, netID %d\\n\", netID);\n    fflush(stdout);\n    printEdge(netID, edgeID);\n\n    exit(0);\n  }\n}\n\nBool newRipupCheck_atomic(TreeEdge* treeedge, int ripup_threshold, int netID,\n                          int edgeID) {\n  short *gridsX, *gridsY;\n  int i, grid, ymin, xmin;\n  Bool needRipup = FALSE;\n  int break_edge = 0;\n\n  if (treeedge->len == 0) {\n    return (FALSE);\n  } // not ripup for degraded edge\n  // std::cout << \" atomic ripup\" << std::endl;\n  if (treeedge->route.type == MAZEROUTE) {\n    gridsX = treeedge->route.gridsX;\n    gridsY = treeedge->route.gridsY;\n    for (i = 0; i < treeedge->route.routelen; i++) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin          = min(gridsY[i], gridsY[i + 1]);\n        grid          = ymin * xGrid + gridsX[i];\n        int old_usage = v_edges[grid].usage;\n\n        while (old_usage + v_edges[grid].red >= vCapacity - ripup_threshold) {\n          if (v_edges[grid].usage.compare_exchange_weak(old_usage,\n                  
                                      old_usage - 1)) {\n            break_edge = i;\n            needRipup  = TRUE;\n            break;\n          }\n          old_usage = v_edges[grid].usage;\n        }\n        if (needRipup)\n          break;\n\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin          = min(gridsX[i], gridsX[i + 1]);\n        grid          = gridsY[i] * (xGrid - 1) + xmin;\n        int old_usage = h_edges[grid].usage;\n\n        while (old_usage + h_edges[grid].red >= hCapacity - ripup_threshold) {\n          if (h_edges[grid].usage.compare_exchange_weak(old_usage,\n                                                        old_usage - 1)) {\n            break_edge = i;\n            needRipup  = TRUE;\n            break;\n          }\n          old_usage = h_edges[grid].usage;\n        }\n        if (needRipup)\n          break;\n      }\n    }\n\n    if (needRipup) {\n      for (i = 0; i < treeedge->route.routelen; i++) {\n        if (i == break_edge)\n          continue;\n        if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n        {\n          ymin = min(gridsY[i], gridsY[i + 1]);\n          // v_edges[ymin*xGrid+gridsX[i]].usage -= 1;\n          v_edges[ymin * xGrid + gridsX[i]].usage.fetch_sub(\n              (short unsigned)1, std::memory_order_relaxed);\n        } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n        {\n          xmin = min(gridsX[i], gridsX[i + 1]);\n          // h_edges[gridsY[i]*(xGrid-1)+xmin].usage -= 1;\n          h_edges[gridsY[i] * (xGrid - 1) + xmin].usage.fetch_sub(\n              (short unsigned)1, std::memory_order_relaxed);\n        }\n      }\n\n      return (TRUE);\n    } else {\n      return (FALSE);\n    }\n  } else {\n    printf(\"route type is not maze, netID %d\\n\", netID);\n    fflush(stdout);\n    printEdge(netID, edgeID);\n\n    exit(0);\n  }\n}\n\nBool newRipupCheck_sort(TreeEdge* treeedge, int ripup_threshold, int netID,\n                   
     int edgeID, bool& is_horizontal, int& grid_pos) {\n  short *gridsX, *gridsY;\n  int i, grid, ymin, xmin;\n  Bool needRipup  = FALSE;\n  treeedge->ripup = false;\n  if (treeedge->len == 0) {\n    return (FALSE);\n  } // not ripup for degraded edge\n\n  if (treeedge->route.type == MAZEROUTE) {\n    gridsX = treeedge->route.gridsX;\n    gridsY = treeedge->route.gridsY;\n    for (i = 0; i < treeedge->route.routelen; i++) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin = min(gridsY[i], gridsY[i + 1]);\n        grid = ymin * xGrid + gridsX[i];\n        if (v_edges[grid].usage + v_edges[grid].red >=\n            vCapacity - ripup_threshold) {\n          needRipup     = TRUE;\n          is_horizontal = false;\n          grid_pos      = grid;\n          break;\n        }\n\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin = min(gridsX[i], gridsX[i + 1]);\n        grid = gridsY[i] * (xGrid - 1) + xmin;\n        if (h_edges[grid].usage + h_edges[grid].red >=\n            hCapacity - ripup_threshold) {\n          needRipup     = TRUE;\n          is_horizontal = true;\n          grid_pos      = grid;\n          break;\n        }\n      }\n    }\n\n    if (needRipup) {\n      /*for(i=0; i<treeedge->route.routelen; i++)\n      {\n          if(gridsX[i]==gridsX[i+1]) // a vertical edge\n          {\n              ymin = min(gridsY[i], gridsY[i+1]);\n              if(netID == 2 && edgeID == 21)\n                  printf(\"i %d x %d y %d\\n\", i, gridsX[i], gridsY[i]);\n              //v_edges[ymin*xGrid+gridsX[i]].usage -= 1;\n              //v_edges[ymin*xGrid+gridsX[i]].usage.fetch_sub((short unsigned)1,\n      std::memory_order_relaxed);\n          }\n          else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n          {\n              xmin = min(gridsX[i], gridsX[i+1]);\n              if(netID == 2 && edgeID == 21)\n                  printf(\"i %d x %d y %d\\n\", i, gridsX[i], gridsY[i]);\n   
           //h_edges[gridsY[i]*(xGrid-1)+xmin].usage -= 1;\n              //h_edges[gridsY[i]*(xGrid-1)+xmin].usage.fetch_sub((short\n      unsigned)1, std::memory_order_relaxed);\n          }\n      }*/\n\n      return (TRUE);\n    } else {\n      return (FALSE);\n    }\n  } else {\n    printf(\"route type is not maze, netID %d\\n\", netID);\n    fflush(stdout);\n    printEdge(netID, edgeID);\n\n    exit(0);\n  }\n}\n\nBool newRipupCheck_nosub(TreeEdge* treeedge, int ripup_threshold, int netID,\n                         int edgeID) {\n  short *gridsX, *gridsY;\n  int i, grid, ymin, xmin;\n  Bool needRipup = FALSE;\n\n  if (treeedge->len == 0) {\n    return (FALSE);\n  } // not ripup for degraded edge\n\n  if (treeedge->route.type == MAZEROUTE) {\n    gridsX = treeedge->route.gridsX;\n    gridsY = treeedge->route.gridsY;\n    for (i = 0; i < treeedge->route.routelen; i++) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin = min(gridsY[i], gridsY[i + 1]);\n        grid = ymin * xGrid + gridsX[i];\n        if (v_edges[grid].usage + v_edges[grid].red >=\n            vCapacity - ripup_threshold) {\n          needRipup = TRUE;\n          break;\n        }\n\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin = min(gridsX[i], gridsX[i + 1]);\n        grid = gridsY[i] * (xGrid - 1) + xmin;\n        if (h_edges[grid].usage + h_edges[grid].red >=\n            hCapacity - ripup_threshold) {\n          needRipup = TRUE;\n          break;\n        }\n      }\n    }\n\n    if (needRipup) {\n      /*for(i=0; i<treeedge->route.routelen; i++)\n      {\n          if(pre_gridsX[i]==pre_gridsX[i+1]) // a vertical edge\n          {\n              ymin = min(pre_gridsY[i], pre_gridsY[i+1]);\n              printf(\"nosub x y %d %d i %d\\n\", pre_gridsX[i], ymin, i);\n              //v_edges[ymin*xGrid+gridsX[i]].usage -= 1;\n              //v_edges[ymin*xGrid+gridsX[i]].usage.fetch_sub((short unsigned)1,\n      
std::memory_order_relaxed);\n          }\n          else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n          {\n              xmin = min(pre_gridsX[i], pre_gridsX[i+1]);\n              printf(\"nosub x y %d %d i %d \\n\", xmin, pre_gridsY[i], i);\n              //h_edges[gridsY[i]*(xGrid-1)+xmin].usage -= 1;\n              //h_edges[gridsY[i]*(xGrid-1)+xmin].usage.fetch_sub((short\n      unsigned)1, std::memory_order_relaxed);\n          }\n      }*/\n\n      return (TRUE);\n    } else {\n      return (FALSE);\n    }\n  } else {\n    printf(\"route type is not maze, netID %d\\n\", netID);\n    fflush(stdout);\n    printEdge(netID, edgeID);\n\n    exit(0);\n  }\n}\n\nBool newRipup3DType3(int netID, int edgeID) {\n  short *gridsX, *gridsY, *gridsL;\n  int i, k, grid, ymin, xmin, n1a, n2a, hl, bl, hid, bid, deg;\n\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  treeedges = sttrees[netID].edges;\n  treeedge  = &(treeedges[edgeID]);\n\n  if (treeedge->len == 0) {\n    return (FALSE); // not ripup for degraded edge\n  }\n\n  treenodes = sttrees[netID].nodes;\n\n  deg = sttrees[netID].deg;\n\n  n1a = treeedge->n1a;\n  n2a = treeedge->n2a;\n\n  if (n1a < deg) {\n    bl = 0;\n  } else {\n    bl = BIG_INT;\n  }\n  hl  = 0;\n  hid = bid = BIG_INT;\n\n  for (i = 0; i < treenodes[n1a].conCNT; i++) {\n    if (treenodes[n1a].eID[i] == edgeID) {\n      for (k = i + 1; k < treenodes[n1a].conCNT; k++) {\n        treenodes[n1a].eID[k - 1]     = treenodes[n1a].eID[k];\n        treenodes[n1a].heights[k - 1] = treenodes[n1a].heights[k];\n        if (bl > treenodes[n1a].heights[k]) {\n          bl  = treenodes[n1a].heights[k];\n          bid = treenodes[n1a].eID[k];\n        }\n        if (hl < treenodes[n1a].heights[k]) {\n          hl  = treenodes[n1a].heights[k];\n          hid = treenodes[n1a].eID[k];\n        }\n      }\n      break;\n    } else {\n      if (bl > treenodes[n1a].heights[i]) {\n        bl  = treenodes[n1a].heights[i];\n        bid = 
treenodes[n1a].eID[i];\n      }\n      if (hl < treenodes[n1a].heights[i]) {\n        hl  = treenodes[n1a].heights[i];\n        hid = treenodes[n1a].eID[i];\n      }\n    }\n  }\n  treenodes[n1a].conCNT--;\n\n  treenodes[n1a].botL = bl;\n  treenodes[n1a].lID  = bid;\n  treenodes[n1a].topL = hl;\n  treenodes[n1a].hID  = hid;\n\n  if (n2a < deg) {\n    bl = 0;\n  } else {\n    bl = BIG_INT;\n  }\n  hl  = 0;\n  hid = bid = BIG_INT;\n\n  for (i = 0; i < treenodes[n2a].conCNT; i++) {\n    if (treenodes[n2a].eID[i] == edgeID) {\n      for (k = i + 1; k < treenodes[n2a].conCNT; k++) {\n        treenodes[n2a].eID[k - 1]     = treenodes[n2a].eID[k];\n        treenodes[n2a].heights[k - 1] = treenodes[n2a].heights[k];\n        if (bl > treenodes[n2a].heights[k]) {\n          bl  = treenodes[n2a].heights[k];\n          bid = treenodes[n2a].eID[k];\n        }\n        if (hl < treenodes[n2a].heights[k]) {\n          hl  = treenodes[n2a].heights[k];\n          hid = treenodes[n2a].eID[k];\n        }\n      }\n      break;\n    } else {\n      if (bl > treenodes[n2a].heights[i]) {\n        bl  = treenodes[n2a].heights[i];\n        bid = treenodes[n2a].eID[i];\n      }\n      if (hl < treenodes[n2a].heights[i]) {\n        hl  = treenodes[n2a].heights[i];\n        hid = treenodes[n2a].eID[i];\n      }\n    }\n  }\n  treenodes[n2a].conCNT--;\n\n  treenodes[n2a].botL = bl;\n  treenodes[n2a].lID  = bid;\n  treenodes[n2a].topL = hl;\n  treenodes[n2a].hID  = hid;\n\n  gridsX = treeedge->route.gridsX;\n  gridsY = treeedge->route.gridsY;\n  gridsL = treeedge->route.gridsL;\n  for (i = 0; i < treeedge->route.routelen; i++) {\n    if (gridsL[i] == gridsL[i + 1]) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin = min(gridsY[i], gridsY[i + 1]);\n        grid = gridsL[i] * gridV + ymin * xGrid + gridsX[i];\n        v_edges3D[grid].usage -= 1;\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin = min(gridsX[i], gridsX[i + 
1]);\n        grid = gridsL[i] * gridH + gridsY[i] * (xGrid - 1) + xmin;\n        h_edges3D[grid].usage -= 1;\n      } else {\n        printf(\"MAZE RIPUP WRONG\\n\");\n        return (FALSE);\n        // exit(1);\n      }\n    }\n  }\n\n  return (TRUE);\n}\n\nvoid newRipupNet(int netID) {\n  short *gridsX, *gridsY;\n  int i, j, grid, Zpoint, ymin, ymax, xmin, n1, n2, edgeID;\n\n  RouteType ripuptype;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n  int x1, y1, x2, y2, deg;\n\n  treeedges = sttrees[netID].edges;\n  treenodes = sttrees[netID].nodes;\n  deg       = sttrees[netID].deg;\n\n  for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n    treeedge = &(treeedges[edgeID]);\n    if (treeedge->len > 0) {\n\n      n1 = treeedge->n1;\n      n2 = treeedge->n2;\n      x1 = treenodes[n1].x;\n      y1 = treenodes[n1].y;\n      x2 = treenodes[n2].x;\n      y2 = treenodes[n2].y;\n\n      ripuptype = treeedge->route.type;\n      if (y1 < y2) {\n        ymin = y1;\n        ymax = y2;\n      } else {\n        ymin = y2;\n        ymax = y1;\n      }\n\n      if (ripuptype == LROUTE) // remove L routing\n      {\n        if (treeedge->route.xFirst) {\n          grid = y1 * (xGrid - 1);\n          for (i = x1; i < x2; i++)\n            h_edges[grid + i].est_usage -= 1;\n          for (i = ymin; i < ymax; i++)\n            v_edges[i * xGrid + x2].est_usage -= 1;\n        } else {\n          for (i = ymin; i < ymax; i++)\n            v_edges[i * xGrid + x1].est_usage -= 1;\n          grid = y2 * (xGrid - 1);\n          for (i = x1; i < x2; i++)\n            h_edges[grid + i].est_usage -= 1;\n        }\n      } else if (ripuptype == ZROUTE) {\n        // remove Z routing\n        Zpoint = treeedge->route.Zpoint;\n        if (treeedge->route.HVH) {\n          grid = y1 * (xGrid - 1);\n          for (i = x1; i < Zpoint; i++)\n            h_edges[grid + i].est_usage -= 1;\n          grid = y2 * (xGrid - 1);\n          for (i = Zpoint; i < x2; i++)\n            
h_edges[grid + i].est_usage -= 1;\n          for (i = ymin; i < ymax; i++)\n            v_edges[i * xGrid + Zpoint].est_usage -= 1;\n        } else {\n          if (y1 < y2) {\n            for (i = y1; i < Zpoint; i++)\n              v_edges[i * xGrid + x1].est_usage -= 1;\n            for (i = Zpoint; i < y2; i++)\n              v_edges[i * xGrid + x2].est_usage -= 1;\n            grid = Zpoint * (xGrid - 1);\n            for (i = x1; i < x2; i++)\n              h_edges[grid + i].est_usage -= 1;\n          } else {\n            for (i = y2; i < Zpoint; i++)\n              v_edges[i * xGrid + x2].est_usage -= 1;\n            for (i = Zpoint; i < y1; i++)\n              v_edges[i * xGrid + x1].est_usage -= 1;\n            grid = Zpoint * (xGrid - 1);\n            for (i = x1; i < x2; i++)\n              h_edges[grid + i].est_usage -= 1;\n          }\n        }\n      } else if (ripuptype == MAZEROUTE) {\n        gridsX = treeedge->route.gridsX;\n        gridsY = treeedge->route.gridsY;\n        for (i = 0; i < treeedge->route.routelen; i++) {\n          if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n          {\n            ymin = min(gridsY[i], gridsY[i + 1]);\n            v_edges[ymin * xGrid + gridsX[i]].est_usage -= 1;\n          } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n          {\n            xmin = min(gridsX[i], gridsX[i + 1]);\n            h_edges[gridsY[i] * (xGrid - 1) + xmin].est_usage -= 1;\n          } else {\n            printf(\"MAZE RIPUP WRONG in newRipupNet\\n\");\n            for (j = 0; j < treeedge->route.routelen; j++) {\n              printf(\"x %d y %d\\n\", gridsX[j], gridsY[j]);\n              // if(gridsX[i]!=gridsX[i+1] && gridsY[i]==gridsY[i+1])\n            }\n            // exit(1);\n          }\n        }\n      }\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/bitmap_image.hpp",
    "content": "/*\n *****************************************************************************\n *                                                                           *\n *                          Platform Independent                             *\n *                    Bitmap Image Reader Writer Library                     *\n *                                                                           *\n * Author: Arash Partow - 2002                                               *\n * URL: http://partow.net/programming/bitmap/index.html                      *\n *                                                                           *\n * Note: This library only supports 24-bits per pixel bitmap format files.   *\n *                                                                           *\n * Copyright notice:                                                         *\n * Free use of the Platform Independent Bitmap Image Reader Writer Library   *\n * is permitted under the guidelines and in accordance with the most current *\n * version of the MIT License.                                               
*\n * http://www.opensource.org/licenses/MIT                                    *\n *                                                                           *\n *****************************************************************************\n*/\n\n\n#ifndef INCLUDE_BITMAP_IMAGE_HPP\n#define INCLUDE_BITMAP_IMAGE_HPP\n\n#include <algorithm>\n#include <cmath>\n#include <cstdlib>\n#include <cstring>\n#include <fstream>\n#include <iostream>\n#include <iterator>\n#include <limits>\n#include <string>\n#include <vector>\n\n\nclass bitmap_image\n{\npublic:\n\n   enum channel_mode {\n                        rgb_mode = 0,\n                        bgr_mode = 1\n                     };\n\n   enum color_plane {\n                       blue_plane  = 0,\n                       green_plane = 1,\n                       red_plane   = 2\n                    };\n\n   struct rgb_t\n   {\n      unsigned char   red;\n      unsigned char green;\n      unsigned char  blue;\n   };\n\n   bitmap_image()\n   : file_name_(\"\"),\n     width_          (0),\n     height_         (0),\n     row_increment_  (0),\n     bytes_per_pixel_(3),\n     channel_mode_(bgr_mode)\n   {}\n\n   bitmap_image(const std::string& filename)\n   : file_name_(filename),\n     width_          (0),\n     height_         (0),\n     row_increment_  (0),\n     bytes_per_pixel_(0),\n     channel_mode_(bgr_mode)\n   {\n      load_bitmap();\n   }\n\n   bitmap_image(const unsigned int width, const unsigned int height)\n   : file_name_(\"\"),\n     width_ (width ),\n     height_(height),\n     row_increment_  (0),\n     bytes_per_pixel_(3),\n     channel_mode_(bgr_mode)\n   {\n      create_bitmap();\n   }\n\n   bitmap_image(const bitmap_image& image)\n   : file_name_(image.file_name_),\n     width_    (image.width_    ),\n     height_   (image.height_   ),\n     row_increment_  (0),\n     bytes_per_pixel_(3),\n     channel_mode_(bgr_mode)\n   {\n      create_bitmap();\n      data_ = image.data_;\n   }\n\n   bitmap_image& 
operator=(const bitmap_image& image)\n   {\n      if (this != &image)\n      {\n         file_name_       = image.file_name_;\n         bytes_per_pixel_ = image.bytes_per_pixel_;\n         width_           = image.width_;\n         height_          = image.height_;\n         row_increment_   = 0;\n         channel_mode_    = image.channel_mode_;\n         create_bitmap();\n         data_ = image.data_;\n      }\n\n      return *this;\n   }\n\n   inline bool operator!()\n   {\n      return (data_.size()   == 0) ||\n             (width_         == 0) ||\n             (height_        == 0) ||\n             (row_increment_ == 0);\n   }\n\n   inline void clear(const unsigned char v = 0x00)\n   {\n      std::fill(data_.begin(), data_.end(), v);\n   }\n\n   inline unsigned char red_channel(const unsigned int x, const unsigned int y) const\n   {\n      return data_[(y * row_increment_) + (x * bytes_per_pixel_ + 2)];\n   }\n\n   inline unsigned char green_channel(const unsigned int x, const unsigned int y) const\n   {\n      return data_[(y * row_increment_) + (x * bytes_per_pixel_ + 1)];\n   }\n\n   inline unsigned char blue_channel (const unsigned int x, const unsigned int y) const\n   {\n      return data_[(y * row_increment_) + (x * bytes_per_pixel_ + 0)];\n   }\n\n   inline void red_channel(const unsigned int x, const unsigned int y, const unsigned char value)\n   {\n      data_[(y * row_increment_) + (x * bytes_per_pixel_ + 2)] = value;\n   }\n\n   inline void green_channel(const unsigned int x, const unsigned int y, const unsigned char value)\n   {\n      data_[(y * row_increment_) + (x * bytes_per_pixel_ + 1)] = value;\n   }\n\n   inline void blue_channel (const unsigned int x, const unsigned int y, const unsigned char value)\n   {\n      data_[(y * row_increment_) + (x * bytes_per_pixel_ + 0)] = value;\n   }\n\n   inline unsigned char* row(unsigned int row_index) const\n   {\n      return const_cast<unsigned char*>(&data_[(row_index * row_increment_)]);\n   }\n\n   
inline void get_pixel(const unsigned int x, const unsigned int y,\n                         unsigned char& red,\n                         unsigned char& green,\n                         unsigned char& blue) const\n   {\n      const unsigned int y_offset = y * row_increment_;\n      const unsigned int x_offset = x * bytes_per_pixel_;\n      const unsigned int offset   = y_offset + x_offset;\n\n      blue  = data_[offset + 0];\n      green = data_[offset + 1];\n      red   = data_[offset + 2];\n   }\n\n   template <typename RGB>\n   inline void get_pixel(const unsigned int x, const unsigned int y, RGB& colour) const\n   {\n      get_pixel(x, y, colour.red, colour.green, colour.blue);\n   }\n\n   inline rgb_t get_pixel(const unsigned int x, const unsigned int y) const\n   {\n      rgb_t colour;\n      get_pixel(x, y, colour.red, colour.green, colour.blue);\n      return colour;\n   }\n\n   inline void set_pixel(const unsigned int x, const unsigned int y,\n                         const unsigned char red,\n                         const unsigned char green,\n                         const unsigned char blue)\n   {\n      const unsigned int y_offset = y * row_increment_;\n      const unsigned int x_offset = x * bytes_per_pixel_;\n      const unsigned int offset   = y_offset + x_offset;\n\n      data_[offset + 0] = blue;\n      data_[offset + 1] = green;\n      data_[offset + 2] = red;\n   }\n\n   template <typename RGB>\n   inline void set_pixel(const unsigned int x, const unsigned int y, const RGB& colour)\n   {\n      set_pixel(x, y, colour.red, colour.green, colour.blue);\n   }\n\n   inline bool copy_from(const bitmap_image& image)\n   {\n      if (\n           (image.height_ != height_) ||\n           (image.width_  != width_ )\n         )\n      {\n         return false;\n      }\n\n      data_ = image.data_;\n\n      return true;\n   }\n\n   inline bool copy_from(const bitmap_image& source_image,\n                         const unsigned int& x_offset,\n            
             const unsigned int& y_offset)\n   {\n      if ((x_offset + source_image.width_ ) > width_ ) { return false; }\n      if ((y_offset + source_image.height_) > height_) { return false; }\n\n      for (unsigned int y = 0; y < source_image.height_; ++y)\n      {\n         unsigned char* itr1           = row(y + y_offset) + x_offset * bytes_per_pixel_;\n         const unsigned char* itr2     = source_image.row(y);\n         const unsigned char* itr2_end = itr2 + source_image.width_ * bytes_per_pixel_;\n\n         std::copy(itr2, itr2_end, itr1);\n      }\n\n      return true;\n   }\n\n   inline bool region(const unsigned int& x     ,\n                      const unsigned int& y     ,\n                      const unsigned int& width ,\n                      const unsigned int& height,\n                      bitmap_image& dest_image  ) const\n   {\n      if ((x + width ) > width_ ) { return false; }\n      if ((y + height) > height_) { return false; }\n\n      if (\n           (dest_image.width_  < width_ ) ||\n           (dest_image.height_ < height_)\n         )\n      {\n         dest_image.setwidth_height(width,height);\n      }\n\n      for (unsigned int r = 0; r < height; ++r)\n      {\n         unsigned char* itr1     = row(r + y) + x * bytes_per_pixel_;\n         unsigned char* itr1_end = itr1 + (width * bytes_per_pixel_);\n         unsigned char* itr2     = dest_image.row(r);\n\n         std::copy(itr1, itr1_end, itr2);\n      }\n\n      return true;\n   }\n\n   inline bool roi_from_center(const unsigned int& cx    ,\n                               const unsigned int& cy    ,\n                               const unsigned int& width ,\n                               const unsigned int& height,\n                               bitmap_image& dest_image  ) const\n   {\n      return region(cx - (width / 2), cy - (height / 2),\n                    width, height,\n                    dest_image);\n   }\n\n   inline bool set_region(const unsigned int&  x     
,\n                          const unsigned int&  y     ,\n                          const unsigned int&  width ,\n                          const unsigned int&  height,\n                          const unsigned char& value )\n   {\n      if ((x + width ) > width_ ) { return false; }\n      if ((y + height) > height_) { return false; }\n\n      for (unsigned int r = 0; r < height; ++r)\n      {\n         unsigned char* itr     = row(r + y) + x * bytes_per_pixel_;\n         unsigned char* itr_end = itr + (width * bytes_per_pixel_);\n\n         std::fill(itr, itr_end, value);\n      }\n\n      return true;\n   }\n\n   inline bool set_region(const unsigned int&  x     ,\n                          const unsigned int&  y     ,\n                          const unsigned int&  width ,\n                          const unsigned int&  height,\n                          const color_plane    color ,\n                          const unsigned char& value )\n   {\n      if ((x + width ) > width_ ) { return false; }\n      if ((y + height) > height_) { return false; }\n\n      const unsigned int color_plane_offset = offset(color);\n\n      for (unsigned int r = 0; r < height; ++r)\n      {\n         unsigned char* itr     = row(r + y) + x * bytes_per_pixel_ + color_plane_offset;\n         unsigned char* itr_end = itr + (width * bytes_per_pixel_);\n\n         while (itr != itr_end)\n         {\n            *itr  = value;\n             itr += bytes_per_pixel_;\n         }\n      }\n\n      return true;\n   }\n\n   inline bool set_region(const unsigned int&  x     ,\n                          const unsigned int&  y     ,\n                          const unsigned int&  width ,\n                          const unsigned int&  height,\n                          const unsigned char& red   ,\n                          const unsigned char& green ,\n                          const unsigned char& blue  )\n   {\n      if ((x + width ) > width_ ) { return false; }\n      if ((y + height) > 
height_) { return false; }\n\n      for (unsigned int r = 0; r < height; ++r)\n      {\n         unsigned char* itr     = row(r + y) + x * bytes_per_pixel_;\n         unsigned char* itr_end = itr + (width * bytes_per_pixel_);\n\n         while (itr != itr_end)\n         {\n            *(itr++) =  blue;\n            *(itr++) = green;\n            *(itr++) =   red;\n         }\n      }\n\n      return true;\n   }\n\n   void reflective_image(bitmap_image& image, const bool include_diagnols = false)\n   {\n      image.setwidth_height(3 * width_, 3 * height_, true);\n\n      image.copy_from(*this, width_, height_);\n\n      vertical_flip();\n\n      image.copy_from(*this, width_,           0);\n      image.copy_from(*this, width_, 2 * height_);\n\n      vertical_flip();\n      horizontal_flip();\n\n      image.copy_from(*this,          0, height_);\n      image.copy_from(*this, 2 * width_, height_);\n\n      horizontal_flip();\n\n      if (include_diagnols)\n      {\n         bitmap_image tile = *this;\n\n         tile.vertical_flip();\n         tile.horizontal_flip();\n\n         image.copy_from(tile,          0,           0);\n         image.copy_from(tile, 2 * width_,           0);\n         image.copy_from(tile, 2 * width_, 2 * height_);\n         image.copy_from(tile, 0         , 2 * height_);\n      }\n   }\n\n   inline unsigned int width() const\n   {\n      return width_;\n   }\n\n   inline unsigned int height() const\n   {\n      return height_;\n   }\n\n   inline unsigned int bytes_per_pixel() const\n   {\n      return bytes_per_pixel_;\n   }\n\n   inline unsigned int pixel_count() const\n   {\n      return width_ *  height_;\n   }\n\n   inline void setwidth_height(const unsigned int width,\n                               const unsigned int height,\n                               const bool clear = false)\n   {\n      data_.clear();\n      width_  = width;\n      height_ = height;\n\n      create_bitmap();\n\n      if (clear)\n      {\n         
std::fill(data_.begin(), data_.end(), static_cast<unsigned char>(0x00));\n      }\n   }\n\n   void save_image(const std::string& file_name) const\n   {\n      std::ofstream stream(file_name.c_str(),std::ios::binary);\n\n      if (!stream)\n      {\n         std::cerr << \"bitmap_image::save_image(): Error - Could not open file \"  << file_name << \" for writing!\" << std::endl;\n         return;\n      }\n\n      bitmap_information_header bih;\n\n      bih.width            = width_;\n      bih.height           = height_;\n      bih.bit_count        = static_cast<unsigned short>(bytes_per_pixel_ << 3);\n      bih.clr_important    = 0;\n      bih.clr_used         = 0;\n      bih.compression      = 0;\n      bih.planes           = 1;\n      bih.size             = bih.struct_size();\n      bih.x_pels_per_meter = 0;\n      bih.y_pels_per_meter = 0;\n      bih.size_image       = (((bih.width * bytes_per_pixel_) + 3) & 0x0000FFFC) * bih.height;\n\n      bitmap_file_header bfh;\n\n      bfh.type             = 19778;\n      bfh.size             = bfh.struct_size() + bih.struct_size() + bih.size_image;\n      bfh.reserved1        = 0;\n      bfh.reserved2        = 0;\n      bfh.off_bits         = bih.struct_size() + bfh.struct_size();\n\n      write_bfh(stream,bfh);\n      write_bih(stream,bih);\n\n      unsigned int padding = (4 - ((3 * width_) % 4)) % 4;\n      char padding_data[4] = { 0x00, 0x00, 0x00, 0x00 };\n\n      for (unsigned int i = 0; i < height_; ++i)\n      {\n         const unsigned char* data_ptr = &data_[(row_increment_ * (height_ - i - 1))];\n\n         stream.write(reinterpret_cast<const char*>(data_ptr), sizeof(unsigned char) * bytes_per_pixel_ * width_);\n         stream.write(padding_data,padding);\n      }\n\n      stream.close();\n   }\n\n   inline void set_all_ith_bits_low(const unsigned int bitr_index)\n   {\n      unsigned char mask = static_cast<unsigned char>(~(1 << bitr_index));\n\n      for (unsigned char* itr = data(); itr != end(); ++itr)\n   
   {\n         *itr &= mask;\n      }\n   }\n\n   inline void set_all_ith_bits_high(const unsigned int bitr_index)\n   {\n      unsigned char mask = static_cast<unsigned char>(1 << bitr_index);\n\n      for (unsigned char* itr = data(); itr != end(); ++itr)\n      {\n         *itr |= mask;\n      }\n   }\n\n   inline void set_all_ith_channels(const unsigned int& channel, const unsigned char& value)\n   {\n      for (unsigned char* itr = (data() + channel); itr < end(); itr += bytes_per_pixel_)\n      {\n         *itr = value;\n      }\n   }\n\n   inline void set_channel(const color_plane color,const unsigned char& value)\n   {\n      for (unsigned char* itr = (data() + offset(color)); itr < end(); itr += bytes_per_pixel_)\n      {\n         *itr = value;\n      }\n   }\n\n   inline void ror_channel(const color_plane color, const unsigned int& ror)\n   {\n      for (unsigned char* itr = (data() + offset(color)); itr < end(); itr += bytes_per_pixel_)\n      {\n         *itr = static_cast<unsigned char>(((*itr) >> ror) | ((*itr) << (8 - ror)));\n      }\n   }\n\n   inline void set_all_channels(const unsigned char& value)\n   {\n      for (unsigned char* itr = data(); itr < end(); )\n      {\n         *(itr++) = value;\n      }\n   }\n\n   inline void set_all_channels(const unsigned char& r_value,\n                                const unsigned char& g_value,\n                                const unsigned char& b_value)\n   {\n      for (unsigned char* itr = (data() + 0); itr < end(); itr += bytes_per_pixel_)\n      {\n         *(itr + 0) = b_value;\n         *(itr + 1) = g_value;\n         *(itr + 2) = r_value;\n      }\n   }\n\n   inline void invert_color_planes()\n   {\n      for (unsigned char* itr = data(); itr < end(); *itr = ~(*itr), ++itr);\n   }\n\n   inline void add_to_color_plane(const color_plane color, const unsigned char& value)\n   {\n      for (unsigned char* itr = (data() + offset(color)); itr < end(); itr += bytes_per_pixel_)\n      {\n         
(*itr) += value;\n      }\n   }\n\n   inline void convert_to_grayscale()\n   {\n      double r_scaler = 0.299;\n      double g_scaler = 0.587;\n      double b_scaler = 0.114;\n\n      if (rgb_mode == channel_mode_)\n      {\n         std::swap(r_scaler, b_scaler);\n      }\n\n      for (unsigned char* itr = data(); itr < end(); )\n      {\n         unsigned char gray_value = static_cast<unsigned char>\n                       (\n                         (r_scaler * (*(itr + 2))) +\n                         (g_scaler * (*(itr + 1))) +\n                         (b_scaler * (*(itr + 0)))\n                       );\n\n         *(itr++) = gray_value;\n         *(itr++) = gray_value;\n         *(itr++) = gray_value;\n      }\n   }\n\n   inline const unsigned char* data() const\n   {\n      return data_.data();\n   }\n\n   inline unsigned char* data()\n   {\n      return const_cast<unsigned char*>(data_.data());\n   }\n\n   inline void bgr_to_rgb()\n   {\n      if ((bgr_mode == channel_mode_) && (3 == bytes_per_pixel_))\n      {\n         reverse_channels();\n         channel_mode_ = rgb_mode;\n      }\n   }\n\n   inline void rgb_to_bgr()\n   {\n      if ((rgb_mode == channel_mode_) && (3 == bytes_per_pixel_))\n      {\n         reverse_channels();\n         channel_mode_ = bgr_mode;\n      }\n   }\n\n   inline void reverse()\n   {\n      unsigned char* itr1 = data();\n      unsigned char* itr2 = end() - bytes_per_pixel_;\n\n      while (itr1 < itr2)\n      {\n         for (std::size_t i = 0; i < bytes_per_pixel_; ++i)\n         {\n            unsigned char* citr1 = itr1 + i;\n            unsigned char* citr2 = itr2 + i;\n\n            std::swap(*citr1,*citr2);\n         }\n\n         itr1 += bytes_per_pixel_;\n         itr2 -= bytes_per_pixel_;\n      }\n   }\n\n   inline void horizontal_flip()\n   {\n      for (unsigned int y = 0; y < height_; ++y)\n      {\n         unsigned char* itr1 = row(y);\n         unsigned char* itr2 = itr1 + row_increment_ - 
bytes_per_pixel_;\n\n         while (itr1 < itr2)\n         {\n            for (unsigned int i = 0; i < bytes_per_pixel_; ++i)\n            {\n               unsigned char* p1 = (itr1 + i);\n               unsigned char* p2 = (itr2 + i);\n\n               std::swap(*p1,*p2);\n            }\n\n            itr1 += bytes_per_pixel_;\n            itr2 -= bytes_per_pixel_;\n         }\n      }\n   }\n\n   inline void vertical_flip()\n   {\n      for (unsigned int y = 0; y < (height_ / 2); ++y)\n      {\n         unsigned char* itr1 = row(y);\n         unsigned char* itr2 = row(height_ - y - 1);\n\n         for (std::size_t x = 0; x < row_increment_; ++x)\n         {\n            std::swap(*(itr1 + x),*(itr2 + x));\n         }\n      }\n   }\n\n   inline void export_color_plane(const color_plane color, unsigned char* image)\n   {\n      for (unsigned char* itr = (data() + offset(color)); itr < end(); ++image, itr += bytes_per_pixel_)\n      {\n         (*image) = (*itr);\n      }\n   }\n\n   inline void export_color_plane(const color_plane color, bitmap_image& image)\n   {\n      if (\n           (width_  != image.width_ ) ||\n           (height_ != image.height_)\n         )\n      {\n         image.setwidth_height(width_,height_);\n      }\n\n      image.clear();\n\n      unsigned char* itr1     = (data() + offset(color));\n      unsigned char* itr1_end = end();\n      unsigned char* itr2     = (image.data() + offset(color));\n\n      while (itr1 < itr1_end)\n      {\n         (*itr2) = (*itr1);\n\n         itr1 += bytes_per_pixel_;\n         itr2 += bytes_per_pixel_;\n      }\n   }\n\n   inline void export_response_image(const color_plane color, double* response_image)\n   {\n      double* resp_itr = response_image;\n\n      for (unsigned char* itr = (data() + offset(color)); itr < end(); ++response_image, itr += bytes_per_pixel_)\n      {\n         *(resp_itr++) = (1.0 * (*itr)) / 256.0;\n      }\n   }\n\n   inline void export_gray_scale_response_image(double* 
response_image) const\n   {\n      double* resp_itr = response_image;\n\n      for (const unsigned char* itr = data(); itr < end(); itr += bytes_per_pixel_)\n      {\n         unsigned char gray_value = static_cast<unsigned char>\n                       (\n                         (0.299 * (*(itr + 2))) +\n                         (0.587 * (*(itr + 1))) +\n                         (0.114 * (*(itr + 0)))\n                       );\n\n         *(resp_itr++) = (1.0 * gray_value) / 256.0;\n      }\n   }\n\n   inline void export_rgb(double* red, double* green, double* blue) const\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (const unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         (*blue ) = (1.0 * (*(itr++))) / 256.0;\n         (*green) = (1.0 * (*(itr++))) / 256.0;\n         (*red  ) = (1.0 * (*(itr++))) / 256.0;\n      }\n   }\n\n   inline void export_rgb(float* red, float* green, float* blue) const\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (const unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         (*blue ) = (1.0f * (*(itr++))) / 256.0f;\n         (*green) = (1.0f * (*(itr++))) / 256.0f;\n         (*red  ) = (1.0f * (*(itr++))) / 256.0f;\n      }\n   }\n\n   inline void export_rgb(unsigned char* red, unsigned char* green, unsigned char* blue) const\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (const unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         (*blue ) = *(itr++);\n         (*green) = *(itr++);\n         (*red  ) = *(itr++);\n      }\n   }\n\n   inline void export_ycbcr(double* y, double* cb, double* cr) const\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (const unsigned char* itr = data(); itr < end(); ++y, ++cb, ++cr)\n      {\n         const double blue  = (1.0 * (*(itr++)));\n         const double green = (1.0 * (*(itr++)));\n         
const double red   = (1.0 * (*(itr++)));\n\n         ( *y) = clamp<double>( 16.0 + (1.0/256.0) * (  65.738 * red + 129.057 * green +  25.064 * blue),1.0,254);\n         (*cb) = clamp<double>(128.0 + (1.0/256.0) * (- 37.945 * red -  74.494 * green + 112.439 * blue),1.0,254);\n         (*cr) = clamp<double>(128.0 + (1.0/256.0) * ( 112.439 * red -  94.154 * green -  18.285 * blue),1.0,254);\n      }\n   }\n\n   inline void export_rgb_normal(double* red, double* green, double* blue) const\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (const unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         (*blue ) = (1.0 * (*(itr++)));\n         (*green) = (1.0 * (*(itr++)));\n         (*red  ) = (1.0 * (*(itr++)));\n      }\n   }\n\n   inline void export_rgb_normal(float* red, float* green, float* blue) const\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (const unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         (*blue ) = (1.0f * (*(itr++)));\n         (*green) = (1.0f * (*(itr++)));\n         (*red  ) = (1.0f * (*(itr++)));\n      }\n   }\n\n   inline void import_rgb(double* red, double* green, double* blue)\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         *(itr++) = static_cast<unsigned char>(256.0 * (*blue ));\n         *(itr++) = static_cast<unsigned char>(256.0 * (*green));\n         *(itr++) = static_cast<unsigned char>(256.0 * (*red  ));\n      }\n   }\n\n   inline void import_rgb(float* red, float* green, float* blue)\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         *(itr++) = static_cast<unsigned char>(256.0f * (*blue ));\n         *(itr++) = static_cast<unsigned char>(256.0f * (*green));\n         *(itr++) = static_cast<unsigned 
char>(256.0f * (*red  ));\n      }\n   }\n\n   inline void import_rgb(unsigned char* red, unsigned char* green, unsigned char* blue)\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         *(itr++) = (*blue );\n         *(itr++) = (*green);\n         *(itr++) = (*red  );\n      }\n   }\n\n   inline void import_ycbcr(double* y, double* cb, double* cr)\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); ++y, ++cb, ++cr)\n      {\n         double y_  =  (*y);\n         double cb_ = (*cb);\n         double cr_ = (*cr);\n\n         *(itr++) = static_cast<unsigned char>(clamp((298.082 * y_ + 516.412 * cb_                 ) / 256.0 - 276.836,0.0,255.0));\n         *(itr++) = static_cast<unsigned char>(clamp((298.082 * y_ - 100.291 * cb_ - 208.120 * cr_ ) / 256.0 + 135.576,0.0,255.0));\n         *(itr++) = static_cast<unsigned char>(clamp((298.082 * y_                 + 408.583 * cr_ ) / 256.0 - 222.921,0.0,255.0));\n      }\n   }\n\n   inline void import_gray_scale_clamped(double* gray)\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); ++gray)\n      {\n         unsigned char c = static_cast<unsigned char>(clamp<double>(256.0 * (*gray),0.0,255.0));\n\n         *(itr + 0) = c;\n         *(itr + 1) = c;\n         *(itr + 2) = c;\n\n         itr += 3;\n      }\n   }\n\n   inline void import_rgb_clamped(double* red, double* green, double* blue)\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         *(itr++) = static_cast<unsigned char>(clamp<double>(256.0 * (*blue ),0.0,255.0));\n         *(itr++) = static_cast<unsigned char>(clamp<double>(256.0 * (*green),0.0,255.0));\n         *(itr++) = static_cast<unsigned char>(clamp<double>(256.0 * 
(*red  ),0.0,255.0));\n      }\n   }\n\n   inline void import_rgb_clamped(float* red, float* green, float* blue)\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         *(itr++) = static_cast<unsigned char>(clamp<double>(256.0f * (*blue ),0.0,255.0));\n         *(itr++) = static_cast<unsigned char>(clamp<double>(256.0f * (*green),0.0,255.0));\n         *(itr++) = static_cast<unsigned char>(clamp<double>(256.0f * (*red  ),0.0,255.0));\n      }\n   }\n\n   inline void import_rgb_normal(double* red, double* green, double* blue)\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         *(itr++) = static_cast<unsigned char>(*blue );\n         *(itr++) = static_cast<unsigned char>(*green);\n         *(itr++) = static_cast<unsigned char>(*red  );\n      }\n   }\n\n   inline void import_rgb_normal(float* red, float* green, float* blue)\n   {\n      if (bgr_mode != channel_mode_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); ++red, ++green, ++blue)\n      {\n         *(itr++) = static_cast<unsigned char>(*blue );\n         *(itr++) = static_cast<unsigned char>(*green);\n         *(itr++) = static_cast<unsigned char>(*red  );\n      }\n   }\n\n   inline void subsample(bitmap_image& dest) const\n   {\n      /*\n         Half sub-sample of original image.\n      */\n      unsigned int w = 0;\n      unsigned int h = 0;\n\n      bool odd_width = false;\n      bool odd_height = false;\n\n      if (0 == (width_ % 2))\n         w = width_ / 2;\n      else\n      {\n         w = 1 + (width_ / 2);\n         odd_width = true;\n      }\n\n      if (0 == (height_ % 2))\n         h = height_ / 2;\n      else\n      {\n         h = 1 + (height_ / 2);\n         odd_height = true;\n      }\n\n      unsigned int horizontal_upper = (odd_width)  ? 
(w - 1) : w;\n      unsigned int vertical_upper   = (odd_height) ? (h - 1) : h;\n\n      dest.setwidth_height(w,h);\n      dest.clear();\n\n            unsigned char* s_itr[3];\n      const unsigned char*  itr1[3];\n      const unsigned char*  itr2[3];\n\n      s_itr[0] = dest.data() + 0;\n      s_itr[1] = dest.data() + 1;\n      s_itr[2] = dest.data() + 2;\n\n      itr1[0] = data() + 0;\n      itr1[1] = data() + 1;\n      itr1[2] = data() + 2;\n\n      itr2[0] = data() + row_increment_ + 0;\n      itr2[1] = data() + row_increment_ + 1;\n      itr2[2] = data() + row_increment_ + 2;\n\n      unsigned int total = 0;\n\n      for (unsigned int j = 0; j < vertical_upper; ++j)\n      {\n         for (unsigned int i = 0; i < horizontal_upper; ++i)\n         {\n            for (unsigned int k = 0; k < bytes_per_pixel_; s_itr[k] += bytes_per_pixel_, ++k)\n            {\n               total = 0;\n               total += *(itr1[k]);\n               total += *(itr1[k]);\n               total += *(itr2[k]);\n               total += *(itr2[k]);\n\n               itr1[k] += bytes_per_pixel_;\n               itr1[k] += bytes_per_pixel_;\n               itr2[k] += bytes_per_pixel_;\n               itr2[k] += bytes_per_pixel_;\n\n               *(s_itr[k]) = static_cast<unsigned char>(total >> 2);\n            }\n         }\n\n         if (odd_width)\n         {\n            for (unsigned int k = 0; k < bytes_per_pixel_; s_itr[k] += bytes_per_pixel_, ++k)\n            {\n               total = 0;\n               total += *(itr1[k]);\n               total += *(itr2[k]);\n\n               itr1[k] += bytes_per_pixel_;\n               itr2[k] += bytes_per_pixel_;\n\n               *(s_itr[k]) = static_cast<unsigned char>(total >> 1);\n            }\n         }\n\n         for (unsigned int k = 0; k < bytes_per_pixel_; ++k)\n         {\n            itr1[k] += row_increment_;\n         }\n\n         if (j != (vertical_upper - 1))\n         {\n            for (unsigned int k = 0; k < 
bytes_per_pixel_; ++k)\n            {\n               itr2[k] += row_increment_;\n            }\n         }\n      }\n\n      if (odd_height)\n      {\n         for (unsigned int i = 0; i < horizontal_upper; ++i)\n         {\n            for (unsigned int k = 0; k < bytes_per_pixel_; s_itr[k] += bytes_per_pixel_, ++k)\n            {\n               total = 0;\n               total += *(itr1[k]);\n               total += *(itr2[k]);\n\n               itr1[k] += bytes_per_pixel_;\n               itr2[k] += bytes_per_pixel_;\n\n               *(s_itr[k]) = static_cast<unsigned char>(total >> 1);\n            }\n         }\n\n         if (odd_width)\n         {\n            for (unsigned int k = 0; k < bytes_per_pixel_; ++k)\n            {\n               (*(s_itr[k])) = *(itr1[k]);\n            }\n         }\n      }\n   }\n\n   inline void upsample(bitmap_image& dest) const\n   {\n      /*\n         2x up-sample of original image.\n      */\n\n      dest.setwidth_height(2 * width_ ,2 * height_);\n      dest.clear();\n\n      const unsigned char* s_itr[3];\n            unsigned char*  itr1[3];\n            unsigned char*  itr2[3];\n\n      s_itr[0] = data() + 0;\n      s_itr[1] = data() + 1;\n      s_itr[2] = data() + 2;\n\n      itr1[0] = dest.data() + 0;\n      itr1[1] = dest.data() + 1;\n      itr1[2] = dest.data() + 2;\n\n      itr2[0] = dest.data() + dest.row_increment_ + 0;\n      itr2[1] = dest.data() + dest.row_increment_ + 1;\n      itr2[2] = dest.data() + dest.row_increment_ + 2;\n\n      for (unsigned int j = 0; j < height_; ++j)\n      {\n         for (unsigned int i = 0; i < width_; ++i)\n         {\n            for (unsigned int k = 0; k < bytes_per_pixel_; s_itr[k] += bytes_per_pixel_, ++k)\n            {\n               *(itr1[k]) = *(s_itr[k]); itr1[k] += bytes_per_pixel_;\n               *(itr1[k]) = *(s_itr[k]); itr1[k] += bytes_per_pixel_;\n\n               *(itr2[k]) = *(s_itr[k]); itr2[k] += bytes_per_pixel_;\n               *(itr2[k]) = 
*(s_itr[k]); itr2[k] += bytes_per_pixel_;\n            }\n         }\n\n         for (unsigned int k = 0; k < bytes_per_pixel_; ++k)\n         {\n            itr1[k] += dest.row_increment_;\n            itr2[k] += dest.row_increment_;\n         }\n      }\n   }\n\n   inline void alpha_blend(const double& alpha, const bitmap_image& image)\n   {\n      if (\n           (image.width_  != width_ ) ||\n           (image.height_ != height_)\n         )\n      {\n         return;\n      }\n\n      if ((alpha < 0.0) || (alpha > 1.0))\n      {\n         return;\n      }\n\n      unsigned char* itr1           = data();\n      const unsigned char* itr1_end = end();\n      const unsigned char* itr2     = image.data();\n\n      double alpha_compliment = 1.0 - alpha;\n\n      while (itr1 != itr1_end)\n      {\n         *(itr1) = static_cast<unsigned char>((alpha * (*itr2)) + (alpha_compliment * (*itr1)));\n         ++itr1;\n         ++itr2;\n      }\n   }\n\n   inline double psnr(const bitmap_image& image)\n   {\n      if (\n           (image.width_  != width_ ) ||\n           (image.height_ != height_)\n         )\n      {\n         return 0.0;\n      }\n\n      const unsigned char* itr1 = data();\n      const unsigned char* itr2 = image.data();\n\n      double mse = 0.0;\n\n      while (itr1 != end())\n      {\n         const double v = (static_cast<double>(*itr1) - static_cast<double>(*itr2));\n\n         mse += v * v;\n         ++itr1;\n         ++itr2;\n      }\n\n      if (mse <= 0.0000001)\n      {\n         return 1000000.0;\n      }\n      else\n      {\n         mse /= (3.0 * width_ * height_);\n\n         return 20.0 * std::log10(255.0 / std::sqrt(mse));\n      }\n   }\n\n   inline double psnr(const unsigned int& x,\n                      const unsigned int& y,\n                      const bitmap_image& image)\n   {\n      if ((x + image.width() ) > width_ ) { return 0.0; }\n      if ((y + image.height()) > height_) { return 0.0; }\n\n      double mse = 0.0;\n\n      
const unsigned int height = image.height();\n      const unsigned int width  = image.width();\n\n      for (unsigned int r = 0; r < height; ++r)\n      {\n         const unsigned char* itr1     = row(r + y) + x * bytes_per_pixel_;\n         const unsigned char* itr1_end = itr1 + (width * bytes_per_pixel_);\n         const unsigned char* itr2     = image.row(r);\n\n         while (itr1 != itr1_end)\n         {\n            double v = (static_cast<double>(*itr1) - static_cast<double>(*itr2));\n            mse += v * v;\n            ++itr1;\n            ++itr2;\n         }\n      }\n\n      if (mse <= 0.0000001)\n      {\n         return 1000000.0;\n      }\n      else\n      {\n         mse /= (3.0 * image.width() * image.height());\n         return 20.0 * std::log10(255.0 / std::sqrt(mse));\n      }\n   }\n\n   inline void histogram(const color_plane color, double hist[256]) const\n   {\n      std::fill(hist, hist + 256, 0.0);\n\n      for (const unsigned char* itr = (data() + offset(color)); itr < end(); itr += bytes_per_pixel_)\n      {\n         ++hist[(*itr)];\n      }\n   }\n\n   inline void histogram_normalized(const color_plane color, double hist[256]) const\n   {\n      histogram(color,hist);\n\n      double* h_itr = hist;\n      const double* h_end = hist + 256;\n      const double pixel_count = static_cast<double>(width_ * height_);\n\n      while (h_end != h_itr)\n      {\n         *(h_itr++) /= pixel_count;\n      }\n   }\n\n   inline unsigned int offset(const color_plane color) const\n   {\n      switch (channel_mode_)\n      {\n         case rgb_mode : {\n                            switch (color)\n                            {\n                               case red_plane   : return 0;\n                               case green_plane : return 1;\n                               case blue_plane  : return 2;\n                               default          : return std::numeric_limits<unsigned int>::max();\n                            }\n                
         }\n\n         case bgr_mode : {\n                            switch (color)\n                            {\n                               case red_plane   : return 2;\n                               case green_plane : return 1;\n                               case blue_plane  : return 0;\n                               default          : return std::numeric_limits<unsigned int>::max();\n                            }\n                         }\n\n         default       : return std::numeric_limits<unsigned int>::max();\n      }\n   }\n\n   inline void incremental()\n   {\n      unsigned char current_color = 0;\n\n      for (unsigned char* itr = data(); itr < end();)\n      {\n         (*itr++) = (current_color);\n         (*itr++) = (current_color);\n         (*itr++) = (current_color);\n\n         ++current_color;\n      }\n   }\n\n   inline void reverse_channels()\n   {\n      if (3 != bytes_per_pixel_)\n         return;\n\n      for (unsigned char* itr = data(); itr < end(); itr += bytes_per_pixel_)\n      {\n         std::swap(*(itr + 0),*(itr + 2));\n      }\n   }\n\nprivate:\n\n   inline const unsigned char* end() const\n   {\n      return data_.data() + data_.size();\n   }\n\n   inline unsigned char* end()\n   {\n      return const_cast<unsigned char*>(data() + data_.size());\n   }\n\n   struct bitmap_file_header\n   {\n      unsigned short type;\n      unsigned int   size;\n      unsigned short reserved1;\n      unsigned short reserved2;\n      unsigned int   off_bits;\n\n      unsigned int struct_size() const\n      {\n         return sizeof(type     ) +\n                sizeof(size     ) +\n                sizeof(reserved1) +\n                sizeof(reserved2) +\n                sizeof(off_bits ) ;\n      }\n\n      void clear()\n      {\n         std::memset(this, 0x00, sizeof(bitmap_file_header));\n      }\n   };\n\n   struct bitmap_information_header\n   {\n      unsigned int   size;\n      unsigned int   width;\n      unsigned int   
height;\n      unsigned short planes;\n      unsigned short bit_count;\n      unsigned int   compression;\n      unsigned int   size_image;\n      unsigned int   x_pels_per_meter;\n      unsigned int   y_pels_per_meter;\n      unsigned int   clr_used;\n      unsigned int   clr_important;\n\n      unsigned int struct_size() const\n      {\n         return sizeof(size            ) +\n                sizeof(width           ) +\n                sizeof(height          ) +\n                sizeof(planes          ) +\n                sizeof(bit_count       ) +\n                sizeof(compression     ) +\n                sizeof(size_image      ) +\n                sizeof(x_pels_per_meter) +\n                sizeof(y_pels_per_meter) +\n                sizeof(clr_used        ) +\n                sizeof(clr_important   ) ;\n      }\n\n      void clear()\n      {\n         std::memset(this, 0x00, sizeof(bitmap_information_header));\n      }\n   };\n\n   inline bool big_endian() const\n   {\n      unsigned int v = 0x01;\n\n      return (1 != reinterpret_cast<char*>(&v)[0]);\n   }\n\n   inline unsigned short flip(const unsigned short& v) const\n   {\n      return ((v >> 8) | (v << 8));\n   }\n\n   inline unsigned int flip(const unsigned int& v) const\n   {\n      return (\n               ((v & 0xFF000000) >> 0x18) |\n               ((v & 0x000000FF) << 0x18) |\n               ((v & 0x00FF0000) >> 0x08) |\n               ((v & 0x0000FF00) << 0x08)\n             );\n   }\n\n   template <typename T>\n   inline void read_from_stream(std::ifstream& stream,T& t)\n   {\n      stream.read(reinterpret_cast<char*>(&t),sizeof(T));\n   }\n\n   template <typename T>\n   inline void write_to_stream(std::ofstream& stream,const T& t) const\n   {\n      stream.write(reinterpret_cast<const char*>(&t),sizeof(T));\n   }\n\n   inline void read_bfh(std::ifstream& stream, bitmap_file_header& bfh)\n   {\n      read_from_stream(stream,bfh.type     );\n      read_from_stream(stream,bfh.size     );\n      
read_from_stream(stream,bfh.reserved1);\n      read_from_stream(stream,bfh.reserved2);\n      read_from_stream(stream,bfh.off_bits );\n\n      if (big_endian())\n      {\n         bfh.type      = flip(bfh.type     );\n         bfh.size      = flip(bfh.size     );\n         bfh.reserved1 = flip(bfh.reserved1);\n         bfh.reserved2 = flip(bfh.reserved2);\n         bfh.off_bits  = flip(bfh.off_bits );\n      }\n   }\n\n   inline void write_bfh(std::ofstream& stream, const bitmap_file_header& bfh) const\n   {\n      if (big_endian())\n      {\n         write_to_stream(stream,flip(bfh.type     ));\n         write_to_stream(stream,flip(bfh.size     ));\n         write_to_stream(stream,flip(bfh.reserved1));\n         write_to_stream(stream,flip(bfh.reserved2));\n         write_to_stream(stream,flip(bfh.off_bits ));\n      }\n      else\n      {\n         write_to_stream(stream,bfh.type     );\n         write_to_stream(stream,bfh.size     );\n         write_to_stream(stream,bfh.reserved1);\n         write_to_stream(stream,bfh.reserved2);\n         write_to_stream(stream,bfh.off_bits );\n      }\n   }\n\n   inline void read_bih(std::ifstream& stream,bitmap_information_header& bih)\n   {\n      read_from_stream(stream,bih.size            );\n      read_from_stream(stream,bih.width           );\n      read_from_stream(stream,bih.height          );\n      read_from_stream(stream,bih.planes          );\n      read_from_stream(stream,bih.bit_count       );\n      read_from_stream(stream,bih.compression     );\n      read_from_stream(stream,bih.size_image      );\n      read_from_stream(stream,bih.x_pels_per_meter);\n      read_from_stream(stream,bih.y_pels_per_meter);\n      read_from_stream(stream,bih.clr_used        );\n      read_from_stream(stream,bih.clr_important   );\n\n      if (big_endian())\n      {\n         bih.size          = flip(bih.size               );\n         bih.width         = flip(bih.width              );\n         bih.height        = flip(bih.height   
          );\n         bih.planes        = flip(bih.planes             );\n         bih.bit_count     = flip(bih.bit_count          );\n         bih.compression   = flip(bih.compression        );\n         bih.size_image    = flip(bih.size_image         );\n         bih.x_pels_per_meter = flip(bih.x_pels_per_meter);\n         bih.y_pels_per_meter = flip(bih.y_pels_per_meter);\n         bih.clr_used      = flip(bih.clr_used           );\n         bih.clr_important = flip(bih.clr_important      );\n      }\n   }\n\n   inline void write_bih(std::ofstream& stream, const bitmap_information_header& bih) const\n   {\n      if (big_endian())\n      {\n         write_to_stream(stream,flip(bih.size            ));\n         write_to_stream(stream,flip(bih.width           ));\n         write_to_stream(stream,flip(bih.height          ));\n         write_to_stream(stream,flip(bih.planes          ));\n         write_to_stream(stream,flip(bih.bit_count       ));\n         write_to_stream(stream,flip(bih.compression     ));\n         write_to_stream(stream,flip(bih.size_image      ));\n         write_to_stream(stream,flip(bih.x_pels_per_meter));\n         write_to_stream(stream,flip(bih.y_pels_per_meter));\n         write_to_stream(stream,flip(bih.clr_used        ));\n         write_to_stream(stream,flip(bih.clr_important   ));\n      }\n      else\n      {\n         write_to_stream(stream,bih.size            );\n         write_to_stream(stream,bih.width           );\n         write_to_stream(stream,bih.height          );\n         write_to_stream(stream,bih.planes          );\n         write_to_stream(stream,bih.bit_count       );\n         write_to_stream(stream,bih.compression     );\n         write_to_stream(stream,bih.size_image      );\n         write_to_stream(stream,bih.x_pels_per_meter);\n         write_to_stream(stream,bih.y_pels_per_meter);\n         write_to_stream(stream,bih.clr_used        );\n         write_to_stream(stream,bih.clr_important   );\n      }\n   }\n\n   
inline std::size_t file_size(const std::string& file_name) const\n   {\n      std::ifstream file(file_name.c_str(),std::ios::in | std::ios::binary);\n      if (!file) return 0;\n      file.seekg (0, std::ios::end);\n      return static_cast<std::size_t>(file.tellg());\n   }\n\n   void create_bitmap()\n   {\n      row_increment_ = width_ * bytes_per_pixel_;\n      data_.resize(height_ * row_increment_);\n   }\n\n   void load_bitmap()\n   {\n      std::ifstream stream(file_name_.c_str(),std::ios::binary);\n\n      if (!stream)\n      {\n         std::cerr << \"bitmap_image::load_bitmap() ERROR: bitmap_image - file \" << file_name_ << \" not found!\" << std::endl;\n         return;\n      }\n\n      width_  = 0;\n      height_ = 0;\n\n      bitmap_file_header bfh;\n      bitmap_information_header bih;\n\n      bfh.clear();\n      bih.clear();\n\n      read_bfh(stream,bfh);\n      read_bih(stream,bih);\n\n      if (bfh.type != 19778)\n      {\n         bfh.clear();\n         bih.clear();\n\n         stream.close();\n\n         std::cerr << \"bitmap_image::load_bitmap() ERROR: bitmap_image - Invalid type value \" << bfh.type << \" expected 19778.\" << std::endl;\n         return;\n      }\n\n      if (bih.bit_count != 24)\n      {\n         bfh.clear();\n         bih.clear();\n\n         stream.close();\n\n         std::cerr << \"bitmap_image::load_bitmap() ERROR: bitmap_image - Invalid bit depth \" << bih.bit_count << \" expected 24.\" << std::endl;\n\n         return;\n      }\n\n      if (bih.size != bih.struct_size())\n      {\n         bfh.clear();\n         bih.clear();\n\n         stream.close();\n\n         std::cerr << \"bitmap_image::load_bitmap() ERROR: bitmap_image - Invalid BIH size \" << bih.size << \" expected \" << bih.struct_size() << std::endl;\n\n         return;\n      }\n\n      width_  = bih.width;\n      height_ = bih.height;\n\n      bytes_per_pixel_ = bih.bit_count >> 3;\n\n      unsigned int padding = (4 - ((3 * width_) % 4)) % 4;\n      char 
padding_data[4] = {0,0,0,0};\n\n      std::size_t bitmap_file_size = file_size(file_name_);\n\n      std::size_t bitmap_logical_size = (height_ * width_ * bytes_per_pixel_) +\n                                        (height_ * padding)                   +\n                                         bih.struct_size()                    +\n                                         bfh.struct_size()                    ;\n\n      if (bitmap_file_size != bitmap_logical_size)\n      {\n         bfh.clear();\n         bih.clear();\n\n         stream.close();\n\n         std::cerr << \"bitmap_image::load_bitmap() ERROR: bitmap_image - Mismatch between logical and physical sizes of bitmap. \" <<\n                      \"Logical: \"  << bitmap_logical_size << \" \" <<\n                      \"Physical: \" << bitmap_file_size    << std::endl;\n\n         return;\n      }\n\n      create_bitmap();\n\n      for (unsigned int i = 0; i < height_; ++i)\n      {\n         unsigned char* data_ptr = row(height_ - i - 1); // read in inverted row order\n\n         stream.read(reinterpret_cast<char*>(data_ptr), sizeof(char) * bytes_per_pixel_ * width_);\n         stream.read(padding_data,padding);\n      }\n   }\n\n   template <typename T>\n   inline T clamp(const T& v, const T& lower_range, const T& upper_range) const\n   {\n      if (v < lower_range)\n         return lower_range;\n      else if (v >  upper_range)\n         return upper_range;\n      else\n         return v;\n   }\n\n   std::string  file_name_;\n   unsigned int width_;\n   unsigned int height_;\n   unsigned int row_increment_;\n   unsigned int bytes_per_pixel_;\n   channel_mode channel_mode_;\n   std::vector<unsigned char> data_;\n};\n\ntypedef bitmap_image::rgb_t rgb_t;\n\ninline bool operator==(const rgb_t& c0, const rgb_t& c1)\n{\n   return (c0.red   == c1  .red) &&\n          (c0.green == c1.green) &&\n          (c0.blue  == c1 .blue) ;\n}\n\ninline bool operator!=(const rgb_t& c0, const rgb_t& c1)\n{\n   return 
(c0.red   != c1  .red) ||\n          (c0.green != c1.green) ||\n          (c0.blue  != c1 .blue) ;\n}\n\ninline std::size_t hamming_distance(const rgb_t& c0, const rgb_t& c1)\n{\n   std::size_t result = 0;\n\n   if (c0.red   != c1  .red) ++result;\n   if (c0.green != c1.green) ++result;\n   if (c0.blue  != c1 .blue) ++result;\n\n   return result;\n}\n\ninline rgb_t make_colour(const unsigned int& red, const unsigned int& green, const unsigned int& blue)\n{\n   rgb_t result;\n\n   result.red   = static_cast<unsigned char>(red  );\n   result.green = static_cast<unsigned char>(green);\n   result.blue  = static_cast<unsigned char>(blue );\n\n   return result;\n}\n\ntemplate <typename OutputIterator>\ninline void generate_colours(const std::size_t& steps, const rgb_t c0, const rgb_t& c1, OutputIterator out)\n{\n   double dr = ((double)c1.red   -  (double)c0.red   ) / steps;\n   double dg = ((double)c1.green -  (double)c0.green ) / steps;\n   double db = ((double)c1.blue  -  (double)c0.blue  ) / steps;\n\n   for (std::size_t i = 0; i < steps; ++i)\n   {\n      rgb_t c;\n\n      c.red   = static_cast<unsigned char>(c0.red   + (i * dr));\n      c.green = static_cast<unsigned char>(c0.green + (i * dg));\n      c.blue  = static_cast<unsigned char>(c0.blue  + (i * db));\n\n      *(out++) = c;\n   }\n}\n\ntemplate <typename ResponseImage, typename Palette>\ninline std::size_t convert_rsp_to_image(const ResponseImage& resp_image, const Palette& palette, bitmap_image& image)\n{\n   if (\n        (resp_image.width () > image.width ()) ||\n        (resp_image.height() > image.height())\n      )\n      return 0;\n\n   for (std::size_t y = 0; y < resp_image.height(); ++y)\n   {\n      for (std::size_t x = 0; x < resp_image.width(); ++x)\n      {\n         const double v = resp_image(x,y);\n\n         unsigned int index = static_cast<unsigned int>((v < 0) ? 0 : v > (palette.size()) ? 
(palette.size() - 1) : v);\n\n         image.set_pixel(x,y,palette[index]);\n      }\n   }\n\n   return (resp_image.width() * resp_image.height());\n}\n\ninline void rgb_to_ycbcr(const unsigned int& length, double* red, double* green, double* blue,\n                                                     double* y,   double* cb,    double* cr)\n{\n   unsigned int i = 0;\n\n   while (i < length)\n   {\n      ( *y) =   16.0 + (  65.481 * (*red) +  128.553 * (*green) +  24.966 * (*blue));\n      (*cb) =  128.0 + ( -37.797 * (*red) +  -74.203 * (*green) + 112.000 * (*blue));\n      (*cr) =  128.0 + ( 112.000 * (*red) +  -93.786 * (*green) -  18.214 * (*blue));\n\n      ++i;\n      ++red; ++green; ++blue;\n      ++y;   ++cb;    ++cr;\n   }\n}\n\ninline void ycbcr_to_rgb(const unsigned int& length, double* y,   double* cb,    double* cr,\n                                                     double* red, double* green, double* blue)\n{\n   unsigned int i = 0;\n\n   while (i < length)\n   {\n      double y_  =  (*y) -  16.0;\n      double cb_ = (*cb) - 128.0;\n      double cr_ = (*cr) - 128.0;\n\n        (*red) = 0.000456621 * y_                    + 0.00625893 * cr_;\n      (*green) = 0.000456621 * y_ - 0.00153632 * cb_ - 0.00318811 * cr_;\n       (*blue) = 0.000456621 * y_                    + 0.00791071 * cb_;\n\n      ++i;\n      ++red; ++green; ++blue;\n      ++y;   ++cb;    ++cr;\n   }\n}\n\ninline void subsample(const unsigned int& width,\n                      const unsigned int& height,\n                      const double* source,\n                      unsigned int& w,\n                      unsigned int& h,\n                      double*& dest)\n{\n   /*  Single channel.  
*/\n\n   w = 0;\n   h = 0;\n\n   bool odd_width = false;\n   bool odd_height = false;\n\n   if (0 == (width % 2))\n      w = width / 2;\n   else\n   {\n      w = 1 + (width / 2);\n      odd_width = true;\n   }\n\n   if (0 == (height % 2))\n      h = height / 2;\n   else\n   {\n      h = 1 + (height / 2);\n      odd_height = true;\n   }\n\n   unsigned int horizontal_upper = (odd_width)  ? w - 1 : w;\n   unsigned int vertical_upper   = (odd_height) ? h - 1 : h;\n\n   dest = new double[w * h];\n\n         double* s_itr = dest;\n   const double* itr1  = source;\n   const double* itr2  = source + width;\n\n   for (unsigned int j = 0; j < vertical_upper; ++j)\n   {\n      for (unsigned int i = 0; i < horizontal_upper; ++i, ++s_itr)\n      {\n          (*s_itr)  = *(itr1++);\n          (*s_itr) += *(itr1++);\n          (*s_itr) += *(itr2++);\n          (*s_itr) += *(itr2++);\n          (*s_itr) /=  4.0;\n      }\n\n      if (odd_width)\n      {\n         (*(s_itr++)) = ((*itr1++) + (*itr2++)) / 2.0;\n      }\n\n      itr1 += width;\n\n      if (j != (vertical_upper -1))\n      {\n         itr2 += width;\n      }\n   }\n\n   if (odd_height)\n   {\n      for (unsigned int i = 0; i < horizontal_upper; ++i, ++s_itr)\n      {\n         (*s_itr) += (*(itr1++));\n         (*s_itr) += (*(itr1++));\n         (*s_itr) /= 2.0;\n      }\n\n      if (odd_width)\n      {\n         (*(s_itr++)) = (*itr1);\n      }\n   }\n}\n\ninline void upsample(const unsigned int& width,\n                     const unsigned int& height,\n                     const double* source,\n                     unsigned int& w,\n                     unsigned int& h,\n                     double*& dest)\n{\n   /* Single channel. 
*/\n\n   w = 2 * width;\n   h = 2 * height;\n\n   dest = new double[w * h];\n\n   const double* s_itr = source;\n         double* itr1  = dest;\n         double* itr2  = dest + w;\n\n   for (unsigned int j = 0; j < height; ++j)\n   {\n      for (unsigned int i = 0; i < width; ++i, ++s_itr)\n      {\n          *(itr1++) = (*s_itr);\n          *(itr1++) = (*s_itr);\n          *(itr2++) = (*s_itr);\n          *(itr2++) = (*s_itr);\n      }\n\n      itr1 += w;\n      itr2 += w;\n   }\n}\n\ninline void checkered_pattern(const unsigned int x_width,\n                              const unsigned int y_width,\n                              const unsigned char value,\n                              const bitmap_image::color_plane color,\n                              bitmap_image& image)\n{\n   if (\n        (x_width >= image.width ()) ||\n        (y_width >= image.height())\n      )\n   {\n      return;\n   }\n\n   bool setter_x = false;\n   bool setter_y = true;\n\n   const unsigned int color_plane_offset = image.offset(color);\n   const unsigned int height = image.height();\n   const unsigned int width  = image.width();\n\n   for (unsigned int y = 0; y < height; ++y)\n   {\n      if (0 == (y % y_width))\n      {\n         setter_y = !setter_y;\n      }\n\n      unsigned char* row = image.row(y) + color_plane_offset;\n\n      for (unsigned int x = 0; x < width; ++x, row += image.bytes_per_pixel())\n      {\n         if (0 == (x % x_width))\n         {\n            setter_x = !setter_x;\n         }\n\n         if (setter_x ^ setter_y)\n         {\n            *row = value;\n         }\n      }\n   }\n}\n\ninline void checkered_pattern(const unsigned int x_width,\n                              const unsigned int y_width,\n                              const unsigned char red,\n                              const unsigned char green,\n                              const unsigned char blue,\n                              bitmap_image& image)\n{\n   if (\n        (x_width >= 
image.width ()) ||\n        (y_width >= image.height())\n      )\n   {\n      return;\n   }\n\n   bool setter_x = false;\n   bool setter_y = true;\n\n   const unsigned int height = image.height();\n   const unsigned int width  = image.width();\n\n   for (unsigned int y = 0; y < height; ++y)\n   {\n      if (0 == (y % y_width))\n      {\n         setter_y = !setter_y;\n      }\n\n      unsigned char* row = image.row(y);\n\n      for (unsigned int x = 0; x < width; ++x, row += image.bytes_per_pixel())\n      {\n         if (0 == (x % x_width))\n         {\n            setter_x = !setter_x;\n         }\n\n         if (setter_x ^ setter_y)\n         {\n            *(row + 0) = blue;\n            *(row + 1) = green;\n            *(row + 2) = red;\n         }\n      }\n   }\n}\n\ninline void plasma(bitmap_image& image,\n                   const double& x,     const double& y,\n                   const double& width, const double& height,\n                   const double& c1,    const double& c2,\n                   const double& c3,    const double& c4,\n                   const double& roughness  = 3.0,\n                   const rgb_t   colormap[] = 0)\n{\n   // Note: c1,c2,c3,c4 -> [0.0,1.0]\n\n   const double half_width  = ( width / 2.0);\n   const double half_height = (height / 2.0);\n\n   if ((width >= 1.0) || (height >= 1.0))\n   {\n      const double corner1 = (c1 + c2) / 2.0;\n      const double corner2 = (c2 + c3) / 2.0;\n      const double corner3 = (c3 + c4) / 2.0;\n      const double corner4 = (c4 + c1) / 2.0;\n            double center  = (c1 + c2 + c3 + c4) / 4.0 +\n                             ((1.0 * ::rand() /(1.0 * RAND_MAX))  - 0.5) * // should use a better rng\n                             ((1.0 * half_width + half_height) / (image.width() + image.height()) * roughness);\n\n      center = std::min<double>(std::max<double>(0.0,center),1.0);\n\n      plasma(image, x,                            y, half_width, half_height,      c1, corner1,  center, 
corner4,roughness,colormap);\n      plasma(image, x + half_width,               y, half_width, half_height, corner1,      c2, corner2,  center,roughness,colormap);\n      plasma(image, x + half_width, y + half_height, half_width, half_height,  center, corner2,      c3, corner3,roughness,colormap);\n      plasma(image, x,              y + half_height, half_width, half_height, corner4,  center, corner3,      c4,roughness,colormap);\n   }\n   else\n   {\n      rgb_t color = colormap[static_cast<unsigned int>(1000.0 * ((c1 + c2 + c3 + c4) / 4.0)) % 1000];\n\n      image.set_pixel(static_cast<unsigned int>(x),static_cast<unsigned int>(y),color);\n   }\n}\n\ninline void plasma(bitmap_image& image,\n                   const double& c1, const double& c2,\n                   const double& c3, const double& c4,\n                   const double& roughness  = 3.0,\n                   const rgb_t   colormap[] = 0)\n{\n   plasma\n   (\n     image, 0, 0, image.width(), image.height(),\n     c1, c2, c3, c4,\n     roughness, colormap\n   );\n}\n\ninline double psnr_region(const unsigned int& x,      const unsigned int& y,\n                          const unsigned int& width,  const unsigned int& height,\n                          const bitmap_image& image1, const bitmap_image& image2)\n{\n   if (\n        (image1.width()  != image2.width ()) ||\n        (image1.height() != image2.height())\n      )\n   {\n      return 0.0;\n   }\n\n   if ((x + width ) > image1.width() ) { return 0.0; }\n   if ((y + height) > image1.height()) { return 0.0; }\n\n   double mse = 0.0;\n\n   for (unsigned int r = 0; r < height; ++r)\n   {\n      const unsigned char* itr1     = image1.row(r + y) + x * image1.bytes_per_pixel();\n      const unsigned char* itr1_end = itr1 + (width * image1.bytes_per_pixel());\n      const unsigned char* itr2     = image2.row(r + y) + x * image2.bytes_per_pixel();\n\n      while (itr1 != itr1_end)\n      {\n         double v = (static_cast<double>(*itr1) - 
static_cast<double>(*itr2));\n         mse += v * v;\n         ++itr1;\n         ++itr2;\n      }\n   }\n\n   if (mse <= 0.0000001)\n   {\n      return 1000000.0;\n   }\n   else\n   {\n      mse /= (3.0 * width * height);\n      return 20.0 * std::log10(255.0 / std::sqrt(mse));\n   }\n}\n\ninline void hierarchical_psnr_r(const double& x,     const double& y,\n                                const double& width, const double& height,\n                                const bitmap_image& image1,\n                                      bitmap_image& image2,\n                                const double& threshold,\n                                const rgb_t colormap[])\n{\n   if ((width <= 4.0) || (height <= 4.0))\n   {\n      const double psnr = psnr_region\n                          (\n                            static_cast<unsigned int>(x),\n                            static_cast<unsigned int>(y),\n                            static_cast<unsigned int>(width),\n                            static_cast<unsigned int>(height),\n                            image1, image2\n                          );\n\n      if (psnr < threshold)\n      {\n         rgb_t c = colormap[static_cast<unsigned int>(1000.0 * (1.0 - (psnr / threshold)))];\n\n         image2.set_region\n                (\n                  static_cast<unsigned int>(x),\n                  static_cast<unsigned int>(y),\n                  static_cast<unsigned int>(width + 1),\n                  static_cast<unsigned int>(height + 1),\n                  c.red, c.green, c.blue\n                );\n      }\n   }\n   else\n   {\n      const double half_width  = ( width / 2.0);\n      const double half_height = (height / 2.0);\n\n      hierarchical_psnr_r(x             , y              , half_width, half_height, image1, image2, threshold, colormap);\n      hierarchical_psnr_r(x + half_width, y              , half_width, half_height, image1, image2, threshold, colormap);\n      hierarchical_psnr_r(x + half_width, y + 
half_height, half_width, half_height, image1, image2, threshold, colormap);\n      hierarchical_psnr_r(x             , y + half_height, half_width, half_height, image1, image2, threshold, colormap);\n   }\n}\n\ninline void hierarchical_psnr(bitmap_image& image1, bitmap_image& image2, const double threshold, const rgb_t colormap[])\n{\n   if (\n        (image1.width()  != image2.width ()) ||\n        (image1.height() != image2.height())\n      )\n   {\n      return;\n   }\n\n   const double psnr = psnr_region\n                       (\n                         0, 0, image1.width(), image1.height(),\n                         image1, image2\n                       );\n\n   if (psnr < threshold)\n   {\n      hierarchical_psnr_r\n      (\n        0, 0, image1.width(), image1.height(),\n        image1, image2,\n        threshold,\n        colormap\n      );\n   }\n}\n\nclass image_drawer\n{\npublic:\n\n   image_drawer(bitmap_image& image)\n   : image_(image),\n     pen_width_(1),\n     pen_color_red_  (0),\n     pen_color_green_(0),\n     pen_color_blue_ (0)\n   {}\n\n   void rectangle(int x1, int y1, int x2, int y2)\n   {\n      line_segment(x1, y1, x2, y1);\n      line_segment(x2, y1, x2, y2);\n      line_segment(x2, y2, x1, y2);\n      line_segment(x1, y2, x1, y1);\n   }\n\n   void triangle(int x1, int y1, int x2, int y2,int x3, int y3)\n   {\n      line_segment(x1, y1, x2, y2);\n      line_segment(x2, y2, x3, y3);\n      line_segment(x3, y3, x1, y1);\n   }\n\n   void quadix(int x1, int y1, int x2, int y2,int x3, int y3, int x4, int y4)\n   {\n      line_segment(x1, y1, x2, y2);\n      line_segment(x2, y2, x3, y3);\n      line_segment(x3, y3, x4, y4);\n      line_segment(x4, y4, x1, y1);\n   }\n\n   void line_segment(int x1, int y1, int x2, int y2)\n   {\n      int steep = 0;\n      int sx    = ((x2 - x1) > 0) ? 1 : -1;\n      int sy    = ((y2 - y1) > 0) ? 
1 : -1;\n      int dx    = abs(x2 - x1);\n      int dy    = abs(y2 - y1);\n\n      if (dy > dx)\n      {\n         std::swap(x1,y1);\n         std::swap(dx,dy);\n         std::swap(sx,sy);\n\n         steep = 1;\n      }\n\n      int e = 2 * dy - dx;\n\n      for (int i = 0; i < dx; ++i)\n      {\n         if (steep)\n            plot_pen_pixel(y1,x1);\n         else\n            plot_pen_pixel(x1,y1);\n\n         while (e >= 0)\n         {\n            y1 += sy;\n            e -= (dx << 1);\n         }\n\n         x1 += sx;\n         e  += (dy << 1);\n      }\n\n      plot_pen_pixel(x2,y2);\n   }\n\n   void horiztonal_line_segment(int x1, int x2, int y)\n   {\n      if (x1 > x2)\n      {\n         std::swap(x1,x2);\n      }\n\n      for (int i = 0; i < (x2 - x1); ++i)\n      {\n         plot_pen_pixel(x1 +  i,y);\n      }\n   }\n\n   void vertical_line_segment(int y1, int y2, int x)\n   {\n      if (y1 > y2)\n      {\n         std::swap(y1,y2);\n      }\n\n      for (int i = 0; i < (y2 - y1); ++i)\n      {\n         plot_pen_pixel(x, y1 +  i);\n      }\n   }\n\n   void ellipse(int centerx, int centery, int a, int b)\n   {\n      int t1 = a * a;\n      int t2 = t1 << 1;\n      int t3 = t2 << 1;\n      int t4 = b * b;\n      int t5 = t4 << 1;\n      int t6 = t5 << 1;\n      int t7 = a * t5;\n      int t8 = t7 << 1;\n      int t9 = 0;\n\n      int d1 = t2 - t7 + (t4 >> 1);\n      int d2 = (t1 >> 1) - t8 + t5;\n      int x  = a;\n      int y  = 0;\n\n      int negative_tx = centerx - x;\n      int positive_tx = centerx + x;\n      int negative_ty = centery - y;\n      int positive_ty = centery + y;\n\n      while (d2 < 0)\n      {\n         plot_pen_pixel(positive_tx, positive_ty);\n         plot_pen_pixel(positive_tx, negative_ty);\n         plot_pen_pixel(negative_tx, positive_ty);\n         plot_pen_pixel(negative_tx, negative_ty);\n\n         ++y;\n\n         t9 = t9 + t3;\n\n         if (d1 < 0)\n         {\n            d1 = d1 + t9 + t2;\n            d2 = d2 + 
t9;\n         }\n         else\n         {\n            x--;\n            t8 = t8 - t6;\n            d1 = d1 + (t9 + t2 - t8);\n            d2 = d2 + (t9 + t5 - t8);\n            negative_tx = centerx - x;\n            positive_tx = centerx + x;\n         }\n\n         negative_ty = centery - y;\n         positive_ty = centery + y;\n      }\n\n      do\n      {\n         plot_pen_pixel(positive_tx, positive_ty);\n         plot_pen_pixel(positive_tx, negative_ty);\n         plot_pen_pixel(negative_tx, positive_ty);\n         plot_pen_pixel(negative_tx, negative_ty);\n\n         x--;\n         t8 = t8 - t6;\n\n         if (d2 < 0)\n         {\n            ++y;\n            t9 = t9 + t3;\n            d2 = d2 + (t9 + t5 - t8);\n            negative_ty = centery - y;\n            positive_ty = centery + y;\n         }\n         else\n            d2 = d2 + (t5 - t8);\n\n         negative_tx = centerx - x;\n         positive_tx = centerx + x;\n      }\n      while (x >= 0);\n   }\n\n   void circle(int centerx, int centery, int radius)\n   {\n      int x = 0;\n      int d = (1 - radius) << 1;\n\n      while (radius >= 0)\n      {\n         plot_pen_pixel(centerx + x, centery + radius);\n         plot_pen_pixel(centerx + x, centery - radius);\n         plot_pen_pixel(centerx - x, centery + radius);\n         plot_pen_pixel(centerx - x, centery - radius);\n\n         if ((d + radius) > 0)\n            d -= ((--radius) << 1) - 1;\n         if (x > d)\n            d += ((++x) << 1) + 1;\n      }\n   }\n\n   void plot_pen_pixel(int x, int y)\n   {\n      switch (pen_width_)\n      {\n         case 1  : plot_pixel(x,y);\n                   break;\n\n         case 2  : {\n                      plot_pixel(x    , y    );\n                      plot_pixel(x + 1, y    );\n                      plot_pixel(x + 1, y + 1);\n                      plot_pixel(x    , y + 1);\n                   }\n                   break;\n\n         case  3 : {\n                      plot_pixel(x    , y - 
1);\n                      plot_pixel(x - 1, y - 1);\n                      plot_pixel(x + 1, y - 1);\n\n                      plot_pixel(x    , y    );\n                      plot_pixel(x - 1, y    );\n                      plot_pixel(x + 1, y    );\n\n                      plot_pixel(x    , y + 1);\n                      plot_pixel(x - 1, y + 1);\n                      plot_pixel(x + 1, y + 1);\n                   }\n                   break;\n\n         default : plot_pixel(x,y);\n                   break;\n      }\n   }\n\n   void plot_pixel(int x, int y)\n   {\n      if (\n           (x < 0) ||\n           (y < 0) ||\n           (x >= static_cast<int>(image_.width ())) ||\n           (y >= static_cast<int>(image_.height()))\n         )\n         return;\n\n      image_.set_pixel(x,y,pen_color_red_,pen_color_green_,pen_color_blue_);\n   }\n\n   void pen_width(const unsigned int& width)\n   {\n      if ((width > 0) && (width < 4))\n      {\n         pen_width_ = width;\n      }\n   }\n\n   void pen_color(const unsigned char& red,\n                  const unsigned char& green,\n                  const unsigned char& blue)\n   {\n      pen_color_red_   = red;\n      pen_color_green_ = green;\n      pen_color_blue_  = blue;\n   }\n\n   template <typename RGB>\n   void pen_color(const RGB colour)\n   {\n      pen_color_red_   = colour.red;\n      pen_color_green_ = colour.green;\n      pen_color_blue_  = colour.blue;\n   }\n\nprivate:\n\n   image_drawer(const image_drawer& id);\n   image_drawer& operator =(const image_drawer& id);\n\n   bitmap_image& image_;\n   unsigned int  pen_width_;\n   unsigned char pen_color_red_;\n   unsigned char pen_color_green_;\n   unsigned char pen_color_blue_;\n};\n\nclass cartesian_canvas\n{\npublic:\n\n   cartesian_canvas(const double x_length, const double y_length)\n   : width_div2_ (0.0),\n     height_div2_(0.0),\n     min_x_      (0.0),\n     min_y_      (0.0),\n     max_x_      (0.0),\n     max_y_      (0.0),\n     draw_       
(image_)\n   {\n      setup_canvas(x_length,y_length);\n   }\n\n   inline bool operator!()\n   {\n      return !image_;\n   }\n\n   void rectangle(double x1, double y1, double x2, double y2)\n   {\n      line_segment(x1, y1, x2, y1);\n      line_segment(x2, y1, x2, y2);\n      line_segment(x2, y2, x1, y2);\n      line_segment(x1, y2, x1, y1);\n   }\n\n   void triangle(double x1, double y1, double x2, double y2, double x3, double y3)\n   {\n      line_segment(x1, y1, x2, y2);\n      line_segment(x2, y2, x3, y3);\n      line_segment(x3, y3, x1, y1);\n   }\n\n   void quadix(double x1, double y1, double x2, double y2, double x3, double y3, double x4, double y4)\n   {\n      line_segment(x1, y1, x2, y2);\n      line_segment(x2, y2, x3, y3);\n      line_segment(x3, y3, x4, y4);\n      line_segment(x4, y4, x1, y1);\n   }\n\n   void line_segment(double x1, double y1, double x2, double y2)\n   {\n      if (clip(x1, y1, x2, y2))\n      {\n         const int sc_x1 = static_cast<int>(cart_to_screen_x(x1));\n         const int sc_x2 = static_cast<int>(cart_to_screen_x(x2));\n         const int sc_y1 = static_cast<int>(cart_to_screen_y(y1));\n         const int sc_y2 = static_cast<int>(cart_to_screen_y(y2));\n\n         draw_.line_segment(sc_x1, sc_y1, sc_x2, sc_y2);\n      }\n   }\n\n   void horiztonal_line_segment(double x1, double x2, double y)\n   {\n      x1 = clamp_x(x1);\n      x2 = clamp_x(x2);\n      y  = clamp_y( y);\n\n      const int sc_x1 = static_cast<int>(cart_to_screen_x(x1));\n      const int sc_x2 = static_cast<int>(cart_to_screen_x(x2));\n      const int sc_y  = static_cast<int>(cart_to_screen_y(y ));\n\n      draw_.horiztonal_line_segment(sc_x1, sc_x2, sc_y);\n   }\n\n   void vertical_line_segment(double y1, double y2, double x)\n   {\n      y1 = clamp_y(y1);\n      y2 = clamp_y(y2);\n      x  = clamp_x( x);\n\n      const int sc_y1 = static_cast<int>(cart_to_screen_y(y1));\n      const int sc_y2 = static_cast<int>(cart_to_screen_y(y2));\n      const int sc_x 
 = static_cast<int>(cart_to_screen_x(x ));\n\n      draw_.vertical_line_segment(sc_y1, sc_y2, sc_x);\n   }\n\n   void ellipse(double centerx, double centery, double a, double b)\n   {\n\n      const int sc_cx = static_cast<int>(cart_to_screen_x(centerx));\n      const int sc_cy = static_cast<int>(cart_to_screen_y(centery));\n\n      draw_.ellipse(sc_cx, sc_cy, static_cast<int>(a), static_cast<int>(b));\n   }\n\n   void circle(double centerx, double centery, double radius)\n   {\n      const int sc_cx = static_cast<int>(cart_to_screen_x(centerx));\n      const int sc_cy = static_cast<int>(cart_to_screen_y(centery));\n\n      draw_.circle(sc_cx, sc_cy, static_cast<int>(radius));\n   }\n\n   void fill_rectangle(double x1, double y1, double x2, double y2)\n   {\n      if (y1 > y2)\n         std::swap(y1, y2);\n\n      for (double y = y1; y <= y2; y += 0.5)\n      {\n        line_segment(x1, y, x2, y);\n      }\n   }\n\n   void fill_triangle(double x1, double y1, double x2, double y2, double x3, double y3)\n   {\n      typedef std::pair<double,double> point_t;\n\n      std::vector<point_t> p;\n\n      p.push_back(std::make_pair(x1,y1));\n      p.push_back(std::make_pair(x2,y2));\n      p.push_back(std::make_pair(x3,y3));\n\n      if (p[0].second > p[1].second)\n         std::swap(p[0],p[1]);\n      if (p[0].second > p[2].second)\n         std::swap(p[0],p[2]);\n      if (p[1].second > p[2].second)\n         std::swap(p[1],p[2]);\n\n      class draw_modes\n      {\n      private:\n\n         cartesian_canvas& canvas;\n\n         // Needed for incompetent and broken msvc compiler versions\n         #ifdef _MSC_VER\n            #pragma warning(push)\n            #pragma warning(disable: 4822)\n         #endif\n         draw_modes& operator=(const draw_modes&);\n         #ifdef _MSC_VER\n            #pragma warning(pop)\n         #endif\n\n      public:\n\n         draw_modes(cartesian_canvas& c)\n         : canvas(c)\n         {}\n\n         void bottom(const point_t& p0, 
const point_t& p1, const point_t& p2)\n         {\n            const double m0 = (p1.first - p0.first) / (2.0 * (p1.second - p0.second));\n            const double m1 = (p2.first - p0.first) / (2.0 * (p2.second - p0.second));\n\n            double x0 = p0.first;\n            double x1 = p0.first;\n\n            for (double y = p0.second; y <= p1.second; y += 0.5)\n            {\n               canvas.horiztonal_line_segment(x0, x1, y);\n\n               x0 += m0;\n               x1 += m1;\n            }\n         }\n\n         void top(const point_t& p0, const point_t& p1, const point_t& p2)\n         {\n            const double m0 = (p2.first - p0.first) / (2.0 * (p2.second - p0.second));\n            const double m1 = (p2.first - p1.first) / (2.0 * (p2.second - p1.second));\n\n            double x0 = p2.first;\n            double x1 = p2.first;\n\n            for (double y = p2.second; y >= p0.second; y -= 0.5)\n            {\n               canvas.horiztonal_line_segment(x0, x1, y);\n\n               x0 -= m0;\n               x1 -= m1;\n            }\n         }\n      };\n\n      draw_modes dm(*this);\n\n      const double eps = 0.00001;\n\n      if (std::abs(p[1].second - p[2].second) < eps)\n         dm.bottom(p[0], p[1], p[2]);\n      else if (std::abs(p[0].second - p[1].second) < eps)\n         dm.top(p[0], p[1], p[2]);\n      else\n      {\n         point_t p3;\n\n         p3.first  = (p[0].first + ((p[1].second - p[0].second) / (p[2].second - p[0].second)) * (p[2].first - p[0].first));\n         p3.second = p[1].second;\n\n         dm.bottom(p[0], p[1], p3  );\n         dm.top   (p[1], p3  , p[2]);\n      }\n   }\n\n   void fill_quadix(double x1, double y1, double x2, double y2, double x3, double y3, double x4, double y4)\n   {\n      fill_triangle(x1, y1, x2, y2, x3, y3);\n      fill_triangle(x1, y1, x3, y3, x4, y4);\n   }\n\n   void fill_circle(double cx, double cy, double radius)\n   {\n      const double delta = 1.0;\n      double  x = radius;\n      
double  y = 0.0;\n      double dx = delta - (2.0 * delta * radius);\n      double dy = 0.0;\n      double dr = 0.0;\n\n      while (x >= y)\n      {\n         for (double i = cx - x; i <= cx + x; i += delta)\n         {\n            horiztonal_line_segment(cx - x, cx + x, cy + y);\n            horiztonal_line_segment(cx - x, cx + x, cy - y);\n         }\n\n         for (double i = cx - y; i <= cx + y; i += delta)\n         {\n            horiztonal_line_segment(cx - y, cx + y, cy + x);\n            horiztonal_line_segment(cx - y, cx + y, cy - x);\n         }\n\n         y += delta;\n\n         dr += dy;\n         dy += 2.0 * delta;\n\n         if ((2.0 * delta * dr + dx) > 0)\n         {\n             x -= delta;\n            dr +=  dx;\n            dx += 2.0 * delta;\n         }\n      }\n   }\n\n   void plot_pen_pixel(double x, double y)\n   {\n      if ((x < min_x_) || (x > max_x_)) return;\n      if ((y < min_y_) || (y > max_y_)) return;\n\n      const int sc_x = static_cast<int>(cart_to_screen_x(x));\n      const int sc_y = static_cast<int>(cart_to_screen_y(y));\n\n      draw_.plot_pen_pixel(sc_x, sc_y);\n   }\n\n   void plot_pixel(double x, double y)\n   {\n      if ((x < min_x_) || (x > max_x_)) return;\n      if ((y < min_y_) || (y > max_y_)) return;\n\n      const int sc_x = static_cast<int>(cart_to_screen_x(x));\n      const int sc_y = static_cast<int>(cart_to_screen_y(y));\n\n      draw_.plot_pixel(sc_x, sc_y);\n   }\n\n   void pen_width(const unsigned int& width)\n   {\n      draw_.pen_width(width);\n   }\n\n   void pen_color(const unsigned char&   red,\n                  const unsigned char& green,\n                  const unsigned char&  blue)\n   {\n      draw_.pen_color(red,green,blue);\n   }\n\n   template <typename RGB>\n   void pen_color(const RGB colour)\n   {\n      draw_.pen_color(colour);\n   }\n\n   const bitmap_image& image() const\n   {\n      return image_;\n   }\n\n   bitmap_image& image()\n   {\n      return image_;\n   }\n\n   void 
set_widthheight(const double x_length, const double y_length)\n   {\n      setup_canvas(x_length, y_length);\n   }\n\n   double min_x() const { return min_x_; }\n   double min_y() const { return min_y_; }\n   double max_x() const { return max_x_; }\n   double max_y() const { return max_y_; }\n\nprivate:\n\n   void setup_canvas(const double x_length, const double y_length)\n   {\n      if ((x_length < 2.0) || (y_length < 2.0))\n         return;\n\n      width_div2_  = x_length / 2.0;\n      height_div2_ = y_length / 2.0;\n\n      min_x_ = -width_div2_ ;\n      min_y_ = -height_div2_;\n      max_x_ =  width_div2_ ;\n      max_y_ =  height_div2_;\n\n      image_.setwidth_height(static_cast<unsigned int>(x_length) + 1, static_cast<unsigned int>(y_length) + 1);\n\n      image_.clear(0xFF);\n   }\n\n   double clamp_x(const double& x)\n   {\n           if (x < min_x_)  return min_x_;\n      else if (x > max_x_)  return max_x_;\n      else                  return x;\n   }\n\n   double clamp_y(const double& y)\n   {\n           if (y < min_y_)  return min_y_;\n      else if (y > max_y_)  return max_y_;\n      else                  return y;\n   }\n\n   double cart_to_screen_x(const double& x)\n   {\n      return x + width_div2_;\n   }\n\n   double cart_to_screen_y(const double& y)\n   {\n      return height_div2_ - y;\n   }\n\n   enum clip_code\n   {\n      e_clip_bottom = 1,\n      e_clip_top    = 2,\n      e_clip_left   = 4,\n      e_clip_right  = 8\n   };\n\n   int out_code(\n                 const double&  x, const double&  y,\n                 const double& x1, const double& y1,\n                 const double& x2, const double& y2\n               )\n   {\n      int result = 0;\n      if (y < y1)      result |= e_clip_bottom;\n      else if (y > y2) result |= e_clip_top;\n\n      if (x < x1)      result |= e_clip_left;\n      else if (x > x2) result |= e_clip_right;\n\n      return result;\n   }\n\n   bool clip(double& x1, double& y1, double& x2, double& y2)\n   {\n     
 bool   result = false;\n      double x      = 0.0;\n      double y      = 0.0;\n\n      int outcode0   = out_code(x1, y1, min_x_, min_y_, max_x_, max_y_);\n      int outcode1   = out_code(x2, y2, min_x_, min_y_, max_x_, max_y_);\n      int outcodeout = 0;\n\n      while ((outcode0 != 0) || (outcode1 != 0))\n      {\n         if ((outcode0 & outcode1) != 0)\n            return result;\n         else\n         {\n            if (outcode0 != 0)\n               outcodeout = outcode0;\n            else\n               outcodeout = outcode1;\n\n            double dx = (x2 - x1);\n            double dy = (y2 - y1);\n\n            if ((outcodeout & e_clip_bottom) == e_clip_bottom)\n            {\n               x = x1 + dx * (min_y_ - y1) / dy;\n               y = min_y_;\n            }\n            else if ((outcodeout & e_clip_top) == e_clip_top)\n            {\n               x = x1 + dx * (max_y_ - y1) / dy;\n               y = max_y_;\n            }\n            else if ((outcodeout & e_clip_right) == e_clip_right)\n            {\n               y = y1 + dy * (max_x_ - x1) / dx;\n               x = max_x_;\n            }\n            else if ((outcodeout & e_clip_left) == e_clip_left)\n            {\n               y = y1 + dy * (min_x_ - x1) / dx;\n               x = min_x_;\n            }\n\n            if (outcodeout == outcode0)\n            {\n               x1 = x;\n               y1 = y;\n               outcode0 = out_code(x1, y1, min_x_, min_y_, max_x_, max_y_);\n            }\n            else\n            {\n               x2 = x;\n               y2 = y;\n               outcode1 = out_code(x2, y2, min_x_, min_y_, max_x_, max_y_);\n            }\n         }\n      }\n\n      return true;\n   }\n\n   cartesian_canvas(const cartesian_canvas&);\n   cartesian_canvas operator=(const cartesian_canvas&);\n\n   double width_div2_;\n   double height_div2_;\n   double min_x_;\n   double min_y_;\n   double max_x_;\n   double max_y_;\n   bitmap_image image_;\n   
image_drawer draw_;\n};\n\ninline rgb_t convert_wave_length_nm_to_rgb(const double wave_length_nm)\n{\n   // Credits: Dan Bruton http://www.physics.sfasu.edu/astro/color.html\n   double red   = 0.0;\n   double green = 0.0;\n   double blue  = 0.0;\n\n   if ((380.0 <= wave_length_nm) && (wave_length_nm <= 439.0))\n   {\n      red   = -(wave_length_nm - 440.0) / (440.0 - 380.0);\n      green = 0.0;\n      blue  = 1.0;\n   }\n   else if ((440.0 <= wave_length_nm) && (wave_length_nm <= 489.0))\n   {\n      red   = 0.0;\n      green = (wave_length_nm - 440.0) / (490.0 - 440.0);\n      blue  = 1.0;\n   }\n   else if ((490.0 <= wave_length_nm) && (wave_length_nm <= 509.0))\n   {\n      red   = 0.0;\n      green = 1.0;\n      blue  = -(wave_length_nm - 510.0) / (510.0 - 490.0);\n   }\n   else if ((510.0 <= wave_length_nm) && (wave_length_nm <= 579.0))\n   {\n      red   = (wave_length_nm - 510.0) / (580.0 - 510.0);\n      green = 1.0;\n      blue  = 0.0;\n   }\n   else if ((580.0 <= wave_length_nm) && (wave_length_nm <= 644.0))\n   {\n      red   = 1.0;\n      green = -(wave_length_nm - 645.0) / (645.0 - 580.0);\n      blue  = 0.0;\n   }\n   else if ((645.0 <= wave_length_nm) && (wave_length_nm <= 780.0))\n   {\n      red   = 1.0;\n      green = 0.0;\n      blue  = 0.0;\n   }\n\n   double factor = 0.0;\n\n   if ((380.0 <= wave_length_nm) && (wave_length_nm <= 419.0))\n      factor = 0.3 + 0.7 * (wave_length_nm - 380.0) / (420.0 - 380.0);\n   else if ((420.0 <= wave_length_nm) && (wave_length_nm <= 700.0))\n      factor = 1.0;\n   else if ((701.0 <= wave_length_nm) && (wave_length_nm <= 780.0))\n      factor = 0.3 + 0.7 * (780.0 - wave_length_nm) / (780.0 - 700.0);\n   else\n      factor = 0.0;\n\n   rgb_t result;\n\n   const double gamma         =   0.8;\n   const double intensity_max = 255.0;\n\n   #define round(d) std::floor(d + 0.5)\n\n   result.red   = static_cast<unsigned char>((red   == 0.0) ? 
red   : round(intensity_max * std::pow(red   * factor, gamma)));\n   result.green = static_cast<unsigned char>((green == 0.0) ? green : round(intensity_max * std::pow(green * factor, gamma)));\n   result.blue  = static_cast<unsigned char>((blue  == 0.0) ? blue  : round(intensity_max * std::pow(blue  * factor, gamma)));\n\n   #undef round\n\n   return result;\n}\n\ninline double weighted_distance(const unsigned char r0, const unsigned char g0, const unsigned char b0,\n                                const unsigned char r1, const unsigned char g1, const unsigned char b1)\n{\n   const double diff_r = /*0.30 */ (r0 - r1);\n   const double diff_g = /*0.59 */ (g0 - g1);\n   const double diff_b = /*0.11 */ (b0 - b1);\n\n   return std::sqrt((diff_r * diff_r) + (diff_g * diff_g) + (diff_b * diff_b));\n}\n\ninline double weighted_distance(const rgb_t c0, const rgb_t c1)\n{\n   return weighted_distance(c0.red, c0.green, c0.blue,\n                            c1.red, c1.green, c1.blue);\n}\n\ntemplate <typename Iterator>\ninline rgb_t find_nearest_color(const rgb_t& c, const Iterator begin, const Iterator end)\n{\n   if (0 == std::distance(begin,end))\n      return c;\n\n   double min_d = std::numeric_limits<double>::max();\n   rgb_t result = *begin;\n\n   for (Iterator itr = begin; itr != end; ++itr)\n   {\n      if (c == (*itr))\n      {\n         return (*itr);\n      }\n\n      double curr_d = weighted_distance(c,*itr);\n\n      if (curr_d < min_d)\n      {\n          min_d = curr_d;\n         result = *itr;\n      }\n   }\n\n   return result;\n}\n\ntemplate <template <typename,typename> class Sequence,\n          typename Allocator>\ninline rgb_t find_nearest_color(const rgb_t& c, const Sequence<rgb_t,Allocator>& seq)\n{\n   return find_nearest_color(c, seq.begin(),seq.end());\n}\n\ntemplate <std::size_t N>\ninline rgb_t find_nearest_color(const rgb_t& c, const rgb_t (&colors)[N])\n{\n   return find_nearest_color(c, colors, colors + N);\n}\n\ninline double 
find_nearest_wave_length(const rgb_t& c, const double increment = 0.001)\n{\n   const double max_wave_length = 800.0; //800nm\n\n   double min_wave_length = 0.0;\n   double min_d           = std::numeric_limits<double>::max();\n\n   for (double i = 0.0; i < max_wave_length; i += increment)\n   {\n      const rgb_t  curr_rgb = convert_wave_length_nm_to_rgb(i);\n\n      if (c == curr_rgb)\n      {\n         return i;\n      }\n\n      const double curr_d = weighted_distance(c, curr_rgb);\n\n      if (curr_d <= min_d)\n      {\n         min_wave_length = i;\n         min_d = curr_d;\n      }\n   }\n\n   return min_wave_length;\n}\n\ntemplate <typename T>\nclass response_image\n{\npublic:\n\n   response_image(const std::size_t& width, const std::size_t& height, const T null = T(0))\n   : width_ (width ),\n     height_(height),\n     null_  (null  )\n   {\n      data_.resize(width_ * height_);\n   }\n\n   std::size_t width () const { return  width_; }\n   std::size_t height() const { return height_; }\n\n   void set_all(const T& t)\n   {\n      std::fill_n(data_.begin(), data_.size(), t);\n   }\n\n   const T& operator()(const std::size_t& x, const std::size_t& y) const\n   {\n      if (y >= height_) return null_;\n      if (x >= width_ ) return null_;\n\n      return data_[width_ * y + x];\n   }\n\n   T& operator()(const std::size_t& x, const std::size_t& y)\n   {\n      if (y >= height_) return null_;\n      if (x >= width_ ) return null_;\n\n      return data_[width_ * y + x];\n   }\n\n   bool valid(const std::size_t& x, const std::size_t& y)\n   {\n      return ((x < width_ ) || (y < height_));\n   }\n\n   void inc_all(const T& v)\n   {\n      for (std::size_t i = 0; i < data_.size(); ++i)\n      {\n         data_[i] += v;\n      }\n   }\n\n   void mul_all(const T& v)\n   {\n      for (std::size_t i = 0; i < data_.size(); ++i)\n      {\n         data_[i] *= v;\n      }\n   }\n\n   T* row (const std::size_t& row_index)\n   {\n      if (row_index < height_)\n         
return &data_[width_ * row_index];\n      else\n         return reinterpret_cast<T*>(0);\n   }\n\n   const T* row (const std::size_t& row_index) const\n   {\n      if (row_index < height_)\n         return data_[width_ * row_index];\n      else\n         return reinterpret_cast<T*>(0);\n   }\n\nprivate:\n\n   std::size_t    width_;\n   std::size_t    height_;\n   std::vector<T> data_;\n   T              null_;\n};\n\ninline void sobel_operator(const bitmap_image& src_image,\n                                 bitmap_image& dst_image,\n                           const double threshold = 0.0)\n{\n   typedef double T;\n\n   response_image<T> im0(src_image.width(), src_image.height(), 0.0);\n   response_image<T> im1(src_image.width(), src_image.height(), 0.0);\n\n   src_image.export_gray_scale_response_image(&im0(0,0));\n\n   for (std::size_t y = 1; y < im0.height() - 1; ++y)\n   {\n      const T* itr0 = im0.row(y - 1);\n      const T* itr1 = im0.row(y    );\n      const T* itr2 = im0.row(y + 1);\n            T* out  = im1.row(y    ) + 1;\n\n      for (std::size_t x = 1; x < im0.width() - 1; ++x)\n      {\n         const T c0 = *(itr0 + x - 1);   const T c1 = *(itr0 + x);   const T c2 = *(itr0 + x + 1);\n         const T c3 = *(itr1 + x - 1); /*const T c4 = *(itr1 + x);*/ const T c5 = *(itr1 + x + 1);\n         const T c6 = *(itr2 + x - 1);   const T c7 = *(itr2 + x);   const T c8 = *(itr2 + x + 1);\n\n         const T gx = (2.0 * (c5 - c3)) + (c2 - c0) + (c8 - c6);\n         const T gy = (2.0 * (c1 - c7)) + (c0 - c6) + (c2 - c8);\n\n         *(out++) = std::sqrt((gx * gx) + (gy * gy));\n      }\n   }\n\n   if (threshold > 0.0)\n   {\n      const T* end = im1.row(0) + (im1.width() * im1.height());\n\n      for (T* itr = im1.row(0); itr != end; ++itr)\n      {\n         T& v = *itr;\n         if (v <= threshold) v = 0;\n      }\n   }\n\n   dst_image.setwidth_height\n             (\n               static_cast<unsigned int>(im1.width()),\n               static_cast<unsigned 
int>(im1.height())\n             );\n\n   dst_image.import_gray_scale_clamped(&im1(0,0));\n}\n\nenum palette_name\n{\n   e_red,           e_scarlet,      e_vermilion,        e_tangelo,         e_orange,\n   e_gamboge,       e_amber,        e_gold,             e_yellow,          e_apple_green,\n   e_lime_green,    e_spring_bud,   e_chartreuse_green, e_pistachio,       e_harlequin,\n   e_sap_green,     e_green,        e_emerald_green,    e_malachite_green, e_sea_green,\n   e_spring_green,  e_aquamarine,   e_turquoise,        e_opal,            e_cyan,\n   e_arctic_blue,   e_cerulean,     e_cornflower_blue,  e_azure,           e_cobalt_blue,\n   e_sapphire_blue, e_phthalo_blue, e_blue,             e_persian_blue,    e_indigo,\n   e_blue_violet,   e_violet,       e_purple,           e_mulberry,        e_heliotrope,\n   e_magenta,       e_orchid,       e_fuchsia,          e_cerise,          e_rose,\n   e_raspberry,     e_crimson,      e_amaranth,         e_white,           e_black\n};\n\nconst rgb_t palette_colormap[] = {\n   {255,   0,   0}, {255,  31,   0}, {255,  63,   0}, {255,  95,   0}, {255, 127,   0},\n   {255, 159,   0}, {255, 191,   0}, {255, 223,   0}, {255, 255,   0}, {223, 255,   0},\n   {191, 255,   0}, {159, 255,   0}, {127, 255,   0}, { 95, 255,   0}, { 63, 255,   0},\n   { 31, 255,   0}, {  0, 255,   0}, {  0, 255,  31}, {  0, 255,  63}, {  0, 255,  95},\n   {  0, 255, 127}, {  0, 255, 159}, {  0, 255, 191}, {  0, 255, 223}, {  0, 255, 255},\n   {  0, 223, 255}, {  0, 191, 255}, {  0, 159, 255}, {  0, 127, 255}, {  0,  95, 255},\n   {  0,  63, 255}, {  0,  31, 255}, {  0,   0, 255}, { 31,   0, 255}, { 63,   0, 255},\n   { 95,   0, 255}, {127,   0, 255}, {159,   0, 255}, {191,   0, 255}, {223,   0, 255},\n   {255,   0, 255}, {255,   0, 223}, {255,   0, 191}, {255,   0, 159}, {255,   0, 127},\n   {255,   0,  95}, {255,   0,  63}, {255,   0,  31}, {255, 255, 255}, {  0,   0,   0}\n};\n\nconst rgb_t autumn_colormap[1000] = {\n   {255,   0,   0}, {255,   0,  
 0}, {255,   1,   0}, {255,   1,   0}, {255,   1,   0},\n   {255,   1,   0}, {255,   2,   0}, {255,   2,   0}, {255,   2,   0}, {255,   2,   0},\n   {255,   3,   0}, {255,   3,   0}, {255,   3,   0}, {255,   3,   0}, {255,   4,   0},\n   {255,   4,   0}, {255,   4,   0}, {255,   4,   0}, {255,   5,   0}, {255,   5,   0},\n   {255,   5,   0}, {255,   5,   0}, {255,   6,   0}, {255,   6,   0}, {255,   6,   0},\n   {255,   6,   0}, {255,   7,   0}, {255,   7,   0}, {255,   7,   0}, {255,   7,   0},\n   {255,   8,   0}, {255,   8,   0}, {255,   8,   0}, {255,   8,   0}, {255,   9,   0},\n   {255,   9,   0}, {255,   9,   0}, {255,   9,   0}, {255,  10,   0}, {255,  10,   0},\n   {255,  10,   0}, {255,  10,   0}, {255,  11,   0}, {255,  11,   0}, {255,  11,   0},\n   {255,  11,   0}, {255,  12,   0}, {255,  12,   0}, {255,  12,   0}, {255,  13,   0},\n   {255,  13,   0}, {255,  13,   0}, {255,  13,   0}, {255,  14,   0}, {255,  14,   0},\n   {255,  14,   0}, {255,  14,   0}, {255,  15,   0}, {255,  15,   0}, {255,  15,   0},\n   {255,  15,   0}, {255,  16,   0}, {255,  16,   0}, {255,  16,   0}, {255,  16,   0},\n   {255,  17,   0}, {255,  17,   0}, {255,  17,   0}, {255,  17,   0}, {255,  18,   0},\n   {255,  18,   0}, {255,  18,   0}, {255,  18,   0}, {255,  19,   0}, {255,  19,   0},\n   {255,  19,   0}, {255,  19,   0}, {255,  20,   0}, {255,  20,   0}, {255,  20,   0},\n   {255,  20,   0}, {255,  21,   0}, {255,  21,   0}, {255,  21,   0}, {255,  21,   0},\n   {255,  22,   0}, {255,  22,   0}, {255,  22,   0}, {255,  22,   0}, {255,  23,   0},\n   {255,  23,   0}, {255,  23,   0}, {255,  23,   0}, {255,  24,   0}, {255,  24,   0},\n   {255,  24,   0}, {255,  25,   0}, {255,  25,   0}, {255,  25,   0}, {255,  25,   0},\n   {255,  26,   0}, {255,  26,   0}, {255,  26,   0}, {255,  26,   0}, {255,  27,   0},\n   {255,  27,   0}, {255,  27,   0}, {255,  27,   0}, {255,  28,   0}, {255,  28,   0},\n   {255,  28,   0}, {255,  28,   0}, {255,  29,   0}, {255,  29,   0}, 
{255,  29,   0},\n   {255,  29,   0}, {255,  30,   0}, {255,  30,   0}, {255,  30,   0}, {255,  30,   0},\n   {255,  31,   0}, {255,  31,   0}, {255,  31,   0}, {255,  31,   0}, {255,  32,   0},\n   {255,  32,   0}, {255,  32,   0}, {255,  32,   0}, {255,  33,   0}, {255,  33,   0},\n   {255,  33,   0}, {255,  33,   0}, {255,  34,   0}, {255,  34,   0}, {255,  34,   0},\n   {255,  34,   0}, {255,  35,   0}, {255,  35,   0}, {255,  35,   0}, {255,  35,   0},\n   {255,  36,   0}, {255,  36,   0}, {255,  36,   0}, {255,  37,   0}, {255,  37,   0},\n   {255,  37,   0}, {255,  37,   0}, {255,  38,   0}, {255,  38,   0}, {255,  38,   0},\n   {255,  38,   0}, {255,  39,   0}, {255,  39,   0}, {255,  39,   0}, {255,  39,   0},\n   {255,  40,   0}, {255,  40,   0}, {255,  40,   0}, {255,  40,   0}, {255,  41,   0},\n   {255,  41,   0}, {255,  41,   0}, {255,  41,   0}, {255,  42,   0}, {255,  42,   0},\n   {255,  42,   0}, {255,  42,   0}, {255,  43,   0}, {255,  43,   0}, {255,  43,   0},\n   {255,  43,   0}, {255,  44,   0}, {255,  44,   0}, {255,  44,   0}, {255,  44,   0},\n   {255,  45,   0}, {255,  45,   0}, {255,  45,   0}, {255,  45,   0}, {255,  46,   0},\n   {255,  46,   0}, {255,  46,   0}, {255,  46,   0}, {255,  47,   0}, {255,  47,   0},\n   {255,  47,   0}, {255,  47,   0}, {255,  48,   0}, {255,  48,   0}, {255,  48,   0},\n   {255,  48,   0}, {255,  49,   0}, {255,  49,   0}, {255,  49,   0}, {255,  50,   0},\n   {255,  50,   0}, {255,  50,   0}, {255,  50,   0}, {255,  51,   0}, {255,  51,   0},\n   {255,  51,   0}, {255,  51,   0}, {255,  52,   0}, {255,  52,   0}, {255,  52,   0},\n   {255,  52,   0}, {255,  53,   0}, {255,  53,   0}, {255,  53,   0}, {255,  53,   0},\n   {255,  54,   0}, {255,  54,   0}, {255,  54,   0}, {255,  54,   0}, {255,  55,   0},\n   {255,  55,   0}, {255,  55,   0}, {255,  55,   0}, {255,  56,   0}, {255,  56,   0},\n   {255,  56,   0}, {255,  56,   0}, {255,  57,   0}, {255,  57,   0}, {255,  57,   0},\n   {255,  57,   0}, 
{255,  58,   0}, {255,  58,   0}, {255,  58,   0}, {255,  58,   0},\n   {255,  59,   0}, {255,  59,   0}, {255,  59,   0}, {255,  59,   0}, {255,  60,   0},\n   {255,  60,   0}, {255,  60,   0}, {255,  60,   0}, {255,  61,   0}, {255,  61,   0},\n   {255,  61,   0}, {255,  62,   0}, {255,  62,   0}, {255,  62,   0}, {255,  62,   0},\n   {255,  63,   0}, {255,  63,   0}, {255,  63,   0}, {255,  63,   0}, {255,  64,   0},\n   {255,  64,   0}, {255,  64,   0}, {255,  64,   0}, {255,  65,   0}, {255,  65,   0},\n   {255,  65,   0}, {255,  65,   0}, {255,  66,   0}, {255,  66,   0}, {255,  66,   0},\n   {255,  66,   0}, {255,  67,   0}, {255,  67,   0}, {255,  67,   0}, {255,  67,   0},\n   {255,  68,   0}, {255,  68,   0}, {255,  68,   0}, {255,  68,   0}, {255,  69,   0},\n   {255,  69,   0}, {255,  69,   0}, {255,  69,   0}, {255,  70,   0}, {255,  70,   0},\n   {255,  70,   0}, {255,  70,   0}, {255,  71,   0}, {255,  71,   0}, {255,  71,   0},\n   {255,  71,   0}, {255,  72,   0}, {255,  72,   0}, {255,  72,   0}, {255,  72,   0},\n   {255,  73,   0}, {255,  73,   0}, {255,  73,   0}, {255,  74,   0}, {255,  74,   0},\n   {255,  74,   0}, {255,  74,   0}, {255,  75,   0}, {255,  75,   0}, {255,  75,   0},\n   {255,  75,   0}, {255,  76,   0}, {255,  76,   0}, {255,  76,   0}, {255,  76,   0},\n   {255,  77,   0}, {255,  77,   0}, {255,  77,   0}, {255,  77,   0}, {255,  78,   0},\n   {255,  78,   0}, {255,  78,   0}, {255,  78,   0}, {255,  79,   0}, {255,  79,   0},\n   {255,  79,   0}, {255,  79,   0}, {255,  80,   0}, {255,  80,   0}, {255,  80,   0},\n   {255,  80,   0}, {255,  81,   0}, {255,  81,   0}, {255,  81,   0}, {255,  81,   0},\n   {255,  82,   0}, {255,  82,   0}, {255,  82,   0}, {255,  82,   0}, {255,  83,   0},\n   {255,  83,   0}, {255,  83,   0}, {255,  83,   0}, {255,  84,   0}, {255,  84,   0},\n   {255,  84,   0}, {255,  84,   0}, {255,  85,   0}, {255,  85,   0}, {255,  85,   0},\n   {255,  86,   0}, {255,  86,   0}, {255,  86,   0}, {255,  
86,   0}, {255,  87,   0},\n   {255,  87,   0}, {255,  87,   0}, {255,  87,   0}, {255,  88,   0}, {255,  88,   0},\n   {255,  88,   0}, {255,  88,   0}, {255,  89,   0}, {255,  89,   0}, {255,  89,   0},\n   {255,  89,   0}, {255,  90,   0}, {255,  90,   0}, {255,  90,   0}, {255,  90,   0},\n   {255,  91,   0}, {255,  91,   0}, {255,  91,   0}, {255,  91,   0}, {255,  92,   0},\n   {255,  92,   0}, {255,  92,   0}, {255,  92,   0}, {255,  93,   0}, {255,  93,   0},\n   {255,  93,   0}, {255,  93,   0}, {255,  94,   0}, {255,  94,   0}, {255,  94,   0},\n   {255,  94,   0}, {255,  95,   0}, {255,  95,   0}, {255,  95,   0}, {255,  95,   0},\n   {255,  96,   0}, {255,  96,   0}, {255,  96,   0}, {255,  96,   0}, {255,  97,   0},\n   {255,  97,   0}, {255,  97,   0}, {255,  98,   0}, {255,  98,   0}, {255,  98,   0},\n   {255,  98,   0}, {255,  99,   0}, {255,  99,   0}, {255,  99,   0}, {255,  99,   0},\n   {255, 100,   0}, {255, 100,   0}, {255, 100,   0}, {255, 100,   0}, {255, 101,   0},\n   {255, 101,   0}, {255, 101,   0}, {255, 101,   0}, {255, 102,   0}, {255, 102,   0},\n   {255, 102,   0}, {255, 102,   0}, {255, 103,   0}, {255, 103,   0}, {255, 103,   0},\n   {255, 103,   0}, {255, 104,   0}, {255, 104,   0}, {255, 104,   0}, {255, 104,   0},\n   {255, 105,   0}, {255, 105,   0}, {255, 105,   0}, {255, 105,   0}, {255, 106,   0},\n   {255, 106,   0}, {255, 106,   0}, {255, 106,   0}, {255, 107,   0}, {255, 107,   0},\n   {255, 107,   0}, {255, 107,   0}, {255, 108,   0}, {255, 108,   0}, {255, 108,   0},\n   {255, 108,   0}, {255, 109,   0}, {255, 109,   0}, {255, 109,   0}, {255, 110,   0},\n   {255, 110,   0}, {255, 110,   0}, {255, 110,   0}, {255, 111,   0}, {255, 111,   0},\n   {255, 111,   0}, {255, 111,   0}, {255, 112,   0}, {255, 112,   0}, {255, 112,   0},\n   {255, 112,   0}, {255, 113,   0}, {255, 113,   0}, {255, 113,   0}, {255, 113,   0},\n   {255, 114,   0}, {255, 114,   0}, {255, 114,   0}, {255, 114,   0}, {255, 115,   0},\n   {255, 115, 
  0}, {255, 115,   0}, {255, 115,   0}, {255, 116,   0}, {255, 116,   0},\n   {255, 116,   0}, {255, 116,   0}, {255, 117,   0}, {255, 117,   0}, {255, 117,   0},\n   {255, 117,   0}, {255, 118,   0}, {255, 118,   0}, {255, 118,   0}, {255, 118,   0},\n   {255, 119,   0}, {255, 119,   0}, {255, 119,   0}, {255, 119,   0}, {255, 120,   0},\n   {255, 120,   0}, {255, 120,   0}, {255, 120,   0}, {255, 121,   0}, {255, 121,   0},\n   {255, 121,   0}, {255, 122,   0}, {255, 122,   0}, {255, 122,   0}, {255, 122,   0},\n   {255, 123,   0}, {255, 123,   0}, {255, 123,   0}, {255, 123,   0}, {255, 124,   0},\n   {255, 124,   0}, {255, 124,   0}, {255, 124,   0}, {255, 125,   0}, {255, 125,   0},\n   {255, 125,   0}, {255, 125,   0}, {255, 126,   0}, {255, 126,   0}, {255, 126,   0},\n   {255, 126,   0}, {255, 127,   0}, {255, 127,   0}, {255, 127,   0}, {255, 127,   0},\n   {255, 128,   0}, {255, 128,   0}, {255, 128,   0}, {255, 128,   0}, {255, 129,   0},\n   {255, 129,   0}, {255, 129,   0}, {255, 129,   0}, {255, 130,   0}, {255, 130,   0},\n   {255, 130,   0}, {255, 130,   0}, {255, 131,   0}, {255, 131,   0}, {255, 131,   0},\n   {255, 131,   0}, {255, 132,   0}, {255, 132,   0}, {255, 132,   0}, {255, 132,   0},\n   {255, 133,   0}, {255, 133,   0}, {255, 133,   0}, {255, 133,   0}, {255, 134,   0},\n   {255, 134,   0}, {255, 134,   0}, {255, 135,   0}, {255, 135,   0}, {255, 135,   0},\n   {255, 135,   0}, {255, 136,   0}, {255, 136,   0}, {255, 136,   0}, {255, 136,   0},\n   {255, 137,   0}, {255, 137,   0}, {255, 137,   0}, {255, 137,   0}, {255, 138,   0},\n   {255, 138,   0}, {255, 138,   0}, {255, 138,   0}, {255, 139,   0}, {255, 139,   0},\n   {255, 139,   0}, {255, 139,   0}, {255, 140,   0}, {255, 140,   0}, {255, 140,   0},\n   {255, 140,   0}, {255, 141,   0}, {255, 141,   0}, {255, 141,   0}, {255, 141,   0},\n   {255, 142,   0}, {255, 142,   0}, {255, 142,   0}, {255, 142,   0}, {255, 143,   0},\n   {255, 143,   0}, {255, 143,   0}, {255, 143,   0}, 
{255, 144,   0}, {255, 144,   0},\n   {255, 144,   0}, {255, 144,   0}, {255, 145,   0}, {255, 145,   0}, {255, 145,   0},\n   {255, 145,   0}, {255, 146,   0}, {255, 146,   0}, {255, 146,   0}, {255, 147,   0},\n   {255, 147,   0}, {255, 147,   0}, {255, 147,   0}, {255, 148,   0}, {255, 148,   0},\n   {255, 148,   0}, {255, 148,   0}, {255, 149,   0}, {255, 149,   0}, {255, 149,   0},\n   {255, 149,   0}, {255, 150,   0}, {255, 150,   0}, {255, 150,   0}, {255, 150,   0},\n   {255, 151,   0}, {255, 151,   0}, {255, 151,   0}, {255, 151,   0}, {255, 152,   0},\n   {255, 152,   0}, {255, 152,   0}, {255, 152,   0}, {255, 153,   0}, {255, 153,   0},\n   {255, 153,   0}, {255, 153,   0}, {255, 154,   0}, {255, 154,   0}, {255, 154,   0},\n   {255, 154,   0}, {255, 155,   0}, {255, 155,   0}, {255, 155,   0}, {255, 155,   0},\n   {255, 156,   0}, {255, 156,   0}, {255, 156,   0}, {255, 156,   0}, {255, 157,   0},\n   {255, 157,   0}, {255, 157,   0}, {255, 157,   0}, {255, 158,   0}, {255, 158,   0},\n   {255, 158,   0}, {255, 159,   0}, {255, 159,   0}, {255, 159,   0}, {255, 159,   0},\n   {255, 160,   0}, {255, 160,   0}, {255, 160,   0}, {255, 160,   0}, {255, 161,   0},\n   {255, 161,   0}, {255, 161,   0}, {255, 161,   0}, {255, 162,   0}, {255, 162,   0},\n   {255, 162,   0}, {255, 162,   0}, {255, 163,   0}, {255, 163,   0}, {255, 163,   0},\n   {255, 163,   0}, {255, 164,   0}, {255, 164,   0}, {255, 164,   0}, {255, 164,   0},\n   {255, 165,   0}, {255, 165,   0}, {255, 165,   0}, {255, 165,   0}, {255, 166,   0},\n   {255, 166,   0}, {255, 166,   0}, {255, 166,   0}, {255, 167,   0}, {255, 167,   0},\n   {255, 167,   0}, {255, 167,   0}, {255, 168,   0}, {255, 168,   0}, {255, 168,   0},\n   {255, 168,   0}, {255, 169,   0}, {255, 169,   0}, {255, 169,   0}, {255, 169,   0},\n   {255, 170,   0}, {255, 170,   0}, {255, 170,   0}, {255, 171,   0}, {255, 171,   0},\n   {255, 171,   0}, {255, 171,   0}, {255, 172,   0}, {255, 172,   0}, {255, 172,   0},\n   
{255, 172,   0}, {255, 173,   0}, {255, 173,   0}, {255, 173,   0}, {255, 173,   0},\n   {255, 174,   0}, {255, 174,   0}, {255, 174,   0}, {255, 174,   0}, {255, 175,   0},\n   {255, 175,   0}, {255, 175,   0}, {255, 175,   0}, {255, 176,   0}, {255, 176,   0},\n   {255, 176,   0}, {255, 176,   0}, {255, 177,   0}, {255, 177,   0}, {255, 177,   0},\n   {255, 177,   0}, {255, 178,   0}, {255, 178,   0}, {255, 178,   0}, {255, 178,   0},\n   {255, 179,   0}, {255, 179,   0}, {255, 179,   0}, {255, 179,   0}, {255, 180,   0},\n   {255, 180,   0}, {255, 180,   0}, {255, 180,   0}, {255, 181,   0}, {255, 181,   0},\n   {255, 181,   0}, {255, 181,   0}, {255, 182,   0}, {255, 182,   0}, {255, 182,   0},\n   {255, 183,   0}, {255, 183,   0}, {255, 183,   0}, {255, 183,   0}, {255, 184,   0},\n   {255, 184,   0}, {255, 184,   0}, {255, 184,   0}, {255, 185,   0}, {255, 185,   0},\n   {255, 185,   0}, {255, 185,   0}, {255, 186,   0}, {255, 186,   0}, {255, 186,   0},\n   {255, 186,   0}, {255, 187,   0}, {255, 187,   0}, {255, 187,   0}, {255, 187,   0},\n   {255, 188,   0}, {255, 188,   0}, {255, 188,   0}, {255, 188,   0}, {255, 189,   0},\n   {255, 189,   0}, {255, 189,   0}, {255, 189,   0}, {255, 190,   0}, {255, 190,   0},\n   {255, 190,   0}, {255, 190,   0}, {255, 191,   0}, {255, 191,   0}, {255, 191,   0},\n   {255, 191,   0}, {255, 192,   0}, {255, 192,   0}, {255, 192,   0}, {255, 192,   0},\n   {255, 193,   0}, {255, 193,   0}, {255, 193,   0}, {255, 193,   0}, {255, 194,   0},\n   {255, 194,   0}, {255, 194,   0}, {255, 195,   0}, {255, 195,   0}, {255, 195,   0},\n   {255, 195,   0}, {255, 196,   0}, {255, 196,   0}, {255, 196,   0}, {255, 196,   0},\n   {255, 197,   0}, {255, 197,   0}, {255, 197,   0}, {255, 197,   0}, {255, 198,   0},\n   {255, 198,   0}, {255, 198,   0}, {255, 198,   0}, {255, 199,   0}, {255, 199,   0},\n   {255, 199,   0}, {255, 199,   0}, {255, 200,   0}, {255, 200,   0}, {255, 200,   0},\n   {255, 200,   0}, {255, 201,   0}, {255, 
201,   0}, {255, 201,   0}, {255, 201,   0},\n   {255, 202,   0}, {255, 202,   0}, {255, 202,   0}, {255, 202,   0}, {255, 203,   0},\n   {255, 203,   0}, {255, 203,   0}, {255, 203,   0}, {255, 204,   0}, {255, 204,   0},\n   {255, 204,   0}, {255, 204,   0}, {255, 205,   0}, {255, 205,   0}, {255, 205,   0},\n   {255, 205,   0}, {255, 206,   0}, {255, 206,   0}, {255, 206,   0}, {255, 207,   0},\n   {255, 207,   0}, {255, 207,   0}, {255, 207,   0}, {255, 208,   0}, {255, 208,   0},\n   {255, 208,   0}, {255, 208,   0}, {255, 209,   0}, {255, 209,   0}, {255, 209,   0},\n   {255, 209,   0}, {255, 210,   0}, {255, 210,   0}, {255, 210,   0}, {255, 210,   0},\n   {255, 211,   0}, {255, 211,   0}, {255, 211,   0}, {255, 211,   0}, {255, 212,   0},\n   {255, 212,   0}, {255, 212,   0}, {255, 212,   0}, {255, 213,   0}, {255, 213,   0},\n   {255, 213,   0}, {255, 213,   0}, {255, 214,   0}, {255, 214,   0}, {255, 214,   0},\n   {255, 214,   0}, {255, 215,   0}, {255, 215,   0}, {255, 215,   0}, {255, 215,   0},\n   {255, 216,   0}, {255, 216,   0}, {255, 216,   0}, {255, 216,   0}, {255, 217,   0},\n   {255, 217,   0}, {255, 217,   0}, {255, 217,   0}, {255, 218,   0}, {255, 218,   0},\n   {255, 218,   0}, {255, 218,   0}, {255, 219,   0}, {255, 219,   0}, {255, 219,   0},\n   {255, 220,   0}, {255, 220,   0}, {255, 220,   0}, {255, 220,   0}, {255, 221,   0},\n   {255, 221,   0}, {255, 221,   0}, {255, 221,   0}, {255, 222,   0}, {255, 222,   0},\n   {255, 222,   0}, {255, 222,   0}, {255, 223,   0}, {255, 223,   0}, {255, 223,   0},\n   {255, 223,   0}, {255, 224,   0}, {255, 224,   0}, {255, 224,   0}, {255, 224,   0},\n   {255, 225,   0}, {255, 225,   0}, {255, 225,   0}, {255, 225,   0}, {255, 226,   0},\n   {255, 226,   0}, {255, 226,   0}, {255, 226,   0}, {255, 227,   0}, {255, 227,   0},\n   {255, 227,   0}, {255, 227,   0}, {255, 228,   0}, {255, 228,   0}, {255, 228,   0},\n   {255, 228,   0}, {255, 229,   0}, {255, 229,   0}, {255, 229,   0}, {255, 229,   
0},\n   {255, 230,   0}, {255, 230,   0}, {255, 230,   0}, {255, 230,   0}, {255, 231,   0},\n   {255, 231,   0}, {255, 231,   0}, {255, 232,   0}, {255, 232,   0}, {255, 232,   0},\n   {255, 232,   0}, {255, 233,   0}, {255, 233,   0}, {255, 233,   0}, {255, 233,   0},\n   {255, 234,   0}, {255, 234,   0}, {255, 234,   0}, {255, 234,   0}, {255, 235,   0},\n   {255, 235,   0}, {255, 235,   0}, {255, 235,   0}, {255, 236,   0}, {255, 236,   0},\n   {255, 236,   0}, {255, 236,   0}, {255, 237,   0}, {255, 237,   0}, {255, 237,   0},\n   {255, 237,   0}, {255, 238,   0}, {255, 238,   0}, {255, 238,   0}, {255, 238,   0},\n   {255, 239,   0}, {255, 239,   0}, {255, 239,   0}, {255, 239,   0}, {255, 240,   0},\n   {255, 240,   0}, {255, 240,   0}, {255, 240,   0}, {255, 241,   0}, {255, 241,   0},\n   {255, 241,   0}, {255, 241,   0}, {255, 242,   0}, {255, 242,   0}, {255, 242,   0},\n   {255, 242,   0}, {255, 243,   0}, {255, 243,   0}, {255, 243,   0}, {255, 244,   0},\n   {255, 244,   0}, {255, 244,   0}, {255, 244,   0}, {255, 245,   0}, {255, 245,   0},\n   {255, 245,   0}, {255, 245,   0}, {255, 246,   0}, {255, 246,   0}, {255, 246,   0},\n   {255, 246,   0}, {255, 247,   0}, {255, 247,   0}, {255, 247,   0}, {255, 247,   0},\n   {255, 248,   0}, {255, 248,   0}, {255, 248,   0}, {255, 248,   0}, {255, 249,   0},\n   {255, 249,   0}, {255, 249,   0}, {255, 249,   0}, {255, 250,   0}, {255, 250,   0},\n   {255, 250,   0}, {255, 250,   0}, {255, 251,   0}, {255, 251,   0}, {255, 251,   0},\n   {255, 251,   0}, {255, 252,   0}, {255, 252,   0}, {255, 252,   0}, {255, 252,   0},\n   {255, 253,   0}, {255, 253,   0}, {255, 253,   0}, {255, 253,   0}, {255, 254,   0},\n   {255, 254,   0}, {255, 254,   0}, {255, 254,   0}, {255, 255,   0}, {255, 255,   0}\n};\n\nconst rgb_t copper_colormap[1000] = {\n   {  0,   0,   0}, {  0,   0,   0}, {  1,   0,   0}, {  1,   1,   0}, {  1,   1,   1},\n   {  2,   1,   1}, {  2,   1,   1}, {  2,   1,   1}, {  3,   2,   1}, {  3,   2, 
  1},\n   {  3,   2,   1}, {  4,   2,   1}, {  4,   2,   2}, {  4,   3,   2}, {  4,   3,   2},\n   {  5,   3,   2}, {  5,   3,   2}, {  5,   3,   2}, {  6,   4,   2}, {  6,   4,   2},\n   {  6,   4,   3}, {  7,   4,   3}, {  7,   4,   3}, {  7,   5,   3}, {  8,   5,   3},\n   {  8,   5,   3}, {  8,   5,   3}, {  9,   5,   3}, {  9,   6,   4}, {  9,   6,   4},\n   { 10,   6,   4}, { 10,   6,   4}, { 10,   6,   4}, { 11,   7,   4}, { 11,   7,   4},\n   { 11,   7,   4}, { 11,   7,   5}, { 12,   7,   5}, { 12,   8,   5}, { 12,   8,   5},\n   { 13,   8,   5}, { 13,   8,   5}, { 13,   8,   5}, { 14,   9,   5}, { 14,   9,   6},\n   { 14,   9,   6}, { 15,   9,   6}, { 15,   9,   6}, { 15,  10,   6}, { 16,  10,   6},\n   { 16,  10,   6}, { 16,  10,   6}, { 17,  10,   7}, { 17,  11,   7}, { 17,  11,   7},\n   { 18,  11,   7}, { 18,  11,   7}, { 18,  11,   7}, { 19,  12,   7}, { 19,  12,   7},\n   { 19,  12,   8}, { 19,  12,   8}, { 20,  12,   8}, { 20,  13,   8}, { 20,  13,   8},\n   { 21,  13,   8}, { 21,  13,   8}, { 21,  13,   9}, { 22,  14,   9}, { 22,  14,   9},\n   { 22,  14,   9}, { 23,  14,   9}, { 23,  14,   9}, { 23,  15,   9}, { 24,  15,   9},\n   { 24,  15,  10}, { 24,  15,  10}, { 25,  15,  10}, { 25,  16,  10}, { 25,  16,  10},\n   { 26,  16,  10}, { 26,  16,  10}, { 26,  16,  10}, { 26,  17,  11}, { 27,  17,  11},\n   { 27,  17,  11}, { 27,  17,  11}, { 28,  17,  11}, { 28,  18,  11}, { 28,  18,  11},\n   { 29,  18,  11}, { 29,  18,  12}, { 29,  18,  12}, { 30,  19,  12}, { 30,  19,  12},\n   { 30,  19,  12}, { 31,  19,  12}, { 31,  19,  12}, { 31,  20,  12}, { 32,  20,  13},\n   { 32,  20,  13}, { 32,  20,  13}, { 33,  20,  13}, { 33,  21,  13}, { 33,  21,  13},\n   { 34,  21,  13}, { 34,  21,  13}, { 34,  21,  14}, { 34,  22,  14}, { 35,  22,  14},\n   { 35,  22,  14}, { 35,  22,  14}, { 36,  22,  14}, { 36,  23,  14}, { 36,  23,  14},\n   { 37,  23,  15}, { 37,  23,  15}, { 37,  23,  15}, { 38,  24,  15}, { 38,  24,  15},\n   { 38,  24,  15}, { 39,  24,  
15}, { 39,  24,  15}, { 39,  25,  16}, { 40,  25,  16},\n   { 40,  25,  16}, { 40,  25,  16}, { 41,  25,  16}, { 41,  26,  16}, { 41,  26,  16},\n   { 41,  26,  17}, { 42,  26,  17}, { 42,  26,  17}, { 42,  27,  17}, { 43,  27,  17},\n   { 43,  27,  17}, { 43,  27,  17}, { 44,  27,  17}, { 44,  28,  18}, { 44,  28,  18},\n   { 45,  28,  18}, { 45,  28,  18}, { 45,  28,  18}, { 46,  29,  18}, { 46,  29,  18},\n   { 46,  29,  18}, { 47,  29,  19}, { 47,  29,  19}, { 47,  30,  19}, { 48,  30,  19},\n   { 48,  30,  19}, { 48,  30,  19}, { 48,  30,  19}, { 49,  31,  19}, { 49,  31,  20},\n   { 49,  31,  20}, { 50,  31,  20}, { 50,  31,  20}, { 50,  32,  20}, { 51,  32,  20},\n   { 51,  32,  20}, { 51,  32,  20}, { 52,  32,  21}, { 52,  33,  21}, { 52,  33,  21},\n   { 53,  33,  21}, { 53,  33,  21}, { 53,  33,  21}, { 54,  34,  21}, { 54,  34,  21},\n   { 54,  34,  22}, { 55,  34,  22}, { 55,  34,  22}, { 55,  34,  22}, { 56,  35,  22},\n   { 56,  35,  22}, { 56,  35,  22}, { 56,  35,  22}, { 57,  35,  23}, { 57,  36,  23},\n   { 57,  36,  23}, { 58,  36,  23}, { 58,  36,  23}, { 58,  36,  23}, { 59,  37,  23},\n   { 59,  37,  23}, { 59,  37,  24}, { 60,  37,  24}, { 60,  37,  24}, { 60,  38,  24},\n   { 61,  38,  24}, { 61,  38,  24}, { 61,  38,  24}, { 62,  38,  25}, { 62,  39,  25},\n   { 62,  39,  25}, { 63,  39,  25}, { 63,  39,  25}, { 63,  39,  25}, { 63,  40,  25},\n   { 64,  40,  25}, { 64,  40,  26}, { 64,  40,  26}, { 65,  40,  26}, { 65,  41,  26},\n   { 65,  41,  26}, { 66,  41,  26}, { 66,  41,  26}, { 66,  41,  26}, { 67,  42,  27},\n   { 67,  42,  27}, { 67,  42,  27}, { 68,  42,  27}, { 68,  42,  27}, { 68,  43,  27},\n   { 69,  43,  27}, { 69,  43,  27}, { 69,  43,  28}, { 70,  43,  28}, { 70,  44,  28},\n   { 70,  44,  28}, { 71,  44,  28}, { 71,  44,  28}, { 71,  44,  28}, { 71,  45,  28},\n   { 72,  45,  29}, { 72,  45,  29}, { 72,  45,  29}, { 73,  45,  29}, { 73,  46,  29},\n   { 73,  46,  29}, { 74,  46,  29}, { 74,  46,  29}, { 74,  46,  30}, { 
75,  47,  30},\n   { 75,  47,  30}, { 75,  47,  30}, { 76,  47,  30}, { 76,  47,  30}, { 76,  48,  30},\n   { 77,  48,  30}, { 77,  48,  31}, { 77,  48,  31}, { 78,  48,  31}, { 78,  49,  31},\n   { 78,  49,  31}, { 78,  49,  31}, { 79,  49,  31}, { 79,  49,  31}, { 79,  50,  32},\n   { 80,  50,  32}, { 80,  50,  32}, { 80,  50,  32}, { 81,  50,  32}, { 81,  51,  32},\n   { 81,  51,  32}, { 82,  51,  33}, { 82,  51,  33}, { 82,  51,  33}, { 83,  52,  33},\n   { 83,  52,  33}, { 83,  52,  33}, { 84,  52,  33}, { 84,  52,  33}, { 84,  53,  34},\n   { 85,  53,  34}, { 85,  53,  34}, { 85,  53,  34}, { 86,  53,  34}, { 86,  54,  34},\n   { 86,  54,  34}, { 86,  54,  34}, { 87,  54,  35}, { 87,  54,  35}, { 87,  55,  35},\n   { 88,  55,  35}, { 88,  55,  35}, { 88,  55,  35}, { 89,  55,  35}, { 89,  56,  35},\n   { 89,  56,  36}, { 90,  56,  36}, { 90,  56,  36}, { 90,  56,  36}, { 91,  57,  36},\n   { 91,  57,  36}, { 91,  57,  36}, { 92,  57,  36}, { 92,  57,  37}, { 92,  58,  37},\n   { 93,  58,  37}, { 93,  58,  37}, { 93,  58,  37}, { 93,  58,  37}, { 94,  59,  37},\n   { 94,  59,  37}, { 94,  59,  38}, { 95,  59,  38}, { 95,  59,  38}, { 95,  60,  38},\n   { 96,  60,  38}, { 96,  60,  38}, { 96,  60,  38}, { 97,  60,  38}, { 97,  61,  39},\n   { 97,  61,  39}, { 98,  61,  39}, { 98,  61,  39}, { 98,  61,  39}, { 99,  62,  39},\n   { 99,  62,  39}, { 99,  62,  39}, {100,  62,  40}, {100,  62,  40}, {100,  63,  40},\n   {101,  63,  40}, {101,  63,  40}, {101,  63,  40}, {101,  63,  40}, {102,  64,  41},\n   {102,  64,  41}, {102,  64,  41}, {103,  64,  41}, {103,  64,  41}, {103,  65,  41},\n   {104,  65,  41}, {104,  65,  41}, {104,  65,  42}, {105,  65,  42}, {105,  66,  42},\n   {105,  66,  42}, {106,  66,  42}, {106,  66,  42}, {106,  66,  42}, {107,  67,  42},\n   {107,  67,  43}, {107,  67,  43}, {108,  67,  43}, {108,  67,  43}, {108,  68,  43},\n   {108,  68,  43}, {109,  68,  43}, {109,  68,  43}, {109,  68,  44}, {110,  69,  44},\n   {110,  69,  44}, {110, 
 69,  44}, {111,  69,  44}, {111,  69,  44}, {111,  70,  44},\n   {112,  70,  44}, {112,  70,  45}, {112,  70,  45}, {113,  70,  45}, {113,  71,  45},\n   {113,  71,  45}, {114,  71,  45}, {114,  71,  45}, {114,  71,  45}, {115,  72,  46},\n   {115,  72,  46}, {115,  72,  46}, {116,  72,  46}, {116,  72,  46}, {116,  73,  46},\n   {116,  73,  46}, {117,  73,  46}, {117,  73,  47}, {117,  73,  47}, {118,  74,  47},\n   {118,  74,  47}, {118,  74,  47}, {119,  74,  47}, {119,  74,  47}, {119,  75,  47},\n   {120,  75,  48}, {120,  75,  48}, {120,  75,  48}, {121,  75,  48}, {121,  76,  48},\n   {121,  76,  48}, {122,  76,  48}, {122,  76,  49}, {122,  76,  49}, {123,  77,  49},\n   {123,  77,  49}, {123,  77,  49}, {123,  77,  49}, {124,  77,  49}, {124,  78,  49},\n   {124,  78,  50}, {125,  78,  50}, {125,  78,  50}, {125,  78,  50}, {126,  79,  50},\n   {126,  79,  50}, {126,  79,  50}, {127,  79,  50}, {127,  79,  51}, {127,  80,  51},\n   {128,  80,  51}, {128,  80,  51}, {128,  80,  51}, {129,  80,  51}, {129,  81,  51},\n   {129,  81,  51}, {130,  81,  52}, {130,  81,  52}, {130,  81,  52}, {130,  82,  52},\n   {131,  82,  52}, {131,  82,  52}, {131,  82,  52}, {132,  82,  52}, {132,  83,  53},\n   {132,  83,  53}, {133,  83,  53}, {133,  83,  53}, {133,  83,  53}, {134,  84,  53},\n   {134,  84,  53}, {134,  84,  53}, {135,  84,  54}, {135,  84,  54}, {135,  85,  54},\n   {136,  85,  54}, {136,  85,  54}, {136,  85,  54}, {137,  85,  54}, {137,  86,  54},\n   {137,  86,  55}, {138,  86,  55}, {138,  86,  55}, {138,  86,  55}, {138,  87,  55},\n   {139,  87,  55}, {139,  87,  55}, {139,  87,  55}, {140,  87,  56}, {140,  88,  56},\n   {140,  88,  56}, {141,  88,  56}, {141,  88,  56}, {141,  88,  56}, {142,  89,  56},\n   {142,  89,  57}, {142,  89,  57}, {143,  89,  57}, {143,  89,  57}, {143,  90,  57},\n   {144,  90,  57}, {144,  90,  57}, {144,  90,  57}, {145,  90,  58}, {145,  91,  58},\n   {145,  91,  58}, {145,  91,  58}, {146,  91,  58}, {146,  91,  
58}, {146,  92,  58},\n   {147,  92,  58}, {147,  92,  59}, {147,  92,  59}, {148,  92,  59}, {148,  93,  59},\n   {148,  93,  59}, {149,  93,  59}, {149,  93,  59}, {149,  93,  59}, {150,  94,  60},\n   {150,  94,  60}, {150,  94,  60}, {151,  94,  60}, {151,  94,  60}, {151,  95,  60},\n   {152,  95,  60}, {152,  95,  60}, {152,  95,  61}, {153,  95,  61}, {153,  96,  61},\n   {153,  96,  61}, {153,  96,  61}, {154,  96,  61}, {154,  96,  61}, {154,  97,  61},\n   {155,  97,  62}, {155,  97,  62}, {155,  97,  62}, {156,  97,  62}, {156,  98,  62},\n   {156,  98,  62}, {157,  98,  62}, {157,  98,  62}, {157,  98,  63}, {158,  99,  63},\n   {158,  99,  63}, {158,  99,  63}, {159,  99,  63}, {159,  99,  63}, {159, 100,  63},\n   {160, 100,  63}, {160, 100,  64}, {160, 100,  64}, {160, 100,  64}, {161, 101,  64},\n   {161, 101,  64}, {161, 101,  64}, {162, 101,  64}, {162, 101,  65}, {162, 101,  65},\n   {163, 102,  65}, {163, 102,  65}, {163, 102,  65}, {164, 102,  65}, {164, 102,  65},\n   {164, 103,  65}, {165, 103,  66}, {165, 103,  66}, {165, 103,  66}, {166, 103,  66},\n   {166, 104,  66}, {166, 104,  66}, {167, 104,  66}, {167, 104,  66}, {167, 104,  67},\n   {168, 105,  67}, {168, 105,  67}, {168, 105,  67}, {168, 105,  67}, {169, 105,  67},\n   {169, 106,  67}, {169, 106,  67}, {170, 106,  68}, {170, 106,  68}, {170, 106,  68},\n   {171, 107,  68}, {171, 107,  68}, {171, 107,  68}, {172, 107,  68}, {172, 107,  68},\n   {172, 108,  69}, {173, 108,  69}, {173, 108,  69}, {173, 108,  69}, {174, 108,  69},\n   {174, 109,  69}, {174, 109,  69}, {175, 109,  69}, {175, 109,  70}, {175, 109,  70},\n   {175, 110,  70}, {176, 110,  70}, {176, 110,  70}, {176, 110,  70}, {177, 110,  70},\n   {177, 111,  70}, {177, 111,  71}, {178, 111,  71}, {178, 111,  71}, {178, 111,  71},\n   {179, 112,  71}, {179, 112,  71}, {179, 112,  71}, {180, 112,  71}, {180, 112,  72},\n   {180, 113,  72}, {181, 113,  72}, {181, 113,  72}, {181, 113,  72}, {182, 113,  72},\n   {182, 114,  
72}, {182, 114,  73}, {183, 114,  73}, {183, 114,  73}, {183, 114,  73},\n   {183, 115,  73}, {184, 115,  73}, {184, 115,  73}, {184, 115,  73}, {185, 115,  74},\n   {185, 116,  74}, {185, 116,  74}, {186, 116,  74}, {186, 116,  74}, {186, 116,  74},\n   {187, 117,  74}, {187, 117,  74}, {187, 117,  75}, {188, 117,  75}, {188, 117,  75},\n   {188, 118,  75}, {189, 118,  75}, {189, 118,  75}, {189, 118,  75}, {190, 118,  75},\n   {190, 119,  76}, {190, 119,  76}, {190, 119,  76}, {191, 119,  76}, {191, 119,  76},\n   {191, 120,  76}, {192, 120,  76}, {192, 120,  76}, {192, 120,  77}, {193, 120,  77},\n   {193, 121,  77}, {193, 121,  77}, {194, 121,  77}, {194, 121,  77}, {194, 121,  77},\n   {195, 122,  77}, {195, 122,  78}, {195, 122,  78}, {196, 122,  78}, {196, 122,  78},\n   {196, 123,  78}, {197, 123,  78}, {197, 123,  78}, {197, 123,  78}, {198, 123,  79},\n   {198, 124,  79}, {198, 124,  79}, {198, 124,  79}, {199, 124,  79}, {199, 124,  79},\n   {199, 125,  79}, {200, 125,  79}, {200, 125,  80}, {200, 125,  80}, {201, 125,  80},\n   {201, 126,  80}, {201, 126,  80}, {202, 126,  80}, {202, 126,  80}, {202, 126,  81},\n   {203, 127,  81}, {203, 127,  81}, {203, 127,  81}, {204, 127,  81}, {204, 127,  81},\n   {204, 128,  81}, {205, 128,  81}, {205, 128,  82}, {205, 128,  82}, {205, 128,  82},\n   {206, 129,  82}, {206, 129,  82}, {206, 129,  82}, {207, 129,  82}, {207, 129,  82},\n   {207, 130,  83}, {208, 130,  83}, {208, 130,  83}, {208, 130,  83}, {209, 130,  83},\n   {209, 131,  83}, {209, 131,  83}, {210, 131,  83}, {210, 131,  84}, {210, 131,  84},\n   {211, 132,  84}, {211, 132,  84}, {211, 132,  84}, {212, 132,  84}, {212, 132,  84},\n   {212, 133,  84}, {212, 133,  85}, {213, 133,  85}, {213, 133,  85}, {213, 133,  85},\n   {214, 134,  85}, {214, 134,  85}, {214, 134,  85}, {215, 134,  85}, {215, 134,  86},\n   {215, 135,  86}, {216, 135,  86}, {216, 135,  86}, {216, 135,  86}, {217, 135,  86},\n   {217, 136,  86}, {217, 136,  86}, {218, 136,  87}, 
{218, 136,  87}, {218, 136,  87},\n   {219, 137,  87}, {219, 137,  87}, {219, 137,  87}, {220, 137,  87}, {220, 137,  87},\n   {220, 138,  88}, {220, 138,  88}, {221, 138,  88}, {221, 138,  88}, {221, 138,  88},\n   {222, 139,  88}, {222, 139,  88}, {222, 139,  89}, {223, 139,  89}, {223, 139,  89},\n   {223, 140,  89}, {224, 140,  89}, {224, 140,  89}, {224, 140,  89}, {225, 140,  89},\n   {225, 141,  90}, {225, 141,  90}, {226, 141,  90}, {226, 141,  90}, {226, 141,  90},\n   {227, 142,  90}, {227, 142,  90}, {227, 142,  90}, {227, 142,  91}, {228, 142,  91},\n   {228, 143,  91}, {228, 143,  91}, {229, 143,  91}, {229, 143,  91}, {229, 143,  91},\n   {230, 144,  91}, {230, 144,  92}, {230, 144,  92}, {231, 144,  92}, {231, 144,  92},\n   {231, 145,  92}, {232, 145,  92}, {232, 145,  92}, {232, 145,  92}, {233, 145,  93},\n   {233, 146,  93}, {233, 146,  93}, {234, 146,  93}, {234, 146,  93}, {234, 146,  93},\n   {235, 147,  93}, {235, 147,  93}, {235, 147,  94}, {235, 147,  94}, {236, 147,  94},\n   {236, 148,  94}, {236, 148,  94}, {237, 148,  94}, {237, 148,  94}, {237, 148,  94},\n   {238, 149,  95}, {238, 149,  95}, {238, 149,  95}, {239, 149,  95}, {239, 149,  95},\n   {239, 150,  95}, {240, 150,  95}, {240, 150,  95}, {240, 150,  96}, {241, 150,  96},\n   {241, 151,  96}, {241, 151,  96}, {242, 151,  96}, {242, 151,  96}, {242, 151,  96},\n   {242, 152,  97}, {243, 152,  97}, {243, 152,  97}, {243, 152,  97}, {244, 152,  97},\n   {244, 153,  97}, {244, 153,  97}, {245, 153,  97}, {245, 153,  98}, {245, 153,  98},\n   {246, 154,  98}, {246, 154,  98}, {246, 154,  98}, {247, 154,  98}, {247, 154,  98},\n   {247, 155,  98}, {248, 155,  99}, {248, 155,  99}, {248, 155,  99}, {249, 155,  99},\n   {249, 156,  99}, {249, 156,  99}, {250, 156,  99}, {250, 156,  99}, {250, 156, 100},\n   {250, 157, 100}, {251, 157, 100}, {251, 157, 100}, {251, 157, 100}, {252, 157, 100},\n   {252, 158, 100}, {252, 158, 100}, {253, 158, 101}, {253, 158, 101}, {253, 158, 101},\n   
{253, 159, 101}, {253, 159, 101}, {254, 159, 101}, {254, 159, 101}, {254, 159, 101},\n   {254, 160, 102}, {254, 160, 102}, {254, 160, 102}, {254, 160, 102}, {254, 160, 102},\n   {255, 161, 102}, {255, 161, 102}, {255, 161, 102}, {255, 161, 103}, {255, 161, 103},\n   {255, 162, 103}, {255, 162, 103}, {255, 162, 103}, {255, 162, 103}, {255, 162, 103},\n   {255, 163, 103}, {255, 163, 104}, {255, 163, 104}, {255, 163, 104}, {255, 163, 104},\n   {255, 164, 104}, {255, 164, 104}, {255, 164, 104}, {255, 164, 105}, {255, 164, 105},\n   {255, 165, 105}, {255, 165, 105}, {255, 165, 105}, {255, 165, 105}, {255, 165, 105},\n   {255, 166, 105}, {255, 166, 106}, {255, 166, 106}, {255, 166, 106}, {255, 166, 106},\n   {255, 167, 106}, {255, 167, 106}, {255, 167, 106}, {255, 167, 106}, {255, 167, 107},\n   {255, 168, 107}, {255, 168, 107}, {255, 168, 107}, {255, 168, 107}, {255, 168, 107},\n   {255, 168, 107}, {255, 169, 107}, {255, 169, 108}, {255, 169, 108}, {255, 169, 108},\n   {255, 169, 108}, {255, 170, 108}, {255, 170, 108}, {255, 170, 108}, {255, 170, 108},\n   {255, 170, 109}, {255, 171, 109}, {255, 171, 109}, {255, 171, 109}, {255, 171, 109},\n   {255, 171, 109}, {255, 172, 109}, {255, 172, 109}, {255, 172, 110}, {255, 172, 110},\n   {255, 172, 110}, {255, 173, 110}, {255, 173, 110}, {255, 173, 110}, {255, 173, 110},\n   {255, 173, 110}, {255, 174, 111}, {255, 174, 111}, {255, 174, 111}, {255, 174, 111},\n   {255, 174, 111}, {255, 175, 111}, {255, 175, 111}, {255, 175, 111}, {255, 175, 112},\n   {255, 175, 112}, {255, 176, 112}, {255, 176, 112}, {255, 176, 112}, {255, 176, 112},\n   {255, 176, 112}, {255, 177, 113}, {255, 177, 113}, {255, 177, 113}, {255, 177, 113},\n   {255, 177, 113}, {255, 178, 113}, {255, 178, 113}, {255, 178, 113}, {255, 178, 114},\n   {255, 178, 114}, {255, 179, 114}, {255, 179, 114}, {255, 179, 114}, {255, 179, 114},\n   {255, 179, 114}, {255, 180, 114}, {255, 180, 115}, {255, 180, 115}, {255, 180, 115},\n   {255, 180, 115}, {255, 181, 115}, {255, 
181, 115}, {255, 181, 115}, {255, 181, 115},\n   {255, 181, 116}, {255, 182, 116}, {255, 182, 116}, {255, 182, 116}, {255, 182, 116},\n   {255, 182, 116}, {255, 183, 116}, {255, 183, 116}, {255, 183, 117}, {255, 183, 117},\n   {255, 183, 117}, {255, 184, 117}, {255, 184, 117}, {255, 184, 117}, {255, 184, 117},\n   {255, 184, 117}, {255, 185, 118}, {255, 185, 118}, {255, 185, 118}, {255, 185, 118},\n   {255, 185, 118}, {255, 186, 118}, {255, 186, 118}, {255, 186, 118}, {255, 186, 119},\n   {255, 186, 119}, {255, 187, 119}, {255, 187, 119}, {255, 187, 119}, {255, 187, 119},\n   {255, 187, 119}, {255, 188, 119}, {255, 188, 120}, {255, 188, 120}, {255, 188, 120},\n   {255, 188, 120}, {255, 189, 120}, {255, 189, 120}, {255, 189, 120}, {255, 189, 121},\n   {255, 189, 121}, {255, 190, 121}, {255, 190, 121}, {255, 190, 121}, {255, 190, 121},\n   {255, 190, 121}, {255, 191, 121}, {255, 191, 122}, {255, 191, 122}, {255, 191, 122},\n   {255, 191, 122}, {255, 192, 122}, {255, 192, 122}, {255, 192, 122}, {255, 192, 122},\n   {255, 192, 123}, {255, 193, 123}, {255, 193, 123}, {255, 193, 123}, {255, 193, 123},\n   {255, 193, 123}, {255, 194, 123}, {255, 194, 123}, {255, 194, 124}, {255, 194, 124},\n   {255, 194, 124}, {255, 195, 124}, {255, 195, 124}, {255, 195, 124}, {255, 195, 124},\n   {255, 195, 124}, {255, 196, 125}, {255, 196, 125}, {255, 196, 125}, {255, 196, 125},\n   {255, 196, 125}, {255, 197, 125}, {255, 197, 125}, {255, 197, 125}, {255, 197, 126},\n   {255, 197, 126}, {255, 198, 126}, {255, 198, 126}, {255, 198, 126}, {255, 198, 126},\n   {255, 198, 126}, {255, 199, 126}, {255, 199, 127}, {255, 199, 127}, {255, 199, 127}\n};\n\nconst rgb_t gray_colormap[1000] = {\n   {255, 255, 255}, {255, 255, 255}, {254, 254, 254}, {254, 254, 254}, {254, 254, 254},\n   {254, 254, 254}, {253, 253, 253}, {253, 253, 253}, {253, 253, 253}, {253, 253, 253},\n   {252, 252, 252}, {252, 252, 252}, {252, 252, 252}, {252, 252, 252}, {251, 251, 251},\n   {251, 251, 251}, {251, 251, 251}, {251, 
251, 251}, {250, 250, 250}, {250, 250, 250},\n   {250, 250, 250}, {250, 250, 250}, {249, 249, 249}, {249, 249, 249}, {249, 249, 249},\n   {249, 249, 249}, {248, 248, 248}, {248, 248, 248}, {248, 248, 248}, {248, 248, 248},\n   {247, 247, 247}, {247, 247, 247}, {247, 247, 247}, {247, 247, 247}, {246, 246, 246},\n   {246, 246, 246}, {246, 246, 246}, {246, 246, 246}, {245, 245, 245}, {245, 245, 245},\n   {245, 245, 245}, {245, 245, 245}, {244, 244, 244}, {244, 244, 244}, {244, 244, 244},\n   {244, 244, 244}, {243, 243, 243}, {243, 243, 243}, {243, 243, 243}, {242, 242, 242},\n   {242, 242, 242}, {242, 242, 242}, {242, 242, 242}, {241, 241, 241}, {241, 241, 241},\n   {241, 241, 241}, {241, 241, 241}, {240, 240, 240}, {240, 240, 240}, {240, 240, 240},\n   {240, 240, 240}, {239, 239, 239}, {239, 239, 239}, {239, 239, 239}, {239, 239, 239},\n   {238, 238, 238}, {238, 238, 238}, {238, 238, 238}, {238, 238, 238}, {237, 237, 237},\n   {237, 237, 237}, {237, 237, 237}, {237, 237, 237}, {236, 236, 236}, {236, 236, 236},\n   {236, 236, 236}, {236, 236, 236}, {235, 235, 235}, {235, 235, 235}, {235, 235, 235},\n   {235, 235, 235}, {234, 234, 234}, {234, 234, 234}, {234, 234, 234}, {234, 234, 234},\n   {233, 233, 233}, {233, 233, 233}, {233, 233, 233}, {233, 233, 233}, {232, 232, 232},\n   {232, 232, 232}, {232, 232, 232}, {232, 232, 232}, {231, 231, 231}, {231, 231, 231},\n   {231, 231, 231}, {230, 230, 230}, {230, 230, 230}, {230, 230, 230}, {230, 230, 230},\n   {229, 229, 229}, {229, 229, 229}, {229, 229, 229}, {229, 229, 229}, {228, 228, 228},\n   {228, 228, 228}, {228, 228, 228}, {228, 228, 228}, {227, 227, 227}, {227, 227, 227},\n   {227, 227, 227}, {227, 227, 227}, {226, 226, 226}, {226, 226, 226}, {226, 226, 226},\n   {226, 226, 226}, {225, 225, 225}, {225, 225, 225}, {225, 225, 225}, {225, 225, 225},\n   {224, 224, 224}, {224, 224, 224}, {224, 224, 224}, {224, 224, 224}, {223, 223, 223},\n   {223, 223, 223}, {223, 223, 223}, {223, 223, 223}, {222, 222, 222}, {222, 222, 
222},\n   {222, 222, 222}, {222, 222, 222}, {221, 221, 221}, {221, 221, 221}, {221, 221, 221},\n   {221, 221, 221}, {220, 220, 220}, {220, 220, 220}, {220, 220, 220}, {220, 220, 220},\n   {219, 219, 219}, {219, 219, 219}, {219, 219, 219}, {218, 218, 218}, {218, 218, 218},\n   {218, 218, 218}, {218, 218, 218}, {217, 217, 217}, {217, 217, 217}, {217, 217, 217},\n   {217, 217, 217}, {216, 216, 216}, {216, 216, 216}, {216, 216, 216}, {216, 216, 216},\n   {215, 215, 215}, {215, 215, 215}, {215, 215, 215}, {215, 215, 215}, {214, 214, 214},\n   {214, 214, 214}, {214, 214, 214}, {214, 214, 214}, {213, 213, 213}, {213, 213, 213},\n   {213, 213, 213}, {213, 213, 213}, {212, 212, 212}, {212, 212, 212}, {212, 212, 212},\n   {212, 212, 212}, {211, 211, 211}, {211, 211, 211}, {211, 211, 211}, {211, 211, 211},\n   {210, 210, 210}, {210, 210, 210}, {210, 210, 210}, {210, 210, 210}, {209, 209, 209},\n   {209, 209, 209}, {209, 209, 209}, {209, 209, 209}, {208, 208, 208}, {208, 208, 208},\n   {208, 208, 208}, {208, 208, 208}, {207, 207, 207}, {207, 207, 207}, {207, 207, 207},\n   {207, 207, 207}, {206, 206, 206}, {206, 206, 206}, {206, 206, 206}, {205, 205, 205},\n   {205, 205, 205}, {205, 205, 205}, {205, 205, 205}, {204, 204, 204}, {204, 204, 204},\n   {204, 204, 204}, {204, 204, 204}, {203, 203, 203}, {203, 203, 203}, {203, 203, 203},\n   {203, 203, 203}, {202, 202, 202}, {202, 202, 202}, {202, 202, 202}, {202, 202, 202},\n   {201, 201, 201}, {201, 201, 201}, {201, 201, 201}, {201, 201, 201}, {200, 200, 200},\n   {200, 200, 200}, {200, 200, 200}, {200, 200, 200}, {199, 199, 199}, {199, 199, 199},\n   {199, 199, 199}, {199, 199, 199}, {198, 198, 198}, {198, 198, 198}, {198, 198, 198},\n   {198, 198, 198}, {197, 197, 197}, {197, 197, 197}, {197, 197, 197}, {197, 197, 197},\n   {196, 196, 196}, {196, 196, 196}, {196, 196, 196}, {196, 196, 196}, {195, 195, 195},\n   {195, 195, 195}, {195, 195, 195}, {195, 195, 195}, {194, 194, 194}, {194, 194, 194},\n   {194, 194, 194}, {193, 193, 
193}, {193, 193, 193}, {193, 193, 193}, {193, 193, 193},\n   {192, 192, 192}, {192, 192, 192}, {192, 192, 192}, {192, 192, 192}, {191, 191, 191},\n   {191, 191, 191}, {191, 191, 191}, {191, 191, 191}, {190, 190, 190}, {190, 190, 190},\n   {190, 190, 190}, {190, 190, 190}, {189, 189, 189}, {189, 189, 189}, {189, 189, 189},\n   {189, 189, 189}, {188, 188, 188}, {188, 188, 188}, {188, 188, 188}, {188, 188, 188},\n   {187, 187, 187}, {187, 187, 187}, {187, 187, 187}, {187, 187, 187}, {186, 186, 186},\n   {186, 186, 186}, {186, 186, 186}, {186, 186, 186}, {185, 185, 185}, {185, 185, 185},\n   {185, 185, 185}, {185, 185, 185}, {184, 184, 184}, {184, 184, 184}, {184, 184, 184},\n   {184, 184, 184}, {183, 183, 183}, {183, 183, 183}, {183, 183, 183}, {183, 183, 183},\n   {182, 182, 182}, {182, 182, 182}, {182, 182, 182}, {181, 181, 181}, {181, 181, 181},\n   {181, 181, 181}, {181, 181, 181}, {180, 180, 180}, {180, 180, 180}, {180, 180, 180},\n   {180, 180, 180}, {179, 179, 179}, {179, 179, 179}, {179, 179, 179}, {179, 179, 179},\n   {178, 178, 178}, {178, 178, 178}, {178, 178, 178}, {178, 178, 178}, {177, 177, 177},\n   {177, 177, 177}, {177, 177, 177}, {177, 177, 177}, {176, 176, 176}, {176, 176, 176},\n   {176, 176, 176}, {176, 176, 176}, {175, 175, 175}, {175, 175, 175}, {175, 175, 175},\n   {175, 175, 175}, {174, 174, 174}, {174, 174, 174}, {174, 174, 174}, {174, 174, 174},\n   {173, 173, 173}, {173, 173, 173}, {173, 173, 173}, {173, 173, 173}, {172, 172, 172},\n   {172, 172, 172}, {172, 172, 172}, {172, 172, 172}, {171, 171, 171}, {171, 171, 171},\n   {171, 171, 171}, {171, 171, 171}, {170, 170, 170}, {170, 170, 170}, {170, 170, 170},\n   {169, 169, 169}, {169, 169, 169}, {169, 169, 169}, {169, 169, 169}, {168, 168, 168},\n   {168, 168, 168}, {168, 168, 168}, {168, 168, 168}, {167, 167, 167}, {167, 167, 167},\n   {167, 167, 167}, {167, 167, 167}, {166, 166, 166}, {166, 166, 166}, {166, 166, 166},\n   {166, 166, 166}, {165, 165, 165}, {165, 165, 165}, {165, 165, 165}, 
{165, 165, 165},\n   {164, 164, 164}, {164, 164, 164}, {164, 164, 164}, {164, 164, 164}, {163, 163, 163},\n   {163, 163, 163}, {163, 163, 163}, {163, 163, 163}, {162, 162, 162}, {162, 162, 162},\n   {162, 162, 162}, {162, 162, 162}, {161, 161, 161}, {161, 161, 161}, {161, 161, 161},\n   {161, 161, 161}, {160, 160, 160}, {160, 160, 160}, {160, 160, 160}, {160, 160, 160},\n   {159, 159, 159}, {159, 159, 159}, {159, 159, 159}, {159, 159, 159}, {158, 158, 158},\n   {158, 158, 158}, {158, 158, 158}, {157, 157, 157}, {157, 157, 157}, {157, 157, 157},\n   {157, 157, 157}, {156, 156, 156}, {156, 156, 156}, {156, 156, 156}, {156, 156, 156},\n   {155, 155, 155}, {155, 155, 155}, {155, 155, 155}, {155, 155, 155}, {154, 154, 154},\n   {154, 154, 154}, {154, 154, 154}, {154, 154, 154}, {153, 153, 153}, {153, 153, 153},\n   {153, 153, 153}, {153, 153, 153}, {152, 152, 152}, {152, 152, 152}, {152, 152, 152},\n   {152, 152, 152}, {151, 151, 151}, {151, 151, 151}, {151, 151, 151}, {151, 151, 151},\n   {150, 150, 150}, {150, 150, 150}, {150, 150, 150}, {150, 150, 150}, {149, 149, 149},\n   {149, 149, 149}, {149, 149, 149}, {149, 149, 149}, {148, 148, 148}, {148, 148, 148},\n   {148, 148, 148}, {148, 148, 148}, {147, 147, 147}, {147, 147, 147}, {147, 147, 147},\n   {147, 147, 147}, {146, 146, 146}, {146, 146, 146}, {146, 146, 146}, {145, 145, 145},\n   {145, 145, 145}, {145, 145, 145}, {145, 145, 145}, {144, 144, 144}, {144, 144, 144},\n   {144, 144, 144}, {144, 144, 144}, {143, 143, 143}, {143, 143, 143}, {143, 143, 143},\n   {143, 143, 143}, {142, 142, 142}, {142, 142, 142}, {142, 142, 142}, {142, 142, 142},\n   {141, 141, 141}, {141, 141, 141}, {141, 141, 141}, {141, 141, 141}, {140, 140, 140},\n   {140, 140, 140}, {140, 140, 140}, {140, 140, 140}, {139, 139, 139}, {139, 139, 139},\n   {139, 139, 139}, {139, 139, 139}, {138, 138, 138}, {138, 138, 138}, {138, 138, 138},\n   {138, 138, 138}, {137, 137, 137}, {137, 137, 137}, {137, 137, 137}, {137, 137, 137},\n   {136, 136, 136}, 
{136, 136, 136}, {136, 136, 136}, {136, 136, 136}, {135, 135, 135},\n   {135, 135, 135}, {135, 135, 135}, {135, 135, 135}, {134, 134, 134}, {134, 134, 134},\n   {134, 134, 134}, {133, 133, 133}, {133, 133, 133}, {133, 133, 133}, {133, 133, 133},\n   {132, 132, 132}, {132, 132, 132}, {132, 132, 132}, {132, 132, 132}, {131, 131, 131},\n   {131, 131, 131}, {131, 131, 131}, {131, 131, 131}, {130, 130, 130}, {130, 130, 130},\n   {130, 130, 130}, {130, 130, 130}, {129, 129, 129}, {129, 129, 129}, {129, 129, 129},\n   {129, 129, 129}, {128, 128, 128}, {128, 128, 128}, {128, 128, 128}, {128, 128, 128},\n   {127, 127, 127}, {127, 127, 127}, {127, 127, 127}, {127, 127, 127}, {126, 126, 126},\n   {126, 126, 126}, {126, 126, 126}, {126, 126, 126}, {125, 125, 125}, {125, 125, 125},\n   {125, 125, 125}, {125, 125, 125}, {124, 124, 124}, {124, 124, 124}, {124, 124, 124},\n   {124, 124, 124}, {123, 123, 123}, {123, 123, 123}, {123, 123, 123}, {123, 123, 123},\n   {122, 122, 122}, {122, 122, 122}, {122, 122, 122}, {122, 122, 122}, {121, 121, 121},\n   {121, 121, 121}, {121, 121, 121}, {120, 120, 120}, {120, 120, 120}, {120, 120, 120},\n   {120, 120, 120}, {119, 119, 119}, {119, 119, 119}, {119, 119, 119}, {119, 119, 119},\n   {118, 118, 118}, {118, 118, 118}, {118, 118, 118}, {118, 118, 118}, {117, 117, 117},\n   {117, 117, 117}, {117, 117, 117}, {117, 117, 117}, {116, 116, 116}, {116, 116, 116},\n   {116, 116, 116}, {116, 116, 116}, {115, 115, 115}, {115, 115, 115}, {115, 115, 115},\n   {115, 115, 115}, {114, 114, 114}, {114, 114, 114}, {114, 114, 114}, {114, 114, 114},\n   {113, 113, 113}, {113, 113, 113}, {113, 113, 113}, {113, 113, 113}, {112, 112, 112},\n   {112, 112, 112}, {112, 112, 112}, {112, 112, 112}, {111, 111, 111}, {111, 111, 111},\n   {111, 111, 111}, {111, 111, 111}, {110, 110, 110}, {110, 110, 110}, {110, 110, 110},\n   {110, 110, 110}, {109, 109, 109}, {109, 109, 109}, {109, 109, 109}, {108, 108, 108},\n   {108, 108, 108}, {108, 108, 108}, {108, 108, 108}, {107, 
107, 107}, {107, 107, 107},\n   {107, 107, 107}, {107, 107, 107}, {106, 106, 106}, {106, 106, 106}, {106, 106, 106},\n   {106, 106, 106}, {105, 105, 105}, {105, 105, 105}, {105, 105, 105}, {105, 105, 105},\n   {104, 104, 104}, {104, 104, 104}, {104, 104, 104}, {104, 104, 104}, {103, 103, 103},\n   {103, 103, 103}, {103, 103, 103}, {103, 103, 103}, {102, 102, 102}, {102, 102, 102},\n   {102, 102, 102}, {102, 102, 102}, {101, 101, 101}, {101, 101, 101}, {101, 101, 101},\n   {101, 101, 101}, {100, 100, 100}, {100, 100, 100}, {100, 100, 100}, {100, 100, 100},\n   { 99,  99,  99}, { 99,  99,  99}, { 99,  99,  99}, { 99,  99,  99}, { 98,  98,  98},\n   { 98,  98,  98}, { 98,  98,  98}, { 98,  98,  98}, { 97,  97,  97}, { 97,  97,  97},\n   { 97,  97,  97}, { 96,  96,  96}, { 96,  96,  96}, { 96,  96,  96}, { 96,  96,  96},\n   { 95,  95,  95}, { 95,  95,  95}, { 95,  95,  95}, { 95,  95,  95}, { 94,  94,  94},\n   { 94,  94,  94}, { 94,  94,  94}, { 94,  94,  94}, { 93,  93,  93}, { 93,  93,  93},\n   { 93,  93,  93}, { 93,  93,  93}, { 92,  92,  92}, { 92,  92,  92}, { 92,  92,  92},\n   { 92,  92,  92}, { 91,  91,  91}, { 91,  91,  91}, { 91,  91,  91}, { 91,  91,  91},\n   { 90,  90,  90}, { 90,  90,  90}, { 90,  90,  90}, { 90,  90,  90}, { 89,  89,  89},\n   { 89,  89,  89}, { 89,  89,  89}, { 89,  89,  89}, { 88,  88,  88}, { 88,  88,  88},\n   { 88,  88,  88}, { 88,  88,  88}, { 87,  87,  87}, { 87,  87,  87}, { 87,  87,  87},\n   { 87,  87,  87}, { 86,  86,  86}, { 86,  86,  86}, { 86,  86,  86}, { 86,  86,  86},\n   { 85,  85,  85}, { 85,  85,  85}, { 85,  85,  85}, { 84,  84,  84}, { 84,  84,  84},\n   { 84,  84,  84}, { 84,  84,  84}, { 83,  83,  83}, { 83,  83,  83}, { 83,  83,  83},\n   { 83,  83,  83}, { 82,  82,  82}, { 82,  82,  82}, { 82,  82,  82}, { 82,  82,  82},\n   { 81,  81,  81}, { 81,  81,  81}, { 81,  81,  81}, { 81,  81,  81}, { 80,  80,  80},\n   { 80,  80,  80}, { 80,  80,  80}, { 80,  80,  80}, { 79,  79,  79}, { 79,  79,  79},\n   { 79,  
79,  79}, { 79,  79,  79}, { 78,  78,  78}, { 78,  78,  78}, { 78,  78,  78},\n   { 78,  78,  78}, { 77,  77,  77}, { 77,  77,  77}, { 77,  77,  77}, { 77,  77,  77},\n   { 76,  76,  76}, { 76,  76,  76}, { 76,  76,  76}, { 76,  76,  76}, { 75,  75,  75},\n   { 75,  75,  75}, { 75,  75,  75}, { 75,  75,  75}, { 74,  74,  74}, { 74,  74,  74},\n   { 74,  74,  74}, { 74,  74,  74}, { 73,  73,  73}, { 73,  73,  73}, { 73,  73,  73},\n   { 72,  72,  72}, { 72,  72,  72}, { 72,  72,  72}, { 72,  72,  72}, { 71,  71,  71},\n   { 71,  71,  71}, { 71,  71,  71}, { 71,  71,  71}, { 70,  70,  70}, { 70,  70,  70},\n   { 70,  70,  70}, { 70,  70,  70}, { 69,  69,  69}, { 69,  69,  69}, { 69,  69,  69},\n   { 69,  69,  69}, { 68,  68,  68}, { 68,  68,  68}, { 68,  68,  68}, { 68,  68,  68},\n   { 67,  67,  67}, { 67,  67,  67}, { 67,  67,  67}, { 67,  67,  67}, { 66,  66,  66},\n   { 66,  66,  66}, { 66,  66,  66}, { 66,  66,  66}, { 65,  65,  65}, { 65,  65,  65},\n   { 65,  65,  65}, { 65,  65,  65}, { 64,  64,  64}, { 64,  64,  64}, { 64,  64,  64},\n   { 64,  64,  64}, { 63,  63,  63}, { 63,  63,  63}, { 63,  63,  63}, { 63,  63,  63},\n   { 62,  62,  62}, { 62,  62,  62}, { 62,  62,  62}, { 62,  62,  62}, { 61,  61,  61},\n   { 61,  61,  61}, { 61,  61,  61}, { 60,  60,  60}, { 60,  60,  60}, { 60,  60,  60},\n   { 60,  60,  60}, { 59,  59,  59}, { 59,  59,  59}, { 59,  59,  59}, { 59,  59,  59},\n   { 58,  58,  58}, { 58,  58,  58}, { 58,  58,  58}, { 58,  58,  58}, { 57,  57,  57},\n   { 57,  57,  57}, { 57,  57,  57}, { 57,  57,  57}, { 56,  56,  56}, { 56,  56,  56},\n   { 56,  56,  56}, { 56,  56,  56}, { 55,  55,  55}, { 55,  55,  55}, { 55,  55,  55},\n   { 55,  55,  55}, { 54,  54,  54}, { 54,  54,  54}, { 54,  54,  54}, { 54,  54,  54},\n   { 53,  53,  53}, { 53,  53,  53}, { 53,  53,  53}, { 53,  53,  53}, { 52,  52,  52},\n   { 52,  52,  52}, { 52,  52,  52}, { 52,  52,  52}, { 51,  51,  51}, { 51,  51,  51},\n   { 51,  51,  51}, { 51,  51,  51}, { 50,  50,  
50}, { 50,  50,  50}, { 50,  50,  50},\n   { 50,  50,  50}, { 49,  49,  49}, { 49,  49,  49}, { 49,  49,  49}, { 48,  48,  48},\n   { 48,  48,  48}, { 48,  48,  48}, { 48,  48,  48}, { 47,  47,  47}, { 47,  47,  47},\n   { 47,  47,  47}, { 47,  47,  47}, { 46,  46,  46}, { 46,  46,  46}, { 46,  46,  46},\n   { 46,  46,  46}, { 45,  45,  45}, { 45,  45,  45}, { 45,  45,  45}, { 45,  45,  45},\n   { 44,  44,  44}, { 44,  44,  44}, { 44,  44,  44}, { 44,  44,  44}, { 43,  43,  43},\n   { 43,  43,  43}, { 43,  43,  43}, { 43,  43,  43}, { 42,  42,  42}, { 42,  42,  42},\n   { 42,  42,  42}, { 42,  42,  42}, { 41,  41,  41}, { 41,  41,  41}, { 41,  41,  41},\n   { 41,  41,  41}, { 40,  40,  40}, { 40,  40,  40}, { 40,  40,  40}, { 40,  40,  40},\n   { 39,  39,  39}, { 39,  39,  39}, { 39,  39,  39}, { 39,  39,  39}, { 38,  38,  38},\n   { 38,  38,  38}, { 38,  38,  38}, { 38,  38,  38}, { 37,  37,  37}, { 37,  37,  37},\n   { 37,  37,  37}, { 37,  37,  37}, { 36,  36,  36}, { 36,  36,  36}, { 36,  36,  36},\n   { 35,  35,  35}, { 35,  35,  35}, { 35,  35,  35}, { 35,  35,  35}, { 34,  34,  34},\n   { 34,  34,  34}, { 34,  34,  34}, { 34,  34,  34}, { 33,  33,  33}, { 33,  33,  33},\n   { 33,  33,  33}, { 33,  33,  33}, { 32,  32,  32}, { 32,  32,  32}, { 32,  32,  32},\n   { 32,  32,  32}, { 31,  31,  31}, { 31,  31,  31}, { 31,  31,  31}, { 31,  31,  31},\n   { 30,  30,  30}, { 30,  30,  30}, { 30,  30,  30}, { 30,  30,  30}, { 29,  29,  29},\n   { 29,  29,  29}, { 29,  29,  29}, { 29,  29,  29}, { 28,  28,  28}, { 28,  28,  28},\n   { 28,  28,  28}, { 28,  28,  28}, { 27,  27,  27}, { 27,  27,  27}, { 27,  27,  27},\n   { 27,  27,  27}, { 26,  26,  26}, { 26,  26,  26}, { 26,  26,  26}, { 26,  26,  26},\n   { 25,  25,  25}, { 25,  25,  25}, { 25,  25,  25}, { 25,  25,  25}, { 24,  24,  24},\n   { 24,  24,  24}, { 24,  24,  24}, { 23,  23,  23}, { 23,  23,  23}, { 23,  23,  23},\n   { 23,  23,  23}, { 22,  22,  22}, { 22,  22,  22}, { 22,  22,  22}, { 22,  22,  22},\n  
 { 21,  21,  21}, { 21,  21,  21}, { 21,  21,  21}, { 21,  21,  21}, { 20,  20,  20},\n   { 20,  20,  20}, { 20,  20,  20}, { 20,  20,  20}, { 19,  19,  19}, { 19,  19,  19},\n   { 19,  19,  19}, { 19,  19,  19}, { 18,  18,  18}, { 18,  18,  18}, { 18,  18,  18},\n   { 18,  18,  18}, { 17,  17,  17}, { 17,  17,  17}, { 17,  17,  17}, { 17,  17,  17},\n   { 16,  16,  16}, { 16,  16,  16}, { 16,  16,  16}, { 16,  16,  16}, { 15,  15,  15},\n   { 15,  15,  15}, { 15,  15,  15}, { 15,  15,  15}, { 14,  14,  14}, { 14,  14,  14},\n   { 14,  14,  14}, { 14,  14,  14}, { 13,  13,  13}, { 13,  13,  13}, { 13,  13,  13},\n   { 13,  13,  13}, { 12,  12,  12}, { 12,  12,  12}, { 12,  12,  12}, { 11,  11,  11},\n   { 11,  11,  11}, { 11,  11,  11}, { 11,  11,  11}, { 10,  10,  10}, { 10,  10,  10},\n   { 10,  10,  10}, { 10,  10,  10}, {  9,   9,   9}, {  9,   9,   9}, {  9,   9,   9},\n   {  9,   9,   9}, {  8,   8,   8}, {  8,   8,   8}, {  8,   8,   8}, {  8,   8,   8},\n   {  7,   7,   7}, {  7,   7,   7}, {  7,   7,   7}, {  7,   7,   7}, {  6,   6,   6},\n   {  6,   6,   6}, {  6,   6,   6}, {  6,   6,   6}, {  5,   5,   5}, {  5,   5,   5},\n   {  5,   5,   5}, {  5,   5,   5}, {  4,   4,   4}, {  4,   4,   4}, {  4,   4,   4},\n   {  4,   4,   4}, {  3,   3,   3}, {  3,   3,   3}, {  3,   3,   3}, {  3,   3,   3},\n   {  2,   2,   2}, {  2,   2,   2}, {  2,   2,   2}, {  2,   2,   2}, {  1,   1,   1},\n   {  1,   1,   1}, {  1,   1,   1}, {  1,   1,   1}, {  0,   0,   0}, {  0,   0,   0}\n};\n\nconst rgb_t hot_colormap[1000] = {\n   { 11,   0,   0}, { 11,   0,   0}, { 12,   0,   0}, { 13,   0,   0}, { 13,   0,   0},\n   { 14,   0,   0}, { 15,   0,   0}, { 15,   0,   0}, { 16,   0,   0}, { 17,   0,   0},\n   { 17,   0,   0}, { 18,   0,   0}, { 19,   0,   0}, { 19,   0,   0}, { 20,   0,   0},\n   { 21,   0,   0}, { 21,   0,   0}, { 22,   0,   0}, { 23,   0,   0}, { 23,   0,   0},\n   { 24,   0,   0}, { 25,   0,   0}, { 25,   0,   0}, { 26,   0,   0}, { 27,   0,   0},\n   
{ 27,   0,   0}, { 28,   0,   0}, { 29,   0,   0}, { 29,   0,   0}, { 30,   0,   0},\n   { 31,   0,   0}, { 31,   0,   0}, { 32,   0,   0}, { 33,   0,   0}, { 33,   0,   0},\n   { 34,   0,   0}, { 35,   0,   0}, { 35,   0,   0}, { 36,   0,   0}, { 37,   0,   0},\n   { 37,   0,   0}, { 38,   0,   0}, { 39,   0,   0}, { 39,   0,   0}, { 40,   0,   0},\n   { 41,   0,   0}, { 41,   0,   0}, { 42,   0,   0}, { 43,   0,   0}, { 43,   0,   0},\n   { 44,   0,   0}, { 45,   0,   0}, { 45,   0,   0}, { 46,   0,   0}, { 47,   0,   0},\n   { 47,   0,   0}, { 48,   0,   0}, { 49,   0,   0}, { 49,   0,   0}, { 50,   0,   0},\n   { 51,   0,   0}, { 51,   0,   0}, { 52,   0,   0}, { 53,   0,   0}, { 54,   0,   0},\n   { 54,   0,   0}, { 55,   0,   0}, { 56,   0,   0}, { 56,   0,   0}, { 57,   0,   0},\n   { 58,   0,   0}, { 58,   0,   0}, { 59,   0,   0}, { 60,   0,   0}, { 60,   0,   0},\n   { 61,   0,   0}, { 62,   0,   0}, { 62,   0,   0}, { 63,   0,   0}, { 64,   0,   0},\n   { 64,   0,   0}, { 65,   0,   0}, { 66,   0,   0}, { 66,   0,   0}, { 67,   0,   0},\n   { 68,   0,   0}, { 68,   0,   0}, { 69,   0,   0}, { 70,   0,   0}, { 70,   0,   0},\n   { 71,   0,   0}, { 72,   0,   0}, { 72,   0,   0}, { 73,   0,   0}, { 74,   0,   0},\n   { 74,   0,   0}, { 75,   0,   0}, { 76,   0,   0}, { 76,   0,   0}, { 77,   0,   0},\n   { 78,   0,   0}, { 78,   0,   0}, { 79,   0,   0}, { 80,   0,   0}, { 80,   0,   0},\n   { 81,   0,   0}, { 82,   0,   0}, { 82,   0,   0}, { 83,   0,   0}, { 84,   0,   0},\n   { 84,   0,   0}, { 85,   0,   0}, { 86,   0,   0}, { 86,   0,   0}, { 87,   0,   0},\n   { 88,   0,   0}, { 88,   0,   0}, { 89,   0,   0}, { 90,   0,   0}, { 90,   0,   0},\n   { 91,   0,   0}, { 92,   0,   0}, { 92,   0,   0}, { 93,   0,   0}, { 94,   0,   0},\n   { 94,   0,   0}, { 95,   0,   0}, { 96,   0,   0}, { 96,   0,   0}, { 97,   0,   0},\n   { 98,   0,   0}, { 98,   0,   0}, { 99,   0,   0}, {100,   0,   0}, {100,   0,   0},\n   {101,   0,   0}, {102,   0,   0}, {102,   
0,   0}, {103,   0,   0}, {104,   0,   0},\n   {104,   0,   0}, {105,   0,   0}, {106,   0,   0}, {106,   0,   0}, {107,   0,   0},\n   {108,   0,   0}, {108,   0,   0}, {109,   0,   0}, {110,   0,   0}, {110,   0,   0},\n   {111,   0,   0}, {112,   0,   0}, {112,   0,   0}, {113,   0,   0}, {114,   0,   0},\n   {114,   0,   0}, {115,   0,   0}, {116,   0,   0}, {116,   0,   0}, {117,   0,   0},\n   {118,   0,   0}, {119,   0,   0}, {119,   0,   0}, {120,   0,   0}, {121,   0,   0},\n   {121,   0,   0}, {122,   0,   0}, {123,   0,   0}, {123,   0,   0}, {124,   0,   0},\n   {125,   0,   0}, {125,   0,   0}, {126,   0,   0}, {127,   0,   0}, {127,   0,   0},\n   {128,   0,   0}, {129,   0,   0}, {129,   0,   0}, {130,   0,   0}, {131,   0,   0},\n   {131,   0,   0}, {132,   0,   0}, {133,   0,   0}, {133,   0,   0}, {134,   0,   0},\n   {135,   0,   0}, {135,   0,   0}, {136,   0,   0}, {137,   0,   0}, {137,   0,   0},\n   {138,   0,   0}, {139,   0,   0}, {139,   0,   0}, {140,   0,   0}, {141,   0,   0},\n   {141,   0,   0}, {142,   0,   0}, {143,   0,   0}, {143,   0,   0}, {144,   0,   0},\n   {145,   0,   0}, {145,   0,   0}, {146,   0,   0}, {147,   0,   0}, {147,   0,   0},\n   {148,   0,   0}, {149,   0,   0}, {149,   0,   0}, {150,   0,   0}, {151,   0,   0},\n   {151,   0,   0}, {152,   0,   0}, {153,   0,   0}, {153,   0,   0}, {154,   0,   0},\n   {155,   0,   0}, {155,   0,   0}, {156,   0,   0}, {157,   0,   0}, {157,   0,   0},\n   {158,   0,   0}, {159,   0,   0}, {159,   0,   0}, {160,   0,   0}, {161,   0,   0},\n   {161,   0,   0}, {162,   0,   0}, {163,   0,   0}, {163,   0,   0}, {164,   0,   0},\n   {165,   0,   0}, {165,   0,   0}, {166,   0,   0}, {167,   0,   0}, {167,   0,   0},\n   {168,   0,   0}, {169,   0,   0}, {169,   0,   0}, {170,   0,   0}, {171,   0,   0},\n   {171,   0,   0}, {172,   0,   0}, {173,   0,   0}, {173,   0,   0}, {174,   0,   0},\n   {175,   0,   0}, {175,   0,   0}, {176,   0,   0}, {177,   0,   0}, {177,   0,   
0},\n   {178,   0,   0}, {179,   0,   0}, {179,   0,   0}, {180,   0,   0}, {181,   0,   0},\n   {181,   0,   0}, {182,   0,   0}, {183,   0,   0}, {183,   0,   0}, {184,   0,   0},\n   {185,   0,   0}, {186,   0,   0}, {186,   0,   0}, {187,   0,   0}, {188,   0,   0},\n   {188,   0,   0}, {189,   0,   0}, {190,   0,   0}, {190,   0,   0}, {191,   0,   0},\n   {192,   0,   0}, {192,   0,   0}, {193,   0,   0}, {194,   0,   0}, {194,   0,   0},\n   {195,   0,   0}, {196,   0,   0}, {196,   0,   0}, {197,   0,   0}, {198,   0,   0},\n   {198,   0,   0}, {199,   0,   0}, {200,   0,   0}, {200,   0,   0}, {201,   0,   0},\n   {202,   0,   0}, {202,   0,   0}, {203,   0,   0}, {204,   0,   0}, {204,   0,   0},\n   {205,   0,   0}, {206,   0,   0}, {206,   0,   0}, {207,   0,   0}, {208,   0,   0},\n   {208,   0,   0}, {209,   0,   0}, {210,   0,   0}, {210,   0,   0}, {211,   0,   0},\n   {212,   0,   0}, {212,   0,   0}, {213,   0,   0}, {214,   0,   0}, {214,   0,   0},\n   {215,   0,   0}, {216,   0,   0}, {216,   0,   0}, {217,   0,   0}, {218,   0,   0},\n   {218,   0,   0}, {219,   0,   0}, {220,   0,   0}, {220,   0,   0}, {221,   0,   0},\n   {222,   0,   0}, {222,   0,   0}, {223,   0,   0}, {224,   0,   0}, {224,   0,   0},\n   {225,   0,   0}, {226,   0,   0}, {226,   0,   0}, {227,   0,   0}, {228,   0,   0},\n   {228,   0,   0}, {229,   0,   0}, {230,   0,   0}, {230,   0,   0}, {231,   0,   0},\n   {232,   0,   0}, {232,   0,   0}, {233,   0,   0}, {234,   0,   0}, {234,   0,   0},\n   {235,   0,   0}, {236,   0,   0}, {236,   0,   0}, {237,   0,   0}, {238,   0,   0},\n   {238,   0,   0}, {239,   0,   0}, {240,   0,   0}, {240,   0,   0}, {241,   0,   0},\n   {242,   0,   0}, {242,   0,   0}, {243,   0,   0}, {244,   0,   0}, {244,   0,   0},\n   {245,   0,   0}, {246,   0,   0}, {246,   0,   0}, {247,   0,   0}, {248,   0,   0},\n   {248,   0,   0}, {249,   0,   0}, {250,   0,   0}, {251,   0,   0}, {251,   0,   0},\n   {252,   0,   0}, {253,   0,   0}, 
{253,   0,   0}, {254,   0,   0}, {255,   0,   0},\n   {255,   0,   0}, {255,   1,   0}, {255,   2,   0}, {255,   2,   0}, {255,   3,   0},\n   {255,   4,   0}, {255,   4,   0}, {255,   5,   0}, {255,   6,   0}, {255,   6,   0},\n   {255,   7,   0}, {255,   8,   0}, {255,   8,   0}, {255,   9,   0}, {255,  10,   0},\n   {255,  10,   0}, {255,  11,   0}, {255,  12,   0}, {255,  12,   0}, {255,  13,   0},\n   {255,  14,   0}, {255,  14,   0}, {255,  15,   0}, {255,  16,   0}, {255,  16,   0},\n   {255,  17,   0}, {255,  18,   0}, {255,  18,   0}, {255,  19,   0}, {255,  20,   0},\n   {255,  20,   0}, {255,  21,   0}, {255,  22,   0}, {255,  22,   0}, {255,  23,   0},\n   {255,  24,   0}, {255,  24,   0}, {255,  25,   0}, {255,  26,   0}, {255,  26,   0},\n   {255,  27,   0}, {255,  28,   0}, {255,  28,   0}, {255,  29,   0}, {255,  30,   0},\n   {255,  30,   0}, {255,  31,   0}, {255,  32,   0}, {255,  32,   0}, {255,  33,   0},\n   {255,  34,   0}, {255,  34,   0}, {255,  35,   0}, {255,  36,   0}, {255,  36,   0},\n   {255,  37,   0}, {255,  38,   0}, {255,  38,   0}, {255,  39,   0}, {255,  40,   0},\n   {255,  40,   0}, {255,  41,   0}, {255,  42,   0}, {255,  42,   0}, {255,  43,   0},\n   {255,  44,   0}, {255,  44,   0}, {255,  45,   0}, {255,  46,   0}, {255,  46,   0},\n   {255,  47,   0}, {255,  48,   0}, {255,  48,   0}, {255,  49,   0}, {255,  50,   0},\n   {255,  50,   0}, {255,  51,   0}, {255,  52,   0}, {255,  52,   0}, {255,  53,   0},\n   {255,  54,   0}, {255,  54,   0}, {255,  55,   0}, {255,  56,   0}, {255,  56,   0},\n   {255,  57,   0}, {255,  58,   0}, {255,  58,   0}, {255,  59,   0}, {255,  60,   0},\n   {255,  60,   0}, {255,  61,   0}, {255,  62,   0}, {255,  63,   0}, {255,  63,   0},\n   {255,  64,   0}, {255,  65,   0}, {255,  65,   0}, {255,  66,   0}, {255,  67,   0},\n   {255,  67,   0}, {255,  68,   0}, {255,  69,   0}, {255,  69,   0}, {255,  70,   0},\n   {255,  71,   0}, {255,  71,   0}, {255,  72,   0}, {255,  73,   0}, {255,  
73,   0},\n   {255,  74,   0}, {255,  75,   0}, {255,  75,   0}, {255,  76,   0}, {255,  77,   0},\n   {255,  77,   0}, {255,  78,   0}, {255,  79,   0}, {255,  79,   0}, {255,  80,   0},\n   {255,  81,   0}, {255,  81,   0}, {255,  82,   0}, {255,  83,   0}, {255,  83,   0},\n   {255,  84,   0}, {255,  85,   0}, {255,  85,   0}, {255,  86,   0}, {255,  87,   0},\n   {255,  87,   0}, {255,  88,   0}, {255,  89,   0}, {255,  89,   0}, {255,  90,   0},\n   {255,  91,   0}, {255,  91,   0}, {255,  92,   0}, {255,  93,   0}, {255,  93,   0},\n   {255,  94,   0}, {255,  95,   0}, {255,  95,   0}, {255,  96,   0}, {255,  97,   0},\n   {255,  97,   0}, {255,  98,   0}, {255,  99,   0}, {255,  99,   0}, {255, 100,   0},\n   {255, 101,   0}, {255, 101,   0}, {255, 102,   0}, {255, 103,   0}, {255, 103,   0},\n   {255, 104,   0}, {255, 105,   0}, {255, 105,   0}, {255, 106,   0}, {255, 107,   0},\n   {255, 107,   0}, {255, 108,   0}, {255, 109,   0}, {255, 109,   0}, {255, 110,   0},\n   {255, 111,   0}, {255, 111,   0}, {255, 112,   0}, {255, 113,   0}, {255, 113,   0},\n   {255, 114,   0}, {255, 115,   0}, {255, 115,   0}, {255, 116,   0}, {255, 117,   0},\n   {255, 117,   0}, {255, 118,   0}, {255, 119,   0}, {255, 119,   0}, {255, 120,   0},\n   {255, 121,   0}, {255, 121,   0}, {255, 122,   0}, {255, 123,   0}, {255, 123,   0},\n   {255, 124,   0}, {255, 125,   0}, {255, 125,   0}, {255, 126,   0}, {255, 127,   0},\n   {255, 128,   0}, {255, 128,   0}, {255, 129,   0}, {255, 130,   0}, {255, 130,   0},\n   {255, 131,   0}, {255, 132,   0}, {255, 132,   0}, {255, 133,   0}, {255, 134,   0},\n   {255, 134,   0}, {255, 135,   0}, {255, 136,   0}, {255, 136,   0}, {255, 137,   0},\n   {255, 138,   0}, {255, 138,   0}, {255, 139,   0}, {255, 140,   0}, {255, 140,   0},\n   {255, 141,   0}, {255, 142,   0}, {255, 142,   0}, {255, 143,   0}, {255, 144,   0},\n   {255, 144,   0}, {255, 145,   0}, {255, 146,   0}, {255, 146,   0}, {255, 147,   0},\n   {255, 148,   0}, {255, 148, 
  0}, {255, 149,   0}, {255, 150,   0}, {255, 150,   0},\n   {255, 151,   0}, {255, 152,   0}, {255, 152,   0}, {255, 153,   0}, {255, 154,   0},\n   {255, 154,   0}, {255, 155,   0}, {255, 156,   0}, {255, 156,   0}, {255, 157,   0},\n   {255, 158,   0}, {255, 158,   0}, {255, 159,   0}, {255, 160,   0}, {255, 160,   0},\n   {255, 161,   0}, {255, 162,   0}, {255, 162,   0}, {255, 163,   0}, {255, 164,   0},\n   {255, 164,   0}, {255, 165,   0}, {255, 166,   0}, {255, 166,   0}, {255, 167,   0},\n   {255, 168,   0}, {255, 168,   0}, {255, 169,   0}, {255, 170,   0}, {255, 170,   0},\n   {255, 171,   0}, {255, 172,   0}, {255, 172,   0}, {255, 173,   0}, {255, 174,   0},\n   {255, 174,   0}, {255, 175,   0}, {255, 176,   0}, {255, 176,   0}, {255, 177,   0},\n   {255, 178,   0}, {255, 178,   0}, {255, 179,   0}, {255, 180,   0}, {255, 180,   0},\n   {255, 181,   0}, {255, 182,   0}, {255, 182,   0}, {255, 183,   0}, {255, 184,   0},\n   {255, 184,   0}, {255, 185,   0}, {255, 186,   0}, {255, 186,   0}, {255, 187,   0},\n   {255, 188,   0}, {255, 188,   0}, {255, 189,   0}, {255, 190,   0}, {255, 190,   0},\n   {255, 191,   0}, {255, 192,   0}, {255, 192,   0}, {255, 193,   0}, {255, 194,   0},\n   {255, 195,   0}, {255, 195,   0}, {255, 196,   0}, {255, 197,   0}, {255, 197,   0},\n   {255, 198,   0}, {255, 199,   0}, {255, 199,   0}, {255, 200,   0}, {255, 201,   0},\n   {255, 201,   0}, {255, 202,   0}, {255, 203,   0}, {255, 203,   0}, {255, 204,   0},\n   {255, 205,   0}, {255, 205,   0}, {255, 206,   0}, {255, 207,   0}, {255, 207,   0},\n   {255, 208,   0}, {255, 209,   0}, {255, 209,   0}, {255, 210,   0}, {255, 211,   0},\n   {255, 211,   0}, {255, 212,   0}, {255, 213,   0}, {255, 213,   0}, {255, 214,   0},\n   {255, 215,   0}, {255, 215,   0}, {255, 216,   0}, {255, 217,   0}, {255, 217,   0},\n   {255, 218,   0}, {255, 219,   0}, {255, 219,   0}, {255, 220,   0}, {255, 221,   0},\n   {255, 221,   0}, {255, 222,   0}, {255, 223,   0}, {255, 223,   0}, 
{255, 224,   0},\n   {255, 225,   0}, {255, 225,   0}, {255, 226,   0}, {255, 227,   0}, {255, 227,   0},\n   {255, 228,   0}, {255, 229,   0}, {255, 229,   0}, {255, 230,   0}, {255, 231,   0},\n   {255, 231,   0}, {255, 232,   0}, {255, 233,   0}, {255, 233,   0}, {255, 234,   0},\n   {255, 235,   0}, {255, 235,   0}, {255, 236,   0}, {255, 237,   0}, {255, 237,   0},\n   {255, 238,   0}, {255, 239,   0}, {255, 239,   0}, {255, 240,   0}, {255, 241,   0},\n   {255, 241,   0}, {255, 242,   0}, {255, 243,   0}, {255, 243,   0}, {255, 244,   0},\n   {255, 245,   0}, {255, 245,   0}, {255, 246,   0}, {255, 247,   0}, {255, 247,   0},\n   {255, 248,   0}, {255, 249,   0}, {255, 249,   0}, {255, 250,   0}, {255, 251,   0},\n   {255, 251,   0}, {255, 252,   0}, {255, 253,   0}, {255, 253,   0}, {255, 254,   0},\n   {255, 255,   0}, {255, 255,   1}, {255, 255,   2}, {255, 255,   3}, {255, 255,   4},\n   {255, 255,   5}, {255, 255,   6}, {255, 255,   7}, {255, 255,   8}, {255, 255,   9},\n   {255, 255,  10}, {255, 255,  11}, {255, 255,  12}, {255, 255,  13}, {255, 255,  14},\n   {255, 255,  15}, {255, 255,  16}, {255, 255,  17}, {255, 255,  18}, {255, 255,  19},\n   {255, 255,  20}, {255, 255,  21}, {255, 255,  22}, {255, 255,  23}, {255, 255,  24},\n   {255, 255,  25}, {255, 255,  26}, {255, 255,  27}, {255, 255,  28}, {255, 255,  29},\n   {255, 255,  30}, {255, 255,  31}, {255, 255,  32}, {255, 255,  33}, {255, 255,  34},\n   {255, 255,  35}, {255, 255,  36}, {255, 255,  37}, {255, 255,  38}, {255, 255,  39},\n   {255, 255,  40}, {255, 255,  41}, {255, 255,  42}, {255, 255,  43}, {255, 255,  44},\n   {255, 255,  45}, {255, 255,  46}, {255, 255,  47}, {255, 255,  48}, {255, 255,  49},\n   {255, 255,  50}, {255, 255,  51}, {255, 255,  52}, {255, 255,  53}, {255, 255,  54},\n   {255, 255,  55}, {255, 255,  56}, {255, 255,  57}, {255, 255,  58}, {255, 255,  59},\n   {255, 255,  60}, {255, 255,  61}, {255, 255,  62}, {255, 255,  63}, {255, 255,  64},\n   {255, 255,  65}, 
{255, 255,  66}, {255, 255,  67}, {255, 255,  68}, {255, 255,  69},\n   {255, 255,  70}, {255, 255,  71}, {255, 255,  72}, {255, 255,  73}, {255, 255,  74},\n   {255, 255,  75}, {255, 255,  76}, {255, 255,  77}, {255, 255,  78}, {255, 255,  79},\n   {255, 255,  80}, {255, 255,  81}, {255, 255,  82}, {255, 255,  83}, {255, 255,  84},\n   {255, 255,  85}, {255, 255,  86}, {255, 255,  87}, {255, 255,  88}, {255, 255,  89},\n   {255, 255,  90}, {255, 255,  91}, {255, 255,  92}, {255, 255,  93}, {255, 255,  94},\n   {255, 255,  95}, {255, 255,  96}, {255, 255,  97}, {255, 255,  98}, {255, 255,  99},\n   {255, 255, 100}, {255, 255, 101}, {255, 255, 102}, {255, 255, 103}, {255, 255, 104},\n   {255, 255, 105}, {255, 255, 106}, {255, 255, 107}, {255, 255, 108}, {255, 255, 109},\n   {255, 255, 110}, {255, 255, 111}, {255, 255, 112}, {255, 255, 113}, {255, 255, 114},\n   {255, 255, 115}, {255, 255, 116}, {255, 255, 117}, {255, 255, 118}, {255, 255, 119},\n   {255, 255, 120}, {255, 255, 121}, {255, 255, 122}, {255, 255, 123}, {255, 255, 124},\n   {255, 255, 125}, {255, 255, 126}, {255, 255, 127}, {255, 255, 128}, {255, 255, 129},\n   {255, 255, 130}, {255, 255, 131}, {255, 255, 132}, {255, 255, 133}, {255, 255, 134},\n   {255, 255, 135}, {255, 255, 136}, {255, 255, 137}, {255, 255, 138}, {255, 255, 139},\n   {255, 255, 140}, {255, 255, 141}, {255, 255, 142}, {255, 255, 143}, {255, 255, 144},\n   {255, 255, 145}, {255, 255, 146}, {255, 255, 147}, {255, 255, 148}, {255, 255, 149},\n   {255, 255, 150}, {255, 255, 151}, {255, 255, 152}, {255, 255, 153}, {255, 255, 154},\n   {255, 255, 155}, {255, 255, 157}, {255, 255, 158}, {255, 255, 159}, {255, 255, 160},\n   {255, 255, 161}, {255, 255, 162}, {255, 255, 163}, {255, 255, 164}, {255, 255, 165},\n   {255, 255, 166}, {255, 255, 167}, {255, 255, 168}, {255, 255, 169}, {255, 255, 170},\n   {255, 255, 171}, {255, 255, 172}, {255, 255, 173}, {255, 255, 174}, {255, 255, 175},\n   {255, 255, 176}, {255, 255, 177}, {255, 255, 178}, {255, 
255, 179}, {255, 255, 180},\n   {255, 255, 181}, {255, 255, 182}, {255, 255, 183}, {255, 255, 184}, {255, 255, 185},\n   {255, 255, 186}, {255, 255, 187}, {255, 255, 188}, {255, 255, 189}, {255, 255, 190},\n   {255, 255, 191}, {255, 255, 192}, {255, 255, 193}, {255, 255, 194}, {255, 255, 195},\n   {255, 255, 196}, {255, 255, 197}, {255, 255, 198}, {255, 255, 199}, {255, 255, 200},\n   {255, 255, 201}, {255, 255, 202}, {255, 255, 203}, {255, 255, 204}, {255, 255, 205},\n   {255, 255, 206}, {255, 255, 207}, {255, 255, 208}, {255, 255, 209}, {255, 255, 210},\n   {255, 255, 211}, {255, 255, 212}, {255, 255, 213}, {255, 255, 214}, {255, 255, 215},\n   {255, 255, 216}, {255, 255, 217}, {255, 255, 218}, {255, 255, 219}, {255, 255, 220},\n   {255, 255, 221}, {255, 255, 222}, {255, 255, 223}, {255, 255, 224}, {255, 255, 225},\n   {255, 255, 226}, {255, 255, 227}, {255, 255, 228}, {255, 255, 229}, {255, 255, 230},\n   {255, 255, 231}, {255, 255, 232}, {255, 255, 233}, {255, 255, 234}, {255, 255, 235},\n   {255, 255, 236}, {255, 255, 237}, {255, 255, 238}, {255, 255, 239}, {255, 255, 240},\n   {255, 255, 241}, {255, 255, 242}, {255, 255, 243}, {255, 255, 244}, {255, 255, 245},\n   {255, 255, 246}, {255, 255, 247}, {255, 255, 248}, {255, 255, 249}, {255, 255, 250},\n   {255, 255, 251}, {255, 255, 252}, {255, 255, 253}, {255, 255, 254}, {255, 255, 255}\n};\n\nconst rgb_t hsv_colormap[1000] = {\n   {255,   0,   0}, {255,   2,   0}, {255,   3,   0}, {255,   5,   0}, {255,   6,   0},\n   {255,   8,   0}, {255,   9,   0}, {255,  11,   0}, {255,  12,   0}, {255,  14,   0},\n   {255,  15,   0}, {255,  17,   0}, {255,  18,   0}, {255,  20,   0}, {255,  21,   0},\n   {255,  23,   0}, {255,  24,   0}, {255,  26,   0}, {255,  27,   0}, {255,  29,   0},\n   {255,  30,   0}, {255,  32,   0}, {255,  33,   0}, {255,  35,   0}, {255,  36,   0},\n   {255,  38,   0}, {255,  39,   0}, {255,  41,   0}, {255,  42,   0}, {255,  44,   0},\n   {255,  45,   0}, {255,  47,   0}, {255,  48,   0}, {255,  
50,   0}, {255,  51,   0},\n   {255,  53,   0}, {255,  54,   0}, {255,  56,   0}, {255,  57,   0}, {255,  59,   0},\n   {255,  60,   0}, {255,  62,   0}, {255,  63,   0}, {255,  65,   0}, {255,  66,   0},\n   {255,  68,   0}, {255,  69,   0}, {255,  71,   0}, {255,  72,   0}, {255,  74,   0},\n   {255,  75,   0}, {255,  77,   0}, {255,  78,   0}, {255,  80,   0}, {255,  81,   0},\n   {255,  83,   0}, {255,  84,   0}, {255,  86,   0}, {255,  87,   0}, {255,  89,   0},\n   {255,  90,   0}, {255,  92,   0}, {255,  93,   0}, {255,  95,   0}, {255,  96,   0},\n   {255,  98,   0}, {255, 100,   0}, {255, 101,   0}, {255, 103,   0}, {255, 104,   0},\n   {255, 106,   0}, {255, 107,   0}, {255, 109,   0}, {255, 110,   0}, {255, 112,   0},\n   {255, 113,   0}, {255, 115,   0}, {255, 116,   0}, {255, 118,   0}, {255, 119,   0},\n   {255, 121,   0}, {255, 122,   0}, {255, 124,   0}, {255, 125,   0}, {255, 127,   0},\n   {255, 128,   0}, {255, 130,   0}, {255, 131,   0}, {255, 133,   0}, {255, 134,   0},\n   {255, 136,   0}, {255, 137,   0}, {255, 139,   0}, {255, 140,   0}, {255, 142,   0},\n   {255, 143,   0}, {255, 145,   0}, {255, 146,   0}, {255, 148,   0}, {255, 149,   0},\n   {255, 151,   0}, {255, 152,   0}, {255, 154,   0}, {255, 155,   0}, {255, 157,   0},\n   {255, 158,   0}, {255, 160,   0}, {255, 161,   0}, {255, 163,   0}, {255, 164,   0},\n   {255, 166,   0}, {255, 167,   0}, {255, 169,   0}, {255, 170,   0}, {255, 172,   0},\n   {255, 173,   0}, {255, 175,   0}, {255, 176,   0}, {255, 178,   0}, {255, 179,   0},\n   {255, 181,   0}, {255, 182,   0}, {255, 184,   0}, {255, 185,   0}, {255, 187,   0},\n   {255, 188,   0}, {255, 190,   0}, {255, 191,   0}, {255, 193,   0}, {255, 194,   0},\n   {255, 196,   0}, {255, 197,   0}, {255, 199,   0}, {255, 201,   0}, {255, 202,   0},\n   {255, 204,   0}, {255, 205,   0}, {255, 207,   0}, {255, 208,   0}, {255, 210,   0},\n   {255, 211,   0}, {255, 213,   0}, {255, 214,   0}, {255, 216,   0}, {255, 217,   0},\n   {255, 219, 
  0}, {255, 220,   0}, {255, 222,   0}, {255, 223,   0}, {255, 225,   0},\n   {255, 226,   0}, {255, 228,   0}, {255, 229,   0}, {255, 231,   0}, {255, 232,   0},\n   {255, 234,   0}, {255, 235,   0}, {255, 237,   0}, {255, 238,   0}, {255, 239,   0},\n   {254, 240,   0}, {254, 242,   0}, {253, 243,   0}, {253, 244,   0}, {252, 245,   0},\n   {252, 246,   0}, {251, 247,   0}, {251, 248,   0}, {250, 249,   0}, {250, 250,   0},\n   {249, 251,   0}, {249, 252,   0}, {248, 253,   0}, {248, 254,   0}, {247, 255,   0},\n   {246, 255,   0}, {245, 255,   0}, {243, 255,   0}, {242, 255,   0}, {240, 255,   0},\n   {239, 255,   0}, {237, 255,   0}, {236, 255,   0}, {234, 255,   0}, {233, 255,   0},\n   {231, 255,   0}, {230, 255,   0}, {228, 255,   0}, {227, 255,   0}, {225, 255,   0},\n   {224, 255,   0}, {222, 255,   0}, {221, 255,   0}, {219, 255,   0}, {218, 255,   0},\n   {216, 255,   0}, {215, 255,   0}, {213, 255,   0}, {211, 255,   0}, {210, 255,   0},\n   {208, 255,   0}, {207, 255,   0}, {205, 255,   0}, {204, 255,   0}, {202, 255,   0},\n   {201, 255,   0}, {199, 255,   0}, {198, 255,   0}, {196, 255,   0}, {195, 255,   0},\n   {193, 255,   0}, {192, 255,   0}, {190, 255,   0}, {189, 255,   0}, {187, 255,   0},\n   {186, 255,   0}, {184, 255,   0}, {183, 255,   0}, {181, 255,   0}, {180, 255,   0},\n   {178, 255,   0}, {177, 255,   0}, {175, 255,   0}, {174, 255,   0}, {172, 255,   0},\n   {171, 255,   0}, {169, 255,   0}, {168, 255,   0}, {166, 255,   0}, {165, 255,   0},\n   {163, 255,   0}, {162, 255,   0}, {160, 255,   0}, {159, 255,   0}, {157, 255,   0},\n   {156, 255,   0}, {154, 255,   0}, {153, 255,   0}, {151, 255,   0}, {150, 255,   0},\n   {148, 255,   0}, {147, 255,   0}, {145, 255,   0}, {144, 255,   0}, {142, 255,   0},\n   {141, 255,   0}, {139, 255,   0}, {138, 255,   0}, {136, 255,   0}, {135, 255,   0},\n   {133, 255,   0}, {132, 255,   0}, {130, 255,   0}, {129, 255,   0}, {127, 255,   0},\n   {126, 255,   0}, {124, 255,   0}, {123, 255,   0}, 
{121, 255,   0}, {120, 255,   0},\n   {118, 255,   0}, {117, 255,   0}, {115, 255,   0}, {114, 255,   0}, {112, 255,   0},\n   {110, 255,   0}, {109, 255,   0}, {107, 255,   0}, {106, 255,   0}, {104, 255,   0},\n   {103, 255,   0}, {101, 255,   0}, {100, 255,   0}, { 98, 255,   0}, { 97, 255,   0},\n   { 95, 255,   0}, { 94, 255,   0}, { 92, 255,   0}, { 91, 255,   0}, { 89, 255,   0},\n   { 88, 255,   0}, { 86, 255,   0}, { 85, 255,   0}, { 83, 255,   0}, { 82, 255,   0},\n   { 80, 255,   0}, { 79, 255,   0}, { 77, 255,   0}, { 76, 255,   0}, { 74, 255,   0},\n   { 73, 255,   0}, { 71, 255,   0}, { 70, 255,   0}, { 68, 255,   0}, { 67, 255,   0},\n   { 65, 255,   0}, { 64, 255,   0}, { 62, 255,   0}, { 61, 255,   0}, { 59, 255,   0},\n   { 58, 255,   0}, { 56, 255,   0}, { 55, 255,   0}, { 53, 255,   0}, { 52, 255,   0},\n   { 50, 255,   0}, { 49, 255,   0}, { 47, 255,   0}, { 46, 255,   0}, { 44, 255,   0},\n   { 43, 255,   0}, { 41, 255,   0}, { 40, 255,   0}, { 38, 255,   0}, { 37, 255,   0},\n   { 35, 255,   0}, { 34, 255,   0}, { 32, 255,   0}, { 31, 255,   0}, { 29, 255,   0},\n   { 28, 255,   0}, { 26, 255,   0}, { 25, 255,   0}, { 23, 255,   0}, { 22, 255,   0},\n   { 20, 255,   0}, { 19, 255,   0}, { 17, 255,   0}, { 16, 255,   0}, { 14, 255,   0},\n   { 12, 255,   0}, { 11, 255,   0}, {  9, 255,   0}, {  8, 255,   0}, {  7, 255,   1},\n   {  7, 255,   2}, {  6, 255,   3}, {  6, 255,   4}, {  5, 255,   5}, {  5, 255,   6},\n   {  4, 255,   7}, {  4, 255,   8}, {  3, 255,   9}, {  3, 255,  10}, {  2, 255,  11},\n   {  2, 255,  12}, {  1, 255,  13}, {  1, 255,  14}, {  0, 255,  15}, {  0, 255,  16},\n   {  0, 255,  18}, {  0, 255,  19}, {  0, 255,  21}, {  0, 255,  22}, {  0, 255,  24},\n   {  0, 255,  25}, {  0, 255,  27}, {  0, 255,  28}, {  0, 255,  30}, {  0, 255,  31},\n   {  0, 255,  33}, {  0, 255,  34}, {  0, 255,  36}, {  0, 255,  37}, {  0, 255,  39},\n   {  0, 255,  40}, {  0, 255,  42}, {  0, 255,  43}, {  0, 255,  45}, {  0, 255,  46},\n   {  
0, 255,  48}, {  0, 255,  49}, {  0, 255,  51}, {  0, 255,  52}, {  0, 255,  54},\n   {  0, 255,  55}, {  0, 255,  57}, {  0, 255,  58}, {  0, 255,  60}, {  0, 255,  61},\n   {  0, 255,  63}, {  0, 255,  64}, {  0, 255,  66}, {  0, 255,  67}, {  0, 255,  69},\n   {  0, 255,  70}, {  0, 255,  72}, {  0, 255,  73}, {  0, 255,  75}, {  0, 255,  76},\n   {  0, 255,  78}, {  0, 255,  79}, {  0, 255,  81}, {  0, 255,  82}, {  0, 255,  84},\n   {  0, 255,  86}, {  0, 255,  87}, {  0, 255,  89}, {  0, 255,  90}, {  0, 255,  92},\n   {  0, 255,  93}, {  0, 255,  95}, {  0, 255,  96}, {  0, 255,  98}, {  0, 255,  99},\n   {  0, 255, 101}, {  0, 255, 102}, {  0, 255, 104}, {  0, 255, 105}, {  0, 255, 107},\n   {  0, 255, 108}, {  0, 255, 110}, {  0, 255, 111}, {  0, 255, 113}, {  0, 255, 114},\n   {  0, 255, 116}, {  0, 255, 117}, {  0, 255, 119}, {  0, 255, 120}, {  0, 255, 122},\n   {  0, 255, 123}, {  0, 255, 125}, {  0, 255, 126}, {  0, 255, 128}, {  0, 255, 129},\n   {  0, 255, 131}, {  0, 255, 132}, {  0, 255, 134}, {  0, 255, 135}, {  0, 255, 137},\n   {  0, 255, 138}, {  0, 255, 140}, {  0, 255, 141}, {  0, 255, 143}, {  0, 255, 144},\n   {  0, 255, 146}, {  0, 255, 147}, {  0, 255, 149}, {  0, 255, 150}, {  0, 255, 152},\n   {  0, 255, 153}, {  0, 255, 155}, {  0, 255, 156}, {  0, 255, 158}, {  0, 255, 159},\n   {  0, 255, 161}, {  0, 255, 162}, {  0, 255, 164}, {  0, 255, 165}, {  0, 255, 167},\n   {  0, 255, 168}, {  0, 255, 170}, {  0, 255, 171}, {  0, 255, 173}, {  0, 255, 174},\n   {  0, 255, 176}, {  0, 255, 177}, {  0, 255, 179}, {  0, 255, 180}, {  0, 255, 182},\n   {  0, 255, 183}, {  0, 255, 185}, {  0, 255, 187}, {  0, 255, 188}, {  0, 255, 190},\n   {  0, 255, 191}, {  0, 255, 193}, {  0, 255, 194}, {  0, 255, 196}, {  0, 255, 197},\n   {  0, 255, 199}, {  0, 255, 200}, {  0, 255, 202}, {  0, 255, 203}, {  0, 255, 205},\n   {  0, 255, 206}, {  0, 255, 208}, {  0, 255, 209}, {  0, 255, 211}, {  0, 255, 212},\n   {  0, 255, 214}, {  0, 255, 215}, {  0, 255, 
217}, {  0, 255, 218}, {  0, 255, 220},\n   {  0, 255, 221}, {  0, 255, 223}, {  0, 255, 224}, {  0, 255, 226}, {  0, 255, 227},\n   {  0, 255, 229}, {  0, 255, 230}, {  0, 255, 232}, {  0, 255, 233}, {  0, 255, 235},\n   {  0, 255, 236}, {  0, 255, 238}, {  0, 255, 239}, {  0, 255, 241}, {  0, 255, 242},\n   {  0, 255, 244}, {  0, 255, 245}, {  0, 255, 247}, {  0, 255, 248}, {  0, 255, 250},\n   {  0, 255, 251}, {  0, 255, 253}, {  0, 255, 254}, {  0, 254, 255}, {  0, 253, 255},\n   {  0, 251, 255}, {  0, 250, 255}, {  0, 248, 255}, {  0, 247, 255}, {  0, 245, 255},\n   {  0, 244, 255}, {  0, 242, 255}, {  0, 241, 255}, {  0, 239, 255}, {  0, 238, 255},\n   {  0, 236, 255}, {  0, 235, 255}, {  0, 233, 255}, {  0, 232, 255}, {  0, 230, 255},\n   {  0, 229, 255}, {  0, 227, 255}, {  0, 225, 255}, {  0, 224, 255}, {  0, 222, 255},\n   {  0, 221, 255}, {  0, 219, 255}, {  0, 218, 255}, {  0, 216, 255}, {  0, 215, 255},\n   {  0, 213, 255}, {  0, 212, 255}, {  0, 210, 255}, {  0, 209, 255}, {  0, 207, 255},\n   {  0, 206, 255}, {  0, 204, 255}, {  0, 203, 255}, {  0, 201, 255}, {  0, 200, 255},\n   {  0, 198, 255}, {  0, 197, 255}, {  0, 195, 255}, {  0, 194, 255}, {  0, 192, 255},\n   {  0, 191, 255}, {  0, 189, 255}, {  0, 188, 255}, {  0, 186, 255}, {  0, 185, 255},\n   {  0, 183, 255}, {  0, 182, 255}, {  0, 180, 255}, {  0, 179, 255}, {  0, 177, 255},\n   {  0, 176, 255}, {  0, 174, 255}, {  0, 173, 255}, {  0, 171, 255}, {  0, 170, 255},\n   {  0, 168, 255}, {  0, 167, 255}, {  0, 165, 255}, {  0, 164, 255}, {  0, 162, 255},\n   {  0, 161, 255}, {  0, 159, 255}, {  0, 158, 255}, {  0, 156, 255}, {  0, 155, 255},\n   {  0, 153, 255}, {  0, 152, 255}, {  0, 150, 255}, {  0, 149, 255}, {  0, 147, 255},\n   {  0, 146, 255}, {  0, 144, 255}, {  0, 143, 255}, {  0, 141, 255}, {  0, 140, 255},\n   {  0, 138, 255}, {  0, 137, 255}, {  0, 135, 255}, {  0, 134, 255}, {  0, 132, 255},\n   {  0, 131, 255}, {  0, 129, 255}, {  0, 128, 255}, {  0, 126, 255}, {  0, 124, 255},\n 
  {  0, 123, 255}, {  0, 121, 255}, {  0, 120, 255}, {  0, 118, 255}, {  0, 117, 255},\n   {  0, 115, 255}, {  0, 114, 255}, {  0, 112, 255}, {  0, 111, 255}, {  0, 109, 255},\n   {  0, 108, 255}, {  0, 106, 255}, {  0, 105, 255}, {  0, 103, 255}, {  0, 102, 255},\n   {  0, 100, 255}, {  0,  99, 255}, {  0,  97, 255}, {  0,  96, 255}, {  0,  94, 255},\n   {  0,  93, 255}, {  0,  91, 255}, {  0,  90, 255}, {  0,  88, 255}, {  0,  87, 255},\n   {  0,  85, 255}, {  0,  84, 255}, {  0,  82, 255}, {  0,  81, 255}, {  0,  79, 255},\n   {  0,  78, 255}, {  0,  76, 255}, {  0,  75, 255}, {  0,  73, 255}, {  0,  72, 255},\n   {  0,  70, 255}, {  0,  69, 255}, {  0,  67, 255}, {  0,  66, 255}, {  0,  64, 255},\n   {  0,  63, 255}, {  0,  61, 255}, {  0,  60, 255}, {  0,  58, 255}, {  0,  57, 255},\n   {  0,  55, 255}, {  0,  54, 255}, {  0,  52, 255}, {  0,  51, 255}, {  0,  49, 255},\n   {  0,  48, 255}, {  0,  46, 255}, {  0,  45, 255}, {  0,  43, 255}, {  0,  42, 255},\n   {  0,  40, 255}, {  0,  39, 255}, {  0,  37, 255}, {  0,  36, 255}, {  0,  34, 255},\n   {  0,  33, 255}, {  0,  31, 255}, {  0,  30, 255}, {  0,  28, 255}, {  0,  26, 255},\n   {  0,  25, 255}, {  0,  23, 255}, {  0,  22, 255}, {  0,  20, 255}, {  0,  19, 255},\n   {  0,  17, 255}, {  0,  16, 255}, {  1,  15, 255}, {  1,  14, 255}, {  2,  13, 255},\n   {  2,  12, 255}, {  3,  11, 255}, {  3,  10, 255}, {  4,   9, 255}, {  4,   8, 255},\n   {  5,   7, 255}, {  5,   6, 255}, {  6,   5, 255}, {  6,   4, 255}, {  7,   3, 255},\n   {  7,   2, 255}, {  8,   1, 255}, {  8,   0, 255}, { 10,   0, 255}, { 11,   0, 255},\n   { 13,   0, 255}, { 14,   0, 255}, { 16,   0, 255}, { 17,   0, 255}, { 19,   0, 255},\n   { 20,   0, 255}, { 22,   0, 255}, { 23,   0, 255}, { 25,   0, 255}, { 26,   0, 255},\n   { 28,   0, 255}, { 29,   0, 255}, { 31,   0, 255}, { 32,   0, 255}, { 34,   0, 255},\n   { 35,   0, 255}, { 37,   0, 255}, { 38,   0, 255}, { 40,   0, 255}, { 41,   0, 255},\n   { 43,   0, 255}, { 44,   0, 255}, { 46, 
  0, 255}, { 47,   0, 255}, { 49,   0, 255},\n   { 50,   0, 255}, { 52,   0, 255}, { 53,   0, 255}, { 55,   0, 255}, { 56,   0, 255},\n   { 58,   0, 255}, { 59,   0, 255}, { 61,   0, 255}, { 62,   0, 255}, { 64,   0, 255},\n   { 65,   0, 255}, { 67,   0, 255}, { 68,   0, 255}, { 70,   0, 255}, { 72,   0, 255},\n   { 73,   0, 255}, { 75,   0, 255}, { 76,   0, 255}, { 78,   0, 255}, { 79,   0, 255},\n   { 81,   0, 255}, { 82,   0, 255}, { 84,   0, 255}, { 85,   0, 255}, { 87,   0, 255},\n   { 88,   0, 255}, { 90,   0, 255}, { 91,   0, 255}, { 93,   0, 255}, { 94,   0, 255},\n   { 96,   0, 255}, { 97,   0, 255}, { 99,   0, 255}, {100,   0, 255}, {102,   0, 255},\n   {103,   0, 255}, {105,   0, 255}, {106,   0, 255}, {108,   0, 255}, {109,   0, 255},\n   {111,   0, 255}, {112,   0, 255}, {114,   0, 255}, {115,   0, 255}, {117,   0, 255},\n   {118,   0, 255}, {120,   0, 255}, {121,   0, 255}, {123,   0, 255}, {124,   0, 255},\n   {126,   0, 255}, {127,   0, 255}, {129,   0, 255}, {130,   0, 255}, {132,   0, 255},\n   {133,   0, 255}, {135,   0, 255}, {136,   0, 255}, {138,   0, 255}, {139,   0, 255},\n   {141,   0, 255}, {142,   0, 255}, {144,   0, 255}, {145,   0, 255}, {147,   0, 255},\n   {148,   0, 255}, {150,   0, 255}, {151,   0, 255}, {153,   0, 255}, {154,   0, 255},\n   {156,   0, 255}, {157,   0, 255}, {159,   0, 255}, {160,   0, 255}, {162,   0, 255},\n   {163,   0, 255}, {165,   0, 255}, {166,   0, 255}, {168,   0, 255}, {169,   0, 255},\n   {171,   0, 255}, {173,   0, 255}, {174,   0, 255}, {176,   0, 255}, {177,   0, 255},\n   {179,   0, 255}, {180,   0, 255}, {182,   0, 255}, {183,   0, 255}, {185,   0, 255},\n   {186,   0, 255}, {188,   0, 255}, {189,   0, 255}, {191,   0, 255}, {192,   0, 255},\n   {194,   0, 255}, {195,   0, 255}, {197,   0, 255}, {198,   0, 255}, {200,   0, 255},\n   {201,   0, 255}, {203,   0, 255}, {204,   0, 255}, {206,   0, 255}, {207,   0, 255},\n   {209,   0, 255}, {210,   0, 255}, {212,   0, 255}, {213,   0, 255}, {215,   0, 
255},\n   {216,   0, 255}, {218,   0, 255}, {219,   0, 255}, {221,   0, 255}, {222,   0, 255},\n   {224,   0, 255}, {225,   0, 255}, {227,   0, 255}, {228,   0, 255}, {230,   0, 255},\n   {231,   0, 255}, {233,   0, 255}, {234,   0, 255}, {236,   0, 255}, {237,   0, 255},\n   {239,   0, 255}, {240,   0, 255}, {242,   0, 255}, {243,   0, 255}, {245,   0, 255},\n   {246,   0, 255}, {247,   0, 254}, {248,   0, 253}, {248,   0, 252}, {249,   0, 251},\n   {249,   0, 250}, {250,   0, 249}, {250,   0, 248}, {251,   0, 247}, {251,   0, 246},\n   {252,   0, 245}, {252,   0, 244}, {253,   0, 243}, {253,   0, 242}, {254,   0, 241},\n   {254,   0, 240}, {255,   0, 239}, {255,   0, 238}, {255,   0, 236}, {255,   0, 235},\n   {255,   0, 233}, {255,   0, 232}, {255,   0, 230}, {255,   0, 229}, {255,   0, 227},\n   {255,   0, 226}, {255,   0, 224}, {255,   0, 223}, {255,   0, 221}, {255,   0, 220},\n   {255,   0, 218}, {255,   0, 217}, {255,   0, 215}, {255,   0, 214}, {255,   0, 212},\n   {255,   0, 211}, {255,   0, 209}, {255,   0, 208}, {255,   0, 206}, {255,   0, 205},\n   {255,   0, 203}, {255,   0, 202}, {255,   0, 200}, {255,   0, 199}, {255,   0, 197},\n   {255,   0, 196}, {255,   0, 194}, {255,   0, 193}, {255,   0, 191}, {255,   0, 190},\n   {255,   0, 188}, {255,   0, 187}, {255,   0, 185}, {255,   0, 184}, {255,   0, 182},\n   {255,   0, 181}, {255,   0, 179}, {255,   0, 178}, {255,   0, 176}, {255,   0, 175},\n   {255,   0, 173}, {255,   0, 172}, {255,   0, 170}, {255,   0, 169}, {255,   0, 167},\n   {255,   0, 166}, {255,   0, 164}, {255,   0, 163}, {255,   0, 161}, {255,   0, 160},\n   {255,   0, 158}, {255,   0, 157}, {255,   0, 155}, {255,   0, 154}, {255,   0, 152},\n   {255,   0, 151}, {255,   0, 149}, {255,   0, 148}, {255,   0, 146}, {255,   0, 145},\n   {255,   0, 143}, {255,   0, 141}, {255,   0, 140}, {255,   0, 138}, {255,   0, 137},\n   {255,   0, 135}, {255,   0, 134}, {255,   0, 132}, {255,   0, 131}, {255,   0, 129},\n   {255,   0, 128}, {255,   0, 
126}, {255,   0, 125}, {255,   0, 123}, {255,   0, 122},\n   {255,   0, 120}, {255,   0, 119}, {255,   0, 117}, {255,   0, 116}, {255,   0, 114},\n   {255,   0, 113}, {255,   0, 111}, {255,   0, 110}, {255,   0, 108}, {255,   0, 107},\n   {255,   0, 105}, {255,   0, 104}, {255,   0, 102}, {255,   0, 101}, {255,   0,  99},\n   {255,   0,  98}, {255,   0,  96}, {255,   0,  95}, {255,   0,  93}, {255,   0,  92},\n   {255,   0,  90}, {255,   0,  89}, {255,   0,  87}, {255,   0,  86}, {255,   0,  84},\n   {255,   0,  83}, {255,   0,  81}, {255,   0,  80}, {255,   0,  78}, {255,   0,  77},\n   {255,   0,  75}, {255,   0,  74}, {255,   0,  72}, {255,   0,  71}, {255,   0,  69},\n   {255,   0,  68}, {255,   0,  66}, {255,   0,  65}, {255,   0,  63}, {255,   0,  62},\n   {255,   0,  60}, {255,   0,  59}, {255,   0,  57}, {255,   0,  56}, {255,   0,  54},\n   {255,   0,  53}, {255,   0,  51}, {255,   0,  50}, {255,   0,  48}, {255,   0,  47},\n   {255,   0,  45}, {255,   0,  44}, {255,   0,  42}, {255,   0,  40}, {255,   0,  39},\n   {255,   0,  37}, {255,   0,  36}, {255,   0,  34}, {255,   0,  33}, {255,   0,  31},\n   {255,   0,  30}, {255,   0,  28}, {255,   0,  27}, {255,   0,  25}, {255,   0,  24}\n};\n\nconst rgb_t jet_colormap[1000] = {\n   { 29,   0, 102}, { 23,   0, 107}, { 17,   0, 112}, { 12,   0, 117}, {  6,   0, 122},\n   {  0,   0, 127}, {  0,   0, 128}, {  0,   0, 129}, {  0,   0, 129}, {  0,   0, 130},\n   {  0,   0, 131}, {  0,   0, 132}, {  0,   0, 133}, {  0,   0, 133}, {  0,   0, 134},\n   {  0,   0, 135}, {  0,   0, 136}, {  0,   0, 137}, {  0,   0, 138}, {  0,   0, 140},\n   {  0,   0, 141}, {  0,   0, 142}, {  0,   0, 143}, {  0,   0, 145}, {  0,   0, 146},\n   {  0,   0, 147}, {  0,   0, 148}, {  0,   0, 150}, {  0,   0, 151}, {  0,   0, 152},\n   {  0,   0, 153}, {  0,   0, 154}, {  0,   0, 156}, {  0,   0, 157}, {  0,   0, 158},\n   {  0,   0, 159}, {  0,   0, 160}, {  0,   0, 161}, {  0,   0, 163}, {  0,   0, 164},\n   {  0,   0, 165}, {  0,   0, 
166}, {  0,   0, 168}, {  0,   0, 169}, {  0,   0, 170},\n   {  0,   0, 171}, {  0,   0, 173}, {  0,   0, 174}, {  0,   0, 175}, {  0,   0, 176},\n   {  0,   0, 178}, {  0,   0, 179}, {  0,   0, 180}, {  0,   0, 181}, {  0,   0, 183},\n   {  0,   0, 184}, {  0,   0, 185}, {  0,   0, 186}, {  0,   0, 188}, {  0,   0, 189},\n   {  0,   0, 190}, {  0,   0, 191}, {  0,   0, 193}, {  0,   0, 194}, {  0,   0, 195},\n   {  0,   0, 196}, {  0,   0, 197}, {  0,   0, 198}, {  0,   0, 200}, {  0,   0, 201},\n   {  0,   0, 202}, {  0,   0, 203}, {  0,   0, 204}, {  0,   0, 206}, {  0,   0, 207},\n   {  0,   0, 208}, {  0,   0, 209}, {  0,   0, 211}, {  0,   0, 212}, {  0,   0, 213},\n   {  0,   0, 214}, {  0,   0, 216}, {  0,   0, 217}, {  0,   0, 218}, {  0,   0, 219},\n   {  0,   0, 221}, {  0,   0, 222}, {  0,   0, 223}, {  0,   0, 225}, {  0,   0, 226},\n   {  0,   0, 227}, {  0,   0, 228}, {  0,   0, 230}, {  0,   0, 231}, {  0,   0, 232},\n   {  0,   0, 233}, {  0,   0, 234}, {  0,   0, 234}, {  0,   0, 235}, {  0,   0, 236},\n   {  0,   0, 237}, {  0,   0, 238}, {  0,   0, 239}, {  0,   0, 239}, {  0,   0, 240},\n   {  0,   0, 241}, {  0,   0, 242}, {  0,   0, 243}, {  0,   0, 244}, {  0,   0, 246},\n   {  0,   0, 247}, {  0,   0, 248}, {  0,   0, 249}, {  0,   0, 250}, {  0,   0, 251},\n   {  0,   0, 253}, {  0,   0, 254}, {  0,   0, 254}, {  0,   0, 254}, {  0,   0, 254},\n   {  0,   0, 254}, {  0,   0, 254}, {  0,   0, 255}, {  0,   0, 255}, {  0,   0, 255},\n   {  0,   0, 255}, {  0,   0, 255}, {  0,   0, 255}, {  0,   1, 255}, {  0,   1, 255},\n   {  0,   2, 255}, {  0,   3, 255}, {  0,   3, 255}, {  0,   4, 255}, {  0,   5, 255},\n   {  0,   6, 255}, {  0,   6, 255}, {  0,   7, 255}, {  0,   8, 255}, {  0,   9, 255},\n   {  0,  10, 255}, {  0,  11, 255}, {  0,  12, 255}, {  0,  13, 255}, {  0,  14, 255},\n   {  0,  15, 255}, {  0,  16, 255}, {  0,  17, 255}, {  0,  18, 255}, {  0,  19, 255},\n   {  0,  21, 255}, {  0,  22, 255}, {  0,  23, 255}, {  0,  24, 255}, { 
 0,  25, 255},\n   {  0,  26, 255}, {  0,  27, 255}, {  0,  28, 255}, {  0,  29, 255}, {  0,  30, 255},\n   {  0,  31, 255}, {  0,  32, 255}, {  0,  34, 255}, {  0,  35, 255}, {  0,  36, 255},\n   {  0,  37, 255}, {  0,  38, 255}, {  0,  39, 255}, {  0,  40, 255}, {  0,  41, 255},\n   {  0,  42, 255}, {  0,  43, 255}, {  0,  44, 255}, {  0,  45, 255}, {  0,  46, 255},\n   {  0,  48, 255}, {  0,  49, 255}, {  0,  50, 255}, {  0,  51, 255}, {  0,  52, 255},\n   {  0,  53, 255}, {  0,  54, 255}, {  0,  55, 255}, {  0,  56, 255}, {  0,  57, 255},\n   {  0,  58, 255}, {  0,  58, 255}, {  0,  59, 255}, {  0,  60, 255}, {  0,  60, 255},\n   {  0,  61, 255}, {  0,  62, 255}, {  0,  63, 255}, {  0,  63, 255}, {  0,  64, 255},\n   {  0,  65, 255}, {  0,  66, 255}, {  0,  67, 255}, {  0,  68, 255}, {  0,  69, 255},\n   {  0,  71, 255}, {  0,  72, 255}, {  0,  73, 255}, {  0,  74, 255}, {  0,  75, 255},\n   {  0,  76, 255}, {  0,  77, 255}, {  0,  78, 255}, {  0,  79, 255}, {  0,  80, 255},\n   {  0,  81, 255}, {  0,  82, 255}, {  0,  84, 255}, {  0,  85, 255}, {  0,  86, 255},\n   {  0,  87, 255}, {  0,  88, 255}, {  0,  89, 255}, {  0,  90, 255}, {  0,  91, 255},\n   {  0,  92, 255}, {  0,  93, 255}, {  0,  94, 255}, {  0,  95, 255}, {  0,  96, 255},\n   {  0,  98, 255}, {  0,  99, 255}, {  0, 100, 255}, {  0, 101, 255}, {  0, 102, 255},\n   {  0, 103, 255}, {  0, 104, 255}, {  0, 105, 255}, {  0, 106, 255}, {  0, 107, 255},\n   {  0, 108, 255}, {  0, 109, 255}, {  0, 111, 255}, {  0, 112, 255}, {  0, 113, 255},\n   {  0, 114, 255}, {  0, 115, 255}, {  0, 116, 255}, {  0, 117, 255}, {  0, 118, 255},\n   {  0, 119, 255}, {  0, 120, 255}, {  0, 121, 255}, {  0, 122, 255}, {  0, 123, 255},\n   {  0, 125, 255}, {  0, 126, 255}, {  0, 127, 255}, {  0, 128, 255}, {  0, 129, 255},\n   {  0, 130, 255}, {  0, 131, 255}, {  0, 132, 255}, {  0, 133, 255}, {  0, 134, 255},\n   {  0, 135, 255}, {  0, 136, 255}, {  0, 138, 255}, {  0, 139, 255}, {  0, 140, 255},\n   {  0, 141, 255}, {  0, 
142, 255}, {  0, 143, 255}, {  0, 144, 255}, {  0, 145, 255},\n   {  0, 146, 255}, {  0, 147, 255}, {  0, 148, 255}, {  0, 149, 255}, {  0, 150, 255},\n   {  0, 150, 255}, {  0, 151, 255}, {  0, 152, 255}, {  0, 153, 255}, {  0, 153, 255},\n   {  0, 154, 255}, {  0, 155, 255}, {  0, 155, 255}, {  0, 156, 255}, {  0, 157, 255},\n   {  0, 158, 255}, {  0, 159, 255}, {  0, 161, 255}, {  0, 162, 255}, {  0, 163, 255},\n   {  0, 164, 255}, {  0, 165, 255}, {  0, 166, 255}, {  0, 167, 255}, {  0, 168, 255},\n   {  0, 169, 255}, {  0, 170, 255}, {  0, 171, 255}, {  0, 172, 255}, {  0, 173, 255},\n   {  0, 175, 255}, {  0, 176, 255}, {  0, 177, 255}, {  0, 178, 255}, {  0, 179, 255},\n   {  0, 180, 255}, {  0, 181, 255}, {  0, 182, 255}, {  0, 183, 255}, {  0, 184, 255},\n   {  0, 185, 255}, {  0, 186, 255}, {  0, 188, 255}, {  0, 189, 255}, {  0, 190, 255},\n   {  0, 191, 255}, {  0, 192, 255}, {  0, 193, 255}, {  0, 194, 255}, {  0, 195, 255},\n   {  0, 196, 255}, {  0, 197, 255}, {  0, 198, 255}, {  0, 199, 255}, {  0, 200, 255},\n   {  0, 202, 255}, {  0, 203, 255}, {  0, 204, 255}, {  0, 205, 255}, {  0, 206, 255},\n   {  0, 207, 255}, {  0, 208, 255}, {  0, 209, 255}, {  0, 210, 255}, {  0, 211, 255},\n   {  0, 212, 255}, {  0, 213, 255}, {  0, 215, 255}, {  0, 216, 255}, {  0, 217, 255},\n   {  0, 218, 254}, {  0, 219, 253}, {  0, 220, 252}, {  0, 221, 252}, {  0, 222, 251},\n   {  0, 223, 250}, {  0, 224, 250}, {  0, 225, 249}, {  0, 226, 248}, {  0, 227, 247},\n   {  0, 229, 247}, {  1, 230, 246}, {  2, 231, 245}, {  3, 232, 244}, {  3, 233, 243},\n   {  4, 234, 242}, {  5, 235, 241}, {  5, 236, 240}, {  6, 237, 239}, {  7, 238, 238},\n   {  8, 239, 238}, {  8, 240, 237}, {  9, 241, 236}, { 10, 242, 236}, { 10, 242, 235},\n   { 11, 243, 235}, { 11, 244, 234}, { 12, 245, 234}, { 13, 245, 233}, { 13, 246, 232},\n   { 14, 247, 232}, { 15, 247, 231}, { 15, 248, 231}, { 16, 249, 230}, { 17, 249, 229},\n   { 18, 250, 228}, { 18, 251, 227}, { 19, 251, 226}, { 20, 252, 
225}, { 21, 253, 224},\n   { 22, 253, 224}, { 23, 254, 223}, { 23, 254, 222}, { 24, 255, 221}, { 25, 255, 220},\n   { 26, 255, 219}, { 27, 255, 218}, { 28, 255, 218}, { 29, 255, 217}, { 30, 255, 216},\n   { 30, 255, 215}, { 31, 255, 214}, { 32, 255, 214}, { 33, 255, 213}, { 34, 255, 212},\n   { 35, 255, 211}, { 36, 255, 210}, { 37, 255, 209}, { 38, 255, 208}, { 39, 255, 207},\n   { 39, 255, 207}, { 40, 255, 206}, { 41, 255, 205}, { 42, 255, 204}, { 43, 255, 203},\n   { 44, 255, 202}, { 45, 255, 201}, { 46, 255, 200}, { 47, 255, 199}, { 48, 255, 198},\n   { 48, 255, 198}, { 49, 255, 197}, { 50, 255, 196}, { 51, 255, 195}, { 52, 255, 194},\n   { 53, 255, 193}, { 54, 255, 192}, { 55, 255, 191}, { 55, 255, 191}, { 56, 255, 190},\n   { 57, 255, 189}, { 58, 255, 188}, { 59, 255, 187}, { 60, 255, 186}, { 60, 255, 186},\n   { 61, 255, 185}, { 62, 255, 184}, { 63, 255, 183}, { 64, 255, 182}, { 65, 255, 181},\n   { 65, 255, 181}, { 66, 255, 180}, { 67, 255, 179}, { 68, 255, 178}, { 69, 255, 177},\n   { 70, 255, 176}, { 71, 255, 175}, { 72, 255, 174}, { 73, 255, 173}, { 74, 255, 172},\n   { 74, 255, 172}, { 75, 255, 171}, { 76, 255, 170}, { 77, 255, 169}, { 78, 255, 168},\n   { 79, 255, 167}, { 80, 255, 166}, { 81, 255, 165}, { 82, 255, 164}, { 83, 255, 163},\n   { 83, 255, 163}, { 84, 255, 162}, { 84, 255, 162}, { 85, 255, 161}, { 85, 255, 161},\n   { 86, 255, 160}, { 87, 255, 159}, { 87, 255, 159}, { 88, 255, 158}, { 88, 255, 158},\n   { 89, 255, 157}, { 89, 255, 157}, { 90, 255, 156}, { 91, 255, 155}, { 92, 255, 154},\n   { 93, 255, 153}, { 94, 255, 152}, { 95, 255, 151}, { 96, 255, 150}, { 97, 255, 149},\n   { 97, 255, 149}, { 98, 255, 148}, { 99, 255, 147}, {100, 255, 146}, {101, 255, 145},\n   {102, 255, 144}, {102, 255, 143}, {103, 255, 142}, {104, 255, 141}, {105, 255, 140},\n   {106, 255, 140}, {107, 255, 139}, {107, 255, 138}, {108, 255, 137}, {109, 255, 136},\n   {110, 255, 135}, {111, 255, 134}, {112, 255, 134}, {113, 255, 133}, {114, 255, 132},\n   {114, 255, 
131}, {115, 255, 130}, {116, 255, 130}, {117, 255, 129}, {118, 255, 128},\n   {119, 255, 127}, {120, 255, 126}, {121, 255, 125}, {122, 255, 124}, {123, 255, 123},\n   {123, 255, 123}, {124, 255, 122}, {125, 255, 121}, {126, 255, 120}, {127, 255, 119},\n   {128, 255, 118}, {129, 255, 117}, {130, 255, 116}, {130, 255, 115}, {131, 255, 114},\n   {132, 255, 114}, {133, 255, 113}, {134, 255, 112}, {134, 255, 111}, {135, 255, 110},\n   {136, 255, 109}, {137, 255, 108}, {138, 255, 107}, {139, 255, 107}, {140, 255, 106},\n   {140, 255, 105}, {141, 255, 104}, {142, 255, 103}, {143, 255, 102}, {144, 255, 102},\n   {145, 255, 101}, {146, 255, 100}, {147, 255,  99}, {148, 255,  98}, {149, 255,  97},\n   {149, 255,  97}, {150, 255,  96}, {151, 255,  95}, {152, 255,  94}, {153, 255,  93},\n   {154, 255,  92}, {155, 255,  91}, {156, 255,  90}, {157, 255,  89}, {157, 255,  89},\n   {158, 255,  88}, {158, 255,  88}, {159, 255,  87}, {159, 255,  87}, {160, 255,  86},\n   {161, 255,  85}, {161, 255,  85}, {162, 255,  84}, {162, 255,  84}, {163, 255,  83},\n   {163, 255,  83}, {164, 255,  82}, {165, 255,  81}, {166, 255,  80}, {167, 255,  79},\n   {168, 255,  78}, {169, 255,  77}, {170, 255,  76}, {171, 255,  75}, {172, 255,  74},\n   {172, 255,  74}, {173, 255,  73}, {174, 255,  72}, {175, 255,  71}, {176, 255,  70},\n   {177, 255,  69}, {178, 255,  68}, {179, 255,  67}, {180, 255,  66}, {181, 255,  65},\n   {181, 255,  65}, {182, 255,  64}, {183, 255,  63}, {184, 255,  62}, {185, 255,  61},\n   {186, 255,  60}, {186, 255,  60}, {187, 255,  59}, {188, 255,  58}, {189, 255,  57},\n   {190, 255,  56}, {191, 255,  55}, {191, 255,  55}, {192, 255,  54}, {193, 255,  53},\n   {194, 255,  52}, {195, 255,  51}, {196, 255,  50}, {197, 255,  49}, {198, 255,  48},\n   {198, 255,  48}, {199, 255,  47}, {200, 255,  46}, {201, 255,  45}, {202, 255,  44},\n   {203, 255,  43}, {204, 255,  42}, {205, 255,  41}, {206, 255,  40}, {207, 255,  39},\n   {207, 255,  39}, {208, 255,  38}, {209, 255,  37}, 
{210, 255,  36}, {211, 255,  35},\n   {212, 255,  34}, {213, 255,  33}, {214, 255,  32}, {214, 255,  31}, {215, 255,  30},\n   {216, 255,  30}, {217, 255,  29}, {218, 255,  28}, {218, 255,  27}, {219, 255,  26},\n   {220, 255,  25}, {221, 255,  24}, {222, 255,  23}, {223, 255,  23}, {224, 255,  22},\n   {224, 255,  21}, {225, 255,  20}, {226, 255,  19}, {227, 255,  18}, {228, 255,  18},\n   {229, 255,  17}, {230, 255,  16}, {231, 255,  15}, {231, 255,  15}, {232, 255,  14},\n   {232, 255,  13}, {233, 255,  13}, {234, 255,  12}, {234, 255,  11}, {235, 255,  11},\n   {235, 255,  10}, {236, 255,  10}, {236, 255,   9}, {237, 255,   8}, {238, 254,   8},\n   {238, 253,   7}, {239, 252,   6}, {240, 251,   5}, {241, 250,   5}, {242, 249,   4},\n   {243, 248,   3}, {244, 247,   3}, {245, 246,   2}, {246, 246,   1}, {247, 245,   0},\n   {247, 243,   0}, {248, 242,   0}, {249, 242,   0}, {250, 241,   0}, {250, 240,   0},\n   {251, 239,   0}, {252, 238,   0}, {252, 237,   0}, {253, 236,   0}, {254, 235,   0},\n   {255, 234,   0}, {255, 233,   0}, {255, 232,   0}, {255, 231,   0}, {255, 230,   0},\n   {255, 229,   0}, {255, 228,   0}, {255, 227,   0}, {255, 226,   0}, {255, 225,   0},\n   {255, 224,   0}, {255, 223,   0}, {255, 222,   0}, {255, 221,   0}, {255, 220,   0},\n   {255, 219,   0}, {255, 218,   0}, {255, 217,   0}, {255, 216,   0}, {255, 215,   0},\n   {255, 214,   0}, {255, 213,   0}, {255, 212,   0}, {255, 211,   0}, {255, 210,   0},\n   {255, 209,   0}, {255, 208,   0}, {255, 207,   0}, {255, 206,   0}, {255, 205,   0},\n   {255, 204,   0}, {255, 203,   0}, {255, 202,   0}, {255, 201,   0}, {255, 200,   0},\n   {255, 199,   0}, {255, 198,   0}, {255, 197,   0}, {255, 196,   0}, {255, 195,   0},\n   {255, 194,   0}, {255, 193,   0}, {255, 192,   0}, {255, 191,   0}, {255, 190,   0},\n   {255, 189,   0}, {255, 188,   0}, {255, 187,   0}, {255, 186,   0}, {255, 185,   0},\n   {255, 184,   0}, {255, 183,   0}, {255, 182,   0}, {255, 180,   0}, {255, 179,   0},\n   
{255, 178,   0}, {255, 177,   0}, {255, 176,   0}, {255, 176,   0}, {255, 175,   0},\n   {255, 175,   0}, {255, 174,   0}, {255, 173,   0}, {255, 173,   0}, {255, 172,   0},\n   {255, 171,   0}, {255, 171,   0}, {255, 170,   0}, {255, 169,   0}, {255, 168,   0},\n   {255, 167,   0}, {255, 166,   0}, {255, 165,   0}, {255, 164,   0}, {255, 163,   0},\n   {255, 162,   0}, {255, 161,   0}, {255, 160,   0}, {255, 159,   0}, {255, 158,   0},\n   {255, 157,   0}, {255, 156,   0}, {255, 155,   0}, {255, 154,   0}, {255, 153,   0},\n   {255, 152,   0}, {255, 151,   0}, {255, 150,   0}, {255, 150,   0}, {255, 149,   0},\n   {255, 147,   0}, {255, 146,   0}, {255, 146,   0}, {255, 145,   0}, {255, 144,   0},\n   {255, 143,   0}, {255, 142,   0}, {255, 141,   0}, {255, 140,   0}, {255, 139,   0},\n   {255, 138,   0}, {255, 137,   0}, {255, 136,   0}, {255, 135,   0}, {255, 134,   0},\n   {255, 133,   0}, {255, 132,   0}, {255, 131,   0}, {255, 130,   0}, {255, 129,   0},\n   {255, 128,   0}, {255, 127,   0}, {255, 126,   0}, {255, 125,   0}, {255, 124,   0},\n   {255, 123,   0}, {255, 122,   0}, {255, 121,   0}, {255, 120,   0}, {255, 119,   0},\n   {255, 118,   0}, {255, 117,   0}, {255, 116,   0}, {255, 115,   0}, {255, 114,   0},\n   {255, 113,   0}, {255, 112,   0}, {255, 111,   0}, {255, 109,   0}, {255, 108,   0},\n   {255, 107,   0}, {255, 106,   0}, {255, 105,   0}, {255, 104,   0}, {255, 103,   0},\n   {255, 102,   0}, {255, 101,   0}, {255, 100,   0}, {255,  99,   0}, {255,  98,   0},\n   {255,  97,   0}, {255,  96,   0}, {255,  95,   0}, {255,  94,   0}, {255,  93,   0},\n   {255,  92,   0}, {255,  91,   0}, {255,  91,   0}, {255,  90,   0}, {255,  90,   0},\n   {255,  89,   0}, {255,  88,   0}, {255,  88,   0}, {255,  87,   0}, {255,  86,   0},\n   {255,  86,   0}, {255,  85,   0}, {255,  84,   0}, {255,  83,   0}, {255,  82,   0},\n   {255,  81,   0}, {255,  80,   0}, {255,  79,   0}, {255,  78,   0}, {255,  77,   0},\n   {255,  76,   0}, {255,  75,   0}, {255,  
74,   0}, {255,  73,   0}, {255,  72,   0},\n   {255,  71,   0}, {255,  70,   0}, {255,  69,   0}, {255,  68,   0}, {255,  67,   0},\n   {255,  66,   0}, {255,  65,   0}, {255,  64,   0}, {255,  63,   0}, {255,  62,   0},\n   {255,  61,   0}, {255,  60,   0}, {255,  59,   0}, {255,  58,   0}, {255,  57,   0},\n   {255,  56,   0}, {255,  55,   0}, {255,  54,   0}, {255,  54,   0}, {255,  53,   0},\n   {255,  51,   0}, {255,  50,   0}, {255,  49,   0}, {255,  48,   0}, {255,  47,   0},\n   {255,  46,   0}, {255,  45,   0}, {255,  44,   0}, {255,  43,   0}, {255,  42,   0},\n   {255,  41,   0}, {255,  40,   0}, {255,  39,   0}, {255,  38,   0}, {255,  37,   0},\n   {255,  36,   0}, {255,  35,   0}, {255,  34,   0}, {255,  33,   0}, {255,  32,   0},\n   {255,  31,   0}, {255,  30,   0}, {255,  29,   0}, {255,  28,   0}, {255,  27,   0},\n   {255,  26,   0}, {255,  25,   0}, {255,  24,   0}, {254,  23,   0}, {254,  22,   0},\n   {254,  21,   0}, {254,  20,   0}, {254,  19,   0}, {254,  18,   0}, {253,  17,   0},\n   {251,  16,   0}, {250,  15,   0}, {249,  14,   0}, {248,  13,   0}, {247,  12,   0},\n   {246,  11,   0}, {244,  10,   0}, {243,   9,   0}, {242,   8,   0}, {241,   7,   0},\n   {240,   6,   0}, {239,   6,   0}, {239,   5,   0}, {238,   4,   0}, {237,   4,   0},\n   {236,   3,   0}, {235,   3,   0}, {234,   2,   0}, {234,   1,   0}, {233,   1,   0},\n   {232,   0,   0}, {231,   0,   0}, {230,   0,   0}, {228,   0,   0}, {227,   0,   0},\n   {226,   0,   0}, {225,   0,   0}, {223,   0,   0}, {222,   0,   0}, {221,   0,   0},\n   {219,   0,   0}, {218,   0,   0}, {217,   0,   0}, {216,   0,   0}, {214,   0,   0},\n   {213,   0,   0}, {212,   0,   0}, {211,   0,   0}, {209,   0,   0}, {208,   0,   0},\n   {207,   0,   0}, {206,   0,   0}, {204,   0,   0}, {203,   0,   0}, {202,   0,   0},\n   {201,   0,   0}, {200,   0,   0}, {198,   0,   0}, {197,   0,   0}, {196,   0,   0},\n   {195,   0,   0}, {194,   0,   0}, {193,   0,   0}, {191,   0,   0}, {190,   0,   
0},\n   {189,   0,   0}, {188,   0,   0}, {186,   0,   0}, {185,   0,   0}, {184,   0,   0},\n   {183,   0,   0}, {181,   0,   0}, {180,   0,   0}, {179,   0,   0}, {178,   0,   0},\n   {176,   0,   0}, {175,   0,   0}, {174,   0,   0}, {173,   0,   0}, {171,   0,   0},\n   {170,   0,   0}, {169,   0,   0}, {168,   0,   0}, {166,   0,   0}, {165,   0,   0},\n   {164,   0,   0}, {163,   0,   0}, {161,   0,   0}, {160,   0,   0}, {159,   0,   0},\n   {158,   0,   0}, {157,   0,   0}, {156,   0,   0}, {154,   0,   0}, {153,   0,   0},\n   {152,   0,   0}, {151,   0,   0}, {150,   0,   0}, {148,   0,   0}, {147,   0,   0},\n   {146,   0,   0}, {145,   0,   0}, {143,   0,   0}, {142,   0,   0}, {141,   0,   0},\n   {140,   0,   0}, {138,   0,   0}, {137,   0,   0}, {136,   0,   0}, {135,   0,   0},\n   {134,   0,   0}, {133,   0,   0}, {133,   0,   0}, {132,   0,   0}, {131,   0,   0},\n   {130,   0,   0}, {129,   0,   0}, {129,   0,   0}, {128,   0,   0}, {127,   0,   0},\n   {122,   0,   9}, {117,   0,  18}, {112,   0,  27}, {107,   0,  36}, {102,   0,  45}\n};\n\nconst rgb_t prism_colormap[1000] = {\n   {255,   0,   0}, {255,   2,   0}, {255,   4,   0}, {255,   6,   0}, {255,   8,   0},\n   {255,  10,   0}, {255,  11,   0}, {255,  13,   0}, {255,  15,   0}, {255,  17,   0},\n   {255,  19,   0}, {255,  21,   0}, {255,  23,   0}, {255,  25,   0}, {255,  27,   0},\n   {255,  29,   0}, {255,  31,   0}, {255,  33,   0}, {255,  34,   0}, {255,  36,   0},\n   {255,  38,   0}, {255,  40,   0}, {255,  42,   0}, {255,  44,   0}, {255,  46,   0},\n   {255,  48,   0}, {255,  50,   0}, {255,  52,   0}, {255,  54,   0}, {255,  56,   0},\n   {255,  57,   0}, {255,  59,   0}, {255,  61,   0}, {255,  63,   0}, {255,  65,   0},\n   {255,  67,   0}, {255,  69,   0}, {255,  71,   0}, {255,  73,   0}, {255,  75,   0},\n   {255,  77,   0}, {255,  78,   0}, {255,  80,   0}, {255,  82,   0}, {255,  84,   0},\n   {255,  86,   0}, {255,  88,   0}, {255,  90,   0}, {255,  92,   0}, {255,  94,  
 0},\n   {255,  96,   0}, {255,  98,   0}, {255, 100,   0}, {255, 101,   0}, {255, 103,   0},\n   {255, 105,   0}, {255, 107,   0}, {255, 109,   0}, {255, 111,   0}, {255, 113,   0},\n   {255, 115,   0}, {255, 117,   0}, {255, 119,   0}, {255, 121,   0}, {255, 123,   0},\n   {255, 124,   0}, {255, 126,   0}, {255, 128,   0}, {255, 130,   0}, {255, 132,   0},\n   {255, 134,   0}, {255, 136,   0}, {255, 138,   0}, {255, 140,   0}, {255, 142,   0},\n   {255, 144,   0}, {255, 145,   0}, {255, 147,   0}, {255, 149,   0}, {255, 151,   0},\n   {255, 153,   0}, {255, 155,   0}, {255, 157,   0}, {255, 159,   0}, {255, 161,   0},\n   {255, 163,   0}, {255, 165,   0}, {255, 167,   0}, {255, 168,   0}, {255, 170,   0},\n   {255, 172,   0}, {255, 174,   0}, {255, 176,   0}, {255, 178,   0}, {255, 180,   0},\n   {255, 182,   0}, {255, 184,   0}, {255, 186,   0}, {255, 188,   0}, {255, 190,   0},\n   {255, 191,   0}, {255, 193,   0}, {255, 195,   0}, {255, 197,   0}, {255, 199,   0},\n   {255, 201,   0}, {255, 203,   0}, {255, 205,   0}, {255, 207,   0}, {255, 209,   0},\n   {255, 211,   0}, {255, 212,   0}, {255, 214,   0}, {255, 216,   0}, {255, 218,   0},\n   {255, 220,   0}, {255, 222,   0}, {255, 224,   0}, {255, 226,   0}, {255, 228,   0},\n   {255, 230,   0}, {255, 232,   0}, {255, 234,   0}, {255, 235,   0}, {255, 237,   0},\n   {255, 239,   0}, {255, 241,   0}, {255, 243,   0}, {255, 245,   0}, {255, 247,   0},\n   {255, 249,   0}, {255, 251,   0}, {255, 253,   0}, {255, 255,   0}, {252, 255,   0},\n   {248, 255,   0}, {244, 255,   0}, {240, 255,   0}, {237, 255,   0}, {233, 255,   0},\n   {229, 255,   0}, {225, 255,   0}, {221, 255,   0}, {217, 255,   0}, {214, 255,   0},\n   {210, 255,   0}, {206, 255,   0}, {202, 255,   0}, {198, 255,   0}, {195, 255,   0},\n   {191, 255,   0}, {187, 255,   0}, {183, 255,   0}, {179, 255,   0}, {175, 255,   0},\n   {172, 255,   0}, {168, 255,   0}, {164, 255,   0}, {160, 255,   0}, {156, 255,   0},\n   {152, 255,   0}, {149, 255,   
0}, {145, 255,   0}, {141, 255,   0}, {137, 255,   0},\n   {133, 255,   0}, {129, 255,   0}, {126, 255,   0}, {122, 255,   0}, {118, 255,   0},\n   {114, 255,   0}, {110, 255,   0}, {106, 255,   0}, {103, 255,   0}, { 99, 255,   0},\n   { 95, 255,   0}, { 91, 255,   0}, { 87, 255,   0}, { 83, 255,   0}, { 80, 255,   0},\n   { 76, 255,   0}, { 72, 255,   0}, { 68, 255,   0}, { 64, 255,   0}, { 60, 255,   0},\n   { 57, 255,   0}, { 53, 255,   0}, { 49, 255,   0}, { 45, 255,   0}, { 41, 255,   0},\n   { 38, 255,   0}, { 34, 255,   0}, { 30, 255,   0}, { 26, 255,   0}, { 22, 255,   0},\n   { 18, 255,   0}, { 15, 255,   0}, { 11, 255,   0}, {  7, 255,   0}, {  3, 255,   0},\n   {  0, 254,   1}, {  0, 250,   5}, {  0, 247,   8}, {  0, 243,  12}, {  0, 239,  16},\n   {  0, 235,  20}, {  0, 231,  24}, {  0, 227,  28}, {  0, 224,  31}, {  0, 220,  35},\n   {  0, 216,  39}, {  0, 212,  43}, {  0, 208,  47}, {  0, 204,  51}, {  0, 201,  54},\n   {  0, 197,  58}, {  0, 193,  62}, {  0, 189,  66}, {  0, 185,  70}, {  0, 181,  74},\n   {  0, 178,  77}, {  0, 174,  81}, {  0, 170,  85}, {  0, 166,  89}, {  0, 162,  93},\n   {  0, 159,  96}, {  0, 155, 100}, {  0, 151, 104}, {  0, 147, 108}, {  0, 143, 112},\n   {  0, 139, 116}, {  0, 136, 119}, {  0, 132, 123}, {  0, 128, 127}, {  0, 124, 131},\n   {  0, 120, 135}, {  0, 116, 139}, {  0, 113, 142}, {  0, 109, 146}, {  0, 105, 150},\n   {  0, 101, 154}, {  0,  97, 158}, {  0,  93, 162}, {  0,  90, 165}, {  0,  86, 169},\n   {  0,  82, 173}, {  0,  78, 177}, {  0,  74, 181}, {  0,  70, 185}, {  0,  67, 188},\n   {  0,  63, 192}, {  0,  59, 196}, {  0,  55, 200}, {  0,  51, 204}, {  0,  47, 208},\n   {  0,  44, 211}, {  0,  40, 215}, {  0,  36, 219}, {  0,  32, 223}, {  0,  28, 227},\n   {  0,  25, 230}, {  0,  21, 234}, {  0,  17, 238}, {  0,  13, 242}, {  0,   9, 246},\n   {  0,   5, 250}, {  0,   2, 253}, {  2,   0, 255}, {  4,   0, 255}, {  7,   0, 255},\n   {  9,   0, 255}, { 12,   0, 255}, { 14,   0, 255}, { 17,   0, 255}, { 
19,   0, 255},\n   { 22,   0, 255}, { 25,   0, 255}, { 27,   0, 255}, { 30,   0, 255}, { 32,   0, 255},\n   { 35,   0, 255}, { 37,   0, 255}, { 40,   0, 255}, { 42,   0, 255}, { 45,   0, 255},\n   { 47,   0, 255}, { 50,   0, 255}, { 53,   0, 255}, { 55,   0, 255}, { 58,   0, 255},\n   { 60,   0, 255}, { 63,   0, 255}, { 65,   0, 255}, { 68,   0, 255}, { 70,   0, 255},\n   { 73,   0, 255}, { 76,   0, 255}, { 78,   0, 255}, { 81,   0, 255}, { 83,   0, 255},\n   { 86,   0, 255}, { 88,   0, 255}, { 91,   0, 255}, { 93,   0, 255}, { 96,   0, 255},\n   { 99,   0, 255}, {101,   0, 255}, {104,   0, 255}, {106,   0, 255}, {109,   0, 255},\n   {111,   0, 255}, {114,   0, 255}, {116,   0, 255}, {119,   0, 255}, {122,   0, 255},\n   {124,   0, 255}, {127,   0, 255}, {129,   0, 255}, {132,   0, 255}, {134,   0, 255},\n   {137,   0, 255}, {139,   0, 255}, {142,   0, 255}, {144,   0, 255}, {147,   0, 255},\n   {150,   0, 255}, {152,   0, 255}, {155,   0, 255}, {157,   0, 255}, {160,   0, 255},\n   {162,   0, 255}, {165,   0, 255}, {167,   0, 255}, {170,   0, 255}, {171,   0, 251},\n   {173,   0, 247}, {174,   0, 244}, {175,   0, 240}, {176,   0, 236}, {178,   0, 232},\n   {179,   0, 228}, {180,   0, 224}, {181,   0, 221}, {183,   0, 217}, {184,   0, 213},\n   {185,   0, 209}, {187,   0, 205}, {188,   0, 201}, {189,   0, 198}, {190,   0, 194},\n   {192,   0, 190}, {193,   0, 186}, {194,   0, 182}, {196,   0, 178}, {197,   0, 175},\n   {198,   0, 171}, {199,   0, 167}, {201,   0, 163}, {202,   0, 159}, {203,   0, 155},\n   {204,   0, 152}, {206,   0, 148}, {207,   0, 144}, {208,   0, 140}, {210,   0, 136},\n   {211,   0, 132}, {212,   0, 129}, {213,   0, 125}, {215,   0, 121}, {216,   0, 117},\n   {217,   0, 113}, {218,   0, 110}, {220,   0, 106}, {221,   0, 102}, {222,   0,  98},\n   {224,   0,  94}, {225,   0,  90}, {226,   0,  87}, {227,   0,  83}, {229,   0,  79},\n   {230,   0,  75}, {231,   0,  71}, {233,   0,  67}, {234,   0,  64}, {235,   0,  60},\n   {236,   0,  56}, {238, 
  0,  52}, {239,   0,  48}, {240,   0,  44}, {241,   0,  41},\n   {243,   0,  37}, {244,   0,  33}, {245,   0,  29}, {247,   0,  25}, {248,   0,  21},\n   {249,   0,  18}, {250,   0,  14}, {252,   0,  10}, {253,   0,   6}, {254,   0,   2},\n   {255,   1,   0}, {255,   3,   0}, {255,   5,   0}, {255,   7,   0}, {255,   8,   0},\n   {255,  10,   0}, {255,  12,   0}, {255,  14,   0}, {255,  16,   0}, {255,  18,   0},\n   {255,  20,   0}, {255,  22,   0}, {255,  24,   0}, {255,  26,   0}, {255,  28,   0},\n   {255,  29,   0}, {255,  31,   0}, {255,  33,   0}, {255,  35,   0}, {255,  37,   0},\n   {255,  39,   0}, {255,  41,   0}, {255,  43,   0}, {255,  45,   0}, {255,  47,   0},\n   {255,  49,   0}, {255,  51,   0}, {255,  52,   0}, {255,  54,   0}, {255,  56,   0},\n   {255,  58,   0}, {255,  60,   0}, {255,  62,   0}, {255,  64,   0}, {255,  66,   0},\n   {255,  68,   0}, {255,  70,   0}, {255,  72,   0}, {255,  74,   0}, {255,  75,   0},\n   {255,  77,   0}, {255,  79,   0}, {255,  81,   0}, {255,  83,   0}, {255,  85,   0},\n   {255,  87,   0}, {255,  89,   0}, {255,  91,   0}, {255,  93,   0}, {255,  95,   0},\n   {255,  96,   0}, {255,  98,   0}, {255, 100,   0}, {255, 102,   0}, {255, 104,   0},\n   {255, 106,   0}, {255, 108,   0}, {255, 110,   0}, {255, 112,   0}, {255, 114,   0},\n   {255, 116,   0}, {255, 118,   0}, {255, 119,   0}, {255, 121,   0}, {255, 123,   0},\n   {255, 125,   0}, {255, 127,   0}, {255, 129,   0}, {255, 131,   0}, {255, 133,   0},\n   {255, 135,   0}, {255, 137,   0}, {255, 139,   0}, {255, 141,   0}, {255, 142,   0},\n   {255, 144,   0}, {255, 146,   0}, {255, 148,   0}, {255, 150,   0}, {255, 152,   0},\n   {255, 154,   0}, {255, 156,   0}, {255, 158,   0}, {255, 160,   0}, {255, 162,   0},\n   {255, 163,   0}, {255, 165,   0}, {255, 167,   0}, {255, 169,   0}, {255, 171,   0},\n   {255, 173,   0}, {255, 175,   0}, {255, 177,   0}, {255, 179,   0}, {255, 181,   0},\n   {255, 183,   0}, {255, 185,   0}, {255, 186,   0}, {255, 188,   
0}, {255, 190,   0},\n   {255, 192,   0}, {255, 194,   0}, {255, 196,   0}, {255, 198,   0}, {255, 200,   0},\n   {255, 202,   0}, {255, 204,   0}, {255, 206,   0}, {255, 208,   0}, {255, 209,   0},\n   {255, 211,   0}, {255, 213,   0}, {255, 215,   0}, {255, 217,   0}, {255, 219,   0},\n   {255, 221,   0}, {255, 223,   0}, {255, 225,   0}, {255, 227,   0}, {255, 229,   0},\n   {255, 230,   0}, {255, 232,   0}, {255, 234,   0}, {255, 236,   0}, {255, 238,   0},\n   {255, 240,   0}, {255, 242,   0}, {255, 244,   0}, {255, 246,   0}, {255, 248,   0},\n   {255, 250,   0}, {255, 252,   0}, {255, 253,   0}, {254, 255,   0}, {250, 255,   0},\n   {247, 255,   0}, {243, 255,   0}, {239, 255,   0}, {235, 255,   0}, {231, 255,   0},\n   {227, 255,   0}, {224, 255,   0}, {220, 255,   0}, {216, 255,   0}, {212, 255,   0},\n   {208, 255,   0}, {204, 255,   0}, {201, 255,   0}, {197, 255,   0}, {193, 255,   0},\n   {189, 255,   0}, {185, 255,   0}, {181, 255,   0}, {178, 255,   0}, {174, 255,   0},\n   {170, 255,   0}, {166, 255,   0}, {162, 255,   0}, {159, 255,   0}, {155, 255,   0},\n   {151, 255,   0}, {147, 255,   0}, {143, 255,   0}, {139, 255,   0}, {136, 255,   0},\n   {132, 255,   0}, {128, 255,   0}, {124, 255,   0}, {120, 255,   0}, {116, 255,   0},\n   {113, 255,   0}, {109, 255,   0}, {105, 255,   0}, {101, 255,   0}, { 97, 255,   0},\n   { 93, 255,   0}, { 90, 255,   0}, { 86, 255,   0}, { 82, 255,   0}, { 78, 255,   0},\n   { 74, 255,   0}, { 70, 255,   0}, { 67, 255,   0}, { 63, 255,   0}, { 59, 255,   0},\n   { 55, 255,   0}, { 51, 255,   0}, { 47, 255,   0}, { 44, 255,   0}, { 40, 255,   0},\n   { 36, 255,   0}, { 32, 255,   0}, { 28, 255,   0}, { 25, 255,   0}, { 21, 255,   0},\n   { 17, 255,   0}, { 13, 255,   0}, {  9, 255,   0}, {  5, 255,   0}, {  2, 255,   0},\n   {  0, 253,   2}, {  0, 249,   6}, {  0, 245,  10}, {  0, 241,  14}, {  0, 237,  18},\n   {  0, 234,  21}, {  0, 230,  25}, {  0, 226,  29}, {  0, 222,  33}, {  0, 218,  37},\n   {  0, 214,  41}, 
{  0, 211,  44}, {  0, 207,  48}, {  0, 203,  52}, {  0, 199,  56},\n   {  0, 195,  60}, {  0, 191,  64}, {  0, 188,  67}, {  0, 184,  71}, {  0, 180,  75},\n   {  0, 176,  79}, {  0, 172,  83}, {  0, 168,  87}, {  0, 165,  90}, {  0, 161,  94},\n   {  0, 157,  98}, {  0, 153, 102}, {  0, 149, 106}, {  0, 145, 110}, {  0, 142, 113},\n   {  0, 138, 117}, {  0, 134, 121}, {  0, 130, 125}, {  0, 126, 129}, {  0, 123, 132},\n   {  0, 119, 136}, {  0, 115, 140}, {  0, 111, 144}, {  0, 107, 148}, {  0, 103, 152},\n   {  0, 100, 155}, {  0,  96, 159}, {  0,  92, 163}, {  0,  88, 167}, {  0,  84, 171},\n   {  0,  80, 175}, {  0,  77, 178}, {  0,  73, 182}, {  0,  69, 186}, {  0,  65, 190},\n   {  0,  61, 194}, {  0,  57, 198}, {  0,  54, 201}, {  0,  50, 205}, {  0,  46, 209},\n   {  0,  42, 213}, {  0,  38, 217}, {  0,  34, 221}, {  0,  31, 224}, {  0,  27, 228},\n   {  0,  23, 232}, {  0,  19, 236}, {  0,  15, 240}, {  0,  11, 244}, {  0,   8, 247},\n   {  0,   4, 251}, {  0,   0, 255}, {  3,   0, 255}, {  5,   0, 255}, {  8,   0, 255},\n   { 10,   0, 255}, { 13,   0, 255}, { 15,   0, 255}, { 18,   0, 255}, { 20,   0, 255},\n   { 23,   0, 255}, { 26,   0, 255}, { 28,   0, 255}, { 31,   0, 255}, { 33,   0, 255},\n   { 36,   0, 255}, { 38,   0, 255}, { 41,   0, 255}, { 43,   0, 255}, { 46,   0, 255},\n   { 48,   0, 255}, { 51,   0, 255}, { 54,   0, 255}, { 56,   0, 255}, { 59,   0, 255},\n   { 61,   0, 255}, { 64,   0, 255}, { 66,   0, 255}, { 69,   0, 255}, { 71,   0, 255},\n   { 74,   0, 255}, { 77,   0, 255}, { 79,   0, 255}, { 82,   0, 255}, { 84,   0, 255},\n   { 87,   0, 255}, { 89,   0, 255}, { 92,   0, 255}, { 94,   0, 255}, { 97,   0, 255},\n   {100,   0, 255}, {102,   0, 255}, {105,   0, 255}, {107,   0, 255}, {110,   0, 255},\n   {112,   0, 255}, {115,   0, 255}, {117,   0, 255}, {120,   0, 255}, {123,   0, 255},\n   {125,   0, 255}, {128,   0, 255}, {130,   0, 255}, {133,   0, 255}, {135,   0, 255},\n   {138,   0, 255}, {140,   0, 255}, {143,   0, 255}, {145,   
0, 255}, {148,   0, 255},\n   {151,   0, 255}, {153,   0, 255}, {156,   0, 255}, {158,   0, 255}, {161,   0, 255},\n   {163,   0, 255}, {166,   0, 255}, {168,   0, 255}, {171,   0, 253}, {172,   0, 250},\n   {173,   0, 246}, {174,   0, 242}, {176,   0, 238}, {177,   0, 234}, {178,   0, 230},\n   {179,   0, 227}, {181,   0, 223}, {182,   0, 219}, {183,   0, 215}, {185,   0, 211},\n   {186,   0, 208}, {187,   0, 204}, {188,   0, 200}, {190,   0, 196}, {191,   0, 192},\n   {192,   0, 188}, {193,   0, 185}, {195,   0, 181}, {196,   0, 177}, {197,   0, 173},\n   {199,   0, 169}, {200,   0, 165}, {201,   0, 162}, {202,   0, 158}, {204,   0, 154},\n   {205,   0, 150}, {206,   0, 146}, {208,   0, 142}, {209,   0, 139}, {210,   0, 135},\n   {211,   0, 131}, {213,   0, 127}, {214,   0, 123}, {215,   0, 119}, {216,   0, 116},\n   {218,   0, 112}, {219,   0, 108}, {220,   0, 104}, {222,   0, 100}, {223,   0,  96},\n   {224,   0,  93}, {225,   0,  89}, {227,   0,  85}, {228,   0,  81}, {229,   0,  77},\n   {230,   0,  74}, {232,   0,  70}, {233,   0,  66}, {234,   0,  62}, {236,   0,  58},\n   {237,   0,  54}, {238,   0,  51}, {239,   0,  47}, {241,   0,  43}, {242,   0,  39},\n   {243,   0,  35}, {245,   0,  31}, {246,   0,  28}, {247,   0,  24}, {248,   0,  20},\n   {250,   0,  16}, {251,   0,  12}, {252,   0,   8}, {253,   0,   5}, {255,   0,   1},\n   {255,   2,   0}, {255,   3,   0}, {255,   5,   0}, {255,   7,   0}, {255,   9,   0},\n   {255,  11,   0}, {255,  13,   0}, {255,  15,   0}, {255,  17,   0}, {255,  19,   0},\n   {255,  21,   0}, {255,  23,   0}, {255,  25,   0}, {255,  26,   0}, {255,  28,   0},\n   {255,  30,   0}, {255,  32,   0}, {255,  34,   0}, {255,  36,   0}, {255,  38,   0},\n   {255,  40,   0}, {255,  42,   0}, {255,  44,   0}, {255,  46,   0}, {255,  47,   0},\n   {255,  49,   0}, {255,  51,   0}, {255,  53,   0}, {255,  55,   0}, {255,  57,   0},\n   {255,  59,   0}, {255,  61,   0}, {255,  63,   0}, {255,  65,   0}, {255,  67,   0},\n   {255,  69,  
 0}, {255,  70,   0}, {255,  72,   0}, {255,  74,   0}, {255,  76,   0},\n   {255,  78,   0}, {255,  80,   0}, {255,  82,   0}, {255,  84,   0}, {255,  86,   0},\n   {255,  88,   0}, {255,  90,   0}, {255,  92,   0}, {255,  93,   0}, {255,  95,   0},\n   {255,  97,   0}, {255,  99,   0}, {255, 101,   0}, {255, 103,   0}, {255, 105,   0},\n   {255, 107,   0}, {255, 109,   0}, {255, 111,   0}, {255, 113,   0}, {255, 114,   0},\n   {255, 116,   0}, {255, 118,   0}, {255, 120,   0}, {255, 122,   0}, {255, 124,   0},\n   {255, 126,   0}, {255, 128,   0}, {255, 130,   0}, {255, 132,   0}, {255, 134,   0},\n   {255, 136,   0}, {255, 137,   0}, {255, 139,   0}, {255, 141,   0}, {255, 143,   0},\n   {255, 145,   0}, {255, 147,   0}, {255, 149,   0}, {255, 151,   0}, {255, 153,   0},\n   {255, 155,   0}, {255, 157,   0}, {255, 159,   0}, {255, 160,   0}, {255, 162,   0},\n   {255, 164,   0}, {255, 166,   0}, {255, 168,   0}, {255, 170,   0}, {255, 172,   0},\n   {255, 174,   0}, {255, 176,   0}, {255, 178,   0}, {255, 180,   0}, {255, 181,   0},\n   {255, 183,   0}, {255, 185,   0}, {255, 187,   0}, {255, 189,   0}, {255, 191,   0},\n   {255, 193,   0}, {255, 195,   0}, {255, 197,   0}, {255, 199,   0}, {255, 201,   0},\n   {255, 203,   0}, {255, 204,   0}, {255, 206,   0}, {255, 208,   0}, {255, 210,   0},\n   {255, 212,   0}, {255, 214,   0}, {255, 216,   0}, {255, 218,   0}, {255, 220,   0},\n   {255, 222,   0}, {255, 224,   0}, {255, 226,   0}, {255, 227,   0}, {255, 229,   0},\n   {255, 231,   0}, {255, 233,   0}, {255, 235,   0}, {255, 237,   0}, {255, 239,   0},\n   {255, 241,   0}, {255, 243,   0}, {255, 245,   0}, {255, 247,   0}, {255, 248,   0},\n   {255, 250,   0}, {255, 252,   0}, {255, 254,   0}, {253, 255,   0}, {249, 255,   0},\n   {245, 255,   0}, {241, 255,   0}, {237, 255,   0}, {234, 255,   0}, {230, 255,   0},\n   {226, 255,   0}, {222, 255,   0}, {218, 255,   0}, {214, 255,   0}, {211, 255,   0},\n   {207, 255,   0}, {203, 255,   0}, {199, 255,   0}, 
{195, 255,   0}, {191, 255,   0},\n   {188, 255,   0}, {184, 255,   0}, {180, 255,   0}, {176, 255,   0}, {172, 255,   0},\n   {168, 255,   0}, {165, 255,   0}, {161, 255,   0}, {157, 255,   0}, {153, 255,   0},\n   {149, 255,   0}, {145, 255,   0}, {142, 255,   0}, {138, 255,   0}, {134, 255,   0},\n   {130, 255,   0}, {126, 255,   0}, {123, 255,   0}, {119, 255,   0}, {115, 255,   0},\n   {111, 255,   0}, {107, 255,   0}, {103, 255,   0}, {100, 255,   0}, { 96, 255,   0},\n   { 92, 255,   0}, { 88, 255,   0}, { 84, 255,   0}, { 80, 255,   0}, { 77, 255,   0},\n   { 73, 255,   0}, { 69, 255,   0}, { 65, 255,   0}, { 61, 255,   0}, { 57, 255,   0},\n   { 54, 255,   0}, { 50, 255,   0}, { 46, 255,   0}, { 42, 255,   0}, { 38, 255,   0},\n   { 34, 255,   0}, { 31, 255,   0}, { 27, 255,   0}, { 23, 255,   0}, { 19, 255,   0},\n   { 15, 255,   0}, { 11, 255,   0}, {  8, 255,   0}, {  4, 255,   0}, {  0, 255,   0}\n};\n\nconst rgb_t vga_colormap[1000] = {\n   {255, 255, 255}, {254, 254, 254}, {253, 253, 253}, {252, 252, 252}, {251, 251, 251},\n   {250, 250, 250}, {249, 249, 249}, {248, 248, 248}, {247, 247, 247}, {246, 246, 246},\n   {245, 245, 245}, {244, 244, 244}, {244, 244, 244}, {243, 243, 243}, {242, 242, 242},\n   {241, 241, 241}, {240, 240, 240}, {239, 239, 239}, {238, 238, 238}, {237, 237, 237},\n   {236, 236, 236}, {235, 235, 235}, {234, 234, 234}, {233, 233, 233}, {232, 232, 232},\n   {231, 231, 231}, {230, 230, 230}, {229, 229, 229}, {228, 228, 228}, {227, 227, 227},\n   {226, 226, 226}, {225, 225, 225}, {224, 224, 224}, {223, 223, 223}, {222, 222, 222},\n   {221, 221, 221}, {221, 221, 221}, {220, 220, 220}, {219, 219, 219}, {218, 218, 218},\n   {217, 217, 217}, {216, 216, 216}, {215, 215, 215}, {214, 214, 214}, {213, 213, 213},\n   {212, 212, 212}, {211, 211, 211}, {210, 210, 210}, {209, 209, 209}, {208, 208, 208},\n   {207, 207, 207}, {206, 206, 206}, {205, 205, 205}, {204, 204, 204}, {203, 203, 203},\n   {202, 202, 202}, {201, 201, 201}, {200, 200, 200}, 
{199, 199, 199}, {199, 199, 199},\n   {198, 198, 198}, {197, 197, 197}, {196, 196, 196}, {195, 195, 195}, {194, 194, 194},\n   {193, 193, 193}, {192, 192, 192}, {192, 190, 190}, {193, 187, 187}, {194, 184, 184},\n   {195, 181, 181}, {195, 179, 179}, {196, 176, 176}, {197, 173, 173}, {198, 170, 170},\n   {199, 167, 167}, {200, 164, 164}, {201, 161, 161}, {202, 159, 159}, {203, 156, 156},\n   {204, 153, 153}, {205, 150, 150}, {206, 147, 147}, {207, 144, 144}, {208, 141, 141},\n   {209, 138, 138}, {210, 136, 136}, {211, 133, 133}, {212, 130, 130}, {213, 127, 127},\n   {214, 124, 124}, {215, 121, 121}, {216, 118, 118}, {217, 115, 115}, {217, 113, 113},\n   {218, 110, 110}, {219, 107, 107}, {220, 104, 104}, {221, 101, 101}, {222,  98,  98},\n   {223,  95,  95}, {224,  92,  92}, {225,  90,  90}, {226,  87,  87}, {227,  84,  84},\n   {228,  81,  81}, {229,  78,  78}, {230,  75,  75}, {231,  72,  72}, {232,  69,  69},\n   {233,  67,  67}, {234,  64,  64}, {235,  61,  61}, {236,  58,  58}, {237,  55,  55},\n   {238,  52,  52}, {239,  49,  49}, {239,  47,  47}, {240,  44,  44}, {241,  41,  41},\n   {242,  38,  38}, {243,  35,  35}, {244,  32,  32}, {245,  29,  29}, {246,  26,  26},\n   {247,  24,  24}, {248,  21,  21}, {249,  18,  18}, {250,  15,  15}, {251,  12,  12},\n   {252,   9,   9}, {253,   6,   6}, {254,   3,   3}, {255,   1,   1}, {255,   3,   0},\n   {255,   7,   0}, {255,  11,   0}, {255,  15,   0}, {255,  18,   0}, {255,  22,   0},\n   {255,  26,   0}, {255,  30,   0}, {255,  34,   0}, {255,  38,   0}, {255,  41,   0},\n   {255,  45,   0}, {255,  49,   0}, {255,  53,   0}, {255,  57,   0}, {255,  60,   0},\n   {255,  64,   0}, {255,  68,   0}, {255,  72,   0}, {255,  76,   0}, {255,  80,   0},\n   {255,  83,   0}, {255,  87,   0}, {255,  91,   0}, {255,  95,   0}, {255,  99,   0},\n   {255, 103,   0}, {255, 106,   0}, {255, 110,   0}, {255, 114,   0}, {255, 118,   0},\n   {255, 122,   0}, {255, 126,   0}, {255, 129,   0}, {255, 133,   0}, {255, 137,   0},\n   
{255, 141,   0}, {255, 145,   0}, {255, 149,   0}, {255, 152,   0}, {255, 156,   0},\n   {255, 160,   0}, {255, 164,   0}, {255, 168,   0}, {255, 172,   0}, {255, 175,   0},\n   {255, 179,   0}, {255, 183,   0}, {255, 187,   0}, {255, 191,   0}, {255, 195,   0},\n   {255, 198,   0}, {255, 202,   0}, {255, 206,   0}, {255, 210,   0}, {255, 214,   0},\n   {255, 217,   0}, {255, 221,   0}, {255, 225,   0}, {255, 229,   0}, {255, 233,   0},\n   {255, 237,   0}, {255, 240,   0}, {255, 244,   0}, {255, 248,   0}, {255, 252,   0},\n   {254, 255,   0}, {250, 255,   0}, {247, 255,   0}, {243, 255,   0}, {239, 255,   0},\n   {235, 255,   0}, {231, 255,   0}, {227, 255,   0}, {224, 255,   0}, {220, 255,   0},\n   {216, 255,   0}, {212, 255,   0}, {208, 255,   0}, {204, 255,   0}, {201, 255,   0},\n   {197, 255,   0}, {193, 255,   0}, {189, 255,   0}, {185, 255,   0}, {181, 255,   0},\n   {178, 255,   0}, {174, 255,   0}, {170, 255,   0}, {166, 255,   0}, {162, 255,   0},\n   {159, 255,   0}, {155, 255,   0}, {151, 255,   0}, {147, 255,   0}, {143, 255,   0},\n   {139, 255,   0}, {136, 255,   0}, {132, 255,   0}, {128, 255,   0}, {124, 255,   0},\n   {120, 255,   0}, {116, 255,   0}, {113, 255,   0}, {109, 255,   0}, {105, 255,   0},\n   {101, 255,   0}, { 97, 255,   0}, { 93, 255,   0}, { 90, 255,   0}, { 86, 255,   0},\n   { 82, 255,   0}, { 78, 255,   0}, { 74, 255,   0}, { 70, 255,   0}, { 67, 255,   0},\n   { 63, 255,   0}, { 59, 255,   0}, { 55, 255,   0}, { 51, 255,   0}, { 47, 255,   0},\n   { 44, 255,   0}, { 40, 255,   0}, { 36, 255,   0}, { 32, 255,   0}, { 28, 255,   0},\n   { 25, 255,   0}, { 21, 255,   0}, { 17, 255,   0}, { 13, 255,   0}, {  9, 255,   0},\n   {  5, 255,   0}, {  2, 255,   0}, {  0, 255,   2}, {  0, 255,   6}, {  0, 255,  10},\n   {  0, 255,  14}, {  0, 255,  18}, {  0, 255,  21}, {  0, 255,  25}, {  0, 255,  29},\n   {  0, 255,  33}, {  0, 255,  37}, {  0, 255,  41}, {  0, 255,  44}, {  0, 255,  48},\n   {  0, 255,  52}, {  0, 255,  56}, {  0, 
255,  60}, {  0, 255,  64}, {  0, 255,  67},\n   {  0, 255,  71}, {  0, 255,  75}, {  0, 255,  79}, {  0, 255,  83}, {  0, 255,  87},\n   {  0, 255,  90}, {  0, 255,  94}, {  0, 255,  98}, {  0, 255, 102}, {  0, 255, 106},\n   {  0, 255, 110}, {  0, 255, 113}, {  0, 255, 117}, {  0, 255, 121}, {  0, 255, 125},\n   {  0, 255, 129}, {  0, 255, 132}, {  0, 255, 136}, {  0, 255, 140}, {  0, 255, 144},\n   {  0, 255, 148}, {  0, 255, 152}, {  0, 255, 155}, {  0, 255, 159}, {  0, 255, 163},\n   {  0, 255, 167}, {  0, 255, 171}, {  0, 255, 175}, {  0, 255, 178}, {  0, 255, 182},\n   {  0, 255, 186}, {  0, 255, 190}, {  0, 255, 194}, {  0, 255, 198}, {  0, 255, 201},\n   {  0, 255, 205}, {  0, 255, 209}, {  0, 255, 213}, {  0, 255, 217}, {  0, 255, 221},\n   {  0, 255, 224}, {  0, 255, 228}, {  0, 255, 232}, {  0, 255, 236}, {  0, 255, 240},\n   {  0, 255, 244}, {  0, 255, 247}, {  0, 255, 251}, {  0, 255, 255}, {  0, 251, 255},\n   {  0, 247, 255}, {  0, 244, 255}, {  0, 240, 255}, {  0, 236, 255}, {  0, 232, 255},\n   {  0, 228, 255}, {  0, 224, 255}, {  0, 221, 255}, {  0, 217, 255}, {  0, 213, 255},\n   {  0, 209, 255}, {  0, 205, 255}, {  0, 201, 255}, {  0, 198, 255}, {  0, 194, 255},\n   {  0, 190, 255}, {  0, 186, 255}, {  0, 182, 255}, {  0, 178, 255}, {  0, 175, 255},\n   {  0, 171, 255}, {  0, 167, 255}, {  0, 163, 255}, {  0, 159, 255}, {  0, 155, 255},\n   {  0, 152, 255}, {  0, 148, 255}, {  0, 144, 255}, {  0, 140, 255}, {  0, 136, 255},\n   {  0, 132, 255}, {  0, 129, 255}, {  0, 125, 255}, {  0, 121, 255}, {  0, 117, 255},\n   {  0, 113, 255}, {  0, 110, 255}, {  0, 106, 255}, {  0, 102, 255}, {  0,  98, 255},\n   {  0,  94, 255}, {  0,  90, 255}, {  0,  87, 255}, {  0,  83, 255}, {  0,  79, 255},\n   {  0,  75, 255}, {  0,  71, 255}, {  0,  67, 255}, {  0,  64, 255}, {  0,  60, 255},\n   {  0,  56, 255}, {  0,  52, 255}, {  0,  48, 255}, {  0,  44, 255}, {  0,  41, 255},\n   {  0,  37, 255}, {  0,  33, 255}, {  0,  29, 255}, {  0,  25, 255}, {  0,  21, 
255},\n   {  0,  18, 255}, {  0,  14, 255}, {  0,  10, 255}, {  0,   6, 255}, {  0,   2, 255},\n   {  2,   0, 255}, {  5,   0, 255}, {  9,   0, 255}, { 13,   0, 255}, { 17,   0, 255},\n   { 21,   0, 255}, { 25,   0, 255}, { 28,   0, 255}, { 32,   0, 255}, { 36,   0, 255},\n   { 40,   0, 255}, { 44,   0, 255}, { 47,   0, 255}, { 51,   0, 255}, { 55,   0, 255},\n   { 59,   0, 255}, { 63,   0, 255}, { 67,   0, 255}, { 70,   0, 255}, { 74,   0, 255},\n   { 78,   0, 255}, { 82,   0, 255}, { 86,   0, 255}, { 90,   0, 255}, { 93,   0, 255},\n   { 97,   0, 255}, {101,   0, 255}, {105,   0, 255}, {109,   0, 255}, {113,   0, 255},\n   {116,   0, 255}, {120,   0, 255}, {124,   0, 255}, {128,   0, 255}, {132,   0, 255},\n   {136,   0, 255}, {139,   0, 255}, {143,   0, 255}, {147,   0, 255}, {151,   0, 255},\n   {155,   0, 255}, {159,   0, 255}, {162,   0, 255}, {166,   0, 255}, {170,   0, 255},\n   {174,   0, 255}, {178,   0, 255}, {181,   0, 255}, {185,   0, 255}, {189,   0, 255},\n   {193,   0, 255}, {197,   0, 255}, {201,   0, 255}, {204,   0, 255}, {208,   0, 255},\n   {212,   0, 255}, {216,   0, 255}, {220,   0, 255}, {224,   0, 255}, {227,   0, 255},\n   {231,   0, 255}, {235,   0, 255}, {239,   0, 255}, {243,   0, 255}, {247,   0, 255},\n   {250,   0, 255}, {254,   0, 255}, {252,   0, 252}, {248,   0, 248}, {244,   0, 244},\n   {240,   0, 240}, {237,   0, 237}, {233,   0, 233}, {229,   0, 229}, {225,   0, 225},\n   {221,   0, 221}, {217,   0, 217}, {214,   0, 214}, {210,   0, 210}, {206,   0, 206},\n   {202,   0, 202}, {198,   0, 198}, {195,   0, 195}, {191,   0, 191}, {187,   0, 187},\n   {183,   0, 183}, {179,   0, 179}, {175,   0, 175}, {172,   0, 172}, {168,   0, 168},\n   {164,   0, 164}, {160,   0, 160}, {156,   0, 156}, {152,   0, 152}, {149,   0, 149},\n   {145,   0, 145}, {141,   0, 141}, {137,   0, 137}, {133,   0, 133}, {129,   0, 129},\n   {126,   0, 126}, {122,   0, 122}, {118,   0, 118}, {114,   0, 114}, {110,   0, 110},\n   {106,   0, 106}, {103,   0, 
103}, { 99,   0,  99}, { 95,   0,  95}, { 91,   0,  91},\n   { 87,   0,  87}, { 83,   0,  83}, { 80,   0,  80}, { 76,   0,  76}, { 72,   0,  72},\n   { 68,   0,  68}, { 64,   0,  64}, { 60,   0,  60}, { 57,   0,  57}, { 53,   0,  53},\n   { 49,   0,  49}, { 45,   0,  45}, { 41,   0,  41}, { 38,   0,  38}, { 34,   0,  34},\n   { 30,   0,  30}, { 26,   0,  26}, { 22,   0,  22}, { 18,   0,  18}, { 15,   0,  15},\n   { 11,   0,  11}, {  7,   0,   7}, {  3,   0,   3}, {  0,   0,   0}, {  2,   2,   2},\n   {  4,   4,   4}, {  6,   6,   6}, {  8,   8,   8}, { 10,  10,  10}, { 12,  12,  12},\n   { 14,  14,  14}, { 16,  16,  16}, { 18,  18,  18}, { 20,  20,  20}, { 21,  21,  21},\n   { 23,  23,  23}, { 25,  25,  25}, { 27,  27,  27}, { 29,  29,  29}, { 31,  31,  31},\n   { 33,  33,  33}, { 35,  35,  35}, { 37,  37,  37}, { 39,  39,  39}, { 41,  41,  41},\n   { 43,  43,  43}, { 44,  44,  44}, { 46,  46,  46}, { 48,  48,  48}, { 50,  50,  50},\n   { 52,  52,  52}, { 54,  54,  54}, { 56,  56,  56}, { 58,  58,  58}, { 60,  60,  60},\n   { 62,  62,  62}, { 64,  64,  64}, { 65,  65,  65}, { 67,  67,  67}, { 69,  69,  69},\n   { 71,  71,  71}, { 73,  73,  73}, { 75,  75,  75}, { 77,  77,  77}, { 79,  79,  79},\n   { 81,  81,  81}, { 83,  83,  83}, { 85,  85,  85}, { 87,  87,  87}, { 88,  88,  88},\n   { 90,  90,  90}, { 92,  92,  92}, { 94,  94,  94}, { 96,  96,  96}, { 98,  98,  98},\n   {100, 100, 100}, {102, 102, 102}, {104, 104, 104}, {106, 106, 106}, {108, 108, 108},\n   {110, 110, 110}, {111, 111, 111}, {113, 113, 113}, {115, 115, 115}, {117, 117, 117},\n   {119, 119, 119}, {121, 121, 121}, {123, 123, 123}, {125, 125, 125}, {127, 127, 127},\n   {128, 126, 126}, {128, 124, 124}, {128, 123, 123}, {128, 121, 121}, {128, 119, 119},\n   {128, 117, 117}, {128, 115, 115}, {128, 113, 113}, {128, 111, 111}, {128, 109, 109},\n   {128, 107, 107}, {128, 105, 105}, {128, 103, 103}, {128, 101, 101}, {128, 100, 100},\n   {128,  98,  98}, {128,  96,  96}, {128,  94,  94}, {128,  92,  92}, 
{128,  90,  90},\n   {128,  88,  88}, {128,  86,  86}, {128,  84,  84}, {128,  82,  82}, {128,  80,  80},\n   {128,  78,  78}, {128,  77,  77}, {128,  75,  75}, {128,  73,  73}, {128,  71,  71},\n   {128,  69,  69}, {128,  67,  67}, {128,  65,  65}, {128,  63,  63}, {128,  61,  61},\n   {128,  59,  59}, {128,  57,  57}, {128,  56,  56}, {128,  54,  54}, {128,  52,  52},\n   {128,  50,  50}, {128,  48,  48}, {128,  46,  46}, {128,  44,  44}, {128,  42,  42},\n   {128,  40,  40}, {128,  38,  38}, {128,  36,  36}, {128,  34,  34}, {128,  33,  33},\n   {128,  31,  31}, {128,  29,  29}, {128,  27,  27}, {128,  25,  25}, {128,  23,  23},\n   {128,  21,  21}, {128,  19,  19}, {128,  17,  17}, {128,  15,  15}, {128,  13,  13},\n   {128,  11,  11}, {128,  10,  10}, {128,   8,   8}, {128,   6,   6}, {128,   4,   4},\n   {128,   2,   2}, {128,   0,   0}, {128,   2,   0}, {128,   4,   0}, {128,   6,   0},\n   {128,   8,   0}, {128,  10,   0}, {128,  11,   0}, {128,  13,   0}, {128,  15,   0},\n   {128,  17,   0}, {128,  19,   0}, {128,  21,   0}, {128,  23,   0}, {128,  25,   0},\n   {128,  27,   0}, {128,  29,   0}, {128,  31,   0}, {128,  33,   0}, {128,  34,   0},\n   {128,  36,   0}, {128,  38,   0}, {128,  40,   0}, {128,  42,   0}, {128,  44,   0},\n   {128,  46,   0}, {128,  48,   0}, {128,  50,   0}, {128,  52,   0}, {128,  54,   0},\n   {128,  56,   0}, {128,  57,   0}, {128,  59,   0}, {128,  61,   0}, {128,  63,   0},\n   {128,  65,   0}, {128,  67,   0}, {128,  69,   0}, {128,  71,   0}, {128,  73,   0},\n   {128,  75,   0}, {128,  77,   0}, {128,  78,   0}, {128,  80,   0}, {128,  82,   0},\n   {128,  84,   0}, {128,  86,   0}, {128,  88,   0}, {128,  90,   0}, {128,  92,   0},\n   {128,  94,   0}, {128,  96,   0}, {128,  98,   0}, {128, 100,   0}, {128, 101,   0},\n   {128, 103,   0}, {128, 105,   0}, {128, 107,   0}, {128, 109,   0}, {128, 111,   0},\n   {128, 113,   0}, {128, 115,   0}, {128, 117,   0}, {128, 119,   0}, {128, 121,   0},\n   {128, 123,   0}, 
{128, 124,   0}, {128, 126,   0}, {127, 128,   0}, {125, 128,   0},\n   {123, 128,   0}, {121, 128,   0}, {119, 128,   0}, {117, 128,   0}, {115, 128,   0},\n   {113, 128,   0}, {111, 128,   0}, {110, 128,   0}, {108, 128,   0}, {106, 128,   0},\n   {104, 128,   0}, {102, 128,   0}, {100, 128,   0}, { 98, 128,   0}, { 96, 128,   0},\n   { 94, 128,   0}, { 92, 128,   0}, { 90, 128,   0}, { 88, 128,   0}, { 87, 128,   0},\n   { 85, 128,   0}, { 83, 128,   0}, { 81, 128,   0}, { 79, 128,   0}, { 77, 128,   0},\n   { 75, 128,   0}, { 73, 128,   0}, { 71, 128,   0}, { 69, 128,   0}, { 67, 128,   0},\n   { 65, 128,   0}, { 64, 128,   0}, { 62, 128,   0}, { 60, 128,   0}, { 58, 128,   0},\n   { 56, 128,   0}, { 54, 128,   0}, { 52, 128,   0}, { 50, 128,   0}, { 48, 128,   0},\n   { 46, 128,   0}, { 44, 128,   0}, { 43, 128,   0}, { 41, 128,   0}, { 39, 128,   0},\n   { 37, 128,   0}, { 35, 128,   0}, { 33, 128,   0}, { 31, 128,   0}, { 29, 128,   0},\n   { 27, 128,   0}, { 25, 128,   0}, { 23, 128,   0}, { 21, 128,   0}, { 20, 128,   0},\n   { 18, 128,   0}, { 16, 128,   0}, { 14, 128,   0}, { 12, 128,   0}, { 10, 128,   0},\n   {  8, 128,   0}, {  6, 128,   0}, {  4, 128,   0}, {  2, 128,   0}, {  0, 128,   0},\n   {  0, 128,   2}, {  0, 128,   3}, {  0, 128,   5}, {  0, 128,   7}, {  0, 128,   9},\n   {  0, 128,  11}, {  0, 128,  13}, {  0, 128,  15}, {  0, 128,  17}, {  0, 128,  19},\n   {  0, 128,  21}, {  0, 128,  23}, {  0, 128,  25}, {  0, 128,  26}, {  0, 128,  28},\n   {  0, 128,  30}, {  0, 128,  32}, {  0, 128,  34}, {  0, 128,  36}, {  0, 128,  38},\n   {  0, 128,  40}, {  0, 128,  42}, {  0, 128,  44}, {  0, 128,  46}, {  0, 128,  47},\n   {  0, 128,  49}, {  0, 128,  51}, {  0, 128,  53}, {  0, 128,  55}, {  0, 128,  57},\n   {  0, 128,  59}, {  0, 128,  61}, {  0, 128,  63}, {  0, 128,  65}, {  0, 128,  67},\n   {  0, 128,  69}, {  0, 128,  70}, {  0, 128,  72}, {  0, 128,  74}, {  0, 128,  76},\n   {  0, 128,  78}, {  0, 128,  80}, {  0, 128,  82}, {  0, 
128,  84}, {  0, 128,  86},\n   {  0, 128,  88}, {  0, 128,  90}, {  0, 128,  92}, {  0, 128,  93}, {  0, 128,  95},\n   {  0, 128,  97}, {  0, 128,  99}, {  0, 128, 101}, {  0, 128, 103}, {  0, 128, 105},\n   {  0, 128, 107}, {  0, 128, 109}, {  0, 128, 111}, {  0, 128, 113}, {  0, 128, 114},\n   {  0, 128, 116}, {  0, 128, 118}, {  0, 128, 120}, {  0, 128, 122}, {  0, 128, 124},\n   {  0, 128, 126}, {  0, 127, 128}, {  0, 125, 128}, {  0, 123, 128}, {  0, 121, 128},\n   {  0, 119, 128}, {  0, 118, 128}, {  0, 116, 128}, {  0, 114, 128}, {  0, 112, 128},\n   {  0, 110, 128}, {  0, 108, 128}, {  0, 106, 128}, {  0, 104, 128}, {  0, 102, 128},\n   {  0, 100, 128}, {  0,  98, 128}, {  0,  96, 128}, {  0,  95, 128}, {  0,  93, 128},\n   {  0,  91, 128}, {  0,  89, 128}, {  0,  87, 128}, {  0,  85, 128}, {  0,  83, 128},\n   {  0,  81, 128}, {  0,  79, 128}, {  0,  77, 128}, {  0,  75, 128}, {  0,  74, 128},\n   {  0,  72, 128}, {  0,  70, 128}, {  0,  68, 128}, {  0,  66, 128}, {  0,  64, 128},\n   {  0,  62, 128}, {  0,  60, 128}, {  0,  58, 128}, {  0,  56, 128}, {  0,  54, 128},\n   {  0,  52, 128}, {  0,  51, 128}, {  0,  49, 128}, {  0,  47, 128}, {  0,  45, 128},\n   {  0,  43, 128}, {  0,  41, 128}, {  0,  39, 128}, {  0,  37, 128}, {  0,  35, 128},\n   {  0,  33, 128}, {  0,  31, 128}, {  0,  29, 128}, {  0,  28, 128}, {  0,  26, 128},\n   {  0,  24, 128}, {  0,  22, 128}, {  0,  20, 128}, {  0,  18, 128}, {  0,  16, 128},\n   {  0,  14, 128}, {  0,  12, 128}, {  0,  10, 128}, {  0,   8, 128}, {  0,   7, 128},\n   {  0,   5, 128}, {  0,   3, 128}, {  0,   1, 128}, {  1,   0, 128}, {  3,   0, 128},\n   {  5,   0, 128}, {  7,   0, 128}, {  9,   0, 128}, { 11,   0, 128}, { 13,   0, 128},\n   { 15,   0, 128}, { 16,   0, 128}, { 18,   0, 128}, { 20,   0, 128}, { 22,   0, 128},\n   { 24,   0, 128}, { 26,   0, 128}, { 28,   0, 128}, { 30,   0, 128}, { 32,   0, 128},\n   { 34,   0, 128}, { 36,   0, 128}, { 38,   0, 128}, { 39,   0, 128}, { 41,   0, 128},\n   { 43,   
0, 128}, { 45,   0, 128}, { 47,   0, 128}, { 49,   0, 128}, { 51,   0, 128},\n   { 53,   0, 128}, { 55,   0, 128}, { 57,   0, 128}, { 59,   0, 128}, { 60,   0, 128},\n   { 62,   0, 128}, { 64,   0, 128}, { 66,   0, 128}, { 68,   0, 128}, { 70,   0, 128},\n   { 72,   0, 128}, { 74,   0, 128}, { 76,   0, 128}, { 78,   0, 128}, { 80,   0, 128},\n   { 82,   0, 128}, { 83,   0, 128}, { 85,   0, 128}, { 87,   0, 128}, { 89,   0, 128},\n   { 91,   0, 128}, { 93,   0, 128}, { 95,   0, 128}, { 97,   0, 128}, { 99,   0, 128},\n   {101,   0, 128}, {103,   0, 128}, {105,   0, 128}, {106,   0, 128}, {108,   0, 128},\n   {110,   0, 128}, {112,   0, 128}, {114,   0, 128}, {116,   0, 128}, {118,   0, 128},\n   {120,   0, 128}, {122,   0, 128}, {124,   0, 128}, {126,   0, 128}, {128,   0, 128}\n};\n\nconst rgb_t yarg_colormap[1000] = {\n   {  0,   0,   0}, {  0,   0,   0}, {  1,   1,   1}, {  1,   1,   1}, {  1,   1,   1},\n   {  1,   1,   1}, {  2,   2,   2}, {  2,   2,   2}, {  2,   2,   2}, {  2,   2,   2},\n   {  3,   3,   3}, {  3,   3,   3}, {  3,   3,   3}, {  3,   3,   3}, {  4,   4,   4},\n   {  4,   4,   4}, {  4,   4,   4}, {  4,   4,   4}, {  5,   5,   5}, {  5,   5,   5},\n   {  5,   5,   5}, {  5,   5,   5}, {  6,   6,   6}, {  6,   6,   6}, {  6,   6,   6},\n   {  6,   6,   6}, {  7,   7,   7}, {  7,   7,   7}, {  7,   7,   7}, {  7,   7,   7},\n   {  8,   8,   8}, {  8,   8,   8}, {  8,   8,   8}, {  8,   8,   8}, {  9,   9,   9},\n   {  9,   9,   9}, {  9,   9,   9}, {  9,   9,   9}, { 10,  10,  10}, { 10,  10,  10},\n   { 10,  10,  10}, { 10,  10,  10}, { 11,  11,  11}, { 11,  11,  11}, { 11,  11,  11},\n   { 11,  11,  11}, { 12,  12,  12}, { 12,  12,  12}, { 12,  12,  12}, { 13,  13,  13},\n   { 13,  13,  13}, { 13,  13,  13}, { 13,  13,  13}, { 14,  14,  14}, { 14,  14,  14},\n   { 14,  14,  14}, { 14,  14,  14}, { 15,  15,  15}, { 15,  15,  15}, { 15,  15,  15},\n   { 15,  15,  15}, { 16,  16,  16}, { 16,  16,  16}, { 16,  16,  16}, { 16,  16,  16},\n   { 17,  
17,  17}, { 17,  17,  17}, { 17,  17,  17}, { 17,  17,  17}, { 18,  18,  18},\n   { 18,  18,  18}, { 18,  18,  18}, { 18,  18,  18}, { 19,  19,  19}, { 19,  19,  19},\n   { 19,  19,  19}, { 19,  19,  19}, { 20,  20,  20}, { 20,  20,  20}, { 20,  20,  20},\n   { 20,  20,  20}, { 21,  21,  21}, { 21,  21,  21}, { 21,  21,  21}, { 21,  21,  21},\n   { 22,  22,  22}, { 22,  22,  22}, { 22,  22,  22}, { 22,  22,  22}, { 23,  23,  23},\n   { 23,  23,  23}, { 23,  23,  23}, { 23,  23,  23}, { 24,  24,  24}, { 24,  24,  24},\n   { 24,  24,  24}, { 25,  25,  25}, { 25,  25,  25}, { 25,  25,  25}, { 25,  25,  25},\n   { 26,  26,  26}, { 26,  26,  26}, { 26,  26,  26}, { 26,  26,  26}, { 27,  27,  27},\n   { 27,  27,  27}, { 27,  27,  27}, { 27,  27,  27}, { 28,  28,  28}, { 28,  28,  28},\n   { 28,  28,  28}, { 28,  28,  28}, { 29,  29,  29}, { 29,  29,  29}, { 29,  29,  29},\n   { 29,  29,  29}, { 30,  30,  30}, { 30,  30,  30}, { 30,  30,  30}, { 30,  30,  30},\n   { 31,  31,  31}, { 31,  31,  31}, { 31,  31,  31}, { 31,  31,  31}, { 32,  32,  32},\n   { 32,  32,  32}, { 32,  32,  32}, { 32,  32,  32}, { 33,  33,  33}, { 33,  33,  33},\n   { 33,  33,  33}, { 33,  33,  33}, { 34,  34,  34}, { 34,  34,  34}, { 34,  34,  34},\n   { 34,  34,  34}, { 35,  35,  35}, { 35,  35,  35}, { 35,  35,  35}, { 35,  35,  35},\n   { 36,  36,  36}, { 36,  36,  36}, { 36,  36,  36}, { 37,  37,  37}, { 37,  37,  37},\n   { 37,  37,  37}, { 37,  37,  37}, { 38,  38,  38}, { 38,  38,  38}, { 38,  38,  38},\n   { 38,  38,  38}, { 39,  39,  39}, { 39,  39,  39}, { 39,  39,  39}, { 39,  39,  39},\n   { 40,  40,  40}, { 40,  40,  40}, { 40,  40,  40}, { 40,  40,  40}, { 41,  41,  41},\n   { 41,  41,  41}, { 41,  41,  41}, { 41,  41,  41}, { 42,  42,  42}, { 42,  42,  42},\n   { 42,  42,  42}, { 42,  42,  42}, { 43,  43,  43}, { 43,  43,  43}, { 43,  43,  43},\n   { 43,  43,  43}, { 44,  44,  44}, { 44,  44,  44}, { 44,  44,  44}, { 44,  44,  44},\n   { 45,  45,  45}, { 45,  45,  45}, { 45,  45,  
45}, { 45,  45,  45}, { 46,  46,  46},\n   { 46,  46,  46}, { 46,  46,  46}, { 46,  46,  46}, { 47,  47,  47}, { 47,  47,  47},\n   { 47,  47,  47}, { 47,  47,  47}, { 48,  48,  48}, { 48,  48,  48}, { 48,  48,  48},\n   { 48,  48,  48}, { 49,  49,  49}, { 49,  49,  49}, { 49,  49,  49}, { 50,  50,  50},\n   { 50,  50,  50}, { 50,  50,  50}, { 50,  50,  50}, { 51,  51,  51}, { 51,  51,  51},\n   { 51,  51,  51}, { 51,  51,  51}, { 52,  52,  52}, { 52,  52,  52}, { 52,  52,  52},\n   { 52,  52,  52}, { 53,  53,  53}, { 53,  53,  53}, { 53,  53,  53}, { 53,  53,  53},\n   { 54,  54,  54}, { 54,  54,  54}, { 54,  54,  54}, { 54,  54,  54}, { 55,  55,  55},\n   { 55,  55,  55}, { 55,  55,  55}, { 55,  55,  55}, { 56,  56,  56}, { 56,  56,  56},\n   { 56,  56,  56}, { 56,  56,  56}, { 57,  57,  57}, { 57,  57,  57}, { 57,  57,  57},\n   { 57,  57,  57}, { 58,  58,  58}, { 58,  58,  58}, { 58,  58,  58}, { 58,  58,  58},\n   { 59,  59,  59}, { 59,  59,  59}, { 59,  59,  59}, { 59,  59,  59}, { 60,  60,  60},\n   { 60,  60,  60}, { 60,  60,  60}, { 60,  60,  60}, { 61,  61,  61}, { 61,  61,  61},\n   { 61,  61,  61}, { 62,  62,  62}, { 62,  62,  62}, { 62,  62,  62}, { 62,  62,  62},\n   { 63,  63,  63}, { 63,  63,  63}, { 63,  63,  63}, { 63,  63,  63}, { 64,  64,  64},\n   { 64,  64,  64}, { 64,  64,  64}, { 64,  64,  64}, { 65,  65,  65}, { 65,  65,  65},\n   { 65,  65,  65}, { 65,  65,  65}, { 66,  66,  66}, { 66,  66,  66}, { 66,  66,  66},\n   { 66,  66,  66}, { 67,  67,  67}, { 67,  67,  67}, { 67,  67,  67}, { 67,  67,  67},\n   { 68,  68,  68}, { 68,  68,  68}, { 68,  68,  68}, { 68,  68,  68}, { 69,  69,  69},\n   { 69,  69,  69}, { 69,  69,  69}, { 69,  69,  69}, { 70,  70,  70}, { 70,  70,  70},\n   { 70,  70,  70}, { 70,  70,  70}, { 71,  71,  71}, { 71,  71,  71}, { 71,  71,  71},\n   { 71,  71,  71}, { 72,  72,  72}, { 72,  72,  72}, { 72,  72,  72}, { 72,  72,  72},\n   { 73,  73,  73}, { 73,  73,  73}, { 73,  73,  73}, { 74,  74,  74}, { 74,  74,  74},\n  
 { 74,  74,  74}, { 74,  74,  74}, { 75,  75,  75}, { 75,  75,  75}, { 75,  75,  75},\n   { 75,  75,  75}, { 76,  76,  76}, { 76,  76,  76}, { 76,  76,  76}, { 76,  76,  76},\n   { 77,  77,  77}, { 77,  77,  77}, { 77,  77,  77}, { 77,  77,  77}, { 78,  78,  78},\n   { 78,  78,  78}, { 78,  78,  78}, { 78,  78,  78}, { 79,  79,  79}, { 79,  79,  79},\n   { 79,  79,  79}, { 79,  79,  79}, { 80,  80,  80}, { 80,  80,  80}, { 80,  80,  80},\n   { 80,  80,  80}, { 81,  81,  81}, { 81,  81,  81}, { 81,  81,  81}, { 81,  81,  81},\n   { 82,  82,  82}, { 82,  82,  82}, { 82,  82,  82}, { 82,  82,  82}, { 83,  83,  83},\n   { 83,  83,  83}, { 83,  83,  83}, { 83,  83,  83}, { 84,  84,  84}, { 84,  84,  84},\n   { 84,  84,  84}, { 84,  84,  84}, { 85,  85,  85}, { 85,  85,  85}, { 85,  85,  85},\n   { 86,  86,  86}, { 86,  86,  86}, { 86,  86,  86}, { 86,  86,  86}, { 87,  87,  87},\n   { 87,  87,  87}, { 87,  87,  87}, { 87,  87,  87}, { 88,  88,  88}, { 88,  88,  88},\n   { 88,  88,  88}, { 88,  88,  88}, { 89,  89,  89}, { 89,  89,  89}, { 89,  89,  89},\n   { 89,  89,  89}, { 90,  90,  90}, { 90,  90,  90}, { 90,  90,  90}, { 90,  90,  90},\n   { 91,  91,  91}, { 91,  91,  91}, { 91,  91,  91}, { 91,  91,  91}, { 92,  92,  92},\n   { 92,  92,  92}, { 92,  92,  92}, { 92,  92,  92}, { 93,  93,  93}, { 93,  93,  93},\n   { 93,  93,  93}, { 93,  93,  93}, { 94,  94,  94}, { 94,  94,  94}, { 94,  94,  94},\n   { 94,  94,  94}, { 95,  95,  95}, { 95,  95,  95}, { 95,  95,  95}, { 95,  95,  95},\n   { 96,  96,  96}, { 96,  96,  96}, { 96,  96,  96}, { 96,  96,  96}, { 97,  97,  97},\n   { 97,  97,  97}, { 97,  97,  97}, { 98,  98,  98}, { 98,  98,  98}, { 98,  98,  98},\n   { 98,  98,  98}, { 99,  99,  99}, { 99,  99,  99}, { 99,  99,  99}, { 99,  99,  99},\n   {100, 100, 100}, {100, 100, 100}, {100, 100, 100}, {100, 100, 100}, {101, 101, 101},\n   {101, 101, 101}, {101, 101, 101}, {101, 101, 101}, {102, 102, 102}, {102, 102, 102},\n   {102, 102, 102}, {102, 102, 102}, {103, 
103, 103}, {103, 103, 103}, {103, 103, 103},\n   {103, 103, 103}, {104, 104, 104}, {104, 104, 104}, {104, 104, 104}, {104, 104, 104},\n   {105, 105, 105}, {105, 105, 105}, {105, 105, 105}, {105, 105, 105}, {106, 106, 106},\n   {106, 106, 106}, {106, 106, 106}, {106, 106, 106}, {107, 107, 107}, {107, 107, 107},\n   {107, 107, 107}, {107, 107, 107}, {108, 108, 108}, {108, 108, 108}, {108, 108, 108},\n   {108, 108, 108}, {109, 109, 109}, {109, 109, 109}, {109, 109, 109}, {110, 110, 110},\n   {110, 110, 110}, {110, 110, 110}, {110, 110, 110}, {111, 111, 111}, {111, 111, 111},\n   {111, 111, 111}, {111, 111, 111}, {112, 112, 112}, {112, 112, 112}, {112, 112, 112},\n   {112, 112, 112}, {113, 113, 113}, {113, 113, 113}, {113, 113, 113}, {113, 113, 113},\n   {114, 114, 114}, {114, 114, 114}, {114, 114, 114}, {114, 114, 114}, {115, 115, 115},\n   {115, 115, 115}, {115, 115, 115}, {115, 115, 115}, {116, 116, 116}, {116, 116, 116},\n   {116, 116, 116}, {116, 116, 116}, {117, 117, 117}, {117, 117, 117}, {117, 117, 117},\n   {117, 117, 117}, {118, 118, 118}, {118, 118, 118}, {118, 118, 118}, {118, 118, 118},\n   {119, 119, 119}, {119, 119, 119}, {119, 119, 119}, {119, 119, 119}, {120, 120, 120},\n   {120, 120, 120}, {120, 120, 120}, {120, 120, 120}, {121, 121, 121}, {121, 121, 121},\n   {121, 121, 121}, {122, 122, 122}, {122, 122, 122}, {122, 122, 122}, {122, 122, 122},\n   {123, 123, 123}, {123, 123, 123}, {123, 123, 123}, {123, 123, 123}, {124, 124, 124},\n   {124, 124, 124}, {124, 124, 124}, {124, 124, 124}, {125, 125, 125}, {125, 125, 125},\n   {125, 125, 125}, {125, 125, 125}, {126, 126, 126}, {126, 126, 126}, {126, 126, 126},\n   {126, 126, 126}, {127, 127, 127}, {127, 127, 127}, {127, 127, 127}, {127, 127, 127},\n   {128, 128, 128}, {128, 128, 128}, {128, 128, 128}, {128, 128, 128}, {129, 129, 129},\n   {129, 129, 129}, {129, 129, 129}, {129, 129, 129}, {130, 130, 130}, {130, 130, 130},\n   {130, 130, 130}, {130, 130, 130}, {131, 131, 131}, {131, 131, 131}, {131, 131, 
131},\n   {131, 131, 131}, {132, 132, 132}, {132, 132, 132}, {132, 132, 132}, {132, 132, 132},\n   {133, 133, 133}, {133, 133, 133}, {133, 133, 133}, {133, 133, 133}, {134, 134, 134},\n   {134, 134, 134}, {134, 134, 134}, {135, 135, 135}, {135, 135, 135}, {135, 135, 135},\n   {135, 135, 135}, {136, 136, 136}, {136, 136, 136}, {136, 136, 136}, {136, 136, 136},\n   {137, 137, 137}, {137, 137, 137}, {137, 137, 137}, {137, 137, 137}, {138, 138, 138},\n   {138, 138, 138}, {138, 138, 138}, {138, 138, 138}, {139, 139, 139}, {139, 139, 139},\n   {139, 139, 139}, {139, 139, 139}, {140, 140, 140}, {140, 140, 140}, {140, 140, 140},\n   {140, 140, 140}, {141, 141, 141}, {141, 141, 141}, {141, 141, 141}, {141, 141, 141},\n   {142, 142, 142}, {142, 142, 142}, {142, 142, 142}, {142, 142, 142}, {143, 143, 143},\n   {143, 143, 143}, {143, 143, 143}, {143, 143, 143}, {144, 144, 144}, {144, 144, 144},\n   {144, 144, 144}, {144, 144, 144}, {145, 145, 145}, {145, 145, 145}, {145, 145, 145},\n   {145, 145, 145}, {146, 146, 146}, {146, 146, 146}, {146, 146, 146}, {147, 147, 147},\n   {147, 147, 147}, {147, 147, 147}, {147, 147, 147}, {148, 148, 148}, {148, 148, 148},\n   {148, 148, 148}, {148, 148, 148}, {149, 149, 149}, {149, 149, 149}, {149, 149, 149},\n   {149, 149, 149}, {150, 150, 150}, {150, 150, 150}, {150, 150, 150}, {150, 150, 150},\n   {151, 151, 151}, {151, 151, 151}, {151, 151, 151}, {151, 151, 151}, {152, 152, 152},\n   {152, 152, 152}, {152, 152, 152}, {152, 152, 152}, {153, 153, 153}, {153, 153, 153},\n   {153, 153, 153}, {153, 153, 153}, {154, 154, 154}, {154, 154, 154}, {154, 154, 154},\n   {154, 154, 154}, {155, 155, 155}, {155, 155, 155}, {155, 155, 155}, {155, 155, 155},\n   {156, 156, 156}, {156, 156, 156}, {156, 156, 156}, {156, 156, 156}, {157, 157, 157},\n   {157, 157, 157}, {157, 157, 157}, {157, 157, 157}, {158, 158, 158}, {158, 158, 158},\n   {158, 158, 158}, {159, 159, 159}, {159, 159, 159}, {159, 159, 159}, {159, 159, 159},\n   {160, 160, 160}, {160, 160, 
160}, {160, 160, 160}, {160, 160, 160}, {161, 161, 161},\n   {161, 161, 161}, {161, 161, 161}, {161, 161, 161}, {162, 162, 162}, {162, 162, 162},\n   {162, 162, 162}, {162, 162, 162}, {163, 163, 163}, {163, 163, 163}, {163, 163, 163},\n   {163, 163, 163}, {164, 164, 164}, {164, 164, 164}, {164, 164, 164}, {164, 164, 164},\n   {165, 165, 165}, {165, 165, 165}, {165, 165, 165}, {165, 165, 165}, {166, 166, 166},\n   {166, 166, 166}, {166, 166, 166}, {166, 166, 166}, {167, 167, 167}, {167, 167, 167},\n   {167, 167, 167}, {167, 167, 167}, {168, 168, 168}, {168, 168, 168}, {168, 168, 168},\n   {168, 168, 168}, {169, 169, 169}, {169, 169, 169}, {169, 169, 169}, {169, 169, 169},\n   {170, 170, 170}, {170, 170, 170}, {170, 170, 170}, {171, 171, 171}, {171, 171, 171},\n   {171, 171, 171}, {171, 171, 171}, {172, 172, 172}, {172, 172, 172}, {172, 172, 172},\n   {172, 172, 172}, {173, 173, 173}, {173, 173, 173}, {173, 173, 173}, {173, 173, 173},\n   {174, 174, 174}, {174, 174, 174}, {174, 174, 174}, {174, 174, 174}, {175, 175, 175},\n   {175, 175, 175}, {175, 175, 175}, {175, 175, 175}, {176, 176, 176}, {176, 176, 176},\n   {176, 176, 176}, {176, 176, 176}, {177, 177, 177}, {177, 177, 177}, {177, 177, 177},\n   {177, 177, 177}, {178, 178, 178}, {178, 178, 178}, {178, 178, 178}, {178, 178, 178},\n   {179, 179, 179}, {179, 179, 179}, {179, 179, 179}, {179, 179, 179}, {180, 180, 180},\n   {180, 180, 180}, {180, 180, 180}, {180, 180, 180}, {181, 181, 181}, {181, 181, 181},\n   {181, 181, 181}, {181, 181, 181}, {182, 182, 182}, {182, 182, 182}, {182, 182, 182},\n   {183, 183, 183}, {183, 183, 183}, {183, 183, 183}, {183, 183, 183}, {184, 184, 184},\n   {184, 184, 184}, {184, 184, 184}, {184, 184, 184}, {185, 185, 185}, {185, 185, 185},\n   {185, 185, 185}, {185, 185, 185}, {186, 186, 186}, {186, 186, 186}, {186, 186, 186},\n   {186, 186, 186}, {187, 187, 187}, {187, 187, 187}, {187, 187, 187}, {187, 187, 187},\n   {188, 188, 188}, {188, 188, 188}, {188, 188, 188}, {188, 188, 188}, 
{189, 189, 189},\n   {189, 189, 189}, {189, 189, 189}, {189, 189, 189}, {190, 190, 190}, {190, 190, 190},\n   {190, 190, 190}, {190, 190, 190}, {191, 191, 191}, {191, 191, 191}, {191, 191, 191},\n   {191, 191, 191}, {192, 192, 192}, {192, 192, 192}, {192, 192, 192}, {192, 192, 192},\n   {193, 193, 193}, {193, 193, 193}, {193, 193, 193}, {193, 193, 193}, {194, 194, 194},\n   {194, 194, 194}, {194, 194, 194}, {195, 195, 195}, {195, 195, 195}, {195, 195, 195},\n   {195, 195, 195}, {196, 196, 196}, {196, 196, 196}, {196, 196, 196}, {196, 196, 196},\n   {197, 197, 197}, {197, 197, 197}, {197, 197, 197}, {197, 197, 197}, {198, 198, 198},\n   {198, 198, 198}, {198, 198, 198}, {198, 198, 198}, {199, 199, 199}, {199, 199, 199},\n   {199, 199, 199}, {199, 199, 199}, {200, 200, 200}, {200, 200, 200}, {200, 200, 200},\n   {200, 200, 200}, {201, 201, 201}, {201, 201, 201}, {201, 201, 201}, {201, 201, 201},\n   {202, 202, 202}, {202, 202, 202}, {202, 202, 202}, {202, 202, 202}, {203, 203, 203},\n   {203, 203, 203}, {203, 203, 203}, {203, 203, 203}, {204, 204, 204}, {204, 204, 204},\n   {204, 204, 204}, {204, 204, 204}, {205, 205, 205}, {205, 205, 205}, {205, 205, 205},\n   {205, 205, 205}, {206, 206, 206}, {206, 206, 206}, {206, 206, 206}, {207, 207, 207},\n   {207, 207, 207}, {207, 207, 207}, {207, 207, 207}, {208, 208, 208}, {208, 208, 208},\n   {208, 208, 208}, {208, 208, 208}, {209, 209, 209}, {209, 209, 209}, {209, 209, 209},\n   {209, 209, 209}, {210, 210, 210}, {210, 210, 210}, {210, 210, 210}, {210, 210, 210},\n   {211, 211, 211}, {211, 211, 211}, {211, 211, 211}, {211, 211, 211}, {212, 212, 212},\n   {212, 212, 212}, {212, 212, 212}, {212, 212, 212}, {213, 213, 213}, {213, 213, 213},\n   {213, 213, 213}, {213, 213, 213}, {214, 214, 214}, {214, 214, 214}, {214, 214, 214},\n   {214, 214, 214}, {215, 215, 215}, {215, 215, 215}, {215, 215, 215}, {215, 215, 215},\n   {216, 216, 216}, {216, 216, 216}, {216, 216, 216}, {216, 216, 216}, {217, 217, 217},\n   {217, 217, 217}, 
{217, 217, 217}, {217, 217, 217}, {218, 218, 218}, {218, 218, 218},\n   {218, 218, 218}, {218, 218, 218}, {219, 219, 219}, {219, 219, 219}, {219, 219, 219},\n   {220, 220, 220}, {220, 220, 220}, {220, 220, 220}, {220, 220, 220}, {221, 221, 221},\n   {221, 221, 221}, {221, 221, 221}, {221, 221, 221}, {222, 222, 222}, {222, 222, 222},\n   {222, 222, 222}, {222, 222, 222}, {223, 223, 223}, {223, 223, 223}, {223, 223, 223},\n   {223, 223, 223}, {224, 224, 224}, {224, 224, 224}, {224, 224, 224}, {224, 224, 224},\n   {225, 225, 225}, {225, 225, 225}, {225, 225, 225}, {225, 225, 225}, {226, 226, 226},\n   {226, 226, 226}, {226, 226, 226}, {226, 226, 226}, {227, 227, 227}, {227, 227, 227},\n   {227, 227, 227}, {227, 227, 227}, {228, 228, 228}, {228, 228, 228}, {228, 228, 228},\n   {228, 228, 228}, {229, 229, 229}, {229, 229, 229}, {229, 229, 229}, {229, 229, 229},\n   {230, 230, 230}, {230, 230, 230}, {230, 230, 230}, {230, 230, 230}, {231, 231, 231},\n   {231, 231, 231}, {231, 231, 231}, {232, 232, 232}, {232, 232, 232}, {232, 232, 232},\n   {232, 232, 232}, {233, 233, 233}, {233, 233, 233}, {233, 233, 233}, {233, 233, 233},\n   {234, 234, 234}, {234, 234, 234}, {234, 234, 234}, {234, 234, 234}, {235, 235, 235},\n   {235, 235, 235}, {235, 235, 235}, {235, 235, 235}, {236, 236, 236}, {236, 236, 236},\n   {236, 236, 236}, {236, 236, 236}, {237, 237, 237}, {237, 237, 237}, {237, 237, 237},\n   {237, 237, 237}, {238, 238, 238}, {238, 238, 238}, {238, 238, 238}, {238, 238, 238},\n   {239, 239, 239}, {239, 239, 239}, {239, 239, 239}, {239, 239, 239}, {240, 240, 240},\n   {240, 240, 240}, {240, 240, 240}, {240, 240, 240}, {241, 241, 241}, {241, 241, 241},\n   {241, 241, 241}, {241, 241, 241}, {242, 242, 242}, {242, 242, 242}, {242, 242, 242},\n   {242, 242, 242}, {243, 243, 243}, {243, 243, 243}, {243, 243, 243}, {244, 244, 244},\n   {244, 244, 244}, {244, 244, 244}, {244, 244, 244}, {245, 245, 245}, {245, 245, 245},\n   {245, 245, 245}, {245, 245, 245}, {246, 246, 246}, {246, 
246, 246}, {246, 246, 246},\n   {246, 246, 246}, {247, 247, 247}, {247, 247, 247}, {247, 247, 247}, {247, 247, 247},\n   {248, 248, 248}, {248, 248, 248}, {248, 248, 248}, {248, 248, 248}, {249, 249, 249},\n   {249, 249, 249}, {249, 249, 249}, {249, 249, 249}, {250, 250, 250}, {250, 250, 250},\n   {250, 250, 250}, {250, 250, 250}, {251, 251, 251}, {251, 251, 251}, {251, 251, 251},\n   {251, 251, 251}, {252, 252, 252}, {252, 252, 252}, {252, 252, 252}, {252, 252, 252},\n   {253, 253, 253}, {253, 253, 253}, {253, 253, 253}, {253, 253, 253}, {254, 254, 254},\n   {254, 254, 254}, {254, 254, 254}, {254, 254, 254}, {255, 255, 255}, {255, 255, 255}\n};\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/bitmap_test.cpp",
    "content": "/*\n *****************************************************************************\n *                                                                           *\n *                          Platform Independent                             *\n *                     Bitmap Image Reader Writer Library                    *\n *                                                                           *\n * Author: Arash Partow - 2002                                               *\n * URL: http://partow.net/programming/bitmap/index.html                      *\n *                                                                           *\n * Note: This library only supports 24-bits per pixel bitmap format files.   *\n *                                                                           *\n * Copyright notice:                                                         *\n * Free use of the Platform Independent Bitmap Image Reader Writer Library   *\n * is permitted under the guidelines and in accordance with the most current *\n * version of the MIT License.                                               
*\n * http://www.opensource.org/licenses/MIT                                    *\n *                                                                           *\n *****************************************************************************\n */\n\n#include <cmath>\n#include <cstdio>\n#include <cstdlib>\n#include <iostream>\n#include <string>\n\n#include \"bitmap_image.hpp\"\n\nvoid test01() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test01() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  image.save_image(\"test01_saved.bmp\");\n}\n\nvoid test02() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test02() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  image.save_image(\"test02_saved.bmp\");\n\n  image.vertical_flip();\n  image.save_image(\"test02_saved_vert_flip.bmp\");\n  image.vertical_flip();\n\n  image.horizontal_flip();\n  image.save_image(\"test02_saved_horiz_flip.bmp\");\n}\n\nvoid test03() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test03() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  bitmap_image subsampled_image1;\n  bitmap_image subsampled_image2;\n  bitmap_image subsampled_image3;\n\n  image.subsample(subsampled_image1);\n  subsampled_image1.save_image(\"test03_1xsubsampled_image.bmp\");\n\n  subsampled_image1.subsample(subsampled_image2);\n  subsampled_image2.save_image(\"test03_2xsubsampled_image.bmp\");\n\n  subsampled_image2.subsample(subsampled_image3);\n  subsampled_image3.save_image(\"test03_3xsubsampled_image.bmp\");\n}\n\nvoid test04() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test04() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  bitmap_image 
upsampled_image1;\n  bitmap_image upsampled_image2;\n  bitmap_image upsampled_image3;\n\n  image.upsample(upsampled_image1);\n  upsampled_image1.save_image(\"test04_1xupsampled_image.bmp\");\n\n  upsampled_image1.upsample(upsampled_image2);\n  upsampled_image2.save_image(\"test04_2xupsampled_image.bmp\");\n\n  upsampled_image2.upsample(upsampled_image3);\n  upsampled_image3.save_image(\"test04_3xupsampled_image.bmp\");\n}\n\nvoid test05() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test05() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  image.set_all_ith_bits_low(0);\n  image.save_image(\"test05_lsb0_removed_saved.bmp\");\n  image.set_all_ith_bits_low(1);\n  image.save_image(\"test05_lsb01_removed_saved.bmp\");\n  image.set_all_ith_bits_low(2);\n  image.save_image(\"test05_lsb012_removed_saved.bmp\");\n  image.set_all_ith_bits_low(3);\n  image.save_image(\"test05_lsb0123_removed_saved.bmp\");\n  image.set_all_ith_bits_low(4);\n  image.save_image(\"test05_lsb01234_removed_saved.bmp\");\n  image.set_all_ith_bits_low(5);\n  image.save_image(\"test05_lsb012345_removed_saved.bmp\");\n  image.set_all_ith_bits_low(6);\n  image.save_image(\"test05_lsb0123456_removed_saved.bmp\");\n}\n\nvoid test06() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test06() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  bitmap_image red_channel_image;\n  image.export_color_plane(bitmap_image::red_plane, red_channel_image);\n  red_channel_image.save_image(\"test06_red_channel_image.bmp\");\n\n  bitmap_image green_channel_image;\n  image.export_color_plane(bitmap_image::green_plane, green_channel_image);\n  green_channel_image.save_image(\"test06_green_channel_image.bmp\");\n\n  bitmap_image blue_channel_image;\n  image.export_color_plane(bitmap_image::blue_plane, blue_channel_image);\n  
blue_channel_image.save_image(\"test06_blue_channel_image.bmp\");\n}\n\nvoid test07() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test07() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  image.convert_to_grayscale();\n  image.save_image(\"test07_grayscale_image.bmp\");\n}\n\nvoid test08() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test08() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  bitmap_image image1;\n  bitmap_image image2;\n  bitmap_image image3;\n  bitmap_image image4;\n\n  unsigned int w = image.width();\n  unsigned int h = image.height();\n\n  if (!image.region(0, 0, w / 2, h / 2, image1)) {\n    std::cout << \"ERROR: upper_left_image\" << std::endl;\n  }\n\n  if (!image.region((w - 1) / 2, 0, w / 2, h / 2, image2)) {\n    std::cout << \"ERROR: upper_right_image\" << std::endl;\n  }\n\n  if (!image.region(0, (h - 1) / 2, w / 2, h / 2, image3)) {\n    std::cout << \"ERROR: lower_left_image\" << std::endl;\n  }\n\n  if (!image.region((w - 1) / 2, (h - 1) / 2, w / 2, h / 2, image4)) {\n    std::cout << \"ERROR: lower_right_image\" << std::endl;\n  }\n\n  image1.save_image(\"test08_upper_left_image.bmp\");\n  image2.save_image(\"test08_upper_right_image.bmp\");\n  image3.save_image(\"test08_lower_left_image.bmp\");\n  image4.save_image(\"test08_lower_right_image.bmp\");\n}\n\nvoid test09() {\n  const unsigned int dim = 1000;\n\n  bitmap_image image(dim, dim);\n\n  for (unsigned int x = 0; x < dim; ++x) {\n    for (unsigned int y = 0; y < dim; ++y) {\n      rgb_t col = jet_colormap[(x + y) % dim];\n      image.set_pixel(x, y, col.red, col.green, col.blue);\n    }\n  }\n\n  image.save_image(\"test09_color_map_image.bmp\");\n}\n\nvoid test10() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test10() - 
Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  image.invert_color_planes();\n  image.save_image(\"test10_inverted_color_image.bmp\");\n}\n\nvoid test11() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test11() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  for (unsigned int i = 0; i < 10; ++i) {\n    image.add_to_color_plane(bitmap_image::red_plane, 10);\n    image.save_image(std::string(\"test11_\") + static_cast<char>(48 + i) +\n                     std::string(\"_red_inc_image.bmp\"));\n  }\n}\n\nvoid test12() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test12() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  double* y  = new double[image.pixel_count()];\n  double* cb = new double[image.pixel_count()];\n  double* cr = new double[image.pixel_count()];\n\n  image.export_ycbcr(y, cb, cr);\n\n  for (unsigned int i = 0; i < image.pixel_count(); ++i) {\n    cb[i] = cr[i] = 0.0;\n  }\n\n  image.import_ycbcr(y, cb, cr);\n  image.save_image(\"test12_only_y_image.bmp\");\n\n  delete[] y;\n  delete[] cb;\n  delete[] cr;\n}\n\nvoid test13() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test13() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  double* y  = new double[image.pixel_count()];\n  double* cb = new double[image.pixel_count()];\n  double* cr = new double[image.pixel_count()];\n\n  image.export_ycbcr(y, cb, cr);\n\n  for (unsigned int j = 0; j < 10; ++j) {\n    for (unsigned int i = 0; i < image.pixel_count(); ++i) {\n      y[i] += 15.0;\n    }\n\n    image.import_ycbcr(y, cb, cr);\n    image.save_image(std::string(\"test13_\") + static_cast<char>(48 + j) +\n                     std::string(\"_y_image.bmp\"));\n  }\n\n  delete[] y;\n  delete[] cb;\n  
delete[] cr;\n}\n\nvoid test14() {\n  bitmap_image image(512, 512);\n\n  image.clear();\n  checkered_pattern(64, 64, 220, bitmap_image::red_plane, image);\n  image.save_image(\"test14_checkered_01.bmp\");\n\n  image.clear();\n  checkered_pattern(32, 64, 100, 200, 50, image);\n  image.save_image(\"test14_checkered_02.bmp\");\n}\n\nvoid test15() {\n  bitmap_image image(1024, 1024);\n\n  image.clear();\n\n  double c1 = 0.9;\n  double c2 = 0.5;\n  double c3 = 0.3;\n  double c4 = 0.7;\n\n  ::srand(0xA5AA5AA5);\n  plasma(image, 0, 0, image.width(), image.height(), c1, c2, c3, c4, 3.0,\n         jet_colormap);\n  image.save_image(\"test15_plasma.bmp\");\n}\n\nvoid test16() {\n  std::string file_name(\"image.bmp\");\n\n  bitmap_image image(file_name);\n\n  if (!image) {\n    printf(\"test16() - Error - Failed to open '%s'\\n\", file_name.c_str());\n    return;\n  }\n\n  double c1 = 0.9;\n  double c2 = 0.5;\n  double c3 = 0.3;\n  double c4 = 0.7;\n\n  bitmap_image plasma_image(image.width(), image.height());\n  plasma(plasma_image, 0, 0, plasma_image.width(), plasma_image.height(), c1,\n         c2, c3, c4, 3.0, jet_colormap);\n\n  bitmap_image temp_image(image);\n\n  temp_image.alpha_blend(0.1, plasma_image);\n  temp_image.save_image(\"test16_alpha_0.1.bmp\");\n  temp_image = image;\n\n  temp_image.alpha_blend(0.2, plasma_image);\n  temp_image.save_image(\"test16_alpha_0.2.bmp\");\n  temp_image = image;\n\n  temp_image.alpha_blend(0.3, plasma_image);\n  temp_image.save_image(\"test16_alpha_0.3.bmp\");\n  temp_image = image;\n\n  temp_image.alpha_blend(0.4, plasma_image);\n  temp_image.save_image(\"test16_alpha_0.4.bmp\");\n  temp_image = image;\n\n  temp_image.alpha_blend(0.5, plasma_image);\n  temp_image.save_image(\"test16_alpha_0.5.bmp\");\n  temp_image = image;\n\n  temp_image.alpha_blend(0.6, plasma_image);\n  temp_image.save_image(\"test16_alpha_0.6.bmp\");\n  temp_image = image;\n\n  temp_image.alpha_blend(0.7, plasma_image);\n  
temp_image.save_image(\"test16_alpha_0.7.bmp\");\n  temp_image = image;\n\n  temp_image.alpha_blend(0.8, plasma_image);\n  temp_image.save_image(\"test16_alpha_0.8.bmp\");\n  temp_image = image;\n\n  temp_image.alpha_blend(0.9, plasma_image);\n  temp_image.save_image(\"test16_alpha_0.9.bmp\");\n}\n\nvoid test17() {\n  bitmap_image image(1024, 1024);\n\n  double c1 = 0.9;\n  double c2 = 0.5;\n  double c3 = 0.3;\n  double c4 = 0.7;\n\n  plasma(image, 0, 0, image.width(), image.height(), c1, c2, c3, c4, 3.0,\n         jet_colormap);\n\n  image_drawer draw(image);\n\n  draw.pen_width(3);\n  draw.pen_color(255, 0, 0);\n  draw.circle(image.width() / 2 + 100, image.height() / 2, 100);\n\n  draw.pen_width(2);\n  draw.pen_color(0, 255, 255);\n  draw.ellipse(image.width() / 2, image.height() / 2, 200, 350);\n\n  draw.pen_width(1);\n  draw.pen_color(255, 255, 0);\n  draw.rectangle(50, 50, 250, 400);\n\n  draw.pen_color(0, 255, 0);\n  draw.rectangle(450, 250, 850, 880);\n\n  image.save_image(\"test17_image_drawer.bmp\");\n}\n\nvoid test18() {\n  {\n    bitmap_image image(1000, 180);\n    image_drawer draw(image);\n    const rgb_t* colormap[9] = {\n        autumn_colormap, copper_colormap, gray_colormap,\n        hot_colormap,    hsv_colormap,    jet_colormap,\n        prism_colormap,  vga_colormap,    yarg_colormap};\n\n    for (unsigned int i = 0; i < image.width(); ++i) {\n      for (unsigned int j = 0; j < 9; ++j) {\n        draw.pen_color(colormap[j][i].red, colormap[j][i].green,\n                       colormap[j][i].blue);\n        draw.vertical_line_segment(j * 20, (j + 1) * 20, i);\n      }\n    }\n\n    image.save_image(\"test18_color_maps.bmp\");\n  }\n\n  {\n    bitmap_image image(1000, 500);\n    image_drawer draw(image);\n\n    std::size_t palette_colormap_size =\n        sizeof(palette_colormap) / sizeof(rgb_t);\n    std::size_t bar_width = image.width() / palette_colormap_size;\n\n    for (std::size_t i = 0; i < palette_colormap_size; ++i) {\n      for 
(std::size_t j = 0; j < bar_width; ++j) {\n        draw.pen_color(palette_colormap[i].red, palette_colormap[i].green,\n                       palette_colormap[i].blue);\n        draw.vertical_line_segment(0, image.height(),\n                                   static_cast<int>(i * bar_width + j));\n      }\n    }\n\n    image.save_image(\"test18_palette_colormap.bmp\");\n  }\n}\n\nvoid test19() {\n  {\n    cartesian_canvas canvas(1000, 1000);\n\n    if (!canvas) {\n      printf(\"test19() - Error - Failed to instantiate cartesian \"\n             \"canvas(1000x1000) [1]\\n\");\n      return;\n    }\n\n    canvas.rectangle(canvas.min_x(), canvas.min_y(), canvas.max_x(),\n                     canvas.max_y());\n\n    canvas.horiztonal_line_segment(canvas.min_x(), canvas.max_x(), -400.0);\n\n    canvas.line_segment(-500.0, 600.0, 600.0, -500.0);\n\n    canvas.pen_width(3);\n\n    for (std::size_t i = 0; i < 160; i++) {\n      std::size_t c_idx = i % (sizeof(palette_colormap) / sizeof(rgb_t));\n\n      canvas.pen_color(palette_colormap[c_idx].red,\n                       palette_colormap[c_idx].green,\n                       palette_colormap[c_idx].blue);\n\n      canvas.circle(0.0, 0.0, 3.0 * i);\n    }\n\n    canvas.image().save_image(\"test19_cartesian_canvas01.bmp\");\n  }\n\n  {\n    static const double pi =\n        3.14159265358979323846264338327950288419716939937510;\n\n    cartesian_canvas canvas(1000, 1000);\n\n    if (!canvas) {\n      printf(\"test19() - Error - Failed to instantiate cartesian \"\n             \"canvas(1000x1000) [2]\\n\");\n      return;\n    }\n\n    canvas.image().set_all_channels(0xFF);\n\n    canvas.pen_width(2);\n\n    unsigned int i = 0;\n\n    for (double x = -500; x < 500; x += 3, ++i) {\n      std::size_t c_idx = i % (sizeof(palette_colormap) / sizeof(rgb_t));\n\n      canvas.pen_color(palette_colormap[c_idx].red,\n                       palette_colormap[c_idx].green,\n                       palette_colormap[c_idx].blue);\n\n      
double radius =\n          std::max(10.0, std::abs(80.0 * std::sin((1.0 / 80.0) * pi * x)));\n\n      double y = 400.0 * std::sin((1.0 / 200.0) * pi * x);\n\n      canvas.circle(x, y, radius);\n    }\n\n    canvas.image().save_image(\"test19_cartesian_canvas02.bmp\");\n  }\n}\n\nvoid test20() {\n  const rgb_t* colormap[4] = {hsv_colormap, jet_colormap, prism_colormap,\n                              vga_colormap};\n\n  const unsigned int fractal_width  = 1200;\n  const unsigned int fractal_height = 800;\n\n  {\n    bitmap_image fractal_hsv(fractal_width, fractal_height);\n    bitmap_image fractal_jet(fractal_width, fractal_height);\n    bitmap_image fractal_prism(fractal_width, fractal_height);\n    bitmap_image fractal_vga(fractal_width, fractal_height);\n\n    fractal_hsv.clear();\n    fractal_jet.clear();\n    fractal_prism.clear();\n    fractal_vga.clear();\n\n    double cr, ci;\n    double nextr, nexti;\n    double prevr, previ;\n\n    const unsigned int max_iterations = 1000;\n\n    for (unsigned int y = 0; y < fractal_height; ++y) {\n      for (unsigned int x = 0; x < fractal_width; ++x) {\n        cr = 1.5 * (2.0 * x / fractal_width - 1.0) - 0.5;\n        ci = (2.0 * y / fractal_height - 1.0);\n\n        nextr = nexti = 0;\n        prevr = previ = 0;\n\n        for (unsigned int i = 0; i < max_iterations; i++) {\n          prevr = nextr;\n          previ = nexti;\n\n          nextr = prevr * prevr - previ * previ + cr;\n          nexti = 2 * prevr * previ + ci;\n\n          if (((nextr * nextr) + (nexti * nexti)) > 4) {\n            if (max_iterations != i) {\n              double z = sqrt(nextr * nextr + nexti * nexti);\n\n#define log2(x) (std::log(1.0 * x) / std::log(2.0))\n\n              unsigned int index = static_cast<unsigned int>(\n                  1000.0 * log2(1.75 + i - log2(log2(z))) /\n                  log2(max_iterations));\n#undef log2\n\n              rgb_t c0 = colormap[0][index];\n              rgb_t c1 = colormap[1][index];\n             
 rgb_t c2 = colormap[2][index];\n              rgb_t c3 = colormap[3][index];\n\n              fractal_hsv.set_pixel(x, y, c0.red, c0.green, c0.blue);\n              fractal_jet.set_pixel(x, y, c1.red, c1.green, c1.blue);\n              fractal_prism.set_pixel(x, y, c2.red, c2.green, c2.blue);\n              fractal_vga.set_pixel(x, y, c3.red, c3.green, c3.blue);\n            }\n\n            break;\n          }\n        }\n      }\n    }\n\n    fractal_hsv.save_image(\"test20_mandelbrot_set_hsv.bmp\");\n    fractal_jet.save_image(\"test20_mandelbrot_set_jet.bmp\");\n    fractal_prism.save_image(\"test20_mandelbrot_set_prism.bmp\");\n    fractal_vga.save_image(\"test20_mandelbrot_set_vga.bmp\");\n  }\n\n  {\n    bitmap_image fractal_hsv(fractal_width, fractal_height);\n    bitmap_image fractal_jet(fractal_width, fractal_height);\n    bitmap_image fractal_prism(fractal_width, fractal_height);\n    bitmap_image fractal_vga(fractal_width, fractal_height);\n\n    fractal_hsv.clear();\n    fractal_jet.clear();\n    fractal_prism.clear();\n    fractal_vga.clear();\n\n    const unsigned int max_iterations = 300;\n\n    const double cr = -0.70000;\n    const double ci = 0.27015;\n\n    double prevr, previ;\n\n    for (unsigned int y = 0; y < fractal_height; ++y) {\n      for (unsigned int x = 0; x < fractal_width; ++x) {\n        double nextr = 1.5 * (2.0 * x / fractal_width - 1.0);\n        double nexti = (2.0 * y / fractal_height - 1.0);\n\n        for (unsigned int i = 0; i < max_iterations; i++) {\n          prevr = nextr;\n          previ = nexti;\n\n          nextr = prevr * prevr - previ * previ + cr;\n          nexti = 2 * prevr * previ + ci;\n\n          if (((nextr * nextr) + (nexti * nexti)) > 4) {\n            if (max_iterations != i) {\n              unsigned int index =\n                  static_cast<int>((1000.0 * i) / max_iterations);\n\n              rgb_t c0 = colormap[0][index];\n              rgb_t c1 = colormap[1][index];\n              rgb_t c2 = 
colormap[2][index];\n              rgb_t c3 = colormap[3][index];\n\n              fractal_hsv.set_pixel(x, y, c0.red, c0.green, c0.blue);\n              fractal_jet.set_pixel(x, y, c1.red, c1.green, c1.blue);\n              fractal_prism.set_pixel(x, y, c2.red, c2.green, c2.blue);\n              fractal_vga.set_pixel(x, y, c3.red, c3.green, c3.blue);\n            }\n\n            break;\n          }\n        }\n      }\n    }\n\n    fractal_hsv.save_image(\"test20_julia_set_hsv.bmp\");\n    fractal_jet.save_image(\"test20_julia_set_jet.bmp\");\n    fractal_prism.save_image(\"test20_julia_set_prism.bmp\");\n    fractal_vga.save_image(\"test20_julia_set_vga.bmp\");\n  }\n}\n\nint main() {\n  test01();\n  test02();\n  test03();\n  test04();\n  test05();\n  test06();\n  test07();\n  test08();\n  test09();\n  test10();\n  test11();\n  test12();\n  test13();\n  test14();\n  test15();\n  test16();\n  test17();\n  test18();\n  test19();\n  test20();\n  return 0;\n}\n\n/*\n   Note: In some of the tests a bitmap image by the name of 'image.bmp'\n         is required. If not present the test will fail.\n*/\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/bookshelf_IO.c",
    "content": "/* -----------  FastPlace - Version 1.0 ----------------\n                       by \n   Natarajan Viswanathan and Chris C.-N. Chu\n     Dept. of ECpE, Iowa State University\n          Copyright (c) - 2004 \nIowa State University Research Foundation, Inc.\n--------------------------------------------------------*/\n/* --------------------------------------------------------------------------\n   Contains routines to:\n   - Read and Write the benchmark files in Bookshelf format \n----------------------------------------------------------------------------*/\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <math.h>\n\n#include \"memAlloc.h\"\n#include \"bookshelf_IO.h\"\n\n#define MAX(a,b) ((a)>(b) ? (a) : (b))\n#define MIN(a,b) ((a)<(b) ? (a) : (b))\n\n\n/*-- extern variables --*/\n    // from createHash()\n    char **cellName;\n    \n    // from readAuxFile()\n    char nodesFile[BUFFERSIZE], netsFile[BUFFERSIZE], wtsFile[BUFFERSIZE]; \n    char sclFile[BUFFERSIZE], plFile[BUFFERSIZE], benchmarkName[BUFFERSIZE];\n    \n    // from readNodesFile()\n    int movableNodes, numTerminals;\n    float averageCellWidth, *cellWidth, *cellHeight; \n    \n    // from readNetsFile() \n    int numPins, *netlist, *netlistIndex;\n    float *xPinOffset, *yPinOffset;\n       \n    // from readPlFile()\n    float *xCellCoord, *yCellCoord, minX, maxX, minY, maxY;\n    int *areaArrayIO, numAreaArrayIO;\n\n\n    // from readSclFile()\n    int numRows, numRowBlockages;\n    float siteOriginY, siteEndY, coreHeight;\n    float siteOriginX, siteEndX, coreWidth;\n    float siteWidth, siteSpacing, coreRowHeight;\n    float *rowOriginX, *rowEndX;\n    float *xRowBlockage, *yRowBlockage, *widthRowBlockage;\n\n/*-- global variables --*/\n    typedef struct nodesHash NODES;\n    struct nodesHash  {\n        char name[STRINGLEN];\n        unsigned long index;\n    };\n\n    NODES *NodesInfo;\n\n    long hashSize, hashBits, *RN;\n    unsigned char 
*hashFlag;\n    long modresNum;\n    int numNodes;\n\n\n/*-- functions --*/\n    void createHash(char benchmarkPath[], char nodesFile[]);\n    void freeHash();\n    void readAuxFile(char benchmarkPath[], char auxFile[]);\n    void readNodesFile(char benchmarkPath[], char nodesFile[]);\n    void readNetsFile(char benchmarkPath[], char netsFile[]);\n    void readPlFile(char benchmarkPath[], char plFile[]);\n    void readSclFile(char benchmarkPath[], char sclFile[]);\n    void writePlFile(char outputDir[], char benchmarkName[], float xCoord[], float yCoord[]);\n\n\n/* -----------------------------------------------------------\n   Reads the .nodes file and creates a hash linking cell name \n   to cell index for all the nodes in the circuit \n   (movable nodes + fixed nodes + I/O pads)\n   \n   creates extern vars:\n      cellName[]\n----------------------------------------------------------- */\nvoid createHash(char benchmarkPath[], char nodesFile[]) \n{\n    FILE *fp;\n    char line[BUFFERSIZE], temp[BUFFERSIZE], s4[BUFFERSIZE];\n    float nodeWidth, nodeHeight;\n    long currentPos, j, k, nodeIndex, nodeNo;\n    long R, nonpin_ptr, pin_ptr, hashfunc, RN_index; \n\n\n    strcpy(temp, benchmarkPath);\n    strcat(temp, \"/\");\n    strcat(temp, nodesFile);\n    \n    if((fp=fopen(temp, \"r\")) == NULL) {\n        printf(\"Error in opening: %s \\n\", temp);\n        exit(1);\n    }\n    \n    // Reading first few lines \n    if(!fgets(temp, BUFFERSIZE, fp)) abort();\n    do {\n        currentPos = ftell(fp);\n        if(!fgets(temp, BUFFERSIZE, fp)) abort();\n    } while( (temp[0] == '#') || (strlen(temp) < 5) );  \n    fseek(fp, currentPos, SEEK_SET);  \n\n    // getting numNodes and numTerminals\n    if(fscanf(fp, \"NumNodes\\t:\\t%d\\n\", &numNodes) != 1) abort();\n    if(fscanf(fp, \"NumTerminals\\t:\\t%d\\n\", &numTerminals) != 1) abort();\n\n    // in case there are any more comments or blank lines before actual cell information\n    do {\n        currentPos = 
ftell(fp);\n        if(!fgets(temp, BUFFERSIZE, fp)) abort();\n    } while( (temp[0] == '#') || (strlen(temp) < 5) );  \n    fseek(fp, currentPos, SEEK_SET);  \n    \n    // defining hash variables\n    hashBits = 3+(long)(log((double)numNodes)/log((double)2));\n    hashSize = (long) pow(2, (double)hashBits);\n    NodesInfo = (NODES *) malloc(hashSize*sizeof(NODES));\n    RN = (long *) lvector(1, hashSize);\n    hashFlag = cvector(1, hashSize);   \n\n    // global vector giving inverse mapping b/w cell names and cell indexes\n    cellName = cmatrix(1, numNodes, 1, STRINGLEN); \n\n     // initialize hash flags\n    for(j=1;j<=hashSize;j++)\n        hashFlag[j] = 0;\n\n    // generate random sequence\n    R = 1;\n    for(j=1;j<=hashSize;j++) {\n        R = (5*R)%hashSize;\n        RN[j] = R/4;\n    }\n    modresNum = (hashBits+2)/3;\n\n    nonpin_ptr = 1;                      // movable nodes start from 1\n    pin_ptr = numNodes-numTerminals+1;   // fixed nodes start from movableNodes+1\n    \n    for(nodeNo=1;nodeNo<=numNodes;nodeNo++) {\n\n        if(!fgets(line, BUFFERSIZE, fp)) abort();\n        strcpy(s4, \"\");\n        sscanf(line, \"%s%f%f%s\\n\", temp, &nodeWidth, &nodeHeight, s4);\n\n        if(strcmp(s4, \"terminal\")==0) {\n\n            // create array to save cell name\n            strcpy(cellName[pin_ptr], temp);\n\n            // create a hash table for name searching\n            hashfunc = 0;\n            for(j=1;j<=IMIN(strlen(temp), modresNum);j++)\n                hashfunc += ((long)temp[j-1]<<3*(j-1))%hashSize;\n          \n            hashfunc = hashfunc%hashSize;\n            RN_index = 1;\n\n            while(hashFlag[hashfunc]!=0 && RN_index<hashSize) {\n                hashfunc = (hashfunc+RN[RN_index])%hashSize;\n                RN_index++;\n            }\n\n            if (RN_index>=hashSize) {  \n                printf(\"cannot fill in hash table\\n\");\n                exit(1);\n            }\n          \n            
strcpy(NodesInfo[hashfunc].name, temp);\n            NodesInfo[hashfunc].index = pin_ptr;\n            hashFlag[hashfunc] = 1;\n          \n            pin_ptr++;\n       \n        } else {\n\n            // create array to save cell name\n            strcpy(cellName[nonpin_ptr], temp);\n          \n            // create a hash table for name searching\n            hashfunc = 0;\n            for(j=1;j<=IMIN(strlen(temp), modresNum);j++)\n                hashfunc += ((long)temp[j-1]<<3*(j-1))%hashSize;\n          \n            hashfunc = hashfunc%hashSize;\n            RN_index = 1;\n          \n            while(hashFlag[hashfunc]!=0 && RN_index<hashSize) {\n                hashfunc = (hashfunc+RN[RN_index])%hashSize;\n                RN_index++;\n            }\n          \n            if (RN_index>=hashSize) {  \n                printf(\"cannot fill in hash table\\n\");\n                exit(1);\n            }\n          \n            strcpy(NodesInfo[hashfunc].name, temp);\n            NodesInfo[hashfunc].index = nonpin_ptr;\n            hashFlag[hashfunc] = 1;\n\n            nonpin_ptr++;\n        }\n    }\n\n    fclose(fp);  \n}\n\n    \n/* -----------------------------------------------------------\n  frees hash elements\n----------------------------------------------------------- */\nvoid freeHash()\n{\n  free(NodesInfo);\n  free_lvector((unsigned long *) RN, 1, hashSize);\n  free_cvector(hashFlag, 1, hashSize);\n}\n\n\n/* -----------------------------------------------------------\n  Reads the .aux file to get the other file names\n  \n  creates extern vars:\n     nodesFile[], netsFile[], wtsFile[], sclFile[], \n     plFile[], benchmarkName[];\n----------------------------------------------------------- */\nvoid readAuxFile(char benchmarkPath[], char auxFile[]) \n{\n    FILE *fp;\n    char temp[BUFFERSIZE], placementType[BUFFERSIZE], *name;\n\n  \n    strcpy(temp, benchmarkPath);\n    strcat(temp, \"/\");\n    strcat(temp, auxFile);\n   \n    
if((fp=fopen(temp, \"r\")) == NULL) {\n        printf(\"Error in opening: %s \\n\", auxFile);\n        exit(1);\n    }\n//    printf(\"Reading %s ...\\n\",auxFile);\n    \n    if(fscanf(fp, \"%s\\t:\\t%s%s%s%s%s\\n\", placementType, nodesFile, netsFile, wtsFile, plFile, sclFile) != 6) abort();\n\n    strcpy(temp, auxFile);\n    name = strtok(temp, \".\");\n    strcpy(benchmarkName, name);\n\n    fclose(fp);\n}  \n\n\n/* -----------------------------------------------------------\n  Reads the .nodes file to get cell widths and heights\n  \n  creates extern vars: \n     movableNodes, numTerminals, averageCellWidth, \n     cellWidth[], cellHeight[]\n----------------------------------------------------------- */\nvoid readNodesFile(char benchmarkPath[], char nodesFile[])\n{\n    FILE *fp;\n    char line[BUFFERSIZE], tempStr[BUFFERSIZE], s4[STRINGLEN];\n    long j, nodeIndex, nodeNo, currentPos;\n    long hashfunc, RN_index;\n    float nodeWidth, nodeHeight, sumWidth;\n\n    \n    strcpy(tempStr, benchmarkPath);\n    strcat(tempStr, \"/\");\n    strcat(tempStr, nodesFile);\n    \n    if((fp=fopen(tempStr, \"r\"))==NULL) {\n        printf(\"Error in opening %s file \\n\", nodesFile);\n        exit(1);\n    }\n//    printf(\"Reading %s ...\\n\", nodesFile);\n\n    // Reading first few lines \n    if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    do {\n        currentPos = ftell(fp);\n        if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    } while( (tempStr[0] == '#') || (strlen(tempStr) < 5) );  \n    fseek(fp, currentPos, SEEK_SET);  \n\n    if(fscanf(fp, \"NumNodes\\t:\\t%d\\n\", &numNodes) != 1) abort();\n    if(fscanf(fp, \"NumTerminals\\t:\\t%d\\n\", &numTerminals) != 1) abort();\n\n    do {\n       currentPos = ftell(fp);\n       if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    } while( (tempStr[0] == '#') || (strlen(tempStr) < 5) );  \n    fseek(fp, currentPos, SEEK_SET);  \n    \n    movableNodes = numNodes - numTerminals;       // global var - num of movable 
cells\n    cellWidth = vector(1, numNodes);               // global vector giving cell widths\n    cellHeight = vector(1, numNodes);             // global vector giving cell heights\n   \n    sumWidth = 0;\n    \n    for(nodeNo=1;nodeNo<=numNodes;nodeNo++) {\n\n        if(!fgets(line, BUFFERSIZE, fp)) abort();\n        strcpy(s4, \"\");\n        sscanf(line, \"%s%f%f%s\\n\", tempStr, &nodeWidth, &nodeHeight, s4);\n\n        if(strcmp(s4, \"terminal\")==0) {\n\n            // find the nodeIndex corresponding to tempStr\n            hashfunc = 0;\n            for(j=1;j<=IMIN(strlen(tempStr), modresNum);j++)\n                hashfunc += ((long)tempStr[j-1]<<3*(j-1))%hashSize;\n      \n            hashfunc = hashfunc%hashSize;\n            RN_index = 1;\n  \n            while(strcmp(tempStr, NodesInfo[hashfunc].name)!=0 && RN_index<hashSize) {\n                hashfunc = (hashfunc+RN[RN_index])%hashSize;\n                RN_index++;\n            }\n      \n            if (RN_index>=hashSize) {  \n                printf(\"cannot find in hash table\\n\");\n                exit(1);\n            }\n      \n            nodeIndex = NodesInfo[hashfunc].index;\n\n            // store cellwidth and cellheight corresponding to nodeIndex\n            cellWidth[nodeIndex] = nodeWidth;\n            cellHeight[nodeIndex] = nodeHeight;\n      \n        } else {\n\n            // find the nodeIndex corresponding to tempStr\n            hashfunc = 0;\n            for(j=1;j<=IMIN(strlen(tempStr), modresNum);j++)\n                hashfunc += ((long)tempStr[j-1]<<3*(j-1))%hashSize;\n      \n            hashfunc = hashfunc%hashSize;\n            RN_index = 1;\n  \n            while(strcmp(tempStr, NodesInfo[hashfunc].name)!=0 && RN_index<hashSize) {\n                hashfunc = (hashfunc+RN[RN_index])%hashSize;\n                RN_index++;\n            }\n      \n            if (RN_index>=hashSize) {  \n                printf(\"cannot find in hash table\\n\");\n                exit(1);\n    
        }\n            \n            nodeIndex = NodesInfo[hashfunc].index;\n\n            // store cellwidth and cellheight corresponding to nodeIndex\n            cellWidth[nodeIndex] = nodeWidth;\n            cellHeight[nodeIndex] = nodeHeight;\n            sumWidth += nodeWidth;\n        }\n    }\n\n    // find average cell width\n    averageCellWidth = sumWidth/movableNodes;\n    averageCellWidth *= 100;\n    averageCellWidth = (int)averageCellWidth;\n    averageCellWidth /= 100;\n\n    fclose(fp);  \n\n#if(DEBUG)\nint i;\nfor(i=1; i<=movableNodes+numTerminals; i++) {\n    printf(\"%d  %s  %.2f  %.2f\\n\", i, cellName[i], cellWidth[i], cellHeight[i]);\n}\n\nprintf(\"Avg Cell Width:  %.2f \\n\", averageCellWidth);    \n#endif\n}  \n\n\n/* -----------------------------------------------------------\n   Reads the .nets file to get the netlist information\n   \n   creates extern vars: \n      numNets, numPins, \n      xPinOffset[], yPinOffset[], netlist[], netlistIndex[]\n----------------------------------------------------------- */\nvoid readNetsFile(char benchmarkPath[], char netsFile[])\n{\n    FILE *fp;\n    long i, j, k, netNo, nodeIndex;\n    long currentPos, startPointer, hashfunc, RN_index;\n    char tempStr[BUFFERSIZE], nodeName[BUFFERSIZE];\n    int degree, prevElements;\n    float xOffset, yOffset;\n\n\n    strcpy(tempStr, benchmarkPath);\n    strcat(tempStr, \"/\");\n    strcat(tempStr, netsFile);\n\n    if((fp=fopen(tempStr, \"r\"))==NULL) {\n        printf(\"Error in opening %s file \\n\", netsFile);\n        exit(1);\n    }\n//    printf(\"Reading %s ...\\n\", netsFile);\n\n    // Reading first four lines \n    if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    do {\n        currentPos = ftell(fp);\n        if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    } while( (tempStr[0] == '#') || (strlen(tempStr) < 5) );  \n    fseek(fp, currentPos, SEEK_SET);  \n\n    // getting numNets and numPins\n    if(fscanf(fp, \"NumNets\\t:\\t%d\\n\", &numNets) != 1) 
abort();\n    if(fscanf(fp, \"NumPins\\t:\\t%d\\n\", &numPins) != 1) abort();\n\n    do {\n        currentPos = ftell(fp);\n        if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    } while( (tempStr[0] == '#') || (strlen(tempStr) < 5) );  \n    fseek(fp, currentPos, SEEK_SET);  \n   \n    // stores the netlist and pin offsets relative to the center of the cells\n    netlist = ivector(1,numPins+1);\n    xPinOffset = vector(1,numPins+1);\n    yPinOffset = vector(1,numPins+1);\n\n    // index vector for the netlist and offset vectors\n    netlistIndex = ivector(0,numNets+1);\n    \n    netlistIndex[0] = 1;\n    prevElements = 0;\n\n    for(netNo=1;netNo<=numNets;netNo++) {\n\n        do {\n            currentPos = ftell(fp);\n            if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n        } while( (tempStr[0] == '#') || (strlen(tempStr) < 5) );  \n\n        sscanf(tempStr, \"NetDegree\\t:\\t%d\\n\", &degree);\n\n        netlistIndex[netNo] = netlistIndex[netNo-1] + prevElements;\n        startPointer = netlistIndex[netNo];\n        prevElements = degree;\n      \n        for(k=1;k<=degree;k++) {\n         \n            do {\n                currentPos = ftell(fp);\n                if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n            } while( (tempStr[0] == '#') || (strlen(tempStr) < 5) );  \n        \n            xOffset = yOffset = 0.0;\n            sscanf(tempStr, \"%s%f%f\", nodeName, &xOffset, &yOffset);\n\n            // find the nodeIndex corresponding to nodeName\n            hashfunc = 0;\n            for(j=1;j<=IMIN(strlen(nodeName), modresNum);j++)\n                hashfunc += ((long)nodeName[j-1]<<3*(j-1))%hashSize;\n        \n            hashfunc = hashfunc%hashSize;\n            RN_index = 1;\n   \n            while(strcmp(nodeName, NodesInfo[hashfunc].name)!=0 && RN_index<hashSize) {\n                hashfunc = (hashfunc+RN[RN_index])%hashSize;\n                RN_index++;\n            }\n        \n            if (RN_index>=hashSize) {  \n        
        printf(\"cannot find in hash table\\n\");\n               exit(1);\n            }\n\n            nodeIndex = NodesInfo[hashfunc].index;\n            netlist[startPointer+k-1] = nodeIndex;\n            xPinOffset[startPointer+k-1] = xOffset;\n            yPinOffset[startPointer+k-1] = yOffset; \n        }\n    }\n   \n    netlistIndex[numNets+1] = netlistIndex[numNets] + prevElements;\n    netlist[netlistIndex[numNets+1]] = 0;\n\n    fclose(fp); \n\n#if(DEBUG)\nfor(i=1; i<=numNets; i++) {\n    printf(\"**%d**  \", netlistIndex[i+1]-netlistIndex[i]);\n    for(j=netlistIndex[i]; j<netlistIndex[i+1]; j++) {\n        printf(\"(%d) %.2f %.2f  \", netlist[j], xPinOffset[j], yPinOffset[j]);\n    }\n    printf(\"\\n\");    \n}\n#endif\n}\n\n\n/* -----------------------------------------------------------\n  Reads the .pl file to get coordinates of all nodes and the \n  placement boundary based on the position of the I/O pads\n  \n  creates extern vars:\n     xCellCoord[], yCellCoord[]\n     areaArrayIO[], numAreaArrayIO\n----------------------------------------------------------- */\nvoid readPlFile(char benchmarkPath[], char plFile[])\n{\n    FILE *fp;\n    long nodeIndex, currentPos, j, hashfunc, RN_index, nodeNo, movable;\n    char tempStr[BUFFERSIZE], nodeName[BUFFERSIZE], fixedType[BUFFERSIZE];\n    float xCoord, yCoord;\n\n  \n    strcpy(tempStr, benchmarkPath);\n    strcat(tempStr, \"/\");\n    strcat(tempStr, plFile);\n\n    if((fp=fopen(tempStr, \"r\"))==NULL) {\n        printf(\"Error in opening %s file \\n\", plFile);\n        exit(1);\n    }\n//    printf(\"Reading %s ...\\n\", plFile);\n  \n    // Reading first four lines \n    if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    do {\n        currentPos = ftell(fp);\n        if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    } while( (tempStr[0] == '#') || (strlen(tempStr) < 5) );  \n    fseek(fp, currentPos, SEEK_SET);\n  \n    xCellCoord = vector(1,numNodes);\n    yCellCoord = vector(1,numNodes);\n    
areaArrayIO = ivector(1, numTerminals);\n    numAreaArrayIO = 0;\n\n    movable = numNodes-numTerminals;\n    for(nodeNo=1; nodeNo<=numNodes; nodeNo++) {\n\n        if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n        strcpy(fixedType, \"\");\n        sscanf(tempStr, \"%s%f%f\\t:\\t%*s%s\\n\", nodeName, &xCoord, &yCoord, fixedType);\n\n        hashfunc = 0;\n        for(j=1;j<=IMIN(strlen(nodeName), modresNum);j++)\n            hashfunc += ((long)nodeName[j-1]<<3*(j-1))%hashSize;\n      \n        hashfunc = hashfunc%hashSize;\n        RN_index = 1;\n    \n        while(strcmp(nodeName, NodesInfo[hashfunc].name)!=0 && RN_index<hashSize) {\n            hashfunc = (hashfunc+RN[RN_index])%hashSize;\n            RN_index++;\n        }\n      \n        if (RN_index>=hashSize) {  \n            printf(\"cannot find in hash table\\n\");\n            exit(1);\n        }\n      \n        nodeIndex = NodesInfo[hashfunc].index;\n        // Assume all coordinates are integers\n        xCellCoord[nodeIndex] = (int) (xCoord + 0.5*cellWidth[nodeIndex]);\n        yCellCoord[nodeIndex] = (int) (yCoord + 0.5*cellHeight[nodeIndex]);\n/*        \n        if(nodeIndex > movable) {\n            // Is a fixed terminal but can allow overlap with it\n            if(strcmp(fixedType, \"/FIXED\") != 0) {\n                numAreaArrayIO++;\n                areaArrayIO[numAreaArrayIO] = nodeIndex;\n            }\n        }\n*/\n    }\n\n    fclose(fp);\n}    \n\n\n/* -----------------------------------------------------------\n  Reads the .scl file to get placement region information\n  \n  creates extern vars:\n     siteOriginX, siteEndX, siteOriginY, siteEndY, siteWidth, \n     siteSpacing, coreRowHeight, coreWidth, coreHeight, \n     numRows, minX, maxX, minY, maxY\n----------------------------------------------------------- */\nvoid readSclFile(char benchmarkPath[], char sclFile[])\n{\n    FILE *fp;\n    char tempStr[BUFFERSIZE], siteOrient[2], siteSymmetry[2], junk[BUFFERSIZE];\n    int 
totalSites, row;\n    long currentPos, nodeIndex, movable;\n    float originY, originX, minOrigin, maxEnd;\n\n\n    strcpy(tempStr, benchmarkPath);\n    strcat(tempStr, \"/\");\n    strcat(tempStr, sclFile);\n\n    if((fp=fopen(tempStr, \"r\"))==NULL) {\n      printf(\"Error in opening %s file \\n\", sclFile);\n      exit(1);\n    }\n//    printf(\"Reading %s ...\\n\", sclFile);\n\n    // Reading first four lines \n    if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    do {\n      currentPos = ftell(fp);\n      if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    } while( (tempStr[0] == '#') || (strlen(tempStr) < 5) );  \n    fseek(fp, currentPos, SEEK_SET);  \n   \n    // getting numRows\n    if(fscanf(fp, \"%*s\\t:\\t%d\\n\", &numRows) != 1) abort();\n\n    rowOriginX = vector(1, numRows);\n    rowEndX = vector(1, numRows);\n    xRowBlockage = vector(1, 2*numRows);\n    yRowBlockage = vector(1, 2*numRows);\n    widthRowBlockage = vector(1, 2*numRows);\n\n    // any blanks or comments after numRows line\n    do {\n      currentPos = ftell(fp);\n      if(!fgets(tempStr, BUFFERSIZE, fp)) abort();\n    } while( (tempStr[0] == '#') || (strlen(tempStr) < 5) );  \n    fseek(fp, currentPos, SEEK_SET);  \n\n    siteOriginX = 1.0e6;\n    siteEndX = -1.0e6;\n    for(row=1; row<=numRows; row++) {\n        // Reading CoreRow Horizontal\n        if(!fgets(junk, BUFFERSIZE, fp)) abort();\n    \n        if(fscanf(fp, \"\\tCoordinate\\t:\\t%f\\n\", &originY) != 1) abort();\n        if(row == 1) siteOriginY = originY;\n        \n        if(fscanf(fp, \"Height\\t:\\t%f\\n\", &coreRowHeight) != 1) abort();\n        if(fscanf(fp, \"Sitewidth\\t:\\t%f\\n\", &siteWidth) != 1) abort();\n        if(fscanf(fp, \"Sitespacing\\t:\\t%f\\n\", &siteSpacing) != 1) abort();\n        if(fscanf(fp, \"Siteorient\\t:\\t%s\\n\", siteOrient) != 1) abort();\n        if(fscanf(fp, \"Sitesymmetry\\t:\\t%s\\n\", siteSymmetry)!= 1) abort();\n        if(fscanf(fp, \"SubrowOrigin\\t:\\t%f\\t%*s\\t:\\t%d\\n\", 
&originX, &totalSites) != 2) abort();\n        // Reading End\n        if(!fgets(junk, BUFFERSIZE, fp)) abort();\n      \n        rowOriginX[row] = originX;\n        rowEndX[row] = originX + totalSites*siteSpacing;\n        if(rowOriginX[row] < siteOriginX) siteOriginX = rowOriginX[row];\n        if(rowEndX[row] > siteEndX) siteEndX = rowEndX[row];\n    }\n\n    siteEndY = numRows*coreRowHeight + siteOriginY;\n    coreHeight = siteEndY - siteOriginY;          // height of placement area \n    coreWidth = siteEndX - siteOriginX;\n\n    numRowBlockages = 0;\n    for(row=1; row<=numRows; row++) {\n        if(rowOriginX[row] > siteOriginX) {\n            numRowBlockages++;\n            xRowBlockage[numRowBlockages] = siteOriginX + 0.5*(rowOriginX[row] - siteOriginX);\n            yRowBlockage[numRowBlockages] = siteOriginY + (row-0.5)*coreRowHeight;\n            widthRowBlockage[numRowBlockages] = (rowOriginX[row] - siteOriginX);\n        }\n\n        if(siteEndX > rowEndX[row]) {\n            numRowBlockages++;\n            xRowBlockage[numRowBlockages] = rowEndX[row] + 0.5*(siteEndX - rowEndX[row]);\n            yRowBlockage[numRowBlockages] = siteOriginY + (row-0.5)*coreRowHeight;\n            widthRowBlockage[numRowBlockages] = (siteEndX - rowEndX[row]);\n        }\n    }\n    \n    maxX = 0;\n    minX = 1.0e10;\n    maxY = 0;\n    minY = 1.0e10;\n    movable = numNodes-numTerminals;\n    for(nodeIndex=movable+1; nodeIndex<=numNodes; nodeIndex++) {\n    \n        if(xCellCoord[nodeIndex] > maxX) maxX = xCellCoord[nodeIndex];\n        if(xCellCoord[nodeIndex] < minX) minX = xCellCoord[nodeIndex];\n        if(yCellCoord[nodeIndex] > maxY) maxY = yCellCoord[nodeIndex];\n        if(yCellCoord[nodeIndex] < minY) minY = yCellCoord[nodeIndex];\n    }\n    maxX = MAX(maxX, siteEndX+5.0);\n    minX = MIN(minX, siteOriginX-5.0);\n    maxY = MAX(maxY, siteEndY+5.0);\n    minY = MIN(minY, siteOriginY-5.0);   \n    \n    fclose(fp);\n}  \n\n\n/* 
-----------------------------------------------------------\n   writes out a bookshelf format .pl file\n----------------------------------------------------------- */\nvoid writePlFile(char outputDir[], char benchmarkName[], float xCoord[], float yCoord[]) \n{\n    FILE *fp;\n    char tempStr[BUFFERSIZE];\n    int i;\n\n\n    strcpy(tempStr, outputDir);\n    strcat(tempStr, \"/\");\n    strcat(tempStr, benchmarkName);\n    strcat(tempStr, \"_out.pl\");\n    \n    if( (fp=fopen(tempStr,\"w\")) == NULL ) {\n     \n        printf(\"ERROR in opening the %s file for write \\n\", tempStr);\n        exit(1);\n    }\n    printf(\"\\nPrinting %s File... \\n\",tempStr);\n\n    fprintf(fp, \"UCLA pl 1.0\\n\");\n    fprintf(fp, \"# Circuit  :  %s\\n\", benchmarkName);\n\n    for(i=1; i<=numNodes; i++)\n        fprintf(fp, \"    %20s    %-10.2f    %-10.2f  :  N\\n\", \n                cellName[i], xCoord[i]-0.5*cellWidth[i], yCoord[i]-0.5*cellHeight[i]);\n   \n    fclose(fp);\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/bookshelf_IO.h",
    "content": "/* -----------  FastPlace - Version 1.0 ----------------\n                       by\n   Natarajan Viswanathan and Chris C.-N. Chu\n     Dept. of ECpE, Iowa State University\n          Copyright (c) - 2004\nIowa State University Research Foundation, Inc.\n-------------------------------------------------------- */\n/* --------------------------------------------------------------------------\n   Header file used in bookshelf_IO.c\n----------------------------------------------------------------------------*/\n\n#ifndef _BOOKSHELF_IO_H_\n#define _BOOKSHELF_IO_H_\n\n#define BUFFERSIZE 200\n#define STRINGLEN 30\n\n/* -----------------------------------------------------------------------------\n    Reads the .nodes file and creates a hash linking cell name to cell index for\n    all the nodes in the circuit (movable nodes + fixed nodes + I/O pads)\n\n    creates extern vars:\n        cellName[i]     (i = 1..movableNodes + numTerminals)\n--------------------------------------------------------------------------------\n*/\nextern void createHash(char benchmarkPath[], char nodesFile[]);\nextern void freeHash();\n\n/* -----------------------------------------------------------------------------\n    Reads the .aux file to get the other file names\n\n    creates extern vars:\n        nodesFile[], netsFile[], wtsFile[], sclFile[], plFile[], benchmarkName[]\n--------------------------------------------------------------------------------\n*/\nextern void readAuxFile(char benchmarkPath[], char auxFile[]);\n\n/* -----------------------------------------------------------------------------\n    Reads the .nodes file to get cell widths and heights\n\n    creates extern vars:\n        movableNodes, numTerminals, averageCellWidth, cellWidth[], cellHeight[]\n--------------------------------------------------------------------------------\n*/\nextern void readNodesFile(char benchmarkPath[], char nodesFile[]);\n\n/* 
-----------------------------------------------------------------------------\n    Reads the .nets file to get the netlist information\n\n    creates extern vars:\n        numNets, numPins, netlist[], netlistIndex[], xPinOffset[], yPinOffset[]\n--------------------------------------------------------------------------------\n*/\nextern void readNetsFile(char benchmarkPath[], char netsFile[]);\n\n/* -----------------------------------------------------------------------------\n    Reads the .pl file to get coordinates of all the nodes and the placement\n    boundary based on the position of the I/O pads\n\n    creates extern vars:\n        xCellCoord[], yCellCoord[], minX, maxX, minY, maxY\n--------------------------------------------------------------------------------\n*/\nextern void readPlFile(char benchmarkPath[], char plFile[]);\n\n/* -----------------------------------------------------------------------------\n    Reads the .scl file to get placement (core) region information\n\n    creates extern vars:\n        siteOriginX, siteEndX, siteOriginY, siteEndY, siteWidth, siteSpacing,\n        numRows, coreRowHeight, coreWidth, coreHeight\n--------------------------------------------------------------------------------\n*/\nextern void readSclFile(char benchmarkPath[], char sclFile[]);\n\n/* -----------------------------------------------------------------------------\n    writes out a bookshelf format .pl file\n--------------------------------------------------------------------------------\n*/\nextern void writePlFile(char outputDir[], char benchmarkName[], float xCoord[],\n                        float yCoord[]);\n\n/*--------------  Extern Variables  ------------------*/\n\nextern char** cellName;\n\nextern char nodesFile[BUFFERSIZE], netsFile[BUFFERSIZE], wtsFile[BUFFERSIZE];\nextern char sclFile[BUFFERSIZE], plFile[BUFFERSIZE], benchmarkName[BUFFERSIZE];\n\nextern int movableNodes, numTerminals;\nextern float averageCellWidth, *cellWidth, 
*cellHeight;\n\nextern int numNets, numPins, *netlist, *netlistIndex;\nextern float *xPinOffset, *yPinOffset;\n\nextern float *xCellCoord, *yCellCoord, minX, maxX, minY, maxY;\nextern int *areaArrayIO, numAreaArrayIO;\n\nextern int numRows, numRowBlockages;\nextern float siteOriginY, siteEndY, coreHeight;\nextern float siteOriginX, siteEndX, coreWidth;\nextern float siteWidth, siteSpacing, coreRowHeight;\nextern float *rowOriginX, *rowEndX;\nextern float *xRowBlockage, *yRowBlockage, *widthRowBlockage;\n\n#endif /* _BOOKSHELF_IO_H_*/\n\n/* -----------------------------------------------------------------------------------------------\n                                Description of Extern Variables\n\n    cellName[i]         =   cell name corresponding to cell index \"i\"\n                            (i = 1..movableNodes+numTerminals)\n\n    movableNodes        =   number of movable nodes,\n    numTerminals        =   number of fixed nodes (I/O Pads + Fixed Macros)\n    averageCellWidth    =   avg width of movable nodes,\n    cellWidth[i]        =   width of cell \"i\"   (i\n= 1..movableNodes+numTerminals) cellHeight[i]       =   height of cell \"i\"  (i\n= 1..movableNodes+numTerminals)\n\n    numNets             =   number of nets\n    numPins             =   number of pins\n    netlist[]           =   netlist of the circuit\n    xPinOffset[]        =   x-offset of the pins from the center of the cell\n    yPinOffset[]        =   y-offset (      \"       )\n    netlistIndex[]      =   index to the netlist and offset vectors\n\n    xCellCoord[i]       =   x-coordinate of cell \"i\"  (i\n= 1..movableNodes+numTerminals) yCellCoord[i]       =   y-coordinate of cell\n\"i\", minX, maxX          =   left and right boundaries of the chip (Note: not\nthe placement region) minY, maxY          =   bottom and top boundaries of the\nchip areaArrayIO[]       =   All fixed terminals with which there can be an\noverlap of movable nodes (eg area array IOs) numAreaArrayIO      =   
the total\nnumber of area array IOs\n\n    siteOriginX[]       =   left boundary of the placement region within a row\n    siteEndX[]          =   right boundary of the placement region within a row\n    siteOriginY         =   bottom boundary of the placement region\n    siteEndY            =   top boundary of the placement region\n    siteWidth           =   width of a placement site within a row\n    siteSpacing         =   the space b/w adjacent placement sites within a row\n    numRows             =   number of placement rows\n    coreRowHeight       =   row Height\n    coreWidth           =   siteEndX - siteOriginX\n    coreHeight          =   siteEndY - siteOriginY\n\n---------------------------------------------------------------------------------------------------*/\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/cong.c",
    "content": "#include <stdio.h>\r\n#include <stdlib.h>\r\n#include <string.h>\r\n#include \"DataType.h\"\r\n#include \"DataProc.h\"\r\n\r\n\r\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/cong.h",
    "content": "#ifndef _DATAPROC_H_\n#define _DATAPROC_H_\n\n#endif /* _DATAPROC_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/dist.c",
    "content": "#include \"global.h\"\n\n/*********************************************************************/\n/*\n   Return the Manhattan distance between two points\n*/\n\nlong  dist(\n  Point  p,\n  Point  q\n)\n{\n  long  dx, dy;\n    \n  dx = (p.x) - (q.x);\n  if( dx < 0 )  dx = -dx;\n  dy = (p.y) - (q.y);\n  if( dy < 0 )  dy = -dy;\n\n  return  dx + dy; \n}\n\n/*********************************************************************/\n/*\n   Return the Manhattan distance between two points\n*/\n\nlong  dist2(\n  Point*  p,\n  Point*  q\n)\n{\n  long  dx, dy;\n    \n  dx = (p->x) - (q->x);\n  if( dx < 0 )  dx = -dx;\n  dy = (p->y) - (q->y);\n  if( dy < 0 )  dy = -dy;\n\n  return  dx + dy; \n}\n\n/*********************************************************************/\n/*********************************************************************/\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/dist.h",
    "content": "#ifndef _DIST_H_\n#define _DIST_H_\n\n#include \"global.h\"\n\nlong dist(Point p, Point q);\n\nlong dist2(Point* p, Point* q);\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/dl.c",
    "content": "#include \"dl.h\"\n#include <assert.h>\n#include <stdio.h>\n\ndl_t dl_alloc()\n{\n  dl_t dl = (dl_t)malloc(sizeof(dl_s));\n  if (!dl) {\n      printf(\"Out of memory!!\\n\");\n  } else {\n    dl->first = dl->last = 0; dl->count = 0;\n  }\n  return dl;\n}\n\nvoid dl_delete(dl_t dl, dl_el *el)\n{\n  if (dl->first == el) {\n    dl->first = el->next;\n  }\n  if (dl->last == el) {\n    dl->last = el->prev;\n  }\n  if (el->next) {\n    el->next->prev = el->prev;\n  }\n  if (el->prev) {\n    el->prev->next = el->next;\n  }\n  free(el); dl->count--;\n}\n\nvoid dl_clear(dl_t dl)\n{\n  dl_el *el, *next;\n  if (dl->count > 0) {\n    for (el=dl->first; el; el=next) {\n      next = el->next;\n      free(el);\n    }\n  }\n  dl->first = dl->last = 0;\n  dl->count = 0;\n}\n\nvoid dl_concat(dl_t first_list, dl_t second_list)\n{\n  if (first_list->count <= 0) {\n    *first_list = *second_list;\n  } else if (second_list->count > 0) {\n    first_list->last->next = second_list->first;\n    second_list->first->prev = first_list->last;\n    first_list->last = second_list->last;\n    first_list->count += second_list->count;\n  }\n\n  free(second_list);\n}\n\nstatic void dl_insertion_sort(dl_t dl, size_t el_size,\n\t\t\t      int(*compar)(void *, void *))\n{\n  char *buf;\n  void *curr_d, *srch_d;\n  dl_el *curr, *srch;\n\n  if (dl_length(dl) <= 1) {\n    return;\n  }\n\n  buf = (char*)malloc(el_size);\n\n  for (curr=dl->first; curr!=dl->last; curr=curr->next) {\n    curr_d = (void*)(((dl_el*)curr)+1);\n\n    for (srch=dl->last; srch!=curr; srch=srch->prev) {\n      srch_d = (void*)(((dl_el*)srch)+1);\n      if (compar(curr_d, srch_d) > 0) {\n\tmemcpy((void*)buf, curr_d, el_size);\t\n\tmemcpy(curr_d, srch_d, el_size);\n\tmemcpy(srch_d, (void*)buf, el_size);\n      }\n    }\n  }\n  \n\n  free(buf);\n}\n\nvoid dl_sort(dl_t dl, size_t el_size, int(*compar)(void *, void *))\n{\n  dl_el *el, *first_head, *second_head;\n  dl_s first_list, second_list;\n  void *first_item, 
*second_item;\n  int i, len;\n\n  if (dl_length(dl) <= 25) {\n    dl_insertion_sort(dl, el_size, compar);\n    return;\n  }\n\n  len = dl_length(dl)/2;\n  for (i=0, el=dl->first; i<len; i++) {\n    el = el->next;\n  }\n\n  first_list.first = dl->first;\n  first_list.last = el->prev;\n  first_list.count = len;\n  first_list.last->next = 0;\n\n  second_list.first = el;\n  second_list.last = dl->last;\n  second_list.count = dl_length(dl)-len;\n  second_list.first->prev = 0;\n\n  dl_sort(&first_list, el_size, compar);\n  dl_sort(&second_list, el_size, compar);\n\n  /* in-place merging */\n  first_head = first_list.first;\n  second_head = second_list.first;\n\n  first_item = (void*)(((dl_el*)first_head)+1);\n  second_item = (void*)(((dl_el*)second_head)+1);\n  if (compar(first_item, second_item) <= 0) {\n    dl->first = el = first_head;\n    first_head = first_head->next;\n  } else {\n    dl->first = el = second_head;\n    second_head = second_head->next;\n  }\n\n  while (1) {\n    first_item = (void*)(((dl_el*)first_head)+1);\n    second_item = (void*)(((dl_el*)second_head)+1);\n    if (compar(first_item, second_item) <= 0) {\n      el->next = first_head;\n      first_head->prev = el;\n      el = first_head;\n      first_head = first_head->next;\n      if (!first_head) {\n\tel->next = second_head;\n\tsecond_head->prev = el;\n\tdl->last = second_list.last;\n\tbreak;\n      }\n    } else {\n      el->next = second_head;\n      second_head->prev = el;\n      el = second_head;\n      second_head = second_head->next;\n      if (!second_head) {\n\tel->next = first_head;\n\tfirst_head->prev = el;\n\tdl->last = first_list.last;\n\tbreak;\n      }\n    }\n  }\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/dl.h",
    "content": "#ifndef DL_H\n#define DL_H\n\n#include <string.h>\n#include <stdlib.h>\n\ntypedef struct dl_el_s {\n  struct dl_el_s *prev, *next;\n} dl_el;\n\ntypedef struct {\n  dl_el *first, *last;\n  unsigned int count;\n} dl_s, *dl_t;\n\ndl_t dl_alloc(void);\nvoid dl_delete(dl_t dl, dl_el* el);\nvoid dl_clear(dl_t dl);\nvoid dl_concat(dl_t list1, dl_t list2);\nvoid dl_sort(dl_t dl, size_t el_size, int (*compar)(void*, void*));\n\n#define dl_length(dl) (dl)->count\n\n#define dl_empty(dl) ((dl)->count <= 0)\n\n#define dl_data(type, el) *(type*)(((dl_el*)(el)) + 1)\n\n#define dl_data_p(type, el) ((type*)(((dl_el*)(el)) + 1))\n\n#define dl_forall(type, dl, data)                                              \\\n  {                                                                            \\\n    dl_el *_el, *_next;                                                        \\\n    dl_t _curr_dl = (dl);                                                      \\\n    for (_el = _curr_dl->first; _el; _el = _next) {                            \\\n      _next  = _el->next;                                                      \\\n      (data) = dl_data(type, _el);\n\n#define dl_forall_p(type, dl, data_p)                                          \\\n  {                                                                            \\\n    dl_el *_el, *_next;                                                        \\\n    dl_t _curr_dl = (dl);                                                      \\\n    for (_el = _curr_dl->first; _el; _el = _next) {                            \\\n      _next    = _el->next;                                                    \\\n      (data_p) = dl_data_p(type, _el);\n\n#define dl_current() _el\n#define dl_delete_current() dl_delete(_curr_dl, _el)\n\n#define dl_endfor                                                              \\\n  }                                                                            \\\n  }\n\n#define 
dl_forall_reverse(type, dl, data)                                      \\\n  {                                                                            \\\n    dl_el *_el, *_next;                                                        \\\n    dl_t _curr_dl = (dl);                                                      \\\n    for (_el = _curr_dl->last; _el; _el = _next) {                             \\\n      _next  = _el->prev;                                                      \\\n      (data) = dl_data(type, _el);\n\n#define dl_forall_reverse_p(type, dl, data_p)                                  \\\n  {                                                                            \\\n    dl_el *_el, *_next;                                                        \\\n    dl_t _curr_dl = (dl);                                                      \\\n    for (_el = _curr_dl->last; _el; _el = _next) {                             \\\n      _next    = _el->prev;                                                    \\\n      (data_p) = dl_data_p(type, _el);\n\n#define dl_first(type, dl) dl_data(type, (dl)->first)\n\n#define dl_first_element(dl) (dl)->first\n\n#define dl_last(type, dl) dl_data(type, (dl)->last)\n\n#define dl_pop_first(type, dl, data)                                           \\\n  {                                                                            \\\n    (data) = dl_first(type, dl);                                               \\\n    dl_delete((dl), (dl)->first);                                              \\\n  }\n\n#define dl_pop_last(type, dl, data)                                            \\\n  {                                                                            \\\n    (data) = dl_last(type, dl);                                                \\\n    dl_delete((dl), (dl)->last);                                               \\\n  }\n\n#define dl_insert_before(type, dl, element, data)                              \\\n  {            
                                                                \\\n    if ((element) == (dl)->first) {                                            \\\n      dl_prepend(type, dl, data);                                              \\\n    } else {                                                                   \\\n      dl_el* _el = (dl_el*)malloc(sizeof(dl_el) + sizeof(type));               \\\n      if (!_el) {                                                              \\\n        printf(\"Out of memory!!\\n\");                                           \\\n      } else {                                                                 \\\n        memcpy(_el + 1, &(data), sizeof(type));                                \\\n        _el->prev             = (element)->prev;                               \\\n        _el->next             = (element);                                     \\\n        (element)->prev->next = _el;                                           \\\n        (element)->prev       = _el;                                           \\\n        (dl)->count++;                                                         \\\n      }                                                                        \\\n    }                                                                          \\\n  }\n\n#define dl_insert_after(type, dl, element, data)                               \\\n  {                                                                            \\\n    if ((element) == (dl)->last) {                                             \\\n      dl_append(type, dl, data);                                               \\\n    } else {                                                                   \\\n      dl_el* _el = (dl_el*)malloc(sizeof(dl_el) + sizeof(type));               \\\n      if (!_el) {                                                              \\\n        printf(\"Out of memory!!\\n\");                                           \\\n      } 
else {                                                                 \\\n        memcpy(_el + 1, &(data), sizeof(type));                                \\\n        _el->next             = (element)->next;                               \\\n        _el->prev             = (element);                                     \\\n        (element)->next->prev = _el;                                           \\\n        (element)->next       = _el;                                           \\\n        (dl)->count++;                                                         \\\n      }                                                                        \\\n    }                                                                          \\\n  }\n\n#define dl_append(type, dl, data)                                              \\\n  {                                                                            \\\n    dl_el* _el = (dl_el*)malloc(sizeof(dl_el) + sizeof(type));                 \\\n    if (!_el) {                                                                \\\n      printf(\"Out of memory!!\\n\");                                             \\\n    } else {                                                                   \\\n      memcpy(_el + 1, &(data), sizeof(type));                                  \\\n      _el->next = 0;                                                           \\\n      if ((dl)->count <= 0) {                                                  \\\n        _el->prev   = 0;                                                       \\\n        (dl)->first = (dl)->last = _el;                                        \\\n        (dl)->count              = 1;                                          \\\n      } else {                                                                 \\\n        _el->prev        = (dl)->last;                                         \\\n        (dl)->last->next = _el;                                                \\\n      
  (dl)->last       = _el;                                                \\\n        (dl)->count++;                                                         \\\n      }                                                                        \\\n    }                                                                          \\\n  }\n\n#define dl_prepend(type, dl, data)                                             \\\n  {                                                                            \\\n    dl_el* _el = (dl_el*)malloc(sizeof(dl_el) + sizeof(type));                 \\\n    if (!_el) {                                                                \\\n      printf(\"Out of memory!!\\n\");                                             \\\n    } else {                                                                   \\\n      memcpy(_el + 1, &(data), sizeof(type));                                  \\\n      _el->prev = 0;                                                           \\\n      if ((dl)->count <= 0) {                                                  \\\n        _el->next   = 0;                                                       \\\n        (dl)->first = (dl)->last = _el;                                        \\\n        (dl)->count              = 1;                                          \\\n      } else {                                                                 \\\n        _el->next         = (dl)->first;                                       \\\n        (dl)->first->prev = _el;                                               \\\n        (dl)->first       = _el;                                               \\\n        (dl)->count++;                                                         \\\n      }                                                                        \\\n    }                                                                          \\\n  }\n\n#define dl_free(dl)                                                            
\\\n  {                                                                            \\\n    dl_clear(dl);                                                              \\\n    free(dl);                                                                  \\\n    dl = 0;                                                                    \\\n  }\n\n#define dl_duplicate(dest, src, type)                                          \\\n  {                                                                            \\\n    dest = dl_alloc();                                                         \\\n    type _data_el;                                                             \\\n    dl_forall(type, src, _data_el) { dl_append(type, dest, _data_el); }        \\\n    dl_endfor;                                                                 \\\n  }\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/err.c",
    "content": "#include <stdio.h>\n#include <stdlib.h>\n\n/**************************************************************************/\n/*\n  print error message and continue\n*/\n\nvoid  err_msg(\nchar* msg\n)\n{\n  fprintf(stderr, \"%s\\n\", msg);\n}\n\n/**************************************************************************/\n/*\n  print error message and  exit\n*/\n\nvoid  err_exit(\nchar* msg\n)\n{\n  fprintf(stderr, \"%s\\n\", msg);\n  exit(1);\n}\n\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/err.h",
    "content": "#ifndef _ERR_H_\n#define _ERR_H_\n\nvoid err_msg(char* msg);\n\nvoid err_exit(char* msg);\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/flute-ckt.c",
    "content": "#include <stdio.h>\n#include <stdlib.h>\n#include \"bookshelf_IO.h\"\n#include \"memAlloc.h\"\n#include \"flute.h\"\n#include \"string.h\"\n\nfloat HPwl();\nfloat flutewl();\n\nint main (int argc, char *argv[])\n{\n    char auxFile[BUFFERSIZE], benchmarkPath[BUFFERSIZE], placefile[BUFFERSIZE];\n\n    if(argc != 4) {\n        printf(\"Usage: %s <benchmark_dir> <aux_file> <placement_file>\\n\",\n               argv[0]);\n        printf(\"    <benchmark_dir> is the benchmark file directory.\\n\");\n        printf(\"    <aux_file> is the bookshelf format auxiliary file\");\n        printf(\" (assume in <benchmark_dir>).\\n\");\n        printf(\"    <placement_file> is the placement file\");\n        printf(\" (assume in current directory).\\n\");\n        exit(1);\n    }    \n\n    strcpy(benchmarkPath, argv[1]);\n    strcpy(auxFile, argv[2]);\n    strcpy(placefile, argv[3]);\n\n    readAuxFile(benchmarkPath, auxFile);\n    createHash(benchmarkPath, nodesFile);\n    readNodesFile(benchmarkPath, nodesFile);\n    readNetsFile(benchmarkPath, netsFile);\n    readPlFile(\".\", placefile);\n    freeHash();\n\n    readLUT();\n\n    printf(\"Half-perimeter wirelength: %.2f\\n\", HPwl());\n    printf(\"FLUTE wirelength         : %.2f\\n\", flutewl());\n}\n\nfloat HPwl()\n{\n    float totx, toty, xu, xl, yu, yl, xOffset, yOffset;\n    int i, j, k;\n\n    totx = 0.0; toty = 0.0;\n    for (j=1; j<=numNets; j++) {\n        xl = yl = 1e9;\n        xu = yu = -1e9;\n        for (k=netlistIndex[j]; k<netlistIndex[j+1]; k++) {\n            i = netlist[k];\n            xOffset = xPinOffset[k];\n            yOffset = yPinOffset[k];\n\n            if (xCellCoord[i]+xOffset < xl) xl = xCellCoord[i]+xOffset;\n            if (xu < xCellCoord[i]+xOffset) xu = xCellCoord[i]+xOffset;\n            if (yCellCoord[i]+yOffset < yl) yl = yCellCoord[i]+yOffset;\n            if (yu < yCellCoord[i]+yOffset) yu = yCellCoord[i]+yOffset;\n        }\n        totx += xu - xl;\n        toty += 
yu - yl;\n    }\n    \n    return totx + toty;\n}\n\nfloat flutewl()\n{\n    Tree t;\n    DTYPE totwl;\n    DTYPE x[MAXD], y[MAXD];\n    float xOffset, yOffset;\n    int i, j, k, r, d;\n\n    totwl = 0;\n    for (j=1; j<=numNets; j++) {\n        d = netlistIndex[j+1] - netlistIndex[j];\n        k = netlistIndex[j]; \n        for (r=0; r<d; r++) {\n            i = netlist[k+r];\n            xOffset = xPinOffset[k+r];\n            yOffset = yPinOffset[k+r];\n            x[r] = (DTYPE) xCellCoord[i]+xOffset;\n            y[r] = (DTYPE) yCellCoord[i]+yOffset;\n        }\n#if ROUTING==1\n        t = flute(d, x, y, ACCURACY); totwl += t.length;\n#else        \n        totwl += flute_wl(d, x, y, ACCURACY);\n#endif        \n    }\n    \n    return (float) totwl;\n}    \n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/flute-net.c",
    "content": "#include <stdio.h>\n#include <stdlib.h>\n#include \"flute.h\"\n\nint main()\n{\n    int d=0;\n    int x[MAXD], y[MAXD];\n    Tree flutetree;\n    int flutewl;\n    \n    while (!feof(stdin)) {\n        scanf(\"%d %d\\n\", &x[d], &y[d]);\n        d++;\n    }\n    readLUT();\n\n    flutetree = flute(d, x, y, ACCURACY);\n    printf(\"FLUTE wirelength = %d\\n\", flutetree.length);\n\n    flutewl = flute_wl(d, x, y, ACCURACY);\n    printf(\"FLUTE wirelength (without RSMT construction) = %d\\n\", flutewl);\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/flute.h",
    "content": "#ifndef _FLUTE_H_\n#define _FLUTE_H_\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <limits.h>\n#include <math.h>\n//#include \"flute_mst.h\"\n\n/*****************************/\n/*  User-Defined Parameters  */\n/*****************************/\n#define MAXD 1000              // max. degree that can be handled\n#define ACCURACY 10            // Default accuracy\n#define ROUTING 1              // 1 to construct routing, 0 to estimate WL only\n#define LOCAL_REFINEMENT 1     // Suggestion: Set to 1 if ACCURACY >= 5\n#define REMOVE_DUPLICATE_PIN 1 // Remove dup. pin for flute_wl() & flute()\n\n#ifndef DTYPE // Data type for distance\n#define DTYPE int\n#endif\n\n/*****************************/\n/*  User-Callable Functions  */\n/*****************************/\n// void readLUT();\n// DTYPE flute_wl(int d, DTYPE x[], DTYPE y[], int acc);\n// DTYPE flutes_wl(int d, DTYPE xs[], DTYPE ys[], int s[], int acc);\n// Tree flute(int d, DTYPE x[], DTYPE y[], int acc);\n// Tree flutes(int d, DTYPE xs[], DTYPE ys[], int s[], int acc);\n// DTYPE wirelength(Tree t);\n// void printtree(Tree t);\n// void plottree(Tree t);\n\n/*************************************/\n/* Internal Parameters and Functions */\n/*************************************/\n#define POWVFILE \"/POWV9.dat\" // LUT for POWV (Wirelength Vector)\n#define POSTFILE \"/POST9.dat\" // LUT for POST (Steiner Tree)\n#define D 9                   // LUT is used for d <= D, D <= 9\n#define TAU(A) (8 + 1.3 * (A))\n#define D1(A) (25 + 120 / ((A) * (A))) // flute_mr is used for D1 < d <= D2\n#define D2(A) ((A) <= 6 ? 
500 : 75 + 5 * (A))\n\ntypedef struct {\n  DTYPE x, y; // starting point of the branch\n  int n;      // index of neighbor\n} Branch;\n\ntypedef struct {\n  int deg;        // degree\n  DTYPE length;   // total wirelength\n  Branch* branch; // array of tree branches\n} Tree;\n\n#if REMOVE_DUPLICATE_PIN == 1\n#define flutes_wl(d, xs, ys, s, acc) flutes_wl_RDP(d, xs, ys, s, acc)\n#define flutes(d, xs, ys, s, acc) flutes_RDP(d, xs, ys, s, acc)\n#else\n#define flutes_wl(d, xs, ys, s, acc) flutes_wl_ALLD(d, xs, ys, s, acc)\n#define flutes(d, xs, ys, s, acc) flutes_ALLD(d, xs, ys, s, acc)\n#endif\n\n#define flutes_wl_ALLD(d, xs, ys, s, acc) flutes_wl_LMD(d, xs, ys, s, acc)\n#define flutes_ALLD(d, xs, ys, s, acc)                                         \\\n  (d <= D ? flutes_LD(d, xs, ys, s) : flutes_MD(d, xs, ys, s, acc))\n//          : (d<=D1(acc) ? flutes_MD(d, xs, ys, s, acc)\n//                        : flutes_HD(d, xs, ys, s, acc)))\n\n#define flutes_wl_LMD(d, xs, ys, s, acc)                                       \\\n  (d <= D ? flutes_wl_LD(d, xs, ys, s) : flutes_wl_MD(d, xs, ys, s, acc))\n#define flutes_LMD(d, xs, ys, s, acc)                                          \\\n  (d <= D ? flutes_LD(d, xs, ys, s) : flutes_MD(d, xs, ys, s, acc))\n\n//#define max(x,y) ((x)>(y)?(x):(y))\n//#define min(x,y) ((x)<(y)?(x):(y))\n// to work around max conflict with bitmap\n//#define abs(x) ((x)<0?(-x):(x))\nusing namespace std;\n#define ADIFF(x, y) ((x) > (y) ? (x - y) : (y - x)) // Absolute difference\n\n#if D <= 7\n#define MGROUP 5040 / 4 // Max. # of groups, 7! = 5040\n#define MPOWV 15        // Max. # of POWVs per group\n#elif D == 8\n#define MGROUP 40320 / 4 // Max. # of groups, 8! = 40320\n#define MPOWV 33         // Max. # of POWVs per group\n#elif D == 9\n#define MGROUP 362880 / 4 // Max. # of groups, 9! = 362880\n#define MPOWV 79          // Max. 
# of POWVs per group\n#endif\nint numgrp[10] = {0, 0, 0, 0, 6, 30, 180, 1260, 10080, 90720};\n\nstruct csoln {\n  unsigned char parent;\n  unsigned char seg[11];       // Add: 0..i, Sub: j..10; seg[i+1]=seg[j-1]=0\n  unsigned char rowcol[D - 2]; // row = rowcol[]/16, col = rowcol[]%16,\n  unsigned char neighbor[2 * D - 2];\n};\nstruct csoln* LUT[D + 1][MGROUP]; // storing 4 .. D\nint numsoln[D + 1][MGROUP];\n\ntypedef struct node_pair_s { // pair of nodes representing an edge\n  int node1, node2;\n} node_pair;\nnode_pair* heap;\n\nstruct point {\n  DTYPE x, y;\n  int o;\n};\n\nvoid readLUT();\nDTYPE flute_wl(int d, DTYPE x[], DTYPE y[], int acc);\nDTYPE flutes_wl_LD(int d, DTYPE xs[], DTYPE ys[], int s[]);\nDTYPE flutes_wl_MD(int d, DTYPE xs[], DTYPE ys[], int s[], int acc);\nDTYPE flutes_wl_RDP(int d, DTYPE xs[], DTYPE ys[], int s[], int acc);\nTree flute(int d, DTYPE x[], DTYPE y[], int acc);\nTree flutes_LD(int d, DTYPE xs[], DTYPE ys[], int s[]);\nTree flutes_MD(int d, DTYPE xs[], DTYPE ys[], int s[], int acc);\nTree flutes_RDP(int d, DTYPE xs[], DTYPE ys[], int s[], int acc);\nTree dmergetree(Tree t1, Tree t2);\nTree hmergetree(Tree t1, Tree t2, int s[]);\nTree vmergetree(Tree t1, Tree t2);\nvoid local_refinement(Tree* tp, int p);\nDTYPE wirelength(Tree t);\nvoid printtree(Tree t);\nvoid plottree(Tree t);\n\n#define MAX_HEAP_SIZE (MAXD * 2)\nint max_heap_size = MAX_HEAP_SIZE;\nvoid init_param() {\n  heap = (node_pair*)malloc(sizeof(node_pair) * (max_heap_size + 1));\n}\n\n[[noreturn]] void abort_with_message(std::string message) noexcept {\n  std::cerr << message << std::endl;\n  std::abort();\n}\n\nvoid readLUT(const char* fluteDir) {\n  unsigned char charnum[256], line[32], *linep, c;\n  FILE *fpwv, *fprt;\n  struct csoln* p;\n  int d, i, j, k, kk, ns, nn;\n\n  init_param();\n\n  for (i = 0; i <= 255; i++) {\n    if ('0' <= i && i <= '9')\n      charnum[i] = i - '0';\n    else if (i >= 'A')\n      charnum[i] = i - 'A' + 10;\n    else // if (i=='$' || 
i=='\\n' || ... )\n      charnum[i] = 0;\n  }\n\n  string powvfile, postfile;\n  powvfile = fluteDir;\n  powvfile += POWVFILE;\n  postfile = fluteDir;\n  postfile += POSTFILE;\n\n  fpwv = fopen(powvfile.c_str(), \"r\");\n  if (fpwv == NULL) {\n    printf(\"Error in opening POWV: %s\\n\", powvfile.c_str());\n    exit(1);\n  }\n\n#if ROUTING == 1\n  fprt = fopen(postfile.c_str(), \"r\");\n  if (fprt == NULL) {\n    printf(\"Error in opening POST: %s\\n\", postfile.c_str());\n    exit(1);\n  }\n#endif\n\n  for (d = 4; d <= D; d++) {\n    if (fscanf(fpwv, \"d=%d\\n\", &d) != 1)\n      abort_with_message(\"Unable to get needed info from POWV.\");\n#if ROUTING == 1\n    if (fscanf(fprt, \"d=%d\\n\", &d) != 1)\n      abort_with_message(\"Unable to get needed info from POST.\");\n#endif\n    for (k = 0; k < numgrp[d]; k++) {\n      ns = (int)charnum[fgetc(fpwv)];\n\n      if (ns == 0) { // same as some previous group\n        if (fscanf(fpwv, \"%d\\n\", &kk) != 1)\n          abort_with_message(\"Unable to get needed info from POWV.\");\n        numsoln[d][k] = numsoln[d][kk];\n        LUT[d][k]     = LUT[d][kk];\n      } else {\n        fgetc(fpwv); // '\\n'\n        numsoln[d][k] = ns;\n        p             = (struct csoln*)malloc(ns * sizeof(struct csoln));\n        LUT[d][k]     = p;\n        for (i = 1; i <= ns; i++) {\n          linep     = (unsigned char*)fgets((char*)line, 32, fpwv);\n          p->parent = charnum[*(linep++)];\n          j         = 0;\n          while ((p->seg[j++] = charnum[*(linep++)]) != 0)\n            ;\n          j = 10;\n          while ((p->seg[j--] = charnum[*(linep++)]) != 0)\n            ;\n#if ROUTING == 1\n          nn = 2 * d - 2;\n          if (!fread(line, 1, d - 2, fprt))\n            abort_with_message(\"Unable to get needed info from POST.\");\n          linep = line;\n          for (j = d; j < nn; j++) {\n            c                = charnum[*(linep++)];\n            p->rowcol[j - d] = c;\n          }\n          if 
(!fread(line, 1, nn / 2 + 1, fprt))\n            abort_with_message(\"Unable to get needed info from POST.\");\n          linep = line; // last char \\n\n          for (j = 0; j < nn;) {\n            c                = *(linep++);\n            p->neighbor[j++] = c / 16;\n            p->neighbor[j++] = c % 16;\n          }\n#endif\n          p++;\n        }\n      }\n    }\n  }\n}\n\nvoid readLUT() {\n  unsigned char charnum[256], line[32], *linep, c;\n  FILE *fpwv, *fprt;\n  struct csoln* p;\n  int d, i, j, k, kk, ns, nn;\n\n  init_param();\n\n  for (i = 0; i <= 255; i++) {\n    if ('0' <= i && i <= '9')\n      charnum[i] = i - '0';\n    else if (i >= 'A')\n      charnum[i] = i - 'A' + 10;\n    else // if (i=='$' || i=='\\n' || ... )\n      charnum[i] = 0;\n  }\n\n  fpwv = fopen(POWVFILE, \"r\");\n  if (fpwv == NULL) {\n    printf(\"Error in opening %s\\n\", POWVFILE);\n    exit(1);\n  }\n\n#if ROUTING == 1\n  fprt = fopen(POSTFILE, \"r\");\n  if (fprt == NULL) {\n    printf(\"Error in opening %s\\n\", POSTFILE);\n    exit(1);\n  }\n#endif\n\n  for (d = 4; d <= D; d++) {\n    if (fscanf(fpwv, \"d=%d\\n\", &d) != 1)\n      abort_with_message(\"Unable to get needed info from POWV.\");\n#if ROUTING == 1\n    if (fscanf(fprt, \"d=%d\\n\", &d) != 1)\n      abort_with_message(\"Unable to get needed info from POST.\");\n#endif\n    for (k = 0; k < numgrp[d]; k++) {\n      ns = (int)charnum[fgetc(fpwv)];\n\n      if (ns == 0) { // same as some previous group\n        if (fscanf(fpwv, \"%d\\n\", &kk) != 1)\n          abort_with_message(\"Unable to get needed info from POWV.\");\n        numsoln[d][k] = numsoln[d][kk];\n        LUT[d][k]     = LUT[d][kk];\n      } else {\n        fgetc(fpwv); // '\\n'\n        numsoln[d][k] = ns;\n        p             = (struct csoln*)malloc(ns * sizeof(struct csoln));\n        LUT[d][k]     = p;\n        for (i = 1; i <= ns; i++) {\n          linep     = (unsigned char*)fgets((char*)line, 32, fpwv);\n          p->parent = 
charnum[*(linep++)];\n          j         = 0;\n          while ((p->seg[j++] = charnum[*(linep++)]) != 0)\n            ;\n          j = 10;\n          while ((p->seg[j--] = charnum[*(linep++)]) != 0)\n            ;\n#if ROUTING == 1\n          nn = 2 * d - 2;\n          if (!fread(line, 1, d - 2, fprt))\n            abort_with_message(\"Unable to get needed info from POST.\");\n          linep = line;\n          for (j = d; j < nn; j++) {\n            c                = charnum[*(linep++)];\n            p->rowcol[j - d] = c;\n          }\n          if (!fread(line, 1, nn / 2 + 1, fprt))\n            abort_with_message(\"Unable to get needed info from POST.\");\n          linep = line; // last char \\n\n          for (j = 0; j < nn;) {\n            c                = *(linep++);\n            p->neighbor[j++] = c / 16;\n            p->neighbor[j++] = c % 16;\n          }\n#endif\n          p++;\n        }\n      }\n    }\n  }\n}\n\nDTYPE flute_wl(int d, DTYPE x[], DTYPE y[], int acc) {\n  DTYPE xs[MAXD], ys[MAXD], minval, l, xu, xl, yu, yl;\n  int s[MAXD];\n  int i, j, k, minidx;\n  struct point pt[MAXD], *ptp[MAXD], *tmpp;\n\n  if (d == 2)\n    l = ADIFF(x[0], x[1]) + ADIFF(y[0], y[1]);\n  else if (d == 3) {\n    if (x[0] > x[1]) {\n      xu = max(x[0], x[2]);\n      xl = min(x[1], x[2]);\n    } else {\n      xu = max(x[1], x[2]);\n      xl = min(x[0], x[2]);\n    }\n    if (y[0] > y[1]) {\n      yu = max(y[0], y[2]);\n      yl = min(y[1], y[2]);\n    } else {\n      yu = max(y[1], y[2]);\n      yl = min(y[0], y[2]);\n    }\n    l = (xu - xl) + (yu - yl);\n  } else {\n    for (i = 0; i < d; i++) {\n      pt[i].x = x[i];\n      pt[i].y = y[i];\n      ptp[i]  = &pt[i];\n    }\n\n    // sort x\n    for (i = 0; i < d - 1; i++) {\n      minval = ptp[i]->x;\n      minidx = i;\n      for (j = i + 1; j < d; j++) {\n        if (minval > ptp[j]->x) {\n          minval = ptp[j]->x;\n          minidx = j;\n        }\n      }\n      tmpp        = ptp[i];\n      ptp[i]      = 
ptp[minidx];\n      ptp[minidx] = tmpp;\n    }\n\n#if REMOVE_DUPLICATE_PIN == 1\n    ptp[d]    = &pt[d];\n    ptp[d]->x = ptp[d]->y = -999999;\n    j                     = 0;\n    for (i = 0; i < d; i++) {\n      for (k = i + 1; ptp[k]->x == ptp[i]->x; k++)\n        if (ptp[k]->y == ptp[i]->y) // pins k and i are the same\n          break;\n      if (ptp[k]->x != ptp[i]->x)\n        ptp[j++] = ptp[i];\n    }\n    d = j;\n#endif\n\n    for (i = 0; i < d; i++) {\n      xs[i]     = ptp[i]->x;\n      ptp[i]->o = i;\n    }\n\n    // sort y to find s[]\n    for (i = 0; i < d - 1; i++) {\n      minval = ptp[i]->y;\n      minidx = i;\n      for (j = i + 1; j < d; j++) {\n        if (minval > ptp[j]->y) {\n          minval = ptp[j]->y;\n          minidx = j;\n        }\n      }\n      ys[i]       = ptp[minidx]->y;\n      s[i]        = ptp[minidx]->o;\n      ptp[minidx] = ptp[i];\n    }\n    ys[d - 1] = ptp[d - 1]->y;\n    s[d - 1]  = ptp[d - 1]->o;\n\n    l = flutes_wl(d, xs, ys, s, acc);\n  }\n  return l;\n}\n\n// xs[] and ys[] are coords in x and y in sorted order\n// s[] is a list of nodes in increasing y direction\n//   if nodes are indexed in the order of increasing x coord\n//   i.e., s[i] = s_i as defined in paper\n// The points are (xs[s[i]], ys[i]) for i=0..d-1\n//             or (xs[i], ys[si[i]]) for i=0..d-1\n\nDTYPE flutes_wl_RDP(int d, DTYPE xs[], DTYPE ys[], int s[], int acc) {\n  int i, j, ss;\n\n  for (i = 0; i < d - 1; i++) {\n    if (xs[s[i]] == xs[s[i + 1]] && ys[i] == ys[i + 1]) {\n      if (s[i] < s[i + 1])\n        ss = s[i + 1];\n      else {\n        ss   = s[i];\n        s[i] = s[i + 1];\n      }\n      for (j = i + 2; j < d; j++) {\n        ys[j - 1] = ys[j];\n        s[j - 1]  = s[j];\n      }\n      for (j = ss + 1; j < d; j++)\n        xs[j - 1] = xs[j];\n      for (j = 0; j <= d - 2; j++)\n        if (s[j] > ss)\n          s[j]--;\n      i--;\n      d--;\n    }\n  }\n  return flutes_wl_ALLD(d, xs, ys, s, acc);\n}\n\n// For low-degree, i.e., 2 
<= d <= D\nDTYPE flutes_wl_LD(int d, DTYPE xs[], DTYPE ys[], int s[]) {\n  int k, pi, i, j;\n  struct csoln* rlist;\n  DTYPE dd[2 * D - 2]; // 0..D-2 for v, D-1..2*D-3 for h\n  DTYPE minl, sum, l[MPOWV + 1];\n\n  if (d <= 3)\n    minl = xs[d - 1] - xs[0] + ys[d - 1] - ys[0];\n  else {\n    k = 0;\n    if (s[0] < s[2])\n      k++;\n    if (s[1] < s[2])\n      k++;\n\n    for (i = 3; i <= d - 1; i++) { // p0=0 always, skip i=1 for symmetry\n      pi = s[i];\n      for (j = d - 1; j > i; j--)\n        if (s[j] < s[i])\n          pi--;\n      k = pi + (i + 1) * k;\n    }\n\n    if (k < numgrp[d]) // no horizontal flip\n      for (i = 1; i <= d - 3; i++) {\n        dd[i]         = ys[i + 1] - ys[i];\n        dd[d - 1 + i] = xs[i + 1] - xs[i];\n      }\n    else {\n      k = 2 * numgrp[d] - 1 - k;\n      for (i = 1; i <= d - 3; i++) {\n        dd[i]         = ys[i + 1] - ys[i];\n        dd[d - 1 + i] = xs[d - 1 - i] - xs[d - 2 - i];\n      }\n    }\n\n    minl = l[0] = xs[d - 1] - xs[0] + ys[d - 1] - ys[0];\n    rlist       = LUT[d][k];\n    for (i = 0; rlist->seg[i] > 0; i++)\n      minl += dd[rlist->seg[i]];\n\n    l[1] = minl;\n    j    = 2;\n    while (j <= numsoln[d][k]) {\n      rlist++;\n      sum = l[rlist->parent];\n      for (i = 0; rlist->seg[i] > 0; i++)\n        sum += dd[rlist->seg[i]];\n      for (i = 10; rlist->seg[i] > 0; i--)\n        sum -= dd[rlist->seg[i]];\n      minl   = min(minl, sum);\n      l[j++] = sum;\n    }\n  }\n\n  return minl;\n}\n\n// For medium-degree, i.e., D+1 <= d\nDTYPE flutes_wl_MD(int d, DTYPE xs[], DTYPE ys[], int s[], int acc) {\n  DTYPE x1[MAXD], x2[MAXD], y1[MAXD], y2[MAXD];\n  int si[MAXD], s1[MAXD], s2[MAXD];\n  float score[2 * MAXD], penalty[MAXD], pnlty, dx, dy;\n  DTYPE ll, minl, extral = 0;\n  int i, r, p, maxbp, nbp, bp, ub, lb, n1, n2, newacc;\n  int ms, mins, maxs, minsi, maxsi;\n  DTYPE distx[MAXD], disty[MAXD], xydiff;\n\n  if (s[0] < s[d - 1]) {\n    ms = max(s[0], s[1]);\n    for (i = 2; i <= ms; i++)\n      ms = 
max(ms, s[i]);\n    if (ms <= d - 3) {\n      for (i = 0; i <= ms; i++) {\n        x1[i] = xs[i];\n        y1[i] = ys[i];\n        s1[i] = s[i];\n      }\n      x1[ms + 1] = xs[ms];\n      y1[ms + 1] = ys[ms];\n      s1[ms + 1] = ms + 1;\n\n      s2[0] = 0;\n      for (i = 1; i <= d - 1 - ms; i++)\n        s2[i] = s[i + ms] - ms;\n\n      return flutes_wl_LMD(ms + 2, x1, y1, s1, acc) +\n             flutes_wl_LMD(d - ms, xs + ms, ys + ms, s2, acc);\n    }\n  } else { // (s[0] > s[d-1])\n    ms = min(s[0], s[1]);\n    for (i = 2; i <= d - 1 - ms; i++)\n      ms = min(ms, s[i]);\n    if (ms >= 2) {\n      x1[0] = xs[ms];\n      y1[0] = ys[0];\n      s1[0] = s[0] - ms + 1;\n      for (i = 1; i <= d - 1 - ms; i++) {\n        x1[i] = xs[i + ms - 1];\n        y1[i] = ys[i];\n        s1[i] = s[i] - ms + 1;\n      }\n      x1[d - ms] = xs[d - 1];\n      y1[d - ms] = ys[d - 1 - ms];\n      s1[d - ms] = 0;\n\n      s2[0] = ms;\n      for (i = 1; i <= ms; i++)\n        s2[i] = s[i + d - 1 - ms];\n\n      return flutes_wl_LMD(d + 1 - ms, x1, y1, s1, acc) +\n             flutes_wl_LMD(ms + 1, xs, ys + d - 1 - ms, s2, acc);\n    }\n  }\n\n  // Find inverse si[] of s[]\n  for (r = 0; r < d; r++)\n    si[s[r]] = r;\n\n  // Determine breaking directions and positions dp[]\n  lb = (d - 2 * acc + 2) / 4;\n  if (lb < 2)\n    lb = 2;\n  ub = d - 1 - lb;\n\n// Compute scores\n#define AAWL 0.6\n#define BBWL 0.3\n  float CCWL = 7.4 / ((d + 10.) 
* (d - 3.));\n  float DDWL = 4.8 / (d - 1);\n\n  // Compute penalty[]\n  dx = CCWL * (xs[d - 2] - xs[1]);\n  dy = CCWL * (ys[d - 2] - ys[1]);\n  for (r = d / 2, pnlty = 0; r >= 0; r--, pnlty += dx)\n    penalty[r] = pnlty, penalty[d - 1 - r] = pnlty;\n  for (r = d / 2 - 1, pnlty = dy; r >= 0; r--, pnlty += dy)\n    penalty[s[r]] += pnlty, penalty[s[d - 1 - r]] += pnlty;\n  //#define CCWL 0.16\n  //    for (r=0; r<d; r++)\n  //        penalty[r] = abs(d-1-r-r)*dx + abs(d-1-si[r]-si[r])*dy;\n\n  // Compute distx[], disty[]\n  xydiff = (xs[d - 1] - xs[0]) - (ys[d - 1] - ys[0]);\n  if (s[0] < s[1])\n    mins = s[0], maxs = s[1];\n  else\n    mins = s[1], maxs = s[0];\n  if (si[0] < si[1])\n    minsi = si[0], maxsi = si[1];\n  else\n    minsi = si[1], maxsi = si[0];\n  for (r = 2; r <= ub; r++) {\n    if (s[r] < mins)\n      mins = s[r];\n    else if (s[r] > maxs)\n      maxs = s[r];\n    distx[r] = xs[maxs] - xs[mins];\n    if (si[r] < minsi)\n      minsi = si[r];\n    else if (si[r] > maxsi)\n      maxsi = si[r];\n    disty[r] = ys[maxsi] - ys[minsi] + xydiff;\n  }\n\n  if (s[d - 2] < s[d - 1])\n    mins = s[d - 2], maxs = s[d - 1];\n  else\n    mins = s[d - 1], maxs = s[d - 2];\n  if (si[d - 2] < si[d - 1])\n    minsi = si[d - 2], maxsi = si[d - 1];\n  else\n    minsi = si[d - 1], maxsi = si[d - 2];\n  for (r = d - 3; r >= lb; r--) {\n    if (s[r] < mins)\n      mins = s[r];\n    else if (s[r] > maxs)\n      maxs = s[r];\n    distx[r] += xs[maxs] - xs[mins];\n    if (si[r] < minsi)\n      minsi = si[r];\n    else if (si[r] > maxsi)\n      maxsi = si[r];\n    disty[r] += ys[maxsi] - ys[minsi];\n  }\n\n  nbp = 0;\n  for (r = lb; r <= ub; r++) {\n    if (si[r] == 0 || si[r] == d - 1)\n      score[nbp] = (xs[r + 1] - xs[r - 1]) - penalty[r] -\n                   AAWL * (ys[d - 2] - ys[1]) - DDWL * disty[r];\n    else\n      score[nbp] = (xs[r + 1] - xs[r - 1]) - penalty[r] -\n                   BBWL * (ys[si[r] + 1] - ys[si[r] - 1]) - DDWL * disty[r];\n    nbp++;\n\n    
if (s[r] == 0 || s[r] == d - 1)\n      score[nbp] = (ys[r + 1] - ys[r - 1]) - penalty[s[r]] -\n                   AAWL * (xs[d - 2] - xs[1]) - DDWL * distx[r];\n    else\n      score[nbp] = (ys[r + 1] - ys[r - 1]) - penalty[s[r]] -\n                   BBWL * (xs[s[r] + 1] - xs[s[r] - 1]) - DDWL * distx[r];\n    nbp++;\n  }\n\n  if (acc <= 3)\n    newacc = 1;\n  else {\n    newacc = acc / 2;\n    if (acc >= nbp)\n      acc = nbp - 1;\n  }\n\n  minl = (DTYPE)INT_MAX;\n  for (i = 0; i < acc; i++) {\n    maxbp = 0;\n    for (bp = 1; bp < nbp; bp++)\n      if (score[maxbp] < score[bp])\n        maxbp = bp;\n    score[maxbp] = -9e9;\n\n#define BreakPt(bp) ((bp) / 2 + lb)\n#define BreakInX(bp) ((bp) % 2 == 0)\n    p = BreakPt(maxbp);\n    // Breaking in p\n    if (BreakInX(maxbp)) { // break in x\n      n1 = n2 = 0;\n      for (r = 0; r < d; r++) {\n        if (s[r] < p) {\n          s1[n1] = s[r];\n          y1[n1] = ys[r];\n          n1++;\n        } else if (s[r] > p) {\n          s2[n2] = s[r] - p;\n          y2[n2] = ys[r];\n          n2++;\n        } else { // if (s[r] == p)  i.e.,  r = si[p]\n          s1[n1] = p;\n          s2[n2] = 0;\n          if (r == d - 1 || r == d - 2) {\n            y1[n1] = y2[n2] = ys[r - 1];\n            extral          = ys[r] - ys[r - 1];\n          }\n          if (r == 0 || r == 1) {\n            y1[n1] = y2[n2] = ys[r + 1];\n            extral          = ys[r + 1] - ys[r];\n          } else {\n            y1[n1] = y2[n2] = ys[r];\n            extral          = 0;\n          }\n          n1++;\n          n2++;\n        }\n      }\n      ll = extral + flutes_wl_LMD(p + 1, xs, y1, s1, newacc) +\n           flutes_wl_LMD(d - p, xs + p, y2, s2, newacc);\n    } else { // if (!BreakInX(maxbp))\n      n1 = n2 = 0;\n      for (r = 0; r < d; r++) {\n        if (si[r] < p) {\n          s1[si[r]] = n1;\n          x1[n1]    = xs[r];\n          n1++;\n        } else if (si[r] > p) {\n          s2[si[r] - p] = n2;\n          x2[n2]        = 
xs[r];\n          n2++;\n        } else { // if (si[r] == p)  i.e.,  r = s[p]\n          s1[p] = n1;\n          s2[0] = n2;\n          if (r == d - 1 || r == d - 2) {\n            x1[n1] = x2[n2] = xs[r - 1];\n            extral          = xs[r] - xs[r - 1];\n          }\n          if (r == 0 || r == 1) {\n            x1[n1] = x2[n2] = xs[r + 1];\n            extral          = xs[r + 1] - xs[r];\n          } else {\n            x1[n1] = x2[n2] = xs[r];\n            extral          = 0;\n          }\n          n1++;\n          n2++;\n        }\n      }\n      ll = extral + flutes_wl_LMD(p + 1, x1, ys, s1, newacc) +\n           flutes_wl_LMD(d - p, x2, ys + p, s2, newacc);\n    }\n    if (minl > ll)\n      minl = ll;\n  }\n  return minl;\n}\n\nstatic int orderx(const void* a, const void* b) {\n  struct point *pa, *pb;\n\n  pa = *(struct point**)a;\n  pb = *(struct point**)b;\n\n  if (pa->x < pb->x)\n    return -1;\n  if (pa->x > pb->x)\n    return 1;\n  return 0;\n}\n\nstatic int ordery(const void* a, const void* b) {\n  struct point *pa, *pb;\n\n  pa = *(struct point**)a;\n  pb = *(struct point**)b;\n\n  if (pa->y < pb->y)\n    return -1;\n  if (pa->y > pb->y)\n    return 1;\n  return 0;\n}\n\nTree flute(int d, DTYPE x[], DTYPE y[], int acc) {\n  DTYPE *xs, *ys, minval;\n  int* s;\n  int i, j, k, minidx;\n  struct point *pt, **ptp, *tmpp;\n  Tree t;\n\n  if (d == 2) {\n    t.deg         = 2;\n    t.length      = ADIFF(x[0], x[1]) + ADIFF(y[0], y[1]);\n    t.branch      = (Branch*)malloc(2 * sizeof(Branch));\n    t.branch[0].x = x[0];\n    t.branch[0].y = y[0];\n    t.branch[0].n = 1;\n    t.branch[1].x = x[1];\n    t.branch[1].y = y[1];\n    t.branch[1].n = 1;\n  } else {\n    xs  = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n    ys  = (DTYPE*)malloc(sizeof(DTYPE) * (d));\n    s   = (int*)malloc(sizeof(int) * (d));\n    pt  = (struct point*)malloc(sizeof(struct point) * (d + 1));\n    ptp = (struct point**)malloc(sizeof(struct point*) * (d + 1));\n\n    for (i = 0; i < d; 
i++) {\n      pt[i].x = x[i];\n      pt[i].y = y[i];\n      ptp[i]  = &pt[i];\n    }\n\n    // sort x\n    if (d < 200) {\n      for (i = 0; i < d - 1; i++) {\n        minval = ptp[i]->x;\n        minidx = i;\n        for (j = i + 1; j < d; j++) {\n          if (minval > ptp[j]->x) {\n            minval = ptp[j]->x;\n            minidx = j;\n          }\n        }\n        tmpp        = ptp[i];\n        ptp[i]      = ptp[minidx];\n        ptp[minidx] = tmpp;\n      }\n    } else {\n      qsort(ptp, d, sizeof(struct point*), orderx);\n    }\n\n#if REMOVE_DUPLICATE_PIN == 1\n    ptp[d]    = &pt[d];\n    ptp[d]->x = ptp[d]->y = -999999;\n    j                     = 0;\n    for (i = 0; i < d; i++) {\n      for (k = i + 1; ptp[k]->x == ptp[i]->x; k++)\n        if (ptp[k]->y == ptp[i]->y) // pins k and i are the same\n          break;\n      if (ptp[k]->x != ptp[i]->x)\n        ptp[j++] = ptp[i];\n    }\n    d = j;\n#endif\n\n    for (i = 0; i < d; i++) {\n      xs[i]     = ptp[i]->x;\n      ptp[i]->o = i;\n    }\n\n    // sort y to find s[]\n    if (d < 200) {\n      for (i = 0; i < d - 1; i++) {\n        minval = ptp[i]->y;\n        minidx = i;\n        for (j = i + 1; j < d; j++) {\n          if (minval > ptp[j]->y) {\n            minval = ptp[j]->y;\n            minidx = j;\n          }\n        }\n        ys[i]       = ptp[minidx]->y;\n        s[i]        = ptp[minidx]->o;\n        ptp[minidx] = ptp[i];\n      }\n      ys[d - 1] = ptp[d - 1]->y;\n      s[d - 1]  = ptp[d - 1]->o;\n    } else {\n      qsort(ptp, d, sizeof(struct point*), ordery);\n      for (i = 0; i < d; i++) {\n        ys[i] = ptp[i]->y;\n        s[i]  = ptp[i]->o;\n      }\n    }\n\n    t = flutes(d, xs, ys, s, acc);\n\n    free(xs);\n    free(ys);\n    free(s);\n    free(pt);\n    free(ptp);\n  }\n\n  return t;\n}\n\n// xs[] and ys[] are coords in x and y in sorted order\n// s[] is a list of nodes in increasing y direction\n//   if nodes are indexed in the order of increasing x coord\n//   i.e., 
s[i] = s_i as defined in paper\n// The points are (xs[s[i]], ys[i]) for i=0..d-1\n//             or (xs[i], ys[si[i]]) for i=0..d-1\n\nTree flutes_RDP(int d, DTYPE xs[], DTYPE ys[], int s[], int acc) {\n  int i, j, ss;\n\n  for (i = 0; i < d - 1; i++) {\n    if (xs[s[i]] == xs[s[i + 1]] && ys[i] == ys[i + 1]) {\n      if (s[i] < s[i + 1])\n        ss = s[i + 1];\n      else {\n        ss   = s[i];\n        s[i] = s[i + 1];\n      }\n      for (j = i + 2; j < d; j++) {\n        ys[j - 1] = ys[j];\n        s[j - 1]  = s[j];\n      }\n      for (j = ss + 1; j < d; j++)\n        xs[j - 1] = xs[j];\n      for (j = 0; j <= d - 2; j++)\n        if (s[j] > ss)\n          s[j]--;\n      i--;\n      d--;\n    }\n  }\n  return flutes_ALLD(d, xs, ys, s, acc);\n}\n\n// For low-degree, i.e., 2 <= d <= D\nTree flutes_LD(int d, DTYPE xs[], DTYPE ys[], int s[]) {\n  int k, pi, i, j;\n  struct csoln *rlist, *bestrlist;\n  DTYPE dd[2 * D - 2]; // 0..D-2 for v, D-1..2*D-3 for h\n  DTYPE minl, sum, l[MPOWV + 1];\n  int hflip;\n  Tree t;\n\n  t.deg    = d;\n  t.branch = (Branch*)malloc((2 * d - 2) * sizeof(Branch));\n  if (d == 2) {\n    minl          = xs[1] - xs[0] + ys[1] - ys[0];\n    t.branch[0].x = xs[s[0]];\n    t.branch[0].y = ys[0];\n    t.branch[0].n = 1;\n    t.branch[1].x = xs[s[1]];\n    t.branch[1].y = ys[1];\n    t.branch[1].n = 1;\n  } else if (d == 3) {\n    minl          = xs[2] - xs[0] + ys[2] - ys[0];\n    t.branch[0].x = xs[s[0]];\n    t.branch[0].y = ys[0];\n    t.branch[0].n = 3;\n    t.branch[1].x = xs[s[1]];\n    t.branch[1].y = ys[1];\n    t.branch[1].n = 3;\n    t.branch[2].x = xs[s[2]];\n    t.branch[2].y = ys[2];\n    t.branch[2].n = 3;\n    t.branch[3].x = xs[1];\n    t.branch[3].y = ys[1];\n    t.branch[3].n = 3;\n  } else {\n    k = 0;\n    if (s[0] < s[2])\n      k++;\n    if (s[1] < s[2])\n      k++;\n\n    for (i = 3; i <= d - 1; i++) { // p0=0 always, skip i=1 for symmetry\n      pi = s[i];\n      for (j = d - 1; j > i; j--)\n        if (s[j] < 
s[i])\n          pi--;\n      k = pi + (i + 1) * k;\n    }\n\n    if (k < numgrp[d]) { // no horizontal flip\n      hflip = 0;\n      for (i = 1; i <= d - 3; i++) {\n        dd[i]         = ys[i + 1] - ys[i];\n        dd[d - 1 + i] = xs[i + 1] - xs[i];\n      }\n    } else {\n      hflip = 1;\n      k     = 2 * numgrp[d] - 1 - k;\n      for (i = 1; i <= d - 3; i++) {\n        dd[i]         = ys[i + 1] - ys[i];\n        dd[d - 1 + i] = xs[d - 1 - i] - xs[d - 2 - i];\n      }\n    }\n\n    minl = l[0] = xs[d - 1] - xs[0] + ys[d - 1] - ys[0];\n    rlist       = LUT[d][k];\n    for (i = 0; rlist->seg[i] > 0; i++)\n      minl += dd[rlist->seg[i]];\n    bestrlist = rlist;\n    l[1]      = minl;\n    j         = 2;\n    while (j <= numsoln[d][k]) {\n      rlist++;\n      sum = l[rlist->parent];\n      for (i = 0; rlist->seg[i] > 0; i++)\n        sum += dd[rlist->seg[i]];\n      for (i = 10; rlist->seg[i] > 0; i--)\n        sum -= dd[rlist->seg[i]];\n      if (sum < minl) {\n        minl      = sum;\n        bestrlist = rlist;\n      }\n      l[j++] = sum;\n    }\n\n    t.branch[0].x = xs[s[0]];\n    t.branch[0].y = ys[0];\n    t.branch[1].x = xs[s[1]];\n    t.branch[1].y = ys[1];\n    for (i = 2; i < d - 2; i++) {\n      t.branch[i].x = xs[s[i]];\n      t.branch[i].y = ys[i];\n      t.branch[i].n = bestrlist->neighbor[i];\n    }\n    t.branch[d - 2].x = xs[s[d - 2]];\n    t.branch[d - 2].y = ys[d - 2];\n    t.branch[d - 1].x = xs[s[d - 1]];\n    t.branch[d - 1].y = ys[d - 1];\n    if (hflip) {\n      if (s[1] < s[0]) {\n        t.branch[0].n = bestrlist->neighbor[1];\n        t.branch[1].n = bestrlist->neighbor[0];\n      } else {\n        t.branch[0].n = bestrlist->neighbor[0];\n        t.branch[1].n = bestrlist->neighbor[1];\n      }\n      if (s[d - 1] < s[d - 2]) {\n        t.branch[d - 2].n = bestrlist->neighbor[d - 1];\n        t.branch[d - 1].n = bestrlist->neighbor[d - 2];\n      } else {\n        t.branch[d - 2].n = bestrlist->neighbor[d - 2];\n        t.branch[d 
- 1].n = bestrlist->neighbor[d - 1];\n      }\n      for (i = d; i < 2 * d - 2; i++) {\n        t.branch[i].x = xs[d - 1 - bestrlist->rowcol[i - d] % 16];\n        t.branch[i].y = ys[bestrlist->rowcol[i - d] / 16];\n        t.branch[i].n = bestrlist->neighbor[i];\n      }\n    } else { // !hflip\n      if (s[0] < s[1]) {\n        t.branch[0].n = bestrlist->neighbor[1];\n        t.branch[1].n = bestrlist->neighbor[0];\n      } else {\n        t.branch[0].n = bestrlist->neighbor[0];\n        t.branch[1].n = bestrlist->neighbor[1];\n      }\n      if (s[d - 2] < s[d - 1]) {\n        t.branch[d - 2].n = bestrlist->neighbor[d - 1];\n        t.branch[d - 1].n = bestrlist->neighbor[d - 2];\n      } else {\n        t.branch[d - 2].n = bestrlist->neighbor[d - 2];\n        t.branch[d - 1].n = bestrlist->neighbor[d - 1];\n      }\n      for (i = d; i < 2 * d - 2; i++) {\n        t.branch[i].x = xs[bestrlist->rowcol[i - d] % 16];\n        t.branch[i].y = ys[bestrlist->rowcol[i - d] / 16];\n        t.branch[i].n = bestrlist->neighbor[i];\n      }\n    }\n  }\n  t.length = minl;\n\n  return t;\n}\n\n// For medium-degree, i.e., D+1 <= d\nTree flutes_MD(int d, DTYPE xs[], DTYPE ys[], int s[], int acc) {\n  DTYPE x1[MAXD], x2[MAXD], y1[MAXD], y2[MAXD];\n  int si[MAXD], s1[MAXD], s2[MAXD];\n  float score[2 * MAXD], penalty[MAXD], pnlty, dx, dy;\n  DTYPE ll, minl, coord1, coord2;\n  int i, r, p, maxbp, bestbp = 0, bp, nbp, ub, lb, n1, n2, nn1 = 0, nn2 = 0,\n                      newacc;\n  Tree t, t1, t2, bestt1, bestt2;\n  int ms, mins, maxs, minsi, maxsi;\n  DTYPE distx[MAXD], disty[MAXD], xydiff;\n\n  if (s[0] < s[d - 1]) {\n    ms = max(s[0], s[1]);\n    for (i = 2; i <= ms; i++)\n      ms = max(ms, s[i]);\n    if (ms <= d - 3) {\n      for (i = 0; i <= ms; i++) {\n        x1[i] = xs[i];\n        y1[i] = ys[i];\n        s1[i] = s[i];\n      }\n      x1[ms + 1] = xs[ms];\n      y1[ms + 1] = ys[ms];\n      s1[ms + 1] = ms + 1;\n\n      s2[0] = 0;\n      for (i = 1; i <= d - 1 - ms; 
i++)\n        s2[i] = s[i + ms] - ms;\n\n      t1 = flutes_LMD(ms + 2, x1, y1, s1, acc);\n      t2 = flutes_LMD(d - ms, xs + ms, ys + ms, s2, acc);\n      t  = dmergetree(t1, t2);\n      free(t1.branch);\n      free(t2.branch);\n\n      return t;\n    }\n  } else { // (s[0] > s[d-1])\n    ms = min(s[0], s[1]);\n    for (i = 2; i <= d - 1 - ms; i++)\n      ms = min(ms, s[i]);\n    if (ms >= 2) {\n      x1[0] = xs[ms];\n      y1[0] = ys[0];\n      s1[0] = s[0] - ms + 1;\n      for (i = 1; i <= d - 1 - ms; i++) {\n        x1[i] = xs[i + ms - 1];\n        y1[i] = ys[i];\n        s1[i] = s[i] - ms + 1;\n      }\n      x1[d - ms] = xs[d - 1];\n      y1[d - ms] = ys[d - 1 - ms];\n      s1[d - ms] = 0;\n\n      s2[0] = ms;\n      for (i = 1; i <= ms; i++)\n        s2[i] = s[i + d - 1 - ms];\n\n      t1 = flutes_LMD(d + 1 - ms, x1, y1, s1, acc);\n      t2 = flutes_LMD(ms + 1, xs, ys + d - 1 - ms, s2, acc);\n      t  = dmergetree(t1, t2);\n      free(t1.branch);\n      free(t2.branch);\n\n      return t;\n    }\n  }\n\n  // Find inverse si[] of s[]\n  for (r = 0; r < d; r++)\n    si[s[r]] = r;\n\n  // Determine breaking directions and positions dp[]\n  lb = (d - 2 * acc + 2) / 4;\n  if (lb < 2)\n    lb = 2;\n  ub = d - 1 - lb;\n\n// Compute scores\n#define AA 0.6 // 2.0*BB\n#define BB 0.3\n  float CC = 7.4 / ((d + 10.) * (d - 3.));\n  float DD = 4.8 / (d - 1);\n\n  // Compute penalty[]\n  dx = CC * (xs[d - 2] - xs[1]);\n  dy = CC * (ys[d - 2] - ys[1]);\n  for (r = d / 2, pnlty = 0; r >= 2; r--, pnlty += dx)\n    penalty[r] = pnlty, penalty[d - 1 - r] = pnlty;\n  penalty[1] = pnlty, penalty[d - 2] = pnlty;\n  penalty[0] = pnlty, penalty[d - 1] = pnlty;\n  for (r = d / 2 - 1, pnlty = dy; r >= 2; r--, pnlty += dy)\n    penalty[s[r]] += pnlty, penalty[s[d - 1 - r]] += pnlty;\n  penalty[s[1]] += pnlty, penalty[s[d - 2]] += pnlty;\n  penalty[s[0]] += pnlty, penalty[s[d - 1]] += pnlty;\n  //#define CC 0.16\n  //#define v(r) ((r==0||r==1||r==d-2||r==d-1) ? 
d-3 : abs(d-1-r-r))\n  //    for (r=0; r<d; r++)\n  //        penalty[r] = v(r)*dx + v(si[r])*dy;\n\n  // Compute distx[], disty[]\n  xydiff = (xs[d - 1] - xs[0]) - (ys[d - 1] - ys[0]);\n  if (s[0] < s[1])\n    mins = s[0], maxs = s[1];\n  else\n    mins = s[1], maxs = s[0];\n  if (si[0] < si[1])\n    minsi = si[0], maxsi = si[1];\n  else\n    minsi = si[1], maxsi = si[0];\n  for (r = 2; r <= ub; r++) {\n    if (s[r] < mins)\n      mins = s[r];\n    else if (s[r] > maxs)\n      maxs = s[r];\n    distx[r] = xs[maxs] - xs[mins];\n    if (si[r] < minsi)\n      minsi = si[r];\n    else if (si[r] > maxsi)\n      maxsi = si[r];\n    disty[r] = ys[maxsi] - ys[minsi] + xydiff;\n  }\n\n  if (s[d - 2] < s[d - 1])\n    mins = s[d - 2], maxs = s[d - 1];\n  else\n    mins = s[d - 1], maxs = s[d - 2];\n  if (si[d - 2] < si[d - 1])\n    minsi = si[d - 2], maxsi = si[d - 1];\n  else\n    minsi = si[d - 1], maxsi = si[d - 2];\n  for (r = d - 3; r >= lb; r--) {\n    if (s[r] < mins)\n      mins = s[r];\n    else if (s[r] > maxs)\n      maxs = s[r];\n    distx[r] += xs[maxs] - xs[mins];\n    if (si[r] < minsi)\n      minsi = si[r];\n    else if (si[r] > maxsi)\n      maxsi = si[r];\n    disty[r] += ys[maxsi] - ys[minsi];\n  }\n\n  nbp = 0;\n  for (r = lb; r <= ub; r++) {\n    if (si[r] <= 1)\n      score[nbp] = (xs[r + 1] - xs[r - 1]) - penalty[r] - AA * (ys[2] - ys[1]) -\n                   DD * disty[r];\n    else if (si[r] >= d - 2)\n      score[nbp] = (xs[r + 1] - xs[r - 1]) - penalty[r] -\n                   AA * (ys[d - 2] - ys[d - 3]) - DD * disty[r];\n    else\n      score[nbp] = (xs[r + 1] - xs[r - 1]) - penalty[r] -\n                   BB * (ys[si[r] + 1] - ys[si[r] - 1]) - DD * disty[r];\n    nbp++;\n\n    if (s[r] <= 1)\n      score[nbp] = (ys[r + 1] - ys[r - 1]) - penalty[s[r]] -\n                   AA * (xs[2] - xs[1]) - DD * distx[r];\n    else if (s[r] >= d - 2)\n      score[nbp] = (ys[r + 1] - ys[r - 1]) - penalty[s[r]] -\n                   AA * (xs[d - 2] - xs[d - 
3]) - DD * distx[r];\n    else\n      score[nbp] = (ys[r + 1] - ys[r - 1]) - penalty[s[r]] -\n                   BB * (xs[s[r] + 1] - xs[s[r] - 1]) - DD * distx[r];\n    nbp++;\n  }\n\n  if (acc <= 3)\n    newacc = 1;\n  else {\n    newacc = acc / 2;\n    if (acc >= nbp)\n      acc = nbp - 1;\n  }\n\n  minl          = (DTYPE)INT_MAX;\n  bestt1.branch = bestt2.branch = NULL;\n  for (i = 0; i < acc; i++) {\n    maxbp = 0;\n    for (bp = 1; bp < nbp; bp++)\n      if (score[maxbp] < score[bp])\n        maxbp = bp;\n    score[maxbp] = -9e9;\n\n#define BreakPt(bp) ((bp) / 2 + lb)\n#define BreakInX(bp) ((bp) % 2 == 0)\n    p = BreakPt(maxbp);\n    // Breaking in p\n    if (BreakInX(maxbp)) { // break in x\n      n1 = n2 = 0;\n      for (r = 0; r < d; r++) {\n        if (s[r] < p) {\n          s1[n1] = s[r];\n          y1[n1] = ys[r];\n          n1++;\n        } else if (s[r] > p) {\n          s2[n2] = s[r] - p;\n          y2[n2] = ys[r];\n          n2++;\n        } else { // if (s[r] == p)  i.e.,  r = si[p]\n          s1[n1] = p;\n          s2[n2] = 0;\n          y1[n1] = y2[n2] = ys[r];\n          nn1             = n1;\n          nn2             = n2;\n          n1++;\n          n2++;\n        }\n      }\n\n      t1     = flutes_LMD(p + 1, xs, y1, s1, newacc);\n      t2     = flutes_LMD(d - p, xs + p, y2, s2, newacc);\n      ll     = t1.length + t2.length;\n      coord1 = t1.branch[t1.branch[nn1].n].y;\n      coord2 = t2.branch[t2.branch[nn2].n].y;\n      if (t2.branch[nn2].y > max(coord1, coord2))\n        ll -= t2.branch[nn2].y - max(coord1, coord2);\n      else if (t2.branch[nn2].y < min(coord1, coord2))\n        ll -= min(coord1, coord2) - t2.branch[nn2].y;\n    } else { // if (!BreakInX(maxbp))\n      n1 = n2 = 0;\n      for (r = 0; r < d; r++) {\n        if (si[r] < p) {\n          s1[si[r]] = n1;\n          x1[n1]    = xs[r];\n          n1++;\n        } else if (si[r] > p) {\n          s2[si[r] - p] = n2;\n          x2[n2]        = xs[r];\n          n2++;\n        
} else { // if (si[r] == p)  i.e.,  r = s[p]\n          s1[p]  = n1;\n          s2[0]  = n2;\n          x1[n1] = x2[n2] = xs[r];\n          n1++;\n          n2++;\n        }\n      }\n\n      t1     = flutes_LMD(p + 1, x1, ys, s1, newacc);\n      t2     = flutes_LMD(d - p, x2, ys + p, s2, newacc);\n      ll     = t1.length + t2.length;\n      coord1 = t1.branch[t1.branch[p].n].x;\n      coord2 = t2.branch[t2.branch[0].n].x;\n      if (t2.branch[0].x > max(coord1, coord2))\n        ll -= t2.branch[0].x - max(coord1, coord2);\n      else if (t2.branch[0].x < min(coord1, coord2))\n        ll -= min(coord1, coord2) - t2.branch[0].x;\n    }\n    if (minl > ll) {\n      minl = ll;\n      free(bestt1.branch);\n      free(bestt2.branch);\n      bestt1 = t1;\n      bestt2 = t2;\n      bestbp = maxbp;\n    } else {\n      free(t1.branch);\n      free(t2.branch);\n    }\n  }\n\n#if LOCAL_REFINEMENT == 1\n  if (BreakInX(bestbp)) {\n    t = hmergetree(bestt1, bestt2, s);\n    local_refinement(&t, si[BreakPt(bestbp)]);\n  } else {\n    t = vmergetree(bestt1, bestt2);\n    local_refinement(&t, BreakPt(bestbp));\n  }\n#else\n  if (BreakInX(bestbp)) {\n    t = hmergetree(bestt1, bestt2, s);\n  } else {\n    t = vmergetree(bestt1, bestt2);\n  }\n#endif\n\n  free(bestt1.branch);\n  free(bestt2.branch);\n\n  return t;\n}\n\nTree dmergetree(Tree t1, Tree t2) {\n  int i, d, prev, curr, next, offset1, offset2;\n  Tree t;\n\n  t.deg = d = t1.deg + t2.deg - 2;\n  t.length  = t1.length + t2.length;\n  t.branch  = (Branch*)malloc((2 * d - 2) * sizeof(Branch));\n  offset1   = t2.deg - 2;\n  offset2   = 2 * t1.deg - 4;\n\n  for (i = 0; i <= t1.deg - 2; i++) {\n    t.branch[i].x = t1.branch[i].x;\n    t.branch[i].y = t1.branch[i].y;\n    t.branch[i].n = t1.branch[i].n + offset1;\n  }\n  for (i = t1.deg - 1; i <= d - 1; i++) {\n    t.branch[i].x = t2.branch[i - t1.deg + 2].x;\n    t.branch[i].y = t2.branch[i - t1.deg + 2].y;\n    t.branch[i].n = t2.branch[i - t1.deg + 2].n + offset2;\n  }\n  for 
(i = d; i <= d + t1.deg - 3; i++) {\n    t.branch[i].x = t1.branch[i - offset1].x;\n    t.branch[i].y = t1.branch[i - offset1].y;\n    t.branch[i].n = t1.branch[i - offset1].n + offset1;\n  }\n  for (i = d + t1.deg - 2; i <= 2 * d - 3; i++) {\n    t.branch[i].x = t2.branch[i - offset2].x;\n    t.branch[i].y = t2.branch[i - offset2].y;\n    t.branch[i].n = t2.branch[i - offset2].n + offset2;\n  }\n\n  prev = t2.branch[0].n + offset2;\n  curr = t1.branch[t1.deg - 1].n + offset1;\n  next = t.branch[curr].n;\n  while (curr != next) {\n    t.branch[curr].n = prev;\n    prev             = curr;\n    curr             = next;\n    next             = t.branch[curr].n;\n  }\n  t.branch[curr].n = prev;\n\n  return t;\n}\n\nTree hmergetree(Tree t1, Tree t2, int s[]) {\n  int i, prev, curr, next, extra, offset1, offset2;\n  int p, ii = 0, n1, n2, nn1 = 0, nn2 = 0;\n  DTYPE coord1, coord2;\n  Tree t;\n\n  t.deg    = t1.deg + t2.deg - 1;\n  t.length = t1.length + t2.length;\n  t.branch = (Branch*)malloc((2 * t.deg - 2) * sizeof(Branch));\n  offset1  = t2.deg - 1;\n  offset2  = 2 * t1.deg - 3;\n\n  p  = t1.deg - 1;\n  n1 = n2 = 0;\n  for (i = 0; i < t.deg; i++) {\n    if (s[i] < p) {\n      t.branch[i].x = t1.branch[n1].x;\n      t.branch[i].y = t1.branch[n1].y;\n      t.branch[i].n = t1.branch[n1].n + offset1;\n      n1++;\n    } else if (s[i] > p) {\n      t.branch[i].x = t2.branch[n2].x;\n      t.branch[i].y = t2.branch[n2].y;\n      t.branch[i].n = t2.branch[n2].n + offset2;\n      n2++;\n    } else {\n      t.branch[i].x = t2.branch[n2].x;\n      t.branch[i].y = t2.branch[n2].y;\n      t.branch[i].n = t2.branch[n2].n + offset2;\n      nn1           = n1;\n      nn2           = n2;\n      ii            = i;\n      n1++;\n      n2++;\n    }\n  }\n  for (i = t.deg; i <= t.deg + t1.deg - 3; i++) {\n    t.branch[i].x = t1.branch[i - offset1].x;\n    t.branch[i].y = t1.branch[i - offset1].y;\n    t.branch[i].n = t1.branch[i - offset1].n + offset1;\n  }\n  for (i = t.deg + t1.deg - 
2; i <= 2 * t.deg - 4; i++) {\n    t.branch[i].x = t2.branch[i - offset2].x;\n    t.branch[i].y = t2.branch[i - offset2].y;\n    t.branch[i].n = t2.branch[i - offset2].n + offset2;\n  }\n  extra  = 2 * t.deg - 3;\n  coord1 = t1.branch[t1.branch[nn1].n].y;\n  coord2 = t2.branch[t2.branch[nn2].n].y;\n  if (t2.branch[nn2].y > max(coord1, coord2)) {\n    t.branch[extra].y = max(coord1, coord2);\n    t.length -= t2.branch[nn2].y - t.branch[extra].y;\n  } else if (t2.branch[nn2].y < min(coord1, coord2)) {\n    t.branch[extra].y = min(coord1, coord2);\n    t.length -= t.branch[extra].y - t2.branch[nn2].y;\n  } else\n    t.branch[extra].y = t2.branch[nn2].y;\n  t.branch[extra].x = t2.branch[nn2].x;\n  t.branch[extra].n = t.branch[ii].n;\n  t.branch[ii].n    = extra;\n\n  prev = extra;\n  curr = t1.branch[nn1].n + offset1;\n  next = t.branch[curr].n;\n  while (curr != next) {\n    t.branch[curr].n = prev;\n    prev             = curr;\n    curr             = next;\n    next             = t.branch[curr].n;\n  }\n  t.branch[curr].n = prev;\n\n  return t;\n}\n\nTree vmergetree(Tree t1, Tree t2) {\n  int i, prev, curr, next, extra, offset1, offset2;\n  DTYPE coord1, coord2;\n  Tree t;\n\n  t.deg    = t1.deg + t2.deg - 1;\n  t.length = t1.length + t2.length;\n  t.branch = (Branch*)malloc((2 * t.deg - 2) * sizeof(Branch));\n  offset1  = t2.deg - 1;\n  offset2  = 2 * t1.deg - 3;\n\n  for (i = 0; i <= t1.deg - 2; i++) {\n    t.branch[i].x = t1.branch[i].x;\n    t.branch[i].y = t1.branch[i].y;\n    t.branch[i].n = t1.branch[i].n + offset1;\n  }\n  for (i = t1.deg - 1; i <= t.deg - 1; i++) {\n    t.branch[i].x = t2.branch[i - t1.deg + 1].x;\n    t.branch[i].y = t2.branch[i - t1.deg + 1].y;\n    t.branch[i].n = t2.branch[i - t1.deg + 1].n + offset2;\n  }\n  for (i = t.deg; i <= t.deg + t1.deg - 3; i++) {\n    t.branch[i].x = t1.branch[i - offset1].x;\n    t.branch[i].y = t1.branch[i - offset1].y;\n    t.branch[i].n = t1.branch[i - offset1].n + offset1;\n  }\n  for (i = t.deg + t1.deg 
- 2; i <= 2 * t.deg - 4; i++) {\n    t.branch[i].x = t2.branch[i - offset2].x;\n    t.branch[i].y = t2.branch[i - offset2].y;\n    t.branch[i].n = t2.branch[i - offset2].n + offset2;\n  }\n  extra  = 2 * t.deg - 3;\n  coord1 = t1.branch[t1.branch[t1.deg - 1].n].x;\n  coord2 = t2.branch[t2.branch[0].n].x;\n  if (t2.branch[0].x > max(coord1, coord2)) {\n    t.branch[extra].x = max(coord1, coord2);\n    t.length -= t2.branch[0].x - t.branch[extra].x;\n  } else if (t2.branch[0].x < min(coord1, coord2)) {\n    t.branch[extra].x = min(coord1, coord2);\n    t.length -= t.branch[extra].x - t2.branch[0].x;\n  } else\n    t.branch[extra].x = t2.branch[0].x;\n  t.branch[extra].y      = t2.branch[0].y;\n  t.branch[extra].n      = t.branch[t1.deg - 1].n;\n  t.branch[t1.deg - 1].n = extra;\n\n  prev = extra;\n  curr = t1.branch[t1.deg - 1].n + offset1;\n  next = t.branch[curr].n;\n  while (curr != next) {\n    t.branch[curr].n = prev;\n    prev             = curr;\n    curr             = next;\n    next             = t.branch[curr].n;\n  }\n  t.branch[curr].n = prev;\n\n  return t;\n}\n\nvoid local_refinement(Tree* tp, int p) {\n  int d, dd, i, ii, j, prev, curr, next, root;\n  int SteinerPin[2 * MAXD], index[2 * MAXD];\n  DTYPE x[MAXD], xs[D], ys[D];\n  int ss[D];\n  Tree tt;\n\n  d    = tp->deg;\n  root = tp->branch[p].n;\n\n  // Reverse edges to point to root\n  prev = root;\n  curr = tp->branch[prev].n;\n  next = tp->branch[curr].n;\n  while (curr != next) {\n    tp->branch[curr].n = prev;\n    prev               = curr;\n    curr               = next;\n    next               = tp->branch[curr].n;\n  }\n  tp->branch[curr].n = prev;\n  tp->branch[root].n = root;\n\n  // Find Steiner nodes that are at pins\n  for (i = d; i <= 2 * d - 3; i++)\n    SteinerPin[i] = -1;\n  for (i = 0; i < d; i++) {\n    next = tp->branch[i].n;\n    if (tp->branch[i].x == tp->branch[next].x &&\n        tp->branch[i].y == tp->branch[next].y)\n      SteinerPin[next] = i; // Steiner 'next' at Pin 
'i'\n  }\n  SteinerPin[root] = p;\n\n  // Find pins that are directly connected to root\n  dd = 0;\n  for (i = 0; i < d; i++) {\n    curr = tp->branch[i].n;\n    if (SteinerPin[curr] == i)\n      curr = tp->branch[curr].n;\n    while (SteinerPin[curr] < 0)\n      curr = tp->branch[curr].n;\n    if (curr == root) {\n      x[dd] = tp->branch[i].x;\n      if (SteinerPin[tp->branch[i].n] == i && tp->branch[i].n != root)\n        index[dd++] = tp->branch[i].n; // Steiner node\n      else\n        index[dd++] = i; // Pin\n    }\n  }\n\n  if (4 <= dd && dd <= D) {\n    // Find Steiner nodes that are directly connected to root\n    ii = dd;\n    for (i = 0; i < dd; i++) {\n      curr = tp->branch[index[i]].n;\n      while (SteinerPin[curr] < 0) {\n        index[ii++]      = curr;\n        SteinerPin[curr] = INT_MAX;\n        curr             = tp->branch[curr].n;\n      }\n    }\n    index[ii] = root;\n\n    for (ii = 0; ii < dd; ii++) {\n      ss[ii] = 0;\n      for (j = 0; j < ii; j++)\n        if (x[j] < x[ii])\n          ss[ii]++;\n      for (j = ii + 1; j < dd; j++)\n        if (x[j] <= x[ii])\n          ss[ii]++;\n      xs[ss[ii]] = x[ii];\n      ys[ii]     = tp->branch[index[ii]].y;\n    }\n\n    tt = flutes_LD(dd, xs, ys, ss);\n\n    // Find new wirelength\n    tp->length += tt.length;\n    for (ii = 0; ii < 2 * dd - 3; ii++) {\n      i = index[ii];\n      j = tp->branch[i].n;\n      tp->length -= ADIFF(tp->branch[i].x, tp->branch[j].x) +\n                    ADIFF(tp->branch[i].y, tp->branch[j].y);\n    }\n\n    // Copy tt into t\n    for (ii = 0; ii < dd; ii++) {\n      tp->branch[index[ii]].n = index[tt.branch[ii].n];\n    }\n    for (; ii <= 2 * dd - 3; ii++) {\n      tp->branch[index[ii]].x = tt.branch[ii].x;\n      tp->branch[index[ii]].y = tt.branch[ii].y;\n      tp->branch[index[ii]].n = index[tt.branch[ii].n];\n    }\n    free(tt.branch);\n  }\n\n  return;\n}\n\nDTYPE wirelength(Tree t) {\n  int i, j;\n  DTYPE l = 0;\n\n  for (i = 0; i < 2 * t.deg - 2; 
i++) {\n    j = t.branch[i].n;\n    l += ADIFF(t.branch[i].x, t.branch[j].x) +\n         ADIFF(t.branch[i].y, t.branch[j].y);\n  }\n\n  return l;\n}\n\nvoid printtree(Tree t) {\n  int i;\n\n  for (i = 0; i < t.deg; i++)\n    printf(\" %-2d:  x=%4g  y=%4g  e=%d\\n\", i, (float)t.branch[i].x,\n           (float)t.branch[i].y, t.branch[i].n);\n  for (i = t.deg; i < 2 * t.deg - 2; i++)\n    printf(\"s%-2d:  x=%4g  y=%4g  e=%d\\n\", i, (float)t.branch[i].x,\n           (float)t.branch[i].y, t.branch[i].n);\n  printf(\"\\n\");\n}\n\n// Output in a format that can be plotted by gnuplot\nvoid plottree(Tree t) {\n  int i;\n\n  for (i = 0; i < 2 * t.deg - 2; i++) {\n    printf(\"%d %d\\n\", t.branch[i].x, t.branch[i].y);\n    printf(\"%d %d\\n\\n\", t.branch[t.branch[i].n].x, t.branch[t.branch[i].n].y);\n  }\n}\n#endif /* _FLUTE_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/flute_mst.h",
    "content": "#ifndef _FLUTE_MST_H_\n#define _FLUTE_MST_H_\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <limits.h>\n#include <math.h>\n#include <string.h>\n#include <assert.h>\n#include \"dl.h\"\n#include \"flute.h\"\n\n#include \"mst2.h\"\n\n#define INFNTY INT_MAX\n\n#define D2M D2(1) // Max net degree that flute_mr will handle\n#define MR_FOR_SMALL_CASES_ONLY 1\n#if MR_FOR_SMALL_CASES_ONLY\n#define MAXPART D2M // max partition of an MST\n#define MAXT (D2M / D * 2)\n#else\n#define MAXPART (d / 9 * 2) //(MAXD/THD*2) // max partition of an MST\n#define MAXPART2 ((t1.deg + t2.deg) / 9 * 2)\n#define MAXT (d / 5)\n#endif\n\nint D3 = INFNTY;\n\nint FIRST_ROUND         = 2; // note that num of total rounds = 1+FIRST_ROUND\nint EARLY_QUIT_CRITERIA = 1;\n\n#define DEFAULT_QSIZE (3 + min(d, 1000))\n\n#define USE_HASHING 1\n#if USE_HASHING\n#define new_ht 1\n// int new_ht=1;\ndl_t ht[D2M + 1]; // hash table of subtrees indexed by degree\n#endif\n\nunsigned int curr_mark = 0;\n\nTree wmergetree(Tree t1, Tree t2, int* order1, int* order2, DTYPE cx, DTYPE cy,\n                int acc);\nTree xmergetree(Tree t1, Tree t2, int* order1, int* order2, DTYPE cx, DTYPE cy);\nvoid color_tree(Tree t, int* color);\nint longest_path_edge(int i, int j, int* e, int* p, int* es);\nvoid preprocess_edges(int num_edges, int* edges, DTYPE* len, int* e, int* p,\n                      int* es);\n\n#define init_queue(q)                                                          \\\n  { q[1] = 2; }\ninline void enqueue(int** q, int e) {\n  int _qsize;\n  if ((*q)[0] == (*q)[1]) {\n    _qsize  = 2 * ((*q)[0] + 1);\n    (*q)    = (int*)realloc((*q), _qsize * sizeof(int));\n    (*q)[0] = _qsize;\n  }\n  (*q)[(*q)[1]++] = e;\n}\n\nDTYPE** hdist;\n\nint heap_size = 0;\n\nint in_heap_order(int e1, int e2) {\n  if (hdist[heap[e1].node1][heap[e1].node2] <\n      hdist[heap[e2].node1][heap[e2].node2]) {\n    return 1;\n  } else {\n    return 0;\n  }\n}\n\nvoid sift_up(int i) {\n  node_pair tmp;\n  int 
j;\n\n  for (j = i / 2; j >= 1 && in_heap_order(i, j); i = j, j /= 2) {\n    tmp     = heap[j];\n    heap[j] = heap[i];\n    heap[i] = tmp;\n  }\n}\n\nvoid sift_down(int i) {\n  int left, right, j;\n  node_pair tmp;\n\n  left  = i * 2;\n  right = left + 1;\n\n  while (left <= heap_size) {\n    if (left == heap_size || in_heap_order(left, right)) {\n      j = left;\n    } else {\n      j = right;\n    }\n    if (in_heap_order(j, i)) {\n      tmp     = heap[j];\n      heap[j] = heap[i];\n      heap[i] = tmp;\n      i       = j;\n      left    = i * 2;\n      right   = left + 1;\n    } else {\n      break;\n    }\n  }\n}\n\nvoid insert_heap(node_pair* np) {\n  if (heap_size >= max_heap_size) {\n    max_heap_size *= 2;\n    heap = (node_pair*)realloc(heap, sizeof(node_pair) * (max_heap_size + 1));\n  }\n  heap[++heap_size] = *np;\n  sift_up(heap_size);\n}\n\nvoid extract_heap(node_pair* np) {\n  // caller has to make sure heap is not empty\n  *np     = heap[1];\n  heap[1] = heap[heap_size--];\n  sift_down(1);\n}\n\nTree reftree; // reference for qsort\nint cmp_branch(const void* a, const void* b) {\n  int n;\n  DTYPE x1, x2, x3;\n\n  x1 = reftree.branch[*(int*)a].x;\n  n  = reftree.branch[*(int*)a].n;\n  x3 = reftree.branch[n].x;\n  if (x3 < x1)\n    x1 = x3;\n\n  x2 = reftree.branch[*(int*)b].x;\n  n  = reftree.branch[*(int*)b].n;\n  x3 = reftree.branch[n].x;\n  if (x3 < x2)\n    x2 = x3;\n\n  return (x1 <= x2) ? 
-1 : 1;\n}\n\nvoid update_dist2(Tree t, DTYPE** dist, DTYPE longest, int* host,\n                  int* min_node1, int* min_node2, int** nb) {\n  int i, j, m, n, dd, node1, node2, node3, node4, p1, p2, pi, pn;\n  DTYPE min_dist, smallest;\n  DTYPE x1, x2, x3, x4, y1, y2, y3, y4;\n  DTYPE threshold_x, threshold_y;\n  DTYPE md = dist[*min_node1][*min_node2];\n\n#if MR_FOR_SMALL_CASES_ONLY\n  int isPin_base[D2M], *isPin, id[2 * D2M];\n  int u, v, b[D2M * 2];\n#else\n  int *isPin_base, *isPin, *id;\n  int u, v, *b;\n\n  isPin_base = (int*)malloc(sizeof(int) * t.deg);\n  id         = (int*)malloc(sizeof(int) * t.deg * 2);\n  b          = (int*)malloc(sizeof(int) * t.deg * 2);\n#endif\n\n  isPin = &(isPin_base[0]) - t.deg;\n  dd    = t.deg * 2 - 2;\n\n  for (i = 0; i < t.deg; i++) {\n    isPin_base[i] = -1;\n  }\n\n  for (i = t.deg; i < dd; i++) {\n    host[i] = -1;\n  }\n\n  for (i = 0; i < t.deg; i++) {\n    n = t.branch[i].n;\n    if (isPin[n] < 0 && t.branch[n].x == t.branch[i].x &&\n        t.branch[n].y == t.branch[i].y) {\n      isPin[n] = i; // this steiner node coincides with a pin\n    }\n    host[i] = i;\n\n    if (isPin[n] == i) {\n      host[n] = host[i];\n    }\n  }\n\n  for (i = 0; i < dd; i++) {\n    id[i] = i;\n  }\n\n  for (i = 0; i < dd; i++) {\n    n = t.branch[i].n;\n    if (isPin[n] >= 0 || n == i) {\n      continue;\n    }\n\n    if (id[i] < id[n]) {\n      id[n] = id[i];\n    } else {\n      id[i] = id[n];\n    }\n  }\n\n  for (i = 0; i < dd; i++) {\n    while (id[i] != id[id[i]]) {\n      id[i] = id[id[i]];\n    }\n  }\n\n  x1 = y1 = INFNTY;\n  x2 = y2 = 0;\n  for (i = 0; i < t.deg; i++) {\n    x1 = min(x1, t.branch[i].x);\n    y1 = min(y1, t.branch[i].y);\n    x2 = max(x2, t.branch[i].x);\n    y2 = max(y2, t.branch[i].y);\n  }\n\n  threshold_x = (x2 - x1) / 4;\n  threshold_y = (y2 - y1) / 4;\n  threshold_x = min(threshold_x, longest);\n  threshold_y = min(threshold_y, longest);\n\n  for (i = 0; i < dd; i++) {\n    b[i] = i;\n  }\n  reftree = 
t;\n  qsort(b, dd, sizeof(int), cmp_branch);\n\n  for (u = 0; u < dd; u++) {\n    i     = b[u];\n    n     = t.branch[i].n;\n    node1 = host[i];\n    node2 = host[n];\n    if (node1 < 0 && node2 < 0) {\n      continue;\n    }\n    if (t.branch[i].x <= t.branch[n].x) {\n      x3 = t.branch[i].x;\n      x4 = t.branch[n].x;\n    } else {\n      x3 = t.branch[n].x;\n      x4 = t.branch[i].x;\n    }\n    if (t.branch[i].y <= t.branch[n].y) {\n      y3 = t.branch[i].y;\n      y4 = t.branch[n].y;\n    } else {\n      y3 = t.branch[n].y;\n      y4 = t.branch[i].y;\n    }\n\n    for (v = u + 1; v < dd; v++) {\n      j = b[v];\n      if (id[i] == id[j]) { // in the same connecting subtree\n        continue;\n      }\n      if (ADIFF(t.branch[i].x, t.branch[j].x) > threshold_x ||\n          ADIFF(t.branch[i].y, t.branch[j].y) > threshold_y)\n        continue;\n      m     = t.branch[j].n;\n      node3 = host[j];\n      node4 = host[m];\n      if (node3 < 0 && node4 < 0) {\n        continue;\n      }\n\n      if (t.branch[j].x <= t.branch[m].x) {\n        x1 = t.branch[j].x;\n        x2 = t.branch[m].x;\n      } else {\n        x1 = t.branch[m].x;\n        x2 = t.branch[j].x;\n      }\n      if (t.branch[j].y <= t.branch[m].y) {\n        y1 = t.branch[j].y;\n        y2 = t.branch[m].y;\n      } else {\n        y1 = t.branch[m].y;\n        y2 = t.branch[j].y;\n      }\n\n      if (x2 < x3) {\n        min_dist = x3 - x2;\n      } else if (x4 < x1) {\n        min_dist = x1 - x4;\n      } else {\n        min_dist = 0;\n      }\n\n      if (min_dist >= threshold_x) {\n        break;\n      }\n\n      if (y2 < y3) {\n        min_dist += y3 - y2;\n      } else if (y4 < y1) {\n        min_dist += y1 - y4;\n      }\n\n      if (min_dist >= longest) {\n        continue;\n      }\n\n      p1 = (node1 < 0) ? node2 : ((node2 < 0) ? node1 : -1);\n      p2 = (node3 < 0) ? node4 : ((node4 < 0) ? 
node3 : -1);\n\n      if (p1 >= 0 && p2 < 0) {\n        dist[p1][node3] = ADIFF(t.branch[p1].x, t.branch[node3].x) +\n                          ADIFF(t.branch[p1].y, t.branch[node3].y);\n        dist[p1][node4] = ADIFF(t.branch[p1].x, t.branch[node4].x) +\n                          ADIFF(t.branch[p1].y, t.branch[node4].y);\n        p2 = (dist[p1][node3] <= dist[p1][node4]) ? node3 : node4;\n      } else if (p1 < 0 && p2 >= 0) {\n        dist[node1][p2] = ADIFF(t.branch[node1].x, t.branch[p2].x) +\n                          ADIFF(t.branch[node1].y, t.branch[p2].y);\n        dist[node2][p2] = ADIFF(t.branch[node2].x, t.branch[p2].x) +\n                          ADIFF(t.branch[node2].y, t.branch[p2].y);\n        p1 = (dist[node1][p2] <= dist[node2][p2]) ? node1 : node2;\n      } else if (p1 < 0 && p2 < 0) {\n        // all 4 nodes are real, pick the closest pair\n\n        dist[node1][node3] = ADIFF(t.branch[node1].x, t.branch[node3].x) +\n                             ADIFF(t.branch[node1].y, t.branch[node3].y);\n        dist[node1][node4] = ADIFF(t.branch[node1].x, t.branch[node4].x) +\n                             ADIFF(t.branch[node1].y, t.branch[node4].y);\n        dist[node2][node3] = ADIFF(t.branch[node2].x, t.branch[node3].x) +\n                             ADIFF(t.branch[node2].y, t.branch[node3].y);\n        dist[node2][node4] = ADIFF(t.branch[node2].x, t.branch[node4].x) +\n                             ADIFF(t.branch[node2].y, t.branch[node4].y);\n\n        p1       = node1;\n        p2       = node3;\n        smallest = dist[p1][p2];\n\n        if (dist[node2][node3] < smallest) {\n          p1       = node2;\n          smallest = dist[p1][p2];\n        }\n        if (dist[node1][node4] < smallest) {\n          p1       = node1;\n          p2       = node4;\n          smallest = dist[p1][p2];\n        }\n        if (dist[node2][node4] < smallest) {\n          p1       = node2;\n          p2       = node4;\n          smallest = dist[p1][p2];\n        }\n     
 } else {\n        dist[p1][p2] = ADIFF(t.branch[p1].x, t.branch[p2].x) +\n                       ADIFF(t.branch[p1].y, t.branch[p2].y);\n      }\n\n      if (min_dist < dist[p1][p2]) {\n        dist[p1][p2] = dist[p2][p1] = min_dist;\n        enqueue(&nb[p1], p2);\n        enqueue(&nb[p2], p1);\n\n        if (min_dist < md) {\n          md         = min_dist;\n          *min_node1 = p1;\n          *min_node2 = p2;\n        }\n      }\n    }\n  }\n\n#if !(MR_FOR_SMALL_CASES_ONLY)\n  free(isPin_base);\n  free(id);\n  free(b);\n#endif\n}\n\nvoid mst_from_heap(int d, DTYPE** dist, int node1, int node2, int** nb,\n                   int* edges, int* tree_id) {\n  int i, j, itr, idx;\n  node_pair e;\n\n  hdist     = dist;\n  heap_size = 0;\n\n  for (i = 0; i < d; i++) {\n    tree_id[i] = 0;\n  }\n\n  tree_id[node1] = 1;\n  tree_id[node2] = 1;\n  e.node1 = edges[0] = node1;\n  e.node2 = edges[1] = node2;\n  idx                = 2;\n\n  insert_heap(&e);\n\n  for (j = nb[node1][1] - 1; j > 1; j--) {\n    i = nb[node1][j];\n    if (tree_id[i])\n      continue;\n    {\n      e.node2 = i;\n      insert_heap(&e);\n    }\n  }\n  for (itr = d - 2; itr >= 1; itr--) {\n    e.node1 = node2;\n\n    for (j = nb[node2][1] - 1; j > 1; j--) {\n      i = nb[node2][j];\n      if (tree_id[i])\n        continue;\n      {\n        e.node2 = i;\n        insert_heap(&e);\n      }\n    }\n\n    do {\n      // assert(heap_size>0);\n      // extract_heap(&e);\n      e = heap[1];\n      while (tree_id[heap[heap_size].node2]) {\n        heap_size--;\n      }\n      heap[1] = heap[heap_size--];\n      sift_down(1);\n      node2 = e.node2;\n    } while (tree_id[node2]);\n    node1          = e.node1;\n    tree_id[node2] = tree_id[node1];\n    edges[idx++]   = node1;\n    edges[idx++]   = node2;\n  }\n  // assert(idx==2*d-2);\n}\n\nvoid build_rmst(long d, DTYPE* x, DTYPE* y, int* edges, int* inMST) {\n  int i, j, idx, n;\n  int* map     = (int*)calloc(d, sizeof(int));\n  Point* pt    = 
(Point*)calloc(4 * d, sizeof(Point));\n  long* parent = (long*)calloc(4 * d, sizeof(long));\n  long* par    = (long*)calloc(4 * d, sizeof(long));\n  int* size    = (int*)calloc(d, sizeof(int));\n\n  for (i = 0; i < d; i++) {\n    pt[i].x   = x[i];\n    pt[i].y   = y[i];\n    parent[i] = par[i] = -1;\n    size[i]            = 1;\n    inMST[i]           = 0;\n  }\n\n  map[0] = 0;\n  for (i = n = 1; i < d; i++) {\n    if (x[i] != x[i - 1] || y[i] != y[i - 1]) {\n      pt[n].x = pt[i].x;\n      pt[n].y = pt[i].y;\n      map[n]  = i;\n      n++;\n    } else {\n      parent[i] = i - 1;\n    }\n  }\n\n  // mst2( d, pt, parent );\n  mst2_package_init(n);\n\n  mst2(n, pt, par);\n\n  mst2_package_done();\n\n  /* special treatment for duplicated points not filtered in previous loop */\n  for (i = 1; i < n; i++) {\n    if (par[i] < 0) {\n      for (j = n - 1; j >= 0; j--) {\n        if (pt[j].x == pt[i].x && pt[j].y == pt[i].y && par[j] >= 0) {\n          par[i] = j;\n          break;\n        }\n      }\n    }\n  }\n\n  for (i = 0; i < n; i++) {\n    parent[map[i]] = map[par[i]];\n  }\n\n  for (i = 0; i < d; i++) {\n    // assert(parent[i]>=0);\n    size[parent[i]]++;\n  }\n\n  idx = 2 * d - 3;\n  for (i = 0; i < d; i++) {\n    if (inMST[i])\n      continue;\n    j = i;\n    while (size[j] <= 1 && idx > 0) {\n      // assert(!inMST[j]);\n      inMST[j]     = 1;\n      edges[idx--] = j;\n      edges[idx--] = j = parent[j];\n      size[j]--;\n    }\n  }\n  // assert(idx==-1);\n\n  inMST[edges[0]] = 1;\n\n  free(pt);\n  free(map);\n  free(parent);\n  free(par);\n  free(size);\n}\n\n/* cached version */\nTree flutes_c(int d, DTYPE* xs, DTYPE* ys, int* s, int acc) {\n  int i;\n  // int orig_ht_flag;\n  Tree t, tdup;\n\n#if USE_HASHING\n  dl_forall(Tree, ht[d], t) {\n    for (i = 0; i < d; i++) {\n      if (t.branch[i].y != ys[i] || t.branch[i].x != xs[s[i]]) {\n        break;\n      }\n    }\n    if (i >= d) { // found a match\n      tdup        = t;\n      tdup.branch = 
(Branch*)malloc(sizeof(Branch) * (2 * d - 2));\n      for (i = 2 * d - 3; i >= 0; i--) {\n        tdup.branch[i] = t.branch[i];\n      }\n      return tdup;\n    }\n  }\n  dl_endfor;\n\n  // orig_ht_flag = new_ht;\n  // new_ht = 0;\n#endif\n\n  t = flutes_LMD(d, xs, ys, s, acc);\n\n#if USE_HASHING\n  // new_ht = orig_ht_flag;\n\n  tdup        = t;\n  tdup.branch = (Branch*)malloc(sizeof(Branch) * (2 * d - 2));\n  for (i = 2 * d - 3; i >= 0; i--) {\n    tdup.branch[i] = t.branch[i];\n  }\n  dl_prepend(Tree, ht[d], tdup);\n#endif\n\n  return t;\n}\n\nTree flute_mr(int d, DTYPE* xs, DTYPE* ys, int* s, int acc, int rounds,\n              DTYPE** dist, DTYPE* threshold_x, DTYPE* threshold_y,\n              DTYPE* threshold, int* best_round, int* min_node1, int* min_node2,\n              int** nb) {\n  int i, j, k, m, n, itr, node1, node2;\n  DTYPE min_dist, longest;\n  DTYPE dist1, dist2;\n  Tree t, best_t, *subtree, ttmp;\n  DTYPE min_x, max_x;\n\n#if MR_FOR_SMALL_CASES_ONLY\n  int num_subtree, subroot[MAXPART], suproot[MAXPART], isSuperRoot[D2M];\n  int tree_id[D2M], tid, tree_size[D2M], edges[2 * D2M];\n  int idx[MAXPART], offset[MAXPART], *order[MAXT], order_base[MAXT * D2M];\n  DTYPE x[D2M + MAXPART], y[D2M + MAXPART];\n  int new_s[D2M + MAXPART], si[D2M], xmap[D2M + MAXPART];\n#else\n  int num_subtree, *subroot, *suproot, *isSuperRoot;\n  int *tree_id, tid, *tree_size, *edges;\n  int *idx, *offset, **order, *order_base;\n  DTYPE *x, *y;\n  int *new_s, *si, *xmap;\n\n  subroot = (int*)malloc(sizeof(int) * MAXPART);\n  suproot = (int*)malloc(sizeof(int) * MAXPART);\n  idx     = (int*)malloc(sizeof(int) * MAXPART);\n  offset  = (int*)malloc(sizeof(int) * MAXPART);\n  order   = (int**)malloc(sizeof(int*) * MAXT);\n\n  isSuperRoot = (int*)malloc(sizeof(int) * d);\n  tree_id     = (int*)malloc(sizeof(int) * d);\n  tree_size   = (int*)malloc(sizeof(int) * d);\n  edges       = (int*)malloc(sizeof(int) * d * 2);\n  order_base  = (int*)malloc(sizeof(int) * d * MAXT);\n  
new_s       = (int*)malloc(sizeof(int) * (d + MAXPART));\n  si          = (int*)malloc(sizeof(int) * d);\n  xmap        = (int*)malloc(sizeof(int) * (d + MAXPART));\n  x           = (DTYPE*)malloc(sizeof(DTYPE) * (d + MAXPART));\n  y           = (DTYPE*)malloc(sizeof(DTYPE) * (d + MAXPART));\n#endif\n\n#if USE_HASHING\n  if (new_ht) {\n    for (i = 0; i <= d; i++) {\n      ht[i] = dl_alloc();\n    }\n  }\n#endif\n\n  best_t.branch = NULL;\n  best_t.length = INFNTY;\n\n  for (i = 0; i < MAXT; i++) {\n    // order[i] = &(order_base[i*D2(acc)]);\n    order[i] = &(order_base[i * d]);\n  }\n\n  while (rounds >= 0) {\n    for (i = 0; i < d; i++) {\n      x[i]           = xs[s[i]];\n      y[i]           = ys[i];\n      isSuperRoot[i] = 0;\n    }\n\n    if (rounds == FIRST_ROUND) {\n      build_rmst((long)d, x, y, edges, tree_id);\n      for (i = 0; i < d; i++)\n        dist[i][i] = 0;\n    } else {\n      mst_from_heap(d, dist, *min_node1, *min_node2, nb, edges, tree_id);\n    }\n\n    if (rounds != 0) {\n      longest = 0;\n      for (i = 0; i < d; i++) {\n        init_queue(nb[i]);\n      }\n    }\n\n    for (i = 0; i < 2 * d - 2;) {\n      node1 = edges[i++];\n      node2 = edges[i++];\n      if (rounds != 0) {\n        enqueue(&nb[node1], node2);\n        enqueue(&nb[node2], node1);\n        dist[node1][node2] = dist[node2][node1] =\n            ADIFF(x[node1], x[node2]) + ADIFF(y[node1], y[node2]);\n        if (longest < dist[node1][node2]) {\n          longest = dist[node1][node2];\n        }\n      }\n    }\n\n    for (i = 0; i < d; i++) {\n      tree_size[i] = 1; // the node itself\n    }\n\n    suproot[0] = subroot[0] = edges[0];\n    num_subtree             = 1;\n\n    for (i = 2 * d - 3; i >= 0;) {\n      node2 = edges[i--];\n      node1 = edges[i--];\n      j     = tree_size[node1] + tree_size[node2];\n      // Chris\n      if (j >= TAU(acc)) {\n        isSuperRoot[node1]     = 1;\n        suproot[num_subtree]   = node1;\n        subroot[num_subtree++] = 
node2;\n      } else {\n        tree_size[node1] = j;\n      }\n    }\n\n    // assert(num_subtree<=MAXT);\n\n    for (i = 1; i < num_subtree; i++) {\n      tree_id[subroot[i]] = i + 1;\n      tree_size[subroot[i]] += 1; // to account for the link to parent tree\n    }\n\n    for (i = 0; i < 2 * d - 2;) {\n      node1 = edges[i++];\n      node2 = edges[i++];\n      if (tree_id[node2] == 1) { // non-root node\n        tree_id[node2] = tree_id[node1];\n      }\n    }\n\n    // Find inverse si[] of s[]\n    for (i = 0; i < d; i++)\n      si[s[i]] = i;\n\n    offset[1] = 0;\n    for (i = 1; i < num_subtree; i++) {\n      offset[i + 1] = offset[i] + tree_size[subroot[i - 1]];\n    }\n    // assert(offset[num_subtree]==d+num_subtree-1-tree_size[subroot[num_subtree-1]]);\n\n    for (i = 0; i <= num_subtree; i++)\n      idx[i] = 0;\n\n    for (i = 0; i < d; i++) {\n      tid                = tree_id[si[i]];\n      j                  = idx[tid]++;\n      x[offset[tid] + j] = xs[i];\n      xmap[i]            = j;\n\n      if (isSuperRoot[si[i]]) {\n        for (k = 1; k < num_subtree; k++) {\n          if (suproot[k] == si[i]) {\n            tid                = k + 1;\n            j                  = idx[tid]++;\n            x[offset[tid] + j] = xs[i];\n            xmap[d - 1 + tid]  = j;\n          }\n        }\n      }\n    }\n\n    for (i = 0; i <= num_subtree; i++)\n      idx[i] = 0;\n\n    for (i = 0; i < d; i++) {\n      tid                    = tree_id[i];\n      j                      = idx[tid]++;\n      y[offset[tid] + j]     = ys[i];\n      new_s[offset[tid] + j] = xmap[s[i]];\n      order[tid - 1][j]      = i;\n\n      if (isSuperRoot[i]) {\n        for (k = 1; k < num_subtree; k++) {\n          if (suproot[k] == i) {\n            tid                    = k + 1;\n            j                      = idx[tid]++;\n            y[offset[tid] + j]     = ys[i];\n            new_s[offset[tid] + j] = xmap[d - 1 + tid];\n            order[tid - 1][j]      = i;\n         
 }\n        }\n      }\n    }\n\n    subtree = (Tree*)malloc(num_subtree * sizeof(Tree));\n    for (i = 1; i <= num_subtree; i++) {\n      if (tree_size[subroot[i - 1]] <= 1) {\n        subtree[i - 1].deg = 0;\n        continue;\n      }\n\n      t = flutes_c(tree_size[subroot[i - 1]], x + offset[i], y + offset[i],\n                   new_s + offset[i], acc);\n\n      subtree[i - 1] = t;\n    }\n\n    for (i = 1; i < num_subtree; i++) {\n      // assert(tree_id[suproot[i]] != tree_id[subroot[i]]);\n\n      t = wmergetree(\n          subtree[tree_id[suproot[i]] - 1], subtree[tree_id[subroot[i]] - 1],\n          order[tree_id[suproot[i]] - 1], order[tree_id[subroot[i]] - 1],\n          xs[s[suproot[i]]], ys[suproot[i]], acc);\n\n      subtree[tree_id[subroot[i]] - 1].deg = 0;\n      subtree[tree_id[suproot[i]] - 1]     = t;\n    }\n\n    t = subtree[0];\n    free(subtree);\n\n    if (best_t.length < t.length) {\n      if (*best_round - rounds >= EARLY_QUIT_CRITERIA) {\n        if (t.branch) {\n          free(t.branch);\n        }\n        break;\n      }\n    } else if (best_t.length == t.length) {\n      *best_round = rounds;\n    } else if (best_t.length > t.length) {\n      if (best_t.branch) {\n        free(best_t.branch);\n      }\n      best_t      = t;\n      *best_round = rounds;\n    }\n\n    if (rounds > 0) {\n      for (i = 0; i < d; i++) {\n        x[i] = xs[s[i]];\n        y[i] = ys[i];\n      }\n\n      *min_node1 = edges[0];\n      *min_node2 = edges[1];\n\n      update_dist2(t, dist, longest, edges, min_node1, min_node2, nb);\n    }\n\n    if (t.branch != 0 && best_t.branch != t.branch) {\n      free(t.branch);\n    }\n\n    rounds--;\n  }\n\n#if !(MR_FOR_SMALL_CASES_ONLY)\n  free(subroot);\n  free(suproot);\n  free(idx);\n  free(offset);\n  free(order);\n  free(isSuperRoot);\n  free(tree_id);\n  free(tree_size);\n  free(edges);\n  free(order_base);\n  free(new_s);\n  free(si);\n  free(xmap);\n  free(x);\n  free(y);\n#endif\n\n#if USE_HASHING\n  if 
(new_ht) {\n    for (i = 0; i <= d; i++) {\n      dl_forall(Tree, ht[i], ttmp) { free(ttmp.branch); }\n      dl_endfor;\n      dl_free(ht[i]);\n    }\n  }\n#endif\n\n  return best_t;\n}\n\nTree flute_am(int d, DTYPE* xs, DTYPE* ys, int* s, int acc, DTYPE* threshold_x,\n              DTYPE* threshold_y, DTYPE* threshold) {\n  int i, j, k, m, n, itr, node1, node2;\n  DTYPE smallest_gap, gap;\n  Tree t, t0, *subtree;\n  int prev_effort;\n  /*\n  int num_subtree, subroot[MAXPART], suproot[MAXPART], isSuperRoot[MAXD];\n  int tree_id[MAXD], tid, tree_size[MAXD], edges[2*MAXD];\n  int idx[MAXPART], offset[MAXPART], *order[MAXT],\n        order_base[MAXD+10]; //order_base[MAXT*MAXD];\n  DTYPE x[MAXD+MAXPART], y[MAXD+MAXPART];\n  int new_s[MAXD+MAXPART], si[MAXD], xmap[MAXD+MAXPART];\n  */\n  DTYPE *x, *y;\n  int num_subtree, subroot[3], suproot[3], *isSuperRoot;\n  int *tree_id, tid, *tree_size, *edges;\n  int idx[3], offset[3], *order[3], *order_base;\n  int *new_s, *si, *xmap;\n\n  int maxd    = d + 1;\n  x           = (DTYPE*)malloc(sizeof(DTYPE) * (maxd + 3));\n  y           = (DTYPE*)malloc(sizeof(DTYPE) * (maxd + 3));\n  isSuperRoot = (int*)malloc(sizeof(int) * maxd);\n  tree_id     = (int*)malloc(sizeof(int) * maxd);\n  tree_size   = (int*)malloc(sizeof(int) * maxd);\n  edges       = (int*)malloc(sizeof(int) * maxd * 2);\n  order_base  = (int*)malloc(sizeof(int) * (maxd + 10));\n  new_s       = (int*)malloc(sizeof(int) * (maxd + 3));\n  si          = (int*)malloc(sizeof(int) * maxd);\n  xmap        = (int*)malloc(sizeof(int) * (maxd + 3));\n\n  /*\n  for (i=0; i<MAXT; i++) {\n    order[i] = &(order_base[i*MAXD]);\n  }\n  */\n\n  for (i = 0; i < d; i++) {\n    x[i]           = xs[s[i]];\n    y[i]           = ys[i];\n    isSuperRoot[i] = 0;\n  }\n\n  build_rmst((long)d, x, y, edges, tree_id);\n\n  for (i = 0; i < d; i++) {\n    tree_size[i] = 1; // the node itself\n  }\n\n  suproot[0] = subroot[0] = edges[0];\n  num_subtree             = 1;\n\n  /*\n  for (i=2*d-3; 
i>=0; ) {\n    node2 = edges[i--];\n    node1 = edges[i--];\n    j = tree_size[node1]+tree_size[node2];\n    //if (j >= d/2) {\n    if (j >= d/2 && num_subtree<2) {\n      isSuperRoot[node1] = 1;\n      suproot[num_subtree] = node1;\n      subroot[num_subtree++] = node2;\n    } else {\n      tree_size[node1] = j;\n    }\n  }\n  */\n\n  for (i = 2 * d - 3; i >= 0;) {\n    node2 = edges[i--];\n    node1 = edges[i--];\n    tree_size[node1] += tree_size[node2];\n  }\n\n  j            = 0;\n  smallest_gap = ADIFF(tree_size[j], d / 2);\n  for (i = 1; i < d; i++) {\n    gap = ADIFF(tree_size[i], d / 2);\n    if (gap < smallest_gap) {\n      j            = i;\n      smallest_gap = gap;\n    }\n  }\n\n  for (i = 2 * d - 3; i >= 0;) {\n    node2 = edges[i--];\n    node1 = edges[i--];\n    if (node2 == j) {\n      isSuperRoot[node1]     = 1;\n      suproot[num_subtree]   = node1;\n      subroot[num_subtree++] = node2;\n      tree_size[subroot[0]] -= tree_size[j];\n      break;\n    }\n  }\n\n  // assert(num_subtree<=MAXT);\n\n  for (i = 1; i < num_subtree; i++) {\n    tree_id[subroot[i]] = i + 1;\n    tree_size[subroot[i]] += 1; // to account for the link to parent tree\n  }\n\n  for (i = 0; i < 2 * d - 2;) {\n    node1 = edges[i++];\n    node2 = edges[i++];\n    if (tree_id[node2] == 1) { // non-root node\n      tree_id[node2] = tree_id[node1];\n    }\n  }\n\n  // Find inverse si[] of s[]\n  for (i = 0; i < d; i++)\n    si[s[i]] = i;\n\n  offset[1] = 0;\n  for (i = 1; i < num_subtree; i++) {\n    offset[i + 1] = offset[i] + tree_size[subroot[i - 1]];\n  }\n  // assert(offset[num_subtree]==d+num_subtree-1-tree_size[subroot[num_subtree-1]]);\n\n  for (i = 0; i < num_subtree; i++) {\n    order[i] = &(order_base[offset[i + 1]]);\n  }\n\n  for (i = 0; i <= num_subtree; i++)\n    idx[i] = 0;\n\n  for (i = 0; i < d; i++) {\n    tid                = tree_id[si[i]];\n    j                  = idx[tid]++;\n    x[offset[tid] + j] = xs[i];\n    xmap[i]            = j;\n\n    if 
(isSuperRoot[si[i]]) {\n      for (k = 1; k < num_subtree; k++) {\n        if (suproot[k] == si[i]) {\n          tid                = k + 1;\n          j                  = idx[tid]++;\n          x[offset[tid] + j] = xs[i];\n          xmap[d - 1 + tid]  = j;\n        }\n      }\n    }\n  }\n\n  for (i = 0; i <= num_subtree; i++)\n    idx[i] = 0;\n\n  for (i = 0; i < d; i++) {\n    tid                    = tree_id[i];\n    j                      = idx[tid]++;\n    y[offset[tid] + j]     = ys[i];\n    new_s[offset[tid] + j] = xmap[s[i]];\n    order[tid - 1][j]      = i;\n\n    if (isSuperRoot[i]) {\n      for (k = 1; k < num_subtree; k++) {\n        if (suproot[k] == i) {\n          tid                    = k + 1;\n          j                      = idx[tid]++;\n          y[offset[tid] + j]     = ys[i];\n          new_s[offset[tid] + j] = xmap[d - 1 + tid];\n          order[tid - 1][j]      = i;\n        }\n      }\n    }\n  }\n\n  subtree = (Tree*)malloc(num_subtree * sizeof(Tree));\n  for (i = 1; i <= num_subtree; i++) {\n    if (tree_size[subroot[i - 1]] <= 1) {\n      subtree[i - 1].deg = 0;\n      continue;\n    }\n\n    t = flutes_ALLD(tree_size[subroot[i - 1]], x + offset[i], y + offset[i],\n                    new_s + offset[i], acc);\n    subtree[i - 1] = t;\n  }\n\n  for (i = 1; i < num_subtree; i++) {\n    // assert(tree_id[suproot[i]] != tree_id[subroot[i]]);\n\n    t = xmergetree(\n        subtree[tree_id[suproot[i]] - 1], subtree[tree_id[subroot[i]] - 1],\n        order[tree_id[suproot[i]] - 1], order[tree_id[subroot[i]] - 1],\n        xs[s[suproot[i]]], ys[suproot[i]]);\n\n    subtree[tree_id[subroot[i]] - 1].deg = 0;\n    subtree[tree_id[suproot[i]] - 1]     = t;\n  }\n\n  t0 = subtree[0];\n  free(subtree);\n\n  t = t0;\n\n  free(x);\n  free(y);\n  free(isSuperRoot);\n  free(tree_id);\n  free(tree_size);\n  free(edges);\n  free(order_base);\n  free(new_s);\n  free(si);\n  free(xmap);\n\n  return t;\n}\n\nTree flutes_HD(int d, DTYPE* xs, DTYPE* ys, 
int* s, int acc) {\n  int i, A, orig_D3;\n  Tree t;\n  // DTYPE *dist[MAXD], *dist_base;\n  DTYPE **dist, *dist_base;\n  DTYPE threshold, threshold_x, threshold_y;\n  int best_round, min_node1, min_node2;\n  int** nb;\n  DTYPE prev_len;\n\n  // Chris\n  if (d <= D2(acc)) {\n    if (acc <= 6) {\n      FIRST_ROUND = 0;\n      A           = acc;\n    } else {\n      FIRST_ROUND = acc - 6;\n      A           = 6 + ((acc - 5) / 4) * 2; // Even A is better\n    }\n    EARLY_QUIT_CRITERIA = (int)(0.75 * FIRST_ROUND + 0.5);\n\n    dist_base = (DTYPE*)malloc(d * d * sizeof(DTYPE));\n    dist      = (DTYPE**)malloc(d * sizeof(DTYPE*));\n    nb        = (int**)malloc(d * sizeof(int*));\n    for (i = 0; i < d; i++) {\n      dist[i]  = &(dist_base[i * d]);\n      nb[i]    = (int*)malloc(DEFAULT_QSIZE * sizeof(int));\n      nb[i][0] = DEFAULT_QSIZE;\n      nb[i][1] = 2; // queue head\n    }\n\n    t = flute_mr(d, xs, ys, s, A, FIRST_ROUND, dist, &threshold_x, &threshold_y,\n                 &threshold, &best_round, &min_node1, &min_node2, nb);\n\n    free(dist_base);\n    free(dist);\n    for (i = 0; i < d; i++) {\n      free(nb[i]);\n    }\n    free(nb);\n  } else {\n    A       = acc;\n    orig_D3 = D3;\n    if (orig_D3 >= INFNTY && d > 1000) {\n      D3 = (d <= 10000) ? 
1000 : 10000;\n    }\n    t = flute_am(d, xs, ys, s, A, &threshold_x, &threshold_y, &threshold);\n    if (orig_D3 >= INFNTY) {\n      D3 = orig_D3;\n    }\n  }\n\n  return t;\n}\n\nint pickWin(Tree t, DTYPE cx, DTYPE cy, int inWin[]) {\n#if MR_FOR_SMALL_CASES_ONLY\n  int i, j, n, dd, cnt, stack[D2M * 2], top = 0, prev, curr, next;\n  int isPin_base[D2M], *isPin, nghbr_base[D2M], *nghbr, q[2 * D2M];\n#else\n  int i, j, n, dd, cnt, *stack, top = 0, prev, curr, next;\n  int *isPin_base, *isPin, *nghbr_base, *nghbr, *q;\n\n  stack      = (int*)malloc(sizeof(int) * t.deg * 2);\n  isPin_base = (int*)malloc(sizeof(int) * t.deg);\n  nghbr_base = (int*)malloc(sizeof(int) * t.deg);\n  q          = (int*)malloc(sizeof(int) * t.deg * 2);\n#endif\n\n  if (t.deg <= 3) {\n    for (i = 0; i < t.deg * 2 - 2; i++) {\n      inWin[i] = 1;\n    }\n#if !(MR_FOR_SMALL_CASES_ONLY)\n    free(stack);\n    free(isPin_base);\n    free(nghbr_base);\n    free(q);\n#endif\n    return t.deg;\n  }\n\n  memset(nghbr_base, 0, sizeof(int) * t.deg);\n  nghbr = &(nghbr_base[0]) - t.deg;\n  isPin = &(isPin_base[0]) - t.deg;\n\n  for (i = 0; i < t.deg; i++) {\n    isPin_base[i] = -1;\n  }\n\n  dd = t.deg * 2 - 2;\n  for (i = 0; i < dd; i++) {\n    inWin[i] = 0;\n    nghbr[t.branch[i].n]++;\n  }\n\n  for (i = t.deg + 1; i < dd; i++) {\n    nghbr[i] += nghbr[i - 1];\n  }\n  nghbr[dd] = nghbr[dd - 1];\n\n  for (i = 0; i < dd; i++) {\n    q[--nghbr[t.branch[i].n]] = i;\n  }\n\n  cnt = 0;\n  for (i = 0; i < t.deg; i++) {\n    n = t.branch[i].n;\n    if (t.branch[n].x == t.branch[i].x && t.branch[n].y == t.branch[i].y) {\n      isPin[n] = i; // this steiner node coincides with a pin\n    }\n    if (t.branch[i].x == cx && t.branch[i].y == cy) {\n      inWin[i] = 1;\n      cnt++;\n      if (isPin[n] == i) {\n        inWin[n]     = 1;\n        stack[top++] = t.branch[n].n;\n        for (j = nghbr[n]; j < nghbr[n + 1]; j++) {\n          if (q[j] == i) {\n            continue;\n          }\n          stack[top++] = 
q[j];\n        }\n      } else {\n        stack[top++] = n;\n      }\n    }\n  }\n  // assert(top > 0);\n\n  while (top > 0) {\n    i = stack[--top];\n    if (inWin[i]) {\n      continue;\n    }\n    inWin[i] = 1;\n    if (i < t.deg) {\n      cnt++;\n      continue;\n    }\n    n = isPin[i];\n    if (n >= 0) {\n      if (!inWin[n]) {\n        inWin[n] = 1;\n        cnt++;\n      }\n    } else {\n      stack[top++] = t.branch[i].n;\n      for (j = nghbr[i]; j < nghbr[i + 1]; j++) {\n        stack[top++] = q[j];\n      }\n    }\n  }\n\n  for (i = 0; i < t.deg; i++) {\n    if (inWin[i]) {\n      n = t.branch[i].n;\n      if (isPin[n] != i) {\n        continue;\n      }\n      prev = n;\n      curr = t.branch[n].n;\n      next = t.branch[curr].n;\n      while (curr != next) {\n        t.branch[curr].n = prev;\n        prev             = curr;\n        curr             = next;\n        next             = t.branch[curr].n;\n      }\n      t.branch[curr].n = prev;\n      t.branch[n].n    = n;\n    }\n  }\n\n  // assert(cnt>0);\n#if !(MR_FOR_SMALL_CASES_ONLY)\n  free(stack);\n  free(isPin_base);\n  free(nghbr_base);\n  free(q);\n#endif\n  return cnt;\n}\n\n/* merge tree t2 into tree t1, which have shared common nodes.  
The intention\n   is to add the non-common tree nodes of t2 into t1, as well as the\n   associated steiner nodes */\nTree merge_into(Tree t1, Tree t2, int common[], int nc, int* o1, int* o2) {\n  Tree t;\n  DTYPE cx, cy;\n#if MR_FOR_SMALL_CASES_ONLY\n  int i, j, k, d, n, offset, map[2 * D2M], reachable[2 * D2M];\n  int o[D2M + MAXPART];\n#else\n  int i, j, k, d, n, offset, *map, *reachable;\n  int* o;\n\n  map       = (int*)malloc(sizeof(int) * (t1.deg + t2.deg) * 2);\n  reachable = (int*)malloc(sizeof(int) * (t1.deg + t2.deg) * 2);\n  o         = (int*)malloc(sizeof(int) * (t1.deg + t2.deg + MAXPART2));\n#endif\n\n  if (t2.deg <= nc) {\n    free(t2.branch);\n#if !(MR_FOR_SMALL_CASES_ONLY)\n    free(map);\n    free(reachable);\n    free(o);\n#endif\n    return t1;\n  }\n\n  t.deg    = t1.deg + t2.deg - nc;\n  t.branch = (Branch*)malloc((2 * t.deg - 2) * sizeof(Branch));\n\n  offset = t2.deg - nc;\n\n  for (i = t1.deg; i < t1.deg * 2 - 2; i++) {\n    t.branch[i + offset]   = t1.branch[i];\n    t.branch[i + offset].n = t1.branch[i].n + offset;\n  }\n\n  memset(reachable, 0, sizeof(int) * 2 * t2.deg);\n  for (i = 2 * t2.deg - 3; i >= 0; i--) {\n    if (!common[i] && t2.branch[i].n != i) {\n      reachable[t2.branch[i].n] = 1;\n    }\n  }\n\n  for (i = 2 * t2.deg - 3; i >= 0; i--) {\n    map[i] = -1;\n  }\n\n  d = t1.deg * 2 - 2;\n\n  /* a slow method; could be sped up here */\n  for (i = 0; i < t2.deg; i++) {\n    if (common[i]) {\n      n = t2.branch[i].n;\n      if (map[n] != -1 || !reachable[n]) {\n        continue;\n      }\n      if (t2.branch[i].x != t2.branch[n].x ||\n          t2.branch[i].y != t2.branch[n].y) {\n        continue;\n      }\n      for (j = 0; j < t1.deg; j++) {\n        if (t1.branch[j].x == t2.branch[i].x &&\n            t1.branch[j].y == t2.branch[i].y) {\n          break;\n        }\n      }\n      // assert(j<t1.deg);\n      n = t1.branch[j].n;\n      if (t1.branch[j].x == t1.branch[n].x &&\n          t1.branch[j].y == t1.branch[n].y) {\n   
     /* pin j in t1 is also a steiner node */\n        map[i] = n;\n      } else { // create a steiner node for the common pin in t1\n        t.branch[d + offset] = t1.branch[j];\n        t.branch[d + offset].n += offset;\n        t1.branch[j].n = d;\n        map[i]         = d;\n        d++;\n        // assert(d+offset<=t.deg*2-2);\n      }\n\n      map[t2.branch[i].n] = map[i];\n    }\n  }\n\n  for (; i < 2 * t2.deg - 2; i++) {\n    if (map[i] == -1 && !common[i] && reachable[i]) {\n      map[i] = d++;\n      // assert(d+offset<=t.deg*2-2);\n    }\n  }\n\n  /* merge the pin nodes in the correct order */\n  j = i = k = 0;\n  while (k < t2.deg && common[k]) {\n    k++;\n  }\n\n  do {\n    if (k >= t2.deg) {\n      for (; i < t1.deg; i++) {\n        t.branch[j]   = t1.branch[i];\n        t.branch[j].n = t1.branch[i].n + offset;\n        o[j]          = o1[i];\n        j++;\n      }\n    } else if (i >= t1.deg) {\n      for (; k < t2.deg; k++) {\n        if (common[k]) {\n          continue;\n        }\n        t.branch[j] = t2.branch[k];\n        n           = t2.branch[k].n;\n        // assert(map[n]>=t1.deg);\n        t.branch[j].n = map[n] + offset;\n        o[j]          = o2[k];\n        j++;\n      }\n    } else if (o1[i] < o2[k]) {\n      t.branch[j]   = t1.branch[i];\n      t.branch[j].n = t1.branch[i].n + offset;\n      o[j]          = o1[i];\n      j++;\n      i++;\n    } else {\n      t.branch[j] = t2.branch[k];\n      n           = t2.branch[k].n;\n      // assert(map[n]>=t1.deg);\n      t.branch[j].n = map[n] + offset;\n      o[j]          = o2[k];\n      j++;\n      do {\n        k++;\n      } while (k < t2.deg && common[k]);\n    }\n  } while (i < t1.deg || k < t2.deg);\n  // assert(j==t.deg);\n\n  for (i = 0; i < j; i++) {\n    o1[i] = o[i];\n  }\n\n  j += t1.deg - 2;\n  for (i = t2.deg; i < 2 * t2.deg - 2; i++) {\n    if (!common[i] && reachable[i]) {\n      t.branch[map[i] + offset] = t2.branch[i];\n      n                         = 
t2.branch[i].n;\n      // assert(map[n]>=t1.deg);\n      t.branch[map[i] + offset].n = map[n] + offset;\n      j++;\n    }\n  }\n\n  j = d + offset;\n  // assert(j <= t.deg*2-2);\n  while (j < t.deg * 2 - 2) {\n    /* redundant steiner nodes */\n    t.branch[j]   = t2.branch[0];\n    t.branch[j].n = j;\n    j++;\n  }\n\n  /*\n  for (i=0; i<t.deg; i++) {\n    assert(t.branch[i].n>=t.deg);\n  }\n  */\n\n  t.length = wirelength(t);\n\n  free(t1.branch);\n  free(t2.branch);\n\n#if !(MR_FOR_SMALL_CASES_ONLY)\n  free(map);\n  free(reachable);\n  free(o);\n#endif\n  return t;\n}\n\n/* simply merge two trees at their common node */\nTree smergetree(Tree t1, Tree t2, int* o1, int* o2, DTYPE cx, DTYPE cy) {\n  Tree t;\n  int d, i, j, k, n, ci, cn, mapped_cn, prev, curr, next, offset;\n#if MR_FOR_SMALL_CASES_ONLY\n  int o[D2M + MAXPART], map[2 * D2M];\n#else\n  int *o, *map;\n\n  map = (int*)malloc(sizeof(int) * (t1.deg + t2.deg) * 2);\n  o   = (int*)malloc(sizeof(int) * (t1.deg + t2.deg + MAXPART2));\n#endif\n\n  t.deg    = t1.deg + t2.deg - 1;\n  t.branch = (Branch*)malloc((2 * t.deg - 2) * sizeof(Branch));\n\n  offset = t2.deg - 1;\n  d      = t1.deg * 2 - 2;\n\n  for (i = 0; i < t1.deg; i++) {\n    if (t1.branch[i].x == cx && t1.branch[i].y == cy) {\n      break;\n    }\n  }\n  n = t1.branch[i].n;\n  if (t1.branch[n].x == cx && t1.branch[n].y == cy) {\n    mapped_cn = n;\n  } else {\n    t.branch[d + offset] = t1.branch[i];\n    t.branch[d + offset].n += offset;\n    t1.branch[i].n = d;\n    mapped_cn      = d;\n    d++;\n  }\n\n  for (i = t2.deg; i < t2.deg * 2 - 2; i++) {\n    map[i] = -1;\n  }\n  for (i = 0; i < t2.deg; i++) {\n    if (t2.branch[i].x == cx && t2.branch[i].y == cy) {\n      ci = i;\n      break;\n    }\n  }\n\n  n = t2.branch[i].n;\n\n  if (t2.branch[n].x == cx && t2.branch[n].y == cy) {\n    cn = n;\n  } else {\n    cn = i;\n  }\n\n  prev = n;\n  curr = t2.branch[n].n;\n  next = t2.branch[curr].n;\n  while (curr != next) {\n    t2.branch[curr].n = 
prev;\n    prev              = curr;\n    curr              = next;\n    next              = t2.branch[curr].n;\n  }\n  t2.branch[curr].n = prev;\n  t2.branch[n].n    = cn;\n\n  for (i = t2.deg; i < 2 * t2.deg - 2; i++) {\n    if (i != cn) {\n      map[i] = d++;\n    }\n  }\n  map[cn] = mapped_cn;\n\n  /* merge the pin nodes in the correct order */\n  j = i = k = 0;\n  if (k == ci) {\n    k++;\n  }\n\n  do {\n    if (k >= t2.deg) {\n      for (; i < t1.deg; i++) {\n        t.branch[j]   = t1.branch[i];\n        t.branch[j].n = t1.branch[i].n + offset;\n        o[j]          = o1[i];\n        j++;\n      }\n    } else if (i >= t1.deg) {\n      for (; k < t2.deg; k++) {\n        if (k == ci) {\n          continue;\n        }\n        t.branch[j]   = t2.branch[k];\n        n             = t2.branch[k].n;\n        t.branch[j].n = map[n] + offset;\n        o[j]          = o2[k];\n        j++;\n      }\n    } else if (o1[i] < o2[k]) {\n      t.branch[j]   = t1.branch[i];\n      t.branch[j].n = t1.branch[i].n + offset;\n      o[j]          = o1[i];\n      j++;\n      i++;\n    } else {\n      t.branch[j]   = t2.branch[k];\n      n             = t2.branch[k].n;\n      t.branch[j].n = map[n] + offset;\n      o[j]          = o2[k];\n      j++;\n      k++;\n      if (k == ci) {\n        k++;\n      }\n    }\n  } while (i < t1.deg || k < t2.deg);\n  // assert(j==t.deg);\n\n  for (i = 0; i < j; i++) {\n    o1[i] = o[i];\n  }\n\n  for (i = t1.deg; i < t1.deg * 2 - 2; i++) {\n    t.branch[i + offset]   = t1.branch[i];\n    t.branch[i + offset].n = t1.branch[i].n + offset;\n  }\n\n  for (i = t2.deg; i < 2 * t2.deg - 2; i++) {\n    if (i != cn) {\n      t.branch[map[i] + offset]   = t2.branch[i];\n      n                           = t2.branch[i].n;\n      t.branch[map[i] + offset].n = map[n] + offset;\n    }\n  }\n\n  for (i = d + offset; i < t.deg * 2 - 2; i++) {\n    t.branch[i]   = t2.branch[0];\n    t.branch[i].n = i;\n  }\n\n  free(t1.branch);\n  free(t2.branch);\n\n  t.length 
= wirelength(t);\n\n#if !(MR_FOR_SMALL_CASES_ONLY)\n  free(map);\n  free(o);\n#endif\n  return t;\n}\n\n/* window-based heuristics */\nTree wmergetree(Tree t1, Tree t2, int* order1, int* order2, DTYPE cx, DTYPE cy,\n                int acc) {\n  Tree t, t3, t4;\n#if MR_FOR_SMALL_CASES_ONLY\n  int s[D2M], inWin[2 * D2M], d, d2, i, ci, n;\n  int i1, i2, o[D2M], os[D2M], si[D2M];\n  DTYPE x[D2M], y[D2M], tmp;\n#else\n  int *s, *inWin, d, d2, i, ci, n;\n  int i1, i2, *o, *os, *si;\n  DTYPE *x, *y, tmp;\n\n  s     = (int*)malloc(sizeof(int) * (t1.deg + t2.deg));\n  inWin = (int*)malloc(sizeof(int) * (t1.deg + t2.deg) * 2);\n  o     = (int*)malloc(sizeof(int) * (t1.deg + t2.deg));\n  os    = (int*)malloc(sizeof(int) * (t1.deg + t2.deg));\n  si    = (int*)malloc(sizeof(int) * (t1.deg + t2.deg));\n  x     = (DTYPE*)malloc(sizeof(DTYPE) * (t1.deg + t2.deg));\n  y     = (DTYPE*)malloc(sizeof(DTYPE) * (t1.deg + t2.deg));\n#endif\n\n  if (t1.deg <= 0) {\n    for (i = 0; i < t2.deg; i++) {\n      order1[i] = order2[i];\n    }\n#if !(MR_FOR_SMALL_CASES_ONLY)\n    free(s);\n    free(inWin);\n    free(o);\n    free(os);\n    free(si);\n    free(x);\n    free(y);\n#endif\n    return t2;\n  } else if (t2.deg <= 0) {\n#if !(MR_FOR_SMALL_CASES_ONLY)\n    free(s);\n    free(inWin);\n    free(o);\n    free(os);\n    free(si);\n    free(x);\n    free(y);\n#endif\n    return t1;\n  }\n\n  d  = pickWin(t1, cx, cy, inWin);\n  d2 = pickWin(t2, cx, cy, inWin + 2 * t1.deg);\n  d += d2;\n\n  // if (d<t1.deg+t2.deg && t1.deg>2 && t2.deg>2) {\n  if (d < t1.deg + t2.deg) {\n    for (i = 0; i < t2.deg; i++) {\n      if (t2.branch[i].x == cx && t2.branch[i].y == cy) {\n        ci = i;\n        break;\n      }\n    }\n\n    d--; // to exclude the duplicated common point (cx, cy)\n\n    n = 0;\n\n    i1 = i2 = 0;\n    while (i1 < t1.deg && !inWin[i1]) {\n      i1++;\n    }\n    while (i2 < t2.deg && (!inWin[i2 + 2 * t1.deg] || i2 == ci)) {\n      i2++;\n    }\n    do {\n      if (i2 >= t2.deg) {\n     
   for (; i1 < t1.deg; i1++) {\n          if (inWin[i1]) {\n            x[n] = t1.branch[i1].x;\n            y[n] = t1.branch[i1].y;\n            o[n] = order1[i1];\n            n++;\n          }\n        }\n      } else if (i1 >= t1.deg) {\n        for (; i2 < t2.deg; i2++) {\n          if (inWin[i2 + 2 * t1.deg] && i2 != ci) {\n            x[n] = t2.branch[i2].x;\n            y[n] = t2.branch[i2].y;\n            o[n] = order2[i2];\n            n++;\n          }\n        }\n      } else if (order1[i1] < order2[i2]) {\n        x[n] = t1.branch[i1].x;\n        y[n] = t1.branch[i1].y;\n        o[n] = order1[i1];\n        n++;\n        i1++;\n        while (i1 < t1.deg && !inWin[i1]) {\n          i1++;\n        }\n      } else {\n        x[n] = t2.branch[i2].x;\n        y[n] = t2.branch[i2].y;\n        o[n] = order2[i2];\n        n++;\n        i2++;\n        while (i2 < t2.deg && (!inWin[i2 + 2 * t1.deg] || i2 == ci)) {\n          i2++;\n        }\n      }\n    } while (i1 < t1.deg || i2 < t2.deg);\n    // assert(n==d);\n\n    for (i = 0; i < d; i++) {\n      si[i] = i;\n    }\n    for (i = 0; i < d; i++) {\n      n = i;\n      for (i1 = i + 1; i1 < d; i1++) {\n        if (x[i1] < x[n]) {\n          n = i1;\n        }\n      }\n      tmp   = x[i];\n      x[i]  = x[n];\n      x[n]  = tmp;\n      tmp   = si[n];\n      si[n] = si[i];\n      si[i] = tmp;\n    }\n    for (i = 0; i < d; i++) {\n      os[si[i]] = i;\n    }\n    t3 = flutes_LMD(d, x, y, os, acc);\n    t  = merge_into(t3, t2, inWin + 2 * t1.deg, d2, o, order2);\n    t4 = merge_into(t, t1, inWin, d + 1 - d2, o, order1);\n  } else if (t2.deg > 2) {\n    for (i = 0; i < t2.deg; i++) {\n      o[i] = order2[i];\n    }\n    t4 = smergetree(t2, t1, o, order1, cx, cy);\n  } else {\n    for (i = 0; i < t1.deg; i++) {\n      o[i] = order1[i];\n    }\n    t4 = smergetree(t1, t2, o, order2, cx, cy);\n  }\n\n  for (i = 0; i < t4.deg; i++) {\n    order1[i] = o[i];\n  }\n\n#if !(MR_FOR_SMALL_CASES_ONLY)\n  free(s);\n  
free(inWin);\n  free(o);\n  free(os);\n  free(si);\n  free(x);\n  free(y);\n#endif\n\n  return t4;\n}\n\n/* xmerge heuristics */\ntypedef struct TreeNode_s {\n  struct TreeNode_s* parent;\n  dl_t children;\n  int order, id;\n  unsigned int mark;\n  DTYPE x, y;\n  DTYPE blen; // length of this edge (i.e. branch length)\n  // longest edge from here, use child node of an edge to represent it\n  struct TreeNode_s* e;\n  DTYPE len; // len of current e\n} TreeNode;\n\nvoid redirect(Tree t, DTYPE cx, DTYPE cy) {\n  int i, root, prev, curr, next;\n\n  /* assume that one of the nodes must match (cx, cy) */\n  root = 0;\n  for (i = 1; i < t.deg; i++) {\n    if (t.branch[i].x == cx && t.branch[i].y == cy) {\n      root = i;\n      break;\n    }\n  }\n\n  prev = root;\n  curr = t.branch[root].n;\n  next = t.branch[curr].n;\n  while (curr != next) {\n    t.branch[curr].n = prev;\n    prev             = curr;\n    curr             = next;\n    next             = t.branch[curr].n;\n  }\n  t.branch[curr].n = prev;\n\n  t.branch[root].n = root;\n}\n\nvoid update_subtree(TreeNode* p, int id) {\n  TreeNode *child, *grandp;\n  dl_t subtree = dl_alloc();\n\n  dl_append(TreeNode*, subtree, p);\n\n  while (dl_length(subtree) > 0) {\n    dl_pop_first(TreeNode*, subtree, p);\n    p->e   = p;\n    grandp = p->parent;\n    if (grandp) {\n      p->len = p->blen = ADIFF(p->x, grandp->x) + ADIFF(p->y, grandp->y);\n      if (p->len < grandp->len) {\n        p->len = grandp->len;\n        p->e   = grandp->e;\n      }\n    } else {\n      p->len = 0;\n    }\n\n    if (id) {\n      p->id = id;\n    }\n\n    dl_forall(TreeNode*, p->children, child) {\n      dl_prepend(TreeNode*, subtree, child);\n    }\n    dl_endfor;\n  }\n\n  dl_free(subtree);\n}\n\nTreeNode* createRootedTree(Tree t, int* order, int id, dl_t list_of_nodes) {\n  int i, dd, n;\n  TreeNode *root = 0, **nodes, *p;\n\n  dd    = t.deg * 2 - 2;\n  nodes = (TreeNode**)malloc(sizeof(TreeNode*) * dd);\n  for (i = 0; i < dd; i++) {\n    
nodes[i]           = (TreeNode*)malloc(sizeof(TreeNode));\n    nodes[i]->mark     = curr_mark;\n    nodes[i]->children = dl_alloc();\n  }\n\n  curr_mark++;\n  for (i = 0; i < dd; i++) {\n    nodes[i]->mark = curr_mark;\n    n              = t.branch[i].n;\n    if (i == n) {\n      if (i < t.deg) {\n        // assert(root==0);\n        nodes[i]->parent = 0;\n        root             = nodes[i];\n      } else { /* must be redundant */\n        dl_free(nodes[i]->children);\n        free(nodes[i]);\n        nodes[i] = 0;\n        continue;\n      }\n    } else {\n      p                = nodes[n];\n      nodes[i]->parent = p;\n      dl_append(TreeNode*, p->children, nodes[i]);\n    }\n    nodes[i]->order = (i < t.deg) ? order[i] : -1;\n    nodes[i]->id    = id;\n    nodes[i]->x     = t.branch[i].x;\n    nodes[i]->y     = t.branch[i].y;\n\n    /* len will be computed in update_subtree\n    nodes[i]->blen =\n      ADIFF(t.branch[i].x, t.branch[n].x)+ADIFF(t.branch[i].y, t.branch[n].y);\n\n    nodes[i]->e = nodes[i];\n    nodes[i]->len =\n      ADIFF(t.branch[i].x, t.branch[n].x)+ADIFF(t.branch[i].y, t.branch[n].y);\n    */\n\n    dl_append(TreeNode*, list_of_nodes, nodes[i]);\n  }\n\n  // assert(root);\n\n  update_subtree(root, 0);\n\n  for (i = 0; i < dd; i++) {\n    if (nodes[i] && nodes[i]->mark != curr_mark) {\n      dl_free(nodes[i]->children);\n      free(nodes[i]);\n    }\n  }\n\n  free(nodes);\n  return root;\n}\n\nvoid freeTree(TreeNode* t) {\n  TreeNode* child;\n  dl_forall(TreeNode*, t->children, child) { freeTree(child); }\n  dl_endfor;\n  dl_free(t->children);\n  free(t);\n}\n\nint cmpNodeByYX(const void* a, const void* b) {\n  DTYPE ay = (*(TreeNode**)a)->y;\n  DTYPE by = (*(TreeNode**)b)->y;\n  DTYPE ax, bx;\n\n  if (ay < by)\n    return -1;\n  if (ay > by)\n    return 1;\n\n  ax = (*(TreeNode**)a)->x;\n  bx = (*(TreeNode**)b)->x;\n\n  if (ax < bx)\n    return -1;\n  if (ax > bx)\n    return 1;\n  return 0;\n}\n\nint cmpNodeByXY(const void* a, const void* 
b) {\n  DTYPE ax = (*(TreeNode**)a)->x;\n  DTYPE bx = (*(TreeNode**)b)->x;\n  DTYPE ay, by;\n\n  if (ax < bx)\n    return -1;\n  if (ax > bx)\n    return 1;\n\n  ay = (*(TreeNode**)a)->y;\n  by = (*(TreeNode**)b)->y;\n\n  if (ay < by)\n    return -1;\n  if (ay > by)\n    return 1;\n  return 0;\n}\n\nvoid remove_child(dl_t children_list, TreeNode* c) {\n  TreeNode* child;\n  dl_forall(TreeNode*, children_list, child) {\n    if (child == c) {\n      dl_delete_current();\n      break;\n    }\n  }\n  dl_endfor;\n}\n\nvoid cleanTree(TreeNode* tn) {\n  /*\n  TreeNode *c, *p;\n\n  dl_forall(TreeNode*, tn->children, c) {\n    cleanTree(c);\n  } dl_endfor;\n\n  p = tn->parent;\n  if (!p) return;\n\n  if (tn->order >= 0) return;  // don't clean pin nodes\n\n  if (dl_length(tn->children)<=0) {\n    remove_child(p->children, tn);\n    dl_free(tn->children);\n    free(tn);\n  } else if (dl_length(tn->children)<=1) {\n    c = dl_first(TreeNode*, tn->children);\n    c->parent = p;\n    dl_append(TreeNode*, p->children, c);\n    remove_child(p->children, tn);\n    dl_free(tn->children);\n    free(tn);\n  }\n  */\n\n  // non-recursive version\n  TreeNode *c, *p;\n  dl_t nlist = dl_alloc();\n\n  dl_append(TreeNode*, nlist, tn);\n\n  while (dl_length(nlist) > 0) {\n    dl_pop_first(TreeNode*, nlist, tn);\n    dl_forall(TreeNode*, tn->children, c) { dl_append(TreeNode*, nlist, c); }\n    dl_endfor;\n\n    p = tn->parent;\n    if (p && tn->order < 0) {\n      if (dl_length(tn->children) <= 0) {\n        remove_child(p->children, tn);\n        dl_free(tn->children);\n        free(tn);\n      } else if (dl_length(tn->children) <= 1) {\n        c         = dl_first(TreeNode*, tn->children);\n        c->parent = p;\n        dl_append(TreeNode*, p->children, c);\n        remove_child(p->children, tn);\n        dl_free(tn->children);\n        free(tn);\n      }\n    }\n  }\n\n  dl_free(nlist);\n}\n\nint cmpNodeByOrder(void* a, void* b) {\n  int ax = (*(TreeNode**)a)->order;\n  int bx = 
(*(TreeNode**)b)->order;\n\n  if (ax < bx)\n    return -1;\n  if (ax > bx)\n    return 1;\n  return 0;\n}\n\nTree mergeRootedTrees(TreeNode* tn1, TreeNode* tn2, int* order1) {\n  int i, n, redundant;\n  Tree t;\n  TreeNode *child, *p;\n  dl_t list_of_nodes = dl_alloc();\n  dl_t pin_nodes = dl_alloc(), steiner_nodes = dl_alloc();\n\n  // assert(tn1->x==tn2->x && tn1->y==tn2->y);\n\n  /* merge tn2 to tn1 */\n  while (dl_length(tn2->children) > 0) {\n    dl_pop_first(TreeNode*, tn2->children, child);\n    child->parent = tn1;\n    dl_append(TreeNode*, tn1->children, child);\n  }\n  dl_free(tn2->children);\n  free(tn2);\n\n  cleanTree(tn1);\n\n  /* convert tn1 back to a Tree */\n\n  dl_append(TreeNode*, list_of_nodes, tn1);\n  do {\n    dl_pop_first(TreeNode*, list_of_nodes, child);\n    if (child->order < 0) {\n      if (dl_length(child->children) == 1) { /* redundant steiner node */\n        p         = dl_first(TreeNode*, child->children);\n        p->parent = child->parent;\n        /* note that p->parent's children list is already gone */\n        dl_append(TreeNode*, list_of_nodes, p);\n        dl_free(child->children);\n        free(child);\n        continue;\n      } else if (dl_length(child->children) == 0) {\n        dl_free(child->children);\n        free(child);\n        continue;\n      }\n      dl_append(TreeNode*, steiner_nodes, child);\n    } else {\n      dl_append(TreeNode*, pin_nodes, child);\n    }\n    dl_concat(list_of_nodes, child->children);\n  } while (dl_length(list_of_nodes) > 0);\n  dl_free(list_of_nodes);\n\n  dl_sort(pin_nodes, sizeof(TreeNode*), cmpNodeByOrder);\n\n  i = 0;\n  dl_forall(TreeNode*, pin_nodes, child) { child->id = i++; }\n  dl_endfor;\n\n  t.deg = i;\n\n  dl_forall(TreeNode*, steiner_nodes, child) { child->id = i++; }\n  dl_endfor;\n\n  // assert(i<=2*t.deg-2);\n\n  t.branch = (Branch*)malloc(sizeof(Branch) * (t.deg * 2 - 2));\n\n  redundant = i;\n  for (; i < 2 * t.deg - 2; i++) {\n    t.branch[i].n = i;\n    t.branch[i].x 
= tn1->x;\n    t.branch[i].y = tn1->y;\n  }\n\n  t.branch[tn1->id].n = -1;\n\n  dl_forall(TreeNode*, pin_nodes, child) {\n    i = child->id;\n    if (child->order >= 0) {\n      order1[i] = child->order;\n    }\n    t.branch[i].x = child->x;\n    t.branch[i].y = child->y;\n    p             = child->parent;\n    if (p) {\n      if (p->id >= t.deg) {\n        t.branch[i].n = p->id;\n      } else {\n        // assert(p==tn1);\n        // assert(redundant<t.deg*2-2);\n        t.branch[i].n         = redundant;\n        t.branch[p->id].n     = redundant;\n        t.branch[redundant].x = p->x;\n        t.branch[redundant].y = p->y;\n        redundant++;\n      }\n    }\n  }\n  dl_endfor;\n  dl_forall(TreeNode*, steiner_nodes, child) {\n    i = child->id;\n    if (child->order >= 0) {\n      order1[i] = child->order;\n    }\n    t.branch[i].x = child->x;\n    t.branch[i].y = child->y;\n    p             = child->parent;\n    if (p->id < t.deg) { // must be the root\n      if (t.branch[p->id].n < 0) {\n        t.branch[p->id].n = i;\n        t.branch[i].n     = i;\n      } else {\n        n = t.branch[p->id].n;\n        if (t.branch[p->id].x == t.branch[n].x &&\n            t.branch[p->id].y == t.branch[n].y) {\n          t.branch[i].n = n;\n        } else {\n          // assert(redundant<t.deg*2-2);\n          t.branch[redundant].x = t.branch[p->id].x;\n          t.branch[redundant].y = t.branch[p->id].y;\n          t.branch[redundant].n = t.branch[p->id].n;\n          t.branch[p->id].n     = redundant;\n          t.branch[i].n         = redundant;\n          redundant++;\n        }\n      }\n    } else {\n      t.branch[i].n = p->id;\n    }\n  }\n  dl_endfor;\n\n  dl_forall(TreeNode*, pin_nodes, child) { free(child); }\n  dl_endfor;\n  dl_free(pin_nodes);\n\n  dl_forall(TreeNode*, steiner_nodes, child) { free(child); }\n  dl_endfor;\n  dl_free(steiner_nodes);\n\n  t.length = wirelength(t);\n  return t;\n}\n\nvoid collect_nodes(TreeNode* tn, dl_t nlist) {\n  /*\n  
TreeNode* c;\n\n  dl_append(TreeNode*, nlist, tn);\n  dl_forall(TreeNode*, tn->children, c) {\n    collect_nodes(c, nlist);\n  }dl_endfor;\n  */\n  // non-recursive version\n  TreeNode* c;\n  dl_el* curr;\n\n  dl_append(TreeNode*, nlist, tn);\n\n  for (curr = nlist->last; curr; curr = curr->next) {\n    tn = dl_data(TreeNode*, curr);\n    dl_forall(TreeNode*, tn->children, c) { dl_append(TreeNode*, nlist, c); }\n    dl_endfor;\n  }\n}\n\ntypedef struct {\n  TreeNode *n1, *n2;\n  DTYPE new_x, new_y, gain;\n} xdata;\n\nint cmpXdata(void* a, void* b) {\n  DTYPE ga = (*(xdata*)a).gain;\n  DTYPE gb = (*(xdata*)b).gain;\n  if (ga > gb)\n    return -1;\n  if (ga < gb)\n    return 1;\n  return 0;\n}\n\ninline TreeNode* cedge_lca(TreeNode* n1, TreeNode* n2, DTYPE* len,\n                           int* n2ton1) {\n  int i;\n  TreeNode *curr, *lca, *e;\n\n  curr_mark++;\n\n  curr = n1;\n  while (curr) {\n    curr->mark = curr_mark;\n    curr       = curr->parent;\n  }\n\n  lca = n2;\n  while (lca && lca->mark != curr_mark) {\n    lca->mark = curr_mark;\n    lca       = lca->parent;\n  }\n\n  if (!lca) {\n    n1 = n1->parent;\n    if (n1 && n1 != lca && (n1->len > n2->len)) {\n      *n2ton1 = 0;\n      *len    = n1->len;\n      return n1->e;\n    } else {\n      *n2ton1 = 1;\n      *len    = n2->len;\n      return n2->e;\n    }\n  }\n\n  if (lca == n1 || lca == n1->parent || lca == n2) {\n    if (lca != n2) {\n      *n2ton1 = 1;\n      *len    = n2->blen;\n      e       = n2;\n      curr    = n2->parent;\n    } else {\n      *n2ton1 = 0;\n      *len    = n1->blen;\n      e       = n1;\n      curr    = n1->parent;\n    }\n    while (curr != lca) {\n      if (*len < curr->blen) {\n        *len = curr->blen;\n        e    = curr;\n      }\n      curr = curr->parent;\n    }\n    return e;\n  }\n\n  /* lca is above both n1 and n2 */\n  *n2ton1 = 0;\n  n1      = n1->parent;\n  *len    = n1->blen;\n  e       = n1;\n  curr    = n1;\n  for (i = 0; i < 2; i++, curr = n2) {\n    while 
(curr != lca) {\n      if (*len < curr->blen) {\n        if (i > 0) {\n          *n2ton1 = 1;\n        }\n        *len = curr->blen;\n        e    = curr;\n      }\n      curr = curr->parent;\n    }\n  }\n\n  return e;\n}\n\nTreeNode* critical_edge(TreeNode* n1, TreeNode* n2, DTYPE* len, int* n2ton1) {\n  if (n1->id != n2->id) {\n    n1 = n1->parent;\n    if (n1 && (n1->len > n2->len)) {\n      *n2ton1 = 0;\n      *len    = n1->len;\n      return n1->e;\n    } else {\n      *n2ton1 = 1;\n      *len    = n2->len;\n      return n2->e;\n    }\n  }\n\n  return cedge_lca(n1, n2, len, n2ton1);\n}\n\nvoid splice2(TreeNode* n1, TreeNode* n2, TreeNode* e) {\n  TreeNode *curr, *prev, *next, *s;\n\n  // assert(n2->parent);\n  // assert(e->id==n2->id);\n\n  prev = n2;\n  curr = n2->parent;\n  next = curr->parent;\n  while (prev != e) {\n    remove_child(curr->children, prev);\n    curr->parent = prev;\n    dl_append(TreeNode*, prev->children, curr);\n    prev = curr;\n    curr = next;\n    next = curr->parent;\n  }\n  remove_child(curr->children, prev);\n\n  n2->parent = n1;\n  dl_append(TreeNode*, n1->children, n2);\n\n  update_subtree(n1, n1->parent->id);\n}\n\nvoid cut_and_splice(TreeNode* n1, TreeNode* n2, DTYPE new_x, DTYPE new_y,\n                    DTYPE* x1, DTYPE* y1, DTYPE* x2, DTYPE* y2, TreeNode* e,\n                    int n2ton1) {\n  TreeNode *p1, *node, *s;\n\n  /* new steiner node */\n  p1 = n1->parent;\n  remove_child(p1->children, n1);\n  node       = (TreeNode*)malloc(sizeof(TreeNode));\n  node->x    = new_x;\n  node->y    = new_y;\n  node->mark = curr_mark;\n\n  node->parent = p1;\n  dl_append(TreeNode*, p1->children, node);\n  n1->parent     = node;\n  node->children = dl_alloc();\n  dl_append(TreeNode*, node->children, n1);\n  node->order = -1;\n\n  node->e  = n1->e;\n  node->id = n1->id;\n\n  if (*x1 == n1->x) {\n    *x2 = new_x;\n  } else {\n    *x1 = new_x;\n  }\n  if (*y1 == n1->y) {\n    *y2 = new_y;\n  } else {\n    *y1 = new_y;\n  }\n\n  if 
(n2->order >= 0) {\n    /* n2 is a pin, need to replicate a steiner node */\n    s = n2->parent;\n    if (s->x != n2->x || s->y != n2->y) {\n      s        = (TreeNode*)malloc(sizeof(TreeNode));\n      s->mark  = curr_mark;\n      s->order = -1;\n      s->id    = n2->id;\n      s->x     = n2->x;\n      s->y     = n2->y;\n      s->e     = n2->e;\n      if (s->e == n2) {\n        s->e = s;\n      }\n      if (e == n2) {\n        e = s;\n      }\n      s->len   = n2->len;\n      s->blen  = n2->blen;\n      n2->blen = 0;\n\n      remove_child(n2->parent->children, n2);\n      dl_append(TreeNode*, n2->parent->children, s);\n      s->parent   = n2->parent;\n      n2->parent  = s;\n      s->children = dl_alloc();\n      dl_append(TreeNode*, s->children, n2);\n    }\n    n2 = s;\n  }\n\n  if (n2ton1) {\n    splice2(node, n2, e);\n  } else {\n    splice2(n2, node, e);\n  }\n}\n\ntypedef struct {\n  TreeNode *n1, *n2;\n  DTYPE min_dist, new_x, new_y;\n  int n2ton1;\n} splice_info;\n\nDTYPE exchange_branches_order_x(int num_nodes, TreeNode** nodes,\n                                DTYPE threshold_x, DTYPE threshold_y,\n                                DTYPE max_len) {\n  int n2ton1;\n  TreeNode *n1, *p1, *n2, *p2, *node, *e, *s;\n  DTYPE x1, x2, y1, y2, min_dist, new_x, new_y, len;\n  DTYPE gain = 0;\n  int i, j, curr_row, next_header, num_rows, start, end, mid;\n  int* header     = (int*)malloc(sizeof(int) * (num_nodes + 1));\n  dl_t batch_list = dl_alloc();\n  splice_info sinfo;\n\n  int batch_mode = (num_nodes >= D3);\n\n  header[0] = 0;\n\n  y1 = nodes[0]->y;\n  for (i = num_rows = 1; i < num_nodes; i++) {\n    if (nodes[i]->y == y1) {\n      continue;\n    }\n    header[num_rows++] = i;\n    y1                 = nodes[i]->y;\n  }\n  header[num_rows] = i;\n\n  curr_row    = 0;\n  next_header = header[1];\n  for (i = 0; i < num_nodes; i++) {\n    if (i >= next_header) {\n      curr_row++;\n      next_header = header[curr_row + 1];\n    }\n    n1 = nodes[i];\n    p1 = 
n1->parent;\n    if (!p1) {\n      continue;\n    }\n    if (p1->x == n1->x && p1->y == n1->y) {\n      continue;\n    }\n    if (n1->x <= p1->x) {\n      x1 = n1->x;\n      x2 = p1->x;\n    } else {\n      x1 = p1->x;\n      x2 = n1->x;\n    }\n    if (n1->y <= p1->y) {\n      y1 = n1->y;\n      y2 = p1->y;\n    } else {\n      y1 = p1->y;\n      y2 = n1->y;\n    }\n\n    if (curr_row > 0) {\n      for (j = curr_row - 1; j > 0; j--) {\n        if (y1 - threshold_y > nodes[header[j]]->y) {\n          j++;\n          break;\n        }\n      }\n    } else {\n      j = 0;\n    }\n    for (; j < num_rows && nodes[header[j]]->y <= y2 + threshold_y; j++) {\n      /* find the closest node on row j */\n      start = header[j];\n      end   = header[j + 1];\n      while (start < end) {\n        mid = (start + end) / 2;\n        if (nodes[mid]->x <= x1) {\n          start = mid + 1;\n        } else {\n          end = mid;\n        }\n      }\n      // assert(start==end);\n\n      if (start >= header[j + 1]) {\n        continue;\n      }\n      n2 = nodes[start];\n\n      if (batch_mode && n1->id == n2->id)\n        continue;\n\n      if (!n2->parent) {\n        continue;\n      }\n\n      min_dist = n2->x - x2;\n\n      if (abs(min_dist) > threshold_x) {\n        continue;\n      } else if (min_dist < 0) {\n        min_dist = 0;\n        new_x    = n2->x;\n      } else {\n        new_x = x2;\n      }\n\n      if (n2->y < y1) {\n        min_dist += y1 - n2->y;\n        new_y = y1;\n      } else if (n2->y > y2) {\n        min_dist += n2->y - y2;\n        new_y = y2;\n      } else {\n        new_y = n2->y;\n      }\n\n      if (min_dist == 0 || min_dist > max_len) {\n        continue;\n      }\n\n      e = critical_edge(n1, n2, &len, &n2ton1);\n      if (min_dist < len && e != n1) {\n        if (batch_mode) {\n          sinfo.n1       = n1;\n          sinfo.n2       = n2;\n          sinfo.min_dist = min_dist;\n          sinfo.new_x    = new_x;\n          sinfo.new_y    = 
new_y;\n          sinfo.n2ton1   = n2ton1;\n          dl_append(splice_info, batch_list, sinfo);\n        } else {\n          gain += len - min_dist;\n          cut_and_splice(n1, n2, new_x, new_y, &x1, &y1, &x2, &y2, e, n2ton1);\n        }\n      }\n    }\n  }\n\n  dl_forall(splice_info, batch_list, sinfo) {\n    n1       = sinfo.n1;\n    n2       = sinfo.n2;\n    n2ton1   = sinfo.n2ton1;\n    min_dist = sinfo.min_dist;\n\n    e = critical_edge(n1, n2, &len, &n2ton1);\n    if (min_dist < len && e != n1) {\n      gain += len - min_dist;\n      cut_and_splice(n1, n2, sinfo.new_x, sinfo.new_y, &x1, &y1, &x2, &y2, e,\n                     n2ton1);\n    }\n  }\n  dl_endfor;\n\n  dl_free(batch_list);\n\n  free(header);\n\n  return gain;\n}\n\nDTYPE exchange_branches_order_y(int num_nodes, TreeNode** nodes,\n                                DTYPE threshold_x, DTYPE threshold_y,\n                                DTYPE max_len) {\n  int n2ton1;\n  TreeNode *n1, *p1, *n2, *p2, *node, *e, *s;\n  DTYPE x1, x2, y1, y2, min_dist, new_x, new_y, len;\n  DTYPE gain = 0;\n  int i, j, curr_row, next_header, num_rows, start, end, mid;\n  int* header     = (int*)malloc(sizeof(int) * (num_nodes + 1));\n  dl_t batch_list = dl_alloc();\n  splice_info sinfo;\n\n  int batch_mode = (num_nodes >= D3);\n\n  header[0] = 0;\n\n  x1 = nodes[0]->x;\n  for (i = num_rows = 1; i < num_nodes; i++) {\n    if (nodes[i]->x == x1) {\n      continue;\n    }\n    header[num_rows++] = i;\n    x1                 = nodes[i]->x;\n  }\n  header[num_rows] = i;\n\n  curr_row    = 0;\n  next_header = header[1];\n  for (i = 0; i < num_nodes; i++) {\n    if (i >= next_header) {\n      curr_row++;\n      next_header = header[curr_row + 1];\n    }\n    n1 = nodes[i];\n    p1 = n1->parent;\n    if (!p1) {\n      continue;\n    }\n    if (p1->x == n1->x && p1->y == n1->y) {\n      continue;\n    }\n    if (n1->x <= p1->x) {\n      x1 = n1->x;\n      x2 = p1->x;\n    } else {\n      x1 = p1->x;\n      x2 = n1->x;\n    }\n  
  if (n1->y <= p1->y) {\n      y1 = n1->y;\n      y2 = p1->y;\n    } else {\n      y1 = p1->y;\n      y2 = n1->y;\n    }\n\n    if (curr_row > 0) {\n      for (j = curr_row - 1; j > 0; j--) {\n        if (x1 - threshold_x > nodes[header[j]]->x) {\n          j++;\n          break;\n        }\n      }\n    } else {\n      j = 0;\n    }\n    for (; j < num_rows && nodes[header[j]]->x <= x2 + threshold_x; j++) {\n      /* find the closest node on row j */\n      start = header[j];\n      end   = header[j + 1];\n      while (start < end) {\n        mid = (start + end) / 2;\n        if (nodes[mid]->y <= y1) {\n          start = mid + 1;\n        } else {\n          end = mid;\n        }\n      }\n      // assert(start==end);\n      if (start >= header[j + 1]) {\n        continue;\n      }\n      n2 = nodes[start];\n\n      if (batch_mode && n1->id == n2->id)\n        continue;\n\n      if (!n2->parent) {\n        continue;\n      }\n\n      min_dist = n2->y - y2;\n\n      if (abs(min_dist) > threshold_y) {\n        continue;\n      } else if (min_dist < 0) {\n        min_dist = 0;\n        new_y    = n2->y;\n      } else {\n        new_y = y2;\n      }\n\n      if (n2->x < x1) {\n        min_dist += x1 - n2->x;\n        new_x = x1;\n      } else if (n2->x > x2) {\n        min_dist += n2->x - x2;\n        new_x = x2;\n      } else {\n        new_x = n2->x;\n      }\n\n      if (min_dist == 0 || min_dist > max_len) {\n        continue;\n      }\n\n      e = critical_edge(n1, n2, &len, &n2ton1);\n      if (min_dist < len && e != n1) {\n        if (batch_mode) {\n          sinfo.n1       = n1;\n          sinfo.n2       = n2;\n          sinfo.min_dist = min_dist;\n          sinfo.new_x    = new_x;\n          sinfo.new_y    = new_y;\n          sinfo.n2ton1   = n2ton1;\n          dl_append(splice_info, batch_list, sinfo);\n        } else {\n          gain += len - min_dist;\n          cut_and_splice(n1, n2, new_x, new_y, &x1, &y1, &x2, &y2, e, n2ton1);\n        }\n      }\n    
}\n  }\n\n  dl_forall(splice_info, batch_list, sinfo) {\n    n1       = sinfo.n1;\n    n2       = sinfo.n2;\n    n2ton1   = sinfo.n2ton1;\n    min_dist = sinfo.min_dist;\n\n    e = critical_edge(n1, n2, &len, &n2ton1);\n    if (min_dist < len && e != n1) {\n      gain += len - min_dist;\n      cut_and_splice(n1, n2, sinfo.new_x, sinfo.new_y, &x1, &y1, &x2, &y2, e,\n                     n2ton1);\n    }\n  }\n  dl_endfor;\n\n  dl_free(batch_list);\n\n  free(header);\n\n  return gain;\n}\n\n/* cross exchange branches after merging */\nTree xmergetree(Tree t1, Tree t2, int* order1, int* order2, DTYPE cx,\n                DTYPE cy) {\n  int i, num, cnt, order_by_x = 1;\n  Tree t;\n  TreeNode *tn1, *tn2, *n1, *p1, **nodes;\n  dl_t list_of_nodes = dl_alloc();\n  DTYPE threshold_x, threshold_y;\n  DTYPE min_x, max_x, max_len, len, gain;\n\n  if (t1.deg <= 0) {\n    for (i = 0; i < t2.deg; i++) {\n      order1[i] = order2[i];\n    }\n    return t2;\n  } else if (t2.deg <= 0) {\n    return t1;\n  }\n\n  redirect(t1, cx, cy);\n  redirect(t2, cx, cy);\n\n  curr_mark = 0;\n  tn1       = createRootedTree(t1, order1, 1, list_of_nodes);\n  tn2       = createRootedTree(t2, order2, 2, list_of_nodes);\n\n  num   = dl_length(list_of_nodes);\n  nodes = (TreeNode**)malloc(sizeof(TreeNode*) * num);\n  i     = 0;\n  dl_forall(TreeNode*, list_of_nodes, n1) { nodes[i++] = n1; }\n  dl_endfor;\n  dl_clear(list_of_nodes);\n\n  qsort(nodes, num, sizeof(TreeNode*), cmpNodeByYX);\n\n  max_len = 0;\n  min_x = max_x = nodes[0]->x;\n  for (i = 0; i < num; i++) {\n    n1 = nodes[i];\n    p1 = n1->parent;\n    if (p1) {\n      len = ADIFF(n1->x, p1->x) + ADIFF(n1->y, p1->y);\n      if (len > max_len) {\n        max_len = len;\n      }\n    }\n    if (n1->x < min_x) {\n      min_x = n1->x;\n    } else if (n1->x > max_x) {\n      max_x = n1->x;\n    }\n  }\n\n  threshold_x = (max_x - min_x) / 4;\n  threshold_y = (nodes[num - 1]->y - nodes[0]->y) / 4;\n\n  threshold_x = min(threshold_x, max_len);\n  
threshold_y = min(threshold_y, max_len);\n\n  for (cnt = (t1.deg + t2.deg) / 2; cnt > 0; cnt--) {\n    gain = (order_by_x) ? exchange_branches_order_x(num, nodes, threshold_x,\n                                                    threshold_y, max_len)\n                        : exchange_branches_order_y(num, nodes, threshold_x,\n                                                    threshold_y, max_len);\n\n    // assert(gain>=0);\n\n    if (gain <= 0 && !order_by_x) {\n      break;\n    }\n    if (cnt > 1) {\n      collect_nodes(tn1, list_of_nodes);\n      num = dl_length(list_of_nodes);\n      if (num <= 1) {\n        break;\n      }\n\n      collect_nodes(tn2, list_of_nodes);\n      if (dl_length(list_of_nodes) - num <= 1) {\n        break;\n      }\n\n      free(nodes);\n      num   = dl_length(list_of_nodes);\n      nodes = (TreeNode**)malloc(sizeof(TreeNode*) * num);\n      i     = 0;\n      dl_forall(TreeNode*, list_of_nodes, n1) { nodes[i++] = n1; }\n      dl_endfor;\n      dl_clear(list_of_nodes);\n\n      if (order_by_x) {\n        order_by_x = 0;\n        qsort(nodes, num, sizeof(TreeNode*), cmpNodeByXY);\n      } else {\n        order_by_x = 1;\n        qsort(nodes, num, sizeof(TreeNode*), cmpNodeByYX);\n      }\n    }\n  }\n\n  dl_free(list_of_nodes);\n  free(nodes);\n\n  t = mergeRootedTrees(tn1, tn2, order1);\n\n  free(t1.branch);\n  free(t2.branch);\n\n  return t;\n}\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/global.h",
    "content": "#ifndef _GLOBAL_H_\n#define _GLOBAL_H_\n\n#include <stdio.h>\n\n#define TRUE 1\n#define FALSE 0\n#define MAXLONG 0x7fffffffL\n\nstruct point {\n  long x, y;\n};\n\ntypedef struct point Point;\n\ntypedef long nn_array[8];\n\n#endif /* _GLOBAL_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/heap.c",
    "content": "/****************************************************************************/\n/*\n  Binary heap routines for use in Prim's algorithm, \n  with points are numbered from 0 to n-1\n*/\n\n#include <stdlib.h>\n#include \"heap.h\"\n#include \"err.h\"\n\n\nHeap*   _heap = (Heap*)NULL;\nlong    _max_heap_size = 0;\nlong    _heap_size = 0;\n\n/****************************************************************************/\n/*\n*/\n\nvoid  allocate_heap( long n )\n{\n  if( _max_heap_size < n ) \n  {\n    _heap = (Heap*)realloc( (void*)_heap, (size_t)(n+1)*sizeof(Heap) ); \n    if( ! _heap )\n    {\n      err_exit( \"Cannot reallocate memory in allocate_heap!\" );\n    } \n    _max_heap_size = n;\n  }\n}\n/****************************************************************************/\n/*\n*/\n\nvoid  deallocate_heap()\n{\n  _max_heap_size = 0; \n  if( _heap )\n  {\n    free( (void*)_heap );\n    _heap = (Heap*)NULL;\n  }\n}\n\n/****************************************************************************/\n\nvoid  heap_init( long  n )\n{\n  register long  p;\n\n  allocate_heap( n );\n  _heap_size = 0;\n  for( p = 0;  p < n;  p++ )\n  { \n    heap_idx( p ) = 0;\n  }\n \n} /* END heap_init() */\n\n/****************************************************************************/\n\nvoid  heap_insert( \n  long   p, \n  long   key \n)\n{\n  register long  k;       /* hole in the heap     */   \n  register long  j;       /* parent of the hole   */\n  register long  q;       /* heap_elt(j)          */\n\n  heap_key( p ) = key;\n\n  if( _heap_size == 0 )\n  {\n    _heap_size = 1;\n    heap_elt( 1 ) = p;\n    heap_idx( p ) = 1;          \n    return;\n  }\n\n  k = ++ _heap_size;\n  j = k >> 1;            /* k/2 */\n\n  while( (j > 0) && (heap_key(q=heap_elt(j)) > key) ) { \n\n    heap_elt( k ) = q;\n    heap_idx( q ) = k;\n    k = j;\n    j = k>>1;    /* k/2 */\n\n  }\n \n  /* store p in the position of the hole */\n  heap_elt( k ) = p;\n  heap_idx( p ) = k;      \n\n} /* 
END heap_insert() */\n\n\n/****************************************************************************/\n\nvoid  heap_decrease_key\n( \n  long   p, \n  long   new_key \n)\n{\n  register long    k;       /* hole in the heap     */   \n  register long    j;       /* parent of the hole   */\n  register long    q;       /* heap_elt(j)          */\n\n  heap_key( p ) = new_key;\n  k = heap_idx( p ); \n  j = k >> 1;            /* k/2 */\n\n  if( (j > 0) && (heap_key(q=heap_elt(j)) > new_key) ) { /* change is needed */\n    do {\n\n      heap_elt( k ) = q;\n      heap_idx( q ) = k;\n      k = j;\n      j = k>>1;    /* k/2 */\n\n    } while( (j > 0) && (heap_key(q=heap_elt(j)) > new_key) );\n\n    /* store p in the position of the hole */\n    heap_elt( k ) = p;\n    heap_idx( p ) = k;      \n  }\n\n} /* END heap_decrease_key() */\n\n\n/****************************************************************************/\n\nlong  heap_delete_min()\n{\n  long    min, last;  \n  register long  k;         /* hole in the heap     */   \n  register long  j;         /* child of the hole    */\n  register long  l_key;     /* key of last point    */\n\n  if( _heap_size == 0 )            /* heap is empty */\n    return( -1 );\n\n  min  = heap_elt( 1 );\n  last = heap_elt( _heap_size -- );\n  l_key = heap_key( last );\n\n  k = 1;  j = 2;\n  while( j <= _heap_size ) {\n\n    if( heap_key(heap_elt(j)) > heap_key(heap_elt(j+1)) ) \n      j++;\n\n    if( heap_key(heap_elt(j)) >= l_key)  \n      break;                     /* found a position to insert 'last' */\n\n    /* else, sift hole down */ \n    heap_elt(k) = heap_elt(j);    /* Note that j <= _heap_size */\n    heap_idx( heap_elt(k) ) = k;\n    k = j;\n    j = k << 1;\n  }\n\n  heap_elt( k ) = last;\n  heap_idx( last ) = k;\n\n  heap_idx( min ) = -1;   /* mark the point visited */\n  return( min );\n\n} /* END heap_delete_min() */\n\n\n/****************************************************************************/\n\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/heap.h",
    "content": "#ifndef _HEAP_H_\n#define _HEAP_H_\n\n#include \"global.h\"\n\nstruct heap_info {\n  long key;\n  long idx;\n  long elt;\n};\n\ntypedef struct heap_info Heap;\n\nextern Heap* _heap;\n\n#define heap_key(p) (_heap[p].key)\n#define heap_idx(p) (_heap[p].idx)\n#define heap_elt(k) (_heap[k].elt)\n\n#define in_heap(p) (heap_idx(p) > 0)\n#define never_seen(p) (heap_idx(p) == 0)\n\nvoid allocate_heap(long n);\nvoid deallocate_heap();\nvoid heap_init(long n);\nvoid heap_insert(long p, long key);\nvoid heap_decrease_key(long p, long new_key);\nlong heap_delete_min();\n\n#endif /* _HEAP_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/main.cpp",
    "content": "#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <math.h>\n#include <time.h>\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/PriorityQueue.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/runtime/Profile.h\"\n\n#include \"galois/LargeArray.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n#include \"BoilerPlate.h\"\n#include \"Lonestar/BFS_SSSP.h\"\n\n#include \"DataType.h\"\n#include \"flute.h\"\n#include \"DataProc.h\"\n#include \"RSMT.h\"\n#include \"maze.h\"\n#include \"RipUp.h\"\n#include \"utility.h\"\n#include \"route.h\"\n#include \"maze3D.h\"\n#include \"maze_finegrain.h\"\n#include \"maze_finegrain_lateupdate.h\"\n#include \"maze_lock.h\"\n\nstatic const char* name = \"SPRoute\";\n\nstatic const char* desc =\n    \"A Scalable Parallel global router with a hybrid parallel algorithm which \"\n    \"combines net-level parallelism and fine-grain parallelism\";\n\nstatic const char* url = \"SPRoute\";\n\nnamespace cll = llvm::cl;\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\n\nstatic cll::opt<std::string> outfile(\"o\", cll::desc(\"output file (optional)\"),\n                                     cll::init(\"\"));\n\nstatic cll::opt<std::string>\n    fluteDir(\"flute\",\n             cll::desc(\"directory of POWV9.dat and POST9.dat (REQUIRED)\"),\n             cll::Required);\n\n//! Flag that forces user to be aware that they should be passing in a\n//! 
ISPD2008 graph.\nstatic cll::opt<bool> ISPD2008Graph(\n    \"ISPD2008Graph\",\n    cll::desc(\"Specify that the input graph is a ISPD2008 graph format\"),\n    cll::init(false));\n\nint main(int argc, char** argv) {\n  //    char benchFile[FILESTRLEN];\n  clock_t t1, t2, t3;\n  float gen_brk_Time, reading_Time;\n  int enlarge, ripup_threshold;\n  int i;\n  int ESTEP1, CSTEP1, thStep1;\n  int ESTEP2, CSTEP2, thStep2;\n  int ESTEP3, CSTEP3, tUsage;\n  int Ripvalue, LVIter, cost_step;\n  int maxOverflow, past_cong, last_cong, finallength, numVia, ripupTH3D, newTH,\n      healingTrigger;\n  int minofl, minoflrnd = 0, mazeRound, upType, cost_type, bmfl, bwcnt;\n  bool goingLV, noADJ, needOUTPUT;\n\n  needOUTPUT = false;\n\n  /*string outFile;\n  for(int i = 1; i < argc; i++) {\n      string tmp(argv[i]);\n      if(tmp == \"-t\")\n          numThreads = atoi(argv[i+1]);\n      else if(tmp == \"-o\") {\n          outFile = string(argv[i+1]);\n          needOUTPUT = true;\n      }\n      else if(tmp == \"-h\" || tmp == \"--help\") {\n          printf(\"Usage: ./SPRoute  <input> -o <output> -t <nthreads> \\n\");\n          exit(1);\n      }\n  }*/\n\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::preAlloc(numThreads * 2);\n\n  if (!ISPD2008Graph) {\n    GALOIS_DIE(\"This application requires a ISPD2008 graph input;\"\n               \" please use the -ISPD2008Graph flag \"\n               \" to indicate the input is a ISPD2008 graph format.\");\n  }\n\n  if (outfile != \"\") {\n    needOUTPUT = true;\n  }\n  LB = 0.9;\n  UB = 1.3;\n\n  SLOPE     = 5;\n  THRESH_M  = 20;\n  ENLARGE   = 15; // 5\n  ESTEP1    = 10; // 10\n  ESTEP2    = 5;  // 5\n  ESTEP3    = 5;  // 5\n  CSTEP1    = 2;  // 5\n  CSTEP2    = 2;  // 3\n  CSTEP3    = 5;  // 15\n  COSHEIGHT = 4;\n  L         = 0;\n  VIA       = 2;\n  Ripvalue  = -1;\n  ripupTH3D = 10;\n  goingLV   = TRUE;\n  noADJ     = FALSE;\n  thStep1   = 10;\n  thStep2   = 4;\n  LVIter    = 3;\n 
 mazeRound = 500;\n  bmfl      = BIG_INT;\n  minofl    = BIG_INT;\n\n  // galois::substrate::PerThreadStorage<THREAD_LOCAL_STORAGE>\n  // thread_local_storage; galois::setActiveThreads(numThreads);\n  /* galois::on_each(\n           [&] (const unsigned tid, const unsigned numT)\n           {\n               printf(\"threadid: %d %d\\n\", tid, numT);\n           }\n           );\n*/\n  cout << \" nthreads: \" << numThreads << endl;\n\n  int finegrain     = false;\n  int thread_choice = 0;\n  // int thread_steps[6] = {28,14,8,4,1};\n  // int thread_livelock_limit[6] = {1,1,1,1,1};\n  bool extrarun       = false;\n  int thread_livelock = 0;\n\n  if (1) {\n    t1 = clock();\n    printf(\"\\nReading %s ...\\n\", inputFile.c_str());\n    readFile(inputFile.c_str());\n    printf(\"\\nReading Lookup Table ...\\n\");\n    readLUT(fluteDir.c_str());\n    printf(\"\\nDone reading table\\n\\n\");\n\n    t2           = clock();\n    reading_Time = (float)(t2 - t1) / CLOCKS_PER_SEC;\n    printf(\"Reading Time: %f sec\\n\", reading_Time);\n\n    // call FLUTE to generate RSMT and break the nets into segments (2-pin nets)\n\n    VIA = 2;\n    // viacost = VIA;\n    viacost = 0;\n    gen_brk_RSMT(FALSE, FALSE, FALSE, FALSE, noADJ);\n    printf(\"first L\\n\");\n    routeLAll(TRUE);\n    gen_brk_RSMT(TRUE, TRUE, TRUE, FALSE, noADJ);\n    getOverflow2D(&maxOverflow);\n    printf(\"second L\\n\");\n    newrouteLAll(FALSE, TRUE);\n    getOverflow2D(&maxOverflow);\n    spiralRouteAll();\n    newrouteZAll(10);\n    printf(\"first Z\\n\");\n    past_cong = getOverflow2D(&maxOverflow);\n\n    convertToMazeroute();\n\n    enlarge        = 10;\n    newTH          = 10;\n    healingTrigger = 0;\n    stopDEC        = 0;\n    upType         = 1;\n\n    // iniBDE();\n\n    costheight = COSHEIGHT;\n\n    if (maxOverflow > 700) {\n      costheight = 8;\n      LOGIS_COF  = 1.33;\n      VIA        = 0;\n      THRESH_M   = 0;\n      CSTEP1     = 30;\n      slope      = BIG_INT;\n    }\n\n    for (i = 
0; i < LVIter; i++) {\n\n      LOGIS_COF = max(2.0 / (1 + log(maxOverflow)), LOGIS_COF);\n      LOGIS_COF = 2.0 / (1 + log(maxOverflow));\n      printf(\"LV routing round %d, enlarge %d \\n\", i, enlarge);\n      routeLVAll(newTH, enlarge);\n\n      past_cong = getOverflow2Dmaze(&maxOverflow, &tUsage);\n\n      enlarge += 5;\n      newTH -= 5;\n      if (newTH < 1) {\n        newTH = 1;\n      }\n    }\n\n    //\tpast_cong = getOverflow2Dmaze( &maxOverflow);\n\n    t3           = clock();\n    reading_Time = (float)(t3 - t2) / CLOCKS_PER_SEC;\n    printf(\"LV Time: %f sec\\n\", reading_Time);\n    InitEstUsage();\n\n    i               = 1;\n    costheight      = COSHEIGHT;\n    enlarge         = ENLARGE;\n    ripup_threshold = Ripvalue;\n\n    minofl  = totalOverflow;\n    stopDEC = FALSE;\n\n    slope     = 20;\n    L         = 1;\n    cost_type = 1;\n\n    InitLastUsage(upType);\n\n    // OrderNetEdge* netEO = (OrderNetEdge*)calloc(2000, sizeof(OrderNetEdge));\n\n    PRINT_HEAT = 0;\n    // checkUsageCorrectness();\n    galois::StatTimer roundtimer(\"round\");\n    unsigned long oldtime = 0;\n    round_avg_dist        = 0;\n    round_avg_length      = 0;\n    while (totalOverflow > 0) {\n\n      if (THRESH_M > 15) {\n        THRESH_M -= thStep1;\n      } else if (THRESH_M >= 2) {\n        THRESH_M -= thStep2;\n      } else {\n        THRESH_M = 0;\n      }\n      if (THRESH_M <= 0) {\n        THRESH_M = 0;\n      }\n      // std::cout << \"totalOverflow : \" << totalOverflow << \" enlarge: \" <<\n      // enlarge << std::endl;\n      if (totalOverflow > 2000) {\n        enlarge += ESTEP1; // ENLARGE+(i-1)*ESTEP;\n        cost_step = CSTEP1;\n        updateCongestionHistory(upType);\n\n      } else if (totalOverflow < 500) {\n\n        cost_step = CSTEP3;\n        enlarge += ESTEP3;\n        ripup_threshold = -1;\n        updateCongestionHistory(upType);\n      } else {\n        cost_step = CSTEP2;\n        enlarge += ESTEP2;\n        
updateCongestionHistory(upType);\n      }\n\n      if (totalOverflow > 15000 && maxOverflow > 400) {\n        enlarge = max(xGrid, yGrid) /\n                  30; // This is the key!!!! to enlarge routing area!!!!\n        // enlarge = max(xGrid,yGrid) / 10;\n        slope = BIG_INT;\n        // slope = 20;\n        if (i == 5) {\n          VIA             = 0;\n          LOGIS_COF       = 1.33;\n          ripup_threshold = -1;\n          //\tcost_type = 3;\n\n        } else if (i > 6) {\n          if (i % 2 == 0) {\n            LOGIS_COF += 0.5;\n          }\n          if (i > 20) {\n            break;\n          }\n        }\n        if (i > 10) {\n          cost_type       = 1;\n          ripup_threshold = 0;\n        }\n      }\n\n      int maxGrid = max(xGrid + 1, yGrid + 1);\n      enlarge     = min(enlarge, maxGrid / 2);\n      // std::cout << \"costheight : \" << costheight << \" enlarge: \" << enlarge\n      // << std::endl;\n      costheight += cost_step;\n      // std::cout << \"costheight : \" << costheight << \" enlarge: \" << enlarge\n      // << std::endl;\n      mazeedge_Threshold = THRESH_M;\n\n      if (upType == 3) {\n        LOGIS_COF = max(2.0 / (1 + log(maxOverflow + max_adj)), LOGIS_COF);\n      } else {\n        LOGIS_COF = max(2.0 / (1 + log(maxOverflow)), LOGIS_COF);\n      }\n\n      if (i == 8) {\n        L      = 0;\n        upType = 2;\n        InitLastUsage(upType);\n      }\n\n      if (maxOverflow == 1) {\n        // L = 0;\n        ripup_threshold = -1;\n        slope           = 5;\n      }\n\n      if (maxOverflow > 300 && past_cong > 15000) {\n        L = 0;\n      }\n      // checkUsageCorrectness();\n\n      // getOverflow2Dmaze(&maxOverflow , & tUsage);\n\n      printf(\"iteration %d, enlarge %d, costheight %d, threshold %d via cost \"\n             \"%d \\nlog_coef %f, healingTrigger %d cost_step %d slope %d L %f \"\n             \"cost_type %d OBIM delta %d\\n\",\n             i, enlarge, costheight, mazeedge_Threshold, 
VIA, LOGIS_COF,\n             healingTrigger, cost_step, slope, L, cost_type,\n             max(OBIM_delta, (int)(costheight / (2 * slope))));\n      // L = 2;\n      roundtimer.start();\n      round_num = i;\n      if (finegrain) {\n        printf(\"finegrain\\n\");\n\n        mazeRouteMSMD_finegrain_spinlock(i, enlarge, costheight,\n                                         ripup_threshold, mazeedge_Threshold,\n                                         !(i % 3), cost_type);\n      } else {\n        mazeRouteMSMD(i, enlarge, costheight, ripup_threshold,\n                      mazeedge_Threshold, !(i % 3), cost_type);\n      }\n      roundtimer.stop();\n      cout << \"round : \" << i << \" time(ms): \" << roundtimer.get() - oldtime\n           << \" acc time(ms): \" << roundtimer.get() << endl;\n      oldtime = roundtimer.get();\n\n      last_cong        = past_cong;\n      past_cong        = getOverflow2Dmaze(&maxOverflow, &tUsage);\n      int nthreads_tmp = numThreads;\n      if (past_cong > last_cong && !extrarun) // Michael\n      {\n        if (!finegrain && nthreads_tmp != 1) {\n          thread_livelock++;\n          if (thread_livelock == 1) {\n            thread_choice++;\n            thread_livelock = 0;\n            if (nthreads_tmp < 6) {\n              galois::setActiveThreads(4);\n              numThreads = 4;\n              finegrain  = true;\n            } else {\n              numThreads = numThreads / 2;\n              galois::setActiveThreads(numThreads);\n            }\n          }\n        }\n      }\n      cout << \"nthreads :\" << numThreads << endl;\n      extrarun = false;\n\n      if (minofl > past_cong) {\n        minofl    = past_cong;\n        minoflrnd = i;\n      }\n\n      if (i == 8) {\n        L = 1;\n      }\n\n      i++;\n\n      if (past_cong < 200 && i > 30 && upType == 2 && max_adj <= 20) {\n        upType  = 4;\n        stopDEC = TRUE;\n      }\n\n      if (maxOverflow < 150) {\n        if (i == 20 && past_cong > 200) {\n      
    printf(\"Extra Run for hard benchmark\\n\");\n          L       = 0;\n          upType  = 3;\n          stopDEC = TRUE;\n          slope   = 5;\n          galois::runtime::profileVtune(\n              [&](void) {\n                if (finegrain) {\n                  printf(\"finegrain\\n\");\n\n                  mazeRouteMSMD_finegrain_spinlock(\n                      i, enlarge, costheight, ripup_threshold,\n                      mazeedge_Threshold, !(i % 3), cost_type);\n                } else {\n                  mazeRouteMSMD(i, enlarge, costheight, ripup_threshold,\n                                mazeedge_Threshold, !(i % 3), cost_type);\n                }\n              },\n              \"mazeroute\");\n          last_cong = past_cong;\n          past_cong = getOverflow2Dmaze(&maxOverflow, &tUsage);\n          extrarun  = true;\n\n          str_accu(12);\n          L       = 1;\n          stopDEC = FALSE;\n          slope   = 3;\n          upType  = 2;\n        }\n        if (i == 35 && tUsage > 800000) {\n          str_accu(25);\n          extrarun = true;\n        }\n        if (i == 50 && tUsage > 800000) {\n          str_accu(40);\n          extrarun = true;\n        }\n      }\n\n      if (i > 50) {\n        upType = 4;\n        if (i > 70) {\n          stopDEC = TRUE;\n        }\n      }\n\n      if (past_cong > 0.7 * last_cong) {\n        costheight += CSTEP3;\n      }\n\n      if (past_cong >= last_cong) {\n        VIA = 0; // is this good?\n        healingTrigger++;\n      }\n\n      if (past_cong < bmfl) {\n        bwcnt = 0;\n        if (i > 140 || (i > 80 && past_cong < 20)) {\n          copyRS();\n          bmfl = past_cong;\n\n          L     = 0;\n          slope = BIG_INT;\n          // SLOPE = BIG_INT;\n          galois::runtime::profileVtune(\n              [&](void) {\n                if (finegrain) {\n                  printf(\"finegrain\\n\");\n\n                  mazeRouteMSMD_finegrain_spinlock(\n                      i, enlarge, 
costheight, ripup_threshold,\n                      mazeedge_Threshold, !(i % 3), cost_type);\n                } else {\n                  mazeRouteMSMD(i, enlarge, costheight, ripup_threshold,\n                                mazeedge_Threshold, !(i % 3), cost_type);\n                }\n              },\n              \"mazeroute\");\n          last_cong = past_cong;\n          past_cong = getOverflow2Dmaze(&maxOverflow, &tUsage);\n          extrarun  = true;\n          if (past_cong < last_cong) {\n            copyRS();\n            bmfl = past_cong;\n          }\n          L     = 1;\n          slope = 5;\n          // SLOPE = 5;\n          if (minofl > past_cong) {\n            minofl    = past_cong;\n            minoflrnd = i;\n          }\n          if (bmfl < 72)\n            break;\n        }\n      } else {\n        bwcnt++;\n      }\n\n      if (bmfl > 10) {\n        if (bmfl > 30 && bmfl < 72 && bwcnt > 50) {\n          break;\n        }\n        if (bmfl < 30 && bwcnt > 50) {\n          break;\n        }\n        if (i >= mazeRound) {\n          getOverflow2Dmaze(&maxOverflow, &tUsage);\n          break;\n        }\n      }\n\n      if (i >= mazeRound) {\n        getOverflow2Dmaze(&maxOverflow, &tUsage);\n        break;\n      }\n    }\n\n    if (minofl > 0) {\n      printf(\"\\n\\n minimal ofl %d, occuring at round %d\\n\\n\", minofl,\n             minoflrnd);\n      copyBR();\n    }\n\n    freeRR();\n\n    checkUsage();\n\n    printf(\"maze routing finished\\n\");\n\n    // t4 = clock();\n    // maze_Time = (float)(t4-t3)/CLOCKS_PER_SEC;\n    // printf(\"P3 runtime: %f sec\\n\", maze_Time);\n\n    printf(\"Final 2D results: \\n\");\n    getOverflow2Dmaze(&maxOverflow, &tUsage);\n\n    printf(\"\\nLayer Assignment Begins\");\n    newLA();\n    printf(\"layer assignment finished\\n\");\n\n    t2           = clock();\n    gen_brk_Time = (float)(t2 - t1) / CLOCKS_PER_SEC;\n    // printf(\"2D + Layer Assignment Runtime: %f sec\\n\", gen_brk_Time);\n\n    
costheight = 3;\n    viacost    = 1;\n\n    if (gen_brk_Time < 60) {\n      ripupTH3D = 15;\n    } else if (gen_brk_Time < 120) {\n      ripupTH3D = 18;\n    } else {\n      ripupTH3D = 20;\n    }\n\n    if (goingLV && past_cong == 0) {\n      printf(\"Post Processing Begins \\n\");\n      mazeRouteMSMDOrder3D(enlarge, 0, ripupTH3D);\n\n      //\tmazeRouteMSMDOrder3D(enlarge, 0, 10 );\n      if (gen_brk_Time > 120) {\n        mazeRouteMSMDOrder3D(enlarge, 0, 12);\n      }\n      printf(\"Post Processing finished, starting via filling\\n\");\n    }\n\n    fillVIA();\n    finallength = getOverflow3D();\n    numVia      = threeDVIA();\n    checkRoute3D();\n    if (needOUTPUT) {\n      writeRoute3D(outfile.c_str());\n    }\n\n  } // Input ==1\n\n  // t4 = clock();\n  // maze_Time = (float)(t4-t1)/CLOCKS_PER_SEC;\n  printf(\"Final routing length : %d\\n\", finallength);\n  printf(\"Final number of via  : %d\\n\", numVia);\n  printf(\"Final total length 1 : %d\\n\\n\", finallength + numVia);\n\n  // printf(\"Final total length 3 : %d\\n\",(finallength+3*numVia));\n  // printf(\"3D runtime: %f sec\\n\", maze_Time);\n\n  // freeAllMemory();\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/maze.h",
    "content": "#ifndef _MAZE_H_\n#define _MAZE_H_\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <math.h>\n#include <functional>\n#include \"bitmap_image.hpp\"\n#include \"DataType.h\"\n#include \"flute.h\"\n#include \"DataProc.h\"\n#include \"route.h\"\n#include \"RipUp.h\"\n\n#include \"galois/Galois.h\"\n#include \"galois/gstl.h\"\n#include \"galois/PerThreadContainer.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/PriorityQueue.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/substrate/NumaMem.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/runtime/Profile.h\"\n\n#include \"galois/LargeArray.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n// using namespace std;\n\n#define PARENT(i) (i - 1) / 2\n//#define PARENT(i) ((i-1)>>1)\n#define LEFT(i) 2 * i + 1\n#define RIGHT(i) 2 * i + 2\n\n#define NET_PARALLEL 0\n\nint PRINT;\nint PRINT_HEAT;\ntypedef struct {\n  bool operator()(float* left, float* right) { return (*left) > (*right); }\n} maze_greater;\n\nclass pq_grid {\npublic:\n  float* d1_p;\n  float d1_push;\n  pq_grid() {\n    d1_p    = NULL;\n    d1_push = 0;\n  };\n  pq_grid(float* d1_p, float d1_push) {\n    this->d1_p    = d1_p;\n    this->d1_push = d1_push;\n  }\n};\n\nclass lateUpdateReq {\npublic:\n  std::atomic<float>* d1_p;\n  float d1_push;\n  short parentX;\n  short parentY;\n  bool HV;\n  lateUpdateReq() {\n    d1_p    = NULL;\n    d1_push = 0;\n    parentX = 0;\n    parentY = 0;\n    HV      = false; // 1 == true, 3 == false\n  };\n  lateUpdateReq(std::atomic<float>* d1_p, float d1_push, short parentX,\n                short parentY, bool HV) {\n    this->d1_p    = d1_p;\n    this->d1_push = d1_push;\n    this->parentX = parentX;\n    this->parentY = parentY;\n    this->HV      = HV;\n  }\n};\n\ntypedef struct {\n  bool operator()(const pq_grid& left, const pq_grid& right) const {\n    return 
left.d1_push < right.d1_push;\n  }\n} pq_less;\n\n/*typedef galois::PerThreadDeque< float* > PerThread_PQ;\ntypedef galois::gstl::Deque< float* > local_pq;*/ //FIFO TRIAL\n\ntypedef galois::PerThreadMinHeap<pq_grid, pq_less> PerThread_PQ;\ntypedef galois::gstl::PQ<pq_grid, pq_less> local_pq;\n\ntypedef galois::PerThreadVector<int> PerThread_Vec;\ntypedef galois::gstl::Vector<int> local_vec;\n\ntypedef struct {\n  int x; // x position\n  int y; // y position\n} Pos;\n\n#define FIFO_CHUNK_SIZE 4\n#define OBIM_delta 20\n\nauto RequestIndexer = [](const pq_grid& top) {\n  return (unsigned int)(top.d1_push) /\n         max(OBIM_delta, (int)(costheight / (2 * slope)));\n};\n\nauto RequestIndexerLate = [](const lateUpdateReq& top) {\n  return (unsigned int)(top.d1_push) / OBIM_delta;\n};\n\n/*auto RequestIndexerConcurrent = [&](const concurrent_pq_grid& top) {\n    return (unsigned int)(top.d1_push) / OBIM_delta;\n};*/\n\nnamespace gwl = galois::worklists;\nusing PSChunk = gwl::PerThreadChunkFIFO<FIFO_CHUNK_SIZE>;\nusing OBIM    = gwl::OrderedByIntegerMetric<decltype(RequestIndexer), PSChunk>;\nusing OBIM_late =\n    gwl::OrderedByIntegerMetric<decltype(RequestIndexerLate), PSChunk>;\n// using OBIM_concurrent =\n// gwl::OrderedByIntegerMetric<decltype(RequestIndexerConcurrent), PSChunk>;\n\nstruct THREAD_LOCAL_STORAGE {\n  using LAptr = galois::substrate::LAptr;\n  LAptr pop_heap2_LA;\n  bool* pop_heap2;\n\n  LAptr d1_p_LA, d1_alloc_LA;\n  float** d1_p;\n  float* d1_alloc;\n\n  LAptr HV_p_LA, HV_alloc_LA, hyperV_p_LA, hyperV_alloc_LA, hyperH_p_LA,\n      hyperH_alloc_LA;\n  bool **HV_p, **hyperV_p, **hyperH_p;\n  bool *HV_alloc, *hyperV_alloc, *hyperH_alloc;\n\n  LAptr parentX1_p_LA, parentX1_alloc_LA, parentY1_p_LA, parentY1_alloc_LA,\n      parentX3_p_LA, parentX3_alloc_LA, parentY3_p_LA, parentY3_alloc_LA;\n  short **parentX1_p, **parentY1_p, **parentX3_p, **parentY3_p;\n  short *parentX1_alloc, *parentY1_alloc, *parentX3_alloc, *parentY3_alloc;\n\n  LAptr 
corrEdge_p_LA, corrEdge_alloc_LA;\n  int** corrEdge_p;\n  int* corrEdge_alloc;\n\n  LAptr inRegion_p_LA, inRegion_alloc_LA;\n  bool** inRegion_p;\n  bool* inRegion_alloc;\n\n  LAptr netEO_p_LA;\n  OrderNetEdge* netEO_p;\n\n  // maze_pq pq1;\n  // std::vector<float*> v2;\n  THREAD_LOCAL_STORAGE() {\n    using namespace galois::substrate;\n\n    if (NET_PARALLEL) {\n      pop_heap2_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      pop_heap2    = reinterpret_cast<bool*>(pop_heap2_LA.get());\n\n      d1_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(float));\n      d1_alloc    = reinterpret_cast<float*>(d1_alloc_LA.get());\n      d1_p_LA     = largeMallocLocal(yGrid * sizeof(float*));\n      d1_p        = reinterpret_cast<float**>(d1_p_LA.get());\n\n      HV_alloc_LA     = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      HV_alloc        = reinterpret_cast<bool*>(HV_alloc_LA.get());\n      hyperV_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      hyperV_alloc    = reinterpret_cast<bool*>(hyperV_alloc_LA.get());\n      hyperH_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      hyperH_alloc    = reinterpret_cast<bool*>(hyperH_alloc_LA.get());\n\n      HV_p_LA     = largeMallocLocal(yGrid * sizeof(bool*));\n      HV_p        = reinterpret_cast<bool**>(HV_p_LA.get());\n      hyperV_p_LA = largeMallocLocal(yGrid * sizeof(bool*));\n      hyperV_p    = reinterpret_cast<bool**>(hyperV_p_LA.get());\n      hyperH_p_LA = largeMallocLocal(yGrid * sizeof(bool*));\n      hyperH_p    = reinterpret_cast<bool**>(hyperH_p_LA.get());\n\n      parentX1_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentX1_alloc    = reinterpret_cast<short*>(parentX1_alloc_LA.get());\n      parentX3_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentX3_alloc    = reinterpret_cast<short*>(parentX3_alloc_LA.get());\n      parentY1_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentY1_alloc    = 
reinterpret_cast<short*>(parentY1_alloc_LA.get());\n      parentY3_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentY3_alloc    = reinterpret_cast<short*>(parentY3_alloc_LA.get());\n\n      parentX1_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentX1_p    = reinterpret_cast<short**>(parentX1_p_LA.get());\n      parentX3_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentX3_p    = reinterpret_cast<short**>(parentX3_p_LA.get());\n      parentY1_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentY1_p    = reinterpret_cast<short**>(parentY1_p_LA.get());\n      parentY3_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentY3_p    = reinterpret_cast<short**>(parentY3_p_LA.get());\n\n      corrEdge_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(int));\n      corrEdge_alloc    = reinterpret_cast<int*>(corrEdge_alloc_LA.get());\n      corrEdge_p_LA     = largeMallocLocal(yGrid * sizeof(int*));\n      corrEdge_p        = reinterpret_cast<int**>(corrEdge_p_LA.get());\n\n      inRegion_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      inRegion_alloc    = reinterpret_cast<bool*>(inRegion_alloc_LA.get());\n      inRegion_p_LA     = largeMallocLocal(yGrid * sizeof(bool*));\n      inRegion_p        = reinterpret_cast<bool**>(inRegion_p_LA.get());\n\n      netEO_p_LA = largeMallocLocal(MAXNETDEG * 2 * sizeof(OrderNetEdge));\n      netEO_p    = reinterpret_cast<OrderNetEdge*>(netEO_p_LA.get());\n    } else {\n      pop_heap2 = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n\n      d1_alloc = (float*)calloc(yGrid * xGrid, sizeof(float));\n      d1_p     = (float**)calloc(yGrid, sizeof(float*));\n\n      HV_alloc     = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      hyperV_alloc = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      hyperH_alloc = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      HV_p         = (bool**)calloc(yGrid, sizeof(bool*));\n      hyperV_p     = (bool**)calloc(yGrid, 
sizeof(bool*));\n      hyperH_p     = (bool**)calloc(yGrid, sizeof(bool*));\n\n      parentX1_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentX3_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentY1_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentY3_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentX1_p     = (short**)calloc(yGrid, sizeof(short*));\n      parentX3_p     = (short**)calloc(yGrid, sizeof(short*));\n      parentY1_p     = (short**)calloc(yGrid, sizeof(short*));\n      parentY3_p     = (short**)calloc(yGrid, sizeof(short*));\n\n      corrEdge_alloc = (int*)calloc(yGrid * xGrid, sizeof(int));\n      corrEdge_p     = (int**)calloc(yGrid, sizeof(int*));\n\n      inRegion_alloc = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      inRegion_p     = (bool**)calloc(yGrid, sizeof(bool*));\n\n      netEO_p = (OrderNetEdge*)calloc(MAXNETDEG * 2, sizeof(OrderNetEdge));\n    }\n    // printf(\"allocation success\\n\");\n    for (int i = 0; i < yGrid; i++) {\n      d1_p[i] = &(d1_alloc[i * xGrid]);\n\n      HV_p[i]     = &(HV_alloc[i * xGrid]);\n      hyperV_p[i] = &(hyperV_alloc[i * xGrid]);\n      hyperH_p[i] = &(hyperH_alloc[i * xGrid]);\n\n      corrEdge_p[i] = &(corrEdge_alloc[i * xGrid]);\n\n      inRegion_p[i] = &(inRegion_alloc[i * xGrid]);\n    }\n\n    for (int i = 0; i < yGrid; i++) {\n      parentX1_p[i] = &(parentX1_alloc[i * xGrid]);\n      parentX3_p[i] = &(parentX3_alloc[i * xGrid]);\n      parentY1_p[i] = &(parentY1_alloc[i * xGrid]);\n      parentY3_p[i] = &(parentY3_alloc[i * xGrid]);\n    }\n  }\n  void reset_heap() { memset(pop_heap2, 0, yGrid * xGrid * sizeof(bool)); }\n\n  ~THREAD_LOCAL_STORAGE() {\n    free(pop_heap2);\n\n    free(d1_p);\n    free(d1_alloc);\n\n    free(HV_p);\n    free(hyperV_p);\n    free(hyperH_p);\n    free(HV_alloc);\n    free(hyperV_alloc);\n    free(hyperH_alloc);\n\n    free(parentX1_p);\n    free(parentY1_p);\n    free(parentX3_p);\n    
free(parentY3_p);\n\n    free(parentX1_alloc);\n    free(parentY1_alloc);\n    free(parentX3_alloc);\n    free(parentY3_alloc);\n\n    free(corrEdge_alloc);\n    free(corrEdge_p);\n\n    free(inRegion_alloc);\n    free(inRegion_p);\n\n    free(netEO_p);\n  }\n};\n\nvoid convertToMazerouteNet(int netID) {\n  short *gridsX, *gridsY;\n  int i, edgeID, edgelength;\n  int n1, n2, x1, y1, x2, y2;\n  int cnt, Zpoint;\n  TreeEdge* treeedge;\n  TreeNode* treenodes;\n\n  treenodes = sttrees[netID].nodes;\n  for (edgeID = 0; edgeID < 2 * sttrees[netID].deg - 3; edgeID++) {\n    treeedge               = &(sttrees[netID].edges[edgeID]);\n    edgelength             = treeedge->len;\n    n1                     = treeedge->n1;\n    n2                     = treeedge->n2;\n    x1                     = treenodes[n1].x;\n    y1                     = treenodes[n1].y;\n    x2                     = treenodes[n2].x;\n    y2                     = treenodes[n2].y;\n    treeedge->route.gridsX = (short*)calloc((edgelength + 1), sizeof(short));\n    treeedge->route.gridsY = (short*)calloc((edgelength + 1), sizeof(short));\n    gridsX                 = treeedge->route.gridsX;\n    gridsY                 = treeedge->route.gridsY;\n    treeedge->len          = ADIFF(x1, x2) + ADIFF(y1, y2);\n\n    cnt = 0;\n    if (treeedge->route.type == NOROUTE) {\n      gridsX[0]                = x1;\n      gridsY[0]                = y1;\n      treeedge->route.type     = MAZEROUTE;\n      treeedge->route.routelen = 0;\n      treeedge->len            = 0;\n      cnt++;\n    } else if (treeedge->route.type == LROUTE) {\n      if (treeedge->route.xFirst) // horizontal first\n      {\n        for (i = x1; i <= x2; i++) {\n          gridsX[cnt] = i;\n          gridsY[cnt] = y1;\n          cnt++;\n        }\n        if (y1 <= y2) {\n          for (i = y1 + 1; i <= y2; i++) {\n            gridsX[cnt] = x2;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n        } else {\n          for (i = y1 - 1; i 
>= y2; i--) {\n            gridsX[cnt] = x2;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n        }\n      } else // vertical first\n      {\n        if (y1 <= y2) {\n          for (i = y1; i <= y2; i++) {\n            gridsX[cnt] = x1;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n        } else {\n          for (i = y1; i >= y2; i--) {\n            gridsX[cnt] = x1;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n        }\n        for (i = x1 + 1; i <= x2; i++) {\n          gridsX[cnt] = i;\n          gridsY[cnt] = y2;\n          cnt++;\n        }\n      }\n    } else if (treeedge->route.type == ZROUTE) {\n      Zpoint = treeedge->route.Zpoint;\n      if (treeedge->route.HVH) // HVH\n      {\n        for (i = x1; i < Zpoint; i++) {\n          gridsX[cnt] = i;\n          gridsY[cnt] = y1;\n          cnt++;\n        }\n        if (y1 <= y2) {\n          for (i = y1; i <= y2; i++) {\n            gridsX[cnt] = Zpoint;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n        } else {\n          for (i = y1; i >= y2; i--) {\n            gridsX[cnt] = Zpoint;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n        }\n        for (i = Zpoint + 1; i <= x2; i++) {\n          gridsX[cnt] = i;\n          gridsY[cnt] = y2;\n          cnt++;\n        }\n      } else // VHV\n      {\n        if (y1 <= y2) {\n          for (i = y1; i < Zpoint; i++) {\n            gridsX[cnt] = x1;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n          for (i = x1; i <= x2; i++) {\n            gridsX[cnt] = i;\n            gridsY[cnt] = Zpoint;\n            cnt++;\n          }\n          for (i = Zpoint + 1; i <= y2; i++) {\n            gridsX[cnt] = x2;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n        } else {\n          for (i = y1; i > Zpoint; i--) {\n            gridsX[cnt] = x1;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n          for (i = x1; i 
<= x2; i++) {\n            gridsX[cnt] = i;\n            gridsY[cnt] = Zpoint;\n            cnt++;\n          }\n          for (i = Zpoint - 1; i >= y2; i--) {\n            gridsX[cnt] = x2;\n            gridsY[cnt] = i;\n            cnt++;\n          }\n        }\n      }\n    }\n\n    treeedge->route.type     = MAZEROUTE;\n    treeedge->route.routelen = edgelength;\n\n  } // loop for all the edges\n}\n\nvoid convertToMazeroute() {\n  int i, j, netID;\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    convertToMazerouteNet(netID);\n  }\n\n  for (i = 0; i < yGrid; i++) {\n    for (j = 0; j < xGrid - 1; j++) {\n      int grid            = i * (xGrid - 1) + j;\n      h_edges[grid].usage = h_edges[grid].est_usage;\n    }\n  }\n  //    fprintf(fpv, \"\\nVertical Congestion\\n\");\n  for (i = 0; i < yGrid - 1; i++) {\n    for (j = 0; j < xGrid; j++) {\n      int grid            = i * xGrid + j;\n      v_edges[grid].usage = v_edges[grid].est_usage;\n    }\n  }\n}\n\n// non recursive version of heapify\nvoid heapify(float** array, int heapSize, int i) {\n  int l, r, smallest;\n  float* tmp;\n  Bool STOP = FALSE;\n\n  tmp = array[i];\n  do {\n\n    l = LEFT(i);\n    r = RIGHT(i);\n\n    if (l < heapSize && *(array[l]) < *tmp) {\n      smallest = l;\n      if (r < heapSize && *(array[r]) < *(array[l]))\n        smallest = r;\n    } else {\n      smallest = i;\n      if (r < heapSize && *(array[r]) < *tmp)\n        smallest = r;\n    }\n    if (smallest != i) {\n      array[i] = array[smallest];\n      i        = smallest;\n    } else {\n      array[i] = tmp;\n      STOP     = TRUE;\n    }\n  } while (!STOP);\n}\n\n// build heap for an list of grid\n/*void buildHeap(float **array, int arrayLen)\n{\n    int i;\n\n    for (i=arrayLen/2-1; i>=0; i--)\n        heapify(array, arrayLen, i);\n}*/\n\nvoid updateHeap(float** array, int i) {\n  int parent;\n  float* tmpi;\n\n  tmpi = array[i];\n  while (i > 0 && *(array[PARENT(i)]) > *tmpi) {\n    parent   = PARENT(i);\n    
array[i] = array[parent];\n    i        = parent;\n  }\n  array[i] = tmpi;\n}\n\n// extract the entry with minimum distance from Priority queue\nvoid extractMin(float** array, int arrayLen) {\n\n  //    if(arrayLen<1)\n  //        printf(\"Error: heap underflow\\n\");\n  array[0] = array[arrayLen - 1];\n  heapify(array, arrayLen - 1, 0);\n}\n\n/*\n * num_iteration : the total number of iterations for maze route to run\n * round : the number of maze route stages runned\n */\n\nvoid updateCongestionHistory(int upType) {\n  int i, j, grid, maxlimit;\n  float overflow;\n\n  maxlimit = 0;\n\n  printf(\"updateType %d\\n\", upType);\n\n  if (upType == 1) {\n    for (i = 0; i < yGrid; i++) {\n      for (j = 0; j < xGrid - 1; j++) {\n        grid     = i * (xGrid - 1) + j;\n        overflow = h_edges[grid].usage - h_edges[grid].cap;\n\n        if (overflow > 0) {\n          h_edges[grid].last_usage += overflow;\n          h_edges[grid].congCNT++;\n        } else {\n          if (!stopDEC) {\n            h_edges[grid].last_usage = h_edges[grid].last_usage * 0.9;\n          }\n        }\n        maxlimit = max(maxlimit, h_edges[grid].last_usage);\n      }\n    }\n\n    for (i = 0; i < yGrid - 1; i++) {\n      for (j = 0; j < xGrid; j++) {\n        grid     = i * xGrid + j;\n        overflow = v_edges[grid].usage - v_edges[grid].cap;\n\n        if (overflow > 0) {\n          v_edges[grid].last_usage += overflow;\n          v_edges[grid].congCNT++;\n        } else {\n          if (!stopDEC) {\n            v_edges[grid].last_usage = v_edges[grid].last_usage * 0.9;\n          }\n        }\n        maxlimit = max(maxlimit, v_edges[grid].last_usage);\n      }\n    }\n  } else if (upType == 2) {\n    if (max_adj < ahTH) {\n      stopDEC = TRUE;\n    } else {\n      stopDEC = FALSE;\n    }\n    for (i = 0; i < yGrid; i++) {\n      for (j = 0; j < xGrid - 1; j++) {\n        grid     = i * (xGrid - 1) + j;\n        overflow = h_edges[grid].usage - h_edges[grid].cap;\n\n        if 
(overflow > 0) {\n          h_edges[grid].congCNT++;\n          h_edges[grid].last_usage += overflow;\n        } else {\n          if (!stopDEC) {\n            h_edges[grid].congCNT--;\n            h_edges[grid].congCNT    = max(0, h_edges[grid].congCNT);\n            h_edges[grid].last_usage = h_edges[grid].last_usage * 0.9;\n          }\n        }\n        maxlimit = max(maxlimit, h_edges[grid].last_usage);\n      }\n    }\n\n    for (i = 0; i < yGrid - 1; i++) {\n      for (j = 0; j < xGrid; j++) {\n        grid     = i * xGrid + j;\n        overflow = v_edges[grid].usage - v_edges[grid].cap;\n\n        if (overflow > 0) {\n          v_edges[grid].congCNT++;\n          v_edges[grid].last_usage += overflow;\n        } else {\n          if (!stopDEC) {\n            v_edges[grid].congCNT--;\n            v_edges[grid].congCNT    = max(0, v_edges[grid].congCNT);\n            v_edges[grid].last_usage = v_edges[grid].last_usage * 0.9;\n          }\n        }\n        maxlimit = max(maxlimit, v_edges[grid].last_usage);\n      }\n    }\n\n  } else if (upType == 3) {\n    for (i = 0; i < yGrid; i++) {\n      for (j = 0; j < xGrid - 1; j++) {\n        grid     = i * (xGrid - 1) + j;\n        overflow = h_edges[grid].usage - h_edges[grid].cap;\n\n        if (overflow > 0) {\n          h_edges[grid].congCNT++;\n          h_edges[grid].last_usage += overflow;\n        } else {\n          if (!stopDEC) {\n            h_edges[grid].congCNT--;\n            h_edges[grid].congCNT = max(0, h_edges[grid].congCNT);\n            h_edges[grid].last_usage += overflow;\n            h_edges[grid].last_usage = max(h_edges[grid].last_usage, 0);\n          }\n        }\n        maxlimit = max(maxlimit, h_edges[grid].last_usage);\n      }\n    }\n\n    for (i = 0; i < yGrid - 1; i++) {\n      for (j = 0; j < xGrid; j++) {\n        grid     = i * xGrid + j;\n        overflow = v_edges[grid].usage - v_edges[grid].cap;\n\n        if (overflow > 0) {\n          v_edges[grid].congCNT++;\n          
v_edges[grid].last_usage += overflow;\n        } else {\n          if (!stopDEC) {\n            v_edges[grid].congCNT--;\n            v_edges[grid].last_usage += overflow;\n            v_edges[grid].last_usage = max(v_edges[grid].last_usage, 0);\n          }\n        }\n        maxlimit = max(maxlimit, v_edges[grid].last_usage);\n      }\n    }\n\n  } else if (upType == 4) {\n    for (i = 0; i < yGrid; i++) {\n      for (j = 0; j < xGrid - 1; j++) {\n        grid     = i * (xGrid - 1) + j;\n        overflow = h_edges[grid].usage - h_edges[grid].cap;\n\n        if (overflow > 0) {\n          h_edges[grid].congCNT++;\n          h_edges[grid].last_usage += overflow;\n        } else {\n          if (!stopDEC) {\n            h_edges[grid].congCNT--;\n            h_edges[grid].congCNT    = max(0, h_edges[grid].congCNT);\n            h_edges[grid].last_usage = h_edges[grid].last_usage * 0.9;\n          }\n        }\n        maxlimit = max(maxlimit, h_edges[grid].last_usage);\n      }\n    }\n\n    for (i = 0; i < yGrid - 1; i++) {\n      for (j = 0; j < xGrid; j++) {\n        grid     = i * xGrid + j;\n        overflow = v_edges[grid].usage - v_edges[grid].cap;\n\n        if (overflow > 0) {\n          v_edges[grid].congCNT++;\n          v_edges[grid].last_usage += overflow;\n        } else {\n          if (!stopDEC) {\n            v_edges[grid].congCNT--;\n            v_edges[grid].congCNT    = max(0, v_edges[grid].congCNT);\n            v_edges[grid].last_usage = v_edges[grid].last_usage * 0.9;\n          }\n        }\n        maxlimit = max(maxlimit, v_edges[grid].last_usage);\n      }\n    }\n    //\tif (maxlimit < 20) {\n    //\t\tstopDEC = TRUE;\n    //\t}\n  }\n\n  max_adj = maxlimit;\n\n  printf(\"max value %d stop %d\\n\", maxlimit, stopDEC);\n}\n\n// ripup a tree edge according to its ripup type and Z-route it\n// put all the nodes in the subtree t1 and t2 into heap1 and heap2\n// netID   - the ID for the net\n// edgeID  - the ID for the tree edge to route\n// 
d1      - the distance of any grid from the source subtree t1\n// d2      - the distance of any grid from the destination subtree t2\n// heap1   - the heap storing the addresses for d1[][]\n// heap2   - the heap storing the addresses for d2[][]\nvoid setupHeap(int netID, int edgeID, local_pq& pq1, local_vec& v2,\n               int regionX1, int regionX2, int regionY1, int regionY2,\n               float** d1, int** corrEdge, bool** inRegion) {\n  int i, j, d, numNodes, n1, n2, x1, y1, x2, y2;\n  int nbr, nbrX, nbrY, cur, edge;\n  int x_grid, y_grid;\n  int queuehead, queuetail, *queue;\n  Bool* visited;\n  TreeEdge* treeedges;\n  TreeNode* treenodes;\n  Route* route;\n\n  for (i = regionY1; i <= regionY2; i++) {\n    for (j = regionX1; j <= regionX2; j++)\n      inRegion[i][j] = TRUE;\n  }\n\n  treeedges = sttrees[netID].edges;\n  treenodes = sttrees[netID].nodes;\n  d         = sttrees[netID].deg;\n\n  n1 = treeedges[edgeID].n1;\n  n2 = treeedges[edgeID].n2;\n  x1 = treenodes[n1].x;\n  y1 = treenodes[n1].y;\n  x2 = treenodes[n2].x;\n  y2 = treenodes[n2].y;\n\n  // if(netID == 14628)\n  //    printf(\"net: %d edge: %d src: %d %d dst: %d %d d: %d\\n\", netID, edgeID,\n  //    y1, x1, y2, x2, d);\n  pq1.clear();\n  v2.clear(); // Michael\n  if (d == 2) // 2-pin net\n  {\n    d1[y1][x1] = 0;\n    pq1.push({&(d1[y1][x1]), 0});\n    v2.push_back(y2 * xGrid + x2);\n  } else // net with more than 2 pins\n  {\n    numNodes = 2 * d - 2;\n\n    visited = (Bool*)calloc(numNodes, sizeof(Bool));\n    for (i = 0; i < numNodes; i++)\n      visited[i] = FALSE;\n\n    queue = (int*)calloc(numNodes, sizeof(int));\n\n    // find all the grids on tree edges in subtree t1 (connecting to n1) and put\n    // them into heap1\n    if (n1 < d) // n1 is a Pin node\n    {\n      // just need to put n1 itself into heap1\n      d1[y1][x1] = 0;\n      pq1.push({&(d1[y1][x1]), 0});\n      visited[n1] = TRUE;\n    } else // n1 is a Steiner node\n    {\n      queuehead = queuetail = 0;\n\n      // 
add n1 into heap1\n      d1[y1][x1] = 0;\n      // if(netID == 252163 && edgeID == 51)\n      //    printf(\"y: %d x: %d\\n\", y1, x1);\n      pq1.push({&(d1[y1][x1]), 0});\n      visited[n1] = TRUE;\n\n      // add n1 into the queue\n      queue[queuetail] = n1;\n      queuetail++;\n\n      // loop to find all the edges in subtree t1\n      while (queuetail > queuehead) {\n        // get cur node from the queuehead\n        cur = queue[queuehead];\n        queuehead++;\n        visited[cur] = TRUE;\n        if (cur >= d) // cur node is a Steiner node\n        {\n          for (i = 0; i < 3; i++) {\n            nbr  = treenodes[cur].nbr[i];\n            edge = treenodes[cur].edge[i];\n            if (nbr != n2) // not n2\n            {\n              if (visited[nbr] == FALSE) {\n                // put all the grids on the two adjacent tree edges into heap1\n                if (treeedges[edge].route.routelen > 0) // not a degraded edge\n                {\n                  // put nbr into heap1 if in enlarged region\n                  if (inRegion[treenodes[nbr].y][treenodes[nbr].x]) {\n                    nbrX           = treenodes[nbr].x;\n                    nbrY           = treenodes[nbr].y;\n                    d1[nbrY][nbrX] = 0;\n                    // if(netID == 252163 && edgeID == 51)\n                    //    printf(\"y: %d x: %d\\n\", nbrY, nbrX);\n                    pq1.push({&(d1[nbrY][nbrX]), 0});\n                    corrEdge[nbrY][nbrX] = edge;\n                  }\n\n                  // the coordinates of two end nodes of the edge\n\n                  route = &(treeedges[edge].route);\n                  if (route->type == MAZEROUTE) {\n                    for (j = 1; j < route->routelen;\n                         j++) // don't put edge_n1 and edge_n2 into heap1\n                    {\n                      x_grid = route->gridsX[j];\n                      y_grid = route->gridsY[j];\n\n                      if (inRegion[y_grid][x_grid]) {\n      
                  d1[y_grid][x_grid] = 0;\n                        // if(netID == 252163 && edgeID == 51)\n                        //    printf(\"y: %d x: %d\\n\", y_grid, x_grid);\n                        pq1.push({&(d1[y_grid][x_grid]), 0});\n                        corrEdge[y_grid][x_grid] = edge;\n                      }\n                    }\n                  } // if MAZEROUTE\n                  else {\n                    printf(\"Setup Heap: not maze routing\\n\");\n                  }\n                } // if not a degraded edge (len>0)\n\n                // add the neighbor of cur node into queue\n                queue[queuetail] = nbr;\n                queuetail++;\n              } // if the node is not visited\n            }   // if nbr!=n2\n          }     // loop i (3 neigbors for cur node)\n        }       // if cur node is a Steiner nodes\n      }         // while queue is not empty\n    }           // else n1 is not a Pin node\n\n    // find all the grids on subtree t2 (connect to n2) and put them into heap2\n    // find all the grids on tree edges in subtree t2 (connecting to n2) and put\n    // them into heap2\n    if (n2 < d) // n2 is a Pin node\n    {\n      // just need to put n2 itself into heap2\n      v2.push_back(y2 * xGrid + x2);\n      // if(netID == 14628)\n      //    printf(\"y: %d x: %d \\n\", y2, x2);\n      visited[n2] = TRUE;\n    } else // n2 is a Steiner node\n    {\n      queuehead = queuetail = 0;\n\n      // add n2 into heap2\n      v2.push_back(y2 * xGrid + x2);\n      // if(netID == 252163 && edgeID == 51)\n      //    printf(\"dst y: %d x: %d \\n\", y2, x2);\n      visited[n2] = TRUE;\n\n      // add n2 into the queue\n      queue[queuetail] = n2;\n      queuetail++;\n\n      // loop to find all the edges in subtree t2\n      while (queuetail > queuehead) {\n        // get cur node form queuehead\n        cur          = queue[queuehead];\n        visited[cur] = TRUE;\n        queuehead++;\n\n        if (cur >= d) // cur 
node is a Steiner node\n        {\n          for (i = 0; i < 3; i++) {\n            nbr  = treenodes[cur].nbr[i];\n            edge = treenodes[cur].edge[i];\n            if (nbr != n1) // not n1\n            {\n              if (visited[nbr] == FALSE) {\n                // put all the grids on the two adjacent tree edges into heap2\n                if (treeedges[edge].route.routelen > 0) // not a degraded edge\n                {\n                  // put nbr into heap2\n                  if (inRegion[treenodes[nbr].y][treenodes[nbr].x]) {\n                    nbrX = treenodes[nbr].x;\n                    nbrY = treenodes[nbr].y;\n                    v2.push_back(nbrY * xGrid + nbrX);\n                    // if(netID == 252163 && edgeID == 51)\n                    //    printf(\"dst y: %d x: %d\\n\", nbrY, nbrX);\n                    corrEdge[nbrY][nbrX] = edge;\n                  }\n\n                  // the coordinates of two end nodes of the edge\n\n                  route = &(treeedges[edge].route);\n                  if (route->type == MAZEROUTE) {\n                    for (j = 1; j < route->routelen;\n                         j++) // don't put edge_n1 and edge_n2 into heap2\n                    {\n                      x_grid = route->gridsX[j];\n                      y_grid = route->gridsY[j];\n                      if (inRegion[y_grid][x_grid]) {\n                        v2.push_back(y_grid * xGrid + x_grid);\n                        // if(netID == 252163 && edgeID == 51)\n                        //    printf(\"dst y: %d x: %d\\n\", y_grid, x_grid);\n                        corrEdge[y_grid][x_grid] = edge;\n                      }\n                    }\n                  } // if MAZEROUTE\n                  else {\n                    printf(\"Setup Heap: not maze routing\\n\");\n                  }\n                } // if the edge is not degraded (len>0)\n\n                // add the neighbor of cur node into queue\n                queue[queuetail] = 
nbr;\n                queuetail++;\n              } // if the node is not visited\n            }   // if nbr!=n1\n          }     // loop i (3 neigbors for cur node)\n        }       // if cur node is a Steiner nodes\n      }         // while queue is not empty\n    }           // else n2 is not a Pin node\n\n    free(queue);\n    free(visited);\n  } // net with more than two pins\n\n  for (i = regionY1; i <= regionY2; i++) {\n    for (j = regionX1; j <= regionX2; j++)\n      inRegion[i][j] = FALSE;\n  }\n}\n\nint copyGrids(TreeNode* treenodes, int n1, TreeEdge* treeedges, int edge_n1n2,\n              int* gridsX_n1n2, int* gridsY_n1n2) {\n  int i, cnt;\n  int n1x, n1y;\n\n  n1x = treenodes[n1].x;\n  n1y = treenodes[n1].y;\n\n  cnt = 0;\n  if (treeedges[edge_n1n2].n1 == n1) // n1 is the first node of (n1, n2)\n  {\n    if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n      for (i = 0; i <= treeedges[edge_n1n2].route.routelen; i++) {\n        gridsX_n1n2[cnt] = treeedges[edge_n1n2].route.gridsX[i];\n        gridsY_n1n2[cnt] = treeedges[edge_n1n2].route.gridsY[i];\n        cnt++;\n      }\n    }    // MAZEROUTE\n    else // NOROUTE\n    {\n      gridsX_n1n2[cnt] = n1x;\n      gridsY_n1n2[cnt] = n1y;\n      cnt++;\n    }\n  }    // if n1 is the first node of (n1, n2)\n  else // n2 is the first node of (n1, n2)\n  {\n    if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n      for (i = treeedges[edge_n1n2].route.routelen; i >= 0; i--) {\n        gridsX_n1n2[cnt] = treeedges[edge_n1n2].route.gridsX[i];\n        gridsY_n1n2[cnt] = treeedges[edge_n1n2].route.gridsY[i];\n        cnt++;\n      }\n    }    // MAZEROUTE\n    else // NOROUTE\n    {\n      gridsX_n1n2[cnt] = n1x;\n      gridsY_n1n2[cnt] = n1y;\n      cnt++;\n    } // MAZEROUTE\n  }\n\n  return (cnt);\n}\n\nvoid updateRouteType1(TreeNode* treenodes, int n1, int A1, int A2, int E1x,\n                      int E1y, TreeEdge* treeedges, int edge_n1A1,\n                      int edge_n1A2) {\n  using 
namespace galois::substrate;\n  int i, cnt, A1x, A1y, A2x, A2y;\n  int cnt_n1A1, cnt_n1A2, E1_pos = 0;\n  // int gridsXY[4 * (xGrid + yGrid)];\n\n  int gridsX_n1A1[2 * (xGrid + yGrid)], gridsY_n1A1[2 * (xGrid + yGrid)],\n      gridsX_n1A2[2 * (xGrid + yGrid)], gridsY_n1A2[2 * (xGrid + yGrid)];\n\n  /*int* gridsX_n1A1 = gridsXY;\n  int* gridsY_n1A1 = gridsXY + xGrid + yGrid;\n  int* gridsX_n1A2 = gridsXY + 2 * (xGrid + yGrid);\n  int* gridsY_n1A2 = gridsXY + 3 * (xGrid + yGrid);*/\n\n  /*LAptr gridsX_n1A1_LA, gridsY_n1A1_LA, gridsX_n1A2_LA, gridsY_n1A2_LA;\n  gridsX_n1A1_LA = largeMallocLocal((xGrid + yGrid) * sizeof(int));\n  gridsY_n1A1_LA = largeMallocLocal((xGrid + yGrid) * sizeof(int));\n  gridsX_n1A2_LA = largeMallocLocal((xGrid + yGrid) * sizeof(int));\n  gridsY_n1A2_LA = largeMallocLocal((xGrid + yGrid) * sizeof(int));\n\n  int* gridsX_n1A1 = reinterpret_cast<int*> (gridsX_n1A1_LA.get());\n  int* gridsY_n1A1 = reinterpret_cast<int*> (gridsY_n1A1_LA.get());\n  int* gridsX_n1A2 = reinterpret_cast<int*> (gridsX_n1A2_LA.get());\n  int* gridsY_n1A2 = reinterpret_cast<int*> (gridsY_n1A2_LA.get());*/\n\n  A1x = treenodes[A1].x;\n  A1y = treenodes[A1].y;\n  A2x = treenodes[A2].x;\n  A2y = treenodes[A2].y;\n\n  // copy all the grids on (n1, A1) and (n2, A2) to tmp arrays, and keep the\n  // grids order A1->n1->A2 copy (n1, A1)\n  cnt_n1A1 =\n      copyGrids(treenodes, A1, treeedges, edge_n1A1, gridsX_n1A1, gridsY_n1A1);\n\n  // copy (n1, A2)\n  cnt_n1A2 =\n      copyGrids(treenodes, n1, treeedges, edge_n1A2, gridsX_n1A2, gridsY_n1A2);\n\n  // update route for (n1, A1) and (n1, A2)\n  // find the index of E1 in (n1, A1)\n  for (i = 0; i < cnt_n1A1; i++) {\n    if (gridsX_n1A1[i] == E1x && gridsY_n1A1[i] == E1y) // reach the E1\n    {\n      E1_pos = i;\n      break;\n    }\n  }\n\n  // reallocate memory for route.gridsX and route.gridsY\n  if (treeedges[edge_n1A1].route.type ==\n      MAZEROUTE) // if originally allocated, free them first\n  {\n    
free(treeedges[edge_n1A1].route.gridsX);\n    free(treeedges[edge_n1A1].route.gridsY);\n  }\n  treeedges[edge_n1A1].route.gridsX =\n      (short*)calloc((E1_pos + 1), sizeof(short));\n  treeedges[edge_n1A1].route.gridsY =\n      (short*)calloc((E1_pos + 1), sizeof(short));\n\n  if (A1x <= E1x) {\n    cnt = 0;\n    for (i = 0; i <= E1_pos; i++) {\n      treeedges[edge_n1A1].route.gridsX[cnt] = gridsX_n1A1[i];\n      treeedges[edge_n1A1].route.gridsY[cnt] = gridsY_n1A1[i];\n      cnt++;\n    }\n    treeedges[edge_n1A1].n1 = A1;\n    treeedges[edge_n1A1].n2 = n1;\n  } else {\n    cnt = 0;\n    for (i = E1_pos; i >= 0; i--) {\n      treeedges[edge_n1A1].route.gridsX[cnt] = gridsX_n1A1[i];\n      treeedges[edge_n1A1].route.gridsY[cnt] = gridsY_n1A1[i];\n      cnt++;\n    }\n    treeedges[edge_n1A1].n1 = n1;\n    treeedges[edge_n1A1].n2 = A1;\n  }\n\n  treeedges[edge_n1A1].route.type     = MAZEROUTE;\n  treeedges[edge_n1A1].route.routelen = E1_pos;\n  treeedges[edge_n1A1].len            = ADIFF(A1x, E1x) + ADIFF(A1y, E1y);\n\n  // reallocate memory for route.gridsX and route.gridsY\n  if (treeedges[edge_n1A2].route.type ==\n      MAZEROUTE) // if originally allocated, free them first\n  {\n    free(treeedges[edge_n1A2].route.gridsX);\n    free(treeedges[edge_n1A2].route.gridsY);\n  }\n  treeedges[edge_n1A2].route.gridsX =\n      (short*)calloc((cnt_n1A1 + cnt_n1A2 - E1_pos - 1), sizeof(short));\n  treeedges[edge_n1A2].route.gridsY =\n      (short*)calloc((cnt_n1A1 + cnt_n1A2 - E1_pos - 1), sizeof(short));\n\n  if (E1x <= A2x) {\n    cnt = 0;\n    for (i = E1_pos; i < cnt_n1A1; i++) {\n      treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A1[i];\n      treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A1[i];\n      cnt++;\n    }\n    for (i = 1; i < cnt_n1A2; i++) // 0 is n1 again, so no repeat\n    {\n      treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A2[i];\n      treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A2[i];\n      cnt++;\n    }\n    
treeedges[edge_n1A2].n1 = n1;\n    treeedges[edge_n1A2].n2 = A2;\n  } else {\n    cnt = 0;\n    for (i = cnt_n1A2 - 1; i >= 1; i--) // 0 is n1 again, so no repeat\n    {\n      treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A2[i];\n      treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A2[i];\n      cnt++;\n    }\n    for (i = cnt_n1A1 - 1; i >= E1_pos; i--) {\n      treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A1[i];\n      treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A1[i];\n      cnt++;\n    }\n    treeedges[edge_n1A2].n1 = A2;\n    treeedges[edge_n1A2].n2 = n1;\n  }\n  treeedges[edge_n1A2].route.type     = MAZEROUTE;\n  treeedges[edge_n1A2].route.routelen = cnt - 1;\n  treeedges[edge_n1A2].len            = ADIFF(A2x, E1x) + ADIFF(A2y, E1y);\n}\n\nvoid updateRouteType2(TreeNode* treenodes, int n1, int A1, int A2, int C1,\n                      int C2, int E1x, int E1y, TreeEdge* treeedges,\n                      int edge_n1A1, int edge_n1A2, int edge_C1C2) {\n  int i, cnt, A1x, A1y, A2x, A2y, C1x, C1y, C2x, C2y;\n  int edge_n1C1, edge_n1C2, edge_A1A2;\n  int cnt_n1A1, cnt_n1A2, cnt_C1C2, E1_pos = 0;\n  int len_A1A2, len_n1C1, len_n1C2;\n  int gridsX_n1A1[2 * (xGrid + yGrid)], gridsY_n1A1[2 * (xGrid + yGrid)];\n  int gridsX_n1A2[2 * (xGrid + yGrid)], gridsY_n1A2[2 * (xGrid + yGrid)];\n  int gridsX_C1C2[2 * (xGrid + yGrid)], gridsY_C1C2[2 * (xGrid + yGrid)];\n\n  A1x = treenodes[A1].x;\n  A1y = treenodes[A1].y;\n  A2x = treenodes[A2].x;\n  A2y = treenodes[A2].y;\n  C1x = treenodes[C1].x;\n  C1y = treenodes[C1].y;\n  C2x = treenodes[C2].x;\n  C2y = treenodes[C2].y;\n\n  edge_n1C1 = edge_n1A1;\n  edge_n1C2 = edge_n1A2;\n  edge_A1A2 = edge_C1C2;\n\n  // combine (n1, A1) and (n1, A2) into (A1, A2), A1 is the first node and A2 is\n  // the second grids order A1->n1->A2 copy (A1, n1)\n  cnt_n1A1 =\n      copyGrids(treenodes, A1, treeedges, edge_n1A1, gridsX_n1A1, gridsY_n1A1);\n\n  // copy (n1, A2)\n  cnt_n1A2 =\n      copyGrids(treenodes, n1, treeedges, 
edge_n1A2, gridsX_n1A2, gridsY_n1A2);\n\n  // copy all the grids on (C1, C2) to gridsX_C1C2[] and gridsY_C1C2[]\n  cnt_C1C2 =\n      copyGrids(treenodes, C1, treeedges, edge_C1C2, gridsX_C1C2, gridsY_C1C2);\n\n  // combine grids on original (A1, n1) and (n1, A2) to new (A1, A2)\n  // allocate memory for gridsX[] and gridsY[] of edge_A1A2\n  if (treeedges[edge_A1A2].route.type == MAZEROUTE) {\n    free(treeedges[edge_A1A2].route.gridsX);\n    free(treeedges[edge_A1A2].route.gridsY);\n  }\n  len_A1A2 = cnt_n1A1 + cnt_n1A2 - 1;\n\n  treeedges[edge_A1A2].route.gridsX   = (short*)calloc(len_A1A2, sizeof(short));\n  treeedges[edge_A1A2].route.gridsY   = (short*)calloc(len_A1A2, sizeof(short));\n  treeedges[edge_A1A2].route.routelen = len_A1A2 - 1;\n  treeedges[edge_A1A2].len            = ADIFF(A1x, A2x) + ADIFF(A1y, A2y);\n\n  cnt = 0;\n  for (i = 0; i < cnt_n1A1; i++) {\n    treeedges[edge_A1A2].route.gridsX[cnt] = gridsX_n1A1[i];\n    treeedges[edge_A1A2].route.gridsY[cnt] = gridsY_n1A1[i];\n    cnt++;\n  }\n  for (i = 1; i < cnt_n1A2; i++) // do not repeat point n1\n  {\n    treeedges[edge_A1A2].route.gridsX[cnt] = gridsX_n1A2[i];\n    treeedges[edge_A1A2].route.gridsY[cnt] = gridsY_n1A2[i];\n    cnt++;\n  }\n\n  // find the index of E1 in (C1, C2)\n  for (i = 0; i < cnt_C1C2; i++) {\n    if (gridsX_C1C2[i] == E1x && gridsY_C1C2[i] == E1y) {\n      E1_pos = i;\n      break;\n    }\n  }\n\n  // allocate memory for gridsX[] and gridsY[] of edge_n1C1 and edge_n1C2\n  if (treeedges[edge_n1C1].route.type == MAZEROUTE) {\n    free(treeedges[edge_n1C1].route.gridsX);\n    free(treeedges[edge_n1C1].route.gridsY);\n  }\n  len_n1C1                            = E1_pos + 1;\n  treeedges[edge_n1C1].route.gridsX   = (short*)calloc(len_n1C1, sizeof(short));\n  treeedges[edge_n1C1].route.gridsY   = (short*)calloc(len_n1C1, sizeof(short));\n  treeedges[edge_n1C1].route.routelen = len_n1C1 - 1;\n  treeedges[edge_n1C1].len            = ADIFF(C1x, E1x) + ADIFF(C1y, E1y);\n\n  if 
(treeedges[edge_n1C2].route.type == MAZEROUTE) {\n    free(treeedges[edge_n1C2].route.gridsX);\n    free(treeedges[edge_n1C2].route.gridsY);\n  }\n  len_n1C2                            = cnt_C1C2 - E1_pos;\n  treeedges[edge_n1C2].route.gridsX   = (short*)calloc(len_n1C2, sizeof(short));\n  treeedges[edge_n1C2].route.gridsY   = (short*)calloc(len_n1C2, sizeof(short));\n  treeedges[edge_n1C2].route.routelen = len_n1C2 - 1;\n  treeedges[edge_n1C2].len            = ADIFF(C2x, E1x) + ADIFF(C2y, E1y);\n\n  // split original (C1, C2) to (C1, n1) and (n1, C2)\n  cnt = 0;\n  for (i = 0; i <= E1_pos; i++) {\n    treeedges[edge_n1C1].route.gridsX[i] = gridsX_C1C2[i];\n    treeedges[edge_n1C1].route.gridsY[i] = gridsY_C1C2[i];\n    cnt++;\n  }\n\n  cnt = 0;\n  for (i = E1_pos; i < cnt_C1C2; i++) {\n    treeedges[edge_n1C2].route.gridsX[cnt] = gridsX_C1C2[i];\n    treeedges[edge_n1C2].route.gridsY[cnt] = gridsY_C1C2[i];\n    cnt++;\n  }\n}\n\nvoid reInitTree(int netID) {\n  int deg, numEdges, edgeID, d, j;\n  TreeEdge* treeedge;\n  Tree rsmt;\n  int x[MAXNETDEG], y[MAXNETDEG];\n\n  // printf(\"re init tree for net %d\\n\",netID);\n\n  newRipupNet(netID);\n\n  deg      = sttrees[netID].deg;\n  numEdges = 2 * deg - 3;\n  for (edgeID = 0; edgeID < numEdges; edgeID++) {\n    treeedge = &(sttrees[netID].edges[edgeID]);\n    if (treeedge->len > 0) {\n      free(treeedge->route.gridsX);\n      free(treeedge->route.gridsY);\n      free(treeedge->route.gridsL);\n    }\n  }\n  free(sttrees[netID].nodes);\n  free(sttrees[netID].edges);\n\n  // printf(\"old tree component freed\\n\");\n\n  d = nets[netID]->deg;\n  // printf(\"net deg %d\\n\",d);\n  // fflush(stdout);\n  for (j = 0; j < d; j++) {\n    x[j] = nets[netID]->pinX[j];\n    y[j] = nets[netID]->pinY[j];\n  }\n  // printf(\"before flute\\n\");\n  // fflush(stdout);\n  fluteCongest(netID, d, x, y, 2, 1.2, &rsmt);\n  // printf(\"fluted worked\\n\");\n  // fflush(stdout);\n  if (d > 3) {\n    edgeShiftNew(&rsmt);\n    // printf(\"edge 
shifted\\n\");\n  }\n  // fflush(stdout);\n  copyStTree(netID, rsmt);\n  // printf(\"tree copied\\n\");\n  // fflush(stdout);\n  newrouteLInMaze(netID);\n  // printf(\"L routing worked\\n\");\n  // fflush(stdout);\n  // newrouteZ(netID, 10);\n  // printf(\"Z routign worked\\n\");\n  // fflush(stdout);\n  convertToMazerouteNet(netID);\n  // printf(\"L to mzed converted\\n\");\n  // fflush(stdout);\n  // checkRoute2DTree(netID);\n  // printf(\"tree double checked\\n\");\n  // fflush(stdout);\n}\n\nvoid mazeRouteMSMD(int iter, int expand, float costHeight, int ripup_threshold,\n                   int mazeedge_Threshold, Bool Ordering, int cost_type) {\n  // LOCK = 0;\n  float forange;\n\n  // allocate memory for distance and parent and pop_heap\n  h_costTable = (float*)calloc(40 * hCapacity, sizeof(float));\n  v_costTable = (float*)calloc(40 * vCapacity, sizeof(float));\n\n  forange = 40 * hCapacity;\n\n  if (cost_type == 2) {\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity - 1)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity - 1)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  } else {\n\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1 +\n            
costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  }\n  // cout << \" i = vCap:\" << v_costTable[vCapacity-1] << \" \" <<\n  // v_costTable[vCapacity] << \" \" << v_costTable[vCapacity+1] << endl;\n\n  /*forange = yGrid*xGrid;\n  for(int i=0; i<forange; i++)\n  {\n      pop_heap2[i] = FALSE;\n  } //Michael*/\n\n  if (Ordering) {\n    StNetOrder();\n    // printf(\"order?\\n\");\n  }\n\n  galois::substrate::PerThreadStorage<THREAD_LOCAL_STORAGE>\n      thread_local_storage{};\n  // for(nidRPC=0; nidRPC<numValidNets; nidRPC++)//parallelize\n  PerThread_PQ perthread_pq;\n  PerThread_Vec perthread_vec;\n  PRINT = 0;\n  galois::GAccumulator<int> total_ripups;\n  galois::GReduceMax<int> max_ripups;\n  total_ripups.reset();\n  max_ripups.reset();\n\n  // galois::runtime::profileVtune( [&] (void) {\n  /*std::random_device rd;\n  std::mt19937 g(rd());\n  std::shuffle(net_shuffle.begin(), net_shuffle.end(), g);\n\n  galois::do_all(galois::iterate(net_shuffle), */\n  // galois::for_each(galois::iterate(0, numValidNets),\n  //        [&] (const auto nidRPC, auto& ctx)\n  galois::do_all(\n      galois::iterate(0, numValidNets),\n      [&](const auto nidRPC) {\n        int grid, netID;\n\n        // maze routing for multi-source, multi-destination\n        bool hypered, enter;\n        int i, j, deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin,\n            xmax, curX, curY, crossX, crossY, tmpX, tmpY, tmpi, min_x, min_y,\n            num_edges;\n        int regionX1, regionX2, regionY1, regionY2;\n        int ind1, tmpind, gridsX[XRANGE], gridsY[YRANGE], tmp_gridsX[XRANGE],\n            
tmp_gridsY[YRANGE];\n        int endpt1, endpt2, A1, A2, B1, B2, C1, C2, D1, D2, cnt, cnt_n1n2;\n        int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n            edge_C1C2;\n        int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2;\n        int E1x, E1y, E2x, E2y;\n        int tmp_grid;\n        int preX, preY, origENG, edgeREC;\n\n        float tmp, tmp_cost;\n        TreeEdge *treeedges, *treeedge;\n        TreeNode* treenodes;\n\n        bool* pop_heap2 = thread_local_storage.getLocal()->pop_heap2;\n\n        float** d1    = thread_local_storage.getLocal()->d1_p;\n        bool** HV     = thread_local_storage.getLocal()->HV_p;\n        bool** hyperV = thread_local_storage.getLocal()->hyperV_p;\n        bool** hyperH = thread_local_storage.getLocal()->hyperH_p;\n\n        short** parentX1 = thread_local_storage.getLocal()->parentX1_p;\n        short** parentX3 = thread_local_storage.getLocal()->parentX3_p;\n        short** parentY1 = thread_local_storage.getLocal()->parentY1_p;\n        short** parentY3 = thread_local_storage.getLocal()->parentY3_p;\n\n        int** corrEdge = thread_local_storage.getLocal()->corrEdge_p;\n\n        OrderNetEdge* netEO = thread_local_storage.getLocal()->netEO_p;\n\n        bool** inRegion = thread_local_storage.getLocal()->inRegion_p;\n\n        local_pq pq1 = perthread_pq.get();\n        local_vec v2 = perthread_vec.get();\n\n        /*for(i=0; i<yGrid*xGrid; i++)\n        {\n            pop_heap2[i] = FALSE;\n        } */\n\n        // memset(inRegion_alloc, 0, xGrid * yGrid * sizeof(bool));\n        /*for(int i=0; i<yGrid; i++)\n        {\n            for(int j=0; j<xGrid; j++)\n                inRegion[i][j] = FALSE;\n        }*/\n        // printf(\"hyperV[153][134]: %d %d %d\\n\", hyperV[153][134],\n        // parentY1[153][134], parentX3[153][134]); printf(\"what is\n        // happening?\\n\");\n\n        if (Ordering) {\n          netID = treeOrderCong[nidRPC].treeIndex;\n      
  } else {\n          netID = nidRPC;\n        }\n\n        deg = sttrees[netID].deg;\n\n        origENG = expand;\n\n        netedgeOrderDec(netID, netEO);\n\n        treeedges = sttrees[netID].edges;\n        treenodes = sttrees[netID].nodes;\n        // loop for all the tree edges (2*deg-3)\n        num_edges = 2 * deg - 3;\n\n        for (edgeREC = 0; edgeREC < num_edges; edgeREC++) {\n          edgeID   = netEO[edgeREC].edgeID;\n          treeedge = &(treeedges[edgeID]);\n\n          n1            = treeedge->n1;\n          n2            = treeedge->n2;\n          n1x           = treenodes[n1].x;\n          n1y           = treenodes[n1].y;\n          n2x           = treenodes[n2].x;\n          n2y           = treenodes[n2].y;\n          treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n          if (treeedge->len >\n              mazeedge_Threshold) // only route the non-degraded edges (len>0)\n          {\n            // enter = newRipupCheck(treeedge, n1x, n1y, n2x, n2y,\n            // ripup_threshold, netID, edgeID);\n            enter =\n                newRipupCheck_atomic(treeedge, ripup_threshold, netID, edgeID);\n\n            // ripup the routing for the edge\n            if (enter) {\n              /*pre_length = treeedge->route.routelen;\n              for(int i = 0; i < pre_length; i++)\n              {\n                  pre_gridsY[i] = treeedge->route.gridsY[i];\n                  pre_gridsX[i] = treeedge->route.gridsX[i];\n                  //printf(\"i %d x %d y %d\\n\", i, pre_gridsX[i], pre_gridsY[i]);\n              }*/\n              // if(netID == 252163 && edgeID == 51)\n              //    printf(\"netID %d edgeID %d src %d %d dst %d %d\\n\", netID,\n              //    edgeID, n1x, n1y, n2x, n2y);\n              if (n1y <= n2y) {\n                ymin = n1y;\n                ymax = n2y;\n              } else {\n                ymin = n2y;\n                ymax = n1y;\n              }\n\n              if (n1x <= n2x) {\n             
   xmin = n1x;\n                xmax = n2x;\n              } else {\n                xmin = n2x;\n                xmax = n1x;\n              }\n\n              int enlarge =\n                  min(origENG,\n                      (iter / 6 + 3) *\n                          treeedge->route\n                              .routelen); // michael, this was global variable\n              regionX1 = max(0, xmin - enlarge);\n              regionX2 = min(xGrid - 1, xmax + enlarge);\n              regionY1 = max(0, ymin - enlarge);\n              regionY2 = min(yGrid - 1, ymax + enlarge);\n\n              // initialize d1[][] and d2[][] as BIG_INT\n              for (i = regionY1; i <= regionY2; i++) {\n                for (j = regionX1; j <= regionX2; j++) {\n                  d1[i][j] = BIG_INT;\n                  /*d2[i][j] = BIG_INT;\n                  hyperH[i][j] = FALSE;\n                  hyperV[i][j] = FALSE;*/\n                }\n              }\n              // memset(hyperH, 0, xGrid * yGrid * sizeof(bool));\n              // memset(hyperV, 0, xGrid * yGrid * sizeof(bool));\n              for (i = regionY1; i <= regionY2; i++) {\n                for (j = regionX1; j <= regionX2; j++) {\n                  hyperH[i][j] = FALSE;\n                }\n              }\n              for (i = regionY1; i <= regionY2; i++) {\n                for (j = regionX1; j <= regionX2; j++) {\n                  hyperV[i][j] = FALSE;\n                }\n              }\n              // TODO: use seperate loops\n\n              // setup heap1, heap2 and initialize d1[][] and d2[][] for all the\n              // grids on the two subtrees\n              setupHeap(netID, edgeID, pq1, v2, regionX1, regionX2, regionY1,\n                        regionY2, d1, corrEdge, inRegion);\n              // TODO: use std priority queue\n              // while loop to find shortest path\n              ind1 = (pq1.top().d1_p - &d1[0][0]);\n              pq1.pop();\n              curX = ind1 % xGrid;\n  
            curY = ind1 / xGrid;\n\n              for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++) {\n                pop_heap2[*ii] = TRUE;\n              }\n              float curr_d1;\n              while (pop_heap2[ind1] ==\n                     FALSE) // stop until the grid position been popped out from\n                            // both heap1 and heap2\n              {\n                // relax all the adjacent grids within the enlarged region for\n                // source subtree\n\n                // if(PRINT) printf(\"curX curY %d %d, (%d, %d), (%d, %d),\n                // pq1.size: %d\\n\", curX, curY, regionX1, regionX2, regionY1,\n                // regionY2, pq1.size()); if(curX == 102 && curY == 221)\n                // exit(1);\n                curr_d1 = d1[curY][curX];\n                if (curr_d1 != 0) {\n                  if (HV[curY][curX]) {\n                    preX = parentX1[curY][curX];\n                    preY = parentY1[curY][curX];\n                  } else {\n                    preX = parentX3[curY][curX];\n                    preY = parentY3[curY][curX];\n                  }\n                } else {\n                  preX = curX;\n                  preY = curY;\n                }\n\n                // left\n                if (curX > regionX1) {\n                  grid = curY * (xGrid - 1) + curX - 1;\n                  tmpX = curX - 1; // the left neighbor\n                  if ((preY == curY) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  } else {\n                    if (curX < regionX2 - 1) {\n                      tmp_grid = curY * (xGrid - 1) + curX;\n                      tmp_cost =\n                          d1[curY][curX + 1] +\n                          h_costTable[h_edges[tmp_grid].usage +\n                            
          h_edges[tmp_grid].red +\n                                      (int)(L * h_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA &&\n                          d1[curY][tmpX] >\n                              tmp_cost +\n                                  h_costTable\n                                      [h_edges[grid].usage + h_edges[grid].red +\n                                       (int)(L * h_edges[grid].last_usage)]) {\n                        hyperH[curY][curX] = TRUE; // Michael\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  }\n                  // if(LOCK)  h_edges[grid].releaseLock();\n\n                  if (d1[curY][tmpX] >\n                      tmp) // left neighbor been put into heap1 but needs update\n                  {\n                    d1[curY][tmpX]       = tmp;\n                    parentX3[curY][tmpX] = curX;\n                    parentY3[curY][tmpX] = curY;\n                    HV[curY][tmpX]       = FALSE;\n                    pq1.push({&(d1[curY][tmpX]), tmp});\n                  }\n                }\n                // right\n                if (curX < regionX2) {\n                  grid = curY * (xGrid - 1) + curX;\n\n                  tmpX = curX + 1; // the right neighbor\n                  if ((preY == curY) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  } else {\n                    if (curX > regionX1 + 1) {\n                      tmp_grid = curY * (xGrid - 1) + curX - 1;\n                      tmp_cost =\n                          d1[curY][curX - 1] +\n                          
h_costTable[h_edges[tmp_grid].usage +\n                                      h_edges[tmp_grid].red +\n                                      (int)(L * h_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA &&\n                          d1[curY][tmpX] >\n                              tmp_cost +\n                                  h_costTable\n                                      [h_edges[grid].usage + h_edges[grid].red +\n                                       (int)(L * h_edges[grid].last_usage)]) {\n                        hyperH[curY][curX] = TRUE;\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  }\n\n                  if (d1[curY][tmpX] > tmp) // right neighbor been put into\n                                            // heap1 but needs update\n                  {\n                    d1[curY][tmpX]       = tmp;\n                    parentX3[curY][tmpX] = curX;\n                    parentY3[curY][tmpX] = curY;\n                    HV[curY][tmpX]       = FALSE;\n                    pq1.push({&(d1[curY][tmpX]), tmp});\n                  }\n                }\n                // bottom\n                if (curY > regionY1) {\n                  grid = (curY - 1) * xGrid + curX;\n\n                  tmpY = curY - 1; // the bottom neighbor\n                  if ((preX == curX) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  } else {\n                    if (curY < regionY2 - 1) {\n                      tmp_grid = curY * xGrid + curX;\n                      tmp_cost =\n                          d1[curY + 1][curX] +\n                          
v_costTable[v_edges[tmp_grid].usage +\n                                      v_edges[tmp_grid].red +\n                                      (int)(L * v_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA &&\n                          d1[tmpY][curX] >\n                              tmp_cost +\n                                  v_costTable\n                                      [v_edges[grid].usage + v_edges[grid].red +\n                                       (int)(L * v_edges[grid].last_usage)]) {\n                        hyperV[curY][curX] = TRUE;\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  }\n\n                  if (d1[tmpY][curX] > tmp) // bottom neighbor been put into\n                                            // heap1 but needs update\n                  {\n                    d1[tmpY][curX]       = tmp;\n                    parentX1[tmpY][curX] = curX;\n                    parentY1[tmpY][curX] = curY;\n                    HV[tmpY][curX]       = TRUE;\n                    pq1.push({&(d1[tmpY][curX]), tmp});\n                  }\n                }\n                // top\n                if (curY < regionY2) {\n                  grid = curY * xGrid + curX;\n                  tmpY = curY + 1; // the top neighbor\n\n                  if ((preX == curX) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  } else {\n                    if (curY > regionY1 + 1) {\n                      tmp_grid = (curY - 1) * xGrid + curX;\n                      tmp_cost =\n                          d1[curY - 1][curX] +\n                          
v_costTable[v_edges[tmp_grid].usage +\n                                      v_edges[tmp_grid].red +\n                                      (int)(L * v_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA &&\n                          d1[tmpY][curX] >\n                              tmp_cost +\n                                  v_costTable\n                                      [v_edges[grid].usage + v_edges[grid].red +\n                                       (int)(L * v_edges[grid].last_usage)]) {\n                        hyperV[curY][curX] = TRUE;\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  }\n                  if (d1[tmpY][curX] >\n                      tmp) // top neighbor been put into heap1 but needs update\n                  {\n                    d1[tmpY][curX]       = tmp;\n                    parentX1[tmpY][curX] = curX;\n                    parentY1[tmpY][curX] = curY;\n                    HV[tmpY][curX]       = TRUE;\n                    pq1.push({&(d1[tmpY][curX]), tmp});\n                  }\n                }\n\n                // update ind1 and ind2 for next loop, Michael: need to check if\n                // it is up-to-date value.\n                float d1_push;\n                do {\n                  ind1    = pq1.top().d1_p - &d1[0][0];\n                  d1_push = pq1.top().d1_push;\n                  pq1.pop();\n                  curX = ind1 % xGrid;\n                  curY = ind1 / xGrid;\n                } while (d1_push != d1[curY][curX]);\n              } // while loop\n\n              for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++)\n                pop_heap2[*ii] = FALSE;\n\n              crossX = ind1 % xGrid;\n              crossY = ind1 / xGrid;\n\n              cnt  = 
0;\n              curX = crossX;\n              curY = crossY;\n              while (d1[curY][curX] != 0) // loop until reach subtree1\n              {\n\n                /*if(cnt > 2000) {\n                    cerr << \"Y: \" << curY <<\" X:\" << curX << \" hyperH: \" <<\n                hyperH[curY][curX]; cerr << \" hyperV:\" << hyperV[curY][curX] <<\n                \" HV: \" << HV[curY][curX]; cerr << \" d1: \" << d1[curY][curX] <<\n                endl; cerr << \" deadloop return!\" << endl; reInitTree(netID);\n                    return;\n                }*/\n\n                hypered = FALSE;\n                if (cnt != 0) {\n                  if (curX != tmpX && hyperH[curY][curX]) {\n                    curX    = 2 * curX - tmpX;\n                    hypered = TRUE;\n                  }\n                  // printf(\"hyperV[153][134]: %d\\n\", hyperV[curY][curX]);\n                  if (curY != tmpY && hyperV[curY][curX]) {\n                    curY    = 2 * curY - tmpY;\n                    hypered = TRUE;\n                  }\n                }\n                tmpX = curX;\n                tmpY = curY;\n                if (!hypered) {\n                  if (HV[tmpY][tmpX]) {\n                    curY = parentY1[tmpY][tmpX];\n                  } else {\n                    curX = parentX3[tmpY][tmpX];\n                  }\n                }\n\n                tmp_gridsX[cnt] = curX;\n                tmp_gridsY[cnt] = curY;\n                cnt++;\n              }\n              // reverse the grids on the path\n\n              for (i = 0; i < cnt; i++) {\n                tmpind    = cnt - 1 - i;\n                gridsX[i] = tmp_gridsX[tmpind];\n                gridsY[i] = tmp_gridsY[tmpind];\n              }\n              // add the connection point (crossX, crossY)\n              gridsX[cnt] = crossX;\n              gridsY[cnt] = crossY;\n              cnt++;\n\n              curX     = crossX;\n              curY     = crossY;\n              cnt_n1n2 
= cnt;\n\n              // change the tree structure according to the new routing for the\n              // tree edge find E1 and E2, and the endpoints of the edges they\n              // are on\n              E1x = gridsX[0];\n              E1y = gridsY[0];\n              E2x = gridsX[cnt_n1n2 - 1];\n              E2y = gridsY[cnt_n1n2 - 1];\n\n              edge_n1n2 = edgeID;\n              // if(netID == 252163 && edgeID == 51)\n              //    printf(\"E1x: %d, E1y: %d, E2x: %d, E2y %d length: %d\\n\",\n              //    E1x, E1y, E2x, E2y, cnt_n1n2);\n\n              // (1) consider subtree1\n              if (n1 >= deg && (E1x != n1x || E1y != n1y))\n              // n1 is not a pin and E1!=n1, then make change to subtree1,\n              // otherwise, no change to subtree1\n              {\n                // find the endpoints of the edge E1 is on\n                endpt1 = treeedges[corrEdge[E1y][E1x]].n1;\n                endpt2 = treeedges[corrEdge[E1y][E1x]].n2;\n\n                // find A1, A2 and edge_n1A1, edge_n1A2\n                if (treenodes[n1].nbr[0] == n2) {\n                  A1        = treenodes[n1].nbr[1];\n                  A2        = treenodes[n1].nbr[2];\n                  edge_n1A1 = treenodes[n1].edge[1];\n                  edge_n1A2 = treenodes[n1].edge[2];\n                } else if (treenodes[n1].nbr[1] == n2) {\n                  A1        = treenodes[n1].nbr[0];\n                  A2        = treenodes[n1].nbr[2];\n                  edge_n1A1 = treenodes[n1].edge[0];\n                  edge_n1A2 = treenodes[n1].edge[2];\n                } else {\n                  A1        = treenodes[n1].nbr[0];\n                  A2        = treenodes[n1].nbr[1];\n                  edge_n1A1 = treenodes[n1].edge[0];\n                  edge_n1A2 = treenodes[n1].edge[1];\n                }\n\n                if (endpt1 == n1 ||\n                    endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n                {\n                  // 
if E1 is on (n1, A2), switch A1 and A2 so that E1 is always\n                  // on (n1, A1)\n                  if (endpt1 == A2 || endpt2 == A2) {\n                    tmpi      = A1;\n                    A1        = A2;\n                    A2        = tmpi;\n                    tmpi      = edge_n1A1;\n                    edge_n1A1 = edge_n1A2;\n                    edge_n1A2 = tmpi;\n                  }\n\n                  // update route for edge (n1, A1), (n1, A2)\n                  updateRouteType1(treenodes, n1, A1, A2, E1x, E1y, treeedges,\n                                   edge_n1A1, edge_n1A2);\n                  // update position for n1\n                  treenodes[n1].x = E1x;\n                  treenodes[n1].y = E1y;\n                }    // if E1 is on (n1, A1) or (n1, A2)\n                else // E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n                {\n                  C1        = endpt1;\n                  C2        = endpt2;\n                  edge_C1C2 = corrEdge[E1y][E1x];\n\n                  // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n                  updateRouteType2(treenodes, n1, A1, A2, C1, C2, E1x, E1y,\n                                   treeedges, edge_n1A1, edge_n1A2, edge_C1C2);\n                  // update position for n1\n                  treenodes[n1].x = E1x;\n                  treenodes[n1].y = E1y;\n                  // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2), (C1,\n                  // C2)->(A1, A2)\n                  edge_n1C1               = edge_n1A1;\n                  treeedges[edge_n1C1].n1 = C1;\n                  treeedges[edge_n1C1].n2 = n1;\n                  edge_n1C2               = edge_n1A2;\n                  treeedges[edge_n1C2].n1 = n1;\n                  treeedges[edge_n1C2].n2 = C2;\n                  edge_A1A2               = edge_C1C2;\n                  treeedges[edge_A1A2].n1 = A1;\n                  treeedges[edge_A1A2].n2 = A2;\n                  // update 
nbr and edge for 5 nodes n1, A1, A2, C1, C2\n                  // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n                  treenodes[n1].nbr[0]  = n2;\n                  treenodes[n1].edge[0] = edge_n1n2;\n                  treenodes[n1].nbr[1]  = C1;\n                  treenodes[n1].edge[1] = edge_n1C1;\n                  treenodes[n1].nbr[2]  = C2;\n                  treenodes[n1].edge[2] = edge_n1C2;\n                  // A1's nbr n1->A2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[A1].nbr[i] == n1) {\n                      treenodes[A1].nbr[i]  = A2;\n                      treenodes[A1].edge[i] = edge_A1A2;\n                      break;\n                    }\n                  }\n                  // A2's nbr n1->A1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[A2].nbr[i] == n1) {\n                      treenodes[A2].nbr[i]  = A1;\n                      treenodes[A2].edge[i] = edge_A1A2;\n                      break;\n                    }\n                  }\n                  // C1's nbr C2->n1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[C1].nbr[i] == C2) {\n                      treenodes[C1].nbr[i]  = n1;\n                      treenodes[C1].edge[i] = edge_n1C1;\n                      break;\n                    }\n                  }\n                  // C2's nbr C1->n1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[C2].nbr[i] == C1) {\n                      treenodes[C2].nbr[i]  = n1;\n                      treenodes[C2].edge[i] = edge_n1C2;\n                      break;\n                    }\n                  }\n\n                } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n              }   // n1 is not a pin and E1!=n1\n\n              // (2) consider subtree2\n\n              if (n2 >= deg && (E2x != n2x || E2y != n2y))\n              // n2 is not a pin and E2!=n2, then make change to subtree2,\n 
             // otherwise, no change to subtree2\n              {\n                // find the endpoints of the edge E1 is on\n                endpt1 = treeedges[corrEdge[E2y][E2x]].n1;\n                endpt2 = treeedges[corrEdge[E2y][E2x]].n2;\n\n                // find B1, B2\n                if (treenodes[n2].nbr[0] == n1) {\n                  B1        = treenodes[n2].nbr[1];\n                  B2        = treenodes[n2].nbr[2];\n                  edge_n2B1 = treenodes[n2].edge[1];\n                  edge_n2B2 = treenodes[n2].edge[2];\n                } else if (treenodes[n2].nbr[1] == n1) {\n                  B1        = treenodes[n2].nbr[0];\n                  B2        = treenodes[n2].nbr[2];\n                  edge_n2B1 = treenodes[n2].edge[0];\n                  edge_n2B2 = treenodes[n2].edge[2];\n                } else {\n                  B1        = treenodes[n2].nbr[0];\n                  B2        = treenodes[n2].nbr[1];\n                  edge_n2B1 = treenodes[n2].edge[0];\n                  edge_n2B2 = treenodes[n2].edge[1];\n                }\n\n                if (endpt1 == n2 ||\n                    endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n                {\n                  // if E2 is on (n2, B2), switch B1 and B2 so that E2 is always\n                  // on (n2, B1)\n                  if (endpt1 == B2 || endpt2 == B2) {\n                    tmpi      = B1;\n                    B1        = B2;\n                    B2        = tmpi;\n                    tmpi      = edge_n2B1;\n                    edge_n2B1 = edge_n2B2;\n                    edge_n2B2 = tmpi;\n                  }\n\n                  // update route for edge (n2, B1), (n2, B2)\n                  updateRouteType1(treenodes, n2, B1, B2, E2x, E2y, treeedges,\n                                   edge_n2B1, edge_n2B2);\n\n                  // update position for n2\n                  treenodes[n2].x = E2x;\n                  treenodes[n2].y = E2y;\n                }    // if E2 
is on (n2, B1) or (n2, B2)\n                else // E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n                {\n                  D1        = endpt1;\n                  D2        = endpt2;\n                  edge_D1D2 = corrEdge[E2y][E2x];\n\n                  // update route for edge (n2, D1), (n2, D2) and (B1, B2)\n                  updateRouteType2(treenodes, n2, B1, B2, D1, D2, E2x, E2y,\n                                   treeedges, edge_n2B1, edge_n2B2, edge_D1D2);\n                  // update position for n2\n                  treenodes[n2].x = E2x;\n                  treenodes[n2].y = E2y;\n                  // update 3 edges (n2, B1)->(D1, n2), (n2, B2)->(n2, D2), (D1,\n                  // D2)->(B1, B2)\n                  edge_n2D1               = edge_n2B1;\n                  treeedges[edge_n2D1].n1 = D1;\n                  treeedges[edge_n2D1].n2 = n2;\n                  edge_n2D2               = edge_n2B2;\n                  treeedges[edge_n2D2].n1 = n2;\n                  treeedges[edge_n2D2].n2 = D2;\n                  edge_B1B2               = edge_D1D2;\n                  treeedges[edge_B1B2].n1 = B1;\n                  treeedges[edge_B1B2].n2 = B2;\n                  // update nbr and edge for 5 nodes n2, B1, B2, D1, D2\n                  // n1's nbr (n1, B1, B2)->(n1, D1, D2)\n                  treenodes[n2].nbr[0]  = n1;\n                  treenodes[n2].edge[0] = edge_n1n2;\n                  treenodes[n2].nbr[1]  = D1;\n                  treenodes[n2].edge[1] = edge_n2D1;\n                  treenodes[n2].nbr[2]  = D2;\n                  treenodes[n2].edge[2] = edge_n2D2;\n                  // B1's nbr n2->B2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[B1].nbr[i] == n2) {\n                      treenodes[B1].nbr[i]  = B2;\n                      treenodes[B1].edge[i] = edge_B1B2;\n                      break;\n                    }\n                  }\n                  // B2's nbr n2->B1\n          
        for (i = 0; i < 3; i++) {\n                    if (treenodes[B2].nbr[i] == n2) {\n                      treenodes[B2].nbr[i]  = B1;\n                      treenodes[B2].edge[i] = edge_B1B2;\n                      break;\n                    }\n                  }\n                  // D1's nbr D2->n2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[D1].nbr[i] == D2) {\n                      treenodes[D1].nbr[i]  = n2;\n                      treenodes[D1].edge[i] = edge_n2D1;\n                      break;\n                    }\n                  }\n                  // D2's nbr D1->n2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[D2].nbr[i] == D1) {\n                      treenodes[D2].nbr[i]  = n2;\n                      treenodes[D2].edge[i] = edge_n2D2;\n                      break;\n                    }\n                  }\n                } // else E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n              }   // n2 is not a pin and E2!=n2\n\n              // update route for edge (n1, n2) and edge usage\n\n              // printf(\"update route? 
%d %d\\n\", netID, num_edges);\n              if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n                free(treeedges[edge_n1n2].route.gridsX);\n                free(treeedges[edge_n1n2].route.gridsY);\n              }\n              treeedges[edge_n1n2].route.gridsX =\n                  (short*)calloc(cnt_n1n2, sizeof(short));\n              treeedges[edge_n1n2].route.gridsY =\n                  (short*)calloc(cnt_n1n2, sizeof(short));\n              treeedges[edge_n1n2].route.type     = MAZEROUTE;\n              treeedges[edge_n1n2].route.routelen = cnt_n1n2 - 1;\n              treeedges[edge_n1n2].len = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n              treeedges[edge_n1n2].n_ripups += 1;\n              total_ripups += 1;\n              max_ripups.update(treeedges[edge_n1n2].n_ripups);\n\n              for (i = 0; i < cnt_n1n2; i++) {\n                // printf(\"cnt_n1n2: %d\\n\", cnt_n1n2);\n                treeedges[edge_n1n2].route.gridsX[i] = gridsX[i];\n                treeedges[edge_n1n2].route.gridsY[i] = gridsY[i];\n              }\n\n              // update edge usage\n\n              /*for(i=0; i<pre_length; i++)\n              {\n                  if(pre_gridsX[i]==pre_gridsX[i+1]) // a vertical edge\n                  {\n                      if(i != pre_length - 1)\n                          min_y = min(pre_gridsY[i], pre_gridsY[i+1]);\n                      else\n                          min_y = pre_gridsY[i];\n                      //v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n                      //galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage,\n              (short unsigned)1);\n                      //printf(\"x y %d %d i %d \\n\", pre_gridsX[i], min_y, i);\n                      v_edges[min_y*xGrid+pre_gridsX[i]].usage.fetch_sub((short\n              int)1);\n                      //if(v_edges[min_y*xGrid+pre_gridsX[i]].usage < 0)\n              printf(\"V negative! 
%d \\n\", i);\n                  }\n                  else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                  {\n                      if(i != pre_length - 1)\n                          min_x = min(pre_gridsX[i], pre_gridsX[i+1]);\n                      else\n                          min_x = pre_gridsX[i];\n                      //h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n                      //galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n              (short unsigned)1);\n                      //printf(\"x y %d %d i %d\\n\", min_x, pre_gridsY[i], i);\n                      h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage.fetch_sub((short\n              int)1);\n                      //if(h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage < 0)\n              printf(\"H negative! %d \\n\", i);\n                  }\n              }*/\n\n              for (i = 0; i < cnt_n1n2 - 1; i++) {\n                if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n                {\n                  min_y = min(gridsY[i], gridsY[i + 1]);\n                  // v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n                  // galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage,\n                  // (short unsigned)1);\n                  v_edges[min_y * xGrid + gridsX[i]].usage.fetch_add(\n                      (short int)1);\n\n                } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                {\n                  min_x = min(gridsX[i], gridsX[i + 1]);\n                  // h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n                  // galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n                  // (short unsigned)1);\n                  h_edges[gridsY[i] * (xGrid - 1) + min_x].usage.fetch_add(\n                      (short int)1);\n                }\n              }\n              /*if(LOCK){\n                  for(i=0; i<cnt_n1n2-1; i++)\n                  {\n                      
if(gridsX[i]==gridsX[i+1]) // a vertical edge\n                      {\n                          min_y = min(gridsY[i], gridsY[i+1]);\n                          v_edges[min_y*xGrid+gridsX[i]].releaseLock();\n                      }\n                      else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                      {\n                          min_x = min(gridsX[i], gridsX[i+1]);\n                          h_edges[gridsY[i]*(xGrid-1)+min_x].releaseLock();\n                      }\n                  }\n              }*/\n              if (checkRoute2DTree(netID)) {\n                reInitTree(netID);\n                return;\n              }\n            } // congested route\n          }   // maze routing\n        }     // loop edgeID\n      },\n      // galois::wl<galois::worklists::ParaMeter<>>(),\n      galois::steal(),\n      // galois::chunk_size<64>(),\n      galois::loopname(\"net-level parallelism\")); // galois::do_all\n\n  printf(\"total ripups: %d max ripups: %d\\n\", total_ripups.reduce(),\n         max_ripups.reduce());\n  //}, \"mazeroute vtune function\");\n  free(h_costTable);\n  free(v_costTable);\n}\n\nvoid mazeRouteMSMD_block(int iter, int expand, float costHeight,\n                         int ripup_threshold, int mazeedge_Threshold,\n                         Bool Ordering, int cost_type,\n                         galois::InsertBag<int>* net_shuffle) {\n  // LOCK = 0;\n  float forange;\n\n  // allocate memory for distance and parent and pop_heap\n  h_costTable = (float*)calloc(40 * hCapacity, sizeof(float));\n  v_costTable = (float*)calloc(40 * vCapacity, sizeof(float));\n\n  forange = 40 * hCapacity;\n\n  if (cost_type == 2) {\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity - 1)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n         
   costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity - 1)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  } else {\n\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  }\n\n  /*forange = yGrid*xGrid;\n  for(i=0; i<forange; i++)\n  {\n      pop_heap2[i] = FALSE;\n  } //Michael*/\n\n  if (Ordering) {\n    StNetOrder();\n    // printf(\"order?\\n\");\n  }\n\n  galois::substrate::PerThreadStorage<THREAD_LOCAL_STORAGE>\n      thread_local_storage{};\n  // for(nidRPC=0; nidRPC<numValidNets; nidRPC++)//parallelize\n  PerThread_PQ perthread_pq;\n  PerThread_Vec perthread_vec;\n  PRINT = 0;\n  // galois::runtime::profileVtune( [&] (void) {\n  /*std::random_device rd;\n  std::mt19937 g(rd());\n  std::shuffle(net_shuffle.begin(), net_shuffle.end(), g);\n\n  galois::do_all(galois::iterate(net_shuffle), */\n  // galois::do_all(galois::iterate(0, numValidNets), [&] (const auto nidRPC)\n\n  galois::on_each(\n      [&](const unsigned tid, const unsigned numT)\n      // for(unsigned int nidRPC = 0; 
nidRPC < numValidNets; nidRPC++)\n      {\n        if (tid >= numT)\n          return;\n        for (const auto nidRPC : net_shuffle[tid]) {\n          int grid, netID;\n\n          // maze routing for multi-source, multi-destination\n          Bool hypered, enter;\n          int i, j, deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin,\n              xmax, curX, curY, crossX, crossY, tmpX, tmpY, tmpi, min_x, min_y,\n              num_edges;\n          int regionX1, regionX2, regionY1, regionY2;\n          int ind1, tmpind, gridsX[XRANGE], gridsY[YRANGE], tmp_gridsX[XRANGE],\n              tmp_gridsY[YRANGE];\n          int endpt1, endpt2, A1, A2, B1, B2, C1, C2, D1, D2, cnt, cnt_n1n2;\n          int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n              edge_C1C2;\n          int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2;\n          int E1x, E1y, E2x, E2y;\n          int tmp_grid, tmp_cost;\n          int preX, preY, origENG, edgeREC;\n\n          float tmp;\n          TreeEdge *treeedges, *treeedge;\n          TreeNode* treenodes;\n\n          bool* pop_heap2 = thread_local_storage.getLocal()->pop_heap2;\n\n          float** d1    = thread_local_storage.getLocal()->d1_p;\n          bool** HV     = thread_local_storage.getLocal()->HV_p;\n          bool** hyperV = thread_local_storage.getLocal()->hyperV_p;\n          bool** hyperH = thread_local_storage.getLocal()->hyperH_p;\n\n          short** parentX1 = thread_local_storage.getLocal()->parentX1_p;\n          short** parentX3 = thread_local_storage.getLocal()->parentX3_p;\n          short** parentY1 = thread_local_storage.getLocal()->parentY1_p;\n          short** parentY3 = thread_local_storage.getLocal()->parentY3_p;\n\n          int** corrEdge = thread_local_storage.getLocal()->corrEdge_p;\n\n          OrderNetEdge* netEO = thread_local_storage.getLocal()->netEO_p;\n\n          bool** inRegion = thread_local_storage.getLocal()->inRegion_p;\n\n          
local_pq pq1 = perthread_pq.get();\n          local_vec v2 = perthread_vec.get();\n\n          /*for(i=0; i<yGrid*xGrid; i++)\n          {\n              pop_heap2[i] = FALSE;\n          } */\n\n          /*for(int i=0; i<yGrid; i++)\n          {\n              for(int j=0; j<xGrid; j++)\n                  inRegion[i][j] = FALSE;\n          }*/\n          // printf(\"hyperV[153][134]: %d %d %d\\n\", hyperV[153][134],\n          // parentY1[153][134], parentX3[153][134]); printf(\"what is\n          // happening?\\n\");\n\n          if (Ordering) {\n            netID = treeOrderCong[nidRPC].treeIndex;\n          } else {\n            netID = nidRPC;\n          }\n\n          deg = sttrees[netID].deg;\n\n          origENG = expand;\n\n          netedgeOrderDec(netID, netEO);\n\n          treeedges = sttrees[netID].edges;\n          treenodes = sttrees[netID].nodes;\n          // loop for all the tree edges (2*deg-3)\n          num_edges = 2 * deg - 3;\n\n          for (edgeREC = 0; edgeREC < num_edges; edgeREC++) {\n            edgeID   = netEO[edgeREC].edgeID;\n            treeedge = &(treeedges[edgeID]);\n\n            n1            = treeedge->n1;\n            n2            = treeedge->n2;\n            n1x           = treenodes[n1].x;\n            n1y           = treenodes[n1].y;\n            n2x           = treenodes[n2].x;\n            n2y           = treenodes[n2].y;\n            treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n            if (treeedge->len >\n                mazeedge_Threshold) // only route the non-degraded edges (len>0)\n            {\n\n              enter = newRipupCheck(treeedge, ripup_threshold, netID, edgeID);\n\n              // ripup the routing for the edge\n              if (enter) {\n\n                if (n1y <= n2y) {\n                  ymin = n1y;\n                  ymax = n2y;\n                } else {\n                  ymin = n2y;\n                  ymax = n1y;\n                }\n\n                if (n1x <= n2x) {\n     
             xmin = n1x;\n                  xmax = n2x;\n                } else {\n                  xmin = n2x;\n                  xmax = n1x;\n                }\n\n                int enlarge =\n                    min(origENG,\n                        (iter / 6 + 3) *\n                            treeedge->route\n                                .routelen); // michael, this was global variable\n                regionX1 = max(0, xmin - enlarge);\n                regionX2 = min(xGrid - 1, xmax + enlarge);\n                regionY1 = max(0, ymin - enlarge);\n                regionY2 = min(yGrid - 1, ymax + enlarge);\n\n                // initialize d1[][] and d2[][] as BIG_INT\n                for (i = regionY1; i <= regionY2; i++) {\n                  for (j = regionX1; j <= regionX2; j++) {\n                    d1[i][j] = BIG_INT;\n                    /*d2[i][j] = BIG_INT;\n                    hyperH[i][j] = FALSE;\n                    hyperV[i][j] = FALSE;*/\n                  }\n                }\n                // memset(hyperH, 0, xGrid * yGrid * sizeof(bool));\n                // memset(hyperV, 0, xGrid * yGrid * sizeof(bool));\n                for (i = regionY1; i <= regionY2; i++) {\n                  for (j = regionX1; j <= regionX2; j++) {\n                    hyperH[i][j] = FALSE;\n                  }\n                }\n                for (i = regionY1; i <= regionY2; i++) {\n                  for (j = regionX1; j <= regionX2; j++) {\n                    hyperV[i][j] = FALSE;\n                  }\n                }\n                // TODO: use seperate loops\n\n                // setup heap1, heap2 and initialize d1[][] and d2[][] for all\n                // the grids on the two subtrees\n                setupHeap(netID, edgeID, pq1, v2, regionX1, regionX2, regionY1,\n                          regionY2, d1, corrEdge, inRegion);\n                // TODO: use std priority queue\n                // while loop to find shortest path\n                ind1 
= (pq1.top().d1_p - &d1[0][0]);\n                pq1.pop();\n                curX = ind1 % xGrid;\n                curY = ind1 / xGrid;\n\n                for (local_vec::iterator ii = v2.begin(); ii != v2.end();\n                     ii++) {\n                  pop_heap2[*ii] = TRUE;\n                }\n                while (pop_heap2[ind1] ==\n                       FALSE) // stop until the grid position been popped out\n                              // from both heap1 and heap2\n                {\n                  // relax all the adjacent grids within the enlarged region for\n                  // source subtree\n\n                  // if(PRINT) printf(\"curX curY %d %d, (%d, %d), (%d, %d),\n                  // pq1.size: %d\\n\", curX, curY, regionX1, regionX2, regionY1,\n                  // regionY2, pq1.size()); if(curX == 102 && curY == 221)\n                  // exit(1);\n                  float curr_d1 = d1[curY][curX];\n                  if (curr_d1 != 0) {\n                    if (HV[curY][curX]) {\n                      preX = parentX1[curY][curX];\n                      preY = parentY1[curY][curX];\n                    } else {\n                      preX = parentX3[curY][curX];\n                      preY = parentY3[curY][curX];\n                    }\n                  } else {\n                    preX = curX;\n                    preY = curY;\n                  }\n\n                  // left\n                  if (curX > regionX1) {\n                    grid = curY * (xGrid - 1) + curX - 1;\n\n                    if ((preY == curY) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    } else {\n                      if (curX < regionX2 - 1) {\n                        tmp_grid = curY * (xGrid - 1) + curX;\n                        tmp_cost 
=\n                            d1[curY][curX + 1] +\n                            h_costTable[h_edges[tmp_grid].usage +\n                                        h_edges[tmp_grid].red +\n                                        (int)(L *\n                                              h_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          hyperH[curY][curX] = TRUE; // Michael\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    }\n                    // if(LOCK)  h_edges[grid].releaseLock();\n                    tmpX = curX - 1; // the left neighbor\n\n                    /*if(d1[curY][tmpX]>=BIG_INT) // left neighbor not been put\n                    into heap1\n                    {\n                        d1[curY][tmpX] = tmp;\n                        parentX3[curY][tmpX] = curX;\n                        parentY3[curY][tmpX] = curY;\n                        HV[curY][tmpX] = FALSE;\n                        pq1.push(&(d1[curY][tmpX]));\n                    }\n                    else */\n                    if (d1[curY][tmpX] > tmp) // left neighbor been put into\n                                              // heap1 but needs update\n                    {\n                      d1[curY][tmpX]       = tmp;\n                      parentX3[curY][tmpX] = curX;\n                      parentY3[curY][tmpX] = curY;\n                      HV[curY][tmpX]       = FALSE;\n                      pq1.push({&(d1[curY][tmpX]), tmp});\n                    }\n                  }\n                  // right\n                  if (curX < regionX2) {\n                    grid = curY * (xGrid - 1) + curX;\n\n                    if ((preY == curY) || (curr_d1 == 0)) {\n             
         tmp =\n                          curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    } else {\n                      if (curX > regionX1 + 1) {\n                        tmp_grid = curY * (xGrid - 1) + curX - 1;\n                        tmp_cost =\n                            d1[curY][curX - 1] +\n                            h_costTable[h_edges[tmp_grid].usage +\n                                        h_edges[tmp_grid].red +\n                                        (int)(L *\n                                              h_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          hyperH[curY][curX] = TRUE;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    }\n                    // if(LOCK) h_edges[grid].releaseLock();\n                    tmpX = curX + 1; // the right neighbor\n\n                    /*if(d1[curY][tmpX]>=BIG_INT) // right neighbor not been put\n                    into heap1\n                    {\n                        d1[curY][tmpX] = tmp;\n                        parentX3[curY][tmpX] = curX;\n                        parentY3[curY][tmpX] = curY;\n                        HV[curY][tmpX] = FALSE;\n                        pq1.push(&(d1[curY][tmpX]));\n\n                    }\n                    else */\n                    if (d1[curY][tmpX] > tmp) // right neighbor been put into\n                                              // heap1 but needs update\n                    {\n                      d1[curY][tmpX]       = tmp;\n                      parentX3[curY][tmpX] = curX;\n                      
parentY3[curY][tmpX] = curY;\n                      HV[curY][tmpX]       = FALSE;\n                      pq1.push({&(d1[curY][tmpX]), tmp});\n                    }\n                  }\n                  // bottom\n                  if (curY > regionY1) {\n                    grid = (curY - 1) * xGrid + curX;\n\n                    if ((preX == curX) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    } else {\n                      if (curY < regionY2 - 1) {\n                        tmp_grid = curY * xGrid + curX;\n                        tmp_cost =\n                            d1[curY + 1][curX] +\n                            v_costTable[v_edges[tmp_grid].usage +\n                                        v_edges[tmp_grid].red +\n                                        (int)(L *\n                                              v_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          hyperV[curY][curX] = TRUE;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    }\n                    // if(LOCK) v_edges[grid].releaseLock();\n                    tmpY = curY - 1; // the bottom neighbor\n\n                    /*if(d1[tmpY][curX]>=BIG_INT) // bottom neighbor not been\n                    put into heap1\n                    {\n                        d1[tmpY][curX] = tmp;\n                        parentX1[tmpY][curX] = curX;\n                        parentY1[tmpY][curX] = curY;\n                        HV[tmpY][curX] = TRUE;\n                        
pq1.push(&(d1[tmpY][curX]));\n\n                    }\n                    else */\n                    if (d1[tmpY][curX] > tmp) // bottom neighbor been put into\n                                              // heap1 but needs update\n                    {\n                      d1[tmpY][curX]       = tmp;\n                      parentX1[tmpY][curX] = curX;\n                      parentY1[tmpY][curX] = curY;\n                      HV[tmpY][curX]       = TRUE;\n                      pq1.push({&(d1[tmpY][curX]), tmp});\n                    }\n                  }\n                  // top\n                  if (curY < regionY2) {\n                    grid = curY * xGrid + curX;\n\n                    if ((preX == curX) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    } else {\n                      if (curY > regionY1 + 1) {\n                        tmp_grid = (curY - 1) * xGrid + curX;\n                        tmp_cost =\n                            d1[curY - 1][curX] +\n                            v_costTable[v_edges[tmp_grid].usage +\n                                        v_edges[tmp_grid].red +\n                                        (int)(L *\n                                              v_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          hyperV[curY][curX] = TRUE;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    }\n                    // if(LOCK) v_edges[grid].releaseLock();\n                    tmpY = curY + 1; // the top 
neighbor\n\n                    /*if(d1[tmpY][curX]>=BIG_INT) // top neighbor not been put\n                    into heap1\n                    {\n                        d1[tmpY][curX] = tmp;\n                        parentX1[tmpY][curX] = curX;\n                        parentY1[tmpY][curX] = curY;\n                        HV[tmpY][curX] = TRUE;\n                        pq1.push(&(d1[tmpY][curX]));\n                    }\n                    else*/\n                    if (d1[tmpY][curX] > tmp) // top neighbor been put into\n                                              // heap1 but needs update\n                    {\n                      d1[tmpY][curX]       = tmp;\n                      parentX1[tmpY][curX] = curX;\n                      parentY1[tmpY][curX] = curY;\n                      HV[tmpY][curX]       = TRUE;\n                      pq1.push({&(d1[tmpY][curX]), tmp});\n                    }\n                  }\n\n                  // update ind1 and ind2 for next loop, Michael: need to check\n                  // if it is up-to-date value.\n                  float d1_push;\n                  do {\n                    ind1    = pq1.top().d1_p - &d1[0][0];\n                    d1_push = pq1.top().d1_push;\n                    pq1.pop();\n                    curX = ind1 % xGrid;\n                    curY = ind1 / xGrid;\n                  } while (d1_push != d1[curY][curX]);\n                } // while loop\n\n                for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++)\n                  pop_heap2[*ii] = FALSE;\n\n                crossX = ind1 % xGrid;\n                crossY = ind1 / xGrid;\n\n                cnt  = 0;\n                curX = crossX;\n                curY = crossY;\n                while (d1[curY][curX] != 0) // loop until reach subtree1\n                {\n                  hypered = FALSE;\n                  if (cnt != 0) {\n                    if (curX != tmpX && hyperH[curY][curX]) {\n                      curX    
= 2 * curX - tmpX;\n                      hypered = TRUE;\n                    }\n                    // printf(\"hyperV[153][134]: %d\\n\", hyperV[curY][curX]);\n                    if (curY != tmpY && hyperV[curY][curX]) {\n                      curY    = 2 * curY - tmpY;\n                      hypered = TRUE;\n                    }\n                  }\n                  tmpX = curX;\n                  tmpY = curY;\n                  if (!hypered) {\n                    if (HV[tmpY][tmpX]) {\n                      curY = parentY1[tmpY][tmpX];\n                    } else {\n                      curX = parentX3[tmpY][tmpX];\n                    }\n                  }\n\n                  tmp_gridsX[cnt] = curX;\n                  tmp_gridsY[cnt] = curY;\n                  cnt++;\n                }\n                // reverse the grids on the path\n\n                for (i = 0; i < cnt; i++) {\n                  tmpind    = cnt - 1 - i;\n                  gridsX[i] = tmp_gridsX[tmpind];\n                  gridsY[i] = tmp_gridsY[tmpind];\n                }\n                // add the connection point (crossX, crossY)\n                gridsX[cnt] = crossX;\n                gridsY[cnt] = crossY;\n                cnt++;\n\n                curX     = crossX;\n                curY     = crossY;\n                cnt_n1n2 = cnt;\n\n                // change the tree structure according to the new routing for\n                // the tree edge find E1 and E2, and the endpoints of the edges\n                // they are on\n                E1x = gridsX[0];\n                E1y = gridsY[0];\n                E2x = gridsX[cnt_n1n2 - 1];\n                E2y = gridsY[cnt_n1n2 - 1];\n\n                edge_n1n2 = edgeID;\n\n                // (1) consider subtree1\n                if (n1 >= deg && (E1x != n1x || E1y != n1y))\n                // n1 is not a pin and E1!=n1, then make change to subtree1,\n                // otherwise, no change to subtree1\n                {\n        
          // find the endpoints of the edge E1 is on\n                  endpt1 = treeedges[corrEdge[E1y][E1x]].n1;\n                  endpt2 = treeedges[corrEdge[E1y][E1x]].n2;\n\n                  // find A1, A2 and edge_n1A1, edge_n1A2\n                  if (treenodes[n1].nbr[0] == n2) {\n                    A1        = treenodes[n1].nbr[1];\n                    A2        = treenodes[n1].nbr[2];\n                    edge_n1A1 = treenodes[n1].edge[1];\n                    edge_n1A2 = treenodes[n1].edge[2];\n                  } else if (treenodes[n1].nbr[1] == n2) {\n                    A1        = treenodes[n1].nbr[0];\n                    A2        = treenodes[n1].nbr[2];\n                    edge_n1A1 = treenodes[n1].edge[0];\n                    edge_n1A2 = treenodes[n1].edge[2];\n                  } else {\n                    A1        = treenodes[n1].nbr[0];\n                    A2        = treenodes[n1].nbr[1];\n                    edge_n1A1 = treenodes[n1].edge[0];\n                    edge_n1A2 = treenodes[n1].edge[1];\n                  }\n\n                  if (endpt1 == n1 ||\n                      endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n                  {\n                    // if E1 is on (n1, A2), switch A1 and A2 so that E1 is\n                    // always on (n1, A1)\n                    if (endpt1 == A2 || endpt2 == A2) {\n                      tmpi      = A1;\n                      A1        = A2;\n                      A2        = tmpi;\n                      tmpi      = edge_n1A1;\n                      edge_n1A1 = edge_n1A2;\n                      edge_n1A2 = tmpi;\n                    }\n\n                    // update route for edge (n1, A1), (n1, A2)\n                    updateRouteType1(treenodes, n1, A1, A2, E1x, E1y, treeedges,\n                                     edge_n1A1, edge_n1A2);\n                    // update position for n1\n                    treenodes[n1].x = E1x;\n                    treenodes[n1].y = E1y;\n    
              }    // if E1 is on (n1, A1) or (n1, A2)\n                  else // E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n                  {\n                    C1        = endpt1;\n                    C2        = endpt2;\n                    edge_C1C2 = corrEdge[E1y][E1x];\n\n                    // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n                    updateRouteType2(treenodes, n1, A1, A2, C1, C2, E1x, E1y,\n                                     treeedges, edge_n1A1, edge_n1A2,\n                                     edge_C1C2);\n                    // update position for n1\n                    treenodes[n1].x = E1x;\n                    treenodes[n1].y = E1y;\n                    // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2),\n                    // (C1, C2)->(A1, A2)\n                    edge_n1C1               = edge_n1A1;\n                    treeedges[edge_n1C1].n1 = C1;\n                    treeedges[edge_n1C1].n2 = n1;\n                    edge_n1C2               = edge_n1A2;\n                    treeedges[edge_n1C2].n1 = n1;\n                    treeedges[edge_n1C2].n2 = C2;\n                    edge_A1A2               = edge_C1C2;\n                    treeedges[edge_A1A2].n1 = A1;\n                    treeedges[edge_A1A2].n2 = A2;\n                    // update nbr and edge for 5 nodes n1, A1, A2, C1, C2\n                    // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n                    treenodes[n1].nbr[0]  = n2;\n                    treenodes[n1].edge[0] = edge_n1n2;\n                    treenodes[n1].nbr[1]  = C1;\n                    treenodes[n1].edge[1] = edge_n1C1;\n                    treenodes[n1].nbr[2]  = C2;\n                    treenodes[n1].edge[2] = edge_n1C2;\n                    // A1's nbr n1->A2\n                    for (i = 0; i < 3; i++) {\n                      if (treenodes[A1].nbr[i] == n1) {\n                        treenodes[A1].nbr[i]  = A2;\n                        treenodes[A1].edge[i] 
= edge_A1A2;\n                        break;\n                      }\n                    }\n                    // A2's nbr n1->A1\n                    for (i = 0; i < 3; i++) {\n                      if (treenodes[A2].nbr[i] == n1) {\n                        treenodes[A2].nbr[i]  = A1;\n                        treenodes[A2].edge[i] = edge_A1A2;\n                        break;\n                      }\n                    }\n                    // C1's nbr C2->n1\n                    for (i = 0; i < 3; i++) {\n                      if (treenodes[C1].nbr[i] == C2) {\n                        treenodes[C1].nbr[i]  = n1;\n                        treenodes[C1].edge[i] = edge_n1C1;\n                        break;\n                      }\n                    }\n                    // C2's nbr C1->n1\n                    for (i = 0; i < 3; i++) {\n                      if (treenodes[C2].nbr[i] == C1) {\n                        treenodes[C2].nbr[i]  = n1;\n                        treenodes[C2].edge[i] = edge_n1C2;\n                        break;\n                      }\n                    }\n\n                  } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n                }   // n1 is not a pin and E1!=n1\n\n                // (2) consider subtree2\n\n                if (n2 >= deg && (E2x != n2x || E2y != n2y))\n                // n2 is not a pin and E2!=n2, then make change to subtree2,\n                // otherwise, no change to subtree2\n                {\n                  // find the endpoints of the edge E1 is on\n                  endpt1 = treeedges[corrEdge[E2y][E2x]].n1;\n                  endpt2 = treeedges[corrEdge[E2y][E2x]].n2;\n\n                  // find B1, B2\n                  if (treenodes[n2].nbr[0] == n1) {\n                    B1        = treenodes[n2].nbr[1];\n                    B2        = treenodes[n2].nbr[2];\n                    edge_n2B1 = treenodes[n2].edge[1];\n                    edge_n2B2 = treenodes[n2].edge[2];\n       
           } else if (treenodes[n2].nbr[1] == n1) {\n                    B1        = treenodes[n2].nbr[0];\n                    B2        = treenodes[n2].nbr[2];\n                    edge_n2B1 = treenodes[n2].edge[0];\n                    edge_n2B2 = treenodes[n2].edge[2];\n                  } else {\n                    B1        = treenodes[n2].nbr[0];\n                    B2        = treenodes[n2].nbr[1];\n                    edge_n2B1 = treenodes[n2].edge[0];\n                    edge_n2B2 = treenodes[n2].edge[1];\n                  }\n\n                  if (endpt1 == n2 ||\n                      endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n                  {\n                    // if E2 is on (n2, B2), switch B1 and B2 so that E2 is\n                    // always on (n2, B1)\n                    if (endpt1 == B2 || endpt2 == B2) {\n                      tmpi      = B1;\n                      B1        = B2;\n                      B2        = tmpi;\n                      tmpi      = edge_n2B1;\n                      edge_n2B1 = edge_n2B2;\n                      edge_n2B2 = tmpi;\n                    }\n\n                    // update route for edge (n2, B1), (n2, B2)\n                    updateRouteType1(treenodes, n2, B1, B2, E2x, E2y, treeedges,\n                                     edge_n2B1, edge_n2B2);\n\n                    // update position for n2\n                    treenodes[n2].x = E2x;\n                    treenodes[n2].y = E2y;\n                  }    // if E2 is on (n2, B1) or (n2, B2)\n                  else // E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n                  {\n                    D1        = endpt1;\n                    D2        = endpt2;\n                    edge_D1D2 = corrEdge[E2y][E2x];\n\n                    // update route for edge (n2, D1), (n2, D2) and (B1, B2)\n                    updateRouteType2(treenodes, n2, B1, B2, D1, D2, E2x, E2y,\n                                     treeedges, edge_n2B1, 
edge_n2B2,\n                                     edge_D1D2);\n                    // update position for n2\n                    treenodes[n2].x = E2x;\n                    treenodes[n2].y = E2y;\n                    // update 3 edges (n2, B1)->(D1, n2), (n2, B2)->(n2, D2),\n                    // (D1, D2)->(B1, B2)\n                    edge_n2D1               = edge_n2B1;\n                    treeedges[edge_n2D1].n1 = D1;\n                    treeedges[edge_n2D1].n2 = n2;\n                    edge_n2D2               = edge_n2B2;\n                    treeedges[edge_n2D2].n1 = n2;\n                    treeedges[edge_n2D2].n2 = D2;\n                    edge_B1B2               = edge_D1D2;\n                    treeedges[edge_B1B2].n1 = B1;\n                    treeedges[edge_B1B2].n2 = B2;\n                    // update nbr and edge for 5 nodes n2, B1, B2, D1, D2\n                    // n1's nbr (n1, B1, B2)->(n1, D1, D2)\n                    treenodes[n2].nbr[0]  = n1;\n                    treenodes[n2].edge[0] = edge_n1n2;\n                    treenodes[n2].nbr[1]  = D1;\n                    treenodes[n2].edge[1] = edge_n2D1;\n                    treenodes[n2].nbr[2]  = D2;\n                    treenodes[n2].edge[2] = edge_n2D2;\n                    // B1's nbr n2->B2\n                    for (i = 0; i < 3; i++) {\n                      if (treenodes[B1].nbr[i] == n2) {\n                        treenodes[B1].nbr[i]  = B2;\n                        treenodes[B1].edge[i] = edge_B1B2;\n                        break;\n                      }\n                    }\n                    // B2's nbr n2->B1\n                    for (i = 0; i < 3; i++) {\n                      if (treenodes[B2].nbr[i] == n2) {\n                        treenodes[B2].nbr[i]  = B1;\n                        treenodes[B2].edge[i] = edge_B1B2;\n                        break;\n                      }\n                    }\n                    // D1's nbr D2->n2\n                    for (i = 0; i < 
3; i++) {\n                      if (treenodes[D1].nbr[i] == D2) {\n                        treenodes[D1].nbr[i]  = n2;\n                        treenodes[D1].edge[i] = edge_n2D1;\n                        break;\n                      }\n                    }\n                    // D2's nbr D1->n2\n                    for (i = 0; i < 3; i++) {\n                      if (treenodes[D2].nbr[i] == D1) {\n                        treenodes[D2].nbr[i]  = n2;\n                        treenodes[D2].edge[i] = edge_n2D2;\n                        break;\n                      }\n                    }\n                  } // else E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n                }   // n2 is not a pin and E2!=n2\n\n                // update route for edge (n1, n2) and edge usage\n\n                // printf(\"update route? %d %d\\n\", netID, num_edges);\n                if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n                  free(treeedges[edge_n1n2].route.gridsX);\n                  free(treeedges[edge_n1n2].route.gridsY);\n                }\n                treeedges[edge_n1n2].route.gridsX =\n                    (short*)calloc(cnt_n1n2, sizeof(short));\n                treeedges[edge_n1n2].route.gridsY =\n                    (short*)calloc(cnt_n1n2, sizeof(short));\n                treeedges[edge_n1n2].route.type     = MAZEROUTE;\n                treeedges[edge_n1n2].route.routelen = cnt_n1n2 - 1;\n                treeedges[edge_n1n2].len = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n\n                for (i = 0; i < cnt_n1n2; i++) {\n                  // printf(\"cnt_n1n2: %d\\n\", cnt_n1n2);\n                  treeedges[edge_n1n2].route.gridsX[i] = gridsX[i];\n                  treeedges[edge_n1n2].route.gridsY[i] = gridsY[i];\n                }\n\n                // update edge usage\n\n                for (i = 0; i < cnt_n1n2 - 1; i++) {\n                  if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n                  {\n                  
  min_y = min(gridsY[i], gridsY[i + 1]);\n                    // v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n                    // galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage,\n                    // (short unsigned)1);\n                    v_edges[min_y * xGrid + gridsX[i]].usage.fetch_add(\n                        (short unsigned)1, std::memory_order_relaxed);\n                  } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                  {\n                    min_x = min(gridsX[i], gridsX[i + 1]);\n                    // h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n                    // galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n                    // (short unsigned)1);\n                    h_edges[gridsY[i] * (xGrid - 1) + min_x].usage.fetch_add(\n                        (short unsigned)1, std::memory_order_relaxed);\n                  }\n                }\n                /*if(LOCK){\n                    for(i=0; i<cnt_n1n2-1; i++)\n                    {\n                        if(gridsX[i]==gridsX[i+1]) // a vertical edge\n                        {\n                            min_y = min(gridsY[i], gridsY[i+1]);\n                            v_edges[min_y*xGrid+gridsX[i]].releaseLock();\n                        }\n                        else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                        {\n                            min_x = min(gridsX[i], gridsX[i+1]);\n                            h_edges[gridsY[i]*(xGrid-1)+min_x].releaseLock();\n                        }\n                    }\n                }*/\n                if (checkRoute2DTree(netID)) {\n                  reInitTree(netID);\n                  return;\n                }\n              } // congested route\n\n            } // maze routing\n          }   // loop edgeID\n        }\n      },\n      galois::steal(),\n      // galois::chunk_size<32>(),\n      galois::loopname(\"maze routing block\")); // galois::do_all\n  
//}, \"mazeroute vtune function\");\n  free(h_costTable);\n  free(v_costTable);\n}\n\nint getOverflow2Dmaze(int* maxOverflow, int* tUsage) {\n  int i, j, grid, overflow, max_overflow, H_overflow, max_H_overflow,\n      V_overflow, max_V_overflow, numedges = 0;\n  int total_usage, total_cap;\n\n  // get overflow\n  overflow = max_overflow = H_overflow = max_H_overflow = V_overflow =\n      max_V_overflow                                    = 0;\n\n  total_usage = 0;\n  total_cap   = 0;\n\n  //    fprintf(fph, \"Horizontal Congestion\\n\");\n\n  // int ripup_same = 0;\n  // int ripup_diff = 0;\n\n  for (i = 0; i < yGrid; i++) {\n    for (j = 0; j < xGrid - 1; j++) {\n      grid = i * (xGrid - 1) + j;\n      total_usage += h_edges[grid].usage;\n      overflow = h_edges[grid].usage - h_edges[grid].cap;\n      total_cap += h_edges[grid].cap;\n      if (overflow > 0) {\n        H_overflow += overflow;\n        max_H_overflow = max(max_H_overflow, overflow);\n        numedges++;\n        /*if(!h_edges[grid].ripups_cur_round)\n        {\n            h_edges[grid].max_have_rippedups = h_edges[grid].max_ripups.load();\n        }*/\n      }\n      // h_edges[grid].ripups_cur_round = false;\n    }\n  }\n  //    fprintf(fpv, \"\\nVertical Congestion\\n\");\n  for (i = 0; i < yGrid - 1; i++) {\n    for (j = 0; j < xGrid; j++) {\n      grid = i * xGrid + j;\n      total_usage += v_edges[grid].usage;\n      overflow = v_edges[grid].usage - v_edges[grid].cap;\n      total_cap += v_edges[grid].cap;\n      if (overflow > 0) {\n        V_overflow += overflow;\n        max_V_overflow = max(max_V_overflow, overflow);\n        numedges++;\n        /*if(!v_edges[grid].ripups_cur_round)\n        {\n            v_edges[grid].max_have_rippedups = v_edges[grid].max_ripups.load();\n        }*/\n      }\n\n      // v_edges[grid].ripups_cur_round = false;\n    }\n  }\n  /*bitmap_image image(dimx,dimy);\n  for(i=0; i<yGrid-1; i++)\n  {\n      for(j=0; j<xGrid - 1; j++)\n      {\n          int 
gridx = i*(xGrid-1)+j;\n          int gridy = i*xGrid+j;\n\n          int h_overflow = h_edges[gridx].usage - h_edges[gridx].cap;\n          h_overflow = (h_overflow > 0)? h_overflow : 0;\n          int v_overflow = v_edges[gridy].usage - v_edges[gridy].cap;\n          v_overflow = (v_overflow > 0)? v_overflow : 0;\n\n          overflow = h_overflow + v_overflow;\n          if(overflow > 0)\n          {\n              float red = (overflow >= 1)? 0 : (255 - ((float)overflow/1) *\n  255); int red_int = (int)red; image.set_pixel(j,i,255,(unsigned\n  char)red_int,(unsigned char)red_int);\n          }\n          else\n          {\n              image.set_pixel(j,i,255,255,255);\n          }\n\n      }\n  }\n  std::string file_name = \"route\" + to_string(PRINT_HEAT) + \".bmp\";\n  image.save_image(file_name);\n  PRINT_HEAT++;*/\n\n  max_overflow  = max(max_H_overflow, max_V_overflow);\n  totalOverflow = H_overflow + V_overflow;\n  *maxOverflow  = max_overflow;\n\n  printf(\"total Usage   : %d\\n\", (int)total_usage);\n  printf(\"Max H Overflow: %d\\n\", max_H_overflow);\n  printf(\"Max V Overflow: %d\\n\", max_V_overflow);\n  printf(\"Max Overflow  : %d\\n\", max_overflow);\n  printf(\"Num Overflow e: %d\\n\", numedges);\n  printf(\"H   Overflow  : %d\\n\", H_overflow);\n  printf(\"V   Overflow  : %d\\n\", V_overflow);\n  printf(\"Final Overflow: %d\\n\\n\", totalOverflow);\n\n  *tUsage = total_usage;\n\n  if (total_usage > 800000) {\n    ahTH = 30;\n  } else {\n    ahTH = 20;\n  }\n\n  return (totalOverflow);\n}\n\nvoid checkUsageCorrectness() {\n  int* vedge_usage = new int[xGrid * (yGrid - 1)];\n  int* hedge_usage = new int[(xGrid - 1) * yGrid];\n  memset(vedge_usage, 0, xGrid * (yGrid - 1) * sizeof(int));\n  memset(hedge_usage, 0, (xGrid - 1) * yGrid * sizeof(int));\n\n  for (int netID = 0; netID < numValidNets; netID++) {\n    // maze routing for multi-source, multi-destination\n\n    int deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, num_edges;\n\n    TreeEdge 
*treeedges, *treeedge;\n    TreeNode* treenodes;\n\n    deg = sttrees[netID].deg;\n\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    // loop for all the tree edges (2*deg-3)\n    num_edges = 2 * deg - 3;\n\n    for (edgeID = 0; edgeID < num_edges; edgeID++) {\n      treeedge = &(treeedges[edgeID]);\n\n      n1            = treeedge->n1;\n      n2            = treeedge->n2;\n      n1x           = treenodes[n1].x;\n      n1y           = treenodes[n1].y;\n      n2x           = treenodes[n2].x;\n      n2y           = treenodes[n2].y;\n      treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n      if (treeedge->len > 0) // only route the non-degraded edges (len>0)\n      {\n        if (treeedge->route.type == MAZEROUTE) {\n          short* gridsX = treeedge->route.gridsX;\n          short* gridsY = treeedge->route.gridsY;\n\n          // std::cout << \"net: \" << netID << \" route lenth: \" <<\n          // treeedge->route.routelen << std::endl;\n\n          for (int i = 0; i < treeedge->route.routelen; i++) {\n            // std::cout << \"path: \" << gridsX[i] << \",\" << gridsY[i] << \"<->\"\n            // << gridsX[i + 1] << \",\" << gridsY[i + 1] << std::endl;\n            if (gridsX[i] == gridsX[i + 1]) {\n              int ymin = min(gridsY[i], gridsY[i + 1]);\n              vedge_usage[ymin * xGrid + gridsX[i]]++;\n            } else if (gridsY[i] == gridsY[i + 1]) {\n              int xmin = min(gridsX[i], gridsX[i + 1]);\n              hedge_usage[gridsY[i] * (xGrid - 1) + xmin]++;\n            } else {\n              std::cout << \"usage correctness: net not connected!\" << std::endl;\n            }\n          }\n        } else {\n          std::cout << \"usage correctness: not maze route!\" << std::endl;\n          exit(1);\n        }\n      }\n    }\n  }\n  int same = 0;\n  int diff = 0;\n  for (int i = 0; i < yGrid; i++) {\n    for (int j = 0; j < xGrid - 1; j++) {\n      int grid = i * (xGrid - 1) + j;\n      if 
(hedge_usage[grid] == h_edges[grid].usage) {\n        same++;\n      } else {\n        diff++;\n        // std::cout << \"h edge diff: \" << j << \", \" << i << \" check: \" <<\n        // hedge_usage[grid] << \" actual: \" << h_edges[grid].usage << std::endl;\n      }\n    }\n  }\n\n  for (int i = 0; i < yGrid - 1; i++) {\n    for (int j = 0; j < xGrid; j++) {\n      int grid = i * xGrid + j;\n      if (vedge_usage[grid] == v_edges[grid].usage) {\n        same++;\n      } else {\n        diff++;\n        // std::cout << \"v edge diff: \" << j << \", \" << i << \" check: \" <<\n        // vedge_usage[grid] << \" actual: \" << v_edges[grid].usage << std::endl;\n      }\n    }\n  }\n\n  std::cout << \"same: \" << same << \" diff: \" << diff << std::endl;\n\n  delete[] vedge_usage;\n  delete[] hedge_usage;\n}\n\nint getOverflow2D(int* maxOverflow) {\n  int i, j, grid, overflow, max_overflow, H_overflow, max_H_overflow,\n      V_overflow, max_V_overflow, numedges;\n  int total_usage, total_cap, hCap, vCap;\n\n  // get overflow\n  overflow = max_overflow = H_overflow = max_H_overflow = V_overflow =\n      max_V_overflow                                    = 0;\n  hCap = vCap = numedges = 0;\n\n  total_usage = 0;\n  total_cap   = 0;\n  //    fprintf(fph, \"Horizontal Congestion\\n\");\n  for (i = 0; i < yGrid; i++) {\n    for (j = 0; j < xGrid - 1; j++) {\n      grid = i * (xGrid - 1) + j;\n      total_usage += h_edges[grid].est_usage;\n      overflow = h_edges[grid].est_usage - h_edges[grid].cap;\n      total_cap += h_edges[grid].cap;\n      hCap += h_edges[grid].cap;\n      if (overflow > 0) {\n        H_overflow += overflow;\n        max_H_overflow = max(max_H_overflow, overflow);\n        numedges++;\n      }\n    }\n  }\n  //    fprintf(fpv, \"\\nVertical Congestion\\n\");\n  for (i = 0; i < yGrid - 1; i++) {\n    for (j = 0; j < xGrid; j++) {\n      grid = i * xGrid + j;\n      total_usage += v_edges[grid].est_usage;\n      overflow = v_edges[grid].est_usage - 
v_edges[grid].cap;\n      total_cap += v_edges[grid].cap;\n      vCap += v_edges[grid].cap;\n      if (overflow > 0) {\n        V_overflow += overflow;\n        max_V_overflow = max(max_V_overflow, overflow);\n        numedges++;\n      }\n    }\n  }\n\n  max_overflow  = max(max_H_overflow, max_V_overflow);\n  totalOverflow = H_overflow + V_overflow;\n  *maxOverflow  = max_overflow;\n\n  if (total_usage > 800000) {\n    ahTH = 30;\n  } else {\n    ahTH = 20;\n  }\n\n  printf(\"total hCap    : %d\\n\", hCap);\n  printf(\"total vCap    : %d\\n\", vCap);\n  printf(\"total Usage   : %d\\n\", (int)total_usage);\n  printf(\"Max H Overflow: %d\\n\", max_H_overflow);\n  printf(\"Max V Overflow: %d\\n\", max_V_overflow);\n  printf(\"Max Overflow  : %d\\n\", max_overflow);\n  printf(\"Num Overflow e: %d\\n\", numedges);\n  printf(\"H   Overflow  : %d\\n\", H_overflow);\n  printf(\"V   Overflow  : %d\\n\", V_overflow);\n  printf(\"Final Overflow: %d\\n\\n\", totalOverflow);\n\n  return (totalOverflow);\n}\n\nint getOverflow3D(void) {\n  int i, j, k, grid, overflow, max_overflow, H_overflow, max_H_overflow,\n      V_overflow, max_V_overflow;\n  int cap;\n  int total_usage;\n\n  // get overflow\n  overflow = max_overflow = H_overflow = max_H_overflow = V_overflow =\n      max_V_overflow                                    = 0;\n\n  total_usage = 0;\n  cap         = 0;\n  //    fprintf(fph, \"Horizontal Congestion\\n\");\n\n  for (k = 0; k < numLayers; k++) {\n    for (i = 0; i < yGrid; i++) {\n      for (j = 0; j < xGrid - 1; j++) {\n        grid = i * (xGrid - 1) + j + k * (xGrid - 1) * yGrid;\n        total_usage += h_edges3D[grid].usage;\n        overflow = h_edges3D[grid].usage - h_edges3D[grid].cap;\n        cap += h_edges3D[grid].cap;\n\n        if (overflow > 0) {\n          H_overflow += overflow;\n          max_H_overflow = max(max_H_overflow, overflow);\n        }\n      }\n    }\n    for (i = 0; i < yGrid - 1; i++) {\n      for (j = 0; j < xGrid; j++) {\n        grid 
= i * xGrid + j + k * xGrid * (yGrid - 1);\n        total_usage += v_edges3D[grid].usage;\n        overflow = v_edges3D[grid].usage - v_edges3D[grid].cap;\n        cap += v_edges3D[grid].cap;\n\n        if (overflow > 0) {\n          V_overflow += overflow;\n          max_V_overflow = max(max_V_overflow, overflow);\n        }\n      }\n    }\n  }\n\n  max_overflow  = max(max_H_overflow, max_V_overflow);\n  totalOverflow = H_overflow + V_overflow;\n\n  printf(\"total Usage   : %d\\n\", total_usage);\n  printf(\"Total Capacity: %d\\n\", cap);\n  printf(\"Max H Overflow: %d\\n\", max_H_overflow);\n  printf(\"Max V Overflow: %d\\n\", max_V_overflow);\n  printf(\"Max Overflow  : %d\\n\", max_overflow);\n  printf(\"H   Overflow  : %d\\n\", H_overflow);\n  printf(\"V   Overflow  : %d\\n\", V_overflow);\n  printf(\"Final Overflow: %d\\n\\n\", totalOverflow);\n\n  return (total_usage);\n}\n\nint unsolved;\n\n/*void initialCongestionHistory(int round)\n{\n    int i, j, grid;\n\n    for(i=0; i<yGrid; i++)\n    {\n        for(j=0; j<xGrid-1; j++)\n        {\n            grid = i*(xGrid-1)+j;\n            h_edges[grid].est_usage -=\n((float)h_edges[grid].usage/h_edges[grid].cap);\n\n        }\n    }\n\n    for(i=0; i<yGrid-1; i++)\n    {\n        for(j=0; j<xGrid; j++)\n        {\n            grid = i*xGrid+j;\n            v_edges[grid].est_usage -=\n((float)v_edges[grid].usage/v_edges[grid].cap);\n\n        }\n    }\n\n}\n\nvoid reduceCongestionHistory(int round)\n{\n    int i, j, grid;\n\n    for(i=0; i<yGrid; i++)\n    {\n        for(j=0; j<xGrid-1; j++)\n        {\n            grid = i*(xGrid-1)+j;\n            h_edges[grid].est_usage -=\n0.2*((float)h_edges[grid].usage/h_edges[grid].cap);\n        }\n    }\n\n    for(i=0; i<yGrid-1; i++)\n    {\n        for(j=0; j<xGrid; j++)\n        {\n            grid = i*xGrid+j;\n            v_edges[grid].est_usage -=\n0.2*((float)v_edges[grid].usage/v_edges[grid].cap);\n        }\n    }\n\n}*/\n\nvoid InitEstUsage() {\n  int i, j, 
grid;\n  for (i = 0; i < yGrid; i++) {\n    for (j = 0; j < xGrid - 1; j++) {\n      grid                    = i * (xGrid - 1) + j;\n      h_edges[grid].est_usage = 0;\n    }\n  }\n  //    fprintf(fpv, \"\\nVertical Congestion\\n\");\n  for (i = 0; i < yGrid - 1; i++) {\n    for (j = 0; j < xGrid; j++) {\n      grid                    = i * xGrid + j;\n      v_edges[grid].est_usage = 0;\n    }\n  }\n}\n\nvoid str_accu(int rnd) {\n  int i, j, grid, overflow;\n  for (i = 0; i < yGrid; i++) {\n    for (j = 0; j < xGrid - 1; j++) {\n      grid     = i * (xGrid - 1) + j;\n      overflow = h_edges[grid].usage - h_edges[grid].cap;\n      if (overflow > 0 || h_edges[grid].congCNT > rnd) {\n        h_edges[grid].last_usage += h_edges[grid].congCNT * overflow / 2;\n      }\n    }\n  }\n  //    fprintf(fpv, \"\\nVertical Congestion\\n\");\n  for (i = 0; i < yGrid - 1; i++) {\n    for (j = 0; j < xGrid; j++) {\n      grid     = i * xGrid + j;\n      overflow = v_edges[grid].usage - v_edges[grid].cap;\n      if (overflow > 0 || v_edges[grid].congCNT > rnd) {\n        v_edges[grid].last_usage += v_edges[grid].congCNT * overflow / 2;\n      }\n    }\n  }\n}\n\nvoid InitLastUsage(int upType) {\n  int i, j, grid;\n  for (i = 0; i < yGrid; i++) {\n    for (j = 0; j < xGrid - 1; j++) {\n      grid                     = i * (xGrid - 1) + j;\n      h_edges[grid].last_usage = 0;\n    }\n  }\n  //    fprintf(fpv, \"\\nVertical Congestion\\n\");\n  for (i = 0; i < yGrid - 1; i++) {\n    for (j = 0; j < xGrid; j++) {\n      grid                     = i * xGrid + j;\n      v_edges[grid].last_usage = 0;\n    }\n  }\n\n  if (upType == 1) {\n    for (i = 0; i < yGrid; i++) {\n      for (j = 0; j < xGrid - 1; j++) {\n        grid                  = i * (xGrid - 1) + j;\n        h_edges[grid].congCNT = 0;\n      }\n    }\n    //    fprintf(fpv, \"\\nVertical Congestion\\n\");\n    for (i = 0; i < yGrid - 1; i++) {\n      for (j = 0; j < xGrid; j++) {\n        grid                  = i * xGrid + 
j;\n        v_edges[grid].congCNT = 0;\n      }\n    }\n  } else if (upType == 2) {\n\n    for (i = 0; i < yGrid; i++) {\n      for (j = 0; j < xGrid - 1; j++) {\n\n        grid = i * (xGrid - 1) + j;\n        // if (overflow > 0)\n        h_edges[grid].last_usage = h_edges[grid].last_usage * 0.2;\n      }\n    }\n    //    fprintf(fpv, \"\\nVertical Congestion\\n\");\n    for (i = 0; i < yGrid - 1; i++) {\n      for (j = 0; j < xGrid; j++) {\n\n        grid = i * xGrid + j;\n        //\tif (overflow > 0)\n        v_edges[grid].last_usage = v_edges[grid].last_usage * 0.2;\n      }\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/maze3D.h",
    "content": "#ifndef _MAZE3D_H_\n#define _MAZE3D_H_\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <math.h>\n#include \"DataType.h\"\n#include \"flute.h\"\n#include \"DataProc.h\"\n#include \"route.h\"\n#include \"RipUp.h\"\n#include <time.h>\n\n#define PARENT(i) (i - 1) / 2\n//#define PARENT(i) ((i-1)>>1)\n#define LEFT(i) 2 * i + 1\n#define RIGHT(i) 2 * i + 2\n\ntypedef struct {\n  int x; // x position\n  int y; // y position\n  int l;\n} Pos3D;\n// non recursive version of heapify-\nstatic void heapify3D(int** array, int heapSize, int i) {\n  int l, r, smallest;\n  int* tmp;\n  Bool STOP = FALSE;\n\n  tmp = array[i];\n  do {\n\n    l = LEFT(i);\n    r = RIGHT(i);\n\n    if (l < heapSize && *(array[l]) < *tmp) {\n      smallest = l;\n      if (r < heapSize && *(array[r]) < *(array[l]))\n        smallest = r;\n    } else {\n      smallest = i;\n      if (r < heapSize && *(array[r]) < *tmp)\n        smallest = r;\n    }\n    if (smallest != i) {\n      array[i] = array[smallest];\n      i        = smallest;\n    } else {\n      array[i] = tmp;\n      STOP     = TRUE;\n    }\n  } while (!STOP);\n}\n\n// build heap for an list of grid\n/*static void buildHeap3D(int **array, int arrayLen)\n{\n    int i;\n\n    for (i=arrayLen/2-1; i>=0; i--)\n        heapify3D(array, arrayLen, i);\n}*/\n\nstatic void updateHeap3D(int** array, int i) {\n\n  int parent;\n  int* tmpi;\n\n  //\tprintf(\"heap being updated\");\n  //\tfflush(stdout);\n\n  tmpi = array[i];\n  while (i > 0 && *(array[PARENT(i)]) > *tmpi) {\n    parent   = PARENT(i);\n    array[i] = array[parent];\n    i        = parent;\n  }\n  array[i] = tmpi;\n\n  //\tprintf(\"heap updated succedded\");\n  //\tfflush(stdout);\n}\n\n// extract the entry with minimum distance from Priority queue\nstatic void extractMin3D(int** array, int arrayLen) {\n\n  if (arrayLen < 1)\n    printf(\"Error: heap underflow\\n\");\n  array[0] = array[arrayLen - 1];\n  heapify3D(array, arrayLen - 1, 0);\n}\n\nvoid setupHeap3D(int 
netID, int edgeID, int* heapLen1, int* heapLen2,\n                 int regionX1, int regionX2, int regionY1, int regionY2) {\n  int i, j, l, d, numNodes, x1, y1, x2, y2, n1, n2, nt;\n  int nbr, nbrX, nbrY, cur, edge;\n  int x_grid, y_grid, l_grid, heapcnt;\n  int queuehead, queuetail;\n  int* heapQueue;\n  //   Bool *heapVisited;\n  TreeEdge* treeedges;\n  TreeNode* treenodes;\n  Route* route;\n  // Bool **inRegion;  // the flag to check weather the node is in the enlarged\n  // region for maze routing\n\n  // inRegion = (Bool**) calloc(yGrid, sizeof(Bool*));\n  // for(i=0; i<yGrid; i++)\n  // inRegion[i] = (Bool*) calloc(xGrid, sizeof(Bool));\n\n  /*\tfor ( l = 0; l < numLayers; l++) {\n          for(i=regionY1; i<=regionY2; i++) {\n              for(j=regionX1; j<=regionX2; j++) {\n                  corrEdge3D[l][i][j] = BIG_INT;\n              }\n          }\n      }*/\n\n  // return;\n\n  treeedges = sttrees[netID].edges;\n  treenodes = sttrees[netID].nodes;\n  d         = sttrees[netID].deg;\n\n  n1 = treeedges[edgeID].n1;\n  n2 = treeedges[edgeID].n2;\n  x1 = treenodes[n1].x;\n  y1 = treenodes[n1].y;\n  x2 = treenodes[n2].x;\n  y2 = treenodes[n2].y;\n\n  if (d == 2) // 2-pin net\n  {\n    //\t\tprintf(\"2pinnet l1 %d, l2 %d\\n\", l1, l2);\n    d13D[0][y1][x1]         = 0;\n    directions3D[0][y1][x1] = ORIGIN;\n    heap13D[0]              = &(d13D[0][y1][x1]);\n    *heapLen1               = 1;\n    d23D[0][y2][x2]         = 0;\n    directions3D[0][y2][x2] = ORIGIN;\n    heap23D[0]              = &(d23D[0][y2][x2]);\n    *heapLen2               = 1;\n  } else // net with more than 2 pins\n  {\n    heapQueue = (int*)calloc(MAXNETDEG, sizeof(int));\n\n    for (i = regionY1; i <= regionY2; i++) {\n      for (j = regionX1; j <= regionX2; j++) {\n        inRegion[i][j] = TRUE;\n      }\n    }\n\n    numNodes = 2 * d - 2;\n\n    for (i = 0; i < numNodes; i++)\n      heapVisited[i] = FALSE;\n\n    // find all the grids on tree edges in subtree t1 (connecting to n1) 
and put\n    // them into heap13D\n    if (n1 < d) // n1 is a Pin node\n    {\n      //\t\t\tgetLayerRange(treenodes, treeedges ,n1, edgeID, &topL,\n      //&botL);\n\n      // just need to put n1 itself into heap13D\n      heapcnt = 0;\n\n      nt = treenodes[n1].stackAlias;\n\n      for (l = treenodes[nt].botL; l <= treenodes[nt].topL; l++) {\n        d13D[l][y1][x1] = 0;\n        // cout << heap13D << \" \" << heapcnt << \" \" << d13D[l][y1][x1] << \" \" <<\n        // &(d13D[l][y1][x1]) << endl;\n        heap13D[heapcnt]        = &(d13D[l][y1][x1]);\n        directions3D[l][y1][x1] = ORIGIN;\n        heapVisited[n1]         = TRUE;\n        heapcnt++;\n      }\n      *heapLen1 = heapcnt;\n\n    } else // n1 is a Steiner node\n    {\n      heapcnt   = 0;\n      queuehead = queuetail = 0;\n\n      //\t\t\tgetLayerRange(treenodes, treeedges ,n1, edgeID, &topL,\n      //&botL);\n\n      nt = treenodes[n1].stackAlias;\n\n      // add n1 into heap13D\n      for (l = treenodes[nt].botL; l <= treenodes[nt].topL; l++) {\n        d13D[l][y1][x1]         = 0;\n        directions3D[l][y1][x1] = ORIGIN;\n        heap13D[heapcnt]        = &(d13D[l][y1][x1]);\n        heapVisited[n1]         = TRUE;\n        heapcnt++;\n      }\n\n      // add n1 into the heapQueue\n      heapQueue[queuetail] = n1;\n      queuetail++;\n\n      // loop to find all the edges in subtree t1\n      while (queuetail > queuehead) {\n        // get cur node from the queuehead\n        cur = heapQueue[queuehead];\n        queuehead++;\n        heapVisited[cur] = TRUE;\n        if (cur >= d) // cur node is a Steiner node\n        {\n          for (i = 0; i < 3; i++) {\n            nbr  = treenodes[cur].nbr[i];\n            edge = treenodes[cur].edge[i];\n            if (nbr != n2) // not n2\n            {\n              if (heapVisited[nbr] == FALSE) {\n\n                // put all the grids on the two adjacent tree edges into heap13D\n                if (treeedges[edge].route.routelen > 0) // not a 
degraded edge\n                {\n\n                  // put nbr into heap13D if in enlarged region\n                  if (inRegion[treenodes[nbr].y][treenodes[nbr].x]) {\n                    nbrX = treenodes[nbr].x;\n                    nbrY = treenodes[nbr].y;\n                    nt   = treenodes[nbr].stackAlias;\n                    for (l = treenodes[nt].botL; l <= treenodes[nt].topL; l++) {\n\n                      d13D[l][nbrY][nbrX]         = 0;\n                      directions3D[l][nbrY][nbrX] = ORIGIN;\n                      heap13D[heapcnt]            = &(d13D[l][nbrY][nbrX]);\n                      heapcnt++;\n                      corrEdge3D[l][nbrY][nbrX] = edge;\n                    }\n                  }\n\n                  // the coordinates of two end nodes of the edge\n\n                  route = &(treeedges[edge].route);\n                  if (route->type == MAZEROUTE) {\n                    for (j = 1; j < route->routelen;\n                         j++) // don't put edge_n1 and edge_n2 into heap13D\n                    {\n                      x_grid = route->gridsX[j];\n                      y_grid = route->gridsY[j];\n                      l_grid = route->gridsL[j];\n\n                      if (inRegion[y_grid][x_grid]) {\n                        d13D[l_grid][y_grid][x_grid] = 0;\n                        heap13D[heapcnt] = &(d13D[l_grid][y_grid][x_grid]);\n                        directions3D[l_grid][y_grid][x_grid] = ORIGIN;\n                        heapcnt++;\n                        corrEdge3D[l_grid][y_grid][x_grid] = edge;\n                      }\n                    }\n\n                  } // if MAZEROUTE\n                }   // if not a degraded edge (len>0)\n\n                // add the neighbor of cur node into heapQueue\n                heapQueue[queuetail] = nbr;\n                queuetail++;\n              }            // if the node is not heapVisited\n            }              // if nbr!=n2\n          }                // 
loop i (3 neigbors for cur node)\n        }                  // if cur node is a Steiner nodes\n      }                    // while heapQueue is not empty\n      *heapLen1 = heapcnt; // record the length of heap13D\n    }                      // else n1 is not a Pin node\n\n    // find all the grids on subtree t2 (connect to n2) and put them into\n    // heap23D find all the grids on tree edges in subtree t2 (connecting to n2)\n    // and put them into heap23D\n    if (n2 < d) // n2 is a Pin node\n    {\n\n      nt = treenodes[n2].stackAlias;\n      //*heapLen2 = 0;\n      heapcnt = 0;\n\n      for (l = treenodes[nt].botL; l <= treenodes[nt].topL; l++) {\n        // just need to put n1 itself into heap13D\n        d23D[l][y2][x2]         = 0;\n        directions3D[l][y2][x2] = ORIGIN;\n        heap23D[heapcnt]        = &(d23D[l][y2][x2]);\n        heapVisited[n2]         = TRUE;\n        //*heapLen2 += 1;\n        heapcnt++;\n      }\n      *heapLen2 = heapcnt;\n    } else // n2 is a Steiner node\n    {\n      heapcnt   = 0;\n      queuehead = queuetail = 0;\n\n      nt = treenodes[n2].stackAlias;\n      // add n2 into heap23D\n      for (l = treenodes[nt].botL; l <= treenodes[nt].topL; l++) {\n        d23D[l][y2][x2]         = 0;\n        directions3D[l][y2][x2] = ORIGIN;\n        heap23D[heapcnt]        = &(d23D[l][y2][x2]);\n        heapcnt++;\n      }\n      heapVisited[n2] = TRUE;\n\n      // add n2 into the heapQueue\n      heapQueue[queuetail] = n2;\n      queuetail++;\n\n      // loop to find all the edges in subtree t2\n      while (queuetail > queuehead) {\n        // get cur node form queuehead\n        cur              = heapQueue[queuehead];\n        heapVisited[cur] = TRUE;\n        queuehead++;\n\n        if (cur >= d) // cur node is a Steiner node\n        {\n          for (i = 0; i < 3; i++) {\n            nbr  = treenodes[cur].nbr[i];\n            edge = treenodes[cur].edge[i];\n            if (nbr != n1) // not n1\n            {\n              if 
(heapVisited[nbr] == FALSE) {\n                // put all the grids on the two adjacent tree edges into heap23D\n                if (treeedges[edge].route.routelen > 0) // not a degraded edge\n                {\n\n                  // put nbr into heap23D\n                  if (inRegion[treenodes[nbr].y][treenodes[nbr].x]) {\n                    nbrX = treenodes[nbr].x;\n                    nbrY = treenodes[nbr].y;\n                    nt   = treenodes[nbr].stackAlias;\n                    for (l = treenodes[nt].botL; l <= treenodes[nt].topL; l++) {\n                      // nbrL = treenodes[nbr].l;\n\n                      d23D[l][nbrY][nbrX]         = 0;\n                      directions3D[l][nbrY][nbrX] = ORIGIN;\n                      heap23D[heapcnt]            = &(d23D[l][nbrY][nbrX]);\n                      heapcnt++;\n                      corrEdge3D[l][nbrY][nbrX] = edge;\n                    }\n                  }\n\n                  // the coordinates of two end nodes of the edge\n\n                  route = &(treeedges[edge].route);\n                  if (route->type == MAZEROUTE) {\n\n                    for (j = 1; j < route->routelen;\n                         j++) // don't put edge_n1 and edge_n2 into heap23D\n                    {\n                      x_grid = route->gridsX[j];\n                      y_grid = route->gridsY[j];\n                      l_grid = route->gridsL[j];\n                      if (inRegion[y_grid][x_grid]) {\n\n                        d23D[l_grid][y_grid][x_grid]         = 0;\n                        directions3D[l_grid][y_grid][x_grid] = ORIGIN;\n                        heap23D[heapcnt] = &(d23D[l_grid][y_grid][x_grid]);\n                        heapcnt++;\n\n                        corrEdge3D[l_grid][y_grid][x_grid] = edge;\n                      }\n                    }\n\n                  } // if MAZEROUTE\n                }   // if the edge is not degraded (len>0)\n\n                // add the neighbor of cur node 
into heapQueue\n                heapQueue[queuetail] = nbr;\n                queuetail++;\n              }            // if the node is not heapVisited\n            }              // if nbr!=n1\n          }                // loop i (3 neigbors for cur node)\n        }                  // if cur node is a Steiner nodes\n      }                    // while heapQueue is not empty\n      *heapLen2 = heapcnt; // record the length of heap23D\n    }                      // else n2 is not a Pin node\n\n    //\tprintf(\"queuetail %d, numnodes %d\\n\", queuetail, numNodes);\n    //\tfflush(stdout);\n    free(heapQueue);\n    //   free(heapVisited);\n\n    //\tprintf(\"there after\\n\", queuetail, numNodes);\n    //\tfflush(stdout);\n\n    for (i = regionY1; i <= regionY2; i++) {\n      for (j = regionX1; j <= regionX2; j++) {\n        inRegion[i][j] = FALSE;\n      }\n    }\n  } // net with more than two pins\n\n  // for(i=0; i<yGrid; i++) {\n  // free(inRegion[i]);\n  //}\n  // free(inRegion);\n}\n\nvoid newUpdateNodeLayers(TreeNode* treenodes, int edgeID, int n1, int lastL) {\n  int con;\n\n  con = treenodes[n1].conCNT;\n\n  treenodes[n1].heights[con] = lastL;\n  treenodes[n1].eID[con]     = edgeID;\n  treenodes[n1].conCNT++;\n  if (treenodes[n1].topL < lastL) {\n    treenodes[n1].topL = lastL;\n    treenodes[n1].hID  = edgeID;\n  }\n  if (treenodes[n1].botL > lastL) {\n    treenodes[n1].botL = lastL;\n    treenodes[n1].lID  = edgeID;\n  }\n}\n\nint copyGrids3D(TreeNode* treenodes, int n1, TreeEdge* treeedges, int edge_n1n2,\n                int gridsX_n1n2[], int gridsY_n1n2[], int gridsL_n1n2[]) {\n  int i, cnt;\n  int n1x, n1y, n1l = 0;\n\n  n1x = treenodes[n1].x;\n  n1y = treenodes[n1].y;\n  // n1l = treenodes[n1].l;\n  // n2l = treenodes[n2].l;\n\n  cnt = 0;\n  if (treeedges[edge_n1n2].n1 == n1) // n1 is the first node of (n1, n2)\n  {\n    if (treeedges[edge_n1n2].route.routelen > 0) {\n      for (i = 0; i <= treeedges[edge_n1n2].route.routelen; i++) {\n        
gridsX_n1n2[cnt] = treeedges[edge_n1n2].route.gridsX[i];\n        gridsY_n1n2[cnt] = treeedges[edge_n1n2].route.gridsY[i];\n        gridsL_n1n2[cnt] = treeedges[edge_n1n2].route.gridsL[i];\n        cnt++;\n      }\n    }    // MAZEROUTE\n    else // NOROUTE\n    {\n      fflush(stdout);\n      gridsX_n1n2[cnt] = n1x;\n      gridsY_n1n2[cnt] = n1y;\n      gridsL_n1n2[cnt] = n1l;\n      cnt++;\n    }\n  }    // if n1 is the first node of (n1, n2)\n  else // n2 is the first node of (n1, n2)\n  {\n    if (treeedges[edge_n1n2].route.routelen > 0) {\n      for (i = treeedges[edge_n1n2].route.routelen; i >= 0; i--) {\n        gridsX_n1n2[cnt] = treeedges[edge_n1n2].route.gridsX[i];\n        gridsY_n1n2[cnt] = treeedges[edge_n1n2].route.gridsY[i];\n        gridsL_n1n2[cnt] = treeedges[edge_n1n2].route.gridsL[i];\n        cnt++;\n      }\n    }    // MAZEROUTE\n    else // NOROUTE\n    {\n      gridsX_n1n2[cnt] = n1x;\n      gridsY_n1n2[cnt] = n1y;\n      gridsL_n1n2[cnt] = n1l;\n      cnt++;\n    } // MAZEROUTE\n  }\n\n  return (cnt);\n}\n\nvoid updateRouteType13D(TreeNode* treenodes, int n1, int A1, int A2, int E1x,\n                        int E1y, TreeEdge* treeedges, int edge_n1A1,\n                        int edge_n1A2) {\n  int i, l, cnt, A1x, A1y, A2x, A2y;\n  int cnt_n1A1, cnt_n1A2, E1_pos1 = 0, E1_pos2 = 0;\n  int gridsX_n1A1[MAXLEN], gridsY_n1A1[MAXLEN], gridsL_n1A1[MAXLEN],\n      gridsX_n1A2[MAXLEN], gridsY_n1A2[MAXLEN], gridsL_n1A2[MAXLEN];\n\n  A1x = treenodes[A1].x;\n  A1y = treenodes[A1].y;\n  A2x = treenodes[A2].x;\n  A2y = treenodes[A2].y;\n\n  // copy all the grids on (n1, A1) and (n2, A2) to tmp arrays, and keep the\n  // grids order A1->n1->A2 copy (n1, A1)\n  cnt_n1A1 = copyGrids3D(treenodes, A1, treeedges, edge_n1A1, gridsX_n1A1,\n                         gridsY_n1A1, gridsL_n1A1);\n\n  // copy (n1, A2)\n  cnt_n1A2 = copyGrids3D(treenodes, n1, treeedges, edge_n1A2, gridsX_n1A2,\n                         gridsY_n1A2, gridsL_n1A2);\n\n  if (cnt_n1A1 == 
1) {\n    printf(\"in 3D maze routing, type 1 node shift, cnt_n1A1 is 1\\n\");\n    exit(0);\n  }\n\n  for (i = 0; i < cnt_n1A1; i++) {\n    if (gridsX_n1A1[i] == E1x && gridsY_n1A1[i] == E1y) // reach the E1\n    {\n      E1_pos1 = i;\n      break;\n    }\n  }\n\n  for (i = cnt_n1A1 - 1; i >= 0; i--) {\n    if (gridsX_n1A1[i] == E1x && gridsY_n1A1[i] == E1y) // reach the E1\n    {\n      E1_pos2 = i;\n      break;\n    }\n  }\n\n  // reallocate memory for route.gridsX and route.gridsY\n  if (treeedges[edge_n1A1].route.type == MAZEROUTE &&\n      treeedges[edge_n1A1].route.routelen >\n          0) // if originally allocated, free them first\n  {\n    free(treeedges[edge_n1A1].route.gridsX);\n    free(treeedges[edge_n1A1].route.gridsY);\n    free(treeedges[edge_n1A1].route.gridsL);\n  }\n  treeedges[edge_n1A1].route.gridsX =\n      (short*)calloc((E1_pos1 + 1), sizeof(short));\n  treeedges[edge_n1A1].route.gridsY =\n      (short*)calloc((E1_pos1 + 1), sizeof(short));\n  treeedges[edge_n1A1].route.gridsL =\n      (short*)calloc((E1_pos1 + 1), sizeof(short));\n\n  if (A1x <= E1x) {\n    cnt = 0;\n    for (i = 0; i <= E1_pos1; i++) {\n      treeedges[edge_n1A1].route.gridsX[cnt] = gridsX_n1A1[i];\n      treeedges[edge_n1A1].route.gridsY[cnt] = gridsY_n1A1[i];\n      treeedges[edge_n1A1].route.gridsL[cnt] = gridsL_n1A1[i];\n      cnt++;\n    }\n    treeedges[edge_n1A1].n1 = A1;\n    treeedges[edge_n1A1].n2 = n1;\n  } else {\n    cnt = 0;\n    for (i = E1_pos1; i >= 0; i--) {\n      treeedges[edge_n1A1].route.gridsX[cnt] = gridsX_n1A1[i];\n      treeedges[edge_n1A1].route.gridsY[cnt] = gridsY_n1A1[i];\n      treeedges[edge_n1A1].route.gridsL[cnt] = gridsL_n1A1[i];\n      cnt++;\n    }\n    treeedges[edge_n1A1].n1 = n1;\n    treeedges[edge_n1A1].n2 = A1;\n  }\n  treeedges[edge_n1A1].len = ADIFF(A1x, E1x) + ADIFF(A1y, E1y);\n\n  treeedges[edge_n1A1].route.type     = MAZEROUTE;\n  treeedges[edge_n1A1].route.routelen = E1_pos1;\n\n  // reallocate memory for route.gridsX and 
route.gridsY\n  if (treeedges[edge_n1A2].route.type == MAZEROUTE &&\n      treeedges[edge_n1A2].route.routelen >\n          0) // if originally allocated, free them first\n  {\n    free(treeedges[edge_n1A2].route.gridsX);\n    free(treeedges[edge_n1A2].route.gridsY);\n    free(treeedges[edge_n1A2].route.gridsL);\n  }\n\n  if (cnt_n1A2 > 1) {\n    treeedges[edge_n1A2].route.gridsX =\n        (short*)calloc((cnt_n1A1 + cnt_n1A2 - E1_pos2 - 1 +\n                        ADIFF(gridsL_n1A1[cnt_n1A1 - 1], gridsL_n1A2[0])),\n                       sizeof(short));\n    treeedges[edge_n1A2].route.gridsY =\n        (short*)calloc((cnt_n1A1 + cnt_n1A2 - E1_pos2 - 1 +\n                        ADIFF(gridsL_n1A1[cnt_n1A1 - 1], gridsL_n1A2[0])),\n                       sizeof(short));\n    treeedges[edge_n1A2].route.gridsL =\n        (short*)calloc((cnt_n1A1 + cnt_n1A2 - E1_pos2 - 1 +\n                        ADIFF(gridsL_n1A1[cnt_n1A1 - 1], gridsL_n1A2[0])),\n                       sizeof(short));\n  } else {\n    treeedges[edge_n1A2].route.gridsX =\n        (short*)calloc((cnt_n1A1 + cnt_n1A2 - E1_pos2 - 1), sizeof(short));\n    treeedges[edge_n1A2].route.gridsY =\n        (short*)calloc((cnt_n1A1 + cnt_n1A2 - E1_pos2 - 1), sizeof(short));\n    treeedges[edge_n1A2].route.gridsL =\n        (short*)calloc((cnt_n1A1 + cnt_n1A2 - E1_pos2 - 1), sizeof(short));\n  }\n\n  if (E1x <= A2x) {\n    cnt = 0;\n    for (i = E1_pos2; i < cnt_n1A1; i++) {\n      treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A1[i];\n      treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A1[i];\n      treeedges[edge_n1A2].route.gridsL[cnt] = gridsL_n1A1[i];\n      cnt++;\n    }\n    if (cnt_n1A2 > 1) {\n      if (gridsL_n1A1[cnt_n1A1 - 1] > gridsL_n1A2[0]) {\n        for (l = gridsL_n1A1[cnt_n1A1 - 1] - 1; l >= gridsL_n1A2[0]; l--) {\n          treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A2[0];\n          treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A2[0];\n          
treeedges[edge_n1A2].route.gridsL[cnt] = l;\n          cnt++;\n        }\n      } else if (gridsL_n1A1[cnt_n1A1 - 1] < gridsL_n1A2[0]) {\n        for (l = gridsL_n1A1[cnt_n1A1 - 1] + 1; l <= gridsL_n1A2[0]; l++) {\n          treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A2[0];\n          treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A2[0];\n          treeedges[edge_n1A2].route.gridsL[cnt] = l;\n          cnt++;\n        }\n      }\n    }\n\n    for (i = 1; i < cnt_n1A2; i++) // 0 is n1 again, so no repeat\n    {\n      treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A2[i];\n      treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A2[i];\n      treeedges[edge_n1A2].route.gridsL[cnt] = gridsL_n1A2[i];\n      cnt++;\n    }\n    treeedges[edge_n1A2].n1 = n1;\n    treeedges[edge_n1A2].n2 = A2;\n  } else {\n    cnt = 0;\n    for (i = cnt_n1A2 - 1; i >= 1; i--) // 0 is n1 again, so no repeat\n    {\n      treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A2[i];\n      treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A2[i];\n      treeedges[edge_n1A2].route.gridsL[cnt] = gridsL_n1A2[i];\n      cnt++;\n    }\n\n    if (cnt_n1A2 > 1) {\n      if (gridsL_n1A1[cnt_n1A1 - 1] > gridsL_n1A2[0]) {\n        for (l = gridsL_n1A2[0]; l < gridsL_n1A1[cnt_n1A1 - 1]; l++) {\n          treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A2[0];\n          treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A2[0];\n          treeedges[edge_n1A2].route.gridsL[cnt] = l;\n          cnt++;\n        }\n      } else if (gridsL_n1A1[cnt_n1A1 - 1] < gridsL_n1A2[0]) {\n        for (l = gridsL_n1A2[0]; l > gridsL_n1A1[cnt_n1A1 - 1]; l--) {\n          treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A2[0];\n          treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A2[0];\n          treeedges[edge_n1A2].route.gridsL[cnt] = l;\n          cnt++;\n        }\n      }\n    }\n    for (i = cnt_n1A1 - 1; i >= E1_pos2; i--) {\n      treeedges[edge_n1A2].route.gridsX[cnt] = gridsX_n1A1[i];\n      
treeedges[edge_n1A2].route.gridsY[cnt] = gridsY_n1A1[i];\n      treeedges[edge_n1A2].route.gridsL[cnt] = gridsL_n1A1[i];\n      cnt++;\n    }\n    treeedges[edge_n1A2].n1 = A2;\n    treeedges[edge_n1A2].n2 = n1;\n  }\n  treeedges[edge_n1A2].route.type     = MAZEROUTE;\n  treeedges[edge_n1A2].route.routelen = cnt - 1;\n  treeedges[edge_n1A2].len            = ADIFF(A2x, E1x) + ADIFF(A2y, E1y);\n\n  treenodes[n1].x = E1x;\n  treenodes[n1].y = E1y;\n}\n\nvoid updateRouteType23D(TreeNode* treenodes, int n1, int A1, int A2, int C1,\n                        int C2, int E1x, int E1y, TreeEdge* treeedges,\n                        int edge_n1A1, int edge_n1A2, int edge_C1C2) {\n  int i, cnt, A1x, A1y, A2x, A2y, C1x, C1y, C2x, C2y, extraLen, startIND;\n  int edge_n1C1, edge_n1C2, edge_A1A2;\n  int cnt_n1A1, cnt_n1A2, cnt_C1C2, E1_pos1 = 0, E1_pos2 = 0;\n  int len_A1A2, len_n1C1, len_n1C2;\n  int gridsX_n1A1[MAXLEN], gridsY_n1A1[MAXLEN], gridsL_n1A1[MAXLEN];\n  int gridsX_n1A2[MAXLEN], gridsY_n1A2[MAXLEN], gridsL_n1A2[MAXLEN];\n  int gridsX_C1C2[MAXLEN], gridsY_C1C2[MAXLEN], gridsL_C1C2[MAXLEN];\n\n  A1x = treenodes[A1].x;\n  A1y = treenodes[A1].y;\n  // A1l = treenodes[A1].l;\n  A2x = treenodes[A2].x;\n  A2y = treenodes[A2].y;\n  C1x = treenodes[C1].x;\n  C1y = treenodes[C1].y;\n  C2x = treenodes[C2].x;\n  C2y = treenodes[C2].y;\n\n  // printf(\"orig edge_n1A1 %d edge_n1A2 %d edge_C1C2\n  // %d\\n\",edge_n1A1,edge_n1A2,edge_C1C2 );\n  edge_n1C1 = edge_n1A1;\n  edge_n1C2 = edge_n1A2;\n  edge_A1A2 = edge_C1C2;\n\n  // combine (n1, A1) and (n1, A2) into (A1, A2), A1 is the first node and A2 is\n  // the second grids order A1->n1->A2 copy (A1, n1)\n  cnt_n1A1 = copyGrids3D(treenodes, A1, treeedges, edge_n1A1, gridsX_n1A1,\n                         gridsY_n1A1, gridsL_n1A1);\n\n  // copy (n1, A2)\n  cnt_n1A2 = copyGrids3D(treenodes, n1, treeedges, edge_n1A2, gridsX_n1A2,\n                         gridsY_n1A2, gridsL_n1A2);\n\n  // copy all the grids on (C1, C2) to gridsX_C1C2[] 
and gridsY_C1C2[]\n  cnt_C1C2 = copyGrids3D(treenodes, C1, treeedges, edge_C1C2, gridsX_C1C2,\n                         gridsY_C1C2, gridsL_C1C2);\n\n  // combine grids on original (A1, n1) and (n1, A2) to new (A1, A2)\n  // allocate memory for gridsX[] and gridsY[] of edge_A1A2\n  if (treeedges[edge_A1A2].route.type == MAZEROUTE) {\n    free(treeedges[edge_A1A2].route.gridsX);\n    free(treeedges[edge_A1A2].route.gridsY);\n    free(treeedges[edge_A1A2].route.gridsL);\n  }\n  len_A1A2 = cnt_n1A1 + cnt_n1A2 - 1;\n\n  if (len_A1A2 == 1) {\n    treeedges[edge_A1A2].route.routelen = len_A1A2 - 1;\n    treeedges[edge_A1A2].len            = ADIFF(A1x, A2x) + ADIFF(A1y, A2y);\n  } else {\n\n    extraLen = 0;\n    if (cnt_n1A1 > 1 && cnt_n1A2 > 1) {\n      extraLen = ADIFF(gridsL_n1A1[cnt_n1A1 - 1], gridsL_n1A2[0]);\n      len_A1A2 += extraLen;\n    }\n    treeedges[edge_A1A2].route.gridsX = (short*)calloc(len_A1A2, sizeof(short));\n    treeedges[edge_A1A2].route.gridsY = (short*)calloc(len_A1A2, sizeof(short));\n    treeedges[edge_A1A2].route.gridsL = (short*)calloc(len_A1A2, sizeof(short));\n    treeedges[edge_A1A2].route.routelen = len_A1A2 - 1;\n    treeedges[edge_A1A2].len            = ADIFF(A1x, A2x) + ADIFF(A1y, A2y);\n\n    cnt      = 0;\n    startIND = 0;\n\n    if (cnt_n1A1 > 1) {\n      startIND = 1;\n      for (i = 0; i < cnt_n1A1; i++) {\n        treeedges[edge_A1A2].route.gridsX[cnt] = gridsX_n1A1[i];\n        treeedges[edge_A1A2].route.gridsY[cnt] = gridsY_n1A1[i];\n        treeedges[edge_A1A2].route.gridsL[cnt] = gridsL_n1A1[i];\n        cnt++;\n      }\n    }\n\n    if (extraLen > 0) {\n      if (gridsL_n1A1[cnt_n1A1 - 1] < gridsL_n1A2[0]) {\n        for (i = gridsL_n1A1[cnt_n1A1 - 1] + 1; i <= gridsL_n1A2[0]; i++) {\n          treeedges[edge_A1A2].route.gridsX[cnt] = gridsX_n1A2[0];\n          treeedges[edge_A1A2].route.gridsY[cnt] = gridsY_n1A2[0];\n          treeedges[edge_A1A2].route.gridsL[cnt] = i;\n          cnt++;\n        }\n      } else {\n       
 for (i = gridsL_n1A1[cnt_n1A1 - 1] - 1; i >= gridsL_n1A2[1]; i--) {\n          treeedges[edge_A1A2].route.gridsX[cnt] = gridsX_n1A2[0];\n          treeedges[edge_A1A2].route.gridsY[cnt] = gridsY_n1A2[0];\n          treeedges[edge_A1A2].route.gridsL[cnt] = i;\n          cnt++;\n        }\n      }\n    }\n\n    for (i = startIND; i < cnt_n1A2; i++) // do not repeat point n1\n    {\n      treeedges[edge_A1A2].route.gridsX[cnt] = gridsX_n1A2[i];\n      treeedges[edge_A1A2].route.gridsY[cnt] = gridsY_n1A2[i];\n      treeedges[edge_A1A2].route.gridsL[cnt] = gridsL_n1A2[i];\n      cnt++;\n    }\n  }\n\n  if (cnt_C1C2 == 1) {\n    printf(\"shift to 0 length edge, type2\\n\");\n  }\n\n  // find the index of E1 in (C1, C2)\n  for (i = 0; i < cnt_C1C2; i++) {\n    if (gridsX_C1C2[i] == E1x && gridsY_C1C2[i] == E1y) {\n      E1_pos1 = i;\n      break;\n    }\n  }\n\n  for (i = cnt_C1C2 - 1; i >= 0; i--) {\n    if (gridsX_C1C2[i] == E1x && gridsY_C1C2[i] == E1y) {\n      E1_pos2 = i;\n      break;\n    }\n  }\n\n  // allocate memory for gridsX[] and gridsY[] of edge_n1C1 and edge_n1C2\n  if (treeedges[edge_n1C1].route.type == MAZEROUTE &&\n      treeedges[edge_n1C1].route.routelen > 0) {\n    free(treeedges[edge_n1C1].route.gridsX);\n    free(treeedges[edge_n1C1].route.gridsY);\n    free(treeedges[edge_n1C1].route.gridsL);\n  }\n  len_n1C1 = E1_pos1 + 1;\n\n  treeedges[edge_n1C1].route.gridsX   = (short*)calloc(len_n1C1, sizeof(short));\n  treeedges[edge_n1C1].route.gridsY   = (short*)calloc(len_n1C1, sizeof(short));\n  treeedges[edge_n1C1].route.gridsL   = (short*)calloc(len_n1C1, sizeof(short));\n  treeedges[edge_n1C1].route.routelen = len_n1C1 - 1;\n  treeedges[edge_n1C1].len            = ADIFF(C1x, E1x) + ADIFF(C1y, E1y);\n\n  if (treeedges[edge_n1C2].route.type == MAZEROUTE &&\n      treeedges[edge_n1C2].route.routelen > 0) {\n    free(treeedges[edge_n1C2].route.gridsX);\n    free(treeedges[edge_n1C2].route.gridsY);\n    free(treeedges[edge_n1C2].route.gridsL);\n  }\n  
len_n1C2 = cnt_C1C2 - E1_pos2;\n\n  treeedges[edge_n1C2].route.gridsX   = (short*)calloc(len_n1C2, sizeof(short));\n  treeedges[edge_n1C2].route.gridsY   = (short*)calloc(len_n1C2, sizeof(short));\n  treeedges[edge_n1C2].route.gridsL   = (short*)calloc(len_n1C2, sizeof(short));\n  treeedges[edge_n1C2].route.routelen = len_n1C2 - 1;\n  treeedges[edge_n1C2].len            = ADIFF(C2x, E1x) + ADIFF(C2y, E1y);\n\n  // split original (C1, C2) to (C1, n1) and (n1, C2)\n  cnt = 0;\n  for (i = 0; i <= E1_pos1; i++) {\n    treeedges[edge_n1C1].route.gridsX[i] = gridsX_C1C2[i];\n    treeedges[edge_n1C1].route.gridsY[i] = gridsY_C1C2[i];\n    treeedges[edge_n1C1].route.gridsL[i] = gridsL_C1C2[i];\n    cnt++;\n  }\n  /// if(cnt!=len_n1C1) {printf(\"len_n1C1 wrong!\\n\");exit(1);}\n\n  cnt = 0;\n  for (i = E1_pos2; i < cnt_C1C2; i++) {\n    treeedges[edge_n1C2].route.gridsX[cnt] = gridsX_C1C2[i];\n    treeedges[edge_n1C2].route.gridsY[cnt] = gridsY_C1C2[i];\n    treeedges[edge_n1C2].route.gridsL[cnt] = gridsL_C1C2[i];\n    cnt++;\n  }\n}\n\nvoid mazeRouteMSMDOrder3D(int expand, int ripupTHlb, int ripupTHub) {\n\n  short *gridsLtmp, gridsX[MAXLEN], gridsY[MAXLEN], gridsL[MAXLEN],\n      tmp_gridsX[MAXLEN], tmp_gridsY[MAXLEN], tmp_gridsL[MAXLEN];\n  int netID, enlarge, endIND;\n  Bool* pop_heap23D;\n\n  int i, j, k, deg, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin, xmax, curX,\n      curY, curL, crossX, crossY, crossL, tmpX, tmpY, tmpL, tmpi, min_x, min_y,\n      *dtmp;\n  int regionX1, regionX2, regionY1, regionY2, routeLen;\n  int heapLen1, heapLen2, ind, ind1, tmpind, grid;\n  float tmp;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  int endpt1, endpt2, A1, A2, B1, B2, C1, C2, cnt, cnt_n1n2, remd;\n  int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n      edge_C1C2;\n  int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2, D1, D2;\n  int E1x, E1y, E2x, E2y, range, corE1, corE2, edgeID;\n\n  Bool Horizontal, n1Shift, 
n2Shift, redundant;\n  int lastL, origL, headRoom, tailRoom, newcnt_n1n2, numpoints, d, n1a, n2a,\n      connectionCNT;\n  int origEng, orderIndex;\n\n  directions3D = (dirctionT***)calloc(numLayers, sizeof(dirctionT**));\n  corrEdge3D   = (int***)calloc(numLayers, sizeof(int**));\n  pr3D         = (parent3D***)calloc(numLayers, sizeof(parent3D**));\n\n  for (i = 0; i < numLayers; i++) {\n    directions3D[i] = (dirctionT**)calloc(yGrid, sizeof(dirctionT*));\n    corrEdge3D[i]   = (int**)calloc(yGrid, sizeof(int*));\n    pr3D[i]         = (parent3D**)calloc(yGrid, sizeof(parent3D*));\n\n    for (j = 0; j < yGrid; j++) {\n      directions3D[i][j] = (dirctionT*)calloc(xGrid, sizeof(dirctionT));\n      corrEdge3D[i][j]   = (int*)calloc(xGrid, sizeof(int));\n      pr3D[i][j]         = (parent3D*)calloc(xGrid, sizeof(parent3D));\n    }\n  }\n\n  pop_heap23D = (Bool*)calloc(numLayers * YRANGE * XRANGE, sizeof(Bool));\n\n  // allocate memory for priority queue\n  heap13D = (int**)calloc((yGrid * xGrid * numLayers), sizeof(int*));\n  heap23D = (short**)calloc((yGrid * xGrid * numLayers), sizeof(short*));\n\n  // cout << heap13D << endl;\n\n  for (i = 0; i < yGrid; i++) {\n    for (j = 0; j < xGrid; j++) {\n      inRegion[i][j] = FALSE;\n    }\n  }\n\n  range = YRANGE * XRANGE * numLayers;\n  for (i = 0; i < range; i++) {\n    pop_heap23D[i] = FALSE;\n  }\n\n  endIND = numValidNets * 0.9;\n\n  for (orderIndex = 0; orderIndex < endIND; orderIndex++) {\n\n    netID = treeOrderPV[orderIndex].treeIndex;\n\n    // printf(\"netID %d\\n\",netID);\n    // fflush(stdout);\n    // if (netID == 53757)\n    //{\n    //\tcontinue;\n    //}\n\n    enlarge   = expand;\n    deg       = sttrees[netID].deg;\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    origEng   = enlarge;\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n      treeedge = &(treeedges[edgeID]);\n\n      if (treeedge->len < ripupTHub && treeedge->len > ripupTHlb) {\n\n        n1     
  = treeedge->n1;\n        n2       = treeedge->n2;\n        n1x      = treenodes[n1].x;\n        n1y      = treenodes[n1].y;\n        n2x      = treenodes[n2].x;\n        n2y      = treenodes[n2].y;\n        routeLen = treeedges[edgeID].route.routelen;\n\n        if (n1y <= n2y) {\n          ymin = n1y;\n          ymax = n2y;\n        } else {\n          ymin = n2y;\n          ymax = n1y;\n        }\n\n        if (n1x <= n2x) {\n          xmin = n1x;\n          xmax = n2x;\n        } else {\n          xmin = n2x;\n          xmax = n1x;\n        }\n\n        // ripup the routing for the edge\n        if (newRipup3DType3(netID, edgeID)) {\n          enlarge = min(origEng, treeedge->route.routelen);\n\n          regionX1 = max(0, xmin - enlarge);\n          regionX2 = min(xGrid - 1, xmax + enlarge);\n          regionY1 = max(0, ymin - enlarge);\n          regionY2 = min(yGrid - 1, ymax + enlarge);\n\n          n1Shift = FALSE;\n          n2Shift = FALSE;\n          n1a     = treeedge->n1a;\n          n2a     = treeedge->n2a;\n\n          // initialize pop_heap13D[] and pop_heap23D[] as FALSE (for detecting\n          // the shortest path is found or not)\n\n          for (k = 0; k < numLayers; k++) {\n            for (i = regionY1; i <= regionY2; i++) {\n              for (j = regionX1; j <= regionX2; j++) {\n                d13D[k][i][j] = BIG_INT;\n                d23D[k][i][j] = 256;\n              }\n            }\n          }\n\n          // setup heap13D, heap23D and initialize d13D[][] and d23D[][] for all\n          // the grids on the two subtrees\n          setupHeap3D(netID, edgeID, &heapLen1, &heapLen2, regionX1, regionX2,\n                      regionY1, regionY2);\n\n          // while loop to find shortest path\n          ind1 = (heap13D[0] - (int*)d13D);\n\n          for (i = 0; i < heapLen2; i++)\n            pop_heap23D[(heap23D[i] - (short*)d23D)] = TRUE;\n\n          while (pop_heap23D[ind1] ==\n                 FALSE) // stop until the grid 
position been popped out from\n                        // both heap13D and heap23D\n          {\n            // relax all the adjacent grids within the enlarged region for\n            // source subtree\n            curL = ind1 / (gridHV);\n            remd = ind1 % (gridHV);\n            curX = remd % XRANGE;\n            curY = remd / XRANGE;\n\n            extractMin3D(heap13D, heapLen1);\n            // pop_heap13D[ind1] = TRUE;\n            heapLen1--;\n\n            if (hCapacity3D[curL]) {\n              Horizontal = TRUE;\n            } else {\n              Horizontal = FALSE;\n            }\n\n            if (Horizontal) {\n\n              // left\n              if (curX > regionX1 && directions3D[curL][curY][curX] != EAST) {\n                grid = gridHs[curL] + curY * (xGrid - 1) + curX - 1;\n                tmp  = d13D[curL][curY][curX] + 1;\n                if (h_edges3D[grid].usage < h_edges3D[grid].cap) {\n\n                  tmpX = curX - 1; // the left neighbor\n\n                  if (d13D[curL][curY][tmpX] >=\n                      BIG_INT) // left neighbor not been put into heap13D\n                  {\n                    d13D[curL][curY][tmpX]         = tmp;\n                    pr3D[curL][curY][tmpX].l       = curL;\n                    pr3D[curL][curY][tmpX].x       = curX;\n                    pr3D[curL][curY][tmpX].y       = curY;\n                    directions3D[curL][curY][tmpX] = WEST;\n                    heap13D[heapLen1]              = &(d13D[curL][curY][tmpX]);\n                    heapLen1++;\n                    updateHeap3D(heap13D, heapLen1 - 1);\n                  } else if (d13D[curL][curY][tmpX] >\n                             tmp) // left neighbor been put into heap13D but\n                                  // needs update\n                  {\n                    d13D[curL][curY][tmpX]         = tmp;\n                    pr3D[curL][curY][tmpX].l       = curL;\n                    pr3D[curL][curY][tmpX].x       = curX;\n  
                  pr3D[curL][curY][tmpX].y       = curY;\n                    directions3D[curL][curY][tmpX] = WEST;\n                    dtmp                           = &(d13D[curL][curY][tmpX]);\n                    ind                            = 0;\n                    while (heap13D[ind] != dtmp)\n                      ind++;\n                    updateHeap3D(heap13D, ind);\n                  }\n                }\n              }\n              // right\n              if (Horizontal && curX < regionX2 &&\n                  directions3D[curL][curY][curX] != WEST) {\n                grid = gridHs[curL] + curY * (xGrid - 1) + curX;\n\n                tmp  = d13D[curL][curY][curX] + 1;\n                tmpX = curX + 1; // the right neighbor\n\n                if (h_edges3D[grid].usage < h_edges3D[grid].cap) {\n                  if (d13D[curL][curY][tmpX] >=\n                      BIG_INT) // right neighbor not been put into heap13D\n                  {\n                    d13D[curL][curY][tmpX]         = tmp;\n                    pr3D[curL][curY][tmpX].l       = curL;\n                    pr3D[curL][curY][tmpX].x       = curX;\n                    pr3D[curL][curY][tmpX].y       = curY;\n                    directions3D[curL][curY][tmpX] = EAST;\n                    heap13D[heapLen1]              = &(d13D[curL][curY][tmpX]);\n                    heapLen1++;\n                    updateHeap3D(heap13D, heapLen1 - 1);\n                  } else if (d13D[curL][curY][tmpX] >\n                             tmp) // right neighbor been put into heap13D but\n                                  // needs update\n                  {\n                    d13D[curL][curY][tmpX]         = tmp;\n                    pr3D[curL][curY][tmpX].l       = curL;\n                    pr3D[curL][curY][tmpX].x       = curX;\n                    pr3D[curL][curY][tmpX].y       = curY;\n                    directions3D[curL][curY][tmpX] = EAST;\n                    dtmp                           = 
&(d13D[curL][curY][tmpX]);\n                    ind                            = 0;\n                    while (heap13D[ind] != dtmp)\n                      ind++;\n                    updateHeap3D(heap13D, ind);\n                  }\n                }\n              }\n            } else {\n              // bottom\n              if (!Horizontal && curY > regionY1 &&\n                  directions3D[curL][curY][curX] != SOUTH) {\n                grid = gridVs[curL] + (curY - 1) * xGrid + curX;\n                tmp  = d13D[curL][curY][curX] + 1;\n                tmpY = curY - 1; // the bottom neighbor\n                if (v_edges3D[grid].usage < v_edges3D[grid].cap) {\n\n                  if (d13D[curL][tmpY][curX] >=\n                      BIG_INT) // bottom neighbor not been put into heap13D\n                  {\n                    d13D[curL][tmpY][curX]         = tmp;\n                    pr3D[curL][tmpY][curX].l       = curL;\n                    pr3D[curL][tmpY][curX].x       = curX;\n                    pr3D[curL][tmpY][curX].y       = curY;\n                    directions3D[curL][tmpY][curX] = NORTH;\n                    heap13D[heapLen1]              = &(d13D[curL][tmpY][curX]);\n                    heapLen1++;\n                    updateHeap3D(heap13D, heapLen1 - 1);\n                  } else if (d13D[curL][tmpY][curX] >\n                             tmp) // bottom neighbor been put into heap13D but\n                                  // needs update\n                  {\n                    d13D[curL][tmpY][curX]         = tmp;\n                    pr3D[curL][tmpY][curX].l       = curL;\n                    pr3D[curL][tmpY][curX].x       = curX;\n                    pr3D[curL][tmpY][curX].y       = curY;\n                    directions3D[curL][tmpY][curX] = NORTH;\n                    dtmp                           = &(d13D[curL][tmpY][curX]);\n                    ind                            = 0;\n                    while (heap13D[ind] != dtmp)\n        
              ind++;\n                    updateHeap3D(heap13D, ind);\n                  }\n                }\n              }\n              // top\n              if (!Horizontal && curY < regionY2 &&\n                  directions3D[curL][curY][curX] != NORTH) {\n                grid = gridVs[curL] + curY * xGrid + curX;\n                tmp  = d13D[curL][curY][curX] + 1;\n                tmpY = curY + 1; // the top neighbor\n                if (v_edges3D[grid].usage < v_edges3D[grid].cap) {\n\n                  if (d13D[curL][tmpY][curX] >=\n                      BIG_INT) // top neighbor not been put into heap13D\n                  {\n                    d13D[curL][tmpY][curX]         = tmp;\n                    pr3D[curL][tmpY][curX].l       = curL;\n                    pr3D[curL][tmpY][curX].x       = curX;\n                    pr3D[curL][tmpY][curX].y       = curY;\n                    directions3D[curL][tmpY][curX] = SOUTH;\n                    heap13D[heapLen1]              = &(d13D[curL][tmpY][curX]);\n                    heapLen1++;\n                    updateHeap3D(heap13D, heapLen1 - 1);\n                  } else if (d13D[curL][tmpY][curX] >\n                             tmp) // top neighbor been put into heap13D but\n                                  // needs update\n                  {\n                    d13D[curL][tmpY][curX]         = tmp;\n                    pr3D[curL][tmpY][curX].l       = curL;\n                    pr3D[curL][tmpY][curX].x       = curX;\n                    pr3D[curL][tmpY][curX].y       = curY;\n                    directions3D[curL][tmpY][curX] = SOUTH;\n                    dtmp                           = &(d13D[curL][tmpY][curX]);\n                    ind                            = 0;\n                    while (heap13D[ind] != dtmp)\n                      ind++;\n                    updateHeap3D(heap13D, ind);\n                  }\n                }\n              }\n            }\n\n            // down\n            if 
(curL > 0 && directions3D[curL][curY][curX] != UP) {\n\n              tmp  = d13D[curL][curY][curX] + viacost;\n              tmpL = curL - 1; // the bottom neighbor\n\n              // printf(\"down, new value %f, old value\n              // %f\\n\",tmp,d13D[tmpL][curY][curX]);\n              if (d13D[tmpL][curY][curX] >=\n                  BIG_INT) // bottom neighbor not been put into heap13D\n              {\n                d13D[tmpL][curY][curX]         = tmp;\n                pr3D[tmpL][curY][curX].l       = curL;\n                pr3D[tmpL][curY][curX].x       = curX;\n                pr3D[tmpL][curY][curX].y       = curY;\n                directions3D[tmpL][curY][curX] = DOWN;\n                heap13D[heapLen1]              = &(d13D[tmpL][curY][curX]);\n                heapLen1++;\n                updateHeap3D(heap13D, heapLen1 - 1);\n              } else if (d13D[tmpL][curY][curX] >\n                         tmp) // bottom neighbor been put into heap13D but needs\n                              // update\n              {\n                d13D[tmpL][curY][curX]         = tmp;\n                pr3D[tmpL][curY][curX].l       = curL;\n                pr3D[tmpL][curY][curX].x       = curX;\n                pr3D[tmpL][curY][curX].y       = curY;\n                directions3D[tmpL][curY][curX] = DOWN;\n                dtmp                           = &(d13D[tmpL][curY][curX]);\n                ind                            = 0;\n                while (heap13D[ind] != dtmp)\n                  ind++;\n                updateHeap3D(heap13D, ind);\n              }\n            }\n\n            // up\n            if (curL < numLayers - 1 &&\n                directions3D[curL][curY][curX] != DOWN) {\n\n              tmp  = d13D[curL][curY][curX] + viacost;\n              tmpL = curL + 1; // the bottom neighbor\n              if (d13D[tmpL][curY][curX] >=\n                  BIG_INT) // bottom neighbor not been put into heap13D\n              {\n                
d13D[tmpL][curY][curX]         = tmp;\n                pr3D[tmpL][curY][curX].l       = curL;\n                pr3D[tmpL][curY][curX].x       = curX;\n                pr3D[tmpL][curY][curX].y       = curY;\n                directions3D[tmpL][curY][curX] = UP;\n                heap13D[heapLen1]              = &(d13D[tmpL][curY][curX]);\n                heapLen1++;\n                updateHeap3D(heap13D, heapLen1 - 1);\n              } else if (d13D[tmpL][curY][curX] >\n                         tmp) // bottom neighbor been put into heap13D but needs\n                              // update\n              {\n                d13D[tmpL][curY][curX]         = tmp;\n                pr3D[tmpL][curY][curX].l       = curL;\n                pr3D[tmpL][curY][curX].x       = curX;\n                pr3D[tmpL][curY][curX].y       = curY;\n                directions3D[tmpL][curY][curX] = UP;\n                dtmp                           = &(d13D[tmpL][curY][curX]);\n                ind                            = 0;\n                while (heap13D[ind] != dtmp)\n                  ind++;\n                updateHeap3D(heap13D, ind);\n              }\n            }\n\n            // update ind1 and ind2 for next loop\n            ind1 = (heap13D[0] - (int*)d13D);\n          } // while loop\n\n          for (i = 0; i < heapLen2; i++)\n            pop_heap23D[(heap23D[i] - (short*)d23D)] = FALSE;\n\n          // get the new route for the edge and store it in gridsX[] and\n          // gridsY[] temporarily\n\n          crossL = ind1 / (gridHV);\n          crossX = (ind1 % (gridHV)) % XRANGE;\n          crossY = (ind1 % (gridHV)) / XRANGE;\n\n          cnt  = 0;\n          curX = crossX;\n          curY = crossY;\n          curL = crossL;\n\n          if (d13D[curL][curY][curX] == 0) {\n            recoverEdge(netID, edgeID);\n            break;\n          }\n          // printf(\"the initial value %f LYX [%d %d\n          // %d]\\n\",d13D[curL][curY][curX],curL, curY, curX);\n\n       
   while (d13D[curL][curY][curX] != 0) // loop until reach subtree1\n          {\n\n            tmpL = pr3D[curL][curY][curX].l;\n            tmpX = pr3D[curL][curY][curX].x;\n            tmpY = pr3D[curL][curY][curX].y;\n            curX = tmpX;\n            curY = tmpY;\n            curL = tmpL;\n            fflush(stdout);\n            tmp_gridsX[cnt] = curX;\n            tmp_gridsY[cnt] = curY;\n            tmp_gridsL[cnt] = curL;\n            cnt++;\n          }\n\n          // printf(\"the end value %f\\n\",d13D[curL][curY][curX]);\n          // reverse the grids on the path\n          for (i = 0; i < cnt; i++) {\n            tmpind    = cnt - 1 - i;\n            gridsX[i] = tmp_gridsX[tmpind];\n            gridsY[i] = tmp_gridsY[tmpind];\n            gridsL[i] = tmp_gridsL[tmpind];\n          }\n\n          // add the connection point (crossX, crossY)\n          gridsX[cnt] = crossX;\n          gridsY[cnt] = crossY;\n          gridsL[cnt] = crossL;\n          cnt++;\n\n          curX = crossX;\n          curY = crossY;\n          curL = crossL;\n\n          cnt_n1n2 = cnt;\n\n          E1x = gridsX[0];\n          E1y = gridsY[0];\n          E2x = gridsX[cnt_n1n2 - 1];\n          E2y = gridsY[cnt_n1n2 - 1];\n\n          headRoom = 0;\n          origL    = gridsL[0];\n\n          while (gridsX[headRoom] == E1x && gridsY[headRoom] == E1y) {\n            lastL = gridsL[headRoom];\n            headRoom++;\n          }\n          if (headRoom > 0) {\n            headRoom--;\n          }\n\n          lastL = gridsL[headRoom];\n\n          // change the tree structure according to the new routing for the tree\n          // edge find E1 and E2, and the endpoints of the edges they are on\n\n          edge_n1n2 = edgeID;\n          // (1) consider subtree1\n          if (n1 >= deg && (E1x != n1x || E1y != n1y))\n          // n1 is not a pin and E1!=n1, then make change to subtree1,\n          // otherwise, no change to subtree1\n          {\n            n1Shift = 
TRUE;\n            corE1   = corrEdge3D[origL][E1y][E1x];\n\n            endpt1 = treeedges[corE1].n1;\n            endpt2 = treeedges[corE1].n2;\n\n            // find A1, A2 and edge_n1A1, edge_n1A2\n            if (treenodes[n1].nbr[0] == n2) {\n              A1        = treenodes[n1].nbr[1];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[1];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else if (treenodes[n1].nbr[1] == n2) {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[1];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[1];\n            }\n\n            if (endpt1 == n1 || endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n            {\n              // if E1 is on (n1, A2), switch A1 and A2 so that E1 is always on\n              // (n1, A1)\n              if (endpt1 == A2 || endpt2 == A2) {\n                tmpi      = A1;\n                A1        = A2;\n                A2        = tmpi;\n                tmpi      = edge_n1A1;\n                edge_n1A1 = edge_n1A2;\n                edge_n1A2 = tmpi;\n              }\n\n              // update route for edge (n1, A1), (n1, A2)\n              updateRouteType13D(treenodes, n1, A1, A2, E1x, E1y, treeedges,\n                                 edge_n1A1, edge_n1A2);\n              // newUpdateNodeLayers(treenodes, edge_n1n2,n1, lastL);\n\n              // update position for n1\n\n              // treenodes[n1].l = E1l;\n              treenodes[n1].assigned = TRUE;\n            }    // if E1 is on (n1, A1) or (n1, A2)\n            else // E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n            {\n              C1        = endpt1;\n  
            C2        = endpt2;\n              edge_C1C2 = corrEdge3D[origL][E1y][E1x];\n\n              // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n              updateRouteType23D(treenodes, n1, A1, A2, C1, C2, E1x, E1y,\n                                 treeedges, edge_n1A1, edge_n1A2, edge_C1C2);\n              // update position for n1\n              treenodes[n1].x        = E1x;\n              treenodes[n1].y        = E1y;\n              treenodes[n1].assigned = TRUE;\n              // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2), (C1,\n              // C2)->(A1, A2)\n              edge_n1C1               = edge_n1A1;\n              treeedges[edge_n1C1].n1 = C1;\n              treeedges[edge_n1C1].n2 = n1;\n              edge_n1C2               = edge_n1A2;\n              treeedges[edge_n1C2].n1 = n1;\n              treeedges[edge_n1C2].n2 = C2;\n              edge_A1A2               = edge_C1C2;\n              treeedges[edge_A1A2].n1 = A1;\n              treeedges[edge_A1A2].n2 = A2;\n              // update nbr and edge for 5 nodes n1, A1, A2, C1, C2\n              // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n              treenodes[n1].nbr[0]  = n2;\n              treenodes[n1].edge[0] = edge_n1n2;\n              treenodes[n1].nbr[1]  = C1;\n              treenodes[n1].edge[1] = edge_n1C1;\n              treenodes[n1].nbr[2]  = C2;\n              treenodes[n1].edge[2] = edge_n1C2;\n              // A1's nbr n1->A2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A1].nbr[i] == n1) {\n                  treenodes[A1].nbr[i]  = A2;\n                  treenodes[A1].edge[i] = edge_A1A2;\n                  break;\n                }\n              }\n              // A2's nbr n1->A1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A2].nbr[i] == n1) {\n                  treenodes[A2].nbr[i]  = A1;\n                  treenodes[A2].edge[i] = edge_A1A2;\n                  break;\n                }\n       
       }\n              // C1's nbr C2->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C1].nbr[i] == C2) {\n                  treenodes[C1].nbr[i]  = n1;\n                  treenodes[C1].edge[i] = edge_n1C1;\n                  break;\n                }\n              }\n              // C2's nbr C1->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C2].nbr[i] == C1) {\n                  treenodes[C2].nbr[i]  = n1;\n                  treenodes[C2].edge[i] = edge_n1C2;\n                  break;\n                }\n              }\n            } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n          }   // n1 is not a pin and E1!=n1\n          else {\n            newUpdateNodeLayers(treenodes, edge_n1n2, n1a, lastL);\n          }\n\n          origL    = gridsL[cnt_n1n2 - 1];\n          tailRoom = cnt_n1n2 - 1;\n\n          while (gridsX[tailRoom] == E2x && gridsY[tailRoom] == E2y) {\n            tailRoom--;\n            if (tailRoom == -1)\n              break;\n          }\n          if (tailRoom < cnt_n1n2 - 1) {\n            tailRoom++;\n          }\n\n          lastL = gridsL[tailRoom];\n\n          // updateNodeLayers(treenodes, edgeID, n2a, ntpL,nbtL,lastL);\n\n          // (2) consider subtree2\n          if (n2 >= deg && (E2x != n2x || E2y != n2y))\n          // n2 is not a pin and E2!=n2, then make change to subtree2,\n          // otherwise, no change to subtree2\n          {\n            // find the endpoints of the edge E1 is on\n\n            n2Shift = TRUE;\n            corE2   = corrEdge3D[origL][E2y][E2x];\n            endpt1  = treeedges[corE2].n1;\n            endpt2  = treeedges[corE2].n2;\n\n            // find B1, B2\n            if (treenodes[n2].nbr[0] == n1) {\n              B1        = treenodes[n2].nbr[1];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = treenodes[n2].edge[1];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else if 
(treenodes[n2].nbr[1] == n1) {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[1];\n              edge_n2B1 = treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[1];\n            }\n\n            if (endpt1 == n2 || endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n            {\n              // if E2 is on (n2, B2), switch B1 and B2 so that E2 is always on\n              // (n2, B1)\n              if (endpt1 == B2 || endpt2 == B2) {\n                tmpi      = B1;\n                B1        = B2;\n                B2        = tmpi;\n                tmpi      = edge_n2B1;\n                edge_n2B1 = edge_n2B2;\n                edge_n2B2 = tmpi;\n              }\n              // printf(\" type1\\n\");\n\n              // update route for edge (n2, B1), (n2, B2)\n              updateRouteType13D(treenodes, n2, B1, B2, E2x, E2y, treeedges,\n                                 edge_n2B1, edge_n2B2);\n              // newUpdateNodeLayers(treenodes, edge_n1n2,n2, lastL);\n\n              // update position for n2\n              treenodes[n2].assigned = TRUE;\n            }    // if E2 is on (n2, B1) or (n2, B2)\n            else // E2 is not on (n2, B1) or (n2, B2), but on (d13D, d23D)\n            {\n              D1        = endpt1;\n              D2        = endpt2;\n              edge_D1D2 = corrEdge3D[origL][E2y][E2x];\n              // printf(\" type2\\n\");\n\n              // update route for edge (n2, d13D), (n2, d23D) and (B1, B2)\n              updateRouteType23D(treenodes, n2, B1, B2, D1, D2, E2x, E2y,\n                                 treeedges, edge_n2B1, edge_n2B2, edge_D1D2);\n              // update position for n2\n              treenodes[n2].x        = E2x;\n              
treenodes[n2].y        = E2y;\n              treenodes[n2].assigned = TRUE;\n              // update 3 edges (n2, B1)->(d13D, n2), (n2, B2)->(n2, d23D),\n              // (d13D, d23D)->(B1, B2)\n              edge_n2D1               = edge_n2B1;\n              treeedges[edge_n2D1].n1 = D1;\n              treeedges[edge_n2D1].n2 = n2;\n              edge_n2D2               = edge_n2B2;\n              treeedges[edge_n2D2].n1 = n2;\n              treeedges[edge_n2D2].n2 = D2;\n              edge_B1B2               = edge_D1D2;\n              treeedges[edge_B1B2].n1 = B1;\n              treeedges[edge_B1B2].n2 = B2;\n              // update nbr and edge for 5 nodes n2, B1, B2, d13D, d23D\n              // n1's nbr (n1, B1, B2)->(n1, d13D, d23D)\n              treenodes[n2].nbr[0]  = n1;\n              treenodes[n2].edge[0] = edge_n1n2;\n              treenodes[n2].nbr[1]  = D1;\n              treenodes[n2].edge[1] = edge_n2D1;\n              treenodes[n2].nbr[2]  = D2;\n              treenodes[n2].edge[2] = edge_n2D2;\n              // B1's nbr n2->B2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[B1].nbr[i] == n2) {\n                  treenodes[B1].nbr[i]  = B2;\n                  treenodes[B1].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // B2's nbr n2->B1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[B2].nbr[i] == n2) {\n                  treenodes[B2].nbr[i]  = B1;\n                  treenodes[B2].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // D1's nbr D2->n2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[D1].nbr[i] == D2) {\n                  treenodes[D1].nbr[i]  = n2;\n                  treenodes[D1].edge[i] = edge_n2D1;\n                  break;\n                }\n              }\n              // D2's nbr D1->n2\n              for (i = 0; i < 3; i++) {\n                if 
(treenodes[D2].nbr[i] == D1) {\n                  treenodes[D2].nbr[i]  = n2;\n                  treenodes[D2].edge[i] = edge_n2D2;\n                  break;\n                }\n              }\n            }    // else E2 is not on (n2, B1) or (n2, B2), but on (d13D, d23D)\n          } else // n2 is not a pin and E2!=n2\n          {\n            newUpdateNodeLayers(treenodes, edge_n1n2, n2a, lastL);\n          }\n\n          newcnt_n1n2 = tailRoom - headRoom + 1;\n\n          // update route for edge (n1, n2) and edge usage\n          if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n            free(treeedges[edge_n1n2].route.gridsX);\n            free(treeedges[edge_n1n2].route.gridsY);\n            free(treeedges[edge_n1n2].route.gridsL);\n          }\n\n          treeedges[edge_n1n2].route.gridsX =\n              (short*)calloc(newcnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.gridsY =\n              (short*)calloc(newcnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.gridsL =\n              (short*)calloc(newcnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.type     = MAZEROUTE;\n          treeedges[edge_n1n2].route.routelen = newcnt_n1n2 - 1;\n          treeedges[edge_n1n2].len = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n\n          j = headRoom;\n          for (i = 0; i < newcnt_n1n2; i++) {\n            treeedges[edge_n1n2].route.gridsX[i] = gridsX[j];\n            treeedges[edge_n1n2].route.gridsY[i] = gridsY[j];\n            treeedges[edge_n1n2].route.gridsL[i] = gridsL[j];\n            j++;\n          }\n\n          // update edge usage\n          for (i = headRoom; i < tailRoom; i++) {\n            if (gridsL[i] == gridsL[i + 1]) {\n              if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n              {\n                min_y = min(gridsY[i], gridsY[i + 1]);\n                v_edges3D[gridsL[i] * gridV + min_y * xGrid + gridsX[i]]\n                    .usage += 1;\n              } else /// 
if(gridsY[i]==gridsY[i+1])// a horizontal edge\n              {\n                min_x = min(gridsX[i], gridsX[i + 1]);\n                h_edges3D[gridsL[i] * gridH + gridsY[i] * (xGrid - 1) + min_x]\n                    .usage += 1;\n              }\n            }\n          }\n\n          if (n1Shift || n2Shift) {\n            // re statis the node overlap\n            numpoints = 0;\n\n            for (d = 0; d < 2 * deg - 2; d++) {\n              treenodes[d].topL       = -1;\n              treenodes[d].botL       = numLayers;\n              treenodes[d].assigned   = FALSE;\n              treenodes[d].stackAlias = d;\n              treenodes[d].conCNT     = 0;\n              treenodes[d].hID        = BIG_INT;\n              treenodes[d].lID        = BIG_INT;\n              treenodes[d].status     = 0;\n\n              if (d < deg) {\n                treenodes[d].botL = treenodes[d].topL = 0;\n                // treenodes[d].l = 0;\n                treenodes[d].assigned = TRUE;\n                treenodes[d].status   = 1;\n\n                xcor[numpoints] = treenodes[d].x;\n                ycor[numpoints] = treenodes[d].y;\n                dcor[numpoints] = d;\n                numpoints++;\n              } else {\n                redundant = FALSE;\n                for (k = 0; k < numpoints; k++) {\n                  if ((treenodes[d].x == xcor[k]) &&\n                      (treenodes[d].y == ycor[k])) {\n                    treenodes[d].stackAlias = dcor[k];\n\n                    redundant = TRUE;\n                    break;\n                  }\n                }\n                if (!redundant) {\n                  xcor[numpoints] = treenodes[d].x;\n                  ycor[numpoints] = treenodes[d].y;\n                  dcor[numpoints] = d;\n                  numpoints++;\n                }\n              }\n            } // numerating for nodes\n            for (k = 0; k < 2 * deg - 3; k++) {\n\n              treeedge = &(treeedges[k]);\n\n              if 
(treeedge->len > 0) {\n\n                routeLen = treeedge->route.routelen;\n\n                n1        = treeedge->n1;\n                n2        = treeedge->n2;\n                gridsLtmp = treeedge->route.gridsL;\n\n                n1a = treenodes[n1].stackAlias;\n\n                n2a = treenodes[n2].stackAlias;\n\n                treeedge->n1a = n1a;\n                treeedge->n2a = n2a;\n\n                connectionCNT                         = treenodes[n1a].conCNT;\n                treenodes[n1a].heights[connectionCNT] = gridsLtmp[0];\n                treenodes[n1a].eID[connectionCNT]     = k;\n                treenodes[n1a].conCNT++;\n\n                if (gridsLtmp[0] > treenodes[n1a].topL) {\n                  treenodes[n1a].hID  = k;\n                  treenodes[n1a].topL = gridsLtmp[0];\n                }\n                if (gridsLtmp[0] < treenodes[n1a].botL) {\n                  treenodes[n1a].lID  = k;\n                  treenodes[n1a].botL = gridsLtmp[0];\n                }\n\n                treenodes[n1a].assigned = TRUE;\n\n                connectionCNT                         = treenodes[n2a].conCNT;\n                treenodes[n2a].heights[connectionCNT] = gridsLtmp[routeLen];\n                treenodes[n2a].eID[connectionCNT]     = k;\n                treenodes[n2a].conCNT++;\n                if (gridsLtmp[routeLen] > treenodes[n2a].topL) {\n                  treenodes[n2a].hID  = k;\n                  treenodes[n2a].topL = gridsLtmp[routeLen];\n                }\n                if (gridsLtmp[routeLen] < treenodes[n2a].botL) {\n                  treenodes[n2a].lID  = k;\n                  treenodes[n2a].botL = gridsLtmp[routeLen];\n                }\n\n                treenodes[n2a].assigned = TRUE;\n\n              } // edge len > 0\n\n            } // eunmerating edges\n\n            //\tprintf(\"edge %d shifted post processing finished\\n\",edgeID);\n          } // if shift1 and shift2\n        }\n      }\n    }\n  }\n\n  for (i = 0; i 
< numLayers; i++) {\n    for (j = 0; j < yGrid; j++) {\n      free(directions3D[i][j]);\n      free(corrEdge3D[i][j]);\n      free(pr3D[i][j]);\n    }\n  }\n\n  for (i = 0; i < numLayers; i++) {\n\n    free(directions3D[i]);\n    free(corrEdge3D[i]);\n    free(pr3D[i]);\n  }\n\n  free(directions3D);\n  free(corrEdge3D);\n  free(pr3D);\n\n  free(pop_heap23D);\n  free(heap13D);\n  free(heap23D);\n}\n\nvoid getLayerRange(TreeNode* treenodes, int edgeID, int n1, int deg) {\n  int i;\n  int ntpL, nbtL, nhID = 0, nlID = 0;\n\n  ntpL = -1;\n  nbtL = BIG_INT;\n\n  if (treenodes[n1].conCNT > 1) {\n    for (i = 0; i < treenodes[n1].conCNT; i++) {\n      if (treenodes[n1].eID[i] != edgeID) {\n        if (ntpL < treenodes[n1].heights[i]) {\n          ntpL = treenodes[n1].heights[i];\n          nhID = treenodes[n1].eID[i];\n        }\n        if (nbtL > treenodes[n1].heights[i]) {\n          nbtL = treenodes[n1].heights[i];\n          nlID = treenodes[n1].eID[i];\n        }\n      }\n    }\n    if (n1 < deg) {\n      nbtL = 0;\n    }\n    treenodes[n1].topL = ntpL;\n    treenodes[n1].botL = nbtL;\n    treenodes[n1].hID  = nhID;\n    treenodes[n1].lID  = nlID;\n  } else {\n\n    if (treenodes[n1].botL > 0) {\n      printf(\"bottom layer acutally %d\\n\", treenodes[n1].botL);\n    }\n    treenodes[n1].topL = 0;\n    treenodes[n1].botL = 0;\n    treenodes[n1].hID  = BIG_INT;\n    treenodes[n1].lID  = BIG_INT;\n    if (n1 >= deg) {\n      printf(\"steiner nodes only have one connection\\n\");\n      exit(0);\n    }\n  }\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/maze_finegrain.h",
    "content": "#include \"galois/DynamicBitset.h\"\n\nstruct grid_lock : public galois::runtime::Lockable {\npublic:\n  int lock;\n  int& getData() { return lock; }\n};\n\nint round_num = 0;\n\nvoid mazeRouteMSMD_finegrain(int iter, int expand, float costHeight,\n                             int ripup_threshold, int mazeedge_Threshold,\n                             Bool Ordering, int cost_type) {\n  // LOCK = 0;\n  galois::StatTimer timer_finegrain(\"fine grain function\", \"fine grain maze\");\n\n  float forange;\n  // std::cout << \" enter here \" << std::endl;\n  // allocate memory for distance and parent and pop_heap\n  h_costTable = (float*)calloc(40 * hCapacity, sizeof(float));\n  v_costTable = (float*)calloc(40 * vCapacity, sizeof(float));\n\n  forange = 40 * hCapacity;\n\n  if (cost_type == 2) {\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity - 1)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity - 1)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  } else {\n\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity)\n        h_costTable[i] =\n            (round_num > 50)\n                ? 
10\n                : costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) +\n                      1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity)\n        v_costTable[i] =\n            (round_num > 50)\n                ? 10\n                : costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) +\n                      1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  }\n\n  /*forange = yGrid*xGrid;\n  for(i=0; i<forange; i++)\n  {\n      pop_heap2[i] = FALSE;\n  }*/\n\n  // Michael\n\n  galois::LargeArray<grid_lock> data;\n  data.allocateInterleaved(yGrid * xGrid);\n  for (int n = 0; n < yGrid * xGrid; n++) {\n    data.constructAt(n);\n  }\n\n  if (Ordering) {\n    StNetOrder();\n    // printf(\"order?\\n\");\n  }\n\n  THREAD_LOCAL_STORAGE thread_local_storage{};\n  // for(nidRPC=0; nidRPC<numValidNets; nidRPC++)//parallelize\n  PerThread_PQ perthread_pq;\n  PerThread_Vec perthread_vec;\n  PRINT = 0;\n  galois::GAccumulator<int> total_ripups;\n  galois::GReduceMax<int> max_ripups;\n  total_ripups.reset();\n  max_ripups.reset();\n\n  // galois::runtime::profileVtune( [&] (void) {\n  /*std::random_device rd;\n  std::mt19937 g(rd());\n  std::shuffle(net_shuffle.begin(), net_shuffle.end(), g);\n\n  galois::do_all(galois::iterate(net_shuffle), */\n  // galois::for_each(galois::iterate(0, numValidNets),\n  //        [&] (const auto nidRPC, auto& ctx)\n  galois::StatTimer timer_newripupcheck(\"ripup\", \"fine grain maze\");\n  galois::StatTimer timer_setupheap(\"setup heap\", \"fine grain maze\");\n  galois::StatTimer timer_traceback(\"trace back\", \"fine grain maze\");\n  galois::StatTimer 
timer_adjusttree(\"adjust tree\", \"fine grain maze\");\n  galois::StatTimer timer_updateusage(\"update usage\", \"fine grain maze\");\n  galois::StatTimer timer_checkroute2dtree(\"checkroute2dtree\",\n                                           \"fine grain maze\");\n  galois::StatTimer timer_init(\"init\", \"fine grain maze\");\n  galois::StatTimer timer_foreach(\"foreach\", \"fine grain maze\");\n  for (int nidRPC = 0; nidRPC < numValidNets; nidRPC++) {\n\n    int netID;\n\n    // maze routing for multi-source, multi-destination\n    Bool hypered, enter;\n    int i, j, deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin, xmax,\n        crossX, crossY, tmpi, min_x, min_y, num_edges;\n    int regionX1, regionX2, regionY1, regionY2;\n    int tmpind, gridsX[XRANGE], gridsY[YRANGE], tmp_gridsX[XRANGE],\n        tmp_gridsY[YRANGE];\n    int endpt1, endpt2, A1, A2, B1, B2, C1, C2, D1, D2, cnt, cnt_n1n2;\n    int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n        edge_C1C2;\n    int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2;\n    int E1x, E1y, E2x, E2y;\n    int origENG, edgeREC;\n\n    TreeEdge *treeedges, *treeedge;\n    TreeNode* treenodes;\n\n    bool* pop_heap2 = thread_local_storage.pop_heap2;\n\n    float** d1    = thread_local_storage.d1_p;\n    bool** HV     = thread_local_storage.HV_p;\n    bool** hyperV = thread_local_storage.hyperV_p;\n    bool** hyperH = thread_local_storage.hyperH_p;\n\n    short** parentX1 = thread_local_storage.parentX1_p;\n    short** parentX3 = thread_local_storage.parentX3_p;\n    short** parentY1 = thread_local_storage.parentY1_p;\n    short** parentY3 = thread_local_storage.parentY3_p;\n\n    int** corrEdge = thread_local_storage.corrEdge_p;\n\n    OrderNetEdge* netEO = thread_local_storage.netEO_p;\n\n    bool** inRegion = thread_local_storage.inRegion_p;\n\n    local_pq pq1 = perthread_pq.get();\n    local_vec v2 = perthread_vec.get();\n\n    /*for(i=0; i<yGrid*xGrid; i++)\n    
{\n        pop_heap2[i] = FALSE;\n    } */\n\n    // memset(inRegion_alloc, 0, xGrid * yGrid * sizeof(bool));\n    /*for(int i=0; i<yGrid; i++)\n    {\n        for(int j=0; j<xGrid; j++)\n            inRegion[i][j] = FALSE;\n    }*/\n    // printf(\"hyperV[153][134]: %d %d %d\\n\", hyperV[153][134],\n    // parentY1[153][134], parentX3[153][134]); printf(\"what is happening?\\n\");\n\n    if (Ordering) {\n      netID = treeOrderCong[nidRPC].treeIndex;\n    } else {\n      netID = nidRPC;\n    }\n\n    deg = sttrees[netID].deg;\n\n    origENG = expand;\n\n    netedgeOrderDec(netID, netEO);\n\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    // loop for all the tree edges (2*deg-3)\n    num_edges = 2 * deg - 3;\n\n    for (edgeREC = 0; edgeREC < num_edges; edgeREC++) {\n\n      edgeID   = netEO[edgeREC].edgeID;\n      treeedge = &(treeedges[edgeID]);\n\n      n1            = treeedge->n1;\n      n2            = treeedge->n2;\n      n1x           = treenodes[n1].x;\n      n1y           = treenodes[n1].y;\n      n2x           = treenodes[n2].x;\n      n2y           = treenodes[n2].y;\n      treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n      if (treeedge->len >\n          mazeedge_Threshold) // only route the non-degraded edges (len>0)\n      {\n        timer_newripupcheck.start();\n        enter = newRipupCheck(treeedge, ripup_threshold, netID, edgeID);\n        timer_newripupcheck.stop();\n\n        // ripup the routing for the edge\n        timer_finegrain.start();\n        if (enter) {\n          // if(netID == 2 && edgeID == 26)\n          //    printf(\"netID %d edgeID %d src %d %d dst %d %d\\n\", netID,\n          //    edgeID, n1x, n1y, n2x, n2y);\n          // pre_length = treeedge->route.routelen;\n          /*for(int i = 0; i < pre_length; i++)\n          {\n              pre_gridsY[i] = treeedge->route.gridsY[i];\n              pre_gridsX[i] = treeedge->route.gridsX[i];\n              //printf(\"i %d x %d y %d\\n\", 
i, pre_gridsX[i], pre_gridsY[i]);\n          }*/\n          timer_init.start();\n          if (n1y <= n2y) {\n            ymin = n1y;\n            ymax = n2y;\n          } else {\n            ymin = n2y;\n            ymax = n1y;\n          }\n\n          if (n1x <= n2x) {\n            xmin = n1x;\n            xmax = n2x;\n          } else {\n            xmin = n2x;\n            xmax = n1x;\n          }\n\n          int enlarge = min(\n              origENG, (iter / 6 + 3) *\n                           treeedge->route\n                               .routelen); // michael, this was global variable\n          regionX1 = max(0, xmin - enlarge);\n          regionX2 = min(xGrid - 1, xmax + enlarge);\n          regionY1 = max(0, ymin - enlarge);\n          regionY2 = min(yGrid - 1, ymax + enlarge);\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              d1[i][j] = BIG_INT;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              HV[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              hyperH[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              hyperV[i][j] = FALSE;\n            }\n          }\n          // TODO: use seperate loops\n\n          // setup heap1, heap2 and initialize d1[][] and d2[][] for all the\n          // grids on the two subtrees\n          timer_setupheap.start();\n          setupHeap(netID, edgeID, pq1, v2, regionX1, regionX2, regionY1,\n                    regionY2, d1, corrEdge, inRegion);\n          timer_setupheap.stop();\n          // TODO: use std priority queue\n          // while loop to find shortest path\n          /*ind1 = (pq1.top().d1_p - &d1[0][0]);\n          curX 
= ind1%xGrid;\n          curY = ind1/xGrid;\n          printf(\"src size: %d dst size: %d\\n\", pq1.size(), v2.size());*/\n          for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++) {\n            pop_heap2[*ii] = TRUE;\n          }\n          std::atomic<int> return_ind1;\n          std::atomic<float> return_dist;\n          return_dist = (float)BIG_INT;\n\n          timer_init.stop();\n          timer_foreach.start();\n          galois::for_each(\n              galois::iterate(pq1),\n              [&](const auto& top, auto& ctx)\n              // while( pop_heap2[ind1]==FALSE) // stop until the grid position\n              // been popped out from both heap1 and heap2\n              {\n                // relax all the adjacent grids within the enlarged region for\n                // source subtree\n\n                int ind1 = top.d1_p - &d1[0][0];\n\n                float d1_push = top.d1_push;\n                int curX      = ind1 % xGrid;\n                int curY      = ind1 / xGrid;\n                int grid      = curY * xGrid + curX;\n\n                float curr_d1 = d1[curY][curX];\n                // if(netID == 2 && edgeID == 26)\n                //    printf(\"netID: %d edgeID:%d curX curY %d %d, d1_push: %f,\n                //    curr_d1: %f\\n\", netID, edgeID, curX, curY, d1_push,\n                //    curr_d1);\n\n                if (d1_push > return_dist + OBIM_delta) {\n                  // ctx.breakLoop();\n                }\n                galois::runtime::acquire(&data[grid],\n                                         galois::MethodFlag::WRITE);\n                if (d1_push == curr_d1 && d1_push < return_dist.load()) {\n                  if (pop_heap2[ind1] != false) {\n                    return_ind1.store(ind1);\n                    return_dist.store(d1_push);\n                  }\n\n                  grid = curY * xGrid + curX - 1;\n                  if (curX > regionX1)\n                    
galois::runtime::acquire(&data[grid],\n                                             galois::MethodFlag::WRITE);\n\n                  grid = curY * xGrid + curX + 1;\n                  if (curX < regionX2)\n                    galois::runtime::acquire(&data[grid],\n                                             galois::MethodFlag::WRITE);\n\n                  grid = (curY - 1) * xGrid + curX;\n                  if (curY > regionY1)\n                    galois::runtime::acquire(&data[grid],\n                                             galois::MethodFlag::WRITE);\n\n                  grid = (curY + 1) * xGrid + curX;\n                  if (curY < regionY2)\n                    galois::runtime::acquire(&data[grid],\n                                             galois::MethodFlag::WRITE);\n\n                  int preX, preY;\n                  if (curr_d1 != 0) {\n                    if (HV[curY][curX]) {\n                      preX = parentX1[curY][curX];\n                      preY = parentY1[curY][curX];\n                    } else {\n                      preX = parentX3[curY][curX];\n                      preY = parentY3[curY][curX];\n                    }\n                  } else {\n                    preX = curX;\n                    preY = curY;\n                  }\n                  // printf(\"pop curY: %d curX: %d d1: %f preX: %d preY: %d\n                  // hyperH: %d hyperV: %d HV: %d return_dist: %f\\n\",\n                  //    curY, curX, curr_d1, preX, preY, hyperH[curY][curX],\n                  //    hyperV[curY][curX], HV[curY][curX], return_dist.load());\n                  float tmp, tmp_cost;\n                  int tmp_grid;\n                  int tmpX, tmpY;\n                  // left\n                  bool tmpH = false;\n                  bool tmpV = false;\n                  if (curX > regionX1) {\n                    grid = curY * (xGrid - 1) + curX - 1;\n                    // printf(\"grid: %d usage: %d red:%d last:%d sum%f\n            
        // %d\\n\",grid, h_edges[grid].usage.load(),\n                    // h_edges[grid].red, h_edges[grid].last_usage, L ,\n                    // h_edges[grid].usage.load() + h_edges[grid].red +\n                    // (int)(L*h_edges[grid].last_usage));\n                    if ((preY == curY) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    } else {\n                      if (curX < regionX2 - 1) {\n                        tmp_grid = curY * (xGrid - 1) + curX;\n                        tmp_cost =\n                            d1[curY][curX + 1] +\n                            h_costTable[h_edges[tmp_grid].usage +\n                                        h_edges[tmp_grid].red +\n                                        (int)(L *\n                                              h_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          // hyperH[curY][curX] = TRUE; //Michael\n                          tmpH = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    }\n                    // if(LOCK)  h_edges[grid].releaseLock();\n                    tmpX = curX - 1; // the left neighbor\n\n                    /*if(d1[curY][tmpX]>=BIG_INT) // left neighbor not been put\n                    into heap1\n                    {\n                        d1[curY][tmpX] = tmp;\n                        parentX3[curY][tmpX] = curX;\n                        parentY3[curY][tmpX] = curY;\n                        HV[curY][tmpX] = FALSE;\n                        
pq1.push(&(d1[curY][tmpX]));\n                    }\n                    else */\n                    // galois::runtime::acquire(&data[curY * yGrid + tmpX],\n                    // galois::MethodFlag::WRITE);\n                    if (d1[curY][tmpX] > tmp &&\n                        tmp < return_dist) // left neighbor been put into heap1\n                                           // but needs update\n                    {\n                      d1[curY][tmpX]       = tmp;\n                      parentX3[curY][tmpX] = curX;\n                      parentY3[curY][tmpX] = curY;\n                      HV[curY][tmpX]       = FALSE;\n                      // pq1.push({&(d1[curY][tmpX]), tmp});\n                      // pq_grid grid_push = {&(d1[curY][tmpX]), tmp};\n                      ctx.push(pq_grid(&(d1[curY][tmpX]), tmp));\n                      // printf(\"left push Y: %d X: %d tmp: %f HV: false hyperH:\n                      // %d\\n\", curY, tmpX, tmp, true);\n                    }\n                  }\n                  // right\n                  if (curX < regionX2) {\n                    grid = curY * (xGrid - 1) + curX;\n                    // printf(\"grid: %d usage: %d red:%d last:%d sum%f\n                    // %d\\n\",grid, h_edges[grid].usage.load(),\n                    // h_edges[grid].red, h_edges[grid].last_usage, L ,\n                    // h_edges[grid].usage.load() + h_edges[grid].red +\n                    // (int)(L*h_edges[grid].last_usage));\n                    if ((preY == curY) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    } else {\n                      if (curX > regionX1 + 1) {\n                        tmp_grid = curY * (xGrid - 1) + curX - 1;\n                        tmp_cost =\n                            
d1[curY][curX - 1] +\n                            h_costTable[h_edges[tmp_grid].usage +\n                                        h_edges[tmp_grid].red +\n                                        (int)(L *\n                                              h_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          // hyperH[curY][curX] = TRUE;\n                          tmpH = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    }\n                    // if(LOCK) h_edges[grid].releaseLock();\n                    tmpX = curX + 1; // the right neighbor\n\n                    /*if(d1[curY][tmpX]>=BIG_INT) // right neighbor not been put\n                    into heap1\n                    {\n                        d1[curY][tmpX] = tmp;\n                        parentX3[curY][tmpX] = curX;\n                        parentY3[curY][tmpX] = curY;\n                        HV[curY][tmpX] = FALSE;\n                        pq1.push(&(d1[curY][tmpX]));\n\n                    }\n                    else */\n                    // galois::runtime::acquire(&data[curY * yGrid + tmpX],\n                    // galois::MethodFlag::WRITE);\n                    if (d1[curY][tmpX] > tmp &&\n                        tmp < return_dist) // right neighbor been put into heap1\n                                           // but needs update\n                    {\n                      d1[curY][tmpX]       = tmp;\n                      parentX3[curY][tmpX] = curX;\n                      parentY3[curY][tmpX] = curY;\n                      HV[curY][tmpX]       = FALSE;\n                      // pq1.push({&(d1[curY][tmpX]), tmp});\n                      // pq_grid grid_push = {&(d1[curY][tmpX]), 
tmp};\n                      ctx.push(pq_grid(&(d1[curY][tmpX]), tmp));\n                      // printf(\"right push Y: %d X: %d tmp: %f HV: false\n                      // hyperH: %d\\n\", curY, tmpX, tmp, true);\n                    }\n                  }\n                  hyperH[curY][curX] = tmpH;\n\n                  // bottom\n                  if (curY > regionY1) {\n                    grid = (curY - 1) * xGrid + curX;\n                    // printf(\"grid: %d usage: %d red:%d last:%d sum%f\n                    // %d\\n\",grid, v_edges[grid].usage.load(),\n                    // v_edges[grid].red, v_edges[grid].last_usage, L ,\n                    // v_edges[grid].usage.load() + v_edges[grid].red +\n                    // (int)(L*v_edges[grid].last_usage));\n                    if ((preX == curX) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    } else {\n                      if (curY < regionY2 - 1) {\n                        tmp_grid = curY * xGrid + curX;\n                        tmp_cost =\n                            d1[curY + 1][curX] +\n                            v_costTable[v_edges[tmp_grid].usage +\n                                        v_edges[tmp_grid].red +\n                                        (int)(L *\n                                              v_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          // hyperV[curY][curX] = TRUE;\n                          tmpV = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n             
       }\n                    // if(LOCK) v_edges[grid].releaseLock();\n                    tmpY = curY - 1; // the bottom neighbor\n\n                    /*if(d1[tmpY][curX]>=BIG_INT) // bottom neighbor not been\n                    put into heap1\n                    {\n                        d1[tmpY][curX] = tmp;\n                        parentX1[tmpY][curX] = curX;\n                        parentY1[tmpY][curX] = curY;\n                        HV[tmpY][curX] = TRUE;\n                        pq1.push(&(d1[tmpY][curX]));\n\n                    }\n                    else */\n                    // galois::runtime::acquire(&data[tmpY * yGrid + curX],\n                    // galois::MethodFlag::WRITE);\n                    if (d1[tmpY][curX] > tmp &&\n                        tmp < return_dist) // bottom neighbor been put into\n                                           // heap1 but needs update\n                    {\n                      d1[tmpY][curX]       = tmp;\n                      parentX1[tmpY][curX] = curX;\n                      parentY1[tmpY][curX] = curY;\n                      HV[tmpY][curX]       = TRUE;\n                      // pq1.push({&(d1[tmpY][curX]), tmp});\n                      // pq_grid grid_push = {&(d1[tmpY][curX]), tmp};\n                      ctx.push(pq_grid(&(d1[tmpY][curX]), tmp));\n                      // printf(\"bottom push Y: %d X: %d tmp: %f HV: false\n                      // hyperH: %d\\n\", tmpY, curX, tmp, true);\n                    }\n                  }\n                  // top\n                  if (curY < regionY2) {\n                    grid = curY * xGrid + curX;\n                    // printf(\"grid: %d usage: %d red:%d last:%d sum%f\n                    // %d\\n\",grid, v_edges[grid].usage.load(),\n                    // v_edges[grid].red, v_edges[grid].last_usage, L ,\n                    // v_edges[grid].usage.load() + v_edges[grid].red +\n                    // (int)(L*v_edges[grid].last_usage));\n           
         if ((preX == curX) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    } else {\n                      if (curY > regionY1 + 1) {\n                        tmp_grid = (curY - 1) * xGrid + curX;\n                        tmp_cost =\n                            d1[curY - 1][curX] +\n                            v_costTable[v_edges[tmp_grid].usage +\n                                        v_edges[tmp_grid].red +\n                                        (int)(L *\n                                              v_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          // hyperV[curY][curX] = TRUE;\n                          tmpV = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    }\n                    // if(LOCK) v_edges[grid].releaseLock();\n                    tmpY = curY + 1; // the top neighbor\n\n                    /*if(d1[tmpY][curX]>=BIG_INT) // top neighbor not been put\n                    into heap1\n                    {\n                        d1[tmpY][curX] = tmp;\n                        parentX1[tmpY][curX] = curX;\n                        parentY1[tmpY][curX] = curY;\n                        HV[tmpY][curX] = TRUE;\n                        pq1.push(&(d1[tmpY][curX]));\n                    }\n                    else*/\n                    // galois::runtime::acquire(&data[tmpY * yGrid + curX],\n                    // galois::MethodFlag::WRITE);\n                    if (d1[tmpY][curX] > tmp &&\n                        tmp < 
return_dist) // top neighbor been put into heap1\n                                           // but needs update\n                    {\n                      d1[tmpY][curX]       = tmp;\n                      parentX1[tmpY][curX] = curX;\n                      parentY1[tmpY][curX] = curY;\n                      HV[tmpY][curX]       = TRUE;\n                      // pq_grid grid_push = {&(d1[tmpY][curX]), tmp};\n                      ctx.push(pq_grid(&(d1[tmpY][curX]), tmp));\n                      // printf(\"top push Y: %d X: %d tmp: %f HV: false hyperH:\n                      // %d\\n\", tmpY, curX, tmp, true);\n                      // pq1.push({&(d1[tmpY][curX]), tmp});\n                    }\n                  }\n                  hyperV[curY][curX] = tmpV;\n                }\n              },\n              galois::wl<galois::worklists::ParaMeter<>>(),\n              // galois::wl<PSChunk>(),\n              // galois::wl<OBIM>(RequestIndexer),\n              // galois::chunk_size<MAZE_CHUNK_SIZE>()\n              // galois::parallel_break(),\n              // galois::steal(),\n              galois::loopname(\"fine_grain\"));\n\n          timer_foreach.stop();\n\n          for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++)\n            pop_heap2[*ii] = FALSE;\n\n          crossX = return_ind1 % xGrid;\n          crossY = return_ind1 / xGrid;\n\n          cnt      = 0;\n          int curX = crossX;\n          int curY = crossY;\n          int tmpX, tmpY;\n          // if(netID == 2 && edgeID == 26)\n          //    printf(\"crossX %d crossY %d return_d: %f\\n\", crossX, crossY,\n          //    return_dist.load());\n          timer_traceback.start();\n          while (d1[curY][curX] != 0) // loop until reach subtree1\n          {\n            // if(cnt > 1000 && cnt < 1100)\n            //    printf(\"Y: %d X: %d hyperH: %d hyperV: %d HV: %d d1: %f\\n\",\n            //    curY, curX, hyperH[curY][curX], hyperV[curY][curX],\n            //    
HV[curY][curX], d1[curY][curX]);\n\n            hypered = FALSE;\n            if (cnt != 0) {\n              if (curX != tmpX && hyperH[curY][curX]) {\n                curX    = 2 * curX - tmpX;\n                hypered = TRUE;\n              }\n              // printf(\"hyperV[153][134]: %d\\n\", hyperV[curY][curX]);\n              if (curY != tmpY && hyperV[curY][curX]) {\n                curY    = 2 * curY - tmpY;\n                hypered = TRUE;\n              }\n            }\n            tmpX = curX;\n            tmpY = curY;\n            if (!hypered) {\n              if (HV[tmpY][tmpX]) {\n                curY = parentY1[tmpY][tmpX];\n              } else {\n                curX = parentX3[tmpY][tmpX];\n              }\n            }\n\n            tmp_gridsX[cnt] = curX;\n            tmp_gridsY[cnt] = curY;\n            cnt++;\n          }\n          // reverse the grids on the path\n\n          for (i = 0; i < cnt; i++) {\n            tmpind    = cnt - 1 - i;\n            gridsX[i] = tmp_gridsX[tmpind];\n            gridsY[i] = tmp_gridsY[tmpind];\n          }\n          // add the connection point (crossX, crossY)\n          gridsX[cnt] = crossX;\n          gridsY[cnt] = crossY;\n          cnt++;\n\n          curX     = crossX;\n          curY     = crossY;\n          cnt_n1n2 = cnt;\n\n          // change the tree structure according to the new routing for the tree\n          // edge find E1 and E2, and the endpoints of the edges they are on\n          E1x = gridsX[0];\n          E1y = gridsY[0];\n          E2x = gridsX[cnt_n1n2 - 1];\n          E2y = gridsY[cnt_n1n2 - 1];\n\n          edge_n1n2 = edgeID;\n\n          timer_traceback.stop();\n\n          // if(netID == 14628)\n          //    printf(\"netID %d edgeID %d src %d %d dst %d %d routelen: %d\\n\",\n          //    netID, edgeID, E1x, E1y, E2x, E2y, cnt_n1n2);\n          // (1) consider subtree1\n          timer_adjusttree.start();\n          if (n1 >= deg && (E1x != n1x || E1y != n1y))\n      
    // n1 is not a pin and E1!=n1, then make change to subtree1,\n          // otherwise, no change to subtree1\n          {\n            // find the endpoints of the edge E1 is on\n            endpt1 = treeedges[corrEdge[E1y][E1x]].n1;\n            endpt2 = treeedges[corrEdge[E1y][E1x]].n2;\n\n            // find A1, A2 and edge_n1A1, edge_n1A2\n            if (treenodes[n1].nbr[0] == n2) {\n              A1        = treenodes[n1].nbr[1];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[1];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else if (treenodes[n1].nbr[1] == n2) {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[1];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[1];\n            }\n\n            if (endpt1 == n1 || endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n            {\n              // if E1 is on (n1, A2), switch A1 and A2 so that E1 is always on\n              // (n1, A1)\n              if (endpt1 == A2 || endpt2 == A2) {\n                tmpi      = A1;\n                A1        = A2;\n                A2        = tmpi;\n                tmpi      = edge_n1A1;\n                edge_n1A1 = edge_n1A2;\n                edge_n1A2 = tmpi;\n              }\n\n              // update route for edge (n1, A1), (n1, A2)\n              updateRouteType1(treenodes, n1, A1, A2, E1x, E1y, treeedges,\n                               edge_n1A1, edge_n1A2);\n              // update position for n1\n              treenodes[n1].x = E1x;\n              treenodes[n1].y = E1y;\n            }    // if E1 is on (n1, A1) or (n1, A2)\n            else // E1 is not on (n1, A1) or (n1, A2), but on (C1, 
C2)\n            {\n              C1        = endpt1;\n              C2        = endpt2;\n              edge_C1C2 = corrEdge[E1y][E1x];\n\n              // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n              updateRouteType2(treenodes, n1, A1, A2, C1, C2, E1x, E1y,\n                               treeedges, edge_n1A1, edge_n1A2, edge_C1C2);\n              // update position for n1\n              treenodes[n1].x = E1x;\n              treenodes[n1].y = E1y;\n              // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2), (C1,\n              // C2)->(A1, A2)\n              edge_n1C1               = edge_n1A1;\n              treeedges[edge_n1C1].n1 = C1;\n              treeedges[edge_n1C1].n2 = n1;\n              edge_n1C2               = edge_n1A2;\n              treeedges[edge_n1C2].n1 = n1;\n              treeedges[edge_n1C2].n2 = C2;\n              edge_A1A2               = edge_C1C2;\n              treeedges[edge_A1A2].n1 = A1;\n              treeedges[edge_A1A2].n2 = A2;\n              // update nbr and edge for 5 nodes n1, A1, A2, C1, C2\n              // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n              treenodes[n1].nbr[0]  = n2;\n              treenodes[n1].edge[0] = edge_n1n2;\n              treenodes[n1].nbr[1]  = C1;\n              treenodes[n1].edge[1] = edge_n1C1;\n              treenodes[n1].nbr[2]  = C2;\n              treenodes[n1].edge[2] = edge_n1C2;\n              // A1's nbr n1->A2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A1].nbr[i] == n1) {\n                  treenodes[A1].nbr[i]  = A2;\n                  treenodes[A1].edge[i] = edge_A1A2;\n                  break;\n                }\n              }\n              // A2's nbr n1->A1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A2].nbr[i] == n1) {\n                  treenodes[A2].nbr[i]  = A1;\n                  treenodes[A2].edge[i] = edge_A1A2;\n                  break;\n                }\n              }\n      
        // C1's nbr C2->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C1].nbr[i] == C2) {\n                  treenodes[C1].nbr[i]  = n1;\n                  treenodes[C1].edge[i] = edge_n1C1;\n                  break;\n                }\n              }\n              // C2's nbr C1->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C2].nbr[i] == C1) {\n                  treenodes[C2].nbr[i]  = n1;\n                  treenodes[C2].edge[i] = edge_n1C2;\n                  break;\n                }\n              }\n\n            } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n          }   // n1 is not a pin and E1!=n1\n\n          // (2) consider subtree2\n\n          if (n2 >= deg && (E2x != n2x || E2y != n2y))\n          // n2 is not a pin and E2!=n2, then make change to subtree2,\n          // otherwise, no change to subtree2\n          {\n            // find the endpoints of the edge E1 is on\n            endpt1 = treeedges[corrEdge[E2y][E2x]].n1;\n            endpt2 = treeedges[corrEdge[E2y][E2x]].n2;\n\n            // find B1, B2\n            if (treenodes[n2].nbr[0] == n1) {\n              B1        = treenodes[n2].nbr[1];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = treenodes[n2].edge[1];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else if (treenodes[n2].nbr[1] == n1) {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[1];\n              edge_n2B1 = treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[1];\n            }\n\n            if (endpt1 == n2 || endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n            {\n              // if E2 is on (n2, B2), switch B1 
and B2 so that E2 is always on\n              // (n2, B1)\n              if (endpt1 == B2 || endpt2 == B2) {\n                tmpi      = B1;\n                B1        = B2;\n                B2        = tmpi;\n                tmpi      = edge_n2B1;\n                edge_n2B1 = edge_n2B2;\n                edge_n2B2 = tmpi;\n              }\n\n              // update route for edge (n2, B1), (n2, B2)\n              updateRouteType1(treenodes, n2, B1, B2, E2x, E2y, treeedges,\n                               edge_n2B1, edge_n2B2);\n\n              // update position for n2\n              treenodes[n2].x = E2x;\n              treenodes[n2].y = E2y;\n            }    // if E2 is on (n2, B1) or (n2, B2)\n            else // E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n            {\n              D1        = endpt1;\n              D2        = endpt2;\n              edge_D1D2 = corrEdge[E2y][E2x];\n\n              // update route for edge (n2, D1), (n2, D2) and (B1, B2)\n              updateRouteType2(treenodes, n2, B1, B2, D1, D2, E2x, E2y,\n                               treeedges, edge_n2B1, edge_n2B2, edge_D1D2);\n              // update position for n2\n              treenodes[n2].x = E2x;\n              treenodes[n2].y = E2y;\n              // update 3 edges (n2, B1)->(D1, n2), (n2, B2)->(n2, D2), (D1,\n              // D2)->(B1, B2)\n              edge_n2D1               = edge_n2B1;\n              treeedges[edge_n2D1].n1 = D1;\n              treeedges[edge_n2D1].n2 = n2;\n              edge_n2D2               = edge_n2B2;\n              treeedges[edge_n2D2].n1 = n2;\n              treeedges[edge_n2D2].n2 = D2;\n              edge_B1B2               = edge_D1D2;\n              treeedges[edge_B1B2].n1 = B1;\n              treeedges[edge_B1B2].n2 = B2;\n              // update nbr and edge for 5 nodes n2, B1, B2, D1, D2\n              // n1's nbr (n1, B1, B2)->(n1, D1, D2)\n              treenodes[n2].nbr[0]  = n1;\n              treenodes[n2].edge[0] = 
edge_n1n2;\n              treenodes[n2].nbr[1]  = D1;\n              treenodes[n2].edge[1] = edge_n2D1;\n              treenodes[n2].nbr[2]  = D2;\n              treenodes[n2].edge[2] = edge_n2D2;\n              // B1's nbr n2->B2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[B1].nbr[i] == n2) {\n                  treenodes[B1].nbr[i]  = B2;\n                  treenodes[B1].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // B2's nbr n2->B1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[B2].nbr[i] == n2) {\n                  treenodes[B2].nbr[i]  = B1;\n                  treenodes[B2].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // D1's nbr D2->n2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[D1].nbr[i] == D2) {\n                  treenodes[D1].nbr[i]  = n2;\n                  treenodes[D1].edge[i] = edge_n2D1;\n                  break;\n                }\n              }\n              // D2's nbr D1->n2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[D2].nbr[i] == D1) {\n                  treenodes[D2].nbr[i]  = n2;\n                  treenodes[D2].edge[i] = edge_n2D2;\n                  break;\n                }\n              }\n            } // else E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n          }   // n2 is not a pin and E2!=n2\n\n          // update route for edge (n1, n2) and edge usage\n\n          // printf(\"update route? 
%d %d\\n\", netID, num_edges);\n          if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n            free(treeedges[edge_n1n2].route.gridsX);\n            free(treeedges[edge_n1n2].route.gridsY);\n          }\n          treeedges[edge_n1n2].route.gridsX =\n              (short*)calloc(cnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.gridsY =\n              (short*)calloc(cnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.type     = MAZEROUTE;\n          treeedges[edge_n1n2].route.routelen = cnt_n1n2 - 1;\n          treeedges[edge_n1n2].len = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n          treeedges[edge_n1n2].n_ripups += 1;\n          total_ripups += 1;\n          max_ripups.update(treeedges[edge_n1n2].n_ripups);\n\n          for (i = 0; i < cnt_n1n2; i++) {\n            // printf(\"cnt_n1n2: %d\\n\", cnt_n1n2);\n            treeedges[edge_n1n2].route.gridsX[i] = gridsX[i];\n            treeedges[edge_n1n2].route.gridsY[i] = gridsY[i];\n          }\n          // std::cout << \" adjsut tree\" << std::endl;\n          timer_adjusttree.stop();\n\n          // update edge usage\n\n          timer_updateusage.start();\n          for (i = 0; i < cnt_n1n2 - 1; i++) {\n            if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n            {\n              min_y = min(gridsY[i], gridsY[i + 1]);\n              // v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n              // galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage, (short\n              // unsigned)1);\n              v_edges[min_y * xGrid + gridsX[i]].usage.fetch_add(\n                  (short int)1, std::memory_order_relaxed);\n            } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n            {\n              min_x = min(gridsX[i], gridsX[i + 1]);\n              // h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n              // galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n              // (short unsigned)1);\n              h_edges[gridsY[i] 
* (xGrid - 1) + min_x].usage.fetch_add(\n                  (short int)1, std::memory_order_relaxed);\n            }\n          }\n          timer_updateusage.stop();\n          timer_checkroute2dtree.start();\n          if (checkRoute2DTree(netID)) {\n            reInitTree(netID);\n            return;\n          }\n          timer_checkroute2dtree.stop();\n        } // congested route, if(enter)\n        timer_finegrain.stop();\n      } // only route the non-degraded edges (len>0)\n    }   // iterate on edges of a net\n  }\n\n  printf(\"total ripups: %d max ripups: %d\\n\", total_ripups.reduce(),\n         max_ripups.reduce());\n  //}, \"mazeroute vtune function\");\n  free(h_costTable);\n  free(v_costTable);\n}\n\nvoid mazeRouteMSMD_finegrain_spinlock(int iter, int expand, float costHeight,\n                                      int ripup_threshold,\n                                      int mazeedge_Threshold, Bool Ordering,\n                                      int cost_type) {\n  // LOCK = 0;\n  galois::StatTimer timer_finegrain(\"fine grain maze\", \"fine grain maze\");\n\n  float forange;\n  // allocate memory for distance and parent and pop_heap\n  h_costTable = (float*)calloc(40 * hCapacity, sizeof(float));\n  v_costTable = (float*)calloc(40 * vCapacity, sizeof(float));\n\n  forange = 40 * hCapacity;\n\n  if (cost_type == 2) {\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity - 1)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity - 1)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / 
(exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - vCapacity);\n    }\n  } else {\n\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - vCapacity);\n    }\n  }\n\n  // cout << \" i = vCap:\" << v_costTable[vCapacity - 1] << \" \"\n  //     << v_costTable[vCapacity] << \" \" << v_costTable[vCapacity + 1] << endl;\n\n  /*forange = yGrid*xGrid;\n  for(i=0; i<forange; i++)\n  {\n      pop_heap2[i] = FALSE;\n  } //Michael*/\n\n  galois::LargeArray<galois::substrate::SimpleLock> data;\n  data.allocateInterleaved(xGrid * yGrid);\n\n  galois::substrate::SimpleLock return_lock;\n  if (Ordering) {\n    StNetOrder();\n    // printf(\"order?\\n\");\n  }\n\n  THREAD_LOCAL_STORAGE thread_local_storage{};\n  // for(nidRPC=0; nidRPC<numValidNets; nidRPC++)//parallelize\n  PerThread_PQ perthread_pq;\n  PerThread_Vec perthread_vec;\n  PRINT = 0;\n  galois::GAccumulator<int> total_ripups;\n  galois::GReduceMax<int> max_ripups;\n  total_ripups.reset();\n  max_ripups.reset();\n\n  // galois::runtime::profileVtune( [&] (void) {\n  /*std::random_device rd;\n  std::mt19937 g(rd());\n  std::shuffle(net_shuffle.begin(), net_shuffle.end(), g);\n\n  galois::do_all(galois::iterate(net_shuffle), */\n  // galois::for_each(galois::iterate(0, numValidNets),\n  //        [&] (const auto nidRPC, 
auto& ctx)\n  /*galois::StatTimer timer_newripupcheck(\"ripup\", \"fine grain maze\");\n  galois::StatTimer timer_setupheap(\"setup heap\", \"fine grain maze\");\n  galois::StatTimer timer_traceback(\"trace back\", \"fine grain maze\");\n  galois::StatTimer timer_adjusttree(\"adjust tree\", \"fine grain maze\");\n  galois::StatTimer timer_updateusage(\"update usage\", \"fine grain maze\");\n  galois::StatTimer timer_checkroute2dtree(\"checkroute2dtree\", \"fine grain\n  maze\"); galois::StatTimer timer_init(\"init\", \"fine grain maze\");\n  galois::StatTimer timer_foreach(\"foreach\", \"fine grain maze\");\n  galois::StatTimer timer_init_int(\"big int initialize\", \"fine grain maze\");*/\n  float acc_dist = 0;\n  int acc_length = 0;\n  int acc_cnt    = 0;\n  float max_dist = 0;\n  int max_length = 0;\n\n  for (int nidRPC = 0; nidRPC < numValidNets; nidRPC++) {\n\n    int netID;\n\n    // maze routing for multi-source, multi-destination\n    Bool hypered, enter;\n    int i, j, deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin, xmax,\n        crossX, crossY, tmpi, min_x, min_y, num_edges;\n    int regionX1, regionX2, regionY1, regionY2;\n    int tmpind, gridsX[XRANGE], gridsY[YRANGE], tmp_gridsX[XRANGE],\n        tmp_gridsY[YRANGE];\n    int endpt1, endpt2, A1, A2, B1, B2, C1, C2, D1, D2, cnt, cnt_n1n2;\n    int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n        edge_C1C2;\n    int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2;\n    int E1x, E1y, E2x, E2y;\n    int origENG, edgeREC;\n\n    TreeEdge *treeedges, *treeedge;\n    TreeNode* treenodes;\n\n    bool* pop_heap2 = thread_local_storage.pop_heap2;\n\n    float** d1    = thread_local_storage.d1_p;\n    bool** HV     = thread_local_storage.HV_p;\n    bool** hyperV = thread_local_storage.hyperV_p;\n    bool** hyperH = thread_local_storage.hyperH_p;\n\n    short** parentX1 = thread_local_storage.parentX1_p;\n    short** parentX3 = 
thread_local_storage.parentX3_p;\n    short** parentY1 = thread_local_storage.parentY1_p;\n    short** parentY3 = thread_local_storage.parentY3_p;\n\n    int** corrEdge = thread_local_storage.corrEdge_p;\n\n    OrderNetEdge* netEO = thread_local_storage.netEO_p;\n\n    bool** inRegion = thread_local_storage.inRegion_p;\n\n    local_pq pq1 = perthread_pq.get();\n    local_vec v2 = perthread_vec.get();\n\n    /*for(i=0; i<yGrid*xGrid; i++)\n    {\n        pop_heap2[i] = FALSE;\n    } */\n\n    // memset(inRegion_alloc, 0, xGrid * yGrid * sizeof(bool));\n    /*for(int i=0; i<yGrid; i++)\n    {\n        for(int j=0; j<xGrid; j++)\n            inRegion[i][j] = FALSE;\n    }*/\n    // printf(\"hyperV[153][134]: %d %d %d\\n\", hyperV[153][134],\n    // parentY1[153][134], parentX3[153][134]); printf(\"what is happening?\\n\");\n\n    if (Ordering) {\n      netID = treeOrderCong[nidRPC].treeIndex;\n    } else {\n      netID = nidRPC;\n    }\n\n    deg = sttrees[netID].deg;\n\n    origENG = expand;\n\n    netedgeOrderDec(netID, netEO);\n\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    // loop for all the tree edges (2*deg-3)\n    num_edges = 2 * deg - 3;\n\n    for (edgeREC = 0; edgeREC < num_edges; edgeREC++) {\n\n      edgeID   = netEO[edgeREC].edgeID;\n      treeedge = &(treeedges[edgeID]);\n\n      n1            = treeedge->n1;\n      n2            = treeedge->n2;\n      n1x           = treenodes[n1].x;\n      n1y           = treenodes[n1].y;\n      n2x           = treenodes[n2].x;\n      n2y           = treenodes[n2].y;\n      treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n      if (treeedge->len >\n          mazeedge_Threshold) // only route the non-degraded edges (len>0)\n      {\n        // timer_newripupcheck.start();\n        enter = newRipupCheck(treeedge, ripup_threshold, netID, edgeID);\n        // timer_newripupcheck.stop();\n\n        // ripup the routing for the edge\n        timer_finegrain.start();\n        if 
(enter) {\n          // if(netID == 2 && edgeID == 26)\n          //    printf(\"netID %d edgeID %d src %d %d dst %d %d\\n\", netID,\n          //    edgeID, n1x, n1y, n2x, n2y);\n          // pre_length = treeedge->route.routelen;\n          /*for(int i = 0; i < pre_length; i++)\n          {\n              pre_gridsY[i] = treeedge->route.gridsY[i];\n              pre_gridsX[i] = treeedge->route.gridsX[i];\n              //printf(\"i %d x %d y %d\\n\", i, pre_gridsX[i], pre_gridsY[i]);\n          }*/\n          // timer_init.start();\n          if (n1y <= n2y) {\n            ymin = n1y;\n            ymax = n2y;\n          } else {\n            ymin = n2y;\n            ymax = n1y;\n          }\n\n          if (n1x <= n2x) {\n            xmin = n1x;\n            xmax = n2x;\n          } else {\n            xmin = n2x;\n            xmax = n1x;\n          }\n\n          int enlarge = min(\n              origENG, (iter / 6 + 3) *\n                           treeedge->route\n                               .routelen); // michael, this was global variable\n          regionX1 = max(0, xmin - enlarge);\n          regionX2 = min(xGrid - 1, xmax + enlarge);\n          regionY1 = max(0, ymin - enlarge);\n          regionY2 = min(yGrid - 1, ymax + enlarge);\n          // std::cout << \"region size\" << regionWidth << \", \" << regionHeight\n          // << std::endl;\n          // initialize d1[][] and d2[][] as BIG_INT\n          // timer_init_int.start();\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              d1[i][j] = BIG_INT;\n\n              /*d2[i][j] = BIG_INT;\n              hyperH[i][j] = FALSE;\n              hyperV[i][j] = FALSE;*/\n            }\n          }\n          // timer_init_int.stop();\n          // memset(hyperH, 0, xGrid * yGrid * sizeof(bool));\n          // memset(hyperV, 0, xGrid * yGrid * sizeof(bool));\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; 
j <= regionX2; j++) {\n              HV[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              hyperH[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              hyperV[i][j] = FALSE;\n            }\n          }\n          // TODO: use seperate loops\n\n          // setup heap1, heap2 and initialize d1[][] and d2[][] for all the\n          // grids on the two subtrees\n          // timer_setupheap.start();\n          setupHeap(netID, edgeID, pq1, v2, regionX1, regionX2, regionY1,\n                    regionY2, d1, corrEdge, inRegion);\n          // timer_setupheap.stop();\n          // TODO: use std priority queue\n          // while loop to find shortest path\n          /*ind1 = (pq1.top().d1_p - &d1[0][0]);\n          curX = ind1%xGrid;\n          curY = ind1/xGrid;\n          printf(\"src size: %d dst size: %d\\n\", pq1.size(), v2.size());*/\n          for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++) {\n            pop_heap2[*ii] = TRUE;\n          }\n          std::atomic<int> return_ind1;\n          std::atomic<float> return_dist;\n          return_dist = (float)BIG_INT;\n\n          galois::for_each(\n              galois::iterate(pq1),\n              [&](const auto& top, auto& ctx)\n              // while( pop_heap2[ind1]==FALSE) // stop until the grid position\n              // been popped out from both heap1 and heap2\n              {\n                // relax all the adjacent grids within the enlarged region for\n                // source subtree\n\n                int ind1 = top.d1_p - &d1[0][0];\n\n                int curX = ind1 % xGrid;\n                int curY = ind1 / xGrid;\n                int grid = curY * xGrid + curX;\n\n                float curr_d1 = d1[curY][curX];\n                float d1_push = top.d1_push;\n\n    
            if (d1_push > return_dist + OBIM_delta) {\n                  // ctx.breakLoop();\n                }\n                if (d1_push == curr_d1 && d1_push < return_dist.load()) {\n                  if (pop_heap2[ind1] != false) {\n                    // if(netID == 2 && edgeID == 26)\n                    //    printf(\"reach! curX curY %d %d, d1_push: %f, curr_d1:\n                    //    %f return_d: %f\\n\", curX, curY, d1_push, curr_d1,\n                    //    return_dist.load());\n                    return_lock.lock();\n                    if (d1_push < return_dist.load()) {\n                      return_ind1.store(ind1);\n                      return_dist.store(d1_push);\n                    } else {\n                      return_lock.unlock();\n                      return;\n                    }\n                    return_lock.unlock();\n                  }\n                  // curr_d1 = d1_push;\n\n                  int preX = curX, preY = curY;\n                  if (curr_d1 != 0) {\n                    if (HV[curY][curX]) {\n                      preX = parentX1[curY][curX];\n                      preY = parentY1[curY][curX];\n                    } else {\n                      preX = parentX3[curY][curX];\n                      preY = parentY3[curY][curX];\n                    }\n                  }\n                  // printf(\"pop curY: %d curX: %d d1: %f preX: %d preY: %d\n                  // hyperH: %d hyperV: %d HV: %d return_dist: %f\\n\",\n                  //    curY, curX, curr_d1, preX, preY, hyperH[curY][curX],\n                  //    hyperV[curY][curX], HV[curY][curX], return_dist.load());\n                  float tmp = 0.f, tmp_cost = 0.f;\n                  int tmp_grid = 0;\n                  int tmpX = 0, tmpY = 0;\n                  // left\n                  bool tmpH = false;\n                  bool tmpV = false;\n\n                  if (curX > regionX1) {\n                    grid = curY * (xGrid - 1) + curX - 
1;\n\n                    tmpX = curX - 1; // the left neighbor\n                    if ((preY == curY) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    } else {\n                      if (curX < regionX2 - 1) {\n                        tmp_grid = curY * (xGrid - 1) + curX;\n                        tmp_cost =\n                            d1[curY][curX + 1] +\n                            h_costTable[h_edges[tmp_grid].usage +\n                                        h_edges[tmp_grid].red +\n                                        (int)(L *\n                                              h_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA &&\n                            d1[curY][tmpX] >\n                                tmp_cost +\n                                    h_costTable[h_edges[grid].usage +\n                                                h_edges[grid].red +\n                                                (int)(L * h_edges[grid]\n                                                              .last_usage)]) {\n                          tmpH = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    }\n\n                    if (d1[curY][tmpX] > tmp && tmp < return_dist) {\n\n                      data[curY * xGrid + curX - 1].lock();\n                      if (d1[curY][tmpX] > tmp &&\n                          tmp < return_dist) // left neighbor been put into\n                                             // heap1 but needs update\n                      {\n            
            d1[curY][tmpX]       = tmp;\n                        parentX3[curY][tmpX] = curX;\n                        parentY3[curY][tmpX] = curY;\n                        HV[curY][tmpX]       = FALSE;\n                        ctx.push(pq_grid(&(d1[curY][tmpX]), tmp));\n                      }\n                      data[curY * xGrid + curX - 1].unlock();\n                    }\n                  }\n                  // right\n\n                  if (curX < regionX2) {\n                    grid = curY * (xGrid - 1) + curX;\n                    tmpX = curX + 1; // the right neighbor\n                    if ((preY == curY) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    } else {\n                      if (curX > regionX1 + 1) {\n                        tmp_grid = curY * (xGrid - 1) + curX - 1;\n                        tmp_cost =\n                            d1[curY][curX - 1] +\n                            h_costTable[h_edges[tmp_grid].usage +\n                                        h_edges[tmp_grid].red +\n                                        (int)(L *\n                                              h_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA &&\n                            d1[curY][tmpX] >\n                                tmp_cost +\n                                    h_costTable[h_edges[grid].usage +\n                                                h_edges[grid].red +\n                                                (int)(L * h_edges[grid]\n                                                              .last_usage)]) {\n                          // hyperH[curY][curX] = TRUE;\n                          tmpH = true;\n                        }\n                      }\n                      tmp =\n         
                 curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    }\n\n                    /*if(d1[curY][tmpX]>=BIG_INT) // right neighbor not been put\n                    into heap1\n                    {\n                        d1[curY][tmpX] = tmp;\n                        parentX3[curY][tmpX] = curX;\n                        parentY3[curY][tmpX] = curY;\n                        HV[curY][tmpX] = FALSE;\n                        pq1.push(&(d1[curY][tmpX]));\n\n                    }\n                    else */\n                    // galois::runtime::acquire(&data[curY * yGrid + tmpX],\n                    // galois::MethodFlag::WRITE);\n                    if (d1[curY][tmpX] > tmp && tmp < return_dist) {\n\n                      data[curY * xGrid + curX + 1].lock();\n                      if (d1[curY][tmpX] > tmp &&\n                          tmp < return_dist) // right neighbor been put into\n                                             // heap1 but needs update\n                      {\n                        d1[curY][tmpX]       = tmp;\n                        parentX3[curY][tmpX] = curX;\n                        parentY3[curY][tmpX] = curY;\n                        HV[curY][tmpX]       = FALSE;\n                        ctx.push(pq_grid(&(d1[curY][tmpX]), tmp));\n\n                        // printf(\"right push Y: %d X: %d tmp: %f HV: false\n                        // hyperH: %d\\n\", curY, tmpX, tmp, true);\n                      }\n                      data[curY * xGrid + curX + 1].unlock();\n                    }\n                  }\n                  hyperH[curY][curX] = tmpH;\n\n                  if (curY > regionY1) {\n                    grid = (curY - 1) * xGrid + curX;\n\n                    tmpY = curY - 1; // the bottom neighbor\n                    if ((preX == curX) || (curr_d1 == 0)) {\n            
          tmp =\n                          curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    } else {\n                      if (curY < regionY2 - 1) {\n                        tmp_grid = curY * xGrid + curX;\n                        tmp_cost =\n                            d1[curY + 1][curX] +\n                            v_costTable[v_edges[tmp_grid].usage +\n                                        v_edges[tmp_grid].red +\n                                        (int)(L *\n                                              v_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA &&\n                            d1[tmpY][curX] >\n                                tmp_cost +\n                                    v_costTable[v_edges[grid].usage +\n                                                v_edges[grid].red +\n                                                (int)(L * v_edges[grid]\n                                                              .last_usage)]) {\n                          // hyperV[curY][curX] = TRUE;\n                          tmpV = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    }\n\n                    if (d1[tmpY][curX] > tmp && tmp < return_dist) {\n\n                      data[(curY - 1) * xGrid + curX].lock();\n                      if (d1[tmpY][curX] > tmp &&\n                          tmp < return_dist) // bottom neighbor been put into\n                                             // heap1 but needs update\n                      {\n                        d1[tmpY][curX]       = tmp;\n                        
parentX1[tmpY][curX] = curX;\n                        parentY1[tmpY][curX] = curY;\n                        HV[tmpY][curX]       = TRUE;\n                        ctx.push(pq_grid(&(d1[tmpY][curX]), tmp));\n                      }\n                      data[(curY - 1) * xGrid + curX].unlock();\n                    }\n                  }\n                  // top\n                  if (curY < regionY2) {\n\n                    grid = curY * xGrid + curX;\n\n                    tmpY = curY + 1; // the top neighbor\n                    if ((preX == curX) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    } else {\n                      if (curY > regionY1 + 1) {\n                        tmp_grid = (curY - 1) * xGrid + curX;\n                        tmp_cost =\n                            d1[curY - 1][curX] +\n                            v_costTable[v_edges[tmp_grid].usage +\n                                        v_edges[tmp_grid].red +\n                                        (int)(L *\n                                              v_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA &&\n                            d1[tmpY][curX] >\n                                tmp_cost +\n                                    v_costTable[v_edges[grid].usage +\n                                                v_edges[grid].red +\n                                                (int)(L * v_edges[grid]\n                                                              .last_usage)]) {\n                          // hyperV[curY][curX] = TRUE;\n                          tmpV = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          
v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    }\n\n                    if (d1[tmpY][curX] > tmp && tmp < return_dist) {\n\n                      data[(curY + 1) * xGrid + curX].lock();\n                      if (d1[tmpY][curX] > tmp &&\n                          tmp < return_dist) // top neighbor been put into heap1\n                                             // but needs update\n                      {\n                        d1[tmpY][curX]       = tmp;\n                        parentX1[tmpY][curX] = curX;\n                        parentY1[tmpY][curX] = curY;\n                        HV[tmpY][curX]       = TRUE;\n                        ctx.push(pq_grid(&(d1[tmpY][curX]), tmp));\n                      }\n                      data[(curY + 1) * xGrid + curX].unlock();\n                    }\n                  }\n                  hyperV[curY][curX] = tmpV;\n                }\n              },\n              // galois::wl<galois::worklists::ParaMeter<>>(),\n              // galois::wl<PSChunk>(),\n              galois::wl<OBIM>(RequestIndexer)\n              // galois::chunk_size<MAZE_CHUNK_SIZE>()\n              // galois::parallel_break(),\n              // galois::steal(),\n              // galois::loopname(\"fine_grain\")\n          );\n\n          // timer_foreach.stop();\n\n          for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++)\n            pop_heap2[*ii] = FALSE;\n\n          crossX = return_ind1 % xGrid;\n          crossY = return_ind1 / xGrid;\n\n          cnt      = 0;\n          int curX = crossX;\n          int curY = crossY;\n          int tmpX, tmpY;\n          acc_cnt++;\n          acc_dist += return_dist;\n          max_dist =\n              (max_dist >= return_dist.load()) ? 
max_dist : return_dist.load();\n\n          while (d1[curY][curX] != 0) // loop until reach subtree1\n          {\n            hypered = FALSE;\n            if (cnt != 0) {\n              if (curX != tmpX && hyperH[curY][curX]) {\n                curX    = 2 * curX - tmpX;\n                hypered = TRUE;\n              }\n              if (curY != tmpY && hyperV[curY][curX]) {\n                curY    = 2 * curY - tmpY;\n                hypered = TRUE;\n              }\n            }\n            tmpX = curX;\n            tmpY = curY;\n            if (!hypered) {\n              if (HV[tmpY][tmpX]) {\n                curY = parentY1[tmpY][tmpX];\n              } else {\n                curX = parentX3[tmpY][tmpX];\n              }\n            }\n\n            tmp_gridsX[cnt] = curX;\n            tmp_gridsY[cnt] = curY;\n            cnt++;\n          }\n          // reverse the grids on the path\n\n          for (i = 0; i < cnt; i++) {\n            tmpind    = cnt - 1 - i;\n            gridsX[i] = tmp_gridsX[tmpind];\n            gridsY[i] = tmp_gridsY[tmpind];\n          }\n          // add the connection point (crossX, crossY)\n          gridsX[cnt] = crossX;\n          gridsY[cnt] = crossY;\n          cnt++;\n\n          curX     = crossX;\n          curY     = crossY;\n          cnt_n1n2 = cnt;\n\n          // change the tree structure according to the new routing for the tree\n          // edge find E1 and E2, and the endpoints of the edges they are on\n          E1x = gridsX[0];\n          E1y = gridsY[0];\n          E2x = gridsX[cnt_n1n2 - 1];\n          E2y = gridsY[cnt_n1n2 - 1];\n\n          edge_n1n2 = edgeID;\n\n          if (n1 >= deg && (E1x != n1x || E1y != n1y))\n          // n1 is not a pin and E1!=n1, then make change to subtree1,\n          // otherwise, no change to subtree1\n          {\n            // find the endpoints of the edge E1 is on\n            endpt1 = treeedges[corrEdge[E1y][E1x]].n1;\n            endpt2 = 
treeedges[corrEdge[E1y][E1x]].n2;\n\n            // find A1, A2 and edge_n1A1, edge_n1A2\n            if (treenodes[n1].nbr[0] == n2) {\n              A1        = treenodes[n1].nbr[1];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[1];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else if (treenodes[n1].nbr[1] == n2) {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[1];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[1];\n            }\n\n            if (endpt1 == n1 || endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n            {\n              // if E1 is on (n1, A2), switch A1 and A2 so that E1 is always on\n              // (n1, A1)\n              if (endpt1 == A2 || endpt2 == A2) {\n                tmpi      = A1;\n                A1        = A2;\n                A2        = tmpi;\n                tmpi      = edge_n1A1;\n                edge_n1A1 = edge_n1A2;\n                edge_n1A2 = tmpi;\n              }\n\n              // update route for edge (n1, A1), (n1, A2)\n              updateRouteType1(treenodes, n1, A1, A2, E1x, E1y, treeedges,\n                               edge_n1A1, edge_n1A2);\n              // update position for n1\n              treenodes[n1].x = E1x;\n              treenodes[n1].y = E1y;\n            }    // if E1 is on (n1, A1) or (n1, A2)\n            else // E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n            {\n              C1        = endpt1;\n              C2        = endpt2;\n              edge_C1C2 = corrEdge[E1y][E1x];\n\n              // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n              updateRouteType2(treenodes, n1, 
A1, A2, C1, C2, E1x, E1y,\n                               treeedges, edge_n1A1, edge_n1A2, edge_C1C2);\n              // update position for n1\n              treenodes[n1].x = E1x;\n              treenodes[n1].y = E1y;\n              // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2), (C1,\n              // C2)->(A1, A2)\n              edge_n1C1               = edge_n1A1;\n              treeedges[edge_n1C1].n1 = C1;\n              treeedges[edge_n1C1].n2 = n1;\n              edge_n1C2               = edge_n1A2;\n              treeedges[edge_n1C2].n1 = n1;\n              treeedges[edge_n1C2].n2 = C2;\n              edge_A1A2               = edge_C1C2;\n              treeedges[edge_A1A2].n1 = A1;\n              treeedges[edge_A1A2].n2 = A2;\n              // update nbr and edge for 5 nodes n1, A1, A2, C1, C2\n              // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n              treenodes[n1].nbr[0]  = n2;\n              treenodes[n1].edge[0] = edge_n1n2;\n              treenodes[n1].nbr[1]  = C1;\n              treenodes[n1].edge[1] = edge_n1C1;\n              treenodes[n1].nbr[2]  = C2;\n              treenodes[n1].edge[2] = edge_n1C2;\n              // A1's nbr n1->A2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A1].nbr[i] == n1) {\n                  treenodes[A1].nbr[i]  = A2;\n                  treenodes[A1].edge[i] = edge_A1A2;\n                  break;\n                }\n              }\n              // A2's nbr n1->A1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A2].nbr[i] == n1) {\n                  treenodes[A2].nbr[i]  = A1;\n                  treenodes[A2].edge[i] = edge_A1A2;\n                  break;\n                }\n              }\n              // C1's nbr C2->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C1].nbr[i] == C2) {\n                  treenodes[C1].nbr[i]  = n1;\n                  treenodes[C1].edge[i] = edge_n1C1;\n                  break;\n          
      }\n              }\n              // C2's nbr C1->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C2].nbr[i] == C1) {\n                  treenodes[C2].nbr[i]  = n1;\n                  treenodes[C2].edge[i] = edge_n1C2;\n                  break;\n                }\n              }\n\n            } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n          }   // n1 is not a pin and E1!=n1\n\n          // (2) consider subtree2\n\n          if (n2 >= deg && (E2x != n2x || E2y != n2y))\n          // n2 is not a pin and E2!=n2, then make change to subtree2,\n          // otherwise, no change to subtree2\n          {\n            // find the endpoints of the edge E2 is on\n            endpt1 = treeedges[corrEdge[E2y][E2x]].n1;\n            endpt2 = treeedges[corrEdge[E2y][E2x]].n2;\n\n            // find B1, B2\n            if (treenodes[n2].nbr[0] == n1) {\n              B1        = treenodes[n2].nbr[1];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = treenodes[n2].edge[1];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else if (treenodes[n2].nbr[1] == n1) {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[1];\n              edge_n2B1 = treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[1];\n            }\n\n            if (endpt1 == n2 || endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n            {\n              // if E2 is on (n2, B2), switch B1 and B2 so that E2 is always on\n              // (n2, B1)\n              if (endpt1 == B2 || endpt2 == B2) {\n                tmpi      = B1;\n                B1        = B2;\n                B2        = tmpi;\n                tmpi      = edge_n2B1;\n         
       edge_n2B1 = edge_n2B2;\n                edge_n2B2 = tmpi;\n              }\n\n              // update route for edge (n2, B1), (n2, B2)\n              updateRouteType1(treenodes, n2, B1, B2, E2x, E2y, treeedges,\n                               edge_n2B1, edge_n2B2);\n\n              // update position for n2\n              treenodes[n2].x = E2x;\n              treenodes[n2].y = E2y;\n            }    // if E2 is on (n2, B1) or (n2, B2)\n            else // E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n            {\n              D1        = endpt1;\n              D2        = endpt2;\n              edge_D1D2 = corrEdge[E2y][E2x];\n\n              // update route for edge (n2, D1), (n2, D2) and (B1, B2)\n              updateRouteType2(treenodes, n2, B1, B2, D1, D2, E2x, E2y,\n                               treeedges, edge_n2B1, edge_n2B2, edge_D1D2);\n              // update position for n2\n              treenodes[n2].x = E2x;\n              treenodes[n2].y = E2y;\n              // update 3 edges (n2, B1)->(D1, n2), (n2, B2)->(n2, D2), (D1,\n              // D2)->(B1, B2)\n              edge_n2D1               = edge_n2B1;\n              treeedges[edge_n2D1].n1 = D1;\n              treeedges[edge_n2D1].n2 = n2;\n              edge_n2D2               = edge_n2B2;\n              treeedges[edge_n2D2].n1 = n2;\n              treeedges[edge_n2D2].n2 = D2;\n              edge_B1B2               = edge_D1D2;\n              treeedges[edge_B1B2].n1 = B1;\n              treeedges[edge_B1B2].n2 = B2;\n              // update nbr and edge for 5 nodes n2, B1, B2, D1, D2\n              // n2's nbr (n1, B1, B2)->(n1, D1, D2)\n              treenodes[n2].nbr[0]  = n1;\n              treenodes[n2].edge[0] = edge_n1n2;\n              treenodes[n2].nbr[1]  = D1;\n              treenodes[n2].edge[1] = edge_n2D1;\n              treenodes[n2].nbr[2]  = D2;\n              treenodes[n2].edge[2] = edge_n2D2;\n              // B1's nbr n2->B2\n              for (i = 0; i < 3; 
i++) {\n                if (treenodes[B1].nbr[i] == n2) {\n                  treenodes[B1].nbr[i]  = B2;\n                  treenodes[B1].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // B2's nbr n2->B1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[B2].nbr[i] == n2) {\n                  treenodes[B2].nbr[i]  = B1;\n                  treenodes[B2].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // D1's nbr D2->n2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[D1].nbr[i] == D2) {\n                  treenodes[D1].nbr[i]  = n2;\n                  treenodes[D1].edge[i] = edge_n2D1;\n                  break;\n                }\n              }\n              // D2's nbr D1->n2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[D2].nbr[i] == D1) {\n                  treenodes[D2].nbr[i]  = n2;\n                  treenodes[D2].edge[i] = edge_n2D2;\n                  break;\n                }\n              }\n            } // else E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n          }   // n2 is not a pin and E2!=n2\n\n          // update route for edge (n1, n2) and edge usage\n\n          // printf(\"update route? 
%d %d\\n\", netID, num_edges);\n          if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n            free(treeedges[edge_n1n2].route.gridsX);\n            free(treeedges[edge_n1n2].route.gridsY);\n          }\n          treeedges[edge_n1n2].route.gridsX =\n              (short*)calloc(cnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.gridsY =\n              (short*)calloc(cnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.type     = MAZEROUTE;\n          treeedges[edge_n1n2].route.routelen = cnt_n1n2 - 1;\n          treeedges[edge_n1n2].len = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n          treeedges[edge_n1n2].n_ripups += 1;\n          total_ripups += 1;\n          max_ripups.update(treeedges[edge_n1n2].n_ripups);\n\n          for (i = 0; i < cnt_n1n2; i++) {\n            // printf(\"cnt_n1n2: %d\\n\", cnt_n1n2);\n            treeedges[edge_n1n2].route.gridsX[i] = gridsX[i];\n            treeedges[edge_n1n2].route.gridsY[i] = gridsY[i];\n          }\n          // std::cout << \" adjsut tree\" << std::endl;\n          // timer_adjusttree.stop();\n\n          // update edge usage\n\n          // timer_updateusage.start();\n          acc_length += cnt_n1n2;\n          max_length = (max_length >= cnt_n1n2) ? 
max_length : cnt_n1n2;\n          for (i = 0; i < cnt_n1n2 - 1; i++) {\n            if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n            {\n              min_y = min(gridsY[i], gridsY[i + 1]);\n              // v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n              // galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage, (short\n              // unsigned)1);\n              v_edges[min_y * xGrid + gridsX[i]].usage.fetch_add(\n                  (short int)1, std::memory_order_relaxed);\n            } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n            {\n              min_x = min(gridsX[i], gridsX[i + 1]);\n              // h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n              // galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n              // (short unsigned)1);\n              h_edges[gridsY[i] * (xGrid - 1) + min_x].usage.fetch_add(\n                  (short int)1, std::memory_order_relaxed);\n            }\n          }\n          // timer_updateusage.stop();\n          // timer_checkroute2dtree.start();\n          if (checkRoute2DTree(netID)) {\n            reInitTree(netID);\n            return;\n          }\n          // timer_checkroute2dtree.stop();\n        } // congested route, if(enter)\n        timer_finegrain.stop();\n      } // only route the non-degraded edges (len>0)\n    }   // iterate on edges of a net\n  }\n\n  printf(\"total ripups: %d max ripups: %d\\n\", total_ripups.reduce(),\n         max_ripups.reduce());\n  if (acc_cnt != 0) {\n    cout << \" max_dist \" << max_dist << \" max_length \" << max_length << endl;\n    cout << \" avg dist \" << acc_dist / acc_cnt << \" avg length \"\n         << (float)acc_length / acc_cnt << \" acc_cnt: \" << acc_cnt << endl;\n    round_avg_dist   = acc_dist / acc_cnt;\n    round_avg_length = acc_length / acc_cnt;\n  }\n  free(h_costTable);\n  free(v_costTable);\n}\n\nvoid mazeRouteMSMD_finegrain_doall(int iter, int expand, float costHeight,\n                   
                int ripup_threshold, int mazeedge_Threshold,\n                                   Bool Ordering, int cost_type) {\n  // LOCK = 0;\n  galois::StatTimer timer_finegrain(\"fine grain function\", \"fine grain maze\");\n\n  float forange;\n  // allocate memory for distance and parent and pop_heap\n  h_costTable = (float*)calloc(40 * hCapacity, sizeof(float));\n  v_costTable = (float*)calloc(40 * vCapacity, sizeof(float));\n\n  forange = 40 * hCapacity;\n\n  if (cost_type == 2) {\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity - 1)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity - 1)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - vCapacity);\n    }\n  } else {\n\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - vCapacity);\n    }\n 
 }\n\n  /*forange = yGrid*xGrid;\n  for(i=0; i<forange; i++)\n  {\n      pop_heap2[i] = FALSE;\n  } //Michael*/\n\n  galois::LargeArray<galois::substrate::SimpleLock> data;\n  data.allocateInterleaved(xGrid * yGrid);\n\n  if (Ordering) {\n    StNetOrder();\n    // printf(\"order?\\n\");\n  }\n\n  THREAD_LOCAL_STORAGE thread_local_storage{};\n  // for(nidRPC=0; nidRPC<numValidNets; nidRPC++)//parallelize\n  PerThread_PQ perthread_pq;\n  PerThread_Vec perthread_vec;\n  PRINT = 0;\n  galois::GAccumulator<int> total_ripups;\n  galois::GReduceMax<int> max_ripups;\n  total_ripups.reset();\n  max_ripups.reset();\n\n  galois::StatTimer timer_newripupcheck(\"ripup\", \"fine grain maze\");\n  galois::StatTimer timer_setupheap(\"setup heap\", \"fine grain maze\");\n  galois::StatTimer timer_traceback(\"trace back\", \"fine grain maze\");\n  galois::StatTimer timer_adjusttree(\"adjust tree\", \"fine grain maze\");\n  galois::StatTimer timer_updateusage(\"update usage\", \"fine grain maze\");\n  galois::StatTimer timer_checkroute2dtree(\"checkroute2dtree\",\n                                           \"fine grain maze\");\n  galois::StatTimer timer_init(\"init\", \"fine grain maze\");\n  galois::StatTimer timer_foreach(\"foreach\", \"fine grain maze\");\n  for (int nidRPC = 0; nidRPC < numValidNets; nidRPC++) {\n\n    int netID;\n\n    // maze routing for multi-source, multi-destination\n    Bool hypered, enter;\n    int i, j, deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin, xmax,\n        crossX, crossY, tmpi, min_x, min_y, num_edges;\n    int regionX1, regionX2, regionY1, regionY2;\n    int tmpind, gridsX[XRANGE], gridsY[YRANGE], tmp_gridsX[XRANGE],\n        tmp_gridsY[YRANGE];\n    int endpt1, endpt2, A1, A2, B1, B2, C1, C2, D1, D2, cnt, cnt_n1n2;\n    int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n        edge_C1C2;\n    int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2;\n    int E1x, E1y, E2x, E2y;\n    int origENG, 
edgeREC;\n\n    TreeEdge *treeedges, *treeedge;\n    TreeNode* treenodes;\n\n    bool* pop_heap2 = thread_local_storage.pop_heap2;\n\n    float** d1    = thread_local_storage.d1_p;\n    bool** HV     = thread_local_storage.HV_p;\n    bool** hyperV = thread_local_storage.hyperV_p;\n    bool** hyperH = thread_local_storage.hyperH_p;\n\n    short** parentX1 = thread_local_storage.parentX1_p;\n    short** parentX3 = thread_local_storage.parentX3_p;\n    short** parentY1 = thread_local_storage.parentY1_p;\n    short** parentY3 = thread_local_storage.parentY3_p;\n\n    int** corrEdge = thread_local_storage.corrEdge_p;\n\n    OrderNetEdge* netEO = thread_local_storage.netEO_p;\n\n    bool** inRegion = thread_local_storage.inRegion_p;\n\n    local_pq pq1 = perthread_pq.get();\n    local_vec v2 = perthread_vec.get();\n\n    /*for(i=0; i<yGrid*xGrid; i++)\n    {\n        pop_heap2[i] = FALSE;\n    } */\n\n    // memset(inRegion_alloc, 0, xGrid * yGrid * sizeof(bool));\n    /*for(int i=0; i<yGrid; i++)\n    {\n        for(int j=0; j<xGrid; j++)\n            inRegion[i][j] = FALSE;\n    }*/\n    // printf(\"hyperV[153][134]: %d %d %d\\n\", hyperV[153][134],\n    // parentY1[153][134], parentX3[153][134]); printf(\"what is happening?\\n\");\n\n    if (Ordering) {\n      netID = treeOrderCong[nidRPC].treeIndex;\n    } else {\n      netID = nidRPC;\n    }\n\n    deg = sttrees[netID].deg;\n\n    origENG = expand;\n\n    netedgeOrderDec(netID, netEO);\n\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    // loop for all the tree edges (2*deg-3)\n    num_edges = 2 * deg - 3;\n\n    for (edgeREC = 0; edgeREC < num_edges; edgeREC++) {\n\n      edgeID   = netEO[edgeREC].edgeID;\n      treeedge = &(treeedges[edgeID]);\n\n      n1            = treeedge->n1;\n      n2            = treeedge->n2;\n      n1x           = treenodes[n1].x;\n      n1y           = treenodes[n1].y;\n      n2x           = treenodes[n2].x;\n      n2y           = treenodes[n2].y;\n      
treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n      if (treeedge->len >\n          mazeedge_Threshold) // only route the non-degraded edges (len>0)\n      {\n        timer_newripupcheck.start();\n        enter = newRipupCheck(treeedge, ripup_threshold, netID, edgeID);\n        timer_newripupcheck.stop();\n\n        // ripup the routing for the edge\n        timer_finegrain.start();\n        if (enter) {\n          // if(netID == 2 && edgeID == 26)\n          //    printf(\"netID %d edgeID %d src %d %d dst %d %d\\n\", netID,\n          //    edgeID, n1x, n1y, n2x, n2y);\n          // pre_length = treeedge->route.routelen;\n          /*for(int i = 0; i < pre_length; i++)\n          {\n              pre_gridsY[i] = treeedge->route.gridsY[i];\n              pre_gridsX[i] = treeedge->route.gridsX[i];\n              //printf(\"i %d x %d y %d\\n\", i, pre_gridsX[i], pre_gridsY[i]);\n          }*/\n          timer_init.start();\n          if (n1y <= n2y) {\n            ymin = n1y;\n            ymax = n2y;\n          } else {\n            ymin = n2y;\n            ymax = n1y;\n          }\n\n          if (n1x <= n2x) {\n            xmin = n1x;\n            xmax = n2x;\n          } else {\n            xmin = n2x;\n            xmax = n1x;\n          }\n\n          int enlarge = min(\n              origENG, (iter / 6 + 3) *\n                           treeedge->route\n                               .routelen); // michael, this was global variable\n          regionX1 = max(0, xmin - enlarge);\n          regionX2 = min(xGrid - 1, xmax + enlarge);\n          regionY1 = max(0, ymin - enlarge);\n          regionY2 = min(yGrid - 1, ymax + enlarge);\n          // std::cout << \"region size\" << regionWidth << \", \" << regionHeight\n          // << std::endl;\n          // initialize d1[][] and d2[][] as BIG_INT\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              d1[i][j] = BIG_INT;\n\n              
/*d2[i][j] = BIG_INT;\n              hyperH[i][j] = FALSE;\n              hyperV[i][j] = FALSE;*/\n            }\n          }\n          // memset(hyperH, 0, xGrid * yGrid * sizeof(bool));\n          // memset(hyperV, 0, xGrid * yGrid * sizeof(bool));\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              HV[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              hyperH[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              hyperV[i][j] = FALSE;\n            }\n          }\n          // TODO: use seperate loops\n\n          // setup heap1, heap2 and initialize d1[][] and d2[][] for all the\n          // grids on the two subtrees\n          timer_setupheap.start();\n          setupHeap(netID, edgeID, pq1, v2, regionX1, regionX2, regionY1,\n                    regionY2, d1, corrEdge, inRegion);\n          timer_setupheap.stop();\n          // TODO: use std priority queue\n          // while loop to find shortest path\n          /*ind1 = (pq1.top().d1_p - &d1[0][0]);\n          curX = ind1%xGrid;\n          curY = ind1/xGrid;\n          printf(\"src size: %d dst size: %d\\n\", pq1.size(), v2.size());*/\n          for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++) {\n            pop_heap2[*ii] = TRUE;\n          }\n          std::atomic<int> return_ind1;\n          std::atomic<float> return_dist;\n          return_dist = (float)BIG_INT;\n\n          timer_init.stop();\n          timer_foreach.start();\n\n          galois::InsertBag<pq_grid> wls[2];\n          galois::InsertBag<pq_grid>* next;\n          galois::InsertBag<pq_grid>* cur;\n\n          cur  = &wls[0];\n          next = &wls[1];\n          std::atomic<int> bagsize;\n          bagsize = 0;\n\n          while 
(!pq1.empty()) {\n            auto tmp = pq1.top();\n            pq1.pop();\n            cur->push(tmp);\n            // bagsize++;\n          }\n\n          while (!cur->empty()) {\n            // std::cout << \"bag size: \" << bagsize.load() << \" unique items: \"\n            // << dynBS.count() << std::endl; bagsize = 0; dynBS.reset();\n            // galois::for_each(galois::iterate(*cur),\n            //[&] (const auto& top, auto& ctx)\n            galois::do_all(\n                galois::iterate(*cur),\n                [&](const auto& top)\n\n                /*galois::for_each(galois::iterate(pq1),\n                [&] (const auto& top, auto& ctx)*/\n                // while( pop_heap2[ind1]==FALSE) // stop until the grid\n                // position been popped out from both heap1 and heap2\n                {\n                  // relax all the adjacent grids within the enlarged region for\n                  // source subtree\n\n                  int ind1 = top.d1_p - &d1[0][0];\n\n                  //\n                  int curX = ind1 % xGrid;\n                  int curY = ind1 / xGrid;\n                  int grid = curY * xGrid + curX;\n                  // std::cout << \" d1: \" << d1[curY][curX] << std::endl;\n                  float d1_push = top.d1_push; // d1[curY][curX];\n                  float curr_d1 = d1[curY][curX];\n                  // std::cout << \"d1_push: \" << d1_push << \" d1: \" <<\n                  // d1[curY][curX] << std::endl; if(netID == 2 && edgeID == 26)\n                  // printf(\"netID: %d edgeID:%d curX curY %d %d, d1_push: %f,\n                  // curr_d1: %f\\n\", netID, edgeID, curX, curY, d1_push,\n                  // curr_d1);\n\n                  if (d1_push > return_dist + OBIM_delta) {\n                    // printf(\"netID: %d early break\\n\", netID);\n                    // if(netID == 2 && edgeID == 26)\n                    //    printf(\"break! 
curX curY %d %d, d1_push: %f, curr_d1:\n                    //    %f return_d: %f\\n\", curX, curY, d1_push, curr_d1,\n                    //    return_dist.load());\n                    // ctx.breakLoop();\n                  }\n                  // galois::runtime::acquire(&data[grid],\n                  // galois::MethodFlag::WRITE);\n                  if (d1_push == curr_d1 && d1_push < return_dist.load()) {\n                    if (pop_heap2[ind1] != false) {\n                      // if(netID == 2 && edgeID == 26)\n                      //    printf(\"reach! curX curY %d %d, d1_push: %f,\n                      //    curr_d1: %f return_d: %f\\n\", curX, curY, d1_push,\n                      //    curr_d1, return_dist.load());\n                      return_ind1.store(ind1);\n                      return_dist.store(d1_push);\n                    }\n\n                    /*grid = curY*xGrid + curX - 1;\n                    if(curX>regionX1)\n                        galois::runtime::acquire(&data[grid],\n                    galois::MethodFlag::WRITE);\n\n                    grid = curY*xGrid + curX + 1;\n                    if(curX<regionX2)\n                        galois::runtime::acquire(&data[grid],\n                    galois::MethodFlag::WRITE);\n\n                    grid = (curY - 1)*xGrid + curX;\n                    if(curY>regionY1)\n                        galois::runtime::acquire(&data[grid],\n                    galois::MethodFlag::WRITE);\n\n                    grid = (curY + 1)*xGrid + curX;\n                    if(curY<regionY2)\n                        galois::runtime::acquire(&data[grid],\n                    galois::MethodFlag::WRITE);*/\n\n                    int preX = curX, preY = curY;\n                    if (curr_d1 != 0) {\n                      if (HV[curY][curX]) {\n                        preX = parentX1[curY][curX];\n                        preY = parentY1[curY][curX];\n                      } else {\n                        preX = 
parentX3[curY][curX];\n                        preY = parentY3[curY][curX];\n                      }\n                    }\n                    // printf(\"pop curY: %d curX: %d d1: %f preX: %d preY: %d\n                    // hyperH: %d hyperV: %d HV: %d return_dist: %f\\n\",\n                    //    curY, curX, curr_d1, preX, preY, hyperH[curY][curX],\n                    //    hyperV[curY][curX], HV[curY][curX],\n                    //    return_dist.load());\n                    float tmp = 0.f, tmp_cost = 0.f;\n                    int tmp_grid = 0;\n                    int tmpX = 0, tmpY = 0;\n                    // left\n                    bool tmpH = false;\n                    bool tmpV = false;\n\n                    // if(curX>regionX1)\n                    //    data[curY*xGrid+curX-1].lock();\n\n                    // data[curY*(xGrid-1)+curX].lock();\n\n                    if (curX > regionX1) {\n                      grid = curY * (xGrid - 1) + curX - 1;\n\n                      // printf(\"grid: %d %d usage: %d red:%d last:%d sum%f\n                      // %d\\n\",\n                      //    grid%xGrid, grid/xGrid, h_edges[grid].usage.load(),\n                      //    h_edges[grid].red, h_edges[grid].last_usage, L ,\n                      //    h_edges[grid].usage.load() + h_edges[grid].red +\n                      //    (int)(L*h_edges[grid].last_usage));\n                      if ((preY == curY) || (curr_d1 == 0)) {\n                        tmp = curr_d1 +\n                              h_costTable[h_edges[grid].usage +\n                                          h_edges[grid].red +\n                                          (int)(L * h_edges[grid].last_usage)];\n                      } else {\n                        if (curX < regionX2 - 1) {\n                          tmp_grid = curY * (xGrid - 1) + curX;\n                          tmp_cost =\n                              d1[curY][curX + 1] +\n                              
h_costTable[h_edges[tmp_grid].usage +\n                                          h_edges[tmp_grid].red +\n                                          (int)(L *\n                                                h_edges[tmp_grid].last_usage)];\n\n                          if (tmp_cost < curr_d1 + VIA) {\n                            // hyperH[curY][curX] = TRUE; //Michael\n                            tmpH = true;\n                          }\n                        }\n                        tmp = curr_d1 + VIA +\n                              h_costTable[h_edges[grid].usage +\n                                          h_edges[grid].red +\n                                          (int)(L * h_edges[grid].last_usage)];\n                      }\n                      tmpX = curX - 1; // the left neighbor\n                      if (d1[curY][tmpX] > tmp && tmp < return_dist) {\n                        data[curY * xGrid + curX - 1].lock();\n                        if (d1[curY][tmpX] > tmp &&\n                            tmp < return_dist) // left neighbor been put into\n                                               // heap1 but needs update\n                        {\n                          d1[curY][tmpX]       = tmp;\n                          parentX3[curY][tmpX] = curX;\n                          parentY3[curY][tmpX] = curY;\n                          HV[curY][tmpX]       = FALSE;\n                          // pq1.push({&(d1[curY][tmpX]), tmp});\n                          // pq_grid grid_push = {&(d1[curY][tmpX]), tmp};\n\n                          // next->push(&(d1[curY][tmpX]));\n                          next->push(pq_grid(&(d1[curY][tmpX]), tmp));\n                          // bagsize.fetch_add(1, std::memory_order_relaxed);\n\n                          // printf(\"left push Y: %d X: %d tmp: %f HV: false\n                          // hyperH: %d\\n\", curY, tmpX, tmp, true);\n                        }\n                        data[curY * xGrid + curX - 
1].unlock();\n                      }\n                    }\n                    // right\n\n                    if (curX < regionX2) {\n                      // data[curY*xGrid+curX+1].lock();\n                      grid = curY * (xGrid - 1) + curX;\n                      // printf(\"grid: %d %d usage: %d red:%d last:%d L:%f\n                      // sum:%d\\n\",grid%xGrid, grid/xGrid,\n                      // h_edges[grid].usage.load(), h_edges[grid].red,\n                      // h_edges[grid].last_usage, L ,\n                      // h_edges[grid].usage.load()\n                      // + h_edges[grid].red +\n                      // (int)(L*h_edges[grid].last_usage));\n                      if ((preY == curY) || (curr_d1 == 0)) {\n                        tmp = curr_d1 +\n                              h_costTable[h_edges[grid].usage +\n                                          h_edges[grid].red +\n                                          (int)(L * h_edges[grid].last_usage)];\n                      } else {\n                        if (curX > regionX1 + 1) {\n                          tmp_grid = curY * (xGrid - 1) + curX - 1;\n                          tmp_cost =\n                              d1[curY][curX - 1] +\n                              h_costTable[h_edges[tmp_grid].usage +\n                                          h_edges[tmp_grid].red +\n                                          (int)(L *\n                                                h_edges[tmp_grid].last_usage)];\n\n                          if (tmp_cost < curr_d1 + VIA) {\n                            // hyperH[curY][curX] = TRUE;\n                            tmpH = true;\n                          }\n                        }\n                        tmp = curr_d1 + VIA +\n                              h_costTable[h_edges[grid].usage +\n                                          h_edges[grid].red +\n                                          (int)(L * h_edges[grid].last_usage)];\n                
      }\n                      tmpX = curX + 1; // the right neighbor\n\n                      if (d1[curY][tmpX] > tmp && tmp < return_dist) {\n                        data[curY * xGrid + curX + 1].lock();\n                        if (d1[curY][tmpX] > tmp &&\n                            tmp < return_dist) // right neighbor been put into\n                                               // heap1 but needs update\n                        {\n                          d1[curY][tmpX]       = tmp;\n                          parentX3[curY][tmpX] = curX;\n                          parentY3[curY][tmpX] = curY;\n                          HV[curY][tmpX]       = FALSE;\n\n                          // next->push(&(d1[curY][tmpX]));\n                          next->push(pq_grid(&(d1[curY][tmpX]), tmp));\n                          // bagsize.fetch_add(1, std::memory_order_relaxed);\n\n                          // printf(\"right push Y: %d X: %d tmp: %f HV: false\n                          // hyperH: %d\\n\", curY, tmpX, tmp, true);\n                        }\n                        data[curY * xGrid + curX + 1].unlock();\n                      }\n                    }\n                    // data[curY*(xGrid-1)+curX].lock();\n                    hyperH[curY][curX] = tmpH;\n\n                    // bottom\n\n                    if (curY > regionY1) {\n                      grid = (curY - 1) * xGrid + curX;\n                      // printf(\"grid: %d %d usage: %d red:%d last:%d sum%f\n                      // %d\\n\",\n                      //    grid%xGrid, grid/xGrid, v_edges[grid].usage.load(),\n                      //    v_edges[grid].red, v_edges[grid].last_usage, L ,\n                      //    v_edges[grid].usage.load() + v_edges[grid].red +\n                      //    (int)(L*v_edges[grid].last_usage));\n                      if ((preX == curX) || (curr_d1 == 0)) {\n                        tmp = curr_d1 +\n                              v_costTable[v_edges[grid].usage +\n 
                                         v_edges[grid].red +\n                                          (int)(L * v_edges[grid].last_usage)];\n                      } else {\n                        if (curY < regionY2 - 1) {\n                          tmp_grid = curY * xGrid + curX;\n                          tmp_cost =\n                              d1[curY + 1][curX] +\n                              v_costTable[v_edges[tmp_grid].usage +\n                                          v_edges[tmp_grid].red +\n                                          (int)(L *\n                                                v_edges[tmp_grid].last_usage)];\n\n                          if (tmp_cost < curr_d1 + VIA) {\n                            // hyperV[curY][curX] = TRUE;\n                            tmpV = true;\n                          }\n                        }\n                        tmp = curr_d1 + VIA +\n                              v_costTable[v_edges[grid].usage +\n                                          v_edges[grid].red +\n                                          (int)(L * v_edges[grid].last_usage)];\n                      }\n                      tmpY = curY - 1; // the bottom neighbor\n\n                      if (d1[tmpY][curX] > tmp && tmp < return_dist) {\n                        data[(curY - 1) * xGrid + curX].lock();\n                        if (d1[tmpY][curX] > tmp &&\n                            tmp < return_dist) // bottom neighbor been put into\n                                               // heap1 but needs update\n                        {\n                          d1[tmpY][curX]       = tmp;\n                          parentX1[tmpY][curX] = curX;\n                          parentY1[tmpY][curX] = curY;\n                          HV[tmpY][curX]       = TRUE;\n\n                          // next->push(&(d1[tmpY][curX]));\n                          next->push(pq_grid(&(d1[tmpY][curX]), tmp));\n                          // bagsize.fetch_add(1, 
std::memory_order_relaxed);\n                        }\n                        data[(curY - 1) * xGrid + curX].unlock();\n                      }\n                    }\n                    // top\n                    if (curY < regionY2) {\n\n                      grid = curY * xGrid + curX;\n                      // printf(\"grid: %d %d usage: %d red:%d last:%d sum%f\n                      // %d\\n\",\n                      //    grid%xGrid, grid/xGrid, v_edges[grid].usage.load(),\n                      //    v_edges[grid].red, v_edges[grid].last_usage, L ,\n                      //    v_edges[grid].usage.load() + v_edges[grid].red +\n                      //    (int)(L*v_edges[grid].last_usage));\n                      if ((preX == curX) || (curr_d1 == 0)) {\n                        tmp = curr_d1 +\n                              v_costTable[v_edges[grid].usage +\n                                          v_edges[grid].red +\n                                          (int)(L * v_edges[grid].last_usage)];\n                      } else {\n                        if (curY > regionY1 + 1) {\n                          tmp_grid = (curY - 1) * xGrid + curX;\n                          tmp_cost =\n                              d1[curY - 1][curX] +\n                              v_costTable[v_edges[tmp_grid].usage +\n                                          v_edges[tmp_grid].red +\n                                          (int)(L *\n                                                v_edges[tmp_grid].last_usage)];\n\n                          if (tmp_cost < curr_d1 + VIA) {\n                            // hyperV[curY][curX] = TRUE;\n                            tmpV = true;\n                          }\n                        }\n                        tmp = curr_d1 + VIA +\n                              v_costTable[v_edges[grid].usage +\n                                          v_edges[grid].red +\n                                          (int)(L * 
v_edges[grid].last_usage)];\n                      }\n                      tmpY = curY + 1; // the top neighbor\n\n                      if (d1[tmpY][curX] > tmp && tmp < return_dist) {\n                        data[(curY + 1) * xGrid + curX].lock();\n                        if (d1[tmpY][curX] > tmp &&\n                            tmp < return_dist) // top neighbor been put into\n                                               // heap1 but needs update\n                        {\n\n                          d1[tmpY][curX]       = tmp;\n                          parentX1[tmpY][curX] = curX;\n                          parentY1[tmpY][curX] = curY;\n                          HV[tmpY][curX]       = TRUE;\n\n                          // next->push(&(d1[tmpY][curX]));\n                          next->push(pq_grid(&(d1[tmpY][curX]), tmp));\n                          // bagsize.fetch_add(1, std::memory_order_relaxed);\n                        }\n                        data[(curY + 1) * xGrid + curX].unlock();\n                      }\n                    }\n                    hyperV[curY][curX] = tmpV;\n                  }\n                },\n                // galois::wl<galois::worklists::ParaMeter<>>(),\n                // galois::wl<PSChunk>(),\n                // galois::wl<OBIM>(RequestIndexer),\n                galois::chunk_size<4>(),\n                // galois::parallel_break(),\n                // galois::steal(),\n                galois::loopname(\"fine_grain\"));\n            std::swap(cur, next);\n            next->clear();\n\n          } // do all while curr is not empty\n\n          timer_foreach.stop();\n\n          for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++)\n            pop_heap2[*ii] = FALSE;\n\n          crossX = return_ind1 % xGrid;\n          crossY = return_ind1 / xGrid;\n\n          cnt      = 0;\n          int curX = crossX;\n          int curY = crossY;\n          int tmpX, tmpY;\n          // if(netID == 2 && edgeID == 26)\n 
         //    printf(\"crossX %d crossY %d return_d: %f\\n\", crossX, crossY,\n          //    return_dist.load());\n          timer_traceback.start();\n          while (d1[curY][curX] != 0) // loop until reach subtree1\n          {\n            // if(cnt > 1000)\n            //    printf(\"Y: %d X: %d hyperH: %d hyperV: %d HV: %d d1: %f\\n\",\n            //    curY, curX, hyperH[curY][curX], hyperV[curY][curX],\n            //    HV[curY][curX], d1[curY][curX]);\n\n            hypered = FALSE;\n            if (cnt != 0) {\n              if (curX != tmpX && hyperH[curY][curX]) {\n                curX    = 2 * curX - tmpX;\n                hypered = TRUE;\n              }\n              // printf(\"hyperV[153][134]: %d\\n\", hyperV[curY][curX]);\n              if (curY != tmpY && hyperV[curY][curX]) {\n                curY    = 2 * curY - tmpY;\n                hypered = TRUE;\n              }\n            }\n            tmpX = curX;\n            tmpY = curY;\n            if (!hypered) {\n              if (HV[tmpY][tmpX]) {\n                curY = parentY1[tmpY][tmpX];\n              } else {\n                curX = parentX3[tmpY][tmpX];\n              }\n            }\n\n            tmp_gridsX[cnt] = curX;\n            tmp_gridsY[cnt] = curY;\n            cnt++;\n          }\n          // reverse the grids on the path\n\n          for (i = 0; i < cnt; i++) {\n            tmpind    = cnt - 1 - i;\n            gridsX[i] = tmp_gridsX[tmpind];\n            gridsY[i] = tmp_gridsY[tmpind];\n          }\n          // add the connection point (crossX, crossY)\n          gridsX[cnt] = crossX;\n          gridsY[cnt] = crossY;\n          cnt++;\n\n          curX     = crossX;\n          curY     = crossY;\n          cnt_n1n2 = cnt;\n\n          // change the tree structure according to the new routing for the tree\n          // edge find E1 and E2, and the endpoints of the edges they are on\n          E1x = gridsX[0];\n          E1y = gridsY[0];\n          E2x = 
gridsX[cnt_n1n2 - 1];\n          E2y = gridsY[cnt_n1n2 - 1];\n\n          edge_n1n2 = edgeID;\n\n          timer_traceback.stop();\n\n          // if(netID == 14628)\n          //    printf(\"netID %d edgeID %d src %d %d dst %d %d routelen: %d\\n\",\n          //    netID, edgeID, E1x, E1y, E2x, E2y, cnt_n1n2);\n          // (1) consider subtree1\n          timer_adjusttree.start();\n          if (n1 >= deg && (E1x != n1x || E1y != n1y))\n          // n1 is not a pin and E1!=n1, then make change to subtree1,\n          // otherwise, no change to subtree1\n          {\n            // find the endpoints of the edge E1 is on\n            endpt1 = treeedges[corrEdge[E1y][E1x]].n1;\n            endpt2 = treeedges[corrEdge[E1y][E1x]].n2;\n\n            // find A1, A2 and edge_n1A1, edge_n1A2\n            if (treenodes[n1].nbr[0] == n2) {\n              A1        = treenodes[n1].nbr[1];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[1];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else if (treenodes[n1].nbr[1] == n2) {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[1];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[1];\n            }\n\n            if (endpt1 == n1 || endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n            {\n              // if E1 is on (n1, A2), switch A1 and A2 so that E1 is always on\n              // (n1, A1)\n              if (endpt1 == A2 || endpt2 == A2) {\n                tmpi      = A1;\n                A1        = A2;\n                A2        = tmpi;\n                tmpi      = edge_n1A1;\n                edge_n1A1 = edge_n1A2;\n                edge_n1A2 = 
tmpi;\n              }\n\n              // update route for edge (n1, A1), (n1, A2)\n              updateRouteType1(treenodes, n1, A1, A2, E1x, E1y, treeedges,\n                               edge_n1A1, edge_n1A2);\n              // update position for n1\n              treenodes[n1].x = E1x;\n              treenodes[n1].y = E1y;\n            }    // if E1 is on (n1, A1) or (n1, A2)\n            else // E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n            {\n              C1        = endpt1;\n              C2        = endpt2;\n              edge_C1C2 = corrEdge[E1y][E1x];\n\n              // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n              updateRouteType2(treenodes, n1, A1, A2, C1, C2, E1x, E1y,\n                               treeedges, edge_n1A1, edge_n1A2, edge_C1C2);\n              // update position for n1\n              treenodes[n1].x = E1x;\n              treenodes[n1].y = E1y;\n              // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2), (C1,\n              // C2)->(A1, A2)\n              edge_n1C1               = edge_n1A1;\n              treeedges[edge_n1C1].n1 = C1;\n              treeedges[edge_n1C1].n2 = n1;\n              edge_n1C2               = edge_n1A2;\n              treeedges[edge_n1C2].n1 = n1;\n              treeedges[edge_n1C2].n2 = C2;\n              edge_A1A2               = edge_C1C2;\n              treeedges[edge_A1A2].n1 = A1;\n              treeedges[edge_A1A2].n2 = A2;\n              // update nbr and edge for 5 nodes n1, A1, A2, C1, C2\n              // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n              treenodes[n1].nbr[0]  = n2;\n              treenodes[n1].edge[0] = edge_n1n2;\n              treenodes[n1].nbr[1]  = C1;\n              treenodes[n1].edge[1] = edge_n1C1;\n              treenodes[n1].nbr[2]  = C2;\n              treenodes[n1].edge[2] = edge_n1C2;\n              // A1's nbr n1->A2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A1].nbr[i] == n1) {\n    
              treenodes[A1].nbr[i]  = A2;\n                  treenodes[A1].edge[i] = edge_A1A2;\n                  break;\n                }\n              }\n              // A2's nbr n1->A1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A2].nbr[i] == n1) {\n                  treenodes[A2].nbr[i]  = A1;\n                  treenodes[A2].edge[i] = edge_A1A2;\n                  break;\n                }\n              }\n              // C1's nbr C2->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C1].nbr[i] == C2) {\n                  treenodes[C1].nbr[i]  = n1;\n                  treenodes[C1].edge[i] = edge_n1C1;\n                  break;\n                }\n              }\n              // C2's nbr C1->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C2].nbr[i] == C1) {\n                  treenodes[C2].nbr[i]  = n1;\n                  treenodes[C2].edge[i] = edge_n1C2;\n                  break;\n                }\n              }\n\n            } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n          }   // n1 is not a pin and E1!=n1\n\n          // (2) consider subtree2\n\n          if (n2 >= deg && (E2x != n2x || E2y != n2y))\n          // n2 is not a pin and E2!=n2, then make change to subtree2,\n          // otherwise, no change to subtree2\n          {\n            // find the endpoints of the edge E1 is on\n            endpt1 = treeedges[corrEdge[E2y][E2x]].n1;\n            endpt2 = treeedges[corrEdge[E2y][E2x]].n2;\n\n            // find B1, B2\n            if (treenodes[n2].nbr[0] == n1) {\n              B1        = treenodes[n2].nbr[1];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = treenodes[n2].edge[1];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else if (treenodes[n2].nbr[1] == n1) {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = 
treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[1];\n              edge_n2B1 = treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[1];\n            }\n\n            if (endpt1 == n2 || endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n            {\n              // if E2 is on (n2, B2), switch B1 and B2 so that E2 is always on\n              // (n2, B1)\n              if (endpt1 == B2 || endpt2 == B2) {\n                tmpi      = B1;\n                B1        = B2;\n                B2        = tmpi;\n                tmpi      = edge_n2B1;\n                edge_n2B1 = edge_n2B2;\n                edge_n2B2 = tmpi;\n              }\n\n              // update route for edge (n2, B1), (n2, B2)\n              updateRouteType1(treenodes, n2, B1, B2, E2x, E2y, treeedges,\n                               edge_n2B1, edge_n2B2);\n\n              // update position for n2\n              treenodes[n2].x = E2x;\n              treenodes[n2].y = E2y;\n            }    // if E2 is on (n2, B1) or (n2, B2)\n            else // E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n            {\n              D1        = endpt1;\n              D2        = endpt2;\n              edge_D1D2 = corrEdge[E2y][E2x];\n\n              // update route for edge (n2, D1), (n2, D2) and (B1, B2)\n              updateRouteType2(treenodes, n2, B1, B2, D1, D2, E2x, E2y,\n                               treeedges, edge_n2B1, edge_n2B2, edge_D1D2);\n              // update position for n2\n              treenodes[n2].x = E2x;\n              treenodes[n2].y = E2y;\n              // update 3 edges (n2, B1)->(D1, n2), (n2, B2)->(n2, D2), (D1,\n              // D2)->(B1, B2)\n              edge_n2D1               = edge_n2B1;\n              treeedges[edge_n2D1].n1 = D1;\n              treeedges[edge_n2D1].n2 = n2;\n              edge_n2D2               = 
edge_n2B2;\n              treeedges[edge_n2D2].n1 = n2;\n              treeedges[edge_n2D2].n2 = D2;\n              edge_B1B2               = edge_D1D2;\n              treeedges[edge_B1B2].n1 = B1;\n              treeedges[edge_B1B2].n2 = B2;\n              // update nbr and edge for 5 nodes n2, B1, B2, D1, D2\n              // n1's nbr (n1, B1, B2)->(n1, D1, D2)\n              treenodes[n2].nbr[0]  = n1;\n              treenodes[n2].edge[0] = edge_n1n2;\n              treenodes[n2].nbr[1]  = D1;\n              treenodes[n2].edge[1] = edge_n2D1;\n              treenodes[n2].nbr[2]  = D2;\n              treenodes[n2].edge[2] = edge_n2D2;\n              // B1's nbr n2->B2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[B1].nbr[i] == n2) {\n                  treenodes[B1].nbr[i]  = B2;\n                  treenodes[B1].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // B2's nbr n2->B1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[B2].nbr[i] == n2) {\n                  treenodes[B2].nbr[i]  = B1;\n                  treenodes[B2].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // D1's nbr D2->n2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[D1].nbr[i] == D2) {\n                  treenodes[D1].nbr[i]  = n2;\n                  treenodes[D1].edge[i] = edge_n2D1;\n                  break;\n                }\n              }\n              // D2's nbr D1->n2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[D2].nbr[i] == D1) {\n                  treenodes[D2].nbr[i]  = n2;\n                  treenodes[D2].edge[i] = edge_n2D2;\n                  break;\n                }\n              }\n            } // else E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n          }   // n2 is not a pin and E2!=n2\n\n          // update route for edge (n1, n2) and edge usage\n\n          // 
printf(\"update route? %d %d\\n\", netID, num_edges);\n          if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n            free(treeedges[edge_n1n2].route.gridsX);\n            free(treeedges[edge_n1n2].route.gridsY);\n          }\n          treeedges[edge_n1n2].route.gridsX =\n              (short*)calloc(cnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.gridsY =\n              (short*)calloc(cnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.type     = MAZEROUTE;\n          treeedges[edge_n1n2].route.routelen = cnt_n1n2 - 1;\n          treeedges[edge_n1n2].len = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n          treeedges[edge_n1n2].n_ripups += 1;\n          total_ripups += 1;\n          max_ripups.update(treeedges[edge_n1n2].n_ripups);\n\n          for (i = 0; i < cnt_n1n2; i++) {\n            // printf(\"cnt_n1n2: %d\\n\", cnt_n1n2);\n            treeedges[edge_n1n2].route.gridsX[i] = gridsX[i];\n            treeedges[edge_n1n2].route.gridsY[i] = gridsY[i];\n          }\n          // std::cout << \" adjsut tree\" << std::endl;\n          timer_adjusttree.stop();\n\n          // update edge usage\n\n          /*for(i=0; i<pre_length; i++)\n          {\n              if(pre_gridsX[i]==pre_gridsX[i+1]) // a vertical edge\n              {\n                  if(i != pre_length - 1)\n                      min_y = min(pre_gridsY[i], pre_gridsY[i+1]);\n                  else\n                      min_y = pre_gridsY[i];\n                  //v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n                  //galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage,\n          (short unsigned)1);\n                  //printf(\"x y %d %d i %d \\n\", pre_gridsX[i], min_y, i);\n                  v_edges[min_y*xGrid+pre_gridsX[i]].usage.fetch_sub((short\n          int)1, std::memory_order_relaxed);\n                  //if(v_edges[min_y*xGrid+pre_gridsX[i]].usage < 0) printf(\"V\n          negative! 
%d \\n\", i);\n              }\n              else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n              {\n                  if(i != pre_length - 1)\n                      min_x = min(pre_gridsX[i], pre_gridsX[i+1]);\n                  else\n                      min_x = pre_gridsX[i];\n                  //h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n                  //galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n          (short unsigned)1);\n                  //printf(\"x y %d %d i %d\\n\", min_x, pre_gridsY[i], i);\n                  h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage.fetch_sub((short\n          int)1, std::memory_order_relaxed);\n                  //if(h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage < 0)\n          printf(\"H negative! %d \\n\", i);\n              }\n          }*/\n          timer_updateusage.start();\n          for (i = 0; i < cnt_n1n2 - 1; i++) {\n            if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n            {\n              min_y = min(gridsY[i], gridsY[i + 1]);\n              // v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n              // galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage, (short\n              // unsigned)1);\n              v_edges[min_y * xGrid + gridsX[i]].usage.fetch_add(\n                  (short int)1, std::memory_order_relaxed);\n            } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n            {\n              min_x = min(gridsX[i], gridsX[i + 1]);\n              // h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n              // galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n              // (short unsigned)1);\n              h_edges[gridsY[i] * (xGrid - 1) + min_x].usage.fetch_add(\n                  (short int)1, std::memory_order_relaxed);\n            }\n          }\n          timer_updateusage.stop();\n          /*if(LOCK){\n              for(i=0; i<cnt_n1n2-1; i++)\n              {\n                  
if(gridsX[i]==gridsX[i+1]) // a vertical edge\n                  {\n                      min_y = min(gridsY[i], gridsY[i+1]);\n                      v_edges[min_y*xGrid+gridsX[i]].releaseLock();\n                  }\n                  else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                  {\n                      min_x = min(gridsX[i], gridsX[i+1]);\n                      h_edges[gridsY[i]*(xGrid-1)+min_x].releaseLock();\n                  }\n              }\n          }*/\n          // printf(\"netID %d edgeID %d src %d %d dst %d %d routelen: %d\\n\",\n          // netID, edgeID, n1x, n1y, n2x, n2y, cnt_n1n2);\n          timer_checkroute2dtree.start();\n          if (checkRoute2DTree(netID)) {\n            reInitTree(netID);\n            return;\n          }\n          timer_checkroute2dtree.stop();\n        } // congested route, if(enter)\n        timer_finegrain.stop();\n      } // only route the non-degraded edges (len>0)\n    }   // iterate on edges of a net\n  }\n\n  printf(\"total ripups: %d max ripups: %d\\n\", total_ripups.reduce(),\n         max_ripups.reduce());\n  //}, \"mazeroute vtune function\");\n  free(h_costTable);\n  free(v_costTable);\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/maze_finegrain_concurrent.h",
    "content": "\n/*struct netID_edgeID {\n    int netID;\n    int edgeID;\n    netID_edgeID() {\n        netID = 0;\n        edgeID = 0;\n    }\n\n\n};*/\nclass Concurrent_pq_grid {\npublic:\n  float* d1_p;\n  float d1_push;\n  int concurrentID;\n  Concurrent_pq_grid() {\n    d1_p         = NULL;\n    d1_push      = 0;\n    concurrentID = -1;\n  };\n  Concurrent_pq_grid(float* d1_p, float d1_push, int concurrentID) {\n    this->d1_p         = d1_p;\n    this->d1_push      = d1_push;\n    this->concurrentID = concurrentID;\n  }\n};\n\nstruct CONCURRENT_NET_STORAGE {\n  using LAptr = galois::substrate::LAptr;\n  LAptr pop_heap2_LA;\n  bool* pop_heap2;\n\n  LAptr d1_p_LA, d1_alloc_LA;\n  float** d1_p;\n  float* d1_alloc;\n\n  LAptr HV_p_LA, HV_alloc_LA, hyperV_p_LA, hyperV_alloc_LA, hyperH_p_LA,\n      hyperH_alloc_LA;\n  bool **HV_p, **hyperV_p, **hyperH_p;\n  bool *HV_alloc, *hyperV_alloc, *hyperH_alloc;\n\n  LAptr parentX1_p_LA, parentX1_alloc_LA, parentY1_p_LA, parentY1_alloc_LA,\n      parentX3_p_LA, parentX3_alloc_LA, parentY3_p_LA, parentY3_alloc_LA;\n  short **parentX1_p, **parentY1_p, **parentX3_p, **parentY3_p;\n  short *parentX1_alloc, *parentY1_alloc, *parentX3_alloc, *parentY3_alloc;\n\n  LAptr corrEdge_p_LA, corrEdge_alloc_LA;\n  int** corrEdge_p;\n  int* corrEdge_alloc;\n\n  LAptr inRegion_p_LA, inRegion_alloc_LA;\n  bool** inRegion_p;\n  bool* inRegion_alloc;\n\n  LAptr netEO_p_LA;\n  OrderNetEdge* netEO_p;\n\n  galois::LargeArray<galois::substrate::SimpleLock> nodelock;\n\n  std::vector<int> v2;\n\n  std::atomic<int> return_ind1;\n  std::atomic<float> return_dist;\n\n  CONCURRENT_NET_STORAGE() {\n    using namespace galois::substrate;\n\n    if (NET_PARALLEL) {\n      pop_heap2_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      pop_heap2    = reinterpret_cast<bool*>(pop_heap2_LA.get());\n\n      d1_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(float));\n      d1_alloc    = reinterpret_cast<float*>(d1_alloc_LA.get());\n      d1_p_LA     
= largeMallocLocal(yGrid * sizeof(float*));\n      d1_p        = reinterpret_cast<float**>(d1_p_LA.get());\n\n      HV_alloc_LA     = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      HV_alloc        = reinterpret_cast<bool*>(HV_alloc_LA.get());\n      hyperV_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      hyperV_alloc    = reinterpret_cast<bool*>(hyperV_alloc_LA.get());\n      hyperH_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      hyperH_alloc    = reinterpret_cast<bool*>(hyperH_alloc_LA.get());\n\n      HV_p_LA     = largeMallocLocal(yGrid * sizeof(bool*));\n      HV_p        = reinterpret_cast<bool**>(HV_p_LA.get());\n      hyperV_p_LA = largeMallocLocal(yGrid * sizeof(bool*));\n      hyperV_p    = reinterpret_cast<bool**>(hyperV_p_LA.get());\n      hyperH_p_LA = largeMallocLocal(yGrid * sizeof(bool*));\n      hyperH_p    = reinterpret_cast<bool**>(hyperH_p_LA.get());\n\n      parentX1_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentX1_alloc    = reinterpret_cast<short*>(parentX1_alloc_LA.get());\n      parentX3_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentX3_alloc    = reinterpret_cast<short*>(parentX3_alloc_LA.get());\n      parentY1_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentY1_alloc    = reinterpret_cast<short*>(parentY1_alloc_LA.get());\n      parentY3_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentY3_alloc    = reinterpret_cast<short*>(parentY3_alloc_LA.get());\n\n      parentX1_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentX1_p    = reinterpret_cast<short**>(parentX1_p_LA.get());\n      parentX3_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentX3_p    = reinterpret_cast<short**>(parentX3_p_LA.get());\n      parentY1_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentY1_p    = reinterpret_cast<short**>(parentY1_p_LA.get());\n      parentY3_p_LA = largeMallocLocal(yGrid 
* sizeof(short*));\n      parentY3_p    = reinterpret_cast<short**>(parentY3_p_LA.get());\n\n      corrEdge_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(int));\n      corrEdge_alloc    = reinterpret_cast<int*>(corrEdge_alloc_LA.get());\n      corrEdge_p_LA     = largeMallocLocal(yGrid * sizeof(int*));\n      corrEdge_p        = reinterpret_cast<int**>(corrEdge_p_LA.get());\n\n      inRegion_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      inRegion_alloc    = reinterpret_cast<bool*>(inRegion_alloc_LA.get());\n      inRegion_p_LA     = largeMallocLocal(yGrid * sizeof(bool*));\n      inRegion_p        = reinterpret_cast<bool**>(inRegion_p_LA.get());\n\n      netEO_p_LA = largeMallocLocal(2000 * sizeof(OrderNetEdge));\n      netEO_p    = reinterpret_cast<OrderNetEdge*>(netEO_p_LA.get());\n\n      nodelock.allocateInterleaved(xGrid * yGrid);\n    } else {\n      pop_heap2 = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n\n      d1_alloc = (float*)calloc(yGrid * xGrid, sizeof(float));\n      d1_p     = (float**)calloc(yGrid, sizeof(float*));\n\n      HV_alloc     = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      hyperV_alloc = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      hyperH_alloc = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      HV_p         = (bool**)calloc(yGrid, sizeof(bool*));\n      hyperV_p     = (bool**)calloc(yGrid, sizeof(bool*));\n      hyperH_p     = (bool**)calloc(yGrid, sizeof(bool*));\n\n      parentX1_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentX3_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentY1_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentY3_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentX1_p     = (short**)calloc(yGrid, sizeof(short*));\n      parentX3_p     = (short**)calloc(yGrid, sizeof(short*));\n      parentY1_p     = (short**)calloc(yGrid, sizeof(short*));\n      parentY3_p     = (short**)calloc(yGrid, sizeof(short*));\n\n      
corrEdge_alloc = (int*)calloc(yGrid * xGrid, sizeof(int));\n      corrEdge_p     = (int**)calloc(yGrid, sizeof(int*));\n\n      inRegion_alloc = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      inRegion_p     = (bool**)calloc(yGrid, sizeof(bool*));\n\n      netEO_p = (OrderNetEdge*)calloc(2000, sizeof(OrderNetEdge));\n\n      nodelock.allocateInterleaved(xGrid * yGrid);\n    }\n    // printf(\"allocation success\\n\");\n    for (int i = 0; i < yGrid; i++) {\n      d1_p[i] = &(d1_alloc[i * xGrid]);\n\n      HV_p[i]     = &(HV_alloc[i * xGrid]);\n      hyperV_p[i] = &(hyperV_alloc[i * xGrid]);\n      hyperH_p[i] = &(hyperH_alloc[i * xGrid]);\n\n      corrEdge_p[i] = &(corrEdge_alloc[i * xGrid]);\n\n      inRegion_p[i] = &(inRegion_alloc[i * xGrid]);\n    }\n\n    for (int i = 0; i < yGrid; i++) {\n      parentX1_p[i] = &(parentX1_alloc[i * xGrid]);\n      parentX3_p[i] = &(parentX3_alloc[i * xGrid]);\n      parentY1_p[i] = &(parentY1_alloc[i * xGrid]);\n      parentY3_p[i] = &(parentY3_alloc[i * xGrid]);\n    }\n  }\n  void reset_heap() { memset(pop_heap2, 0, yGrid * xGrid * sizeof(bool)); }\n  void clear() {\n    if (!NET_PARALLEL) {\n      free(pop_heap2);\n\n      free(d1_p);\n      free(d1_alloc);\n\n      free(HV_p);\n      free(hyperV_p);\n      free(hyperH_p);\n      free(HV_alloc);\n      free(hyperV_alloc);\n      free(hyperH_alloc);\n\n      free(parentX1_p);\n      free(parentY1_p);\n      free(parentX3_p);\n      free(parentY3_p);\n\n      free(parentX1_alloc);\n      free(parentY1_alloc);\n      free(parentX3_alloc);\n      free(parentY3_alloc);\n\n      free(corrEdge_alloc);\n      free(corrEdge_p);\n\n      free(netEO_p);\n    } else {\n      /*delete [] pop_heap2;\n      delete [] heap1;\n      delete [] heap2;\n\n      delete [] d1_p;\n      delete [] d1_alloc;\n\n      delete [] HV_p;\n      delete [] hyperV_p;\n      delete [] hyperH_p;\n      delete [] HV_alloc;\n      delete [] hyperV_alloc;\n      delete [] hyperH_alloc;\n\n      delete [] 
parentX1_p;\n      delete [] parentY1_p;\n      delete [] parentX3_p;\n      delete [] parentY3_p;\n\n      delete [] parentX1_alloc;\n      delete [] parentY1_alloc;\n      delete [] parentX3_alloc;\n      delete [] parentY3_alloc;\n\n      delete [] corrEdge_alloc;\n      delete [] corrEdge_p;\n\n      delete [] netEO_p;*/\n    }\n  }\n\n  ~CONCURRENT_NET_STORAGE() {\n    if (!NET_PARALLEL) {\n      free(pop_heap2);\n\n      free(d1_p);\n      free(d1_alloc);\n\n      free(HV_p);\n      free(hyperV_p);\n      free(hyperH_p);\n      free(HV_alloc);\n      free(hyperV_alloc);\n      free(hyperH_alloc);\n\n      free(parentX1_p);\n      free(parentY1_p);\n      free(parentX3_p);\n      free(parentY3_p);\n\n      free(parentX1_alloc);\n      free(parentY1_alloc);\n      free(parentX3_alloc);\n      free(parentY3_alloc);\n\n      free(corrEdge_alloc);\n      free(corrEdge_p);\n\n      free(netEO_p);\n    }\n  }\n};\n\nvoid setupHeap_nopq1clear(int netID, int edgeID,\n                          galois::InsertBag<Concurrent_pq_grid>& pq1,\n                          std::vector<int>& v2, int regionX1, int regionX2,\n                          int regionY1, int regionY2, float** d1,\n                          int** corrEdge, bool** inRegion, int concurrentID) {\n  int i, j, d, numNodes, n1, n2, x1, y1, x2, y2;\n  int nbr, nbrX, nbrY, cur, edge;\n  int grid, x_grid, y_grid;\n  int queuehead, queuetail, *queue;\n  Bool* visited;\n  TreeEdge* treeedges;\n  TreeNode* treenodes;\n  Route* route;\n\n  for (i = regionY1; i <= regionY2; i++) {\n    for (j = regionX1; j <= regionX2; j++)\n      inRegion[i][j] = TRUE;\n  }\n\n  treeedges = sttrees[netID].edges;\n  treenodes = sttrees[netID].nodes;\n  d         = sttrees[netID].deg;\n\n  n1 = treeedges[edgeID].n1;\n  n2 = treeedges[edgeID].n2;\n  x1 = treenodes[n1].x;\n  y1 = treenodes[n1].y;\n  x2 = treenodes[n2].x;\n  y2 = treenodes[n2].y;\n\n  // if(netID == 14628)\n  //    printf(\"net: %d edge: %d src: %d %d dst: %d %d d: 
%d\\n\", netID, edgeID,\n  //    y1, x1, y2, x2, d);\n  v2.clear(); // Michael\n  if (d == 2) // 2-pin net\n  {\n    d1[y1][x1] = 0;\n    pq1.push(Concurrent_pq_grid(&(d1[y1][x1]), 0, concurrentID));\n    v2.push_back(y2 * xGrid + x2);\n  } else // net with more than 2 pins\n  {\n    numNodes = 2 * d - 2;\n\n    visited = (Bool*)calloc(numNodes, sizeof(Bool));\n    for (i = 0; i < numNodes; i++)\n      visited[i] = FALSE;\n\n    queue = (int*)calloc(numNodes, sizeof(int));\n\n    // find all the grids on tree edges in subtree t1 (connecting to n1) and put\n    // them into heap1\n    if (n1 < d) // n1 is a Pin node\n    {\n      // just need to put n1 itself into heap1\n      d1[y1][x1] = 0;\n      pq1.push(Concurrent_pq_grid(&(d1[y1][x1]), 0, concurrentID));\n      visited[n1] = TRUE;\n    } else // n1 is a Steiner node\n    {\n      queuehead = queuetail = 0;\n\n      // add n1 into heap1\n      d1[y1][x1] = 0;\n      // if(netID == 252163 && edgeID == 51)\n      //    printf(\"y: %d x: %d\\n\", y1, x1);\n      pq1.push(Concurrent_pq_grid(&(d1[y1][x1]), 0, concurrentID));\n      visited[n1] = TRUE;\n\n      // add n1 into the queue\n      queue[queuetail] = n1;\n      queuetail++;\n\n      // loop to find all the edges in subtree t1\n      while (queuetail > queuehead) {\n        // get cur node from the queuehead\n        cur = queue[queuehead];\n        queuehead++;\n        visited[cur] = TRUE;\n        if (cur >= d) // cur node is a Steiner node\n        {\n          for (i = 0; i < 3; i++) {\n            nbr  = treenodes[cur].nbr[i];\n            edge = treenodes[cur].edge[i];\n            if (nbr != n2) // not n2\n            {\n              if (visited[nbr] == FALSE) {\n                // put all the grids on the two adjacent tree edges into heap1\n                if (treeedges[edge].route.routelen > 0) // not a degraded edge\n                {\n                  // put nbr into heap1 if in enlarged region\n                  if 
(inRegion[treenodes[nbr].y][treenodes[nbr].x]) {\n                    nbrX           = treenodes[nbr].x;\n                    nbrY           = treenodes[nbr].y;\n                    d1[nbrY][nbrX] = 0;\n                    // if(netID == 252163 && edgeID == 51)\n                    //    printf(\"y: %d x: %d\\n\", nbrY, nbrX);\n                    pq1.push(\n                        Concurrent_pq_grid(&(d1[nbrY][nbrX]), 0, concurrentID));\n                    corrEdge[nbrY][nbrX] = edge;\n                  }\n\n                  // the coordinates of two end nodes of the edge\n\n                  route = &(treeedges[edge].route);\n                  if (route->type == MAZEROUTE) {\n                    for (j = 1; j < route->routelen;\n                         j++) // don't put edge_n1 and edge_n2 into heap1\n                    {\n                      x_grid = route->gridsX[j];\n                      y_grid = route->gridsY[j];\n\n                      if (inRegion[y_grid][x_grid]) {\n                        d1[y_grid][x_grid] = 0;\n                        // if(netID == 252163 && edgeID == 51)\n                        //    printf(\"y: %d x: %d\\n\", y_grid, x_grid);\n                        pq1.push(Concurrent_pq_grid(&(d1[y_grid][x_grid]), 0,\n                                                    concurrentID));\n                        corrEdge[y_grid][x_grid] = edge;\n                      }\n                    }\n                  } // if MAZEROUTE\n                  else {\n                    printf(\"Setup Heap: not maze routing\\n\");\n                  }\n                } // if not a degraded edge (len>0)\n\n                // add the neighbor of cur node into queue\n                queue[queuetail] = nbr;\n                queuetail++;\n              } // if the node is not visited\n            }   // if nbr!=n2\n          }     // loop i (3 neighbors for cur node)\n        }       // if cur node is a Steiner node\n      }         // while queue is not 
empty\n    }           // else n1 is not a Pin node\n\n    // find all the grids on subtree t2 (connect to n2) and put them into heap2\n    // find all the grids on tree edges in subtree t2 (connecting to n2) and put\n    // them into heap2\n    if (n2 < d) // n2 is a Pin node\n    {\n      // just need to put n2 itself into heap2\n      v2.push_back(y2 * xGrid + x2);\n      // if(netID == 14628)\n      //    printf(\"y: %d x: %d \\n\", y2, x2);\n      visited[n2] = TRUE;\n    } else // n2 is a Steiner node\n    {\n      queuehead = queuetail = 0;\n\n      // add n2 into heap2\n      v2.push_back(y2 * xGrid + x2);\n      // if(netID == 252163 && edgeID == 51)\n      //    printf(\"dst y: %d x: %d \\n\", y2, x2);\n      visited[n2] = TRUE;\n\n      // add n2 into the queue\n      queue[queuetail] = n2;\n      queuetail++;\n\n      // loop to find all the edges in subtree t2\n      while (queuetail > queuehead) {\n        // get cur node from queuehead\n        cur          = queue[queuehead];\n        visited[cur] = TRUE;\n        queuehead++;\n\n        if (cur >= d) // cur node is a Steiner node\n        {\n          for (i = 0; i < 3; i++) {\n            nbr  = treenodes[cur].nbr[i];\n            edge = treenodes[cur].edge[i];\n            if (nbr != n1) // not n1\n            {\n              if (visited[nbr] == FALSE) {\n                // put all the grids on the two adjacent tree edges into heap2\n                if (treeedges[edge].route.routelen > 0) // not a degraded edge\n                {\n                  // put nbr into heap2\n                  if (inRegion[treenodes[nbr].y][treenodes[nbr].x]) {\n                    nbrX = treenodes[nbr].x;\n                    nbrY = treenodes[nbr].y;\n                    v2.push_back(nbrY * xGrid + nbrX);\n                    // if(netID == 252163 && edgeID == 51)\n                    //    printf(\"dst y: %d x: %d\\n\", nbrY, nbrX);\n                    corrEdge[nbrY][nbrX] = edge;\n                  }\n\n          
        // the coordinates of two end nodes of the edge\n\n                  route = &(treeedges[edge].route);\n                  if (route->type == MAZEROUTE) {\n                    for (j = 1; j < route->routelen;\n                         j++) // don't put edge_n1 and edge_n2 into heap2\n                    {\n                      x_grid = route->gridsX[j];\n                      y_grid = route->gridsY[j];\n                      if (inRegion[y_grid][x_grid]) {\n                        v2.push_back(y_grid * xGrid + x_grid);\n                        // if(netID == 252163 && edgeID == 51)\n                        //    printf(\"dst y: %d x: %d\\n\", y_grid, x_grid);\n                        corrEdge[y_grid][x_grid] = edge;\n                      }\n                    }\n                  } // if MAZEROUTE\n                  else {\n                    printf(\"Setup Heap: not maze routing\\n\");\n                  }\n                } // if the edge is not degraded (len>0)\n\n                // add the neighbor of cur node into queue\n                queue[queuetail] = nbr;\n                queuetail++;\n              } // if the node is not visited\n            }   // if nbr!=n1\n          }     // loop i (3 neighbors for cur node)\n        }       // if cur node is a Steiner node\n      }         // while queue is not empty\n    }           // else n2 is not a Pin node\n\n    free(queue);\n    free(visited);\n  } // net with more than two pins\n\n  for (i = regionY1; i <= regionY2; i++) {\n    for (j = regionX1; j <= regionX2; j++)\n      inRegion[i][j] = FALSE;\n  }\n}\n\nvoid trace_back(CONCURRENT_NET_STORAGE* concurrent_net_storage,\n                int* concurrentID2netID, int* concurrentID2edgeID,\n                int concurrentNet_cnt) {\n\n  for (int concurrentID = 0; concurrentID < concurrentNet_cnt; concurrentID++) {\n\n    int concurrentID = top.concurrentID;\n\n    float** d1 = concurrent_net_storage[concurrentID].d1_p;\n    std::atomic<int>& 
return_ind1 =\n        concurrent_net_storage[concurrentID].return_ind1;\n    std::atomic<float>& return_dist =\n        concurrent_net_storage[concurrentID].return_dist;\n\n    bool* pop_heap2 = concurrent_net_storage[concurrentID].pop_heap2;\n\n    bool** HV     = concurrent_net_storage[concurrentID].HV_p;\n    bool** hyperV = concurrent_net_storage[concurrentID].hyperV_p;\n    bool** hyperH = concurrent_net_storage[concurrentID].hyperH_p;\n\n    short** parentX1 = concurrent_net_storage[concurrentID].parentX1_p;\n    short** parentX3 = concurrent_net_storage[concurrentID].parentX3_p;\n    short** parentY1 = concurrent_net_storage[concurrentID].parentY1_p;\n    short** parentY3 = concurrent_net_storage[concurrentID].parentY3_p;\n\n    int** corrEdge = concurrent_net_storage[concurrentID].corrEdge_p;\n\n    OrderNetEdge* netEO = concurrent_net_storage[concurrentID].netEO_p;\n\n    bool** inRegion      = concurrent_net_storage[concurrentID].inRegion_p;\n    bool* inRegion_alloc = concurrent_net_storage[concurrentID].inRegion_alloc;\n\n    galois::LargeArray<galois::substrate::SimpleLock>& nodelock =\n        concurrent_net_storage[concurrentID].nodelock;\n\n    std::vector<int>& v2 = concurrent_net_storage[concurrentID].v2;\n\n    int regionX1     = concurrent_net_storage[concurrentID].regionX1;\n    int regionX2     = concurrent_net_storage[concurrentID].regionX2;\n    int regionY1     = concurrent_net_storage[concurrentID].regionY1;\n    int regionY2     = concurrent_net_storage[concurrentID].regionY2;\n    int regionWidth  = concurrent_net_storage[concurrentID].regionWidth;\n    int regionHeight = concurrent_net_storage[concurrentID].regionHeight;\n\n    for (auto ii = v2.begin(); ii != v2.end(); ii++)\n      pop_heap2[*ii] = FALSE;\n\n    crossX = return_ind1 % xGrid;\n    crossY = return_ind1 / xGrid;\n\n    cnt      = 0;\n    int curX = crossX;\n    int curY = crossY;\n    int tmpX, tmpY;\n    // if(netID == 2 && edgeID == 26)\n    //    printf(\"crossX %d 
crossY %d return_d: %f\\n\", crossX, crossY,\n    //    return_dist.load());\n    timer_traceback.start();\n    while (d1[curY][curX] != 0) // loop until reach subtree1\n    {\n      // if(cnt < 200)\n      //    printf(\"Y: %d X: %d hyperH: %d hyperV: %d HV: %d d1: %f\\n\", curY,\n      //    curX, hyperH[curY][curX], hyperV[curY][curX], HV[curY][curX],\n      //    d1[curY][curX]);\n\n      hypered = FALSE;\n      if (cnt != 0) {\n        if (curX != tmpX && hyperH[curY][curX]) {\n          curX    = 2 * curX - tmpX;\n          hypered = TRUE;\n        }\n        // printf(\"hyperV[153][134]: %d\\n\", hyperV[curY][curX]);\n        if (curY != tmpY && hyperV[curY][curX]) {\n          curY    = 2 * curY - tmpY;\n          hypered = TRUE;\n        }\n      }\n      tmpX = curX;\n      tmpY = curY;\n      if (!hypered) {\n        if (HV[tmpY][tmpX]) {\n          curY = parentY1[tmpY][tmpX];\n        } else {\n          curX = parentX3[tmpY][tmpX];\n        }\n      }\n\n      tmp_gridsX[cnt] = curX;\n      tmp_gridsY[cnt] = curY;\n      cnt++;\n    }\n    // reverse the grids on the path\n\n    for (i = 0; i < cnt; i++) {\n      tmpind    = cnt - 1 - i;\n      gridsX[i] = tmp_gridsX[tmpind];\n      gridsY[i] = tmp_gridsY[tmpind];\n    }\n    // add the connection point (crossX, crossY)\n    gridsX[cnt] = crossX;\n    gridsY[cnt] = crossY;\n    cnt++;\n\n    curX     = crossX;\n    curY     = crossY;\n    cnt_n1n2 = cnt;\n\n    // change the tree structure according to the new routing for the tree edge\n    // find E1 and E2, and the endpoints of the edges they are on\n    E1x = gridsX[0];\n    E1y = gridsY[0];\n    E2x = gridsX[cnt_n1n2 - 1];\n    E2y = gridsY[cnt_n1n2 - 1];\n\n    edge_n1n2 = edgeID;\n\n    timer_traceback.stop();\n\n    // if(netID == 14628)\n    //    printf(\"netID %d edgeID %d src %d %d dst %d %d routelen: %d\\n\", netID,\n    //    edgeID, E1x, E1y, E2x, E2y, cnt_n1n2);\n    // (1) consider subtree1\n    timer_adjusttree.start();\n    if (n1 >= 
deg && (E1x != n1x || E1y != n1y))\n    // n1 is not a pin and E1!=n1, then make change to subtree1, otherwise, no\n    // change to subtree1\n    {\n      shifted = TRUE;\n      // find the endpoints of the edge E1 is on\n      endpt1 = treeedges[corrEdge[E1y][E1x]].n1;\n      endpt2 = treeedges[corrEdge[E1y][E1x]].n2;\n\n      // find A1, A2 and edge_n1A1, edge_n1A2\n      if (treenodes[n1].nbr[0] == n2) {\n        A1        = treenodes[n1].nbr[1];\n        A2        = treenodes[n1].nbr[2];\n        edge_n1A1 = treenodes[n1].edge[1];\n        edge_n1A2 = treenodes[n1].edge[2];\n      } else if (treenodes[n1].nbr[1] == n2) {\n        A1        = treenodes[n1].nbr[0];\n        A2        = treenodes[n1].nbr[2];\n        edge_n1A1 = treenodes[n1].edge[0];\n        edge_n1A2 = treenodes[n1].edge[2];\n      } else {\n        A1        = treenodes[n1].nbr[0];\n        A2        = treenodes[n1].nbr[1];\n        edge_n1A1 = treenodes[n1].edge[0];\n        edge_n1A2 = treenodes[n1].edge[1];\n      }\n\n      if (endpt1 == n1 || endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n      {\n        // if E1 is on (n1, A2), switch A1 and A2 so that E1 is always on (n1,\n        // A1)\n        if (endpt1 == A2 || endpt2 == A2) {\n          tmpi      = A1;\n          A1        = A2;\n          A2        = tmpi;\n          tmpi      = edge_n1A1;\n          edge_n1A1 = edge_n1A2;\n          edge_n1A2 = tmpi;\n        }\n\n        // update route for edge (n1, A1), (n1, A2)\n        updateRouteType1(treenodes, n1, A1, A2, E1x, E1y, treeedges, edge_n1A1,\n                         edge_n1A2);\n        // update position for n1\n        treenodes[n1].x = E1x;\n        treenodes[n1].y = E1y;\n      }    // if E1 is on (n1, A1) or (n1, A2)\n      else // E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n      {\n        C1        = endpt1;\n        C2        = endpt2;\n        edge_C1C2 = corrEdge[E1y][E1x];\n\n        // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n        
updateRouteType2(treenodes, n1, A1, A2, C1, C2, E1x, E1y, treeedges,\n                         edge_n1A1, edge_n1A2, edge_C1C2);\n        // update position for n1\n        treenodes[n1].x = E1x;\n        treenodes[n1].y = E1y;\n        // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2), (C1, C2)->(A1,\n        // A2)\n        edge_n1C1               = edge_n1A1;\n        treeedges[edge_n1C1].n1 = C1;\n        treeedges[edge_n1C1].n2 = n1;\n        edge_n1C2               = edge_n1A2;\n        treeedges[edge_n1C2].n1 = n1;\n        treeedges[edge_n1C2].n2 = C2;\n        edge_A1A2               = edge_C1C2;\n        treeedges[edge_A1A2].n1 = A1;\n        treeedges[edge_A1A2].n2 = A2;\n        // update nbr and edge for 5 nodes n1, A1, A2, C1, C2\n        // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n        treenodes[n1].nbr[0]  = n2;\n        treenodes[n1].edge[0] = edge_n1n2;\n        treenodes[n1].nbr[1]  = C1;\n        treenodes[n1].edge[1] = edge_n1C1;\n        treenodes[n1].nbr[2]  = C2;\n        treenodes[n1].edge[2] = edge_n1C2;\n        // A1's nbr n1->A2\n        for (i = 0; i < 3; i++) {\n          if (treenodes[A1].nbr[i] == n1) {\n            treenodes[A1].nbr[i]  = A2;\n            treenodes[A1].edge[i] = edge_A1A2;\n            break;\n          }\n        }\n        // A2's nbr n1->A1\n        for (i = 0; i < 3; i++) {\n          if (treenodes[A2].nbr[i] == n1) {\n            treenodes[A2].nbr[i]  = A1;\n            treenodes[A2].edge[i] = edge_A1A2;\n            break;\n          }\n        }\n        // C1's nbr C2->n1\n        for (i = 0; i < 3; i++) {\n          if (treenodes[C1].nbr[i] == C2) {\n            treenodes[C1].nbr[i]  = n1;\n            treenodes[C1].edge[i] = edge_n1C1;\n            break;\n          }\n        }\n        // C2's nbr C1->n1\n        for (i = 0; i < 3; i++) {\n          if (treenodes[C2].nbr[i] == C1) {\n            treenodes[C2].nbr[i]  = n1;\n            treenodes[C2].edge[i] = edge_n1C2;\n            break;\n       
   }\n        }\n\n      } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n    }   // n1 is not a pin and E1!=n1\n\n    // (2) consider subtree2\n\n    if (n2 >= deg && (E2x != n2x || E2y != n2y))\n    // n2 is not a pin and E2!=n2, then make change to subtree2, otherwise, no\n    // change to subtree2\n    {\n      shifted = TRUE;\n      // find the endpoints of the edge E2 is on\n      endpt1 = treeedges[corrEdge[E2y][E2x]].n1;\n      endpt2 = treeedges[corrEdge[E2y][E2x]].n2;\n\n      // find B1, B2\n      if (treenodes[n2].nbr[0] == n1) {\n        B1        = treenodes[n2].nbr[1];\n        B2        = treenodes[n2].nbr[2];\n        edge_n2B1 = treenodes[n2].edge[1];\n        edge_n2B2 = treenodes[n2].edge[2];\n      } else if (treenodes[n2].nbr[1] == n1) {\n        B1        = treenodes[n2].nbr[0];\n        B2        = treenodes[n2].nbr[2];\n        edge_n2B1 = treenodes[n2].edge[0];\n        edge_n2B2 = treenodes[n2].edge[2];\n      } else {\n        B1        = treenodes[n2].nbr[0];\n        B2        = treenodes[n2].nbr[1];\n        edge_n2B1 = treenodes[n2].edge[0];\n        edge_n2B2 = treenodes[n2].edge[1];\n      }\n\n      if (endpt1 == n2 || endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n      {\n        // if E2 is on (n2, B2), switch B1 and B2 so that E2 is always on (n2,\n        // B1)\n        if (endpt1 == B2 || endpt2 == B2) {\n          tmpi      = B1;\n          B1        = B2;\n          B2        = tmpi;\n          tmpi      = edge_n2B1;\n          edge_n2B1 = edge_n2B2;\n          edge_n2B2 = tmpi;\n        }\n\n        // update route for edge (n2, B1), (n2, B2)\n        updateRouteType1(treenodes, n2, B1, B2, E2x, E2y, treeedges, edge_n2B1,\n                         edge_n2B2);\n\n        // update position for n2\n        treenodes[n2].x = E2x;\n        treenodes[n2].y = E2y;\n      }    // if E2 is on (n2, B1) or (n2, B2)\n      else // E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n      {\n        D1        = 
endpt1;\n        D2        = endpt2;\n        edge_D1D2 = corrEdge[E2y][E2x];\n\n        // update route for edge (n2, D1), (n2, D2) and (B1, B2)\n        updateRouteType2(treenodes, n2, B1, B2, D1, D2, E2x, E2y, treeedges,\n                         edge_n2B1, edge_n2B2, edge_D1D2);\n        // update position for n2\n        treenodes[n2].x = E2x;\n        treenodes[n2].y = E2y;\n        // update 3 edges (n2, B1)->(D1, n2), (n2, B2)->(n2, D2), (D1, D2)->(B1,\n        // B2)\n        edge_n2D1               = edge_n2B1;\n        treeedges[edge_n2D1].n1 = D1;\n        treeedges[edge_n2D1].n2 = n2;\n        edge_n2D2               = edge_n2B2;\n        treeedges[edge_n2D2].n1 = n2;\n        treeedges[edge_n2D2].n2 = D2;\n        edge_B1B2               = edge_D1D2;\n        treeedges[edge_B1B2].n1 = B1;\n        treeedges[edge_B1B2].n2 = B2;\n        // update nbr and edge for 5 nodes n2, B1, B2, D1, D2\n        // n2's nbr (n1, B1, B2)->(n1, D1, D2)\n        treenodes[n2].nbr[0]  = n1;\n        treenodes[n2].edge[0] = edge_n1n2;\n        treenodes[n2].nbr[1]  = D1;\n        treenodes[n2].edge[1] = edge_n2D1;\n        treenodes[n2].nbr[2]  = D2;\n        treenodes[n2].edge[2] = edge_n2D2;\n        // B1's nbr n2->B2\n        for (i = 0; i < 3; i++) {\n          if (treenodes[B1].nbr[i] == n2) {\n            treenodes[B1].nbr[i]  = B2;\n            treenodes[B1].edge[i] = edge_B1B2;\n            break;\n          }\n        }\n        // B2's nbr n2->B1\n        for (i = 0; i < 3; i++) {\n          if (treenodes[B2].nbr[i] == n2) {\n            treenodes[B2].nbr[i]  = B1;\n            treenodes[B2].edge[i] = edge_B1B2;\n            break;\n          }\n        }\n        // D1's nbr D2->n2\n        for (i = 0; i < 3; i++) {\n          if (treenodes[D1].nbr[i] == D2) {\n            treenodes[D1].nbr[i]  = n2;\n            treenodes[D1].edge[i] = edge_n2D1;\n            break;\n          }\n        }\n        // D2's nbr D1->n2\n        for (i = 0; i < 3; i++) {\n      
    if (treenodes[D2].nbr[i] == D1) {\n            treenodes[D2].nbr[i]  = n2;\n            treenodes[D2].edge[i] = edge_n2D2;\n            break;\n          }\n        }\n      } // else E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n    }   // n2 is not a pin and E2!=n2\n\n    // update route for edge (n1, n2) and edge usage\n\n    // printf(\"update route? %d %d\\n\", netID, num_edges);\n    if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n      free(treeedges[edge_n1n2].route.gridsX);\n      free(treeedges[edge_n1n2].route.gridsY);\n    }\n    treeedges[edge_n1n2].route.gridsX = (short*)calloc(cnt_n1n2, sizeof(short));\n    treeedges[edge_n1n2].route.gridsY = (short*)calloc(cnt_n1n2, sizeof(short));\n    treeedges[edge_n1n2].route.type   = MAZEROUTE;\n    treeedges[edge_n1n2].route.routelen = cnt_n1n2 - 1;\n    treeedges[edge_n1n2].len            = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n    treeedges[edge_n1n2].n_ripups += 1;\n    total_ripups += 1;\n    max_ripups.update(treeedges[edge_n1n2].n_ripups);\n\n    for (i = 0; i < cnt_n1n2; i++) {\n      // printf(\"cnt_n1n2: %d\\n\", cnt_n1n2);\n      treeedges[edge_n1n2].route.gridsX[i] = gridsX[i];\n      treeedges[edge_n1n2].route.gridsY[i] = gridsY[i];\n    }\n    // std::cout << \" adjsut tree\" << std::endl;\n    timer_adjusttree.stop();\n\n    // update edge usage\n\n    /*for(i=0; i<pre_length; i++)\n    {\n        if(pre_gridsX[i]==pre_gridsX[i+1]) // a vertical edge\n        {\n            if(i != pre_length - 1)\n                min_y = min(pre_gridsY[i], pre_gridsY[i+1]);\n            else\n                min_y = pre_gridsY[i];\n            //v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n            //galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage, (short\n    unsigned)1);\n            //printf(\"x y %d %d i %d \\n\", pre_gridsX[i], min_y, i);\n            v_edges[min_y*xGrid+pre_gridsX[i]].usage.fetch_sub((short int)1,\n    std::memory_order_relaxed);\n            
//if(v_edges[min_y*xGrid+pre_gridsX[i]].usage < 0) printf(\"V\n    negative! %d \\n\", i);\n        }\n        else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n        {\n            if(i != pre_length - 1)\n                min_x = min(pre_gridsX[i], pre_gridsX[i+1]);\n            else\n                min_x = pre_gridsX[i];\n            //h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n            //galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage, (short\n    unsigned)1);\n            //printf(\"x y %d %d i %d\\n\", min_x, pre_gridsY[i], i);\n            h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage.fetch_sub((short int)1,\n    std::memory_order_relaxed);\n            //if(h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage < 0) printf(\"H\n    negative! %d \\n\", i);\n        }\n    }*/\n    timer_updateusage.start();\n    for (i = 0; i < cnt_n1n2 - 1; i++) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        min_y = min(gridsY[i], gridsY[i + 1]);\n        // v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n        // galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage, (short\n        // unsigned)1);\n        v_edges[min_y * xGrid + gridsX[i]].usage.fetch_add(\n            (short int)1, std::memory_order_relaxed);\n      } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n      {\n        min_x = min(gridsX[i], gridsX[i + 1]);\n        // h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n        // galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage, (short\n        // unsigned)1);\n        h_edges[gridsY[i] * (xGrid - 1) + min_x].usage.fetch_add(\n            (short int)1, std::memory_order_relaxed);\n      }\n    }\n    timer_updateusage.stop();\n    /*if(LOCK){\n        for(i=0; i<cnt_n1n2-1; i++)\n        {\n            if(gridsX[i]==gridsX[i+1]) // a vertical edge\n            {\n                min_y = min(gridsY[i], gridsY[i+1]);\n                v_edges[min_y*xGrid+gridsX[i]].releaseLock();\n            }\n    
        else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n            {\n                min_x = min(gridsX[i], gridsX[i+1]);\n                h_edges[gridsY[i]*(xGrid-1)+min_x].releaseLock();\n            }\n        }\n    }*/\n    // printf(\"netID %d edgeID %d src %d %d dst %d %d routelen: %d\\n\", netID,\n    // edgeID, n1x, n1y, n2x, n2y, cnt_n1n2);\n    if (checkRoute2DTree(netID)) {\n      reInitTree(netID);\n      return;\n    }\n  }\n}\n\nvoid concurrent_maze(galois::InsertBag<pq_grid> pq1,\n                     CONCURRENT_NET_STORAGE* concurrent_net_storage) {\n\n  galois::for_each(\n      galois::iterate(pq1),\n      [&](const auto& top, auto& ctx) {\n        // relax all the adjacent grids within the enlarged region for source\n        // subtree\n        int concurrentID = top.concurrentID;\n\n        float** d1 = concurrent_net_storage[concurrentID].d1_p;\n        std::atomic<int>& return_ind1 =\n            concurrent_net_storage[concurrentID].return_ind1;\n        std::atomic<float>& return_dist =\n            concurrent_net_storage[concurrentID].return_dist;\n\n        bool* pop_heap2 = concurrent_net_storage[concurrentID].pop_heap2;\n\n        bool** HV     = concurrent_net_storage[concurrentID].HV_p;\n        bool** hyperV = concurrent_net_storage[concurrentID].hyperV_p;\n        bool** hyperH = concurrent_net_storage[concurrentID].hyperH_p;\n\n        short** parentX1 = concurrent_net_storage[concurrentID].parentX1_p;\n        short** parentX3 = concurrent_net_storage[concurrentID].parentX3_p;\n        short** parentY1 = concurrent_net_storage[concurrentID].parentY1_p;\n        short** parentY3 = concurrent_net_storage[concurrentID].parentY3_p;\n\n        int** corrEdge = concurrent_net_storage[concurrentID].corrEdge_p;\n\n        OrderNetEdge* netEO = concurrent_net_storage[concurrentID].netEO_p;\n\n        bool** inRegion = concurrent_net_storage[concurrentID].inRegion_p;\n        bool* inRegion_alloc =\n            
concurrent_net_storage[concurrentID].inRegion_alloc;\n\n        galois::LargeArray<galois::substrate::SimpleLock>& nodelock =\n            concurrent_net_storage[concurrentID].nodelock;\n\n        std::vector<int>& v2 = concurrent_net_storage[concurrentID].v2;\n\n        int regionX1     = concurrent_net_storage[concurrentID].regionX1;\n        int regionX2     = concurrent_net_storage[concurrentID].regionX2;\n        int regionY1     = concurrent_net_storage[concurrentID].regionY1;\n        int regionY2     = concurrent_net_storage[concurrentID].regionY2;\n        int regionWidth  = concurrent_net_storage[concurrentID].regionWidth;\n        int regionHeight = concurrent_net_storage[concurrentID].regionHeight;\n\n        int ind1 = top.d1_p - &d1[0][0];\n\n        int curX = ind1 % xGrid;\n        int curY = ind1 / xGrid;\n        int grid = curY * xGrid + curX;\n\n        float curr_d1 = d1[curY][curX];\n        float d1_push = top.d1_push;\n\n        if (d1_push == curr_d1 && d1_push < return_dist.load()) {\n          if (pop_heap2[ind1] != false) {\n            // if(netID == 2 && edgeID == 26)\n            //    printf(\"reach! 
curX curY %d %d, d1_push: %f, curr_d1: %f\n            //    return_d: %f\\n\", curX, curY, d1_push, curr_d1,\n            //    return_dist.load());\n            return_ind1.store(ind1);\n            return_dist.store(d1_push);\n          }\n\n          /*grid = curY*xGrid + curX - 1;\n          if(curX>regionX1)\n              galois::runtime::acquire(&data[grid], galois::MethodFlag::WRITE);\n\n          grid = curY*xGrid + curX + 1;\n          if(curX<regionX2)\n              galois::runtime::acquire(&data[grid], galois::MethodFlag::WRITE);\n\n          grid = (curY - 1)*xGrid + curX;\n          if(curY>regionY1)\n              galois::runtime::acquire(&data[grid], galois::MethodFlag::WRITE);\n\n          grid = (curY + 1)*xGrid + curX;\n          if(curY<regionY2)\n              galois::runtime::acquire(&data[grid],\n          galois::MethodFlag::WRITE);*/\n\n          int preX, preY;\n          if (curr_d1 != 0) {\n            if (HV[curY][curX]) {\n              preX = parentX1[curY][curX];\n              preY = parentY1[curY][curX];\n            } else {\n              preX = parentX3[curY][curX];\n              preY = parentY3[curY][curX];\n            }\n          } else {\n            preX = curX;\n            preY = curY;\n          }\n          // printf(\"pop curY: %d curX: %d d1: %f preX: %d preY: %d hyperH: %d\n          // hyperV: %d HV: %d return_dist: %f\\n\",\n          //    curY, curX, curr_d1, preX, preY, hyperH[curY][curX],\n          //    hyperV[curY][curX], HV[curY][curX], return_dist.load());\n          float tmp, tmp_cost;\n          int tmp_grid;\n          int tmpX, tmpY;\n          // left\n          bool tmpH = false;\n          bool tmpV = false;\n\n          // if(curX>regionX1)\n          //    data[curY*xGrid+curX-1].lock();\n\n          // data[curY*(xGrid-1)+curX].lock();\n\n          if (curX > regionX1) {\n            grid = curY * (xGrid - 1) + curX - 1;\n\n            // printf(\"grid: %d %d usage: %d red:%d last:%d sum%f 
%d\\n\",\n            //    grid%xGrid, grid/xGrid, h_edges[grid].usage.load(),\n            //    h_edges[grid].red, h_edges[grid].last_usage, L ,\n            //    h_edges[grid].usage.load() + h_edges[grid].red +\n            //    (int)(L*h_edges[grid].last_usage));\n            if ((preY == curY) || (curr_d1 == 0)) {\n              tmp = curr_d1 +\n                    h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                (int)(L * h_edges[grid].last_usage)];\n            } else {\n              if (curX < regionX2 - 1) {\n                tmp_grid = curY * (xGrid - 1) + curX;\n                tmp_cost = d1[curY][curX + 1] +\n                           h_costTable[h_edges[tmp_grid].usage +\n                                       h_edges[tmp_grid].red +\n                                       (int)(L * h_edges[tmp_grid].last_usage)];\n\n                if (tmp_cost < curr_d1 + VIA) {\n                  // hyperH[curY][curX] = TRUE; //Michael\n                  tmpH = true;\n                }\n              }\n              tmp = curr_d1 + VIA +\n                    h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                (int)(L * h_edges[grid].last_usage)];\n            }\n            tmpX = curX - 1; // the left neighbor\n\n            if (d1[curY][tmpX] > tmp && tmp < return_dist) {\n              ctx.push(\n                  Concurrent_pq_grid(&(d1[curY][tmpX]), tmp, concurrentID));\n              nodelock[curY * xGrid + curX - 1].lock();\n              if (d1[curY][tmpX] > tmp &&\n                  tmp < return_dist) // left neighbor been put into heap1 but\n                                     // needs update\n              {\n                d1[curY][tmpX]       = tmp;\n                parentX3[curY][tmpX] = curX;\n                parentY3[curY][tmpX] = curY;\n                HV[curY][tmpX]       = FALSE;\n              }\n              nodelock[curY * xGrid + curX - 1].unlock();\n            
}\n          }\n          // right\n\n          if (curX < regionX2) {\n            // data[curY*xGrid+curX+1].lock();\n            grid = curY * (xGrid - 1) + curX;\n            // printf(\"grid: %d %d usage: %d red:%d last:%d L:%f\n            // sum:%d\\n\",grid%xGrid, grid/xGrid, h_edges[grid].usage.load(),\n            // h_edges[grid].red, h_edges[grid].last_usage, L ,\n            // h_edges[grid].usage.load() + h_edges[grid].red +\n            // (int)(L*h_edges[grid].last_usage));\n            if ((preY == curY) || (curr_d1 == 0)) {\n              tmp = curr_d1 +\n                    h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                (int)(L * h_edges[grid].last_usage)];\n            } else {\n              if (curX > regionX1 + 1) {\n                tmp_grid = curY * (xGrid - 1) + curX - 1;\n                tmp_cost = d1[curY][curX - 1] +\n                           h_costTable[h_edges[tmp_grid].usage +\n                                       h_edges[tmp_grid].red +\n                                       (int)(L * h_edges[tmp_grid].last_usage)];\n\n                if (tmp_cost < curr_d1 + VIA) {\n                  // hyperH[curY][curX] = TRUE;\n                  tmpH = true;\n                }\n              }\n              tmp = curr_d1 + VIA +\n                    h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                (int)(L * h_edges[grid].last_usage)];\n            }\n            tmpX = curX + 1; // the right neighbor\n\n            if (d1[curY][tmpX] > tmp && tmp < return_dist) {\n              ctx.push(\n                  Concurrent_pq_grid(&(d1[curY][tmpX]), tmp, concurrentID));\n              nodelock[curY * xGrid + curX + 1].lock();\n              if (d1[curY][tmpX] > tmp &&\n                  tmp < return_dist) // right neighbor been put into heap1 but\n                                     // needs update\n              {\n                d1[curY][tmpX]       = tmp;\n         
       parentX3[curY][tmpX] = curX;\n                parentY3[curY][tmpX] = curY;\n                HV[curY][tmpX]       = FALSE;\n              }\n              nodelock[curY * xGrid + curX + 1].unlock();\n            }\n          }\n          // data[curY*(xGrid-1)+curX].lock();\n          hyperH[curY][curX] = tmpH;\n\n          // data[curY*(xGrid-1)+curX].unlock();\n\n          // bottom\n\n          // if(curY>regionY1)\n          //   data[(curY-1)*xGrid+curX].lock();\n\n          // data[curY*(xGrid-1)+curX].lock();\n\n          if (curY > regionY1) {\n            grid = (curY - 1) * xGrid + curX;\n            // printf(\"grid: %d %d usage: %d red:%d last:%d sum%f %d\\n\",\n            //    grid%xGrid, grid/xGrid, v_edges[grid].usage.load(),\n            //    v_edges[grid].red, v_edges[grid].last_usage, L ,\n            //    v_edges[grid].usage.load() + v_edges[grid].red +\n            //    (int)(L*v_edges[grid].last_usage));\n            if ((preX == curX) || (curr_d1 == 0)) {\n              tmp = curr_d1 +\n                    v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                (int)(L * v_edges[grid].last_usage)];\n            } else {\n              if (curY < regionY2 - 1) {\n                tmp_grid = curY * xGrid + curX;\n                tmp_cost = d1[curY + 1][curX] +\n                           v_costTable[v_edges[tmp_grid].usage +\n                                       v_edges[tmp_grid].red +\n                                       (int)(L * v_edges[tmp_grid].last_usage)];\n\n                if (tmp_cost < curr_d1 + VIA) {\n                  // hyperV[curY][curX] = TRUE;\n                  tmpV = true;\n                }\n              }\n              tmp = curr_d1 + VIA +\n                    v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                (int)(L * v_edges[grid].last_usage)];\n            }\n            tmpY = curY - 1; // the bottom neighbor\n\n            
/*if(d1[tmpY][curX]>=BIG_INT) // bottom neighbor not been put into\n            heap1\n            {\n                d1[tmpY][curX] = tmp;\n                parentX1[tmpY][curX] = curX;\n                parentY1[tmpY][curX] = curY;\n                HV[tmpY][curX] = TRUE;\n                pq1.push(&(d1[tmpY][curX]));\n\n            }\n            else */\n            // galois::runtime::acquire(&data[tmpY * yGrid + curX],\n            // galois::MethodFlag::WRITE);\n            if (d1[tmpY][curX] > tmp && tmp < return_dist) {\n              ctx.push(\n                  concurrent_pq_grid(&(d1[tmpY][curX]), tmp, concurrentID));\n\n              nodelock[(curY - 1) * xGrid + curX].lock();\n              if (d1[tmpY][curX] > tmp &&\n                  tmp < return_dist) // bottom neighbor been put into heap1 but\n                                     // needs update\n              {\n                d1[tmpY][curX]       = tmp;\n                parentX1[tmpY][curX] = curX;\n                parentY1[tmpY][curX] = curY;\n                HV[tmpY][curX]       = TRUE;\n              }\n              nodelock[(curY - 1) * xGrid + curX].unlock();\n            }\n          }\n          // top\n          if (curY < regionY2) {\n\n            grid = curY * xGrid + curX;\n            // printf(\"grid: %d %d usage: %d red:%d last:%d sum%f %d\\n\",\n            //    grid%xGrid, grid/xGrid, v_edges[grid].usage.load(),\n            //    v_edges[grid].red, v_edges[grid].last_usage, L ,\n            //    v_edges[grid].usage.load() + v_edges[grid].red +\n            //    (int)(L*v_edges[grid].last_usage));\n            if ((preX == curX) || (curr_d1 == 0)) {\n              tmp = curr_d1 +\n                    v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                (int)(L * v_edges[grid].last_usage)];\n            } else {\n              if (curY > regionY1 + 1) {\n                tmp_grid = (curY - 1) * xGrid + curX;\n                tmp_cost = d1[curY - 
1][curX] +\n                           v_costTable[v_edges[tmp_grid].usage +\n                                       v_edges[tmp_grid].red +\n                                       (int)(L * v_edges[tmp_grid].last_usage)];\n\n                if (tmp_cost < curr_d1 + VIA) {\n                  // hyperV[curY][curX] = TRUE;\n                  tmpV = true;\n                }\n              }\n              tmp = curr_d1 + VIA +\n                    v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                (int)(L * v_edges[grid].last_usage)];\n            }\n            tmpY = curY + 1; // the top neighbor\n\n            /*if(d1[tmpY][curX]>=BIG_INT) // top neighbor not been put into\n            heap1\n            {\n                d1[tmpY][curX] = tmp;\n                parentX1[tmpY][curX] = curX;\n                parentY1[tmpY][curX] = curY;\n                HV[tmpY][curX] = TRUE;\n                pq1.push(&(d1[tmpY][curX]));\n            }\n            else*/\n            // galois::runtime::acquire(&data[tmpY * yGrid + curX],\n            // galois::MethodFlag::WRITE);\n            if (d1[tmpY][curX] > tmp && tmp < return_dist) {\n              ctx.push(\n                  Concurrent_pq_grid(&(d1[tmpY][curX]), tmp, concurrentID));\n              nodelock[(curY + 1) * xGrid + curX].lock();\n              if (d1[tmpY][curX] > tmp &&\n                  tmp < return_dist) // top neighbor been put into heap1 but\n                                     // needs update\n              {\n\n                d1[tmpY][curX]       = tmp;\n                parentX1[tmpY][curX] = curX;\n                parentY1[tmpY][curX] = curY;\n                HV[tmpY][curX]       = TRUE;\n\n                // printf(\"top push Y: %d X: %d tmp: %f HV: false hyperH: %d\\n\",\n                // tmpY, curX, tmp, true); pq1.push({&(d1[tmpY][curX]), tmp});\n              }\n              nodelock[(curY + 1) * xGrid + curX].unlock();\n            }\n          }\n        
  hyperV[curY][curX] = tmpV;\n          // data[curY*xGrid+curX].unlock();\n        }\n      },\n      // galois::wl<galois::worklists::ParaMeter<>>(),\n      // galois::wl<PSChunk>(),\n      galois::wl<OBIM_concurrent>(RequestIndexerConcurrent),\n      // galois::chunk_size<MAZE_CHUNK_SIZE>()\n      // galois::parallel_break(),\n      // galois::steal(),\n      galois::loopname(\"fine_grain\"));\n}\n\nvoid mazeRouteMSMD_finegrain_concurrent(int iter, int expand, float costHeight,\n                                        int ripup_threshold,\n                                        int mazeedge_Threshold, Bool Ordering,\n                                        int cost_type, int nConcurrentNets) {\n  // LOCK = 0;\n  galois::StatTimer timer_finegrain(\"fine grain function\",\n                                    \"fine grain concurrent maze\");\n\n  float forange;\n  // allocate memory for distance and parent and pop_heap\n  h_costTable = (float*)calloc(40 * hCapacity, sizeof(float));\n  v_costTable = (float*)calloc(40 * vCapacity, sizeof(float));\n\n  forange = 40 * hCapacity;\n\n  if (cost_type == 2) {\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity - 1)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity - 1)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - vCapacity);\n    }\n  } else {\n\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity)\n        h_costTable[i] =\n            
costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - vCapacity);\n    }\n  }\n\n  /*forange = yGrid*xGrid;\n  for(i=0; i<forange; i++)\n  {\n      pop_heap2[i] = FALSE;\n  } //Michael*/\n\n  galois::LargeArray<galois::substrate::SimpleLock> data;\n  data.allocateInterleaved(xGrid * yGrid);\n\n  if (Ordering) {\n    StNetOrder();\n    // printf(\"order?\\n\");\n  }\n\n  CONCURRENT_NET_STORAGE* concurrent_net_storage =\n      new CONCURRENT_NET_STORAGE[nConcurrentNets];\n  // for(nidRPC=0; nidRPC<numValidNets; nidRPC++)//parallelize\n  PerThread_PQ perthread_pq;\n  PerThread_Vec perthread_vec;\n  PRINT = 0;\n  galois::GAccumulator<int> total_ripups;\n  galois::GReduceMax<int> max_ripups;\n  total_ripups.reset();\n  max_ripups.reset();\n\n  // galois::runtime::profileVtune( [&] (void) {\n  /*std::random_device rd;\n  std::mt19937 g(rd());\n  std::shuffle(net_shuffle.begin(), net_shuffle.end(), g);\n\n  galois::do_all(galois::iterate(net_shuffle), */\n  // galois::for_each(galois::iterate(0, numValidNets),\n  //        [&] (const auto nidRPC, auto& ctx)\n  galois::StatTimer timer_newripupcheck(\"ripup\", \"fine grain concurrent maze\");\n  galois::StatTimer timer_setupheap(\"setup heap\", \"fine grain concurrent maze\");\n  galois::StatTimer timer_traceback(\"trace back\", \"fine grain concurrent maze\");\n  galois::StatTimer timer_adjusttree(\"adjust tree\",\n                                     \"fine grain concurrent 
maze\");\n  galois::StatTimer timer_updateusage(\"update usage\",\n                                      \"fine grain concurrent maze\");\n  galois::StatTimer timer_checkroute2dtree(\"checkroute2dtree\",\n                                           \"fine grain concurrent maze\");\n  galois::StatTimer timer_init(\"init\", \"fine grain concurrent maze\");\n  galois::StatTimer timer_foreach(\"foreach\", \"fine grain concurrent maze\");\n  galois::StatTimer timer_init_int(\"big int initialize\",\n                                   \"fine grain concurrent maze\");\n\n  int concurrentNet_cnt = 0;\n  int concurrentID2netID[nConcurrentNets];\n  int concurrentID2edgeID[nConcurrentNets];\n\n  for (int nidRPC = 0; nidRPC < numValidNets; nidRPC++) {\n\n    int l, netID;\n    float total_usage;\n    float overflow;\n\n    // maze routing for multi-source, multi-destination\n    Bool preD, hypered, enter, shifted;\n    int i, j, k, deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin,\n        xmax, curX, curY, crossX, crossY, tmpi, min_x, min_y, num_edges;\n    int segWidth, segHeight;\n    int tmpind, gridsX[XRANGE], gridsY[YRANGE], tmp_gridsX[XRANGE],\n        tmp_gridsY[YRANGE];\n    int endpt1, endpt2, A1, A2, B1, B2, C1, C2, D1, D2, cnt, cnt_n1n2;\n    int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n        edge_C1C2;\n    int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2;\n    int E1x, E1y, E2x, E2y;\n    int tmp_of;\n    int origENG, edgeREC;\n\n    float costL1, costL2, *dtmp;\n    TreeEdge *treeedges, *treeedge, *cureedge;\n    TreeNode* treenodes;\n\n    bool* pop_heap2 = concurrent_net_storage[concurrentNet_cnt].pop_heap2;\n\n    float** d1    = concurrent_net_storage[concurrentNet_cnt].d1_p;\n    bool** HV     = concurrent_net_storage[concurrentNet_cnt].HV_p;\n    bool** hyperV = concurrent_net_storage[concurrentNet_cnt].hyperV_p;\n    bool** hyperH = concurrent_net_storage[concurrentNet_cnt].hyperH_p;\n\n    short** 
parentX1 = concurrent_net_storage[concurrentNet_cnt].parentX1_p;\n    short** parentX3 = concurrent_net_storage[concurrentNet_cnt].parentX3_p;\n    short** parentY1 = concurrent_net_storage[concurrentNet_cnt].parentY1_p;\n    short** parentY3 = concurrent_net_storage[concurrentNet_cnt].parentY3_p;\n\n    int** corrEdge = concurrent_net_storage[concurrentNet_cnt].corrEdge_p;\n\n    OrderNetEdge* netEO = concurrent_net_storage[concurrentNet_cnt].netEO_p;\n\n    bool** inRegion = concurrent_net_storage[concurrentNet_cnt].inRegion_p;\n    bool* inRegion_alloc =\n        concurrent_net_storage[concurrentNet_cnt].inRegion_alloc;\n\n    galois::LargeArray<galois::substrate::SimpleLock>& nodelock =\n        concurrent_net_storage[concurrentNet_cnt].nodelock;\n\n    galois::InsertBag<pq_grid> pq1;\n    std::vector<int>& v2 = concurrent_net_storage[concurrentNet_cnt].v2;\n\n    int& regionX1     = concurrent_net_storage[concurrentNet_cnt].regionX1;\n    int& regionX2     = concurrent_net_storage[concurrentNet_cnt].regionX2;\n    int& regionY1     = concurrent_net_storage[concurrentNet_cnt].regionY1;\n    int& regionY2     = concurrent_net_storage[concurrentNet_cnt].regionY2;\n    int& regionWidth  = concurrent_net_storage[concurrentNet_cnt].regionWidth;\n    int& regionHeight = concurrent_net_storage[concurrentNet_cnt].regionHeight;\n\n    if (Ordering) {\n      netID = treeOrderCong[nidRPC].treeIndex;\n    } else {\n      netID = nidRPC;\n    }\n\n    deg = sttrees[netID].deg;\n\n    origENG = expand;\n\n    netedgeOrderDec(netID, netEO);\n\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    // loop for all the tree edges (2*deg-3)\n    num_edges = 2 * deg - 3;\n\n    for (edgeREC = 0; edgeREC < num_edges; edgeREC++) {\n\n      edgeID   = netEO[edgeREC].edgeID;\n      treeedge = &(treeedges[edgeID]);\n\n      n1            = treeedge->n1;\n      n2            = treeedge->n2;\n      n1x           = treenodes[n1].x;\n      n1y           = 
treenodes[n1].y;\n      n2x           = treenodes[n2].x;\n      n2y           = treenodes[n2].y;\n      treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n      if (treeedge->len >\n          mazeedge_Threshold) // only route the non-degraded edges (len>0)\n      {\n        timer_newripupcheck.start();\n        enter = newRipupCheck(treeedge, n1x, n1y, n2x, n2y, ripup_threshold,\n                              netID, edgeID);\n        timer_newripupcheck.stop();\n\n        // ripup the routing for the edge\n        timer_finegrain.start();\n        if (enter) {\n          concurrentID2netID[concurrentNet_cnt]  = netID;\n          concurrentID2edgeID[concurrentNet_cnt] = edgeID;\n\n          timer_init.start();\n          if (n1y <= n2y) {\n            ymin = n1y;\n            ymax = n2y;\n          } else {\n            ymin = n2y;\n            ymax = n1y;\n          }\n\n          if (n1x <= n2x) {\n            xmin = n1x;\n            xmax = n2x;\n          } else {\n            xmin = n2x;\n            xmax = n1x;\n          }\n\n          shifted     = FALSE;\n          int enlarge = min(\n              origENG, (iter / 6 + 3) *\n                           treeedge->route\n                               .routelen); // michael, this was global variable\n          segWidth     = xmax - xmin;\n          segHeight    = ymax - ymin;\n          regionX1     = max(0, xmin - enlarge);\n          regionX2     = min(xGrid - 1, xmax + enlarge);\n          regionY1     = max(0, ymin - enlarge);\n          regionY2     = min(yGrid - 1, ymax + enlarge);\n          regionWidth  = regionX2 - regionX1 + 1;\n          regionHeight = regionY2 - regionY1 + 1;\n\n          timer_init_int.start();\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              d1[i][j] = BIG_INT;\n\n              /*d2[i][j] = BIG_INT;\n              hyperH[i][j] = FALSE;\n              hyperV[i][j] = FALSE;*/\n            }\n          }\n    
      timer_init_int.stop();\n          // memset(hyperH, 0, xGrid * yGrid * sizeof(bool));\n          // memset(hyperV, 0, xGrid * yGrid * sizeof(bool));\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              HV[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              hyperH[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              hyperV[i][j] = FALSE;\n            }\n          }\n          // TODO: use seperate loops\n\n          // setup heap1, heap2 and initialize d1[][] and d2[][] for all the\n          // grids on the two subtrees\n          timer_setupheap.start();\n          setupHeap_nopq1clear(netID, edgeID, pq1, v2, regionX1, regionX2,\n                               regionY1, regionY2, d1, corrEdge, inRegion,\n                               concurrentNet_cnt);\n          timer_setupheap.stop();\n          // TODO: use std priority queue\n          // while loop to find shortest path\n          /*ind1 = (pq1.top().d1_p - &d1[0][0]);\n          curX = ind1%xGrid;\n          curY = ind1/xGrid;\n          printf(\"src size: %d dst size: %d\\n\", pq1.size(), v2.size());*/\n          for (auto ii = v2.begin(); ii != v2.end(); ii++) {\n            pop_heap2[*ii] = TRUE;\n          }\n\n          std::atomic<int>& return_ind1 =\n              concurrent_net_storage[concurrentNet_cnt]->return_ind1;\n          return_ind1 = 0;\n          std::atomic<float>& return_dist =\n              concurrent_net_storage[concurrentNet_cnt]->return_dist;\n          return_dist = (float)BIG_INT;\n\n          timer_init.stop();\n          timer_foreach.start();\n\n          concurrent_maze(pq1, concurrent_net_storage);\n\n          timer_foreach.stop();\n\n          trace_back(concurrent_net_storage, 
concurrentID2netID,\n                     concurrentID2edgeID, concurrentNet_cnt);\n\n        } // congested route, if(enter)\n        timer_finegrain.stop();\n      } // only route the non-degraded edges (len>0)\n    }   // iterate on edges of a net\n  }\n\n  printf(\"total ripups: %d max ripups: %d\\n\", total_ripups.reduce(),\n         max_ripups.reduce());\n  //}, \"mazeroute vtune function\");\n  free(h_costTable);\n  free(v_costTable);\n\n  thread_local_storage->clear();\n  delete thread_local_storage;\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/maze_finegrain_lateupdate.h",
    "content": "\nstruct LateUpdateStorage {\n  using LAptr = galois::substrate::LAptr;\n  LAptr pop_heap2_LA;\n  bool* pop_heap2;\n\n  LAptr d1_p_LA, d1_alloc_LA;\n  std::atomic<float>** d1_p;\n  std::atomic<float>* d1_alloc;\n\n  LAptr HV_p_LA, HV_alloc_LA, hyperV_p_LA, hyperV_alloc_LA, hyperH_p_LA,\n      hyperH_alloc_LA;\n  bool **HV_p, **hyperV_p, **hyperH_p;\n  bool *HV_alloc, *hyperV_alloc, *hyperH_alloc;\n\n  LAptr parentX1_p_LA, parentX1_alloc_LA, parentY1_p_LA, parentY1_alloc_LA,\n      parentX3_p_LA, parentX3_alloc_LA, parentY3_p_LA, parentY3_alloc_LA;\n  short **parentX1_p, **parentY1_p, **parentX3_p, **parentY3_p;\n  short *parentX1_alloc, *parentY1_alloc, *parentX3_alloc, *parentY3_alloc;\n\n  LAptr corrEdge_p_LA, corrEdge_alloc_LA;\n  int** corrEdge_p;\n  int* corrEdge_alloc;\n\n  LAptr inRegion_p_LA, inRegion_alloc_LA;\n  bool** inRegion_p;\n  bool* inRegion_alloc;\n\n  LAptr netEO_p_LA;\n  OrderNetEdge* netEO_p;\n\n  // maze_pq pq1;\n  // std::vector<float*> v2;\n  LateUpdateStorage() {\n    using namespace galois::substrate;\n\n    if (NET_PARALLEL) {\n      pop_heap2_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      pop_heap2    = reinterpret_cast<bool*>(pop_heap2_LA.get());\n\n      d1_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(atomic<float>));\n      d1_alloc    = reinterpret_cast<atomic<float>*>(d1_alloc_LA.get());\n      d1_p_LA     = largeMallocLocal(yGrid * sizeof(atomic<float>*));\n      d1_p        = reinterpret_cast<atomic<float>**>(d1_p_LA.get());\n\n      HV_alloc_LA     = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      HV_alloc        = reinterpret_cast<bool*>(HV_alloc_LA.get());\n      hyperV_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      hyperV_alloc    = reinterpret_cast<bool*>(hyperV_alloc_LA.get());\n      hyperH_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      hyperH_alloc    = reinterpret_cast<bool*>(hyperH_alloc_LA.get());\n\n      HV_p_LA     = 
largeMallocLocal(yGrid * sizeof(bool*));\n      HV_p        = reinterpret_cast<bool**>(HV_p_LA.get());\n      hyperV_p_LA = largeMallocLocal(yGrid * sizeof(bool*));\n      hyperV_p    = reinterpret_cast<bool**>(hyperV_p_LA.get());\n      hyperH_p_LA = largeMallocLocal(yGrid * sizeof(bool*));\n      hyperH_p    = reinterpret_cast<bool**>(hyperH_p_LA.get());\n\n      parentX1_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentX1_alloc    = reinterpret_cast<short*>(parentX1_alloc_LA.get());\n      parentX3_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentX3_alloc    = reinterpret_cast<short*>(parentX3_alloc_LA.get());\n      parentY1_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentY1_alloc    = reinterpret_cast<short*>(parentY1_alloc_LA.get());\n      parentY3_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(short));\n      parentY3_alloc    = reinterpret_cast<short*>(parentY3_alloc_LA.get());\n\n      parentX1_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentX1_p    = reinterpret_cast<short**>(parentX1_p_LA.get());\n      parentX3_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentX3_p    = reinterpret_cast<short**>(parentX3_p_LA.get());\n      parentY1_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentY1_p    = reinterpret_cast<short**>(parentY1_p_LA.get());\n      parentY3_p_LA = largeMallocLocal(yGrid * sizeof(short*));\n      parentY3_p    = reinterpret_cast<short**>(parentY3_p_LA.get());\n\n      corrEdge_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(int));\n      corrEdge_alloc    = reinterpret_cast<int*>(corrEdge_alloc_LA.get());\n      corrEdge_p_LA     = largeMallocLocal(yGrid * sizeof(int*));\n      corrEdge_p        = reinterpret_cast<int**>(corrEdge_p_LA.get());\n\n      inRegion_alloc_LA = largeMallocLocal(yGrid * xGrid * sizeof(bool));\n      inRegion_alloc    = reinterpret_cast<bool*>(inRegion_alloc_LA.get());\n      inRegion_p_LA     = 
largeMallocLocal(yGrid * sizeof(bool*));\n      inRegion_p        = reinterpret_cast<bool**>(inRegion_p_LA.get());\n\n      netEO_p_LA = largeMallocLocal(2000 * sizeof(OrderNetEdge));\n      netEO_p    = reinterpret_cast<OrderNetEdge*>(netEO_p_LA.get());\n    } else {\n      pop_heap2 = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n\n      d1_alloc = (std::atomic<float>*)calloc(yGrid * xGrid,\n                                             sizeof(std::atomic<float>));\n      d1_p = (std::atomic<float>**)calloc(yGrid, sizeof(std::atomic<float>*));\n\n      HV_alloc     = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      hyperV_alloc = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      hyperH_alloc = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      HV_p         = (bool**)calloc(yGrid, sizeof(bool*));\n      hyperV_p     = (bool**)calloc(yGrid, sizeof(bool*));\n      hyperH_p     = (bool**)calloc(yGrid, sizeof(bool*));\n\n      parentX1_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentX3_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentY1_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentY3_alloc = (short*)calloc(yGrid * xGrid, sizeof(short));\n      parentX1_p     = (short**)calloc(yGrid, sizeof(short*));\n      parentX3_p     = (short**)calloc(yGrid, sizeof(short*));\n      parentY1_p     = (short**)calloc(yGrid, sizeof(short*));\n      parentY3_p     = (short**)calloc(yGrid, sizeof(short*));\n\n      corrEdge_alloc = (int*)calloc(yGrid * xGrid, sizeof(int));\n      corrEdge_p     = (int**)calloc(yGrid, sizeof(int*));\n\n      inRegion_alloc = (bool*)calloc(yGrid * xGrid, sizeof(bool));\n      inRegion_p     = (bool**)calloc(yGrid, sizeof(bool*));\n\n      netEO_p = (OrderNetEdge*)calloc(2000, sizeof(OrderNetEdge));\n    }\n    // printf(\"allocation success\\n\");\n    for (int i = 0; i < yGrid; i++) {\n      d1_p[i] = &(d1_alloc[i * xGrid]);\n\n      HV_p[i]     = &(HV_alloc[i * xGrid]);\n      hyperV_p[i] = 
&(hyperV_alloc[i * xGrid]);\n      hyperH_p[i] = &(hyperH_alloc[i * xGrid]);\n\n      corrEdge_p[i] = &(corrEdge_alloc[i * xGrid]);\n\n      inRegion_p[i] = &(inRegion_alloc[i * xGrid]);\n    }\n\n    for (int i = 0; i < yGrid; i++) {\n      parentX1_p[i] = &(parentX1_alloc[i * xGrid]);\n      parentX3_p[i] = &(parentX3_alloc[i * xGrid]);\n      parentY1_p[i] = &(parentY1_alloc[i * xGrid]);\n      parentY3_p[i] = &(parentY3_alloc[i * xGrid]);\n    }\n  }\n  void reset_heap() { memset(pop_heap2, 0, yGrid * xGrid * sizeof(bool)); }\n  void clear() {\n    if (!NET_PARALLEL) {\n      free(pop_heap2);\n\n      free(d1_p);\n      free(d1_alloc);\n\n      free(HV_p);\n      free(hyperV_p);\n      free(hyperH_p);\n      free(HV_alloc);\n      free(hyperV_alloc);\n      free(hyperH_alloc);\n\n      free(parentX1_p);\n      free(parentY1_p);\n      free(parentX3_p);\n      free(parentY3_p);\n\n      free(parentX1_alloc);\n      free(parentY1_alloc);\n      free(parentX3_alloc);\n      free(parentY3_alloc);\n\n      free(corrEdge_alloc);\n      free(corrEdge_p);\n\n      free(netEO_p);\n    } else {\n      /*delete [] pop_heap2;\n      delete [] heap1;\n      delete [] heap2;\n\n      delete [] d1_p;\n      delete [] d1_alloc;\n\n      delete [] HV_p;\n      delete [] hyperV_p;\n      delete [] hyperH_p;\n      delete [] HV_alloc;\n      delete [] hyperV_alloc;\n      delete [] hyperH_alloc;\n\n      delete [] parentX1_p;\n      delete [] parentY1_p;\n      delete [] parentX3_p;\n      delete [] parentY3_p;\n\n      delete [] parentX1_alloc;\n      delete [] parentY1_alloc;\n      delete [] parentX3_alloc;\n      delete [] parentY3_alloc;\n\n      delete [] corrEdge_alloc;\n      delete [] corrEdge_p;\n\n      delete [] netEO_p;*/\n    }\n  }\n};\n\n// ripup a tree edge according to its ripup type and Z-route it\n// put all the nodes in the subtree t1 and t2 into heap1 and heap2\n// netID   - the ID for the net\n// edgeID  - the ID for the tree edge to route\n// d1      - the 
distance of any grid from the source subtree t1\n// d2      - the distance of any grid from the destination subtree t2\n// heap1   - the heap storing the addresses for d1[][]\n// heap2   - the heap storing the addresses for d2[][]\nvoid setupHeapLate(int netID, int edgeID, galois::InsertBag<lateUpdateReq>& pq1,\n                   std::vector<int>& v2, int regionX1, int regionX2,\n                   int regionY1, int regionY2, std::atomic<float>** d1,\n                   int** corrEdge, bool** inRegion) {\n  int i, j, d, numNodes, n1, n2, x1, y1, x2, y2;\n  int nbr, nbrX, nbrY, cur, edge;\n  int x_grid, y_grid;\n  int queuehead, queuetail, *queue;\n  Bool* visited;\n  TreeEdge* treeedges;\n  TreeNode* treenodes;\n  Route* route;\n\n  for (i = regionY1; i <= regionY2; i++) {\n    for (j = regionX1; j <= regionX2; j++)\n      inRegion[i][j] = TRUE;\n  }\n\n  treeedges = sttrees[netID].edges;\n  treenodes = sttrees[netID].nodes;\n  d         = sttrees[netID].deg;\n\n  n1 = treeedges[edgeID].n1;\n  n2 = treeedges[edgeID].n2;\n  x1 = treenodes[n1].x;\n  y1 = treenodes[n1].y;\n  x2 = treenodes[n2].x;\n  y2 = treenodes[n2].y;\n\n  // if(netID == 14628)\n  //    printf(\"net: %d edge: %d src: %d %d dst: %d %d d: %d\\n\", netID, edgeID,\n  //    y1, x1, y2, x2, d);\n  pq1.clear();\n  v2.clear(); // Michael\n  if (d == 2) // 2-pin net\n  {\n    d1[y1][x1] = 0;\n    pq1.push(lateUpdateReq(&(d1[y1][x1]), 0, 0, 0, false));\n    v2.push_back(y2 * xGrid + x2);\n  } else // net with more than 2 pins\n  {\n    numNodes = 2 * d - 2;\n\n    visited = (Bool*)calloc(numNodes, sizeof(Bool));\n    for (i = 0; i < numNodes; i++)\n      visited[i] = FALSE;\n\n    queue = (int*)calloc(numNodes, sizeof(int));\n\n    // find all the grids on tree edges in subtree t1 (connecting to n1) and put\n    // them into heap1\n    if (n1 < d) // n1 is a Pin node\n    {\n      // just need to put n1 itself into heap1\n      d1[y1][x1] = 0;\n      pq1.push(lateUpdateReq(&(d1[y1][x1]), 0, 0, 0, false));\n 
     visited[n1] = TRUE;\n    } else // n1 is a Steiner node\n    {\n      queuehead = queuetail = 0;\n\n      // add n1 into heap1\n      d1[y1][x1] = 0;\n      // if(netID == 252163 && edgeID == 51)\n      //    printf(\"y: %d x: %d\\n\", y1, x1);\n      pq1.push(lateUpdateReq(&(d1[y1][x1]), 0, 0, 0, false));\n      visited[n1] = TRUE;\n\n      // add n1 into the queue\n      queue[queuetail] = n1;\n      queuetail++;\n\n      // loop to find all the edges in subtree t1\n      while (queuetail > queuehead) {\n        // get cur node from the queuehead\n        cur = queue[queuehead];\n        queuehead++;\n        visited[cur] = TRUE;\n        if (cur >= d) // cur node is a Steiner node\n        {\n          for (i = 0; i < 3; i++) {\n            nbr  = treenodes[cur].nbr[i];\n            edge = treenodes[cur].edge[i];\n            if (nbr != n2) // not n2\n            {\n              if (visited[nbr] == FALSE) {\n                // put all the grids on the two adjacent tree edges into heap1\n                if (treeedges[edge].route.routelen > 0) // not a degraded edge\n                {\n                  // put nbr into heap1 if in enlarged region\n                  if (inRegion[treenodes[nbr].y][treenodes[nbr].x]) {\n                    nbrX           = treenodes[nbr].x;\n                    nbrY           = treenodes[nbr].y;\n                    d1[nbrY][nbrX] = 0;\n                    // if(netID == 252163 && edgeID == 51)\n                    //    printf(\"y: %d x: %d\\n\", nbrY, nbrX);\n                    pq1.push(lateUpdateReq(&(d1[nbrY][nbrX]), 0, 0, 0, false));\n                    corrEdge[nbrY][nbrX] = edge;\n                  }\n\n                  // the coordinates of two end nodes of the edge\n\n                  route = &(treeedges[edge].route);\n                  if (route->type == MAZEROUTE) {\n                    for (j = 1; j < route->routelen;\n                         j++) // don't put edge_n1 and edge_n2 into heap1\n                    
{\n                      x_grid = route->gridsX[j];\n                      y_grid = route->gridsY[j];\n\n                      if (inRegion[y_grid][x_grid]) {\n                        d1[y_grid][x_grid] = 0;\n                        // if(netID == 252163 && edgeID == 51)\n                        //    printf(\"y: %d x: %d\\n\", y_grid, x_grid);\n                        pq1.push(lateUpdateReq(&(d1[y_grid][x_grid]), 0, 0, 0,\n                                               false));\n                        corrEdge[y_grid][x_grid] = edge;\n                      }\n                    }\n                  } // if MAZEROUTE\n                  else {\n                    printf(\"Setup Heap: not maze routing\\n\");\n                  }\n                } // if not a degraded edge (len>0)\n\n                // add the neighbor of cur node into queue\n                queue[queuetail] = nbr;\n                queuetail++;\n              } // if the node is not visited\n            }   // if nbr!=n2\n          }     // loop i (3 neighbors for cur node)\n        }       // if cur node is a Steiner node\n      }         // while queue is not empty\n    }           // else n1 is not a Pin node\n\n    // find all the grids on subtree t2 (connect to n2) and put them into heap2\n    // find all the grids on tree edges in subtree t2 (connecting to n2) and put\n    // them into heap2\n    if (n2 < d) // n2 is a Pin node\n    {\n      // just need to put n2 itself into heap2\n      v2.push_back(y2 * xGrid + x2);\n      // if(netID == 14628)\n      //    printf(\"y: %d x: %d \\n\", y2, x2);\n      visited[n2] = TRUE;\n    } else // n2 is a Steiner node\n    {\n      queuehead = queuetail = 0;\n\n      // add n2 into heap2\n      v2.push_back(y2 * xGrid + x2);\n      // if(netID == 252163 && edgeID == 51)\n      //    printf(\"dst y: %d x: %d \\n\", y2, x2);\n      visited[n2] = TRUE;\n\n      // add n2 into the queue\n      queue[queuetail] = n2;\n      queuetail++;\n\n      // loop to 
find all the edges in subtree t2\n      while (queuetail > queuehead) {\n        // get cur node from queuehead\n        cur          = queue[queuehead];\n        visited[cur] = TRUE;\n        queuehead++;\n\n        if (cur >= d) // cur node is a Steiner node\n        {\n          for (i = 0; i < 3; i++) {\n            nbr  = treenodes[cur].nbr[i];\n            edge = treenodes[cur].edge[i];\n            if (nbr != n1) // not n1\n            {\n              if (visited[nbr] == FALSE) {\n                // put all the grids on the two adjacent tree edges into heap2\n                if (treeedges[edge].route.routelen > 0) // not a degraded edge\n                {\n                  // put nbr into heap2\n                  if (inRegion[treenodes[nbr].y][treenodes[nbr].x]) {\n                    nbrX = treenodes[nbr].x;\n                    nbrY = treenodes[nbr].y;\n                    v2.push_back(nbrY * xGrid + nbrX);\n                    // if(netID == 252163 && edgeID == 51)\n                    //    printf(\"dst y: %d x: %d\\n\", nbrY, nbrX);\n                    corrEdge[nbrY][nbrX] = edge;\n                  }\n\n                  // the coordinates of two end nodes of the edge\n\n                  route = &(treeedges[edge].route);\n                  if (route->type == MAZEROUTE) {\n                    for (j = 1; j < route->routelen;\n                         j++) // don't put edge_n1 and edge_n2 into heap2\n                    {\n                      x_grid = route->gridsX[j];\n                      y_grid = route->gridsY[j];\n                      if (inRegion[y_grid][x_grid]) {\n                        v2.push_back(y_grid * xGrid + x_grid);\n                        // if(netID == 252163 && edgeID == 51)\n                        //    printf(\"dst y: %d x: %d\\n\", y_grid, x_grid);\n                        corrEdge[y_grid][x_grid] = edge;\n                      }\n                    }\n                  } // if MAZEROUTE\n                  else {\n       
             printf(\"Setup Heap: not maze routing\\n\");\n                  }\n                } // if the edge is not degraded (len>0)\n\n                // add the neighbor of cur node into queue\n                queue[queuetail] = nbr;\n                queuetail++;\n              } // if the node is not visited\n            }   // if nbr!=n1\n          }     // loop i (3 neigbors for cur node)\n        }       // if cur node is a Steiner nodes\n      }         // while queue is not empty\n    }           // else n2 is not a Pin node\n\n    free(queue);\n    free(visited);\n  } // net with more than two pins\n\n  for (i = regionY1; i <= regionY2; i++) {\n    for (j = regionX1; j <= regionX2; j++)\n      inRegion[i][j] = FALSE;\n  }\n}\n\nvoid mazeRouteMSMD_finegrain_lateupdate(int iter, int expand, float costHeight,\n                                        int ripup_threshold,\n                                        int mazeedge_Threshold, Bool Ordering,\n                                        int cost_type) {\n  // LOCK = 0;\n  galois::StatTimer timer_finegrain(\"fine grain function\", \"fine grain maze\");\n\n  float forange;\n  // allocate memory for distance and parent and pop_heap\n  h_costTable = (float*)calloc(40 * hCapacity, sizeof(float));\n  v_costTable = (float*)calloc(40 * vCapacity, sizeof(float));\n\n  forange = 40 * hCapacity;\n\n  if (cost_type == 2) {\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity - 1)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity - 1)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        
v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - vCapacity);\n    }\n  } else {\n\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1 +\n            (float)costHeight / slope * (i - vCapacity);\n    }\n  }\n\n  /*forange = yGrid*xGrid;\n  for(i=0; i<forange; i++)\n  {\n      pop_heap2[i] = FALSE;\n  } //Michael*/\n\n  int* d1_edgeID = new int[yGrid * xGrid];\n  int* d1_netID  = new int[yGrid * xGrid];\n  for (int i = 0; i < yGrid * xGrid; i++) {\n    d1_edgeID[i] = 0;\n    d1_netID[i]  = 0;\n  }\n\n  galois::LargeArray<galois::substrate::SimpleLock> data;\n  data.allocateInterleaved(xGrid * yGrid);\n\n  if (Ordering) {\n    StNetOrder();\n    // printf(\"order?\\n\");\n  }\n\n  LateUpdateStorage* thread_local_storage = new LateUpdateStorage;\n  // for(nidRPC=0; nidRPC<numValidNets; nidRPC++)//parallelize\n  PRINT = 0;\n  galois::GAccumulator<int> total_ripups;\n  galois::GReduceMax<int> max_ripups;\n  total_ripups.reset();\n  max_ripups.reset();\n\n  // galois::runtime::profileVtune( [&] (void) {\n  /*std::random_device rd;\n  std::mt19937 g(rd());\n  std::shuffle(net_shuffle.begin(), net_shuffle.end(), g);\n\n  galois::do_all(galois::iterate(net_shuffle), */\n  // galois::for_each(galois::iterate(0, numValidNets),\n  //        [&] (const auto nidRPC, auto& ctx)\n  
galois::StatTimer timer_newripupcheck(\"ripup\", \"fine grain maze\");\n  galois::StatTimer timer_setupheap(\"setup heap\", \"fine grain maze\");\n  galois::StatTimer timer_traceback(\"trace back\", \"fine grain maze\");\n  galois::StatTimer timer_adjusttree(\"adjust tree\", \"fine grain maze\");\n  galois::StatTimer timer_updateusage(\"update usage\", \"fine grain maze\");\n  galois::StatTimer timer_checkroute2dtree(\"checkroute2dtree\",\n                                           \"fine grain maze\");\n  galois::StatTimer timer_init(\"init\", \"fine grain maze\");\n  galois::StatTimer timer_foreach(\"foreach\", \"fine grain maze\");\n  galois::StatTimer timer_init_int(\"big int initialize\", \"fine grain maze\");\n  for (int nidRPC = 0; nidRPC < numValidNets; nidRPC++) {\n\n    int netID;\n\n    // maze routing for multi-source, multi-destination\n    Bool hypered, enter;\n    int i, j, deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin, xmax,\n        crossX, crossY, tmpi, min_x, min_y, num_edges;\n    int regionX1, regionX2, regionY1, regionY2;\n    int tmpind, gridsX[XRANGE], gridsY[YRANGE], tmp_gridsX[XRANGE],\n        tmp_gridsY[YRANGE];\n    int endpt1, endpt2, A1, A2, B1, B2, C1, C2, D1, D2, cnt, cnt_n1n2;\n    int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n        edge_C1C2;\n    int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2;\n    int E1x, E1y, E2x, E2y;\n    int origENG, edgeREC;\n\n    TreeEdge *treeedges, *treeedge;\n    TreeNode* treenodes;\n\n    bool* pop_heap2 = thread_local_storage->pop_heap2;\n\n    std::atomic<float>** d1 = thread_local_storage->d1_p;\n    bool** HV               = thread_local_storage->HV_p;\n    bool** hyperV           = thread_local_storage->hyperV_p;\n    bool** hyperH           = thread_local_storage->hyperH_p;\n\n    short** parentX1 = thread_local_storage->parentX1_p;\n    short** parentX3 = thread_local_storage->parentX3_p;\n    short** parentY1 = 
thread_local_storage->parentY1_p;\n    short** parentY3 = thread_local_storage->parentY3_p;\n\n    int** corrEdge = thread_local_storage->corrEdge_p;\n\n    OrderNetEdge* netEO = thread_local_storage->netEO_p;\n\n    bool** inRegion = thread_local_storage->inRegion_p;\n\n    galois::InsertBag<lateUpdateReq> pq1;\n    std::vector<int> v2;\n\n    /*for(i=0; i<yGrid*xGrid; i++)\n    {\n        pop_heap2[i] = FALSE;\n    } */\n\n    // memset(inRegion_alloc, 0, xGrid * yGrid * sizeof(bool));\n    /*for(int i=0; i<yGrid; i++)\n    {\n        for(int j=0; j<xGrid; j++)\n            inRegion[i][j] = FALSE;\n    }*/\n    // printf(\"hyperV[153][134]: %d %d %d\\n\", hyperV[153][134],\n    // parentY1[153][134], parentX3[153][134]); printf(\"what is happening?\\n\");\n\n    if (Ordering) {\n      netID = treeOrderCong[nidRPC].treeIndex;\n    } else {\n      netID = nidRPC;\n    }\n\n    deg = sttrees[netID].deg;\n\n    origENG = expand;\n\n    netedgeOrderDec(netID, netEO);\n\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    // loop for all the tree edges (2*deg-3)\n    num_edges = 2 * deg - 3;\n\n    for (edgeREC = 0; edgeREC < num_edges; edgeREC++) {\n\n      edgeID   = netEO[edgeREC].edgeID;\n      treeedge = &(treeedges[edgeID]);\n\n      n1            = treeedge->n1;\n      n2            = treeedge->n2;\n      n1x           = treenodes[n1].x;\n      n1y           = treenodes[n1].y;\n      n2x           = treenodes[n2].x;\n      n2y           = treenodes[n2].y;\n      treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n      if (treeedge->len >\n          mazeedge_Threshold) // only route the non-degraded edges (len>0)\n      {\n        timer_newripupcheck.start();\n        enter = newRipupCheck(treeedge, ripup_threshold, netID, edgeID);\n        timer_newripupcheck.stop();\n\n        // ripup the routing for the edge\n        timer_finegrain.start();\n        if (enter) {\n          // if(netID == 2 && edgeID == 26)\n          //    
printf(\"netID %d edgeID %d src %d %d dst %d %d\\n\", netID,\n          //    edgeID, n1x, n1y, n2x, n2y);\n          // pre_length = treeedge->route.routelen;\n          /*for(int i = 0; i < pre_length; i++)\n          {\n              pre_gridsY[i] = treeedge->route.gridsY[i];\n              pre_gridsX[i] = treeedge->route.gridsX[i];\n              //printf(\"i %d x %d y %d\\n\", i, pre_gridsX[i], pre_gridsY[i]);\n          }*/\n          timer_init.start();\n          if (n1y <= n2y) {\n            ymin = n1y;\n            ymax = n2y;\n          } else {\n            ymin = n2y;\n            ymax = n1y;\n          }\n\n          if (n1x <= n2x) {\n            xmin = n1x;\n            xmax = n2x;\n          } else {\n            xmin = n2x;\n            xmax = n1x;\n          }\n\n          int enlarge = min(\n              origENG, (iter / 6 + 3) *\n                           treeedge->route\n                               .routelen); // michael, this was global variable\n          regionX1 = max(0, xmin - enlarge);\n          regionX2 = min(xGrid - 1, xmax + enlarge);\n          regionY1 = max(0, ymin - enlarge);\n          regionY2 = min(yGrid - 1, ymax + enlarge);\n          // std::cout << \"region size\" << regionWidth << \", \" << regionHeight\n          // << std::endl;\n          // initialize d1[][] and d2[][] as BIG_INT\n          timer_init_int.start();\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              d1[i][j] = BIG_INT;\n            }\n          }\n          timer_init_int.stop();\n          // memset(hyperH, 0, xGrid * yGrid * sizeof(bool));\n          // memset(hyperV, 0, xGrid * yGrid * sizeof(bool));\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              HV[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n       
       hyperH[i][j] = FALSE;\n            }\n          }\n          for (i = regionY1; i <= regionY2; i++) {\n            for (j = regionX1; j <= regionX2; j++) {\n              hyperV[i][j] = FALSE;\n            }\n          }\n          // TODO: use seperate loops\n\n          // setup heap1, heap2 and initialize d1[][] and d2[][] for all the\n          // grids on the two subtrees\n          timer_setupheap.start();\n          setupHeapLate(netID, edgeID, pq1, v2, regionX1, regionX2, regionY1,\n                        regionY2, d1, corrEdge, inRegion);\n          timer_setupheap.stop();\n          // TODO: use std priority queue\n          // while loop to find shortest path\n          /*ind1 = (pq1.top().d1_p - &d1[0][0]);\n          curX = ind1%xGrid;\n          curY = ind1/xGrid;\n          printf(\"src size: %d dst size: %d\\n\", pq1.size(), v2.size());*/\n          for (auto ii = v2.begin(); ii != v2.end(); ii++) {\n            pop_heap2[*ii] = TRUE;\n            // cout << \"dst : \" << *ii % xGrid << \" \" << *ii / xGrid << endl;\n          }\n          std::atomic<int> return_ind1;\n          std::atomic<float> return_dist;\n          return_dist = (float)BIG_INT;\n\n          timer_init.stop();\n          timer_foreach.start();\n\n          galois::for_each(\n              galois::iterate(pq1),\n              [&](const auto& top, auto& ctx)\n              // while( pop_heap2[ind1]==FALSE) // stop until the grid position\n              // been popped out from both heap1 and heap2\n              {\n                // relax all the adjacent grids within the enlarged region for\n                // source subtree\n\n                int ind1 = top.d1_p - &d1[0][0];\n                // data[ind1].lock();\n\n                int curX = ind1 % xGrid;\n                int curY = ind1 / xGrid;\n                int grid = curY * xGrid + curX;\n\n                float curr_d1 = d1[curY][curX];\n                float d1_push = top.d1_push;\n\n                // 
printf(\"netID: %d edgeID:%d curX curY %d %d, d1_push: %f,\n                // curr_d1: %f\\n\", netID, edgeID, curX, curY, d1_push, curr_d1);\n                if ((d1_push == curr_d1 && d1_push < return_dist.load()) ||\n                    d1_push == 0) {\n\n                  // d1[curY][curX] = d1_push;\n                  // curr_d1 = d1_push;\n                  if (d1_push == 0) {\n                    d1_edgeID[ind1] = edgeID;\n                    d1_netID[ind1]  = netID;\n                  }\n\n                  HV[curY][curX] = top.HV;\n                  if (top.HV) {\n                    parentX1[curY][curX] = top.parentX;\n                    parentY1[curY][curX] = top.parentY;\n                  } else {\n                    parentX3[curY][curX] = top.parentX;\n                    parentY3[curY][curX] = top.parentY;\n                  }\n\n                  if (pop_heap2[ind1]) {\n                    // printf(\"reach! curX curY %d %d, d1_push: %f, curr_d1: %f\n                    // return_d: %f\\n\", curX, curY, d1_push, curr_d1,\n                    // return_dist.load());\n                    return_ind1.store(ind1);\n                    return_dist.store(d1_push);\n                  }\n\n                  /*grid = curY*xGrid + curX - 1;\n                  if(curX>regionX1)\n                      galois::runtime::acquire(&data[grid],\n                  galois::MethodFlag::WRITE);\n\n                  grid = curY*xGrid + curX + 1;\n                  if(curX<regionX2)\n                      galois::runtime::acquire(&data[grid],\n                  galois::MethodFlag::WRITE);\n\n                  grid = (curY - 1)*xGrid + curX;\n                  if(curY>regionY1)\n                      galois::runtime::acquire(&data[grid],\n                  galois::MethodFlag::WRITE);\n\n                  grid = (curY + 1)*xGrid + curX;\n                  if(curY<regionY2)\n                      galois::runtime::acquire(&data[grid],\n                  
galois::MethodFlag::WRITE);*/\n\n                  int preX = curX, preY = curY;\n                  if (curr_d1 != 0) {\n                    if (HV[curY][curX]) {\n                      preX = parentX1[curY][curX];\n                      preY = parentY1[curY][curX];\n                    } else {\n                      preX = parentX3[curY][curX];\n                      preY = parentY3[curY][curX];\n                    }\n                  }\n                  // printf(\"pop curY: %d curX: %d d1: %f preX: %d preY: %d\n                  // hyperH: %d hyperV: %d HV: %d return_dist: %f\\n\",\n                  //    curY, curX, curr_d1, preX, preY, hyperH[curY][curX],\n                  //    hyperV[curY][curX], HV[curY][curX], return_dist.load());\n                  float tmp = 0.f, tmp_cost = 0.f;\n                  int tmp_grid = 0;\n                  int tmpX = 0, tmpY = 0;\n                  bool tmpH = false;\n                  bool tmpV = false;\n\n                  // if(curX>regionX1)\n                  //    data[curY*xGrid+curX-1].lock();\n\n                  // data[curY*(xGrid-1)+curX].lock();\n\n                  // left\n                  if (curX > regionX1) {\n                    grid = curY * (xGrid - 1) + curX - 1;\n\n                    // printf(\"grid: %d %d usage: %d red:%d last:%d sum%f %d\\n\",\n                    //    grid%xGrid, grid/xGrid, h_edges[grid].usage.load(),\n                    //    h_edges[grid].red, h_edges[grid].last_usage, L ,\n                    //    h_edges[grid].usage.load() + h_edges[grid].red +\n                    //    (int)(L*h_edges[grid].last_usage));\n                    if ((preY == curY) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    } else {\n                      if (curX < regionX2 - 1) {\n  
                      tmp_grid = curY * (xGrid - 1) + curX;\n                        // float d1_right = (d1_edgeID[curY*xGrid+curX+1] ==\n                        // edgeID && d1_netID[curY*xGrid+curX+1] == netID)?\n                        // d1[curY][curX+1] : BIG_INT;\n                        tmp_cost =\n                            d1[curY][curX + 1] +\n                            h_costTable[h_edges[tmp_grid].usage +\n                                        h_edges[tmp_grid].red +\n                                        (int)(L *\n                                              h_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          // hyperH[curY][curX] = TRUE; //Michael\n                          tmpH = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    }\n                    tmpX = curX - 1; // the left neighbor\n\n                    /*if(d1[curY][tmpX]>=BIG_INT) // left neighbor not been put\n                    into heap1\n                    {\n                        d1[curY][tmpX] = tmp;\n                        parentX3[curY][tmpX] = curX;\n                        parentY3[curY][tmpX] = curY;\n                        HV[curY][tmpX] = FALSE;\n                        pq1.push(&(d1[curY][tmpX]));\n                    }\n                    else */\n                    // galois::runtime::acquire(&data[curY * yGrid + tmpX],\n                    // galois::MethodFlag::WRITE);\n                    if (d1[curY][tmpX] > tmp && tmp < return_dist) {\n                      galois::atomicMin(d1[curY][tmpX], tmp);\n                      if (d1[curY][tmpX] == tmp)\n                        ctx.push(lateUpdateReq(&(d1[curY][tmpX]), tmp, curX,\n            
                                   curY, false));\n                    }\n                  }\n\n                  // right\n\n                  if (curX < regionX2) {\n                    // data[curY*xGrid+curX+1].lock();\n                    grid = curY * (xGrid - 1) + curX;\n                    // printf(\"grid: %d %d usage: %d red:%d last:%d L:%f\n                    // sum:%d\\n\",grid%xGrid, grid/xGrid,\n                    // h_edges[grid].usage.load(), h_edges[grid].red,\n                    // h_edges[grid].last_usage, L , h_edges[grid].usage.load()\n                    // + h_edges[grid].red + (int)(L*h_edges[grid].last_usage));\n                    if ((preY == curY) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                    } else {\n                      if (curX > regionX1 + 1) {\n                        tmp_grid = curY * (xGrid - 1) + curX - 1;\n                        // float d1_left = (d1_edgeID[curY*xGrid+curX-1] ==\n                        // edgeID && d1_netID[curY*xGrid+curX-1] == netID)?\n                        // d1[curY][curX-1] : BIG_INT;\n                        tmp_cost =\n                            d1[curY][curX - 1] +\n                            h_costTable[h_edges[tmp_grid].usage +\n                                        h_edges[tmp_grid].red +\n                                        (int)(L *\n                                              h_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          // hyperH[curY][curX] = TRUE;\n                          tmpH = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red 
+\n                                      (int)(L * h_edges[grid].last_usage)];\n                    }\n                    tmpX = curX + 1; // the right neighbor\n\n                    /*if(d1[curY][tmpX]>=BIG_INT) // right neighbor not been put\n                    into heap1\n                    {\n                        d1[curY][tmpX] = tmp;\n                        parentX3[curY][tmpX] = curX;\n                        parentY3[curY][tmpX] = curY;\n                        HV[curY][tmpX] = FALSE;\n                        pq1.push(&(d1[curY][tmpX]));\n\n                    }\n                    else */\n                    // galois::runtime::acquire(&data[curY * yGrid + tmpX],\n                    // galois::MethodFlag::WRITE);\n                    if (d1[curY][tmpX] > tmp && tmp < return_dist) {\n                      galois::atomicMin(d1[curY][tmpX], tmp);\n                      if (d1[curY][tmpX] == tmp)\n                        ctx.push(lateUpdateReq(&(d1[curY][tmpX]), tmp, curX,\n                                               curY, false));\n                    }\n                  }\n                  // data[curY*(xGrid-1)+curX].lock();\n                  hyperH[curY][curX] = tmpH;\n\n                  // data[curY*(xGrid-1)+curX].unlock();\n\n                  // bottom\n\n                  // if(curY>regionY1)\n                  //   data[(curY-1)*xGrid+curX].lock();\n\n                  // data[curY*(xGrid-1)+curX].lock();\n\n                  if (curY > regionY1) {\n                    grid = (curY - 1) * xGrid + curX;\n                    // printf(\"grid: %d %d usage: %d red:%d last:%d sum%f %d\\n\",\n                    //    grid%xGrid, grid/xGrid, v_edges[grid].usage.load(),\n                    //    v_edges[grid].red, v_edges[grid].last_usage, L ,\n                    //    v_edges[grid].usage.load() + v_edges[grid].red +\n                    //    (int)(L*v_edges[grid].last_usage));\n                    if ((preX == curX) || (curr_d1 == 0)) 
{\n                      tmp =\n                          curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    } else {\n                      if (curY < regionY2 - 1) {\n                        tmp_grid = curY * xGrid + curX;\n                        // float d1_top = (d1_edgeID[(curY+1)*xGrid+curX] ==\n                        // edgeID && d1_netID[(curY+1)*xGrid+curX] == netID)?\n                        // d1[curY+1][curX] : BIG_INT;\n                        tmp_cost =\n                            d1[curY + 1][curX] +\n                            v_costTable[v_edges[tmp_grid].usage +\n                                        v_edges[tmp_grid].red +\n                                        (int)(L *\n                                              v_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          // hyperV[curY][curX] = TRUE;\n                          tmpV = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    }\n                    tmpY = curY - 1; // the bottom neighbor\n\n                    /*if(d1[tmpY][curX]>=BIG_INT) // bottom neighbor not been\n                    put into heap1\n                    {\n                        d1[tmpY][curX] = tmp;\n                        parentX1[tmpY][curX] = curX;\n                        parentY1[tmpY][curX] = curY;\n                        HV[tmpY][curX] = TRUE;\n                        pq1.push(&(d1[tmpY][curX]));\n\n                    }\n                    else */\n                    // galois::runtime::acquire(&data[tmpY * yGrid + curX],\n                    // 
galois::MethodFlag::WRITE);\n                    if (d1[tmpY][curX] > tmp && tmp < return_dist) {\n                      galois::atomicMin(d1[tmpY][curX], tmp);\n                      if (d1[tmpY][curX] == tmp)\n                        ctx.push(lateUpdateReq(&(d1[tmpY][curX]), tmp, curX,\n                                               curY, true));\n                    }\n                  }\n                  // top\n                  if (curY < regionY2) {\n\n                    grid = curY * xGrid + curX;\n                    // printf(\"grid: %d %d usage: %d red:%d last:%d sum%f %d\\n\",\n                    //    grid%xGrid, grid/xGrid, v_edges[grid].usage.load(),\n                    //    v_edges[grid].red, v_edges[grid].last_usage, L ,\n                    //    v_edges[grid].usage.load() + v_edges[grid].red +\n                    //    (int)(L*v_edges[grid].last_usage));\n                    if ((preX == curX) || (curr_d1 == 0)) {\n                      tmp =\n                          curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    } else {\n                      if (curY > regionY1 + 1) {\n                        tmp_grid = (curY - 1) * xGrid + curX;\n                        // float d1_bot = (d1_edgeID[(curY-1)*xGrid+curX] ==\n                        // edgeID && d1_netID[(curY-1)*xGrid+curX] == netID)?\n                        // d1[curY-1][curX] : BIG_INT;\n                        tmp_cost =\n                            d1[curY - 1][curX] +\n                            v_costTable[v_edges[tmp_grid].usage +\n                                        v_edges[tmp_grid].red +\n                                        (int)(L *\n                                              v_edges[tmp_grid].last_usage)];\n\n                        if (tmp_cost < curr_d1 + VIA) {\n                          // hyperV[curY][curX] = TRUE;\n 
                         tmpV = true;\n                        }\n                      }\n                      tmp =\n                          curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                    }\n                    tmpY = curY + 1; // the top neighbor\n\n                    /*if(d1[tmpY][curX]>=BIG_INT) // top neighbor not been put\n                    into heap1\n                    {\n                        d1[tmpY][curX] = tmp;\n                        parentX1[tmpY][curX] = curX;\n                        parentY1[tmpY][curX] = curY;\n                        HV[tmpY][curX] = TRUE;\n                        pq1.push(&(d1[tmpY][curX]));\n                    }\n                    else*/\n                    // galois::runtime::acquire(&data[tmpY * yGrid + curX],\n                    // galois::MethodFlag::WRITE);\n                    if (d1[tmpY][curX] > tmp && tmp < return_dist) {\n                      galois::atomicMin(d1[tmpY][curX], tmp);\n                      if (d1[tmpY][curX] == tmp)\n                        ctx.push(lateUpdateReq(&(d1[tmpY][curX]), tmp, curX,\n                                               curY, true));\n                    }\n                  }\n                  hyperV[curY][curX] = tmpV;\n                  // data[curY*xGrid+curX].unlock();\n                }\n                // data[ind1].unlock();\n              },\n              // galois::wl<galois::worklists::ParaMeter<>>(),\n              // galois::wl<PSChunk>(),\n              galois::wl<OBIM_late>(RequestIndexerLate),\n              // galois::chunk_size<MAZE_CHUNK_SIZE>()\n              // galois::parallel_break(),\n              // galois::steal(),\n              galois::loopname(\"fine_grain\"));\n\n          timer_foreach.stop();\n\n          for (auto ii = v2.begin(); ii != v2.end(); ii++)\n            pop_heap2[*ii] = 
FALSE;\n\n          crossX = return_ind1 % xGrid;\n          crossY = return_ind1 / xGrid;\n\n          cnt      = 0;\n          int curX = crossX;\n          int curY = crossY;\n          int tmpX, tmpY;\n          // if(netID == 2 && edgeID == 26)\n          //    printf(\"crossX %d crossY %d return_d: %f\\n\", crossX, crossY,\n          //    return_dist.load());\n          timer_traceback.start();\n          while (d1[curY][curX] != 0) // loop until reach subtree1\n          {\n            // if(cnt < 200)\n            //    printf(\"Y: %d X: %d hyperH: %d hyperV: %d HV: %d d1: %f\\n\",\n            //    curY, curX, hyperH[curY][curX], hyperV[curY][curX],\n            //    HV[curY][curX], d1[curY][curX]);\n\n            hypered = FALSE;\n            if (cnt != 0) {\n              if (curX != tmpX && hyperH[curY][curX]) {\n                curX    = 2 * curX - tmpX;\n                hypered = TRUE;\n              }\n              // printf(\"hyperV[153][134]: %d\\n\", hyperV[curY][curX]);\n              if (curY != tmpY && hyperV[curY][curX]) {\n                curY    = 2 * curY - tmpY;\n                hypered = TRUE;\n              }\n            }\n            tmpX = curX;\n            tmpY = curY;\n            if (!hypered) {\n              if (HV[tmpY][tmpX]) {\n                curY = parentY1[tmpY][tmpX];\n              } else {\n                curX = parentX3[tmpY][tmpX];\n              }\n            }\n\n            tmp_gridsX[cnt] = curX;\n            tmp_gridsY[cnt] = curY;\n            cnt++;\n          }\n          // reverse the grids on the path\n\n          for (i = 0; i < cnt; i++) {\n            tmpind    = cnt - 1 - i;\n            gridsX[i] = tmp_gridsX[tmpind];\n            gridsY[i] = tmp_gridsY[tmpind];\n          }\n          // add the connection point (crossX, crossY)\n          gridsX[cnt] = crossX;\n          gridsY[cnt] = crossY;\n          cnt++;\n\n          curX     = crossX;\n          curY     = crossY;\n          cnt_n1n2 = 
cnt;\n\n          // change the tree structure according to the new routing for the tree\n          // edge find E1 and E2, and the endpoints of the edges they are on\n          E1x = gridsX[0];\n          E1y = gridsY[0];\n          E2x = gridsX[cnt_n1n2 - 1];\n          E2y = gridsY[cnt_n1n2 - 1];\n\n          edge_n1n2 = edgeID;\n\n          timer_traceback.stop();\n\n          // if(netID == 14628)\n          //    printf(\"netID %d edgeID %d src %d %d dst %d %d routelen: %d\\n\",\n          //    netID, edgeID, E1x, E1y, E2x, E2y, cnt_n1n2);\n          // (1) consider subtree1\n          timer_adjusttree.start();\n          if (n1 >= deg && (E1x != n1x || E1y != n1y))\n          // n1 is not a pin and E1!=n1, then make change to subtree1,\n          // otherwise, no change to subtree1\n          {\n            // find the endpoints of the edge E1 is on\n            endpt1 = treeedges[corrEdge[E1y][E1x]].n1;\n            endpt2 = treeedges[corrEdge[E1y][E1x]].n2;\n\n            // find A1, A2 and edge_n1A1, edge_n1A2\n            if (treenodes[n1].nbr[0] == n2) {\n              A1        = treenodes[n1].nbr[1];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[1];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else if (treenodes[n1].nbr[1] == n2) {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[2];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[2];\n            } else {\n              A1        = treenodes[n1].nbr[0];\n              A2        = treenodes[n1].nbr[1];\n              edge_n1A1 = treenodes[n1].edge[0];\n              edge_n1A2 = treenodes[n1].edge[1];\n            }\n\n            if (endpt1 == n1 || endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n            {\n              // if E1 is on (n1, A2), switch A1 and A2 so that E1 is always on\n              // (n1, A1)\n              if (endpt1 
== A2 || endpt2 == A2) {\n                tmpi      = A1;\n                A1        = A2;\n                A2        = tmpi;\n                tmpi      = edge_n1A1;\n                edge_n1A1 = edge_n1A2;\n                edge_n1A2 = tmpi;\n              }\n\n              // update route for edge (n1, A1), (n1, A2)\n              updateRouteType1(treenodes, n1, A1, A2, E1x, E1y, treeedges,\n                               edge_n1A1, edge_n1A2);\n              // update position for n1\n              treenodes[n1].x = E1x;\n              treenodes[n1].y = E1y;\n            }    // if E1 is on (n1, A1) or (n1, A2)\n            else // E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n            {\n              C1        = endpt1;\n              C2        = endpt2;\n              edge_C1C2 = corrEdge[E1y][E1x];\n\n              // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n              updateRouteType2(treenodes, n1, A1, A2, C1, C2, E1x, E1y,\n                               treeedges, edge_n1A1, edge_n1A2, edge_C1C2);\n              // update position for n1\n              treenodes[n1].x = E1x;\n              treenodes[n1].y = E1y;\n              // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2), (C1,\n              // C2)->(A1, A2)\n              edge_n1C1               = edge_n1A1;\n              treeedges[edge_n1C1].n1 = C1;\n              treeedges[edge_n1C1].n2 = n1;\n              edge_n1C2               = edge_n1A2;\n              treeedges[edge_n1C2].n1 = n1;\n              treeedges[edge_n1C2].n2 = C2;\n              edge_A1A2               = edge_C1C2;\n              treeedges[edge_A1A2].n1 = A1;\n              treeedges[edge_A1A2].n2 = A2;\n              // update nbr and edge for 5 nodes n1, A1, A2, C1, C2\n              // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n              treenodes[n1].nbr[0]  = n2;\n              treenodes[n1].edge[0] = edge_n1n2;\n              treenodes[n1].nbr[1]  = C1;\n              treenodes[n1].edge[1] = 
edge_n1C1;\n              treenodes[n1].nbr[2]  = C2;\n              treenodes[n1].edge[2] = edge_n1C2;\n              // A1's nbr n1->A2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A1].nbr[i] == n1) {\n                  treenodes[A1].nbr[i]  = A2;\n                  treenodes[A1].edge[i] = edge_A1A2;\n                  break;\n                }\n              }\n              // A2's nbr n1->A1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[A2].nbr[i] == n1) {\n                  treenodes[A2].nbr[i]  = A1;\n                  treenodes[A2].edge[i] = edge_A1A2;\n                  break;\n                }\n              }\n              // C1's nbr C2->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C1].nbr[i] == C2) {\n                  treenodes[C1].nbr[i]  = n1;\n                  treenodes[C1].edge[i] = edge_n1C1;\n                  break;\n                }\n              }\n              // C2's nbr C1->n1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[C2].nbr[i] == C1) {\n                  treenodes[C2].nbr[i]  = n1;\n                  treenodes[C2].edge[i] = edge_n1C2;\n                  break;\n                }\n              }\n\n            } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n          }   // n1 is not a pin and E1!=n1\n\n          // (2) consider subtree2\n\n          if (n2 >= deg && (E2x != n2x || E2y != n2y))\n          // n2 is not a pin and E2!=n2, then make change to subtree2,\n          // otherwise, no change to subtree2\n          {\n            // find the endpoints of the edge E1 is on\n            endpt1 = treeedges[corrEdge[E2y][E2x]].n1;\n            endpt2 = treeedges[corrEdge[E2y][E2x]].n2;\n\n            // find B1, B2\n            if (treenodes[n2].nbr[0] == n1) {\n              B1        = treenodes[n2].nbr[1];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = 
treenodes[n2].edge[1];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else if (treenodes[n2].nbr[1] == n1) {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[2];\n              edge_n2B1 = treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[2];\n            } else {\n              B1        = treenodes[n2].nbr[0];\n              B2        = treenodes[n2].nbr[1];\n              edge_n2B1 = treenodes[n2].edge[0];\n              edge_n2B2 = treenodes[n2].edge[1];\n            }\n\n            if (endpt1 == n2 || endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n            {\n              // if E2 is on (n2, B2), switch B1 and B2 so that E2 is always on\n              // (n2, B1)\n              if (endpt1 == B2 || endpt2 == B2) {\n                tmpi      = B1;\n                B1        = B2;\n                B2        = tmpi;\n                tmpi      = edge_n2B1;\n                edge_n2B1 = edge_n2B2;\n                edge_n2B2 = tmpi;\n              }\n\n              // update route for edge (n2, B1), (n2, B2)\n              updateRouteType1(treenodes, n2, B1, B2, E2x, E2y, treeedges,\n                               edge_n2B1, edge_n2B2);\n\n              // update position for n2\n              treenodes[n2].x = E2x;\n              treenodes[n2].y = E2y;\n            }    // if E2 is on (n2, B1) or (n2, B2)\n            else // E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n            {\n              D1        = endpt1;\n              D2        = endpt2;\n              edge_D1D2 = corrEdge[E2y][E2x];\n\n              // update route for edge (n2, D1), (n2, D2) and (B1, B2)\n              updateRouteType2(treenodes, n2, B1, B2, D1, D2, E2x, E2y,\n                               treeedges, edge_n2B1, edge_n2B2, edge_D1D2);\n              // update position for n2\n              treenodes[n2].x = E2x;\n              treenodes[n2].y = E2y;\n              // update 3 edges (n2, 
B1)->(D1, n2), (n2, B2)->(n2, D2), (D1,\n              // D2)->(B1, B2)\n              edge_n2D1               = edge_n2B1;\n              treeedges[edge_n2D1].n1 = D1;\n              treeedges[edge_n2D1].n2 = n2;\n              edge_n2D2               = edge_n2B2;\n              treeedges[edge_n2D2].n1 = n2;\n              treeedges[edge_n2D2].n2 = D2;\n              edge_B1B2               = edge_D1D2;\n              treeedges[edge_B1B2].n1 = B1;\n              treeedges[edge_B1B2].n2 = B2;\n              // update nbr and edge for 5 nodes n2, B1, B2, D1, D2\n              // n1's nbr (n1, B1, B2)->(n1, D1, D2)\n              treenodes[n2].nbr[0]  = n1;\n              treenodes[n2].edge[0] = edge_n1n2;\n              treenodes[n2].nbr[1]  = D1;\n              treenodes[n2].edge[1] = edge_n2D1;\n              treenodes[n2].nbr[2]  = D2;\n              treenodes[n2].edge[2] = edge_n2D2;\n              // B1's nbr n2->B2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[B1].nbr[i] == n2) {\n                  treenodes[B1].nbr[i]  = B2;\n                  treenodes[B1].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // B2's nbr n2->B1\n              for (i = 0; i < 3; i++) {\n                if (treenodes[B2].nbr[i] == n2) {\n                  treenodes[B2].nbr[i]  = B1;\n                  treenodes[B2].edge[i] = edge_B1B2;\n                  break;\n                }\n              }\n              // D1's nbr D2->n2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[D1].nbr[i] == D2) {\n                  treenodes[D1].nbr[i]  = n2;\n                  treenodes[D1].edge[i] = edge_n2D1;\n                  break;\n                }\n              }\n              // D2's nbr D1->n2\n              for (i = 0; i < 3; i++) {\n                if (treenodes[D2].nbr[i] == D1) {\n                  treenodes[D2].nbr[i]  = n2;\n                  treenodes[D2].edge[i] = edge_n2D2;\n     
             break;\n                }\n              }\n            } // else E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n          }   // n2 is not a pin and E2!=n2\n\n          // update route for edge (n1, n2) and edge usage\n\n          // printf(\"update route? %d %d\\n\", netID, num_edges);\n          if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n            free(treeedges[edge_n1n2].route.gridsX);\n            free(treeedges[edge_n1n2].route.gridsY);\n          }\n          treeedges[edge_n1n2].route.gridsX =\n              (short*)calloc(cnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.gridsY =\n              (short*)calloc(cnt_n1n2, sizeof(short));\n          treeedges[edge_n1n2].route.type     = MAZEROUTE;\n          treeedges[edge_n1n2].route.routelen = cnt_n1n2 - 1;\n          treeedges[edge_n1n2].len = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n          treeedges[edge_n1n2].n_ripups += 1;\n          total_ripups += 1;\n          max_ripups.update(treeedges[edge_n1n2].n_ripups);\n\n          for (i = 0; i < cnt_n1n2; i++) {\n            // printf(\"cnt_n1n2: %d\\n\", cnt_n1n2);\n            treeedges[edge_n1n2].route.gridsX[i] = gridsX[i];\n            treeedges[edge_n1n2].route.gridsY[i] = gridsY[i];\n          }\n          // std::cout << \" adjsut tree\" << std::endl;\n          timer_adjusttree.stop();\n\n          // update edge usage\n\n          /*for(i=0; i<pre_length; i++)\n          {\n              if(pre_gridsX[i]==pre_gridsX[i+1]) // a vertical edge\n              {\n                  if(i != pre_length - 1)\n                      min_y = min(pre_gridsY[i], pre_gridsY[i+1]);\n                  else\n                      min_y = pre_gridsY[i];\n                  //v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n                  //galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage,\n          (short unsigned)1);\n                  //printf(\"x y %d %d i %d \\n\", pre_gridsX[i], min_y, i);\n                  
v_edges[min_y*xGrid+pre_gridsX[i]].usage.fetch_sub((short\n          int)1, std::memory_order_relaxed);\n                  //if(v_edges[min_y*xGrid+pre_gridsX[i]].usage < 0) printf(\"V\n          negative! %d \\n\", i);\n              }\n              else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n              {\n                  if(i != pre_length - 1)\n                      min_x = min(pre_gridsX[i], pre_gridsX[i+1]);\n                  else\n                      min_x = pre_gridsX[i];\n                  //h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n                  //galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n          (short unsigned)1);\n                  //printf(\"x y %d %d i %d\\n\", min_x, pre_gridsY[i], i);\n                  h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage.fetch_sub((short\n          int)1, std::memory_order_relaxed);\n                  //if(h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage < 0)\n          printf(\"H negative! %d \\n\", i);\n              }\n          }*/\n          timer_updateusage.start();\n          for (i = 0; i < cnt_n1n2 - 1; i++) {\n            if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n            {\n              min_y = min(gridsY[i], gridsY[i + 1]);\n              // v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n              // galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage, (short\n              // unsigned)1);\n              v_edges[min_y * xGrid + gridsX[i]].usage.fetch_add(\n                  (short int)1, std::memory_order_relaxed);\n            } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n            {\n              min_x = min(gridsX[i], gridsX[i + 1]);\n              // h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n              // galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n              // (short unsigned)1);\n              h_edges[gridsY[i] * (xGrid - 1) + min_x].usage.fetch_add(\n                  (short int)1, 
std::memory_order_relaxed);\n            }\n          }\n          timer_updateusage.stop();\n          /*if(LOCK){\n              for(i=0; i<cnt_n1n2-1; i++)\n              {\n                  if(gridsX[i]==gridsX[i+1]) // a vertical edge\n                  {\n                      min_y = min(gridsY[i], gridsY[i+1]);\n                      v_edges[min_y*xGrid+gridsX[i]].releaseLock();\n                  }\n                  else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                  {\n                      min_x = min(gridsX[i], gridsX[i+1]);\n                      h_edges[gridsY[i]*(xGrid-1)+min_x].releaseLock();\n                  }\n              }\n          }*/\n          // printf(\"netID %d edgeID %d src %d %d dst %d %d routelen: %d\\n\",\n          // netID, edgeID, n1x, n1y, n2x, n2y, cnt_n1n2);\n          timer_checkroute2dtree.start();\n          if (checkRoute2DTree(netID)) {\n            reInitTree(netID);\n            return;\n          }\n          timer_checkroute2dtree.stop();\n        } // congested route, if(enter)\n        timer_finegrain.stop();\n      } // only route the non-degraded edges (len>0)\n    }   // iterate on edges of a net\n  }\n\n  printf(\"total ripups: %d max ripups: %d\\n\", total_ripups.reduce(),\n         max_ripups.reduce());\n  //}, \"mazeroute vtune function\");\n  free(h_costTable);\n  free(v_costTable);\n\n  thread_local_storage->clear();\n  delete thread_local_storage;\n\n  delete[] d1_edgeID;\n  delete[] d1_netID;\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/maze_lock.h",
    "content": "\n\n// Decide whether the maze route stored on a tree edge has to be ripped up and,\n// if so, release the usage it holds on every grid edge it covers.\n//\n// A grid edge counts as congested when usage + red >= capacity -\n// ripup_threshold (vCapacity for vertical, hCapacity for horizontal edges).\n// The route is scanned once without locks; only if that scan finds congestion\n// are all grid edges of the route acquired with galois::MethodFlag::WRITE and\n// the scan repeated.\n//\n// treeedge:        edge whose route.gridsX / route.gridsY path is inspected\n// ripup_threshold: slack subtracted from the capacities in the test above\n// netID, edgeID:   used only for the diagnostics printed for a non-maze route\n//\n// Returns TRUE only if the re-check under locks confirmed the congestion and\n// the usage of the route was decremented; FALSE for a zero-length edge or a\n// route that is not (or no longer) congested. A route whose type is not\n// MAZEROUTE is reported via printf / printEdge and terminates the process.\nBool newRipupCheck_lock(TreeEdge* treeedge, int ripup_threshold, int netID,\n                        int edgeID) {\n  short *gridsX, *gridsY;\n  int i, grid, ymin, xmin;\n  Bool needRipup  = FALSE;\n  Bool needRipup2 = FALSE;\n\n  if (treeedge->len == 0) {\n    return (FALSE);\n  } // not ripup for degraded edge\n\n  if (treeedge->route.type == MAZEROUTE) {\n    gridsX = treeedge->route.gridsX;\n    gridsY = treeedge->route.gridsY;\n\n    for (i = 0; i < treeedge->route.routelen; i++) // first check\n    {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin = min(gridsY[i], gridsY[i + 1]);\n        grid = ymin * xGrid + gridsX[i];\n        if (v_edges[grid].usage + v_edges[grid].red >=\n            vCapacity - ripup_threshold) {\n          needRipup = TRUE;\n          break;\n        }\n\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin = min(gridsX[i], gridsX[i + 1]);\n        grid = gridsY[i] * (xGrid - 1) + xmin;\n        if (h_edges[grid].usage + h_edges[grid].red >=\n            hCapacity - ripup_threshold) {\n          needRipup = TRUE;\n          break;\n        }\n      }\n    }\n\n    if (needRipup) {\n\n      for (i = 0; i < treeedge->route.routelen; i++) // then lock\n      {\n        if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n        {\n          ymin = min(gridsY[i], gridsY[i + 1]);\n          grid = ymin * xGrid + gridsX[i];\n          galois::runtime::acquire(&v_edges[grid], galois::MethodFlag::WRITE);\n\n        } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n        {\n          xmin = min(gridsX[i], gridsX[i + 1]);\n          grid = gridsY[i] * (xGrid - 1) + xmin;\n          galois::runtime::acquire(&h_edges[grid], galois::MethodFlag::WRITE);\n        }\n      }\n\n      for (i = 0; i < treeedge->route.routelen; i++) // second check\n      {\n        if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n        {\n          ymin = 
min(gridsY[i], gridsY[i + 1]);\n          grid = ymin * xGrid + gridsX[i];\n          if (v_edges[grid].usage + v_edges[grid].red >=\n              vCapacity - ripup_threshold) {\n            needRipup2 = TRUE;\n            break;\n          }\n\n        } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n        {\n          xmin = min(gridsX[i], gridsX[i + 1]);\n          grid = gridsY[i] * (xGrid - 1) + xmin;\n          if (h_edges[grid].usage + h_edges[grid].red >=\n              hCapacity - ripup_threshold) {\n            needRipup2 = TRUE;\n            break;\n          }\n        }\n      }\n\n      if (needRipup2) {\n\n        for (i = 0; i < treeedge->route.routelen; i++) {\n          if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n          {\n            ymin = min(gridsY[i], gridsY[i + 1]);\n            // v_edges[ymin*xGrid+gridsX[i]].usage -= 1;\n            v_edges[ymin * xGrid + gridsX[i]].usage.fetch_sub(\n                (short unsigned)1, std::memory_order_relaxed);\n          } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n          {\n            xmin = min(gridsX[i], gridsX[i + 1]);\n            // h_edges[gridsY[i]*(xGrid-1)+xmin].usage -= 1;\n            h_edges[gridsY[i] * (xGrid - 1) + xmin].usage.fetch_sub(\n                (short unsigned)1, std::memory_order_relaxed);\n          }\n        }\n      }\n\n      // Report a ripup only when the usage was actually released above. The old\n      // code returned TRUE even if the locked re-check found no congestion,\n      // leaving the usage of the old route in place; a caller that re-routes on\n      // TRUE would presumably count the edge twice -- verify against callers.\n      return (needRipup2);\n    } else {\n      return (FALSE);\n    }\n  } else {\n    printf(\"route type is not maze, netID %d\\n\", netID);\n    fflush(stdout);\n    printEdge(netID, edgeID);\n\n    exit(0);\n  }\n}\n\nvoid mazeRouteMSMD_lock(int iter, int expand, float costHeight,\n                        int ripup_threshold, int mazeedge_Threshold,\n                        Bool Ordering, int cost_type) {\n  // LOCK = 0;\n  float forange;\n\n  // allocate memory for distance and parent and pop_heap\n  h_costTable = (float*)calloc(40 * hCapacity, sizeof(float));\n  v_costTable = (float*)calloc(40 * vCapacity, 
sizeof(float));\n\n  forange = 40 * hCapacity;\n\n  if (cost_type == 2) {\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity - 1)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity - 1)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  } else {\n\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  }\n\n  /*forange = yGrid*xGrid;\n  for(i=0; i<forange; i++)\n  {\n      pop_heap2[i] = FALSE;\n  } //Michael*/\n\n  if (Ordering) {\n    StNetOrder();\n    // printf(\"order?\\n\");\n  }\n\n  galois::substrate::PerThreadStorage<THREAD_LOCAL_STORAGE>\n      thread_local_storage{};\n  // for(nidRPC=0; nidRPC<numValidNets; nidRPC++)//parallelize\n  PerThread_PQ perthread_pq;\n  PerThread_Vec perthread_vec;\n  PRINT = 0;\n  
galois::GAccumulator<int> total_ripups;\n  galois::GReduceMax<int> max_ripups;\n  total_ripups.reset();\n  max_ripups.reset();\n\n  // galois::runtime::profileVtune( [&] (void) {\n  /*std::random_device rd;\n  std::mt19937 g(rd());\n  std::shuffle(net_shuffle.begin(), net_shuffle.end(), g);\n\n  galois::do_all(galois::iterate(net_shuffle), */\n  galois::do_all(\n      galois::iterate(0, numValidNets),\n      [&](const auto nidRPC)\n      // galois::do_all(galois::iterate(0, numValidNets),\n      //        [&] (const auto nidRPC)\n      {\n        int grid, netID;\n\n        // maze routing for multi-source, multi-destination\n        bool hypered, enter;\n        int i, j, deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin,\n            xmax, curX, curY, crossX, crossY, tmpX, tmpY, tmpi, min_x, min_y,\n            num_edges;\n        int regionX1, regionX2, regionY1, regionY2;\n        int ind1, tmpind, gridsX[XRANGE], gridsY[YRANGE], tmp_gridsX[XRANGE],\n            tmp_gridsY[YRANGE];\n        int endpt1, endpt2, A1, A2, B1, B2, C1, C2, D1, D2, cnt, cnt_n1n2;\n        int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n            edge_C1C2;\n        int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2;\n        int E1x, E1y, E2x, E2y;\n        int tmp_grid;\n        int preX, preY, origENG, edgeREC;\n\n        float tmp, tmp_cost;\n        TreeEdge *treeedges, *treeedge;\n        TreeNode* treenodes;\n\n        bool* pop_heap2 = thread_local_storage.getLocal()->pop_heap2;\n\n        float** d1    = thread_local_storage.getLocal()->d1_p;\n        bool** HV     = thread_local_storage.getLocal()->HV_p;\n        bool** hyperV = thread_local_storage.getLocal()->hyperV_p;\n        bool** hyperH = thread_local_storage.getLocal()->hyperH_p;\n\n        short** parentX1 = thread_local_storage.getLocal()->parentX1_p;\n        short** parentX3 = thread_local_storage.getLocal()->parentX3_p;\n        short** parentY1 = 
thread_local_storage.getLocal()->parentY1_p;\n        short** parentY3 = thread_local_storage.getLocal()->parentY3_p;\n\n        int** corrEdge = thread_local_storage.getLocal()->corrEdge_p;\n\n        OrderNetEdge* netEO = thread_local_storage.getLocal()->netEO_p;\n\n        bool** inRegion = thread_local_storage.getLocal()->inRegion_p;\n        // bool* inRegion_alloc =\n        // thread_local_storage->getLocal()->inRegion_alloc;\n\n        local_pq pq1 = perthread_pq.get();\n        local_vec v2 = perthread_vec.get();\n\n        /*for(i=0; i<yGrid*xGrid; i++)\n        {\n            pop_heap2[i] = FALSE;\n        } */\n\n        // memset(inRegion_alloc, 0, xGrid * yGrid * sizeof(bool));\n        /*for(int i=0; i<yGrid; i++)\n        {\n            for(int j=0; j<xGrid; j++)\n                inRegion[i][j] = FALSE;\n        }*/\n        // printf(\"hyperV[153][134]: %d %d %d\\n\", hyperV[153][134],\n        // parentY1[153][134], parentX3[153][134]); printf(\"what is\n        // happening?\\n\");\n\n        if (Ordering) {\n          netID = treeOrderCong[nidRPC].treeIndex;\n        } else {\n          netID = nidRPC;\n        }\n\n        deg = sttrees[netID].deg;\n\n        origENG = expand;\n\n        netedgeOrderDec(netID, netEO);\n\n        treeedges = sttrees[netID].edges;\n        treenodes = sttrees[netID].nodes;\n        // loop for all the tree edges (2*deg-3)\n        num_edges = 2 * deg - 3;\n\n        for (edgeREC = 0; edgeREC < num_edges; edgeREC++) {\n          edgeID   = netEO[edgeREC].edgeID;\n          treeedge = &(treeedges[edgeID]);\n\n          n1            = treeedge->n1;\n          n2            = treeedge->n2;\n          n1x           = treenodes[n1].x;\n          n1y           = treenodes[n1].y;\n          n2x           = treenodes[n2].x;\n          n2y           = treenodes[n2].y;\n          treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n          if (treeedge->len >\n              mazeedge_Threshold) // only route the 
non-degraded edges (len>0)\n          {\n\n            // enter = newRipupCheck_nosub(treeedge, n1x, n1y, n2x, n2y,\n            // ripup_threshold, netID, edgeID);\n            enter =\n                newRipupCheck_lock(treeedge, ripup_threshold, netID, edgeID);\n\n            // ripup the routing for the edge\n            if (enter) {\n\n              // if(netID == 252163 && edgeID == 51)\n              //    printf(\"netID %d edgeID %d src %d %d dst %d %d\\n\", netID,\n              //    edgeID, n1x, n1y, n2x, n2y);\n              if (n1y <= n2y) {\n                ymin = n1y;\n                ymax = n2y;\n              } else {\n                ymin = n2y;\n                ymax = n1y;\n              }\n\n              if (n1x <= n2x) {\n                xmin = n1x;\n                xmax = n2x;\n              } else {\n                xmin = n2x;\n                xmax = n1x;\n              }\n\n              int enlarge =\n                  min(origENG,\n                      (iter / 6 + 3) *\n                          treeedge->route\n                              .routelen); // michael, this was global variable\n              regionX1 = max(0, xmin - enlarge);\n              regionX2 = min(xGrid - 1, xmax + enlarge);\n              regionY1 = max(0, ymin - enlarge);\n              regionY2 = min(yGrid - 1, ymax + enlarge);\n\n              // initialize d1[][] and d2[][] as BIG_INT\n              for (i = regionY1; i <= regionY2; i++) {\n                for (j = regionX1; j <= regionX2; j++) {\n                  d1[i][j] = BIG_INT;\n                  /*d2[i][j] = BIG_INT;\n                  hyperH[i][j] = FALSE;\n                  hyperV[i][j] = FALSE;*/\n                }\n              }\n              // memset(hyperH, 0, xGrid * yGrid * sizeof(bool));\n              // memset(hyperV, 0, xGrid * yGrid * sizeof(bool));\n              for (i = regionY1; i <= regionY2; i++) {\n                for (j = regionX1; j <= regionX2; j++) {\n                  
hyperH[i][j] = FALSE;\n                }\n              }\n              for (i = regionY1; i <= regionY2; i++) {\n                for (j = regionX1; j <= regionX2; j++) {\n                  hyperV[i][j] = FALSE;\n                }\n              }\n              // TODO: use seperate loops\n\n              // setup heap1, heap2 and initialize d1[][] and d2[][] for all the\n              // grids on the two subtrees\n              setupHeap(netID, edgeID, pq1, v2, regionX1, regionX2, regionY1,\n                        regionY2, d1, corrEdge, inRegion);\n              // TODO: use std priority queue\n              // while loop to find shortest path\n              ind1 = (pq1.top().d1_p - &d1[0][0]);\n              pq1.pop();\n              curX = ind1 % xGrid;\n              curY = ind1 / xGrid;\n\n              for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++) {\n                pop_heap2[*ii] = TRUE;\n              }\n              float curr_d1;\n              while (pop_heap2[ind1] ==\n                     FALSE) // stop until the grid position been popped out from\n                            // both heap1 and heap2\n              {\n                // relax all the adjacent grids within the enlarged region for\n                // source subtree\n\n                // if(PRINT) printf(\"curX curY %d %d, (%d, %d), (%d, %d),\n                // pq1.size: %d\\n\", curX, curY, regionX1, regionX2, regionY1,\n                // regionY2, pq1.size()); if(curX == 102 && curY == 221)\n                // exit(1);\n                curr_d1 = d1[curY][curX];\n                if (curr_d1 != 0) {\n                  if (HV[curY][curX]) {\n                    preX = parentX1[curY][curX];\n                    preY = parentY1[curY][curX];\n                  } else {\n                    preX = parentX3[curY][curX];\n                    preY = parentY3[curY][curX];\n                  }\n                } else {\n                  preX = curX;\n                  preY 
= curY;\n                }\n\n                // left\n                if (curX > regionX1) {\n                  grid = curY * (xGrid - 1) + curX - 1;\n                  // printf(\"grid: %d usage: %d red:%d last:%d sum%f %d\\n\",grid,\n                  // h_edges[grid].usage.load(), h_edges[grid].red,\n                  // h_edges[grid].last_usage, L , h_edges[grid].usage.load() +\n                  // h_edges[grid].red + (int)(L*h_edges[grid].last_usage));\n                  if ((preY == curY) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  } else {\n                    if (curX < regionX2 - 1) {\n                      tmp_grid = curY * (xGrid - 1) + curX;\n                      tmp_cost =\n                          d1[curY][curX + 1] +\n                          h_costTable[h_edges[tmp_grid].usage +\n                                      h_edges[tmp_grid].red +\n                                      (int)(L * h_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA) {\n                        hyperH[curY][curX] = TRUE; // Michael\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  }\n                  // if(LOCK)  h_edges[grid].releaseLock();\n                  tmpX = curX - 1; // the left neighbor\n\n                  /*if(d1[curY][tmpX]>=BIG_INT) // left neighbor not been put\n                  into heap1\n                  {\n                      d1[curY][tmpX] = tmp;\n                      parentX3[curY][tmpX] = curX;\n                      parentY3[curY][tmpX] = curY;\n                      HV[curY][tmpX] = FALSE;\n                 
     pq1.push(&(d1[curY][tmpX]));\n                  }\n                  else */\n                  if (d1[curY][tmpX] >\n                      tmp) // left neighbor been put into heap1 but needs update\n                  {\n                    d1[curY][tmpX]       = tmp;\n                    parentX3[curY][tmpX] = curX;\n                    parentY3[curY][tmpX] = curY;\n                    HV[curY][tmpX]       = FALSE;\n                    pq1.push({&(d1[curY][tmpX]), tmp});\n                  }\n                }\n                // right\n                if (curX < regionX2) {\n                  grid = curY * (xGrid - 1) + curX;\n\n                  // printf(\"grid: %d usage: %d red:%d last:%d sum%f %d\\n\",grid,\n                  // h_edges[grid].usage.load(), h_edges[grid].red,\n                  // h_edges[grid].last_usage, L , h_edges[grid].usage.load() +\n                  // h_edges[grid].red + (int)(L*h_edges[grid].last_usage));\n                  if ((preY == curY) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  } else {\n                    if (curX > regionX1 + 1) {\n                      tmp_grid = curY * (xGrid - 1) + curX - 1;\n                      tmp_cost =\n                          d1[curY][curX - 1] +\n                          h_costTable[h_edges[tmp_grid].usage +\n                                      h_edges[tmp_grid].red +\n                                      (int)(L * h_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA) {\n                        hyperH[curY][curX] = TRUE;\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * 
h_edges[grid].last_usage)];\n                  }\n                  // if(LOCK) h_edges[grid].releaseLock();\n                  tmpX = curX + 1; // the right neighbor\n\n                  /*if(d1[curY][tmpX]>=BIG_INT) // right neighbor not been put\n                  into heap1\n                  {\n                      d1[curY][tmpX] = tmp;\n                      parentX3[curY][tmpX] = curX;\n                      parentY3[curY][tmpX] = curY;\n                      HV[curY][tmpX] = FALSE;\n                      pq1.push(&(d1[curY][tmpX]));\n\n                  }\n                  else */\n                  if (d1[curY][tmpX] > tmp) // right neighbor been put into\n                                            // heap1 but needs update\n                  {\n                    d1[curY][tmpX]       = tmp;\n                    parentX3[curY][tmpX] = curX;\n                    parentY3[curY][tmpX] = curY;\n                    HV[curY][tmpX]       = FALSE;\n                    pq1.push({&(d1[curY][tmpX]), tmp});\n                  }\n                }\n                // bottom\n                if (curY > regionY1) {\n                  grid = (curY - 1) * xGrid + curX;\n\n                  // printf(\"grid: %d usage: %d red:%d last:%d sum%f %d\\n\",grid,\n                  // v_edges[grid].usage.load(), v_edges[grid].red,\n                  // v_edges[grid].last_usage, L , v_edges[grid].usage.load() +\n                  // v_edges[grid].red + (int)(L*v_edges[grid].last_usage));\n                  if ((preX == curX) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  } else {\n                    if (curY < regionY2 - 1) {\n                      tmp_grid = curY * xGrid + curX;\n                      tmp_cost =\n                          d1[curY + 1][curX] +\n                          
v_costTable[v_edges[tmp_grid].usage +\n                                      v_edges[tmp_grid].red +\n                                      (int)(L * v_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA) {\n                        hyperV[curY][curX] = TRUE;\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  }\n                  // if(LOCK) v_edges[grid].releaseLock();\n                  tmpY = curY - 1; // the bottom neighbor\n\n                  /*if(d1[tmpY][curX]>=BIG_INT) // bottom neighbor not been put\n                  into heap1\n                  {\n                      d1[tmpY][curX] = tmp;\n                      parentX1[tmpY][curX] = curX;\n                      parentY1[tmpY][curX] = curY;\n                      HV[tmpY][curX] = TRUE;\n                      pq1.push(&(d1[tmpY][curX]));\n\n                  }\n                  else */\n                  if (d1[tmpY][curX] > tmp) // bottom neighbor been put into\n                                            // heap1 but needs update\n                  {\n                    d1[tmpY][curX]       = tmp;\n                    parentX1[tmpY][curX] = curX;\n                    parentY1[tmpY][curX] = curY;\n                    HV[tmpY][curX]       = TRUE;\n                    pq1.push({&(d1[tmpY][curX]), tmp});\n                  }\n                }\n                // top\n                if (curY < regionY2) {\n                  grid = curY * xGrid + curX;\n\n                  // printf(\"grid: %d usage: %d red:%d last:%d sum%f %d\\n\",grid,\n                  // v_edges[grid].usage.load(), v_edges[grid].red,\n                  // v_edges[grid].last_usage, L , v_edges[grid].usage.load() +\n                  // v_edges[grid].red + 
(int)(L*v_edges[grid].last_usage));\n                  if ((preX == curX) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  } else {\n                    if (curY > regionY1 + 1) {\n                      tmp_grid = (curY - 1) * xGrid + curX;\n                      tmp_cost =\n                          d1[curY - 1][curX] +\n                          v_costTable[v_edges[tmp_grid].usage +\n                                      v_edges[tmp_grid].red +\n                                      (int)(L * v_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA) {\n                        hyperV[curY][curX] = TRUE;\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  }\n                  // if(LOCK) v_edges[grid].releaseLock();\n                  tmpY = curY + 1; // the top neighbor\n\n                  /*if(d1[tmpY][curX]>=BIG_INT) // top neighbor not been put\n                  into heap1\n                  {\n                      d1[tmpY][curX] = tmp;\n                      parentX1[tmpY][curX] = curX;\n                      parentY1[tmpY][curX] = curY;\n                      HV[tmpY][curX] = TRUE;\n                      pq1.push(&(d1[tmpY][curX]));\n                  }\n                  else*/\n                  if (d1[tmpY][curX] >\n                      tmp) // top neighbor been put into heap1 but needs update\n                  {\n                    d1[tmpY][curX]       = tmp;\n                    parentX1[tmpY][curX] = curX;\n                    parentY1[tmpY][curX] = curY;\n                    HV[tmpY][curX]       = TRUE;\n                    
pq1.push({&(d1[tmpY][curX]), tmp});\n                  }\n                }\n\n                // update ind1 and ind2 for next loop, Michael: need to check if\n                // it is up-to-date value.\n                float d1_push;\n                do {\n                  ind1    = pq1.top().d1_p - &d1[0][0];\n                  d1_push = pq1.top().d1_push;\n                  pq1.pop();\n                  curX = ind1 % xGrid;\n                  curY = ind1 / xGrid;\n                } while (d1_push != d1[curY][curX]);\n              } // while loop\n\n              for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++)\n                pop_heap2[*ii] = FALSE;\n\n              crossX = ind1 % xGrid;\n              crossY = ind1 / xGrid;\n\n              cnt  = 0;\n              curX = crossX;\n              curY = crossY;\n              while (d1[curY][curX] != 0) // loop until reach subtree1\n              {\n                hypered = FALSE;\n                if (cnt != 0) {\n                  if (curX != tmpX && hyperH[curY][curX]) {\n                    curX    = 2 * curX - tmpX;\n                    hypered = TRUE;\n                  }\n                  // printf(\"hyperV[153][134]: %d\\n\", hyperV[curY][curX]);\n                  if (curY != tmpY && hyperV[curY][curX]) {\n                    curY    = 2 * curY - tmpY;\n                    hypered = TRUE;\n                  }\n                }\n                tmpX = curX;\n                tmpY = curY;\n                if (!hypered) {\n                  if (HV[tmpY][tmpX]) {\n                    curY = parentY1[tmpY][tmpX];\n                  } else {\n                    curX = parentX3[tmpY][tmpX];\n                  }\n                }\n\n                tmp_gridsX[cnt] = curX;\n                tmp_gridsY[cnt] = curY;\n                cnt++;\n              }\n              // reverse the grids on the path\n\n              for (i = 0; i < cnt; i++) {\n                tmpind    = cnt - 1 - i;\n    
            gridsX[i] = tmp_gridsX[tmpind];\n                gridsY[i] = tmp_gridsY[tmpind];\n              }\n              // add the connection point (crossX, crossY)\n              gridsX[cnt] = crossX;\n              gridsY[cnt] = crossY;\n              cnt++;\n\n              curX     = crossX;\n              curY     = crossY;\n              cnt_n1n2 = cnt;\n\n              // change the tree structure according to the new routing for the\n              // tree edge find E1 and E2, and the endpoints of the edges they\n              // are on\n              E1x = gridsX[0];\n              E1y = gridsY[0];\n              E2x = gridsX[cnt_n1n2 - 1];\n              E2y = gridsY[cnt_n1n2 - 1];\n\n              edge_n1n2 = edgeID;\n              // if(netID == 252163 && edgeID == 51)\n              //    printf(\"E1x: %d, E1y: %d, E2x: %d, E2y %d length: %d\\n\",\n              //    E1x, E1y, E2x, E2y, cnt_n1n2);\n\n              // (1) consider subtree1\n              if (n1 >= deg && (E1x != n1x || E1y != n1y))\n              // n1 is not a pin and E1!=n1, then make change to subtree1,\n              // otherwise, no change to subtree1\n              {\n                // find the endpoints of the edge E1 is on\n                endpt1 = treeedges[corrEdge[E1y][E1x]].n1;\n                endpt2 = treeedges[corrEdge[E1y][E1x]].n2;\n\n                // find A1, A2 and edge_n1A1, edge_n1A2\n                if (treenodes[n1].nbr[0] == n2) {\n                  A1        = treenodes[n1].nbr[1];\n                  A2        = treenodes[n1].nbr[2];\n                  edge_n1A1 = treenodes[n1].edge[1];\n                  edge_n1A2 = treenodes[n1].edge[2];\n                } else if (treenodes[n1].nbr[1] == n2) {\n                  A1        = treenodes[n1].nbr[0];\n                  A2        = treenodes[n1].nbr[2];\n                  edge_n1A1 = treenodes[n1].edge[0];\n                  edge_n1A2 = treenodes[n1].edge[2];\n                } else {\n                  A1  
      = treenodes[n1].nbr[0];\n                  A2        = treenodes[n1].nbr[1];\n                  edge_n1A1 = treenodes[n1].edge[0];\n                  edge_n1A2 = treenodes[n1].edge[1];\n                }\n\n                if (endpt1 == n1 ||\n                    endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n                {\n                  // if E1 is on (n1, A2), switch A1 and A2 so that E1 is always\n                  // on (n1, A1)\n                  if (endpt1 == A2 || endpt2 == A2) {\n                    tmpi      = A1;\n                    A1        = A2;\n                    A2        = tmpi;\n                    tmpi      = edge_n1A1;\n                    edge_n1A1 = edge_n1A2;\n                    edge_n1A2 = tmpi;\n                  }\n\n                  // update route for edge (n1, A1), (n1, A2)\n                  updateRouteType1(treenodes, n1, A1, A2, E1x, E1y, treeedges,\n                                   edge_n1A1, edge_n1A2);\n                  // update position for n1\n                  treenodes[n1].x = E1x;\n                  treenodes[n1].y = E1y;\n                }    // if E1 is on (n1, A1) or (n1, A2)\n                else // E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n                {\n                  C1        = endpt1;\n                  C2        = endpt2;\n                  edge_C1C2 = corrEdge[E1y][E1x];\n\n                  // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n                  updateRouteType2(treenodes, n1, A1, A2, C1, C2, E1x, E1y,\n                                   treeedges, edge_n1A1, edge_n1A2, edge_C1C2);\n                  // update position for n1\n                  treenodes[n1].x = E1x;\n                  treenodes[n1].y = E1y;\n                  // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2), (C1,\n                  // C2)->(A1, A2)\n                  edge_n1C1               = edge_n1A1;\n                  treeedges[edge_n1C1].n1 = C1;\n                  
treeedges[edge_n1C1].n2 = n1;\n                  edge_n1C2               = edge_n1A2;\n                  treeedges[edge_n1C2].n1 = n1;\n                  treeedges[edge_n1C2].n2 = C2;\n                  edge_A1A2               = edge_C1C2;\n                  treeedges[edge_A1A2].n1 = A1;\n                  treeedges[edge_A1A2].n2 = A2;\n                  // update nbr and edge for 5 nodes n1, A1, A2, C1, C2\n                  // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n                  treenodes[n1].nbr[0]  = n2;\n                  treenodes[n1].edge[0] = edge_n1n2;\n                  treenodes[n1].nbr[1]  = C1;\n                  treenodes[n1].edge[1] = edge_n1C1;\n                  treenodes[n1].nbr[2]  = C2;\n                  treenodes[n1].edge[2] = edge_n1C2;\n                  // A1's nbr n1->A2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[A1].nbr[i] == n1) {\n                      treenodes[A1].nbr[i]  = A2;\n                      treenodes[A1].edge[i] = edge_A1A2;\n                      break;\n                    }\n                  }\n                  // A2's nbr n1->A1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[A2].nbr[i] == n1) {\n                      treenodes[A2].nbr[i]  = A1;\n                      treenodes[A2].edge[i] = edge_A1A2;\n                      break;\n                    }\n                  }\n                  // C1's nbr C2->n1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[C1].nbr[i] == C2) {\n                      treenodes[C1].nbr[i]  = n1;\n                      treenodes[C1].edge[i] = edge_n1C1;\n                      break;\n                    }\n                  }\n                  // C2's nbr C1->n1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[C2].nbr[i] == C1) {\n                      treenodes[C2].nbr[i]  = n1;\n                      treenodes[C2].edge[i] = edge_n1C2;\n                 
     break;\n                    }\n                  }\n\n                } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n              }   // n1 is not a pin and E1!=n1\n\n              // (2) consider subtree2\n\n              if (n2 >= deg && (E2x != n2x || E2y != n2y))\n              // n2 is not a pin and E2!=n2, then make change to subtree2,\n              // otherwise, no change to subtree2\n              {\n                // find the endpoints of the edge E1 is on\n                endpt1 = treeedges[corrEdge[E2y][E2x]].n1;\n                endpt2 = treeedges[corrEdge[E2y][E2x]].n2;\n\n                // find B1, B2\n                if (treenodes[n2].nbr[0] == n1) {\n                  B1        = treenodes[n2].nbr[1];\n                  B2        = treenodes[n2].nbr[2];\n                  edge_n2B1 = treenodes[n2].edge[1];\n                  edge_n2B2 = treenodes[n2].edge[2];\n                } else if (treenodes[n2].nbr[1] == n1) {\n                  B1        = treenodes[n2].nbr[0];\n                  B2        = treenodes[n2].nbr[2];\n                  edge_n2B1 = treenodes[n2].edge[0];\n                  edge_n2B2 = treenodes[n2].edge[2];\n                } else {\n                  B1        = treenodes[n2].nbr[0];\n                  B2        = treenodes[n2].nbr[1];\n                  edge_n2B1 = treenodes[n2].edge[0];\n                  edge_n2B2 = treenodes[n2].edge[1];\n                }\n\n                if (endpt1 == n2 ||\n                    endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n                {\n                  // if E2 is on (n2, B2), switch B1 and B2 so that E2 is always\n                  // on (n2, B1)\n                  if (endpt1 == B2 || endpt2 == B2) {\n                    tmpi      = B1;\n                    B1        = B2;\n                    B2        = tmpi;\n                    tmpi      = edge_n2B1;\n                    edge_n2B1 = edge_n2B2;\n                    edge_n2B2 = tmpi;\n                  
}\n\n                  // update route for edge (n2, B1), (n2, B2)\n                  updateRouteType1(treenodes, n2, B1, B2, E2x, E2y, treeedges,\n                                   edge_n2B1, edge_n2B2);\n\n                  // update position for n2\n                  treenodes[n2].x = E2x;\n                  treenodes[n2].y = E2y;\n                }    // if E2 is on (n2, B1) or (n2, B2)\n                else // E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n                {\n                  D1        = endpt1;\n                  D2        = endpt2;\n                  edge_D1D2 = corrEdge[E2y][E2x];\n\n                  // update route for edge (n2, D1), (n2, D2) and (B1, B2)\n                  updateRouteType2(treenodes, n2, B1, B2, D1, D2, E2x, E2y,\n                                   treeedges, edge_n2B1, edge_n2B2, edge_D1D2);\n                  // update position for n2\n                  treenodes[n2].x = E2x;\n                  treenodes[n2].y = E2y;\n                  // update 3 edges (n2, B1)->(D1, n2), (n2, B2)->(n2, D2), (D1,\n                  // D2)->(B1, B2)\n                  edge_n2D1               = edge_n2B1;\n                  treeedges[edge_n2D1].n1 = D1;\n                  treeedges[edge_n2D1].n2 = n2;\n                  edge_n2D2               = edge_n2B2;\n                  treeedges[edge_n2D2].n1 = n2;\n                  treeedges[edge_n2D2].n2 = D2;\n                  edge_B1B2               = edge_D1D2;\n                  treeedges[edge_B1B2].n1 = B1;\n                  treeedges[edge_B1B2].n2 = B2;\n                  // update nbr and edge for 5 nodes n2, B1, B2, D1, D2\n                  // n1's nbr (n1, B1, B2)->(n1, D1, D2)\n                  treenodes[n2].nbr[0]  = n1;\n                  treenodes[n2].edge[0] = edge_n1n2;\n                  treenodes[n2].nbr[1]  = D1;\n                  treenodes[n2].edge[1] = edge_n2D1;\n                  treenodes[n2].nbr[2]  = D2;\n                  treenodes[n2].edge[2] = edge_n2D2;\n 
                 // B1's nbr n2->B2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[B1].nbr[i] == n2) {\n                      treenodes[B1].nbr[i]  = B2;\n                      treenodes[B1].edge[i] = edge_B1B2;\n                      break;\n                    }\n                  }\n                  // B2's nbr n2->B1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[B2].nbr[i] == n2) {\n                      treenodes[B2].nbr[i]  = B1;\n                      treenodes[B2].edge[i] = edge_B1B2;\n                      break;\n                    }\n                  }\n                  // D1's nbr D2->n2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[D1].nbr[i] == D2) {\n                      treenodes[D1].nbr[i]  = n2;\n                      treenodes[D1].edge[i] = edge_n2D1;\n                      break;\n                    }\n                  }\n                  // D2's nbr D1->n2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[D2].nbr[i] == D1) {\n                      treenodes[D2].nbr[i]  = n2;\n                      treenodes[D2].edge[i] = edge_n2D2;\n                      break;\n                    }\n                  }\n                } // else E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n              }   // n2 is not a pin and E2!=n2\n\n              // update route for edge (n1, n2) and edge usage\n\n              // printf(\"update route? 
%d %d\\n\", netID, num_edges);\n              if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n                free(treeedges[edge_n1n2].route.gridsX);\n                free(treeedges[edge_n1n2].route.gridsY);\n              }\n              treeedges[edge_n1n2].route.gridsX =\n                  (short*)calloc(cnt_n1n2, sizeof(short));\n              treeedges[edge_n1n2].route.gridsY =\n                  (short*)calloc(cnt_n1n2, sizeof(short));\n              treeedges[edge_n1n2].route.type     = MAZEROUTE;\n              treeedges[edge_n1n2].route.routelen = cnt_n1n2 - 1;\n              treeedges[edge_n1n2].len = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n              treeedges[edge_n1n2].n_ripups += 1;\n              total_ripups += 1;\n              max_ripups.update(treeedges[edge_n1n2].n_ripups);\n\n              for (i = 0; i < cnt_n1n2; i++) {\n                // printf(\"cnt_n1n2: %d\\n\", cnt_n1n2);\n                treeedges[edge_n1n2].route.gridsX[i] = gridsX[i];\n                treeedges[edge_n1n2].route.gridsY[i] = gridsY[i];\n              }\n\n              // update edge usage\n\n              for (i = 0; i < cnt_n1n2 - 1; i++) {\n                if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n                {\n                  min_y = min(gridsY[i], gridsY[i + 1]);\n                  // v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n                  // galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage,\n                  // (short unsigned)1);\n                  v_edges[min_y * xGrid + gridsX[i]].usage.fetch_add(\n                      (short int)1);\n                } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                {\n                  min_x = min(gridsX[i], gridsX[i + 1]);\n                  // h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n                  // galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n                  // (short unsigned)1);\n                  h_edges[gridsY[i] * (xGrid - 1) + 
min_x].usage.fetch_add(\n                      (short int)1);\n                }\n              }\n              /*if(LOCK){\n                  for(i=0; i<cnt_n1n2-1; i++)\n                  {\n                      if(gridsX[i]==gridsX[i+1]) // a vertical edge\n                      {\n                          min_y = min(gridsY[i], gridsY[i+1]);\n                          v_edges[min_y*xGrid+gridsX[i]].releaseLock();\n                      }\n                      else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                      {\n                          min_x = min(gridsX[i], gridsX[i+1]);\n                          h_edges[gridsY[i]*(xGrid-1)+min_x].releaseLock();\n                      }\n                  }\n              }*/\n              if (checkRoute2DTree(netID)) {\n                reInitTree(netID);\n                return;\n              }\n            } // congested route\n          }   // maze routing\n        }     // loop edgeID\n      },\n      // galois::wl<galois::worklists::ParaMeter<>>(),\n      galois::steal(),\n      // galois::chunk_size<32>(),\n      galois::loopname(\"maze routing\")); // galois::do_all\n\n  printf(\"total ripups: %d max ripups: %d\\n\", total_ripups.reduce(),\n         max_ripups.reduce());\n  //}, \"mazeroute vtune function\");\n  free(h_costTable);\n  free(v_costTable);\n}\n\nvoid mazeRouteMSMD_M1M2(int iter, int expand, float costHeight,\n                        int ripup_threshold, int mazeedge_Threshold,\n                        Bool Ordering, int cost_type) {\n  // LOCK = 0;\n  float forange;\n\n  // allocate memory for distance and parent and pop_heap\n  h_costTable = (float*)calloc(40 * hCapacity, sizeof(float));\n  v_costTable = (float*)calloc(40 * vCapacity, sizeof(float));\n\n  forange = 40 * hCapacity;\n\n  if (cost_type == 2) {\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity - 1)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * 
LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity - 1)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i - 1) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  } else {\n\n    for (int i = 0; i < forange; i++) {\n      if (i < hCapacity)\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        h_costTable[i] =\n            costHeight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - hCapacity);\n    }\n    forange = 40 * vCapacity;\n    for (int i = 0; i < forange; i++) {\n      if (i < vCapacity)\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1;\n      else\n        v_costTable[i] =\n            costHeight / (exp((float)(vCapacity - i) * LOGIS_COF) + 1) + 1 +\n            costHeight / slope * (i - vCapacity);\n    }\n  }\n\n  /*forange = yGrid*xGrid;\n  for(i=0; i<forange; i++)\n  {\n      pop_heap2[i] = FALSE;\n  } //Michael*/\n\n  if (Ordering) {\n    StNetOrder();\n    // printf(\"order?\\n\");\n  }\n\n  galois::substrate::PerThreadStorage<THREAD_LOCAL_STORAGE>\n      thread_local_storage{};\n  // for(nidRPC=0; nidRPC<numValidNets; nidRPC++)//parallelize\n  PerThread_PQ perthread_pq;\n  PerThread_Vec perthread_vec;\n  PRINT = 0;\n  galois::GAccumulator<int> total_ripups;\n  galois::GReduceMax<int> max_ripups;\n  total_ripups.reset();\n  max_ripups.reset();\n\n  // galois::runtime::profileVtune( [&] (void) {\n  /*std::random_device rd;\n  std::mt19937 g(rd());\n  
std::shuffle(net_shuffle.begin(), net_shuffle.end(), g);\n\n  galois::do_all(galois::iterate(net_shuffle), */\n  // galois::for_each(galois::iterate(0, numValidNets),\n  //        [&] (const auto nidRPC, auto& ctx)\n  galois::do_all(\n      galois::iterate(0, numValidNets),\n      [&](const auto nidRPC) {\n        int grid, netID;\n\n        // maze routing for multi-source, multi-destination\n        bool hypered, enter;\n        int i, j, deg, edgeID, n1, n2, n1x, n1y, n2x, n2y, ymin, ymax, xmin,\n            xmax, curX, curY, crossX, crossY, tmpX, tmpY, tmpi, min_x, min_y,\n            num_edges;\n        int regionX1, regionX2, regionY1, regionY2;\n        int ind1, tmpind, gridsX[XRANGE], gridsY[YRANGE], tmp_gridsX[XRANGE],\n            tmp_gridsY[YRANGE];\n        int endpt1, endpt2, A1, A2, B1, B2, C1, C2, D1, D2, cnt, cnt_n1n2;\n        int edge_n1n2, edge_n1A1, edge_n1A2, edge_n1C1, edge_n1C2, edge_A1A2,\n            edge_C1C2;\n        int edge_n2B1, edge_n2B2, edge_n2D1, edge_n2D2, edge_B1B2, edge_D1D2;\n        int E1x, E1y, E2x, E2y;\n        int tmp_grid;\n        int preX, preY, origENG, edgeREC;\n\n        float tmp, tmp_cost;\n        TreeEdge *treeedges, *treeedge;\n        TreeNode* treenodes;\n\n        bool* pop_heap2 = thread_local_storage.getLocal()->pop_heap2;\n\n        float** d1    = thread_local_storage.getLocal()->d1_p;\n        bool** HV     = thread_local_storage.getLocal()->HV_p;\n        bool** hyperV = thread_local_storage.getLocal()->hyperV_p;\n        bool** hyperH = thread_local_storage.getLocal()->hyperH_p;\n\n        short** parentX1 = thread_local_storage.getLocal()->parentX1_p;\n        short** parentX3 = thread_local_storage.getLocal()->parentX3_p;\n        short** parentY1 = thread_local_storage.getLocal()->parentY1_p;\n        short** parentY3 = thread_local_storage.getLocal()->parentY3_p;\n\n        int** corrEdge = thread_local_storage.getLocal()->corrEdge_p;\n\n        OrderNetEdge* netEO = 
thread_local_storage.getLocal()->netEO_p;\n\n        bool** inRegion = thread_local_storage.getLocal()->inRegion_p;\n        // bool* inRegion_alloc =\n        // thread_local_storage.getLocal()->inRegion_alloc;\n\n        local_pq pq1 = perthread_pq.get();\n        local_vec v2 = perthread_vec.get();\n\n        /*for(i=0; i<yGrid*xGrid; i++)\n        {\n            pop_heap2[i] = FALSE;\n        } */\n\n        // memset(inRegion_alloc, 0, xGrid * yGrid * sizeof(bool));\n        /*for(int i=0; i<yGrid; i++)\n        {\n            for(int j=0; j<xGrid; j++)\n                inRegion[i][j] = FALSE;\n        }*/\n        // printf(\"hyperV[153][134]: %d %d %d\\n\", hyperV[153][134],\n        // parentY1[153][134], parentX3[153][134]); printf(\"what is\n        // happening?\\n\");\n\n        if (Ordering) {\n          netID = treeOrderCong[nidRPC].treeIndex;\n        } else {\n          netID = nidRPC;\n        }\n\n        deg = sttrees[netID].deg;\n\n        origENG = expand;\n\n        netedgeOrderDec(netID, netEO);\n\n        treeedges = sttrees[netID].edges;\n        treenodes = sttrees[netID].nodes;\n        // loop for all the tree edges (2*deg-3)\n        num_edges = 2 * deg - 3;\n\n        for (edgeREC = 0; edgeREC < num_edges; edgeREC++) {\n          edgeID   = netEO[edgeREC].edgeID;\n          treeedge = &(treeedges[edgeID]);\n\n          n1            = treeedge->n1;\n          n2            = treeedge->n2;\n          n1x           = treenodes[n1].x;\n          n1y           = treenodes[n1].y;\n          n2x           = treenodes[n2].x;\n          n2y           = treenodes[n2].y;\n          treeedge->len = ADIFF(n2x, n1x) + ADIFF(n2y, n1y);\n\n          if (treeedge->len >\n              mazeedge_Threshold) // only route the non-degraded edges (len>0)\n          {\n\n            // enter = newRipupCheck_nosub(treeedge, n1x, n1y, n2x, n2y,\n            // ripup_threshold, netID, edgeID);\n            enter =\n                newRipupCheck_lock(treeedge, 
ripup_threshold, netID, edgeID);\n            // enter = newRipupCheck_atomic(treeedge, n1x, n1y, n2x, n2y,\n            // ripup_threshold, netID, edgeID);\n\n            // ripup the routing for the edge\n            if (enter) {\n\n              // if(netID == 252163 && edgeID == 51)\n              //    printf(\"netID %d edgeID %d src %d %d dst %d %d\\n\", netID,\n              //    edgeID, n1x, n1y, n2x, n2y);\n              if (n1y <= n2y) {\n                ymin = n1y;\n                ymax = n2y;\n              } else {\n                ymin = n2y;\n                ymax = n1y;\n              }\n\n              if (n1x <= n2x) {\n                xmin = n1x;\n                xmax = n2x;\n              } else {\n                xmin = n2x;\n                xmax = n1x;\n              }\n\n              int enlarge =\n                  min(origENG,\n                      (iter / 6 + 3) *\n                          treeedge->route\n                              .routelen); // michael, this was global variable\n              regionX1 = max(0, xmin - enlarge);\n              regionX2 = min(xGrid - 1, xmax + enlarge);\n              regionY1 = max(0, ymin - enlarge);\n              regionY2 = min(yGrid - 1, ymax + enlarge);\n\n              // initialize d1[][] and d2[][] as BIG_INT\n              for (i = regionY1; i <= regionY2; i++) {\n                for (j = regionX1; j <= regionX2; j++) {\n                  d1[i][j] = BIG_INT;\n                  /*d2[i][j] = BIG_INT;\n                  hyperH[i][j] = FALSE;\n                  hyperV[i][j] = FALSE;*/\n                }\n              }\n              // memset(hyperH, 0, xGrid * yGrid * sizeof(bool));\n              // memset(hyperV, 0, xGrid * yGrid * sizeof(bool));\n              for (i = regionY1; i <= regionY2; i++) {\n                for (j = regionX1; j <= regionX2; j++) {\n                  hyperH[i][j] = FALSE;\n                }\n              }\n              for (i = regionY1; i <= regionY2; i++) 
{\n                for (j = regionX1; j <= regionX2; j++) {\n                  hyperV[i][j] = FALSE;\n                }\n              }\n              // TODO: use seperate loops\n\n              // setup heap1, heap2 and initialize d1[][] and d2[][] for all the\n              // grids on the two subtrees\n              setupHeap(netID, edgeID, pq1, v2, regionX1, regionX2, regionY1,\n                        regionY2, d1, corrEdge, inRegion);\n              // TODO: use std priority queue\n              // while loop to find shortest path\n              ind1 = (pq1.top().d1_p - &d1[0][0]);\n              pq1.pop();\n              curX = ind1 % xGrid;\n              curY = ind1 / xGrid;\n\n              for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++) {\n                pop_heap2[*ii] = TRUE;\n              }\n              float curr_d1;\n              while (pop_heap2[ind1] ==\n                     FALSE) // stop until the grid position been popped out from\n                            // both heap1 and heap2\n              {\n                // relax all the adjacent grids within the enlarged region for\n                // source subtree\n\n                // if(PRINT) printf(\"curX curY %d %d, (%d, %d), (%d, %d),\n                // pq1.size: %d\\n\", curX, curY, regionX1, regionX2, regionY1,\n                // regionY2, pq1.size()); if(curX == 102 && curY == 221)\n                // exit(1);\n                curr_d1 = d1[curY][curX];\n                if (curr_d1 != 0) {\n                  if (HV[curY][curX]) {\n                    preX = parentX1[curY][curX];\n                    preY = parentY1[curY][curX];\n                  } else {\n                    preX = parentX3[curY][curX];\n                    preY = parentY3[curY][curX];\n                  }\n                } else {\n                  preX = curX;\n                  preY = curY;\n                }\n\n                // left\n                if (curX > regionX1) {\n                  
grid = curY * (xGrid - 1) + curX - 1;\n                  // printf(\"grid: %d usage: %d red:%d last:%d sum%f %d\\n\",grid,\n                  // h_edges[grid].usage.load(), h_edges[grid].red,\n                  // h_edges[grid].last_usage, L , h_edges[grid].usage.load() +\n                  // h_edges[grid].red + (int)(L*h_edges[grid].last_usage));\n                  if ((preY == curY) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  } else {\n                    if (curX < regionX2 - 1) {\n                      tmp_grid = curY * (xGrid - 1) + curX;\n                      tmp_cost =\n                          d1[curY][curX + 1] +\n                          h_costTable[h_edges[tmp_grid].usage +\n                                      h_edges[tmp_grid].red +\n                                      (int)(L * h_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA) {\n                        hyperH[curY][curX] = TRUE; // Michael\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  }\n                  // if(LOCK)  h_edges[grid].releaseLock();\n                  tmpX = curX - 1; // the left neighbor\n\n                  /*if(d1[curY][tmpX]>=BIG_INT) // left neighbor not been put\n                  into heap1\n                  {\n                      d1[curY][tmpX] = tmp;\n                      parentX3[curY][tmpX] = curX;\n                      parentY3[curY][tmpX] = curY;\n                      HV[curY][tmpX] = FALSE;\n                      pq1.push(&(d1[curY][tmpX]));\n                  }\n                  else */\n                  if 
(d1[curY][tmpX] >\n                      tmp) // left neighbor been put into heap1 but needs update\n                  {\n                    d1[curY][tmpX]       = tmp;\n                    parentX3[curY][tmpX] = curX;\n                    parentY3[curY][tmpX] = curY;\n                    HV[curY][tmpX]       = FALSE;\n                    pq1.push({&(d1[curY][tmpX]), tmp});\n                  }\n                }\n                // right\n                if (curX < regionX2) {\n                  grid = curY * (xGrid - 1) + curX;\n\n                  // printf(\"grid: %d usage: %d red:%d last:%d sum%f %d\\n\",grid,\n                  // h_edges[grid].usage.load(), h_edges[grid].red,\n                  // h_edges[grid].last_usage, L , h_edges[grid].usage.load() +\n                  // h_edges[grid].red + (int)(L*h_edges[grid].last_usage));\n                  if ((preY == curY) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  } else {\n                    if (curX > regionX1 + 1) {\n                      tmp_grid = curY * (xGrid - 1) + curX - 1;\n                      tmp_cost =\n                          d1[curY][curX - 1] +\n                          h_costTable[h_edges[tmp_grid].usage +\n                                      h_edges[tmp_grid].red +\n                                      (int)(L * h_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA) {\n                        hyperH[curY][curX] = TRUE;\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          h_costTable[h_edges[grid].usage + h_edges[grid].red +\n                                      (int)(L * h_edges[grid].last_usage)];\n                  }\n                  // if(LOCK) h_edges[grid].releaseLock();\n                  
tmpX = curX + 1; // the right neighbor\n\n                  /*if(d1[curY][tmpX]>=BIG_INT) // right neighbor not been put\n                  into heap1\n                  {\n                      d1[curY][tmpX] = tmp;\n                      parentX3[curY][tmpX] = curX;\n                      parentY3[curY][tmpX] = curY;\n                      HV[curY][tmpX] = FALSE;\n                      pq1.push(&(d1[curY][tmpX]));\n\n                  }\n                  else */\n                  if (d1[curY][tmpX] > tmp) // right neighbor been put into\n                                            // heap1 but needs update\n                  {\n                    d1[curY][tmpX]       = tmp;\n                    parentX3[curY][tmpX] = curX;\n                    parentY3[curY][tmpX] = curY;\n                    HV[curY][tmpX]       = FALSE;\n                    pq1.push({&(d1[curY][tmpX]), tmp});\n                  }\n                }\n                // bottom\n                if (curY > regionY1) {\n                  grid = (curY - 1) * xGrid + curX;\n\n                  // printf(\"grid: %d usage: %d red:%d last:%d sum%f %d\\n\",grid,\n                  // v_edges[grid].usage.load(), v_edges[grid].red,\n                  // v_edges[grid].last_usage, L , v_edges[grid].usage.load() +\n                  // v_edges[grid].red + (int)(L*v_edges[grid].last_usage));\n                  if ((preX == curX) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  } else {\n                    if (curY < regionY2 - 1) {\n                      tmp_grid = curY * xGrid + curX;\n                      tmp_cost =\n                          d1[curY + 1][curX] +\n                          v_costTable[v_edges[tmp_grid].usage +\n                                      v_edges[tmp_grid].red +\n                                 
     (int)(L * v_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA) {\n                        hyperV[curY][curX] = TRUE;\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  }\n                  // if(LOCK) v_edges[grid].releaseLock();\n                  tmpY = curY - 1; // the bottom neighbor\n\n                  /*if(d1[tmpY][curX]>=BIG_INT) // bottom neighbor not been put\n                  into heap1\n                  {\n                      d1[tmpY][curX] = tmp;\n                      parentX1[tmpY][curX] = curX;\n                      parentY1[tmpY][curX] = curY;\n                      HV[tmpY][curX] = TRUE;\n                      pq1.push(&(d1[tmpY][curX]));\n\n                  }\n                  else */\n                  if (d1[tmpY][curX] > tmp) // bottom neighbor been put into\n                                            // heap1 but needs update\n                  {\n                    d1[tmpY][curX]       = tmp;\n                    parentX1[tmpY][curX] = curX;\n                    parentY1[tmpY][curX] = curY;\n                    HV[tmpY][curX]       = TRUE;\n                    pq1.push({&(d1[tmpY][curX]), tmp});\n                  }\n                }\n                // top\n                if (curY < regionY2) {\n                  grid = curY * xGrid + curX;\n\n                  // printf(\"grid: %d usage: %d red:%d last:%d sum%f %d\\n\",grid,\n                  // v_edges[grid].usage.load(), v_edges[grid].red,\n                  // v_edges[grid].last_usage, L , v_edges[grid].usage.load() +\n                  // v_edges[grid].red + (int)(L*v_edges[grid].last_usage));\n                  if ((preX == curX) || (curr_d1 == 0)) {\n                    tmp = curr_d1 +\n                          
v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  } else {\n                    if (curY > regionY1 + 1) {\n                      tmp_grid = (curY - 1) * xGrid + curX;\n                      tmp_cost =\n                          d1[curY - 1][curX] +\n                          v_costTable[v_edges[tmp_grid].usage +\n                                      v_edges[tmp_grid].red +\n                                      (int)(L * v_edges[tmp_grid].last_usage)];\n\n                      if (tmp_cost < curr_d1 + VIA) {\n                        hyperV[curY][curX] = TRUE;\n                      }\n                    }\n                    tmp = curr_d1 + VIA +\n                          v_costTable[v_edges[grid].usage + v_edges[grid].red +\n                                      (int)(L * v_edges[grid].last_usage)];\n                  }\n                  // if(LOCK) v_edges[grid].releaseLock();\n                  tmpY = curY + 1; // the top neighbor\n\n                  /*if(d1[tmpY][curX]>=BIG_INT) // top neighbor not been put\n                  into heap1\n                  {\n                      d1[tmpY][curX] = tmp;\n                      parentX1[tmpY][curX] = curX;\n                      parentY1[tmpY][curX] = curY;\n                      HV[tmpY][curX] = TRUE;\n                      pq1.push(&(d1[tmpY][curX]));\n                  }\n                  else*/\n                  if (d1[tmpY][curX] >\n                      tmp) // top neighbor been put into heap1 but needs update\n                  {\n                    d1[tmpY][curX]       = tmp;\n                    parentX1[tmpY][curX] = curX;\n                    parentY1[tmpY][curX] = curY;\n                    HV[tmpY][curX]       = TRUE;\n                    pq1.push({&(d1[tmpY][curX]), tmp});\n                  }\n                }\n\n                // update ind1 and ind2 for next loop, Michael: need to check 
if\n                // it is up-to-date value.\n                float d1_push;\n                do {\n                  ind1    = pq1.top().d1_p - &d1[0][0];\n                  d1_push = pq1.top().d1_push;\n                  pq1.pop();\n                  curX = ind1 % xGrid;\n                  curY = ind1 / xGrid;\n                } while (d1_push != d1[curY][curX]);\n              } // while loop\n\n              for (local_vec::iterator ii = v2.begin(); ii != v2.end(); ii++)\n                pop_heap2[*ii] = FALSE;\n\n              crossX = ind1 % xGrid;\n              crossY = ind1 / xGrid;\n\n              cnt  = 0;\n              curX = crossX;\n              curY = crossY;\n              while (d1[curY][curX] != 0) // loop until reach subtree1\n              {\n                hypered = FALSE;\n                if (cnt != 0) {\n                  if (curX != tmpX && hyperH[curY][curX]) {\n                    curX    = 2 * curX - tmpX;\n                    hypered = TRUE;\n                  }\n                  // printf(\"hyperV[153][134]: %d\\n\", hyperV[curY][curX]);\n                  if (curY != tmpY && hyperV[curY][curX]) {\n                    curY    = 2 * curY - tmpY;\n                    hypered = TRUE;\n                  }\n                }\n                tmpX = curX;\n                tmpY = curY;\n                if (!hypered) {\n                  if (HV[tmpY][tmpX]) {\n                    curY = parentY1[tmpY][tmpX];\n                  } else {\n                    curX = parentX3[tmpY][tmpX];\n                  }\n                }\n\n                tmp_gridsX[cnt] = curX;\n                tmp_gridsY[cnt] = curY;\n                cnt++;\n              }\n              // reverse the grids on the path\n\n              for (i = 0; i < cnt; i++) {\n                tmpind    = cnt - 1 - i;\n                gridsX[i] = tmp_gridsX[tmpind];\n                gridsY[i] = tmp_gridsY[tmpind];\n              }\n              // add the connection point 
(crossX, crossY)\n              gridsX[cnt] = crossX;\n              gridsY[cnt] = crossY;\n              cnt++;\n\n              curX     = crossX;\n              curY     = crossY;\n              cnt_n1n2 = cnt;\n\n              // change the tree structure according to the new routing for the\n              // tree edge find E1 and E2, and the endpoints of the edges they\n              // are on\n              E1x = gridsX[0];\n              E1y = gridsY[0];\n              E2x = gridsX[cnt_n1n2 - 1];\n              E2y = gridsY[cnt_n1n2 - 1];\n\n              edge_n1n2 = edgeID;\n              // if(netID == 252163 && edgeID == 51)\n              //    printf(\"E1x: %d, E1y: %d, E2x: %d, E2y %d length: %d\\n\",\n              //    E1x, E1y, E2x, E2y, cnt_n1n2);\n\n              // (1) consider subtree1\n              if (n1 >= deg && (E1x != n1x || E1y != n1y))\n              // n1 is not a pin and E1!=n1, then make change to subtree1,\n              // otherwise, no change to subtree1\n              {\n                // find the endpoints of the edge E1 is on\n                endpt1 = treeedges[corrEdge[E1y][E1x]].n1;\n                endpt2 = treeedges[corrEdge[E1y][E1x]].n2;\n\n                // find A1, A2 and edge_n1A1, edge_n1A2\n                if (treenodes[n1].nbr[0] == n2) {\n                  A1        = treenodes[n1].nbr[1];\n                  A2        = treenodes[n1].nbr[2];\n                  edge_n1A1 = treenodes[n1].edge[1];\n                  edge_n1A2 = treenodes[n1].edge[2];\n                } else if (treenodes[n1].nbr[1] == n2) {\n                  A1        = treenodes[n1].nbr[0];\n                  A2        = treenodes[n1].nbr[2];\n                  edge_n1A1 = treenodes[n1].edge[0];\n                  edge_n1A2 = treenodes[n1].edge[2];\n                } else {\n                  A1        = treenodes[n1].nbr[0];\n                  A2        = treenodes[n1].nbr[1];\n                  edge_n1A1 = treenodes[n1].edge[0];\n               
   edge_n1A2 = treenodes[n1].edge[1];\n                }\n\n                if (endpt1 == n1 ||\n                    endpt2 == n1) // E1 is on (n1, A1) or (n1, A2)\n                {\n                  // if E1 is on (n1, A2), switch A1 and A2 so that E1 is always\n                  // on (n1, A1)\n                  if (endpt1 == A2 || endpt2 == A2) {\n                    tmpi      = A1;\n                    A1        = A2;\n                    A2        = tmpi;\n                    tmpi      = edge_n1A1;\n                    edge_n1A1 = edge_n1A2;\n                    edge_n1A2 = tmpi;\n                  }\n\n                  // update route for edge (n1, A1), (n1, A2)\n                  updateRouteType1(treenodes, n1, A1, A2, E1x, E1y, treeedges,\n                                   edge_n1A1, edge_n1A2);\n                  // update position for n1\n                  treenodes[n1].x = E1x;\n                  treenodes[n1].y = E1y;\n                }    // if E1 is on (n1, A1) or (n1, A2)\n                else // E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n                {\n                  C1        = endpt1;\n                  C2        = endpt2;\n                  edge_C1C2 = corrEdge[E1y][E1x];\n\n                  // update route for edge (n1, C1), (n1, C2) and (A1, A2)\n                  updateRouteType2(treenodes, n1, A1, A2, C1, C2, E1x, E1y,\n                                   treeedges, edge_n1A1, edge_n1A2, edge_C1C2);\n                  // update position for n1\n                  treenodes[n1].x = E1x;\n                  treenodes[n1].y = E1y;\n                  // update 3 edges (n1, A1)->(C1, n1), (n1, A2)->(n1, C2), (C1,\n                  // C2)->(A1, A2)\n                  edge_n1C1               = edge_n1A1;\n                  treeedges[edge_n1C1].n1 = C1;\n                  treeedges[edge_n1C1].n2 = n1;\n                  edge_n1C2               = edge_n1A2;\n                  treeedges[edge_n1C2].n1 = n1;\n                  
treeedges[edge_n1C2].n2 = C2;\n                  edge_A1A2               = edge_C1C2;\n                  treeedges[edge_A1A2].n1 = A1;\n                  treeedges[edge_A1A2].n2 = A2;\n                  // update nbr and edge for 5 nodes n1, A1, A2, C1, C2\n                  // n1's nbr (n2, A1, A2)->(n2, C1, C2)\n                  treenodes[n1].nbr[0]  = n2;\n                  treenodes[n1].edge[0] = edge_n1n2;\n                  treenodes[n1].nbr[1]  = C1;\n                  treenodes[n1].edge[1] = edge_n1C1;\n                  treenodes[n1].nbr[2]  = C2;\n                  treenodes[n1].edge[2] = edge_n1C2;\n                  // A1's nbr n1->A2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[A1].nbr[i] == n1) {\n                      treenodes[A1].nbr[i]  = A2;\n                      treenodes[A1].edge[i] = edge_A1A2;\n                      break;\n                    }\n                  }\n                  // A2's nbr n1->A1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[A2].nbr[i] == n1) {\n                      treenodes[A2].nbr[i]  = A1;\n                      treenodes[A2].edge[i] = edge_A1A2;\n                      break;\n                    }\n                  }\n                  // C1's nbr C2->n1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[C1].nbr[i] == C2) {\n                      treenodes[C1].nbr[i]  = n1;\n                      treenodes[C1].edge[i] = edge_n1C1;\n                      break;\n                    }\n                  }\n                  // C2's nbr C1->n1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[C2].nbr[i] == C1) {\n                      treenodes[C2].nbr[i]  = n1;\n                      treenodes[C2].edge[i] = edge_n1C2;\n                      break;\n                    }\n                  }\n\n                } // else E1 is not on (n1, A1) or (n1, A2), but on (C1, C2)\n              }  
 // n1 is not a pin and E1!=n1\n\n              // (2) consider subtree2\n\n              if (n2 >= deg && (E2x != n2x || E2y != n2y))\n              // n2 is not a pin and E2!=n2, then make change to subtree2,\n              // otherwise, no change to subtree2\n              {\n                // find the endpoints of the edge E2 is on\n                endpt1 = treeedges[corrEdge[E2y][E2x]].n1;\n                endpt2 = treeedges[corrEdge[E2y][E2x]].n2;\n\n                // find B1, B2\n                if (treenodes[n2].nbr[0] == n1) {\n                  B1        = treenodes[n2].nbr[1];\n                  B2        = treenodes[n2].nbr[2];\n                  edge_n2B1 = treenodes[n2].edge[1];\n                  edge_n2B2 = treenodes[n2].edge[2];\n                } else if (treenodes[n2].nbr[1] == n1) {\n                  B1        = treenodes[n2].nbr[0];\n                  B2        = treenodes[n2].nbr[2];\n                  edge_n2B1 = treenodes[n2].edge[0];\n                  edge_n2B2 = treenodes[n2].edge[2];\n                } else {\n                  B1        = treenodes[n2].nbr[0];\n                  B2        = treenodes[n2].nbr[1];\n                  edge_n2B1 = treenodes[n2].edge[0];\n                  edge_n2B2 = treenodes[n2].edge[1];\n                }\n\n                if (endpt1 == n2 ||\n                    endpt2 == n2) // E2 is on (n2, B1) or (n2, B2)\n                {\n                  // if E2 is on (n2, B2), switch B1 and B2 so that E2 is always\n                  // on (n2, B1)\n                  if (endpt1 == B2 || endpt2 == B2) {\n                    tmpi      = B1;\n                    B1        = B2;\n                    B2        = tmpi;\n                    tmpi      = edge_n2B1;\n                    edge_n2B1 = edge_n2B2;\n                    edge_n2B2 = tmpi;\n                  }\n\n                  // update route for edge (n2, B1), (n2, B2)\n                  updateRouteType1(treenodes, n2, B1, B2, E2x, E2y, treeedges,\n       
                            edge_n2B1, edge_n2B2);\n\n                  // update position for n2\n                  treenodes[n2].x = E2x;\n                  treenodes[n2].y = E2y;\n                }    // if E2 is on (n2, B1) or (n2, B2)\n                else // E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n                {\n                  D1        = endpt1;\n                  D2        = endpt2;\n                  edge_D1D2 = corrEdge[E2y][E2x];\n\n                  // update route for edge (n2, D1), (n2, D2) and (B1, B2)\n                  updateRouteType2(treenodes, n2, B1, B2, D1, D2, E2x, E2y,\n                                   treeedges, edge_n2B1, edge_n2B2, edge_D1D2);\n                  // update position for n2\n                  treenodes[n2].x = E2x;\n                  treenodes[n2].y = E2y;\n                  // update 3 edges (n2, B1)->(D1, n2), (n2, B2)->(n2, D2), (D1,\n                  // D2)->(B1, B2)\n                  edge_n2D1               = edge_n2B1;\n                  treeedges[edge_n2D1].n1 = D1;\n                  treeedges[edge_n2D1].n2 = n2;\n                  edge_n2D2               = edge_n2B2;\n                  treeedges[edge_n2D2].n1 = n2;\n                  treeedges[edge_n2D2].n2 = D2;\n                  edge_B1B2               = edge_D1D2;\n                  treeedges[edge_B1B2].n1 = B1;\n                  treeedges[edge_B1B2].n2 = B2;\n                  // update nbr and edge for 5 nodes n2, B1, B2, D1, D2\n                  // n2's nbr (n1, B1, B2)->(n1, D1, D2)\n                  treenodes[n2].nbr[0]  = n1;\n                  treenodes[n2].edge[0] = edge_n1n2;\n                  treenodes[n2].nbr[1]  = D1;\n                  treenodes[n2].edge[1] = edge_n2D1;\n                  treenodes[n2].nbr[2]  = D2;\n                  treenodes[n2].edge[2] = edge_n2D2;\n                  // B1's nbr n2->B2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[B1].nbr[i] == n2) {\n                  
    treenodes[B1].nbr[i]  = B2;\n                      treenodes[B1].edge[i] = edge_B1B2;\n                      break;\n                    }\n                  }\n                  // B2's nbr n2->B1\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[B2].nbr[i] == n2) {\n                      treenodes[B2].nbr[i]  = B1;\n                      treenodes[B2].edge[i] = edge_B1B2;\n                      break;\n                    }\n                  }\n                  // D1's nbr D2->n2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[D1].nbr[i] == D2) {\n                      treenodes[D1].nbr[i]  = n2;\n                      treenodes[D1].edge[i] = edge_n2D1;\n                      break;\n                    }\n                  }\n                  // D2's nbr D1->n2\n                  for (i = 0; i < 3; i++) {\n                    if (treenodes[D2].nbr[i] == D1) {\n                      treenodes[D2].nbr[i]  = n2;\n                      treenodes[D2].edge[i] = edge_n2D2;\n                      break;\n                    }\n                  }\n                } // else E2 is not on (n2, B1) or (n2, B2), but on (D1, D2)\n              }   // n2 is not a pin and E2!=n2\n\n              // update route for edge (n1, n2) and edge usage\n\n              // printf(\"update route? 
%d %d\\n\", netID, num_edges);\n              if (treeedges[edge_n1n2].route.type == MAZEROUTE) {\n                free(treeedges[edge_n1n2].route.gridsX);\n                free(treeedges[edge_n1n2].route.gridsY);\n              }\n              treeedges[edge_n1n2].route.gridsX =\n                  (short*)calloc(cnt_n1n2, sizeof(short));\n              treeedges[edge_n1n2].route.gridsY =\n                  (short*)calloc(cnt_n1n2, sizeof(short));\n              treeedges[edge_n1n2].route.type     = MAZEROUTE;\n              treeedges[edge_n1n2].route.routelen = cnt_n1n2 - 1;\n              treeedges[edge_n1n2].len = ADIFF(E1x, E2x) + ADIFF(E1y, E2y);\n              treeedges[edge_n1n2].n_ripups += 1;\n              total_ripups += 1;\n              max_ripups.update(treeedges[edge_n1n2].n_ripups);\n\n              for (i = 0; i < cnt_n1n2; i++) {\n                // printf(\"cnt_n1n2: %d\\n\", cnt_n1n2);\n                treeedges[edge_n1n2].route.gridsX[i] = gridsX[i];\n                treeedges[edge_n1n2].route.gridsY[i] = gridsY[i];\n              }\n\n              // update edge usage\n\n              /*for(i=0; i<pre_length; i++)\n              {\n                  if(pre_gridsX[i]==pre_gridsX[i+1]) // a vertical edge\n                  {\n                      if(i != pre_length - 1)\n                          min_y = min(pre_gridsY[i], pre_gridsY[i+1]);\n                      else\n                          min_y = pre_gridsY[i];\n                      //v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n                      //galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage,\n              (short unsigned)1);\n                      //printf(\"x y %d %d i %d \\n\", pre_gridsX[i], min_y, i);\n                      v_edges[min_y*xGrid+pre_gridsX[i]].usage.fetch_sub((short\n              int)1);\n                      //if(v_edges[min_y*xGrid+pre_gridsX[i]].usage < 0)\n              printf(\"V negative! 
%d \\n\", i);\n                  }\n                  else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                  {\n                      if(i != pre_length - 1)\n                          min_x = min(pre_gridsX[i], pre_gridsX[i+1]);\n                      else\n                          min_x = pre_gridsX[i];\n                      //h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n                      //galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n              (short unsigned)1);\n                      //printf(\"x y %d %d i %d\\n\", min_x, pre_gridsY[i], i);\n                      h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage.fetch_sub((short\n              int)1);\n                      //if(h_edges[pre_gridsY[i]*(xGrid-1)+min_x].usage < 0)\n              printf(\"H negative! %d \\n\", i);\n                  }\n              }*/\n\n              for (i = 0; i < cnt_n1n2 - 1; i++) {\n                if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n                {\n                  min_y = min(gridsY[i], gridsY[i + 1]);\n                  // v_edges[min_y*xGrid+gridsX[i]].usage += 1;\n                  // galois::atomicAdd(v_edges[min_y*xGrid+gridsX[i]].usage,\n                  // (short unsigned)1);\n                  v_edges[min_y * xGrid + gridsX[i]].usage.fetch_add(\n                      (short int)1);\n                  galois::atomicMax(\n                      v_edges[min_y * xGrid + gridsX[i]].max_ripups,\n                      treeedges[edge_n1n2].n_ripups);\n                } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                {\n                  min_x = min(gridsX[i], gridsX[i + 1]);\n                  // h_edges[gridsY[i]*(xGrid-1)+min_x].usage += 1;\n                  // galois::atomicAdd(h_edges[gridsY[i]*(xGrid-1)+min_x].usage,\n                  // (short unsigned)1);\n                  h_edges[gridsY[i] * (xGrid - 1) + min_x].usage.fetch_add(\n                      (short int)1);\n       
           galois::atomicMax(\n                      h_edges[gridsY[i] * (xGrid - 1) + min_x].max_ripups,\n                      treeedges[edge_n1n2].n_ripups);\n                }\n              }\n              /*if(LOCK){\n                  for(i=0; i<cnt_n1n2-1; i++)\n                  {\n                      if(gridsX[i]==gridsX[i+1]) // a vertical edge\n                      {\n                          min_y = min(gridsY[i], gridsY[i+1]);\n                          v_edges[min_y*xGrid+gridsX[i]].releaseLock();\n                      }\n                      else ///if(gridsY[i]==gridsY[i+1])// a horizontal edge\n                      {\n                          min_x = min(gridsX[i], gridsX[i+1]);\n                          h_edges[gridsY[i]*(xGrid-1)+min_x].releaseLock();\n                      }\n                  }\n              }*/\n              if (checkRoute2DTree(netID)) {\n                reInitTree(netID);\n                return;\n              }\n            } // congested route\n          }   // maze routing\n        }     // loop edgeID\n      },\n      // galois::wl<galois::worklists::ParaMeter<>>(),\n      galois::steal(),\n      // galois::chunk_size<32>(),\n      galois::loopname(\"maze routing\")); // galois::do_all\n\n  printf(\"total ripups: %d max ripups: %d\\n\", total_ripups.reduce(),\n         max_ripups.reduce());\n  //}, \"mazeroute vtune function\");\n  free(h_costTable);\n  free(v_costTable);\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/memAlloc.c",
    "content": "/* --------------------------------------------------------------------------\n   Public domain memory allocation and de-allocation routines.\n   Taken from Appendix B of: \n   Numerical Recipes in C: The Art of Scientific Computing, Second Edition,\n   Cambridge University Press, 1992\n----------------------------------------------------------------------------*/\n#include <stdio.h>\n#include <stddef.h>\n#include <stdlib.h>\n\n#include \"memAlloc.h\"\n\n#define MEM_END 1\n#define FREE_ARG char*\n\nvoid runtimeError(char error_text[])\n/* error handler */\n{\n  fprintf(stderr,\"run-time error...\\n\");\n  fprintf(stderr,\"%s\\n\",error_text);\n  fprintf(stderr,\"...now exiting to system...\\n\");\n  exit(1);\n}\n\nfloat *vector(long nl, long nh)\n/* allocate a float vector with subscript range v[nl..nh] */\n{\n  float *v;\n  v=(float *)malloc((size_t) ((nh-nl+1+MEM_END)*sizeof(float)));\n  if (!v) runtimeError(\"allocation failure in vector()\");\n  return v-nl+MEM_END;\n}\n\nint *ivector(long nl, long nh)\n/* allocate an int vector with subscript range v[nl..nh] */\n{\n  int *v;\n  v=(int *)malloc((size_t) ((nh-nl+1+MEM_END)*sizeof(int)));\n  if (!v) runtimeError(\"allocation failure in ivector()\");\n  return v-nl+MEM_END;\n}\n\nunsigned char *cvector(long nl, long nh)\n/* allocate an unsigned char vector with subscript range v[nl..nh] */\n{\n  unsigned char *v;\n  v=(unsigned char *)malloc((size_t) ((nh-nl+1+MEM_END)*sizeof(unsigned char)));\n  if (!v) runtimeError(\"allocation failure in cvector()\");\n  return v-nl+MEM_END;\n}\n\nunsigned long *lvector(long nl, long nh)\n/* allocate an unsigned long vector with subscript range v[nl..nh] */\n{\n  unsigned long *v;\n  v=(unsigned long *)malloc((size_t) ((nh-nl+1+MEM_END)*sizeof(long)));\n  if (!v) runtimeError(\"allocation failure in lvector()\");\n  return v-nl+MEM_END;\n}\n\ndouble *dvector(long nl, long nh)\n/* allocate a double vector with subscript range v[nl..nh] */\n{\n  double *v;\n  
v=(double *)malloc((size_t) ((nh-nl+1+MEM_END)*sizeof(double)));\n  if (!v) runtimeError(\"allocation failure in dvector()\");\n  return v-nl+MEM_END;\n}\n\nfloat **matrix(long nrl, long nrh, long ncl, long nch)\n/* allocate a float matrix with subscript range m[nrl..nrh][ncl..nch] */\n{\n  long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;\n  float **m;\n  \n  /* allocate pointers to rows */\n  m=(float **) malloc((size_t)((nrow+MEM_END)*sizeof(float*)));\n  if (!m) runtimeError(\"allocation failure 1 in matrix()\");\n  m += MEM_END;\n  m -= nrl;\n  \n  /* allocate rows and set pointers to them */\n  m[nrl]=(float *) malloc((size_t)((nrow*ncol+MEM_END)*sizeof(float)));\n  if (!m[nrl]) runtimeError(\"allocation failure 2 in matrix()\");\n  m[nrl] += MEM_END;\n  m[nrl] -= ncl;\n  for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;\n  /* return pointer to array of pointers to rows */\n  return m;\n}\n\ndouble **dmatrix(long nrl, long nrh, long ncl, long nch)\n/* allocate a double matrix with subscript range m[nrl..nrh][ncl..nch] */\n{\n  long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;\n  double **m;\n\n  /* allocate pointers to rows */\n  m=(double **) malloc((size_t)((nrow+MEM_END)*sizeof(double*)));\n  if (!m) runtimeError(\"allocation failure 1 in dmatrix()\");\n  m += MEM_END;\n  m -= nrl;\n  \n  /* allocate rows and set pointers to them */\n  m[nrl]=(double *) malloc((size_t)((nrow*ncol+MEM_END)*sizeof(double)));\n  if (!m[nrl]) runtimeError(\"allocation failure 2 din matrix()\");\n  m[nrl] += MEM_END;\n  m[nrl] -= ncl;\n  \n  for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;\n  \n  /* return pointer to array of pointers to rows */\n  return m;\n}\n\nint **imatrix(long nrl, long nrh, long ncl, long nch)\n/* allocate a int matrix with subscript range m[nrl..nrh][ncl..nch] */\n{\n  long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;\n  int **m;\n\n  /* allocate pointers to rows */\n  m=(int **) malloc((size_t)((nrow+MEM_END)*sizeof(int*)));\n  if (!m) runtimeError(\"allocation failure 1 in imatrix()\");\n  m += 
MEM_END;\n  m -= nrl;\n  \n  /* allocate rows and set pointers to them */\n  m[nrl]=(int *) malloc((size_t)((nrow*ncol+MEM_END)*sizeof(int)));\n  if (!m[nrl]) runtimeError(\"allocation failure 2 in imatrix()\");\n  m[nrl] += MEM_END;\n  m[nrl] -= ncl;\n  \n  for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;\n  \n  /* return pointer to array of pointers to rows */\n  return m;\n}\n\n\nchar **cmatrix(long nrl, long nrh, long ncl, long nch)\n/* allocate a char matrix with subscript range m[nrl..nrh][ncl..nch] */\n{\n  long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;\n  char **m;\n\n  /* allocate pointers to rows */\n  m=(char **) malloc((size_t)((nrow+MEM_END)*sizeof(char*)));\n  if (!m) runtimeError(\"allocation failure 1 in cmatrix()\");\n  m += MEM_END;\n  m -= nrl;\n  \n  /* allocate rows and set pointers to them */\n  m[nrl]=(char *) malloc((size_t)((nrow*ncol+MEM_END)*sizeof(char)));\n  if (!m[nrl]) runtimeError(\"allocation failure 2 in cmatrix()\");\n  m[nrl] += MEM_END;\n  m[nrl] -= ncl;\n  \n  for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;\n  \n  /* return pointer to array of pointers to rows */\n  return m;\n}\n\n\nunsigned long **lmatrix(long nrl, long nrh, long ncl, long nch)\n/* allocate a int matrix with subscript range m[nrl..nrh][ncl..nch] */\n{\n  long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;\n  unsigned long **m;\n\n  /* allocate pointers to rows */\n  m=(unsigned long **) malloc((size_t)((nrow+MEM_END)*sizeof(long*)));\n  if (!m) runtimeError(\"allocation failure 1 in lmatrix()\");\n  m += MEM_END;\n  m -= nrl;\n  \n  /* allocate rows and set pointers to them */\n  m[nrl]=(unsigned long *) malloc((size_t)((nrow*ncol+MEM_END)*sizeof(long)));\n  if (!m[nrl]) runtimeError(\"allocation failure 2 in lmatrix()\");\n  m[nrl] += MEM_END;\n  m[nrl] -= ncl;\n  \n  for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;\n  \n  /* return pointer to array of pointers to rows */\n  return m;\n}\n\n\nfloat **submatrix(float **a, long oldrl, long oldrh, long oldcl, long oldch, \n  long newrl, long 
newcl)\n/* point a submatrix [newrl..][newcl..] to a[oldrl..oldrh][oldcl..oldch] */\n{\n  long i,j,nrow=oldrh-oldrl+1,ncol=oldcl-newcl;\n  float **m;\n  \n  /* allocate array of pointers to rows */\n  m=(float **) malloc((size_t) ((nrow+MEM_END)*sizeof(float*)));\n  if (!m) runtimeError(\"allocation failure in submatrix()\");\n  m += MEM_END;\n  m -= newrl;\n  \n  /* set pointers to rows */\n  for(i=oldrl,j=newrl;i<=oldrh;i++,j++) m[j]=a[i]+ncol;\n  \n  /* return pointer to array of pointers to rows */\n  return m;\n}\n\nfloat **convert_matrix(float *a, long nrl, long nrh, long ncl, long nch)\n/* allocate a float matrix m[nrl..nrh][ncl..nch] that points to the matrix\ndeclared in the standard C manner as a[nrow][ncol], where nrow=nrh-nrl+1\nand ncol=nch-ncl+1. The routine should be called with the address\n&a[0][0] as the first argument. */\n{\n  long i,j,nrow=nrh-nrl+1,ncol=nch-ncl+1;\n  float **m;\n\n  /* allocate pointers to rows */\n  m=(float **) malloc((size_t) ((nrow+MEM_END)*sizeof(float*)));\n  if (!m) runtimeError(\"allocation failure in convert_matrix()\");\n  m += MEM_END;\n  m -= nrl;\n  \n  /* set pointers to rows */\n  m[nrl]=a-ncl;\n  for(i=1,j=nrl+1;i<nrow;i++,j++) m[j]=m[j-1]+ncol;\n  \n  /* return pointer to array of pointers to rows */\n  return m;\n}\n\nfloat ***f3tensor(long nrl, long nrh, long ncl, long nch, long ndl, long ndh)\n/* allocate a float 3tensor with range t[nrl..nrh][ncl..nch][ndl..ndh] */\n{\n  long i,j,nrow=nrh-nrl+1,ncol=nch-ncl+1,ndep=ndh-ndl+1;\n  float ***t;\n\n  /* allocate pointers to pointers to rows */\n  t=(float ***) malloc((size_t)((nrow+MEM_END)*sizeof(float**)));\n  if (!t) runtimeError(\"allocation failure 1 in f3tensor()\");\n  t += MEM_END;\n  t -= nrl;\n  \n  /* allocate pointers to rows and set pointers to them */\n  t[nrl]=(float **) malloc((size_t)((nrow*ncol+MEM_END)*sizeof(float*)));\n  if (!t[nrl]) runtimeError(\"allocation failure 2 in f3tensor()\");\n  t[nrl] += MEM_END;\n  t[nrl] -= ncl;\n  \n  /* 
allocate rows and set pointers to them */\n  t[nrl][ncl]=(float *) malloc((size_t)((nrow*ncol*ndep+MEM_END)*sizeof(float)));\n  if (!t[nrl][ncl]) runtimeError(\"allocation failure 3 in f3tensor()\");\n  t[nrl][ncl] += MEM_END;\n  t[nrl][ncl] -= ndl;\n  \n  for(j=ncl+1;j<=nch;j++) t[nrl][j]=t[nrl][j-1]+ndep;\n  for(i=nrl+1;i<=nrh;i++) {\n    t[i]=t[i-1]+ncol;\n    t[i][ncl]=t[i-1][ncl]+ncol*ndep;\n    for(j=ncl+1;j<=nch;j++) t[i][j]=t[i][j-1]+ndep;\n  }\n  \n  /* return pointer to array of pointers to rows */\n  return t;\n}\n\n\nunsigned long ***lmatrix3D(long nrl, long nrh, long ncl, long nch, long ndl, long ndh)\n/* allocate an unsigned long 3D matrix with range t[nrl..nrh][ncl..nch][ndl..ndh] */\n{\n  long i,j,nrow=nrh-nrl+1,ncol=nch-ncl+1,ndep=ndh-ndl+1;\n  unsigned long ***t;\n  long d1, d2, d3;\n  \n  /* allocate pointers to pointers to rows */\n  t=(unsigned long ***) malloc((size_t)((nrow+MEM_END)*sizeof(long**)));\n  if (!t) runtimeError(\"allocation failure 1 in lmatrix3D()\");\n  t += MEM_END;\n  t -= nrl;\n  \n  /* allocate pointers to rows and set pointers to them */\n  t[nrl]=(unsigned long **) malloc((size_t)((nrow*ncol+MEM_END)*sizeof(long*)));\n  if (!t[nrl]) runtimeError(\"allocation failure 2 in lmatrix3D()\");\n  t[nrl] += MEM_END;\n  t[nrl] -= ncl;\n  \n  /* allocate rows and set pointers to them */\n  t[nrl][ncl]=(unsigned long *) malloc((size_t)((nrow*ncol*ndep+MEM_END)*sizeof(long)));\n  if (!t[nrl][ncl]) runtimeError(\"allocation failure 3 in lmatrix3D()\");\n  t[nrl][ncl] += MEM_END;\n  t[nrl][ncl] -= ndl;\n  \n  for(j=ncl+1;j<=nch;j++) t[nrl][j]=t[nrl][j-1]+ndep;\n  for(i=nrl+1;i<=nrh;i++) {\n    t[i]=t[i-1]+ncol;\n    t[i][ncl]=t[i-1][ncl]+ncol*ndep;\n    for(j=ncl+1;j<=nch;j++) t[i][j]=t[i][j-1]+ndep;\n  }  \n  \n  /* return pointer to array of pointers to rows */\n  return t;\n}\n\n\nint ***imatrix3D(int nrl, int nrh, int ncl, int nch, int ndl, int ndh)\n/* allocate an int 3D matrix with range t[nrl..nrh][ncl..nch][ndl..ndh] */\n{\n  
int i,j,nrow=nrh-nrl+1,ncol=nch-ncl+1,ndep=ndh-ndl+1;\n  int ***t;\n  int d1, d2, d3;\n  \n  /* allocate pointers to pointers to rows */\n  t=(int ***) malloc((size_t)((nrow+MEM_END)*sizeof(int**)));\n  if (!t) runtimeError(\"allocation failure 1 in imatrix3D()\");\n  t += MEM_END;\n  t -= nrl;\n  \n  /* allocate pointers to rows and set pointers to them */\n  t[nrl]=(int **) malloc((size_t)((nrow*ncol+MEM_END)*sizeof(int*)));\n  if (!t[nrl]) runtimeError(\"allocation failure 2 in imatrix3D()\");\n  t[nrl] += MEM_END;\n  t[nrl] -= ncl;\n  \n  /* allocate rows and set pointers to them */\n  t[nrl][ncl]=(int *) malloc((size_t)((nrow*ncol*ndep+MEM_END)*sizeof(int)));\n  if (!t[nrl][ncl]) runtimeError(\"allocation failure 3 in imatrix3D()\");\n  t[nrl][ncl] += MEM_END;\n  t[nrl][ncl] -= ndl;\n  \n  for(j=ncl+1;j<=nch;j++) t[nrl][j]=t[nrl][j-1]+ndep;\n  for(i=nrl+1;i<=nrh;i++) {\n    t[i]=t[i-1]+ncol;\n    t[i][ncl]=t[i-1][ncl]+ncol*ndep;\n    for(j=ncl+1;j<=nch;j++) t[i][j]=t[i][j-1]+ndep;\n  }  \n  \n  /* return pointer to array of pointers to rows */\n  return t;\n}\n\n\nvoid free_vector(float *v, long nl, long nh)\n/* free a float vector allocated with vector() */\n{\n  free((FREE_ARG) (v+nl-MEM_END));\n}\n\nvoid free_ivector(int *v, long nl, long nh)\n/* free an int vector allocated with ivector() */\n{\n  free((FREE_ARG) (v+nl-MEM_END));\n}\n\nvoid free_cvector(unsigned char *v, long nl, long nh)\n/* free an unsigned char vector allocated with cvector() */\n{\n  free((FREE_ARG) (v+nl-MEM_END));\n}\n\nvoid free_lvector(unsigned long *v, long nl, long nh)\n/* free an unsigned long vector allocated with lvector() */\n{\n  free((FREE_ARG) (v+nl-MEM_END));\n}\n\nvoid free_dvector(double *v, long nl, long nh)\n/* free a double vector allocated with dvector() */\n{\n  free((FREE_ARG) (v+nl-MEM_END));\n}\n\nvoid free_matrix(float **m, long nrl, long nrh, long ncl, long nch)\n/* free a float matrix allocated by matrix() */\n{\n  free((FREE_ARG) (m[nrl]+ncl-MEM_END));\n  
free((FREE_ARG) (m+nrl-MEM_END));\n}\n\nvoid free_dmatrix(double **m, long nrl, long nrh, long ncl, long nch)\n/* free a double matrix allocated by dmatrix() */\n{\n  free((FREE_ARG) (m[nrl]+ncl-MEM_END));\n  free((FREE_ARG) (m+nrl-MEM_END));\n}\n\nvoid free_imatrix(int **m, long nrl, long nrh, long ncl, long nch)\n/* free an int matrix allocated by imatrix() */\n{\n  free((FREE_ARG) (m[nrl]+ncl-MEM_END));\n  free((FREE_ARG) (m+nrl-MEM_END));\n}\n\nvoid free_cmatrix(char **m, long nrl, long nrh, long ncl, long nch)\n/* free a char matrix allocated by cmatrix() */\n{\n  free((FREE_ARG) (m[nrl]+ncl-MEM_END));\n  free((FREE_ARG) (m+nrl-MEM_END));\n}\n\nvoid free_lmatrix(unsigned long **m, long nrl, long nrh, long ncl, long nch)\n/* free an unsigned long matrix allocated by lmatrix() */\n{\n  free((FREE_ARG) (m[nrl]+ncl-MEM_END));\n  free((FREE_ARG) (m+nrl-MEM_END));\n}\n\nvoid free_submatrix(float **b, long nrl, long nrh, long ncl, long nch)\n/* free a submatrix allocated by submatrix() */\n{\n  free((FREE_ARG) (b+nrl-MEM_END));\n}\n\nvoid free_convert_matrix(float **b, long nrl, long nrh, long ncl, long nch)\n/* free a matrix allocated by convert_matrix() */\n{\n  free((FREE_ARG) (b+nrl-MEM_END));\n}\n\nvoid free_f3tensor(float ***t, long nrl, long nrh, long ncl, long nch,\n  long ndl, long ndh)\n/* free a float f3tensor allocated by f3tensor() */\n{\n  free((FREE_ARG) (t[nrl][ncl]+ndl-MEM_END));\n  free((FREE_ARG) (t[nrl]+ncl-MEM_END));\n  free((FREE_ARG) (t+nrl-MEM_END));\n}\n\n\nvoid free_lmatrix3D(unsigned long ***t, long nrl, long nrh, long ncl, long nch,\n  long ndl, long ndh)\n/* free an unsigned long 3D matrix allocated by lmatrix3D() */\n{\n  free((FREE_ARG) (t[nrl][ncl]+ndl-MEM_END));\n  free((FREE_ARG) (t[nrl]+ncl-MEM_END));\n  free((FREE_ARG) (t+nrl-MEM_END));\n}\n\n\nvoid free_imatrix3D(int ***t, int nrl, int nrh, int ncl, int nch,\n  int ndl, int ndh)\n/* free an int 3D matrix allocated by imatrix3D() */\n{\n  free((FREE_ARG) 
(t[nrl][ncl]+ndl-MEM_END));\n  free((FREE_ARG) (t[nrl]+ncl-MEM_END));\n  free((FREE_ARG) (t+nrl-MEM_END));\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/memAlloc.h",
    "content": "/* --------------------------------------------------------------------------\n   Public domain memory allocation and de-allocation routine header file.\n   Taken from Appendix B of:\n   Numerical Recipes in C: The Art of Scientific Computing, Second Edition,\n   Cambridge University Press, 1992\n----------------------------------------------------------------------------*/\n#ifndef _MEMALLOC_H_\n#define _MEMALLOC_H_\n\nstatic float sqrarg;\n#define SQR(a) ((sqrarg = (a)) == 0.0 ? 0.0 : sqrarg * sqrarg)\n\nstatic double dsqrarg;\n#define DSQR(a) ((dsqrarg = (a)) == 0.0 ? 0.0 : dsqrarg * dsqrarg)\n\nstatic double dmaxarg1, dmaxarg2;\n#define DMAX(a, b)                                                             \\\n  (dmaxarg1 = (a), dmaxarg2 = (b),                                             \\\n   (dmaxarg1) > (dmaxarg2) ? (dmaxarg1) : (dmaxarg2))\n\nstatic double dminarg1, dminarg2;\n#define DMIN(a, b)                                                             \\\n  (dminarg1 = (a), dminarg2 = (b),                                             \\\n   (dminarg1) < (dminarg2) ? (dminarg1) : (dminarg2))\n\nstatic float maxarg1, maxarg2;\n#define FMAX(a, b)                                                             \\\n  (maxarg1 = (a), maxarg2 = (b), (maxarg1) > (maxarg2) ? (maxarg1) : (maxarg2))\n\nstatic float minarg1, minarg2;\n#define FMIN(a, b)                                                             \\\n  (minarg1 = (a), minarg2 = (b), (minarg1) < (minarg2) ? (minarg1) : (minarg2))\n\nstatic long lmaxarg1, lmaxarg2;\n#define LMAX(a, b)                                                             \\\n  (lmaxarg1 = (a), lmaxarg2 = (b),                                             \\\n   (lmaxarg1) > (lmaxarg2) ? 
(lmaxarg1) : (lmaxarg2))\n\nstatic long lminarg1, lminarg2;\n#define LMIN(a, b)                                                             \\\n  (lminarg1 = (a), lminarg2 = (b),                                             \\\n   (lminarg1) < (lminarg2) ? (lminarg1) : (lminarg2))\n\nstatic int imaxarg1, imaxarg2;\n#define IMAX(a, b)                                                             \\\n  (imaxarg1 = (a), imaxarg2 = (b),                                             \\\n   (imaxarg1) > (imaxarg2) ? (imaxarg1) : (imaxarg2))\n\nstatic int iminarg1, iminarg2;\n#define IMIN(a, b)                                                             \\\n  (iminarg1 = (a), iminarg2 = (b),                                             \\\n   (iminarg1) < (iminarg2) ? (iminarg1) : (iminarg2))\n\n#define SIGN(a, b) ((b) >= 0.0 ? fabs(a) : -fabs(a))\n\nvoid runTimeError(char error_text[]);\nfloat* vector(long nl, long nh);\nint* ivector(long nl, long nh);\nunsigned char* cvector(long nl, long nh);\nunsigned long* lvector(long nl, long nh);\ndouble* dvector(long nl, long nh);\nfloat** matrix(long nrl, long nrh, long ncl, long nch);\ndouble** dmatrix(long nrl, long nrh, long ncl, long nch);\nint** imatrix(long nrl, long nrh, long ncl, long nch);\nchar** cmatrix(long nrl, long nrh, long ncl, long nch);\nunsigned long** lmatrix(long nrl, long nrh, long ncl, long nch);\nfloat** submatrix(float** a, long oldrl, long oldrh, long oldcl, long oldch,\n                  long newrl, long newcl);\nfloat** convert_matrix(float* a, long nrl, long nrh, long ncl, long nch);\nfloat*** f3tensor(long nrl, long nrh, long ncl, long nch, long ndl, long ndh);\nunsigned long*** lmatrix3D(long nrl, long nrh, long ncl, long nch, long ndl,\n                           long ndh);\nint*** imatrix3D(int nrl, int nrh, int ncl, int nch, int ndl, int ndh);\n\nvoid free_vector(float* v, long nl, long nh);\nvoid free_ivector(int* v, long nl, long nh);\nvoid free_cvector(unsigned char* v, long nl, long nh);\nvoid 
free_lvector(unsigned long* v, long nl, long nh);\nvoid free_dvector(double* v, long nl, long nh);\nvoid free_matrix(float** m, long nrl, long nrh, long ncl, long nch);\nvoid free_dmatrix(double** m, long nrl, long nrh, long ncl, long nch);\nvoid free_imatrix(int** m, long nrl, long nrh, long ncl, long nch);\nvoid free_cmatrix(char** m, long nrl, long nrh, long ncl, long nch);\nvoid free_lmatrix(unsigned long** m, long nrl, long nrh, long ncl, long nch);\nvoid free_submatrix(float** b, long nrl, long nrh, long ncl, long nch);\nvoid free_convert_matrix(float** b, long nrl, long nrh, long ncl, long nch);\nvoid free_f3tensor(float*** t, long nrl, long nrh, long ncl, long nch, long ndl,\n                   long ndh);\nvoid free_lmatrix3D(unsigned long*** t, long nrl, long nrh, long ncl, long nch,\n                    long ndl, long ndh);\nvoid free_imatrix3D(int*** t, int nrl, int nrh, int ncl, int nch, int ndl,\n                    int ndh);\n\n#endif /* _MEMALLOC_H_ */\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/mst2.c",
    "content": "#include  <stdlib.h>\n#include  <stdio.h>\n#include   <assert.h>\n#include  \"global.h\"\n#include  \"neighbors.h\"\n#include  \"dist.h\"\n#include  \"heap.h\"\n#include  \"err.h\"\n\n\n\nvoid  mst2_package_init( long  n )\n{\n  allocate_heap( n );\n  allocate_nn_arrays( n );\n}\n\n/****************************************************************************/\n/*\n*/\n\nvoid  mst2_package_done()\n{\n  deallocate_heap();\n  deallocate_nn_arrays();\n}  \n\n/****************************************************************************/\n/*\n*/\n\nvoid  mst2\n( \n  long    n,\n  Point*  pt, \n  long*   parent\n)\n{\n  long  i, k, nn1;\n  long  d;\n  long  oct;\n  long  root = 0;\n  extern  nn_array*  nn;\n\n//  brute_force_nearest_neighbors( n, pt, nn );\n  dq_nearest_neighbors( n, pt, nn );\n\n  /* \n     Binary heap implementation of Prim's algorithm.\n     Runs in O(n*log(n)) time since at most 8n edges are considered\n  */\n\n  heap_init( n );\n  heap_insert( root, 0 );\n  parent[root] = root;\n\n  for( k = 0;  k < n;  k++ )   /* n points to be extracted from heap */\n  {\n    i = heap_delete_min();\n\n    if (i<0) break;\n#ifdef DEBUG\n    assert( i >= 0 );\n#endif \n\n    /*\n      pt[i] entered the tree, update heap keys for its neighbors\n    */\n    for( oct = 0;  oct < 8;  oct++ )\n    {\n      nn1 = nn[i][oct]; \n      if( nn1 >= 0 )\n      {\n        d  = dist( pt[i], pt[nn1] );\n        if( in_heap(nn1) && (d < heap_key(nn1)) )\n        {\n          heap_decrease_key( nn1, d );\n          parent[nn1] = i;\n        } \n        else if( never_seen(nn1) )\n        {\n          heap_insert( nn1, d );\n          parent[nn1] = i;\n        }\n      }\n    }\n  }\n}\n\n/****************************************************************************/\n/****************************************************************************/\n\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/mst2.h",
    "content": "#ifndef _MST2_H_\n#define _MST2_H_\n\n#include \"global.h\"\n\nvoid mst2_package_init(long n);\nvoid mst2_package_done();\nvoid mst2(long n, Point* pt, long* parent);\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/neighbors.c",
    "content": "#include  <assert.h>\n#include  <string.h>\n#include  <stdlib.h>\n#include  \"global.h\"\n#include  \"err.h\"\n#include  \"dist.h\"\n\nlong  octant\n(\n  Point  from,\n  Point  to\n);\n\nstatic Point* _pt;\n\n/***************************************************************************/\n/*\n  For efficiency purposes auxiliary arrays are allocated as globals \n*/\n\nlong    max_arrays_size = 0;\nnn_array*  nn   = (nn_array*)NULL;\nPoint*  sheared = (Point*)NULL;\nlong*  sorted   = (long*)NULL;\nlong*  aux      = (long*)NULL;  \n\n/***************************************************************************/\n/*\n  resize the auxiliary arrays to fit the specified number of points \n*/\n\nvoid  allocate_nn_arrays( long  n )\n{\n  if( max_arrays_size < n ) \n  {\n    nn      = (nn_array*)realloc( (void*)nn, (size_t)n*sizeof(nn_array) );\n    sheared = (Point*)realloc( (void*)sheared, (size_t)n*sizeof(Point) );\n    sorted  = (long*)realloc( (void*)sorted, (size_t)n*sizeof(long) );\n    aux     = (long*)realloc( (void*)aux, (size_t)n*sizeof(long) );\n    if( !nn || !sheared || !sorted || !aux )\n    {\n      err_exit( \"Cannot allocate memory in allocate_nn_arrays!\" );\n    }\n    max_arrays_size = n;\n  }\n}\n\n/***************************************************************************/\n/*\n  free memory used by auxiliary arrays\n*/\n\nvoid  deallocate_nn_arrays()\n{\n  max_arrays_size = 0;\n  if( nn )\n  {\n    free( (void*)nn );\n    nn = (nn_array*)NULL;\n  }\n  if( sheared )\n  {\n    free( (void*)sheared );\n    sheared = (Point*)NULL;\n  }\n  if( sorted )\n  {\n    free( (void*)sorted );\n    sorted = (long*)NULL;\n  }\n  if( aux )\n  {\n    free( (void*)aux );\n    aux = (long*)NULL;\n  }\n\n}\n\n/***************************************************************************/\n/*\n  comparison function for use in quicksort\n*/\n\nstatic  int compare_x\n( \n  const void*  i, \n  const void*  j \n)\n{\n  /*\n    points with the same x must appear in 
increasing order of y \n  */\n  if( sheared[*((long*)i)].x == sheared[*((long*)j)].x)\n  {\n    return  sheared[*((long*)i)].y - sheared[*((long*)j)].y;\n  }\n  else\n  {\n    return  sheared[*((long*)i)].x - sheared[*((long*)j)].x;\n  }\n}\n\n\n/***************************************************************************/\n/*\n  Combine step of the Guibas-Stolfi divide-and-conquer NE nearest neighbor\n  algorithm. For efficiency purposes SW nearest neighbors are computed \n  at the same time.\n*/\n\nvoid  ne_sw_combine\n(\n  long    left,\n  long    mid,\n  long    right,\n  Point*  pt,\n  long*   sorted,\n  long*   aux,\n  long    oct,\n  nn_array*  nn\n)\n{\n  long   i, j, k, y2; \n  long   i1;\n  long   i2; \n  long   best_i2;     /* index of current best nearest-neighbor */\n  long   best_dist;   /* distance to best nearest-neighbor      */\n  long   d;\n\n#ifdef DEBUG\n  assert( right > mid );\n  assert( mid > left );\n#endif\n\n  /*\n    update north-east nearest neighbors accross the mid-line\n  */\n\n  i1 = left;\n  i2 = mid;   y2 = pt[ sorted[i2] ].y;\n\n  while( (i1 < mid) && (pt[ sorted[i1] ].y >= y2) )\n  {\n    i1++;\n  }\n  \n  if( i1 < mid )\n  {\n    best_i2   = i2;\n    best_dist = dist2( pt + sorted[i1], pt + sorted[best_i2] );\n    i2++;\n\n    while( (i1 < mid) && (i2 < right) )\n    {\n      if( pt[ sorted[i1] ].y < pt[ sorted[i2] ].y )\n      {\n        d = dist2( pt + sorted[i1], pt + sorted[i2] );\n        if( d < best_dist ) \n        {\n          best_i2   = i2;\n          best_dist = d;\n        }\n        i2++;\n      }\n      else \n      {\n        if( (nn[ sorted[i1] ][oct] == -1) || \n            ( best_dist < dist2( pt + sorted[i1], pt + nn[ sorted[i1] ][oct]) ) \n           )\n        {\n          nn[ sorted[i1] ][oct] = sorted[best_i2];\n        }\n        i1++;\n        if( i1 < mid )\n        {\n          best_dist = dist2( pt + sorted[i1], pt + sorted[best_i2] );\n        }\n      }    \n    }\n\n    while( i1 < mid )\n    {\n  
    if( (nn[ sorted[i1] ][oct] == -1) || \n          ( dist2( pt + sorted[i1], pt + sorted[best_i2] ) < \n            dist2( pt + sorted[i1], pt + nn[ sorted[i1] ][oct]) ) \n        )\n      {\n        nn[ sorted[i1] ][oct] = sorted[best_i2];\n      }\n      i1++;\n    }\n  }\n  /*\n    repeat for south-west nearest neighbors\n  */\n\n  oct = (oct + 4) % 8;\n\n  i1 = right - 1;\n  i2 = mid - 1;   y2 = pt[ sorted[i2] ].y;\n     \n  while( (i1 >= mid) && (pt[ sorted[i1] ].y <= y2) )\n  {\n    i1--;\n  }\n\n  if( i1 >= mid )\n  {\n    best_i2   = i2;\n    best_dist = dist2( pt + sorted[i1], pt + sorted[best_i2] );\n    i2--;\n\n    while( (i1 >= mid) && (i2 >= left) )\n    {\n      if( pt[ sorted[i1] ].y > pt[ sorted[i2] ].y )\n      {\n        d = dist2( pt + sorted[i1], pt + sorted[i2] );\n        if( d < best_dist ) \n        {\n          best_i2   = i2;   \n          best_dist = d;\n        }\n        i2--;\n      }\n      else \n      {\n        if( (nn[ sorted[i1] ][oct] == -1) || \n            ( best_dist < dist2( pt + sorted[i1], pt + nn[ sorted[i1] ][oct]) ) \n           )\n        {\n          nn[ sorted[i1] ][oct] = sorted[best_i2];\n        }\n        i1--;\n        if( i1 >= mid )\n        {\n          best_dist = dist2( pt + sorted[i1], pt + sorted[best_i2] );\n        }\n      }    \n    }\n\n    while( i1 >= mid )\n    {\n      if( (nn[ sorted[i1] ][oct] == -1) || \n          ( dist2( pt + sorted[i1], pt + sorted[best_i2] ) < \n            dist2( pt + sorted[i1], pt + nn[ sorted[i1] ][oct]) ) \n        )\n      {\n        nn[ sorted[i1] ][oct] = sorted[best_i2];\n      }\n      i1--;\n    }\n  }\n\n  /*\n    merge sorted[left..mid-1] with sorted[mid..right-1] by y-coordinate\n  */\n\n  i = left;  /* first unprocessed element in left  list  */\n  j = mid;   /* first unprocessed element in right list  */\n  k = left;  /* first free available slot in output list */\n\n  while( (i < mid) && (j < right) )\n  {\n    if( pt[ sorted[i] ].y >= pt[ sorted[j] ].y 
)\n    {\n      aux[k++] = sorted[i++]; \n    }\n    else \n    {\n      aux[k++] = sorted[j++]; \n    }\n  }\n\n  /*\n    copy leftovers \n  */\n  while( i < mid   ) {  aux[k++] = sorted[i++]; }\n  while( j < right ) {  aux[k++] = sorted[j++]; }\n\n  /*\n    now copy sorted points from 'aux' to 'sorted' \n  */\n\n  for( i = left;  i < right;  i++ )  { sorted[i] = aux[i]; }\n\n#if 0\n  memcpy( (void*)(sorted+left),             /* destination */\n          (void*)(aux+left),             /* source      */\n          (size_t)(right-left)*sizeof(long) /* number of bytes */ \n        );\n#endif\n\n}\n\n/***************************************************************************/\n/*\n   compute north-east and south-west nearest neighbors for points indexed \n   by {sorted[left],...,sorted[right-1]} \n*/\n\nvoid  ne_sw_nearest_neighbors\n(\n  long    left,\n  long    right,\n  Point*  pt,\n  long*   sorted,\n  long*   aux,\n  long    oct,\n  nn_array*  nn\n)\n{\n  long   mid;\n\n#ifdef DEBUG\n  assert( right > left );\n#endif\n\n  if( right == left + 1 )  \n  {\n    nn[ sorted[left] ][oct] = nn[ sorted[left]][(oct+4) % 8] = -1;\n  }\n  else\n  {\n    mid = (left + right) / 2;\n    ne_sw_nearest_neighbors( left, mid, pt, sorted, aux, oct, nn );\n    ne_sw_nearest_neighbors( mid, right, pt, sorted, aux, oct, nn );\n    ne_sw_combine( left, mid, right, pt, sorted, aux, oct, nn );\n  }\n}\n\n/***************************************************************************/\n/*\n  Guibas-Stolfi algorithm for computing nearest NE neighbors\n*/\n\nvoid  dq_nearest_neighbors\n(\n  long      n,\n  Point*    pt,\n  nn_array*  nn\n)\n{\n  long   i, oct;\n  void  check_nn( long, Point*, nn_array* );\n\n  long   shear[4][4] = {\n                         {1, -1,  0,  2}, \n                         {2,  0, -1,  1}, \n                         {1,  1, -2,  0}, \n                         {0,  2, -1, -1} \n                       };\n\n\n\n_pt = pt;\n\n  for( oct = 0;  oct < 4;  oct++ )\n  {\n   
 for( i = 0;   i < n;   i++ )\n    {\n      sheared[i].x = shear[oct][0]*pt[i].x + shear[oct][1]*pt[i].y;\n      sheared[i].y = shear[oct][2]*pt[i].x + shear[oct][3]*pt[i].y;\n      sorted[i] = i;\n    }\n    \n    qsort( sorted, n, sizeof(long), compare_x );\n    ne_sw_nearest_neighbors( 0, n, sheared, sorted, aux, oct, nn );\n  }\n\n#ifdef DEBUG\n  check_nn( n, pt, nn );\n#endif\n\n}\n\n/***************************************************************************/\n/***************************************************************************/\n/*\n  Brute-force nearest-neighbor computation for debugging purposes\n*/\n\n/***************************************************************************/\n/*\n  Half-open octants are numbered from 0 to 7 in anti-clockwise order \n  starting from ( dx >= dy > 0 ).\n*/\n\n#define sgn(x)  ( x>0 ? 1 : (x < 0 ? -1 : 0) )\n\nlong  octant\n( \n  Point  from,\n  Point  to\n)\n{\n  long  dx = to.x - from.x;\n  long  dy = to.y - from.y;\n  long  sgn1 = sgn(dx)*sgn(dy);\n  long  sgn2 = sgn(dx+dy)*sgn(dx-dy);\n  long   oct = 0x0;\n\n  \n  if( (dy < 0) || ((dy==0) && (dx>0)) )        oct += 4;\n  if( (sgn1 < 0) || (dy==0) )                  oct += 2;\n  if( (sgn1*sgn2 < 0) || (dy==0) || (dx==0) )  oct += 1;\n\n  return  oct;\n}\n\n/***************************************************************************/\n/*\n  O(n^2) algorithm for computing all nearest neighbors\n*/\n\nvoid  brute_force_nearest_neighbors\n(\n  long    n,\n  Point*  pt,\n  nn_array*  nn\n)\n{\n  long  i, j, oct;\n  long  d;\n\n  /*\n    compute nearest neighbors by inspecting all pairs of points \n  */\n  for( i = 0;   i < n;   i++ )\n  {\n    for( oct = 0;  oct < 8;  oct++ )\n    {\n      nn[i][oct]   = -1;\n    }\n  }\n\n  for( i = 0;   i < n;  i++ )\n  {\n    for( j = i+1;   j < n;  j++ )\n    {\n      d = dist(pt[i], pt[j]);\n\n      oct = octant( pt[i], pt[j] ); \n      if( ( nn[i][oct] == -1 ) ||\n          ( d < dist(pt[i], pt[ nn[i][oct] ]) )\n        )\n      
{\n        nn[i][oct]  = j;\n      }\n\n      oct = (oct + 4) % 8;       \n      if( ( nn[j][oct] == -1 ) ||\n          ( d < dist(pt[j], pt[ nn[j][oct] ]) )\n        )\n      {\n        nn[j][oct]  = i;\n      }\n    }\n  }\n}\n\n\n/***************************************************************************/\n/*\n  compare nearest neighbors against those computed by brute force\n*/\n\nvoid  check_nn\n(\n  long    n,\n  Point*  pt,\n  nn_array*  nn\n)\n{\n  long       i, j, oct;\n  nn_array*  nn1;\n\n  nn1  = (nn_array*)calloc( (size_t)n, (size_t)sizeof(nn_array) );\n  brute_force_nearest_neighbors( n, pt, nn1 );\n\n  for( i = 0;   i < n;   i++ )\n  {\n    for( oct = 0;  oct < 8;  oct++ )\n    {\n      if( nn[i][oct] == -1 )\n      {\n        assert( nn1[i][oct] == -1 );\n      }\n      else\n      {\n        assert( nn1[i][oct] != -1 );\n\n        if( octant(pt[i], pt[ nn[i][oct] ]) != oct )\n        {\n        printf( \"WRONG OCTANT!\\noct=%ld\\n\", oct );\n        printf( \"i=%ld, x=%ld, y=%ld\\n\", i, pt[i].x, pt[i].y );\n        j = nn[i][oct];\n        printf( \"nn=%ld, x=%ld, y=%ld, dist = %ld\\n\", j, pt[j].x, pt[j].y,\n                 dist(pt[i], pt[j ]) );          \n        }\n//        assert( octant(pt[i], pt[ nn[i][oct] ]) == oct );\n\n        assert( octant(pt[i], pt[ nn1[i][oct] ]) == oct );\n\n        if( dist(pt[i], pt[ nn[i][oct] ]) != \n                dist(pt[i], pt[ nn1[i][oct] ]) ) \n       {\n        printf( \"NNs DON'T MATCH!\\noct=%ld\\n\", oct );\n        printf( \"i=%ld, x=%ld, y=%ld\\n\", i, pt[i].x, pt[i].y );\n        j = nn[i][oct];\n        printf( \"nn=%ld, x=%ld, y=%ld, dist = %ld\\n\", j, pt[j].x, pt[j].y,\n                 dist(pt[i], pt[j ]) );\n        j = nn1[i][oct];\n        printf( \"nn1=%ld, x=%ld, y=%ld, dist = %ld\\n\", j, pt[j].x, pt[j].y,\n                 dist(pt[i], pt[ j ]) );\n       }\n//        assert( dist(pt[i], pt[ nn[i][oct] ]) == \n//                dist(pt[i], pt[ nn1[i][oct] ]) );\n      }\n    }\n  }\n  
\n  free( nn1 );\n}\n\n/***************************************************************************/\n/***************************************************************************/\n\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/neighbors.h",
    "content": "#include \"global.h\"\n\nvoid allocate_nn_arrays(long n);\nvoid deallocate_nn_arrays();\n\nvoid brute_force_nearest_neighbors(long n, Point* pt, nn_array* nn);\n\nvoid dq_nearest_neighbors(long n, Point* pt, nn_array* nn);\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/parallel_router_morphgraph.cpp",
    "content": "#include <iostream>\n#include <fstream>\n#include <string>\n#include <list>\n#include <queue>\n#include <limits>\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/PriorityQueue.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include \"galois/graphs/TypeTraits.h\"\n#include \"galois/substrate/SimpleLock.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/runtime/Profile.h\"\n\n#include \"galois/LargeArray.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include \"Lonestar/BoilerPlate.h\"\n#include \"Lonestar/BFS_SSSP.h\"\nusing namespace std;\n\n#define VIA_COST 1\nfloat HEIGHT      = 6.0;\nfloat K           = 0.5;\nfloat HEIGHT_STEP = 2.0;\nfloat K_STEP      = 0.05;\n\nfloat slope = 3.0;\n\nstatic const int DIST_INFINITY   = std::numeric_limits<int>::max() / 2 - 1;\nstatic const int FIFO_CHUNK_SIZE = 64;\nstatic const int MAZE_CHUNK_SIZE = 32;\nstatic const int OBIM_delta      = 4;\n\nstruct Info {\n  int grid_x, grid_y, num_layers;\n  int* v_capacity;\n  int* h_capacity;\n  int* min_width;\n  int* min_spacing;\n  int* via_spacing;\n  int lower_left_x, lower_left_y, tile_height, tile_width;\n  int num_nets;\n  int num_capa_adjust;\n  int num_tiles;\n};\n\nstruct Pin {\n  int x, y, layer;\n  int tile_x, tile_y;\n  Pin() : x(0), y(0), layer(0) {}\n};\n\nstruct Path {\n  int src_tile_x, src_tile_y, src_layer;\n  int dst_tile_x, dst_tile_y, dst_layer;\n  Path()\n      : src_tile_x(-1), src_tile_y(-1), src_layer(-1), dst_tile_x(-1),\n        dst_tile_y(-1), dst_layer(-1) {}\n  Path(int src_tile_x, int src_tile_y, int src_layer, int dst_tile_x,\n       int dst_tile_y, int dst_layer)\n      : src_tile_x(src_tile_x), src_tile_y(src_tile_y), src_layer(src_layer),\n        dst_tile_x(dst_tile_x), dst_tile_y(dst_tile_y), dst_layer(dst_layer) {}\n};\n\nstruct Node {\n  // galois::substrate::SimpleLock lock;\n  float dist;\n  bool in_direction[4]; // four directions indicating x+1, y+1, x-1, y-1\n  
int pin_layer;\n  Node() : dist(DIST_INFINITY) {\n    pin_layer = 0;\n    for (int i = 0; i < 4; i++)\n      in_direction[i] = 0;\n  }\n  inline void reset() {\n    pin_layer = 0;\n    dist      = DIST_INFINITY;\n    for (int i = 0; i < 4; i++)\n      in_direction[i] = 0;\n  }\n  inline bool update_direction(float new_dist, float old_dist, int direction) {\n    if (new_dist < old_dist) {\n      for (int i = 0; i < 4; i++)\n        in_direction[i] = (i == direction) ? 1 : 0;\n      return true;\n    } else if (new_dist == old_dist) {\n      in_direction[direction] = 1;\n      return false;\n    } else {\n      cout << \"error in update direction\" << endl;\n      return false;\n    }\n  }\n  /*inline void acquireLock()\n  {\n      lock.lock();\n  }\n  inline void releaseLock()\n  {\n      lock.unlock();\n  } */\n};\nstruct Edge {\n  int capacity;\n  int utilization;\n  float scale;\n  float cost;\n  Edge() : capacity(0), utilization(0), scale(1), cost(1) {}\n  void compute_cost() {\n    cost = 1 + HEIGHT / (1 + exp(-K * (float)(utilization - capacity)));\n    /*if(utilization > capacity)\n        cost += HEIGHT/ slope * ((float)(utilization - capacity));*/\n  }\n};\n\nusing edgeArray = galois::LargeArray<Edge>;\nusing nodeArray = galois::LargeArray<Node>;\n\nstruct Net {\n  string name;\n  int id;\n  int num_pins;\n  int min_width;\n  Pin* pinlist;\n  bool reroute;\n  std::list<Path> pathlist;\n  std::list<Edge*> edgelist;\n  std::list<Node*> nodelist;\n};\n\nstruct timerstruct {\n  galois::StatTimer t_init_dist;\n  galois::StatTimer t_trace_back;\n  galois::StatTimer t_rebuild_queue;\n  galois::StatTimer t_maze_route;\n  galois::StatTimer t_print_path;\n  galois::StatTimer t_record_path;\n  galois::StatTimer t_mazeloop;\n  timerstruct()\n      : t_init_dist(\"init_dist\"), t_trace_back(\"trace_back\"),\n        t_rebuild_queue(\"rebuild_queue\"), t_maze_route(\"maze_route\"),\n        t_print_path(\"print_path\"), t_record_path(\"record_path\"),\n        
t_mazeloop(\"mazeloop\") {}\n};\n\nusing netArray = galois::LargeArray<Net>;\nusing Graph    = galois::graphs::MorphGraph<int, void, true>;\nusing GNode    = Graph::GraphNode;\n\nvoid init_dist(const Info info, nodeArray& nodelist) {\n  galois::on_each([&](const unsigned tid, const unsigned numT) {\n    unsigned total_nodes = (info.grid_x - 1) * (info.grid_y - 1);\n    unsigned start       = total_nodes / numT * tid;\n    unsigned end =\n        (tid == numT - 1) ? total_nodes : total_nodes / numT * (tid + 1);\n    for (unsigned i = start; i < end; i++)\n      nodelist[i].reset();\n  });\n\n  /*for(int y = 0; y < info.grid_y - 1; y++)\n  {\n      for(int x = 0; x < info.grid_x - 1; x++)\n      {\n          nodelist[y * (info.grid_x - 1) + x].reset();\n      }\n  }*/\n}\n\nvoid trace_back(const Info info, nodeArray& nodelist, int nodeid, int netid) {\n  Node* dst = &nodelist[nodeid];\n  int srcid;\n  int dstid          = nodeid;\n  int prev_direction = -1;\n  while (dst->dist != 0) {\n    int tile_y = dstid / (info.grid_x - 1);\n    int tile_x = dstid % (info.grid_x - 1);\n    // cout << \"trace back:\"<< dstid << \"(\"<< tile_x << \",\" << tile_y << \") \" <<\n    // \"dist: \" << nodelist[dstid].dist << \" \"; cout << dst->in_direction[0] <<\n    // dst->in_direction[1] << dst->in_direction[2] <<\n    // dst->in_direction[3]<<endl;\n\n    int rand_offset   = (prev_direction >= 0) ? prev_direction : rand() % 4;\n    int end_direction = -1;\n    for (int i = 0; i < 4; i++) {\n      if (dst->in_direction[(i + rand_offset) % 4]) {\n        end_direction  = (i + rand_offset) % 4;\n        prev_direction = end_direction;\n        break;\n      }\n    }\n    if (end_direction >= 0) {\n      for (int i = 0; i < 4; i++) {\n        dst->in_direction[i] = (i == end_direction) ? 
1 : 0;\n      }\n    } else {\n      cout << \" error in end direction reset\" << endl;\n      exit(1);\n    }\n\n    switch (end_direction) {\n    case 0:\n      srcid = dstid + 1;\n      break;\n    case 1:\n      srcid = dstid + info.grid_x - 1;\n      break;\n    case 2:\n      srcid = dstid - 1;\n      break;\n    case 3:\n      srcid = dstid - (info.grid_x - 1);\n      break;\n    default: {\n\n      cout << \"error in trace back direction: end_direction = \" << end_direction\n           << \" tile: \" << tile_x << \",\" << tile_y;\n      cout << \" netid: \" << netid << endl;\n\n      cout << \"direction:\" << dst->in_direction[0]\n           << \" distance: \" << nodelist[dstid + 1].dist << endl;\n      cout << \"direction:\" << dst->in_direction[1]\n           << \" distance: \" << nodelist[dstid + info.grid_x - 1].dist << endl;\n      cout << \"direction:\" << dst->in_direction[2]\n           << \" distance: \" << nodelist[dstid - 1].dist << endl;\n      cout << \"direction:\" << dst->in_direction[3]\n           << \" distance: \" << nodelist[dstid - (info.grid_x - 1)].dist << endl;\n\n      // exit(1);\n    }\n    }\n\n    if (srcid < 0 || srcid > info.num_tiles - 1) {\n      cout << \"error in trace back srcid\" << endl;\n      // exit(1);\n    }\n    nodelist[dstid].dist = 0;\n    dstid                = srcid;\n    dst                  = &nodelist[dstid];\n  }\n}\n\nvoid rebuild_queue(\n    const Info info, nodeArray& nodelist, galois::InsertBag<int>& allNodeBag,\n    galois::InsertBag<int>& initBag,\n    bool last_pin) // actually this src is the dst just connected to the net\n{\n  /*galois::do_all(galois::iterate(allNodeBag),\n      [&] (const auto nodeid)\n      {\n              int neighborid;\n              if(nodelist[nodeid].dist == 0)\n              {\n                  //cout<<\"rebuild queue: (\"<< nodeid %(info.grid_x - 1) << \",\"\n     << nodeid/(info.grid_x - 1) <<\")\"<< endl; neighborid = nodeid + 1;\n                  if(neighborid <= 
info.num_tiles - 1 &&\n     nodelist[neighborid].dist == 0 && nodelist[neighborid].in_direction[2])\n                  {\n                      nodelist[nodeid].in_direction[0] = 1;\n                  }\n\n                  neighborid = nodeid - 1;\n                  if(neighborid >= 0 && nodelist[neighborid].dist == 0 &&\n     nodelist[neighborid].in_direction[0])\n                  {\n                      nodelist[nodeid].in_direction[2] = 1;\n                  }\n\n                  neighborid = nodeid + info.grid_x - 1;\n                  if(neighborid <= info.num_tiles - 1 &&\n     nodelist[neighborid].dist == 0 && nodelist[neighborid].in_direction[3])\n                  {\n                      nodelist[nodeid].in_direction[1] = 1;\n                  }\n\n                  neighborid = nodeid - (info.grid_x - 1);\n                  if(neighborid >= 0 && nodelist[neighborid].dist == 0 &&\n     nodelist[neighborid].in_direction[1])\n                  {\n                      nodelist[nodeid].in_direction[3] = 1;\n                  }\n                  if(!last_pin)\n                      initBag.push(nodeid);\n\n              }\n              else\n              {\n                  nodelist[nodeid].reset();\n              }\n\n      },\n      galois::steal(),\n      galois::chunk_size<2048>()\n      );*/\n  galois::on_each([&](const unsigned tid, const unsigned numT) {\n    unsigned total_nodes = (info.grid_x - 1) * (info.grid_y - 1);\n    unsigned start       = total_nodes / numT * tid;\n    unsigned end =\n        (tid == numT - 1) ? 
total_nodes : total_nodes / numT * (tid + 1);\n\n    for (unsigned nodeid = start; nodeid < end; nodeid++) {\n      int neighborid;\n      if (nodelist[nodeid].dist == 0) {\n        // cout<<\"rebuild queue: (\"<< nodeid %(info.grid_x - 1) << \",\" <<\n        // nodeid/(info.grid_x - 1) <<\")\"<< endl;\n        neighborid = nodeid + 1;\n        if (neighborid <= info.num_tiles - 1 &&\n            nodelist[neighborid].dist == 0 &&\n            nodelist[neighborid].in_direction[2]) {\n          nodelist[nodeid].in_direction[0] = 1;\n        }\n\n        neighborid = nodeid - 1;\n        if (neighborid >= 0 && nodelist[neighborid].dist == 0 &&\n            nodelist[neighborid].in_direction[0]) {\n          nodelist[nodeid].in_direction[2] = 1;\n        }\n\n        neighborid = nodeid + info.grid_x - 1;\n        if (neighborid <= info.num_tiles - 1 &&\n            nodelist[neighborid].dist == 0 &&\n            nodelist[neighborid].in_direction[3]) {\n          nodelist[nodeid].in_direction[1] = 1;\n        }\n\n        neighborid = nodeid - (info.grid_x - 1);\n        if (neighborid >= 0 && nodelist[neighborid].dist == 0 &&\n            nodelist[neighborid].in_direction[1]) {\n          nodelist[nodeid].in_direction[3] = 1;\n        }\n        if (!last_pin)\n          initBag.push(nodeid);\n\n      } else {\n        nodelist[nodeid].reset();\n      }\n    }\n  });\n\n  /*for(int nodeid = 0; nodeid < info.num_tiles; nodeid++)\n  {\n      int neighborid;\n      if(nodelist[nodeid].dist == 0)\n      {\n          //cout<<\"rebuild queue: (\"<< nodeid %(info.grid_x - 1) << \",\" <<\n  nodeid/(info.grid_x - 1) <<\")\"<< endl; neighborid = nodeid + 1; if(neighborid\n  <= info.num_tiles - 1 && nodelist[neighborid].dist == 0 &&\n  nodelist[neighborid].in_direction[2])\n          {\n              nodelist[nodeid].in_direction[0] = 1;\n          }\n\n          neighborid = nodeid - 1;\n          if(neighborid >= 0 && nodelist[neighborid].dist == 0 &&\n  
nodelist[neighborid].in_direction[0])\n          {\n              nodelist[nodeid].in_direction[2] = 1;\n          }\n\n          neighborid = nodeid + info.grid_x - 1;\n          if(neighborid <= info.num_tiles - 1 && nodelist[neighborid].dist == 0\n  && nodelist[neighborid].in_direction[3])\n          {\n              nodelist[nodeid].in_direction[1] = 1;\n          }\n\n          neighborid = nodeid - (info.grid_x - 1);\n          if(neighborid >= 0 && nodelist[neighborid].dist == 0 &&\n  nodelist[neighborid].in_direction[1])\n          {\n              nodelist[nodeid].in_direction[3] = 1;\n          }\n          if(!last_pin)\n              initBag.push(nodeid);\n\n      }\n      else\n      {\n          nodelist[nodeid].reset();\n      }\n  }*/\n}\n\ninline int horizontal_min_spacing(const Info info, edgeArray& h_edge,\n                                  int nodeid) {\n  for (int i = 0; i < info.num_layers; i++) {\n    // if(v_edge[nodeid].utilization < v_edge[nodeid].capacity)\n    if (info.h_capacity[i] != 0)\n      return info.min_spacing[i];\n  }\n  cout << \"error in finding horizontal min spacing!\" << endl;\n  exit(1);\n}\n\ninline int vertical_min_spacing(const Info info, edgeArray& v_edge,\n                                int nodeid) {\n  for (int i = 0; i < info.num_layers; i++) {\n    // if(v_edge[nodeid].utilization < v_edge[nodeid].capacity)\n    if (info.h_capacity[i] != 0)\n      return info.min_spacing[i];\n  }\n  cout << \"error in finding horizontal min spacing!\" << endl;\n  exit(1);\n}\n\ninline int horizontal_layer(const Info info, edgeArray& h_edge, int nodeid) {\n  for (int i = 0; i < info.num_layers; i++) {\n    // if(v_edge[nodeid].utilization < v_edge[nodeid].capacity)\n    if (info.h_capacity[i] != 0)\n      return i + 1;\n  }\n  cout << \"error in finding horizontal layer!\" << endl;\n  exit(1);\n}\n\ninline int vertical_layer(const Info info, edgeArray& v_edge, int nodeid) {\n  for (int i = 0; i < info.num_layers; i++) {\n    if 
(info.v_capacity[i] != 0)\n      return i + 1;\n  }\n  cout << \"error in finding horizontal layer!\" << endl;\n  exit(1);\n}\n\nvoid record_path(const Info info, nodeArray& nodelist, edgeArray& h_edge,\n                 edgeArray& v_edge, Net& net) {\n  for (int nodeid = 0; nodeid < info.num_tiles; nodeid++) {\n    int neighborid;\n    Node& node = nodelist[nodeid];\n    if (nodelist[nodeid].dist == 0) {\n      // cout<< nodeid << \" \" << nodeid % (info.grid_x - 1) << \" \" << nodeid\n      // /(info.grid_x - 1) << \" \" << node.in_direction[0] <<\n      // node.in_direction[1] << node.in_direction[2] << node.in_direction[3] <<\n      // endl;\n      neighborid = nodeid + 1;\n      if (nodeid % (info.grid_x - 1) != info.grid_x - 2 &&\n          nodelist[neighborid].dist == 0 && node.in_direction[0] &&\n          nodelist[neighborid].in_direction[2])\n      //&& h_edge[nodeid].utilization + net.min_width +\n      // horizontal_min_spacing(info, h_edge, nodeid) <=\n      // h_edge[nodeid].capacity)\n      {\n        Path* path       = new Path;\n        path->src_tile_x = nodeid % (info.grid_x - 1);\n        path->src_tile_y = nodeid / (info.grid_x - 1);\n        path->dst_tile_x = nodeid % (info.grid_x - 1) + 1;\n        path->dst_tile_y = nodeid / (info.grid_x - 1);\n\n        path->src_layer = horizontal_layer(info, h_edge, nodeid);\n        path->dst_layer = path->src_layer;\n\n        h_edge[nodeid].utilization +=\n            net.min_width + horizontal_min_spacing(info, h_edge, nodeid);\n        h_edge[nodeid].compute_cost();\n        /*h_edge[nodeid].cost = 1 + HEIGHT / ( 1 + exp(- K *\n        (h_edge[nodeid].utilization - h_edge[nodeid].capacity)));\n        if(h_edge[nodeid].utilization > h_edge[nodeid].capacity)\n            h_edge[nodeid].cost += HEIGHT/ slope *\n        ((float)(h_edge[nodeid].utilization - h_edge[nodeid].capacity));*/\n        net.pathlist.push_back(*path);\n        net.edgelist.push_back(&h_edge[nodeid]);\n        /*{\n            
cout<< \"utilization exceeds capacity in h net \" << net.id << \" node:\n        \" << path->src_tile_x << \",\" << path->src_tile_y << endl; cout << \" \"<<\n        h_edge[nodeid].utilization << \"/\" << h_edge[nodeid].capacity << endl;\n            exit(1);\n        }*/\n      }\n\n      neighborid = nodeid + info.grid_x - 1;\n      if (neighborid <= info.num_tiles - 1 && nodelist[neighborid].dist == 0 &&\n          node.in_direction[1] && nodelist[neighborid].in_direction[3])\n      //&& v_edge[nodeid].utilization + net.min_width +\n      // vertical_min_spacing(info, v_edge, nodeid) <= v_edge[nodeid].capacity)\n      {\n        Path* path       = new Path;\n        path->src_tile_x = nodeid % (info.grid_x - 1);\n        path->src_tile_y = nodeid / (info.grid_x - 1);\n        path->dst_tile_x = nodeid % (info.grid_x - 1);\n        path->dst_tile_y = nodeid / (info.grid_x - 1) + 1;\n\n        path->src_layer = vertical_layer(info, v_edge, nodeid);\n        path->dst_layer = path->src_layer;\n\n        v_edge[nodeid].utilization +=\n            net.min_width + vertical_min_spacing(info, v_edge, nodeid);\n        v_edge[nodeid].compute_cost();\n        /*v_edge[nodeid].cost = 1 + HEIGHT / ( 1 + exp(- K *\n        (v_edge[nodeid].utilization - v_edge[nodeid].capacity)));\n        if(v_edge[nodeid].utilization > v_edge[nodeid].capacity)\n            v_edge[nodeid].cost += HEIGHT/ slope *\n        ((float)(v_edge[nodeid].utilization - v_edge[nodeid].capacity));*/\n        net.pathlist.push_back(*path);\n        net.edgelist.push_back(&v_edge[nodeid]);\n\n        /*{\n            cout<< \"utilization exceeds capacity in v net \" << net.id << \" node:\n        \" << path->src_tile_x << \",\" << path->src_tile_y << endl; cout << \" \"<<\n        v_edge[nodeid].utilization << \"/\" << v_edge[nodeid].capacity << endl;\n            exit(1);\n        }*/\n      }\n\n      if ((nodelist[nodeid].in_direction[0] ||\n           nodelist[nodeid].in_direction[2] ||\n           
nodelist[nodeid].pin_layer ==\n               horizontal_layer(info, h_edge, nodeid)) &&\n          (nodelist[nodeid].in_direction[1] ||\n           nodelist[nodeid].in_direction[3] ||\n           nodelist[nodeid].pin_layer ==\n               vertical_layer(info, v_edge, nodeid))) {\n        Path* path       = new Path;\n        path->src_tile_x = nodeid % (info.grid_x - 1);\n        path->src_tile_y = nodeid / (info.grid_x - 1);\n        path->dst_tile_x = nodeid % (info.grid_x - 1);\n        path->dst_tile_y = nodeid / (info.grid_x - 1);\n\n        path->src_layer = 1;\n        path->dst_layer = 2; // only for 2D\n\n        net.pathlist.push_back(*path);\n        net.nodelist.push_back(&nodelist[nodeid]);\n      }\n    } else\n      nodelist[nodeid].reset();\n  }\n}\nvoid printnode(const Info info, nodeArray& nodelist, int node_x, int node_y) {\n  int nodeid = node_x + (info.grid_x - 1) * node_y;\n  Node& node = nodelist[nodeid];\n  cout << \"node: \" << nodeid << \"(\" << node_x << \",\" << node_y << \") \"\n       << node.in_direction[0] << node.in_direction[1] << node.in_direction[2];\n  cout << node.in_direction[3] << endl;\n}\n\nvoid print_capacity(const Info info, edgeArray& v_edge, edgeArray& h_edge) {\n  for (int i = 0; i < (info.grid_x - 1) * (info.grid_y - 1); i++) {\n    if (v_edge[i].utilization > 0)\n      cout << \"utilization at v edge: (\" << i % (info.grid_x - 1) << \",\"\n           << i / (info.grid_x - 1) << \") \" << v_edge[i].utilization << endl;\n  }\n  for (int i = 0; i < (info.grid_x - 1) * (info.grid_y - 1); i++) {\n    if (h_edge[i].utilization > 0)\n      cout << \"utilization at h edge: (\" << i % (info.grid_x - 1) << \",\"\n           << i / (info.grid_x - 1) << \") \" << h_edge[i].utilization << endl;\n  }\n}\n\nstruct UpdateRequestIndexer {\n  nodeArray& nodelist;\n\n  unsigned int operator()(const int& req) const {\n    unsigned int t = (unsigned int)(nodelist[req].dist) / OBIM_delta;\n    return t;\n  }\n};\n\nvoid 
mazeroute(const Info info, nodeArray& nodelist,\n               galois::InsertBag<int>& allNodeBag, edgeArray& v_edge,\n               edgeArray& h_edge, Net* netlist, Graph& g,\n               std::vector<GNode>& node_GNode, timerstruct& timer) {\n  // std::queue<int> node_queue; // store nodeid\n  // int reference_dist;\n  std::atomic<float> reference_dist;\n\n  namespace gwl = galois::worklists;\n  using dChunk  = gwl::ChunkFIFO<FIFO_CHUNK_SIZE>;\n  using OBIM    = gwl::OrderedByIntegerMetric<UpdateRequestIndexer, dChunk>;\n\n  int num_reroute     = 0;\n  int sweep_direction = rand() % 2;\n  for (int netid = rand() % (info.num_nets), cnt = 0; cnt < info.num_nets;\n       cnt++) {\n    if (sweep_direction)\n      netid = (netid + 1) % info.num_nets;\n    else\n      netid = (netid == 0) ? (info.num_nets - 1) : (netid - 1);\n\n    if (netid % 10000 == 0)\n      cout << \"net: \" << netid << \" \" << netlist[netid].id << endl;\n    Net& net = netlist[netid];\n    if (net.num_pins > 1000 ||\n        !net.reroute) // contest says no need to route it for now, but we have\n                      // to resolve it\n      continue;\n    num_reroute++;\n    /*if(net.min_width != 1)\n        cout<<\"net: \" << net.id << \" minwidth: \" << net.min_width << endl;*/\n    bool debug = false;\n    /*if(netid >= 20000)\n    {\n        cout<<\"remove from netid\"<<net.id<<endl;\n        break;\n    }*/\n    /*if(netid == 11285)\n        debug = true;*/\n    timer.t_init_dist.start();\n    init_dist(info, nodelist);\n    timer.t_init_dist.stop();\n\n    bool last_pin = false;\n    galois::InsertBag<int> initBag;\n    for (int pinid = 0; pinid < net.num_pins; pinid++) {\n      Pin& pin = net.pinlist[pinid];\n\n      int nodeid     = pin.tile_y * (info.grid_x - 1) + pin.tile_x;\n      Node& node     = nodelist[nodeid];\n      node.pin_layer = pin.layer;\n      if (debug == true)\n        cout << \"pin: \" << pinid << \" tile: \" << pin.tile_x << \" \" << pin.tile_y\n             << 
endl;\n\n      reference_dist.store(DIST_INFINITY);\n\n      if (pinid == net.num_pins - 1)\n        last_pin = true;\n\n      if (pinid == 0) {\n        node.dist = 0;\n\n        if (info.v_capacity[pin.layer - 1] != 0) // vertical direction layer\n        {\n          node.in_direction[1] = 1;\n          node.in_direction[3] = 1;\n        } else if (info.h_capacity[pin.layer - 1] !=\n                   0) // herizontal direction layer\n        {\n          node.in_direction[0] = 1;\n          node.in_direction[2] = 1;\n        } else\n          cout << \"error in init qnode\" << endl;\n\n        // node_queue.push(nodeid);\n        initBag.push(nodeid);\n        /*int tile_x = nodeid % (info.grid_x - 1);\n        int tile_y = nodeid / (info.grid_x - 1);\n        cout<<\"push pin 0 :\" << tile_x << \" \" << tile_y << endl;*/\n\n      } else if (node.dist != 0) {\n        timer.t_mazeloop.start();\n        galois::for_each(\n            galois::iterate(initBag),\n            [&](const auto& srcid, auto& ctx)\n\n            {\n              GNode src_GNode = node_GNode[srcid];\n              float local_reference_dist =\n                  reference_dist; // or reference_dist.load()\n              Node& src = nodelist[srcid];\n\n              /*if(debug)\n              {\n                  int tile_x = srcid % (info.grid_x - 1);\n                  int tile_y = srcid / (info.grid_x - 1);\n                  cout<<\"src: \\t\"<< tile_x << \"\\t\"<< tile_y <<\"\\tdistance:\\t\" <<\n              nodelist[srcid].dist << \"\\tindirection: \"; cout<<\n              src.in_direction[0] <<\n              src.in_direction[1]<<src.in_direction[2]<<src.in_direction[3]<<endl;\n                  cout<<\"to 0: \" << h_edge[srcid].utilization << \"/\"<<\n              h_edge[srcid].capacity <<endl; cout<<\"to 1: \" <<\n              v_edge[srcid].utilization << \"/\"<< v_edge[srcid].capacity <<endl;\n                  cout<<\"to 2: \" << h_edge[srcid - 1].utilization << \"/\"<<\n    
          h_edge[srcid - 1].capacity <<endl; cout<<\"to 3: \" << v_edge[srcid\n              - (info.grid_x - 1)].utilization << \"/\"<< v_edge[srcid -\n              (info.grid_x - 1)].capacity <<endl;\n              }*/\n\n              // int edge_cnt = 0;\n              int dstid;\n              for (auto e : g.edges(src_GNode)) {\n                auto dst_GNode = g.getEdgeDst(e);\n                dstid          = g.getData(dst_GNode);\n                if (dstid == srcid + 1) {\n                  Node& dst    = nodelist[dstid];\n                  Edge& r_edge = h_edge[srcid];\n\n                  if (src.in_direction[2] &&\n                      src.dist + r_edge.cost <= dst.dist &&\n                      src.dist + r_edge.cost <=\n                          local_reference_dist) // previous node is x - 1\n                  {\n                    bool updated = dst.update_direction(src.dist + r_edge.cost,\n                                                        dst.dist, 2);\n                    dst.dist     = src.dist + r_edge.cost;\n\n                    if (dstid == nodeid) // if dst is one of the destination\n                                         // pins\n                      local_reference_dist = dst.dist;\n                    else if (updated)\n                      ctx.push(dstid);\n                  } else if ((src.in_direction[1] || src.in_direction[3]) &&\n                             src.dist + r_edge.cost + VIA_COST <= dst.dist &&\n                             src.dist + r_edge.cost + VIA_COST <=\n                                 local_reference_dist) {\n                    bool updated = dst.update_direction(\n                        src.dist + r_edge.cost + VIA_COST, dst.dist, 2);\n                    dst.dist = src.dist + r_edge.cost + VIA_COST;\n\n                    if (dstid == nodeid) // if dst is one of the destination\n                                         // pins\n                      local_reference_dist = dst.dist;\n              
      else if (updated)\n                      ctx.push(dstid);\n                  }\n\n                }\n\n                else if (dstid == srcid - 1) {\n                  Node& dst    = nodelist[dstid];\n                  Edge& l_edge = h_edge[srcid - 1];\n\n                  if (src.in_direction[0] &&\n                      src.dist + l_edge.cost <= dst.dist &&\n                      src.dist + l_edge.cost <=\n                          local_reference_dist) // previous node is x + 1\n                  {\n                    bool updated = dst.update_direction(src.dist + l_edge.cost,\n                                                        dst.dist, 0);\n                    dst.dist     = src.dist + l_edge.cost;\n\n                    if (dstid == nodeid) // if dst is one of the destination\n                                         // pins\n                      local_reference_dist = dst.dist;\n                    else if (updated)\n                      ctx.push(dstid);\n                  } else if ((src.in_direction[1] || src.in_direction[3]) &&\n                             src.dist + l_edge.cost + VIA_COST <= dst.dist &&\n                             src.dist + l_edge.cost + VIA_COST <=\n                                 local_reference_dist) {\n                    bool updated = dst.update_direction(\n                        src.dist + l_edge.cost + VIA_COST, dst.dist, 0);\n                    dst.dist = src.dist + l_edge.cost + VIA_COST;\n\n                    if (dstid == nodeid) // if dst is one of the destination\n                                         // pins\n                      local_reference_dist = dst.dist;\n                    else if (updated)\n                      ctx.push(dstid);\n                  }\n                }\n\n                else if (dstid == srcid + info.grid_x - 1) {\n                  Node& dst    = nodelist[dstid];\n                  Edge& t_edge = v_edge[srcid];\n\n                  if (src.in_direction[3] &&\n          
            src.dist + t_edge.cost <= dst.dist &&\n                      src.dist + t_edge.cost <=\n                          local_reference_dist) // previous node is y - 1\n                  {\n                    bool updated = dst.update_direction(src.dist + t_edge.cost,\n                                                        dst.dist, 3);\n                    dst.dist     = src.dist + t_edge.cost;\n\n                    if (dstid == nodeid) // if dst is one of the destination\n                                         // pins\n                      local_reference_dist = dst.dist;\n                    else if (updated)\n                      ctx.push(dstid);\n                  } else if ((src.in_direction[0] || src.in_direction[2]) &&\n                             src.dist + t_edge.cost + VIA_COST <= dst.dist &&\n                             src.dist + t_edge.cost + VIA_COST <=\n                                 local_reference_dist) {\n                    bool updated = dst.update_direction(\n                        src.dist + t_edge.cost + VIA_COST, dst.dist, 3);\n                    dst.dist = src.dist + t_edge.cost + VIA_COST;\n\n                    if (dstid == nodeid) // if dst is one of the destination\n                                         // pins\n                      local_reference_dist = dst.dist;\n                    else if (updated)\n                      ctx.push(dstid);\n                  }\n                } else if (dstid == srcid - (info.grid_x - 1)) {\n                  Node& dst    = nodelist[dstid];\n                  Edge& b_edge = v_edge[srcid - (info.grid_x - 1)];\n\n                  if (src.in_direction[1] &&\n                      src.dist + b_edge.cost <= dst.dist &&\n                      src.dist + b_edge.cost <=\n                          local_reference_dist) // previous node is y + 1\n                  {\n                    bool updated = dst.update_direction(src.dist + b_edge.cost,\n                                       
                 dst.dist, 1);\n                    dst.dist     = src.dist + b_edge.cost;\n\n                    if (dstid == nodeid) // if dst is one of the destination\n                                         // pins\n                      local_reference_dist = dst.dist;\n                    else if (updated)\n                      ctx.push(dstid);\n                  } else if ((src.in_direction[0] || src.in_direction[2]) &&\n                             src.dist + b_edge.cost + VIA_COST <= dst.dist &&\n                             src.dist + b_edge.cost + VIA_COST <=\n                                 local_reference_dist) {\n                    bool updated = dst.update_direction(\n                        src.dist + b_edge.cost + VIA_COST, dst.dist, 1);\n                    dst.dist = src.dist + b_edge.cost + VIA_COST;\n\n                    if (dstid == nodeid) // if dst is one of the destination\n                                         // pins\n                      local_reference_dist = dst.dist;\n                    else if (updated)\n                      ctx.push(dstid);\n                  }\n                } else {\n                  cout << \"there must be something wrong in constructing the \"\n                          \"graph\"\n                       << endl;\n                  exit(1);\n                }\n              }\n\n              galois::atomicMin(reference_dist, local_reference_dist);\n            },\n            galois::wl<dChunk>(),\n            // galois::wl<OBIM>(UpdateRequestIndexer{nodelist}),\n            galois::steal(), galois::chunk_size<MAZE_CHUNK_SIZE>());\n        timer.t_mazeloop.stop();\n\n        // while queue is empty, nodeid is reached.\n        timer.t_trace_back.start();\n        trace_back(info, nodelist, nodeid, net.id);\n        if (info.v_capacity[pin.layer - 1] != 0) // vertical direction layer\n        {\n          node.in_direction[1] = 1;\n          node.in_direction[3] = 1;\n        } else if 
(info.h_capacity[pin.layer - 1] !=\n                   0) // horizontal direction layer\n        {\n          node.in_direction[0] = 1;\n          node.in_direction[2] = 1;\n        } else\n          cout << \"error in init qnode\" << endl;\n        timer.t_trace_back.stop();\n\n        timer.t_rebuild_queue.start();\n        rebuild_queue(info, nodelist, allNodeBag, initBag, last_pin);\n        timer.t_rebuild_queue.stop();\n        // printnode(info, nodelist, 99, 191);\n        /*if(last_pin && !node_queue.empty())\n        {\n            cout<<\"queue is not empty when finish a net\"<<endl;\n        }*/\n      }\n    }\n    timer.t_record_path.start();\n    record_path(info, nodelist, h_edge, v_edge, net);\n    timer.t_record_path.stop();\n    // print_capacity(info, v_edge, h_edge);\n  }\n  cout << \"num of routed nets :\" << num_reroute << endl;\n}\n\nvoid readfile(ifstream& infile, Info& info, nodeArray& nodelist,\n              edgeArray& v_edge, edgeArray& h_edge, Net*& netlist, Graph& g,\n              std::vector<GNode>& node_GNode) {\n  string temp1, temp2;\n  infile >> temp1 >> info.grid_x >> info.grid_y >> info.num_layers;\n  // cout << info.grid_x << info.grid_y << info.layer << endl;\n  info.num_tiles = (info.grid_x - 1) * (info.grid_y - 1);\n  v_edge.allocateInterleaved(info.num_tiles);\n  h_edge.allocateInterleaved(info.num_tiles);\n  // v_edge.allocateBlocked(info.num_tiles);\n  // h_edge.allocateBlocked(info.num_tiles);\n  int nlayers      = info.num_layers;\n  info.v_capacity  = new int[nlayers];\n  info.h_capacity  = new int[nlayers];\n  info.min_width   = new int[nlayers];\n  info.min_spacing = new int[nlayers];\n  info.via_spacing = new int[nlayers];\n  infile >> temp1 >> temp2;\n\n  if (temp1 == \"vertical\" && temp2 == \"capacity\") {\n    for (int i = 0; i < nlayers; i++) {\n      infile >> info.v_capacity[i];\n      // cout<<info.v_capacity[i]<<\" \";\n    }\n    // cout<<endl;\n  }\n\n  infile >> temp1 >> temp2;\n  if (temp1 == 
\"horizontal\" && temp2 == \"capacity\") {\n    for (int i = 0; i < nlayers; i++) {\n      infile >> info.h_capacity[i];\n      // cout<<info.h_capacity[i]<<\" \";\n    }\n    // cout<<endl;\n  }\n\n  infile >> temp1 >> temp2;\n  if (temp1 == \"minimum\" && temp2 == \"width\") {\n    for (int i = 0; i < nlayers; i++) {\n      infile >> info.min_width[i];\n      // cout<<info.min_width[i]<<\" \";\n    }\n    // cout<<endl;\n  }\n\n  infile >> temp1 >> temp2;\n  if (temp1 == \"minimum\" && temp2 == \"spacing\") {\n    for (int i = 0; i < nlayers; i++) {\n      infile >> info.min_spacing[i];\n      // cout<<info.min_spacing[i]<<\" \";\n    }\n    // cout<<endl;\n  }\n\n  infile >> temp1 >> temp2;\n  if (temp1 == \"via\" && temp2 == \"spacing\") {\n    for (int i = 0; i < nlayers; i++) {\n      infile >> info.via_spacing[i];\n      // cout<<info.via_spacing[i]<<\" \";\n    }\n    // cout<<endl;\n  }\n\n  infile >> info.lower_left_x >> info.lower_left_y >> info.tile_width >>\n      info.tile_height;\n  // cout << info.lower_left_x << info.lower_left_y << info.tile_width <<\n  // info.tile_height << endl;\n\n  infile >> temp1 >> temp2;\n  if (temp1 == \"num\" && temp2 == \"net\")\n    infile >> info.num_nets;\n\n  netlist = new Net[info.num_nets];\n  for (int i = 0; i < info.num_nets; i++) {\n    infile >> netlist[i].name >> netlist[i].id >> netlist[i].num_pins >>\n        netlist[i].min_width;\n    int num_pins       = netlist[i].num_pins;\n    netlist[i].pinlist = new Pin[num_pins];\n    netlist[i].reroute = true;\n    for (int j = 0; j < netlist[i].num_pins; j++) {\n      infile >> netlist[i].pinlist[j].x >> netlist[i].pinlist[j].y >>\n          netlist[i].pinlist[j].layer;\n      netlist[i].pinlist[j].tile_x =\n          (netlist[i].pinlist[j].x - info.lower_left_x) / info.tile_width;\n      netlist[i].pinlist[j].tile_y =\n          (netlist[i].pinlist[j].y - info.lower_left_y) / info.tile_height;\n    }\n  }\n\n  infile >> info.num_capa_adjust;\n  int src_tile_x, 
src_tile_y, dst_tile_x, dst_tile_y, src_layer, dst_layer,\n      capacity;\n  int total_v_capacity = 0, total_h_capacity = 0;\n  for (int i = 0; i < info.num_layers; i++) {\n    total_v_capacity += info.v_capacity[i];\n    total_h_capacity += info.h_capacity[i];\n  }\n  for (int i = 0; i < (info.grid_x - 1) * (info.grid_y - 2); i++)\n    v_edge[i].capacity = total_v_capacity;\n\n  for (int i = 0; i < (info.grid_x - 1) * (info.grid_y - 1); i++)\n    if (i % (info.grid_x - 1) != info.grid_x - 2)\n      h_edge[i].capacity = total_h_capacity;\n\n  for (int i = 0; i < info.num_capa_adjust; i++) {\n    infile >> src_tile_x >> src_tile_y >> src_layer;\n    infile >> dst_tile_x >> dst_tile_y >> dst_layer;\n    infile >> capacity;\n    if (dst_tile_x - src_tile_x == 1)\n      h_edge[src_tile_y * (info.grid_x - 1) + src_tile_x].capacity = capacity;\n    else if (dst_tile_y - src_tile_y == 1)\n      v_edge[src_tile_y * (info.grid_x - 1) + src_tile_x].capacity = capacity;\n    else {\n      cout << \"error in reading capacity adjustment\" << endl;\n      exit(1);\n    }\n  }\n\n  nodelist.allocateInterleaved(info.num_tiles);\n  node_GNode.resize(info.num_tiles);\n  for (int i = 0; i < info.num_tiles; i++) {\n    GNode n = g.createNode(i);\n    g.addNode(n);\n    node_GNode[i] = n;\n  }\n\n  for (int nodeid = 0; nodeid < info.num_tiles; nodeid++) {\n    GNode src = node_GNode[nodeid];\n    int neighborid;\n    neighborid = nodeid + 1;\n    if (nodeid % (info.grid_x - 1) != info.grid_x - 2) {\n      GNode dst = node_GNode[neighborid];\n      g.addEdge(src, dst);\n      // g.addMultiEdge(src, dst, galois::MethodFlag::UNPROTECTED );\n    }\n\n    neighborid = nodeid - 1;\n    if (nodeid % (info.grid_x - 1) != 0) {\n      GNode dst = node_GNode[neighborid];\n      g.addEdge(src, dst);\n      // g.addMultiEdge(src, dst, galois::MethodFlag::UNPROTECTED );\n    }\n\n    neighborid = nodeid + (info.grid_x - 1);\n    if (neighborid <= info.num_tiles - 1) {\n      GNode dst = 
node_GNode[neighborid];\n      g.addEdge(src, dst);\n      // g.addMultiEdge(src, dst, galois::MethodFlag::UNPROTECTED );\n    }\n\n    neighborid = nodeid - (info.grid_x - 1);\n    if (neighborid >= 0) {\n      GNode dst = node_GNode[neighborid];\n      g.addEdge(src, dst);\n      // g.addMultiEdge(src, dst, galois::MethodFlag::UNPROTECTED );\n    }\n  }\n\n  infile.close();\n}\n\nvoid print_path(const Info info, Net* netlist) {\n  ofstream outfile(\"router.out\");\n  for (int netid = 0; netid < info.num_nets; netid++) {\n    Net& net = netlist[netid];\n    // if(netid != 2)\n    //    continue;\n    outfile << net.name << \" \" << net.id << endl;\n\n    for (std::list<Path>::iterator path = net.pathlist.begin();\n         path != net.pathlist.end(); path++) {\n      // cout<< tile_x<<\" \" << tile_y << endl;\n      int src_cor_x = path->src_tile_x * info.tile_width +\n                      0.5 * info.tile_width + info.lower_left_x;\n      int src_cor_y = path->src_tile_y * info.tile_height +\n                      0.5 * info.tile_height + info.lower_left_y;\n      int src_layer = path->src_layer;\n\n      int dst_cor_x = path->dst_tile_x * info.tile_width +\n                      0.5 * info.tile_width + info.lower_left_x;\n      int dst_cor_y = path->dst_tile_y * info.tile_height +\n                      0.5 * info.tile_height + info.lower_left_y;\n      int dst_layer = path->dst_layer;\n\n      // outfile<< \"(\" << path->src_tile_x << \",\" << path->src_tile_y << \",\" <<\n      // src_layer << \")-(\" << path->dst_tile_x << \",\" << path->dst_tile_y <<\n      // \",\"\n      // << dst_layer << \")\" << endl;\n      outfile << \"(\" << src_cor_x << \",\" << src_cor_y << \",\" << src_layer\n              << \")-(\" << dst_cor_x << \",\" << dst_cor_y << \",\" << dst_layer\n              << \")\" << endl;\n    }\n    outfile << \"!\" << endl;\n  }\n}\n\nstatic const char* name = \"Single Source Shortest Path\";\nstatic const char* desc =\n    \"Computes the 
shortest path from a source node to all nodes in a directed \"\n    \"graph using a modified chaotic iteration algorithm\";\nstatic const char* url = \"single_source_shortest_path\";\n\nnamespace cll = llvm::cl;\nstatic cll::opt<std::string>\n    inputFilename(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\n\nint check_overflow(const Info info, nodeArray& nodelist, edgeArray& v_edge,\n                   edgeArray& h_edge, Net* netlist, bool print) {\n  int total_of = 0;\n  for (int i = 0; i < (info.grid_x - 1) * (info.grid_y - 1); i++) {\n    if (i % (info.grid_x - 1) != info.grid_x - 2) {\n      if (h_edge[i].utilization <= h_edge[i].capacity) {\n        h_edge[i].scale = 1.0;\n      } else {\n        h_edge[i].scale =\n            (float)h_edge[i].utilization / (float)h_edge[i].capacity;\n        total_of += h_edge[i].utilization - h_edge[i].capacity;\n        h_edge[i].utilization = 0;\n      }\n      // if(i % (info.grid_x - 1) == 121 && i / (info.grid_x - 1) == 86)\n      if (print) {\n        cout << \"h_edge \" << i % (info.grid_x - 1) << \" \"\n             << i / (info.grid_x - 1) << \" overflow: \" << h_edge[i].utilization\n             << \"/\" << h_edge[i].capacity << \" = \" << h_edge[i].scale << endl;\n      }\n    }\n  }\n\n  for (int i = 0; i < (info.grid_x - 1) * (info.grid_y - 2); i++) {\n    if (v_edge[i].utilization <= v_edge[i].capacity) {\n      v_edge[i].scale = 1.0;\n    } else {\n      v_edge[i].scale =\n          (float)v_edge[i].utilization / (float)v_edge[i].capacity;\n      total_of += v_edge[i].utilization - v_edge[i].capacity;\n      if (print) {\n        cout << \"v_edge \" << i % (info.grid_x - 1) << \" \"\n             << i / (info.grid_x - 1) << \" \" << i\n             << \" overflow: \" << v_edge[i].utilization << \"/\"\n             << v_edge[i].capacity << \" = \" << v_edge[i].scale << endl;\n      }\n      v_edge[i].utilization = 0;\n    }\n  }\n\n  cout << \"total overflow is \" << total_of << endl;\n  return 
total_of;\n}\n\nvoid reset_net(const Info info, nodeArray& nodelist, edgeArray& v_edge,\n               edgeArray& h_edge, Net* netlist) {\n  int num_reroute = 0;\n  for (int i = 0; i < info.num_nets; i++) {\n    bool no_need_reroute = true;\n    for (auto edge_iter = netlist[i].edgelist.begin();\n         edge_iter != netlist[i].edgelist.end(); edge_iter++) {\n      if ((*edge_iter)->scale > 1) {\n        num_reroute++;\n        no_need_reroute = false;\n        break;\n      }\n    }\n    if (!no_need_reroute) {\n      for (auto edge_iter = netlist[i].edgelist.begin();\n           edge_iter != netlist[i].edgelist.end(); edge_iter++) {\n        if ((*edge_iter)->scale > 1) {\n\n        } else {\n          (*edge_iter)->utilization -= (netlist[i].min_width + 1);\n          (*edge_iter)->compute_cost();\n        }\n      }\n      netlist[i].reroute = true;\n      netlist[i].pathlist.clear();\n      netlist[i].edgelist.clear();\n      netlist[i].nodelist.clear();\n    } else\n      netlist[i].reroute = false;\n  }\n  cout << \"num of nets to reroute: \" << num_reroute << endl;\n}\n\nint main(int argc, char* argv[]) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, argv[1]);\n  /*if(argc != 2)\n  {\n      cout<<\"usage: ./a.out [input]\"<<endl;\n      return 0;\n  }*/\n  ifstream infile(argv[1]);\n\n  if (!infile.is_open()) {\n    cout << \"unable to open input file\" << endl;\n    return 0;\n  }\n  Info info;\n\n  Graph graph;\n\n  galois::preAlloc(numThreads * 2);\n  nodeArray nodelist;\n  edgeArray v_edge;\n  edgeArray h_edge;\n  Net* netlist;\n  std::vector<GNode> node_GNode;\n  const int MAX_ITERATION = 1;\n  readfile(infile, info, nodelist, v_edge, h_edge, netlist, graph, node_GNode);\n  infile.close();\n  // cout<<\"reading input file done\"<<endl;\n  timerstruct Time;\n  // bool print = false;\n  galois::InsertBag<int> allNodeBag;\n  // galois::on_each(\n  //        [&] (const unsigned tid, const unsigned numT)\n  {\n    unsigned 
total_nodes = (info.grid_x - 1) * (info.grid_y - 1);\n    unsigned start       = 0;   // total_nodes / numT * tid;\n    unsigned end = total_nodes; //(tid == numT - 1)? total_nodes : total_nodes /\n                                // numT * (tid + 1);\n    for (unsigned i = start; i < end; i++)\n      allNodeBag.push(i);\n  }\n  //        );\n\n  for (int i = 0; i < MAX_ITERATION; i++) {\n    cout << endl;\n    cout << \"iteration: \" << i << \" K = \" << K << \" HEIGHT = \" << HEIGHT\n         << \" slope = \" << slope << endl;\n    if (i != 0)\n      reset_net(info, nodelist, v_edge, h_edge, netlist);\n\n    Time.t_maze_route.start();\n\n    galois::runtime::profileVtune(\n        [&](void) {\n          mazeroute(info, nodelist, allNodeBag, v_edge, h_edge, netlist, graph,\n                    node_GNode, Time);\n        },\n        \"mazeroute\");\n    Time.t_maze_route.stop();\n    bool print = false; // Martin\n    if (i == MAX_ITERATION - 1)\n      print = true;\n    int overflow =\n        check_overflow(info, nodelist, v_edge, h_edge, netlist, print);\n    if (overflow == 0)\n      break;\n\n    if (K < 1)\n      K += K_STEP;\n    else\n      K = 2.0;\n    HEIGHT += HEIGHT_STEP;\n    if (i > 20)\n      HEIGHT_STEP = 4;\n  }\n\n  Time.t_print_path.start();\n  print_path(info, netlist);\n  Time.t_print_path.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/rand-pts.c",
    "content": "#include <stdio.h>\n#include <stdlib.h>\n#include <unistd.h>\n\nint main(int ac, char *av[])\n{\n    int d=10, tmp, i;\n    int PNUM = 0;\n\n    for (i=1; i<ac; i++) {\n        if (strcmp(av[i], \"-r\")==0)  // random\n            srandom((int) getpid());\n        else if (strncmp(av[i], \"-s\", 2)==0)  // set random seed\n            srandom(atoi(av[i]+2));\n        else if (strcmp(av[i], \"-n\")==0)  // print # of points first\n            PNUM=1;\n        else if (sscanf(av[i], \"%d\", &tmp))  // set # of points\n            d = tmp;\n        else {\n            printf(\"Usage: %s [-r] [-s<S>] [-n] [<D>]\\n\", av[0]);\n            printf(\"  Output <D> random points \");\n            printf(\"as <D> lines of coordinate pairs.\\n\");\n            printf(\"  Default <D> is 10.\\n\");\n            printf(\"  -r\\t Randomize. Use getpid() as seed.\\n\");\n            printf(\"  -s<S>\\t Set random seed to <S>.\\n\");\n            printf(\"  -n\\t Write <D> first before the random points.\\n\");\n            exit(-1);\n        }\n    }\n    \n    if (PNUM)\n        printf(\"%d\\n\", d);\n    for (i=1; i<=d; i++)\n        printf(\"%4d %4d\\n\", (int) random()%10000, (int) random()%10000);\n}\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/route.h",
    "content": "#ifndef _ROUTE_H_\n#define _ROUTE_H_\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <time.h>\n#include <math.h>\n#include \"DataType.h\"\n#include \"flute.h\"\n#include \"DataProc.h\"\n#include \"RipUp.h\"\n\n#define SAMEX 0\n#define SAMEY 1\n\nfloat costHVH[XRANGE]; // Horizontal first Z\nfloat costVHV[YRANGE]; // Vertical first Z\nfloat costH[YRANGE];   // Horizontal segment cost\nfloat costV[XRANGE];   // Vertical segment cost\nfloat costLR[YRANGE];  // Left and right boundary cost\nfloat costTB[XRANGE];  // Top and bottom boundary cost\n\nfloat costHVHtest[YRANGE]; // Vertical first Z\nfloat costVtest[XRANGE];   // Vertical segment cost\nfloat costTBtest[XRANGE];  // Top and bottom boundary cost\n\n#define HCOST 5000;\n\n// estimate the routing by assigning 1 for H and V segments, 0.5 to both\n// possible L for L segments\nvoid estimateOneSeg(Segment* seg) {\n  int i;\n  int ymin, ymax;\n\n  if (seg->y1 < seg->y2) {\n    ymin = seg->y1;\n    ymax = seg->y2;\n  } else {\n    ymin = seg->y2;\n    ymax = seg->y1;\n  }\n\n  // assign 0.5 to both Ls (x1,y1)-(x1,y2) + (x1,y2)-(x2,y2) + (x1,y1)-(x2,y1) +\n  // (x2,y1)-(x2,y2)\n  if (seg->x1 == seg->x2) // a vertical segment\n  {\n    for (i = ymin; i < ymax; i++)\n      v_edges[i * xGrid + seg->x1].est_usage += 1;\n  } else if (seg->y1 == seg->y2) // a horizontal segment\n  {\n    for (i = seg->x1; i < seg->x2; i++)\n      h_edges[seg->y1 * (xGrid - 1) + i].est_usage += 1;\n  } else // a diagonal segment\n  {\n    for (i = ymin; i < ymax; i++) {\n      v_edges[i * xGrid + seg->x1].est_usage += 0.5;\n      v_edges[i * xGrid + seg->x2].est_usage += 0.5;\n    }\n    for (i = seg->x1; i < seg->x2; i++) {\n      h_edges[seg->y1 * (xGrid - 1) + i].est_usage += 0.5;\n      h_edges[seg->y2 * (xGrid - 1) + i].est_usage += 0.5;\n    }\n  }\n}\n\nvoid routeSegV(Segment* seg) {\n  int i;\n  int ymin, ymax;\n\n  if (seg->y1 < seg->y2) {\n    ymin = seg->y1;\n    ymax = seg->y2;\n  } else {\n    ymin = 
seg->y2;\n    ymax = seg->y1;\n  }\n\n  for (i = ymin; i < ymax; i++)\n    v_edges[i * xGrid + seg->x1].est_usage++;\n}\n\nvoid routeSegH(Segment* seg) {\n  int i;\n\n  for (i = seg->x1; i < seg->x2; i++)\n    h_edges[seg->y1 * (xGrid - 1) + i].est_usage++;\n}\n\n// L-route, based on previous L route\nvoid routeSegL(Segment* seg) {\n  int i, grid, grid1;\n  float costL1, costL2, tmp;\n  int ymin, ymax;\n\n  if (seg->y1 < seg->y2) {\n    ymin = seg->y1;\n    ymax = seg->y2;\n  } else {\n    ymin = seg->y2;\n    ymax = seg->y1;\n  }\n\n  if (seg->x1 == seg->x2) // V route\n    routeSegV(seg);\n  else if (seg->y1 == seg->y2) // H route\n    routeSegH(seg);\n  else // L route\n  {\n    costL1 = costL2 = 0;\n\n    for (i = ymin; i < ymax; i++) {\n      grid = i * xGrid;\n      tmp  = v_edges[grid + seg->x1].red + v_edges[grid + seg->x1].est_usage -\n            vCapacity_lb;\n      if (tmp > 0)\n        costL1 += tmp;\n      tmp = v_edges[grid + seg->x2].red + v_edges[grid + seg->x2].est_usage -\n            vCapacity_lb;\n      if (tmp > 0)\n        costL2 += tmp;\n    }\n    grid  = seg->y2 * (xGrid - 1);\n    grid1 = seg->y1 * (xGrid - 1);\n    for (i = seg->x1; i < seg->x2; i++) {\n      tmp = h_edges[grid + i].red + h_edges[grid + i].est_usage - hCapacity_lb;\n      if (tmp > 0)\n        costL1 += tmp;\n      tmp =\n          h_edges[grid1 + i].red + h_edges[grid1 + i].est_usage - hCapacity_lb;\n      if (tmp > 0)\n        costL2 += tmp;\n    }\n\n    printf(\"costL1 is %f, costL2 is %f\\n\", costL1, costL2);\n\n    if (costL1 < costL2) {\n      // two parts (x1, y1)-(x1, y2) and (x1, y2)-(x2, y2)\n      for (i = ymin; i < ymax; i++) {\n        v_edges[i * xGrid + seg->x1].est_usage += 1;\n      }\n      grid = seg->y2 * (xGrid - 1);\n      for (i = seg->x1; i < seg->x2; i++) {\n        h_edges[grid + i].est_usage += 1;\n      }\n      seg->xFirst = FALSE;\n    } // if costL1<costL2\n    else {\n      // two parts (x1, y1)-(x2, y1) and (x2, y1)-(x2, y2)\n      grid 
= seg->y1 * (xGrid - 1);\n      for (i = seg->x1; i < seg->x2; i++) {\n        h_edges[grid + i].est_usage += 1;\n      }\n      for (i = ymin; i < ymax; i++) {\n        v_edges[i * xGrid + seg->x2].est_usage += 1;\n      }\n      seg->xFirst = TRUE;\n    }\n  } // else L route\n}\n\n// First time L-route, based on 0.5-0.5 estimation\nvoid routeSegLFirstTime(Segment* seg) {\n  int i, vedge, hedge;\n  float costL1, costL2, tmp;\n  int ymin, ymax;\n\n  if (seg->y1 < seg->y2) {\n    ymin = seg->y1;\n    ymax = seg->y2;\n  } else {\n    ymin = seg->y2;\n    ymax = seg->y1;\n  }\n\n  costL1 = costL2 = 0;\n\n  for (i = ymin; i < ymax; i++) {\n    vedge = i * xGrid + seg->x1;\n    tmp   = v_edges[vedge].red + v_edges[vedge].est_usage - vCapacity_lb;\n    if (tmp > 0)\n      costL1 += tmp;\n  }\n  for (i = ymin; i < ymax; i++) {\n    vedge = i * xGrid + seg->x2;\n    tmp   = v_edges[vedge].red + v_edges[vedge].est_usage - vCapacity_lb;\n    if (tmp > 0)\n      costL2 += tmp;\n  }\n\n  for (i = seg->x1; i < seg->x2; i++) {\n    hedge = seg->y2 * (xGrid - 1) + i;\n    tmp   = h_edges[hedge].red + h_edges[hedge].est_usage - hCapacity_lb;\n    if (tmp > 0)\n      costL1 += tmp;\n  }\n  for (i = seg->x1; i < seg->x2; i++) {\n    hedge = seg->y1 * (xGrid - 1) + i;\n    tmp   = h_edges[hedge].red + h_edges[hedge].est_usage - hCapacity_lb;\n    if (tmp > 0)\n      costL2 += tmp;\n  }\n\n  if (costL1 < costL2) {\n    // two parts (x1, y1)-(x1, y2) and (x1, y2)-(x2, y2)\n    for (i = ymin; i < ymax; i++) {\n      vedge = i * xGrid + seg->x1;\n      v_edges[vedge].est_usage += 0.5;\n      vedge += seg->x2 - seg->x1;\n      v_edges[vedge].est_usage -= 0.5;\n    }\n    for (i = seg->x1; i < seg->x2; i++) {\n      hedge = seg->y2 * (xGrid - 1) + i;\n      h_edges[hedge].est_usage += 0.5;\n      hedge = seg->y1 * (xGrid - 1) + i;\n      h_edges[hedge].est_usage -= 0.5;\n    }\n    seg->xFirst = FALSE;\n  } else {\n    // two parts (x1, y1)-(x2, y1) and (x2, y1)-(x2, y2)\n    for (i = 
seg->x1; i < seg->x2; i++) {\n      hedge = seg->y1 * (xGrid - 1) + i;\n      h_edges[hedge].est_usage += 0.5;\n      hedge = seg->y2 * (xGrid - 1) + i;\n      h_edges[hedge].est_usage -= 0.5;\n    }\n    for (i = ymin; i < ymax; i++) {\n      vedge = i * xGrid + seg->x2;\n      v_edges[vedge].est_usage += 0.5;\n      vedge += seg->x1 - seg->x2;\n      v_edges[vedge].est_usage -= 0.5;\n    }\n    seg->xFirst = TRUE;\n  }\n}\n\n// route all segments with L, firstTime: TRUE, no previous route, FALSE -\n// previous is L-route\nvoid routeLAll(Bool firstTime) {\n  int i, j;\n\n  if (firstTime) // no previous route\n  {\n    // estimate congestion with 0.5+0.5 L\n    for (i = 0; i < numValidNets; i++) {\n      for (j = seglistIndex[i]; j < seglistIndex[i] + seglistCnt[i]; j++) {\n        estimateOneSeg(&seglist[j]);\n      }\n    }\n    // L route\n    for (i = 0; i < numValidNets; i++) {\n      for (j = seglistIndex[i]; j < seglistIndex[i] + seglistCnt[i]; j++) {\n        // no need to reroute the H or V segs\n        if (seglist[j].x1 != seglist[j].x2 || seglist[j].y1 != seglist[j].y2)\n          routeSegLFirstTime(&seglist[j]);\n      }\n    }\n  } else // previous is L-route\n  {\n    for (i = 0; i < numValidNets; i++) {\n      for (j = seglistIndex[i]; j < seglistIndex[i] + seglistCnt[i]; j++) {\n        // no need to reroute the H or V segs\n        if (seglist[j].x1 != seglist[j].x2 || seglist[j].y1 != seglist[j].y2) {\n          ripupSegL(&seglist[j]);\n          routeSegL(&seglist[j]);\n        }\n      }\n    }\n  }\n}\n\n// L-route, rip-up the previous route according to the ripuptype\n// L-route, rip-up the previous route according to the ripuptype\nvoid newrouteL(int netID, RouteType ripuptype, Bool viaGuided) {\n  int i, j, d, n1, n2, x1, y1, x2, y2, grid, grid1;\n  float costL1 = 0, costL2 = 0, tmp;\n  int ymin, ymax;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  d         = sttrees[netID].deg;\n  treeedges = sttrees[netID].edges;\n  
treenodes = sttrees[netID].nodes;\n\n  // loop for all the tree edges (2*d-3)\n  for (i = 0; i < 2 * d - 3; i++) {\n    if (sttrees[netID].edges[i].len >\n        0) // only route the non-degraded edges (len>0)\n    {\n\n      treeedge = &(treeedges[i]);\n\n      n1 = treeedge->n1;\n      n2 = treeedge->n2;\n      x1 = treenodes[n1].x;\n      y1 = treenodes[n1].y;\n      x2 = treenodes[n2].x;\n      y2 = treenodes[n2].y;\n\n      if (y1 < y2) {\n        ymin = y1;\n        ymax = y2;\n      } else {\n        ymin = y2;\n        ymax = y1;\n      }\n\n      // ripup the original routing\n      if (ripuptype > NOROUTE) // it's been routed\n        newRipup(treeedge, x1, y1, x2, y2);\n\n      treeedge->route.type = LROUTE;\n      if (x1 == x2) // V-routing\n      {\n        for (j = ymin; j < ymax; j++)\n          v_edges[j * xGrid + x1].est_usage++;\n        treeedge->route.xFirst = FALSE;\n        if (treenodes[n1].status % 2 == 0) {\n          treenodes[n1].status += 1;\n        }\n        if (treenodes[n2].status % 2 == 0) {\n          treenodes[n2].status += 1;\n        }\n      } else if (y1 == y2) // H-routing\n      {\n        for (j = x1; j < x2; j++)\n          h_edges[y1 * (xGrid - 1) + j].est_usage++;\n        treeedge->route.xFirst = TRUE;\n        if (treenodes[n2].status < 2) {\n          treenodes[n2].status += 2;\n        }\n        if (treenodes[n1].status < 2) {\n          treenodes[n1].status += 2;\n        }\n      } else // L-routing\n      {\n\n        if (viaGuided) {\n\n          if (treenodes[n1].status == 0 || treenodes[n1].status == 3) {\n            costL1 = costL2 = 0;\n          } else if (treenodes[n1].status == 2) {\n            costL1 = viacost;\n            costL2 = 0;\n          } else if (treenodes[n1].status == 1) {\n\n            costL1 = 0;\n            costL2 = viacost;\n          } else {\n            printf(\"wrong node status %d\", treenodes[n1].status);\n          }\n          if (treenodes[n2].status == 2) {\n            
costL2 += viacost;\n          } else if (treenodes[n2].status == 1) {\n            costL1 += viacost;\n          }\n        } else {\n          costL1 = costL2 = 0;\n        }\n\n        for (j = ymin; j < ymax; j++) {\n          grid = j * xGrid;\n          tmp  = v_edges[grid + x1].est_usage - vCapacity_lb +\n                v_edges[grid + x1].red;\n          if (tmp > 0)\n            costL1 += tmp;\n          tmp = v_edges[grid + x2].est_usage - vCapacity_lb +\n                v_edges[grid + x2].red;\n          if (tmp > 0)\n            costL2 += tmp;\n        }\n        grid  = y2 * (xGrid - 1);\n        grid1 = y1 * (xGrid - 1);\n        for (j = x1; j < x2; j++) {\n          tmp = h_edges[grid + j].est_usage - hCapacity_lb +\n                h_edges[grid + j].red;\n          if (tmp > 0)\n            costL1 += tmp;\n          tmp = h_edges[grid1 + j].est_usage - hCapacity_lb +\n                h_edges[grid1 + j].red;\n          if (tmp > 0)\n            costL2 += tmp;\n        }\n\n        if (costL1 < costL2) {\n          if (treenodes[n1].status % 2 == 0) {\n            treenodes[n1].status += 1;\n          }\n          if (treenodes[n2].status < 2) {\n            treenodes[n2].status += 2;\n          }\n\n          // two parts (x1, y1)-(x1, y2) and (x1, y2)-(x2, y2)\n          for (j = ymin; j < ymax; j++) {\n            v_edges[j * xGrid + x1].est_usage += 1;\n          }\n          grid = y2 * (xGrid - 1);\n          for (j = x1; j < x2; j++) {\n            h_edges[grid + j].est_usage += 1;\n          }\n          treeedge->route.xFirst = FALSE;\n        } // if costL1<costL2\n        else {\n          if (treenodes[n2].status % 2 == 0) {\n            treenodes[n2].status += 1;\n          }\n          if (treenodes[n1].status < 2) {\n            treenodes[n1].status += 2;\n          }\n\n          // two parts (x1, y1)-(x2, y1) and (x2, y1)-(x2, y2)\n          grid = y1 * (xGrid - 1);\n          for (j = x1; j < x2; j++) {\n            h_edges[grid + 
j].est_usage += 1;\n          }\n          for (j = ymin; j < ymax; j++) {\n            v_edges[j * xGrid + x2].est_usage += 1;\n          }\n          treeedge->route.xFirst = TRUE;\n        }\n\n      } // else L-routing\n    }   // if non-degraded edge\n    else\n      sttrees[netID].edges[i].route.type = NOROUTE;\n  } // loop i\n}\n\n// route all segments with L, firstTime: TRUE, first newrouteLAll, FALSE - not\n// first\nvoid newrouteLAll(Bool firstTime, Bool viaGuided) {\n  int i;\n\n  if (firstTime) {\n    for (i = 0; i < numValidNets; i++) {\n      newrouteL(i, NOROUTE, viaGuided); // do L-routing\n    }\n  } else {\n    for (i = 0; i < numValidNets; i++) {\n      newrouteL(i, LROUTE, viaGuided);\n    }\n  }\n}\n\nvoid newrouteZ_edge(int netID, int edgeID) {\n  int i, j, n1, n2, x1, y1, x2, y2, segWidth, bestZ, grid, grid1, grid2, ymin,\n      ymax;\n  float tmp, bestcost, btTEST;\n  Bool HVH; // the shape of Z routing (TRUE - HVH, FALSE - VHV)\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  if (sttrees[netID].edges[edgeID].len >\n      0) // only route the non-degraded edges (len>0)\n  {\n    treeedges = sttrees[netID].edges;\n    treeedge  = &(treeedges[edgeID]);\n    treenodes = sttrees[netID].nodes;\n    n1        = treeedge->n1;\n    n2        = treeedge->n2;\n    x1        = treenodes[n1].x;\n    y1        = treenodes[n1].y;\n    x2        = treenodes[n2].x;\n    y2        = treenodes[n2].y;\n\n    if (x1 != x2 || y1 != y2) // not H or V edge, do Z-routing (if H or V edge,\n                              // no need to reroute)\n    {\n      // ripup the original routing\n      newRipup(treeedge, x1, y1, x2, y2);\n\n      treeedge->route.type = ZROUTE;\n\n      segWidth = x2 - x1;\n      if (y1 < y2) {\n        ymin = y1;\n        ymax = y2;\n      } else {\n        ymin = y2;\n        ymax = y1;\n      }\n\n      // compute the cost for all Z routing\n\n      for (i = 0; i <= segWidth; i++) {\n        costHVH[i] = 0;\n        costV[i]   
= 0;\n        costTB[i]  = 0;\n\n        costHVHtest[i] = 0;\n        costVtest[i]   = 0;\n        costTBtest[i]  = 0;\n      }\n\n      // compute the cost for all H-segs and V-segs and partial boundary seg\n      // cost for V-segs\n      for (i = x1; i <= x2; i++) {\n        grid = ymin * xGrid;\n        for (j = ymin; j < ymax; j++) {\n          tmp = v_edges[grid + i].est_usage - vCapacity_lb +\n                v_edges[grid + i].red;\n          grid += xGrid;\n          if (tmp > 0) {\n            costV[i - x1] += tmp;\n            costVtest[i - x1] += HCOST;\n            ;\n          } else {\n            costVtest[i - x1] += tmp;\n          }\n        }\n      }\n      // cost for Top&Bot boundary segs (form Z with V-seg)\n      grid = y2 * (xGrid - 1);\n      for (j = x1; j <= x2; j++) {\n        tmp =\n            h_edges[grid + j].est_usage - hCapacity_lb + h_edges[grid + j].red;\n        if (tmp > 0) {\n          costTB[0] += tmp;\n          costTBtest[0] += HCOST;\n        } else {\n          costTBtest[0] += tmp;\n        }\n      }\n      grid1 = y1 * (xGrid - 1) + x1;\n      grid2 = y2 * (xGrid - 1) + x1;\n      for (i = 1; i <= segWidth; i++) {\n        costTB[i] = costTB[i - 1];\n        tmp       = h_edges[grid1 + i - 1].est_usage - hCapacity_lb +\n              h_edges[grid1 + i - 1].red;\n        if (tmp > 0) {\n          costTB[i] += tmp;\n          costTBtest[i] += HCOST;\n        } else {\n          costTBtest[i] += tmp;\n        }\n        tmp = h_edges[grid2 + i - 1].est_usage - hCapacity_lb +\n              h_edges[grid2 + i - 1].red;\n        if (tmp > 0) {\n          costTB[i] -= tmp;\n          costTBtest[i] -= HCOST;\n        } else {\n          costTBtest[i] -= tmp;\n        }\n      }\n      // compute cost for all Z routing\n      HVH      = TRUE;\n      bestcost = BIG_INT;\n      btTEST   = BIG_INT;\n      bestZ    = 0;\n      for (i = 0; i <= segWidth; i++) {\n        costHVH[i]     = costV[i] + costTB[i];\n        costHVHtest[i] 
= costVtest[i] + costTBtest[i];\n        if (costHVH[i] < bestcost) {\n          bestcost = costHVH[i];\n          btTEST   = costHVHtest[i];\n          bestZ    = i + x1;\n        } else if (costHVH[i] == bestcost) {\n          if (costHVHtest[i] < btTEST) {\n            btTEST = costHVHtest[i];\n            bestZ  = i + x1;\n          }\n        }\n      }\n\n      if (HVH) {\n        grid = y1 * (xGrid - 1);\n        for (i = x1; i < bestZ; i++) {\n          h_edges[grid + i].est_usage += 1;\n        }\n        grid = y2 * (xGrid - 1);\n        for (i = bestZ; i < x2; i++) {\n          h_edges[grid + i].est_usage += 1;\n        }\n        grid = ymin * xGrid;\n        for (i = ymin; i < ymax; i++) {\n          v_edges[grid + bestZ].est_usage += 1;\n          grid += xGrid;\n        }\n        treeedge->route.HVH    = HVH;\n        treeedge->route.Zpoint = bestZ;\n      } else {\n        printf(\"warning, in the maze edge, not HVH results is produced\");\n      }\n    } // else Z route\n\n  } // if non-degraded edge\n}\n\n// Z-route, rip-up the previous route according to the ripuptype\nvoid newrouteZ(int netID, int threshold) {\n  int ind, i, j, d, n1, n2, x1, y1, x2, y2, segWidth, segHeight, bestZ, grid,\n      grid1, grid2, ymin, ymax, n1a, n2a, status1, status2;\n  float tmp, bestcost, btTEST;\n  Bool HVH;       // the shape of Z routing (TRUE - HVH, FALSE - VHV)\n  Bool y1Smaller; // TRUE - y1<y2, FALSE y1>y2\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  d = sttrees[netID].deg;\n\n  treeedges = sttrees[netID].edges;\n  treenodes = sttrees[netID].nodes;\n\n  // loop for all the tree edges (2*d-3)\n\n  for (ind = 0; ind < 2 * d - 3; ind++) {\n    treeedge = &(treeedges[ind]);\n\n    n1 = treeedge->n1;\n    n2 = treeedge->n2;\n    x1 = treenodes[n1].x;\n    y1 = treenodes[n1].y;\n    x2 = treenodes[n2].x;\n    y2 = treenodes[n2].y;\n\n    if (sttrees[netID].edges[ind].len >\n        threshold) // only route the edges with len>5\n    {\n\n      
if (y1 < y2) {\n        ymin = y1;\n        ymax = y2;\n      } else {\n        ymin = y2;\n        ymax = y1;\n      }\n\n      if (x1 != x2 && y1 != y2) // not H or V edge, do Z-routing\n      {\n        // ripup the original routing\n        if (newRipupType2(treeedge, treenodes, x1, y1, x2, y2, d)) {\n\n          n1a     = treenodes[n1].stackAlias;\n          n2a     = treenodes[n2].stackAlias;\n          status1 = treenodes[n1a].status;\n          status2 = treenodes[n2a].status;\n\n          treeedge->route.type = ZROUTE;\n\n          segWidth = x2 - x1;\n          if (y1 < y2) {\n            ymin      = y1;\n            ymax      = y2;\n            y1Smaller = TRUE;\n          } else {\n            ymin      = y2;\n            ymax      = y1;\n            y1Smaller = FALSE;\n          }\n          segHeight = ymax - ymin;\n\n          // compute the cost for all Z routing\n\n          if (status1 == 0 || status1 == 3) {\n            for (i = 0; i < segWidth; i++) {\n              costHVH[i]     = 0;\n              costHVHtest[i] = 0;\n            }\n            for (i = 0; i < segHeight; i++) {\n              costVHV[i] = 0;\n            }\n          } else if (status1 == 2) {\n            for (i = 0; i < segWidth; i++) {\n              costHVH[i]     = 0;\n              costHVHtest[i] = 0;\n            }\n            for (i = 0; i < segHeight; i++) {\n              costVHV[i] = viacost;\n            }\n          } else {\n\n            for (i = 0; i < segWidth; i++) {\n              costHVH[i]     = viacost;\n              costHVHtest[i] = viacost;\n            }\n            for (i = 0; i < segHeight; i++) {\n              costVHV[i] = 0;\n            }\n          }\n\n          if (status2 == 2) {\n            for (i = 0; i < segHeight; i++) {\n              costVHV[i] += viacost;\n            }\n\n          } else if (status2 == 1) {\n            for (i = 0; i < segWidth; i++) {\n              costHVH[i] += viacost;\n              costHVHtest[i] += 
viacost;\n            }\n          }\n\n          for (i = 0; i < segWidth; i++) {\n            costV[i]  = 0;\n            costTB[i] = 0;\n\n            costVtest[i]  = 0;\n            costTBtest[i] = 0;\n          }\n          for (i = 0; i < segHeight; i++) {\n            costH[i]  = 0;\n            costLR[i] = 0;\n          }\n\n          // compute the cost for all H-segs and V-segs and partial boundary seg\n          // cost for V-segs\n          for (i = x1; i < x2; i++) {\n            grid = ymin * xGrid;\n            for (j = ymin; j < ymax; j++) {\n              tmp = v_edges[grid + i].est_usage - vCapacity_lb +\n                    v_edges[grid + i].red;\n              grid += xGrid;\n              if (tmp > 0) {\n                costV[i - x1] += tmp;\n                costVtest[i - x1] += HCOST;\n              } else {\n                costVtest[i - x1] += tmp;\n              }\n            }\n          }\n          // cost for Top&Bot boundary segs (form Z with V-seg)\n          grid = y2 * (xGrid - 1);\n          for (j = x1; j < x2; j++) {\n            tmp = h_edges[grid + j].est_usage - hCapacity_lb +\n                  h_edges[grid + j].red;\n            if (tmp > 0) {\n              costTB[0] += tmp;\n              costTBtest[0] += HCOST;\n            } else {\n              costTBtest[0] += tmp;\n            }\n          }\n          grid1 = y1 * (xGrid - 1) + x1;\n          grid2 = y2 * (xGrid - 1) + x1;\n          for (i = 1; i < segWidth; i++) {\n            costTB[i] = costTB[i - 1];\n            tmp       = h_edges[grid1 + i - 1].est_usage - hCapacity_lb +\n                  h_edges[grid1 + i - 1].red;\n            if (tmp > 0) {\n              costTB[i] += tmp;\n              costTBtest[0] += HCOST;\n            } else {\n              costTBtest[0] += tmp;\n            }\n            tmp = h_edges[grid2 + i - 1].est_usage - hCapacity_lb +\n                  h_edges[grid2 + i - 1].red;\n            if (tmp > 0) {\n              costTB[i] -= 
tmp;\n              costTBtest[0] -= HCOST;\n            } else {\n              costTBtest[0] -= tmp;\n            }\n          }\n          // cost for H-segs\n          grid = ymin * (xGrid - 1);\n          for (i = ymin; i < ymax; i++) {\n            for (j = x1; j < x2; j++) {\n              tmp = h_edges[grid + j].est_usage - hCapacity_lb +\n                    h_edges[grid + j].red;\n              if (tmp > 0)\n                costH[i - ymin] += tmp;\n            }\n            grid += xGrid - 1;\n          }\n          // cost for Left&Right boundary segs (form Z with H-seg)\n          if (y1Smaller) {\n            for (j = y1; j < y2; j++) {\n              tmp = v_edges[j * xGrid + x2].est_usage - vCapacity_lb +\n                    v_edges[j * xGrid + x2].red;\n              if (tmp > 0)\n                costLR[0] += tmp;\n            }\n            for (i = 1; i < segHeight; i++) {\n              costLR[i] = costLR[i - 1];\n              grid      = (y1 + i - 1) * xGrid;\n              tmp       = v_edges[grid + x1].est_usage - vCapacity_lb +\n                    v_edges[grid + x1].red;\n              if (tmp > 0)\n                costLR[i] += tmp;\n              tmp = v_edges[grid + x2].est_usage - vCapacity_lb +\n                    v_edges[grid + x2].red;\n              if (tmp > 0)\n                costLR[i] -= tmp;\n            }\n          } else {\n            for (j = y2; j < y1; j++) {\n              tmp = v_edges[j * xGrid + x1].est_usage - vCapacity_lb;\n              if (tmp > 0)\n                costLR[0] += tmp;\n            }\n            for (i = 1; i < segHeight; i++) {\n              costLR[i] = costLR[i - 1];\n              grid      = (y2 + i - 1) * xGrid;\n              tmp       = v_edges[grid + x2].est_usage - vCapacity_lb +\n                    v_edges[grid + x2].red;\n              if (tmp > 0)\n                costLR[i] += tmp;\n              tmp = v_edges[grid + x1].est_usage - vCapacity_lb +\n                    v_edges[grid + 
x1].red;\n              if (tmp > 0)\n                costLR[i] -= tmp;\n            }\n          }\n\n          // compute cost for all Z routing\n          HVH      = TRUE;\n          bestcost = BIG_INT;\n          btTEST   = BIG_INT;\n          bestZ    = 0;\n          for (i = 0; i < segWidth; i++) {\n            costHVH[i] += costV[i] + costTB[i];\n            if (costHVH[i] < bestcost) {\n              bestcost = costHVH[i];\n              btTEST   = costHVHtest[i];\n              bestZ    = i + x1;\n            } else if (costHVH[i] == bestcost) {\n              if (costHVHtest[i] < btTEST) {\n                btTEST = costHVHtest[i];\n                bestZ  = i + x1;\n              }\n            }\n          }\n          for (i = 0; i < segHeight; i++) {\n            costVHV[i] += costH[i] + costLR[i];\n            if (costVHV[i] < bestcost) {\n              bestcost = costVHV[i];\n              bestZ    = i + ymin;\n              HVH      = FALSE;\n            }\n          }\n\n          if (HVH) {\n            if (treenodes[n1a].status < 2) {\n              treenodes[n1a].status += 2;\n            }\n            if (treenodes[n2a].status < 2) {\n              treenodes[n2a].status += 2;\n            }\n\n            treenodes[n1a].hID++;\n            treenodes[n2a].hID++;\n\n            grid = y1 * (xGrid - 1);\n            for (i = x1; i < bestZ; i++) {\n              h_edges[grid + i].est_usage += 1;\n            }\n            grid = y2 * (xGrid - 1);\n            for (i = bestZ; i < x2; i++) {\n              h_edges[grid + i].est_usage += 1;\n            }\n            grid = ymin * xGrid;\n            for (i = ymin; i < ymax; i++) {\n              v_edges[grid + bestZ].est_usage += 1;\n              grid += xGrid;\n            }\n            treeedge->route.HVH    = HVH;\n            treeedge->route.Zpoint = bestZ;\n          } else {\n            if (treenodes[n2a].status % 2 == 0) {\n              treenodes[n2a].status += 1;\n            }\n        
    if (treenodes[n1a].status % 2 == 0) {\n              treenodes[n1a].status += 1;\n            }\n\n            treenodes[n1a].lID++;\n            treenodes[n2a].lID++;\n            if (y1Smaller) {\n              grid = y1 * xGrid;\n              for (i = y1; i < bestZ; i++) {\n                v_edges[grid + x1].est_usage += 1;\n                grid += xGrid;\n              }\n              grid = bestZ * xGrid;\n              for (i = bestZ; i < y2; i++) {\n                v_edges[grid + x2].est_usage += 1;\n                grid += xGrid;\n              }\n              grid = bestZ * (xGrid - 1);\n              for (i = x1; i < x2; i++) {\n                h_edges[grid + i].est_usage += 1;\n              }\n              treeedge->route.HVH    = HVH;\n              treeedge->route.Zpoint = bestZ;\n            } else {\n              grid = y2 * xGrid;\n              for (i = y2; i < bestZ; i++) {\n                v_edges[grid + x2].est_usage += 1;\n                grid += xGrid;\n              }\n              grid = bestZ * xGrid;\n              for (i = bestZ; i < y1; i++) {\n                v_edges[grid + x1].est_usage += 1;\n                grid += xGrid;\n              }\n              grid = bestZ * (xGrid - 1);\n              for (i = x1; i < x2; i++) {\n                h_edges[grid + i].est_usage += 1;\n              }\n              treeedge->route.HVH    = HVH;\n              treeedge->route.Zpoint = bestZ;\n            }\n          }\n        } else { // if ripuped by type 2\n          if (d == 2) {\n            newrouteZ_edge(netID, ind);\n          }\n        }\n      }\n\n    } else if (d == 2 && sttrees[netID].edges[ind].len > threshold) {\n      newrouteZ_edge(netID, ind);\n    } // if non-degraded edge\n      //        else\n      //            sttrees[netID].edges[ind].route.type = NOROUTE;\n  }   // loop ind\n}\n\n// ripup a tree edge according to its ripup type and Z-route it\n// route all segments with L, firstTime: TRUE, first 
newrouteLAll, FALSE - not\n// first\nvoid newrouteZAll(int threshold) {\n  int i;\n\n  for (i = 0; i < numValidNets; i++) {\n    newrouteZ(i, threshold); // ripup previous route and do Z-routing\n  }\n}\n\n// Ripup the original route and do Monotonic routing within bounding box\nvoid routeMonotonic(int netID, int edgeID, int threshold) {\n  int i, j, cnt, x, xl, yl, xr, yr, n1, n2, x1, y1, x2, y2, grid, xGrid_1,\n      ind_i, ind_j, ind_x;\n  int vedge, hedge, segWidth, segHeight, curX, curY;\n  int gridsX[XRANGE + YRANGE], gridsY[XRANGE + YRANGE];\n  float **cost, tmp;\n  Bool** parent; // remember the parent of a grid on the shortest path, TRUE -\n                 // same x, FALSE - same y\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  if (sttrees[netID].edges[edgeID].route.routelen >\n      threshold) // only route the non-degraded edges (len>0)\n  {\n    treeedges = sttrees[netID].edges;\n    treeedge  = &(treeedges[edgeID]);\n    treenodes = sttrees[netID].nodes;\n    n1        = treeedge->n1;\n    n2        = treeedge->n2;\n    x1        = treenodes[n1].x;\n    y1        = treenodes[n1].y;\n    x2        = treenodes[n2].x;\n    y2        = treenodes[n2].y;\n\n    if (x1 != x2 || y1 != y2) // not H or V edge, do Z-routing (if H or V edge,\n                              // no need to reroute)\n    {\n      // ripup the original routing\n      newRipup(treeedge, x1, y1, x2, y2);\n\n      segWidth  = ADIFF(x1, x2);\n      segHeight = ADIFF(y1, y2);\n      if (x1 <= x2) {\n        xl = x1;\n        yl = y1;\n        xr = x2;\n        yr = y2;\n      } else {\n        xl = x2;\n        yl = y2;\n        xr = x1;\n        yr = y1;\n      }\n\n      // find the best monotonic path from (x1, y1) to (x2, y2)\n      cost   = (float**)malloc((segHeight + 1) * sizeof(float*));\n      parent = (Bool**)malloc((segHeight + 1) * sizeof(Bool*));\n      for (i = 0; i <= segHeight; i++) {\n        cost[i]   = (float*)malloc((segWidth + 1) * sizeof(float));\n     
   parent[i] = (Bool*)malloc((segWidth + 1) * sizeof(Bool));\n      }\n\n      xGrid_1 = xGrid - 1; // tmp variable to save runtime\n      if (yl <= yr) {\n        // initialize first column\n        cost[0][0] = 0;\n        grid       = yl * xGrid;\n        for (j = 0; j < segHeight; j++) {\n          cost[j + 1][0] =\n              cost[j][0] +\n              max((float)0, v_edges[grid + xl].red +\n                                v_edges[grid + xl].est_usage - vCapacity_lb);\n          parent[j + 1][0] = SAMEX;\n          grid += xGrid;\n        }\n        // update other columns\n        for (i = 0; i < segWidth; i++) {\n          x = xl + i;\n          // update the cost of a column of grids by h-edges\n          grid = yl * xGrid_1;\n          for (j = 0; j <= segHeight; j++) {\n            tmp              = max((float)0, h_edges[grid + x].red +\n                                    h_edges[grid + x].est_usage - hCapacity_lb);\n            cost[j][i + 1]   = cost[j][i] + tmp;\n            parent[j][i + 1] = SAMEY;\n            grid += xGrid - 1;\n          }\n          // update the cost of a column of grids by v-edges\n          grid  = yl * xGrid;\n          ind_x = x + 1;\n          ind_i = i + 1;\n          for (j = 0; j < segHeight; j++) {\n            ind_j = j + 1;\n            tmp   = cost[j][ind_i] +\n                  max((float)0, v_edges[grid + ind_x].red +\n                                    v_edges[grid + ind_x].est_usage -\n                                    vCapacity_lb);\n            if (cost[ind_j][ind_i] > tmp) {\n              cost[ind_j][ind_i]   = tmp;\n              parent[ind_j][ind_i] = SAMEX;\n            }\n            grid += xGrid;\n          }\n        }\n\n        // store the shortest path and update the usage\n        curX = xr;\n        curY = yr;\n        cnt  = 0;\n\n        while (curX != xl || curY != yl) {\n          // printf(\"xl is %d, yl is %d, curX is %d, curY is\n          // %d\\n\",xl,yl,curX,curY); 
printf(\"%d\\n\", sttrees[netID].deg);\n          gridsX[cnt] = curX;\n          gridsY[cnt] = curY;\n          cnt++;\n          if (parent[curY - yl][curX - xl] == SAMEX) {\n            curY--;\n            vedge = curY * xGrid + curX;\n            v_edges[vedge].est_usage += 1;\n          } else {\n            curX--;\n            hedge = curY * (xGrid - 1) + curX;\n            h_edges[hedge].est_usage += 1;\n          }\n        }\n\n        gridsX[cnt] = xl;\n        gridsY[cnt] = yl;\n        cnt++;\n\n      } // yl<=yr\n\n      else // yl>yr\n      {\n        // initialize first column\n        cost[segHeight][0] = 0;\n        grid               = (yl - 1) * xGrid;\n        for (j = segHeight - 1; j >= 0; j--) {\n          cost[j][0] =\n              cost[j + 1][0] +\n              max((float)0, v_edges[grid + xl].red +\n                                v_edges[grid + xl].est_usage - vCapacity_lb);\n          parent[j][0] = SAMEX;\n          grid -= xGrid;\n        }\n        // update other columns\n        for (i = 0; i < segWidth; i++) {\n          x = xl + i;\n          // update the cost of a column of grids by h-edges\n          grid  = yl * (xGrid - 1);\n          ind_i = i + 1;\n          for (j = segHeight; j >= 0; j--) {\n            tmp              = max((float)0, h_edges[grid + x].red +\n                                    h_edges[grid + x].est_usage - hCapacity_lb);\n            cost[j][ind_i]   = cost[j][i] + tmp;\n            parent[j][ind_i] = SAMEY;\n            grid -= xGrid - 1;\n          }\n          // update the cost of a column of grids by v-edges\n          grid  = (yl - 1) * xGrid;\n          ind_x = x + 1;\n          for (j = segHeight - 1; j >= 0; j--) {\n            tmp = cost[j + 1][ind_i] +\n                  max((float)0, v_edges[grid + ind_x].red +\n                                    v_edges[grid + ind_x].est_usage -\n                                    vCapacity_lb);\n            if (cost[j][ind_i] > tmp) {\n              
cost[j][ind_i]   = tmp;\n              parent[j][ind_i] = SAMEX;\n            }\n            grid -= xGrid;\n          }\n        }\n\n        // store the shortest path and update the usage\n        curX = xr;\n        curY = yr;\n        cnt  = 0;\n        while (curX != xl || curY != yl) {\n          gridsX[cnt] = curX;\n          gridsY[cnt] = curY;\n          cnt++;\n          if (parent[curY - yr][curX - xl] == SAMEX) {\n            vedge = curY * xGrid + curX;\n            v_edges[vedge].est_usage += 1;\n            curY++;\n          } else {\n            curX--;\n            hedge = curY * (xGrid - 1) + curX;\n            h_edges[hedge].est_usage += 1;\n          }\n        }\n        gridsX[cnt] = xl;\n        gridsY[cnt] = yl;\n        cnt++;\n\n      } // yl>yr\n      treeedge->route.routelen = cnt - 1;\n\n      treeedge->route.gridsX =\n          (short*)realloc(treeedge->route.gridsX, cnt * sizeof(short));\n      treeedge->route.gridsY =\n          (short*)realloc(treeedge->route.gridsY, cnt * sizeof(short));\n      if (x1 != gridsX[0] ||\n          y1 != gridsY[0]) // gridsX[] and gridsY[] store the path from n2 to n1\n      {\n        cnt = 0;\n        for (i = treeedge->route.routelen; i >= 0; i--) {\n          treeedge->route.gridsX[cnt] = gridsX[i];\n          treeedge->route.gridsY[cnt] = gridsY[i];\n          cnt++;\n        }\n      } else // gridsX[] and gridsY[] store the path from n1 to n2\n      {\n        for (i = 0; i <= treeedge->route.routelen; i++) {\n          treeedge->route.gridsX[i] = gridsX[i];\n          treeedge->route.gridsY[i] = gridsY[i];\n        }\n      }\n\n      for (i = 0; i <= segHeight; i++) {\n        free(cost[i]);\n        free(parent[i]);\n      }\n      free(cost);\n      free(parent);\n\n    } // if(x1!=x2 || y1!=y2)\n  }   // non-degraded edge\n}\n\nvoid routeMonotonicAll(int threshold) {\n  int netID, edgeID;\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    for (edgeID = 0; edgeID < 
sttrees[netID].deg * 2 - 3; edgeID++) {\n      routeMonotonic(\n          netID, edgeID,\n          threshold); // ripup previous route and do Monotonic routing\n    }\n  }\n  printf(\"MonotonicAll OK\\n\");\n}\n\nvoid spiralRoute(int netID, int edgeID) {\n  int j, n1, n2, x1, y1, x2, y2, grid, grid1, n1a, n2a;\n  float costL1 = 0, costL2 = 0, tmp;\n  int ymin, ymax;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  treeedges = sttrees[netID].edges;\n  treenodes = sttrees[netID].nodes;\n\n  // loop for all the tree edges (2*d-3)\n\n  treeedge = &(treeedges[edgeID]);\n  if (treeedge->len > 0) // only route the non-degraded edges (len>0)\n  {\n\n    n1 = treeedge->n1;\n    n2 = treeedge->n2;\n    x1 = treenodes[n1].x;\n    y1 = treenodes[n1].y;\n    x2 = treenodes[n2].x;\n    y2 = treenodes[n2].y;\n\n    n1a = treenodes[n1].stackAlias;\n    n2a = treenodes[n2].stackAlias;\n\n    if (y1 < y2) {\n      ymin = y1;\n      ymax = y2;\n    } else {\n      ymin = y2;\n      ymax = y1;\n    }\n\n    // ripup the original routing\n\n    treeedge->route.type = LROUTE;\n    if (x1 == x2) // V-routing\n    {\n      for (j = ymin; j < ymax; j++)\n        v_edges[j * xGrid + x1].est_usage++;\n      treeedge->route.xFirst = FALSE;\n      if (treenodes[n1].status % 2 == 0) {\n        treenodes[n1].status += 1;\n      }\n      if (treenodes[n2].status % 2 == 0) {\n        treenodes[n2].status += 1;\n      }\n\n      if (treenodes[n1a].status % 2 == 0) {\n        treenodes[n1a].status += 1;\n      }\n      if (treenodes[n2a].status % 2 == 0) {\n        treenodes[n2a].status += 1;\n      }\n    } else if (y1 == y2) // H-routing\n    {\n      for (j = x1; j < x2; j++)\n        h_edges[y1 * (xGrid - 1) + j].est_usage++;\n      treeedge->route.xFirst = TRUE;\n      if (treenodes[n2].status < 2) {\n        treenodes[n2].status += 2;\n      }\n      if (treenodes[n1].status < 2) {\n        treenodes[n1].status += 2;\n      }\n\n      if (treenodes[n2a].status < 2) {\n        
treenodes[n2a].status += 2;\n      }\n      if (treenodes[n1a].status < 2) {\n        treenodes[n1a].status += 2;\n      }\n    } else // L-routing\n    {\n\n      if (treenodes[n1].status == 0 || treenodes[n1].status == 3) {\n        costL1 = costL2 = 0;\n      } else if (treenodes[n1].status == 2) {\n        costL1 = viacost;\n        costL2 = 0;\n      } else if (treenodes[n1].status == 1) {\n\n        costL1 = 0;\n        costL2 = viacost;\n      } else {\n        printf(\"wrong node status %d\", treenodes[n1].status);\n      }\n      if (treenodes[n2].status == 2) {\n        costL2 += viacost;\n      } else if (treenodes[n2].status == 1) {\n        costL1 += viacost;\n      }\n\n      for (j = ymin; j < ymax; j++) {\n        grid = j * xGrid;\n        tmp  = v_edges[grid + x1].est_usage - vCapacity_lb +\n              v_edges[grid + x1].red;\n        if (tmp > 0)\n          costL1 += tmp;\n        tmp = v_edges[grid + x2].est_usage - vCapacity_lb +\n              v_edges[grid + x2].red;\n        if (tmp > 0)\n          costL2 += tmp;\n        // costL1 += simpleCost (v_edges[grid+x1].est_usage,\n        // v_edges[grid+x1].cap); costL2 += simpleCost (\n        // v_edges[grid+x2].est_usage,  v_edges[grid+x2].cap);\n      }\n      grid  = y2 * (xGrid - 1);\n      grid1 = y1 * (xGrid - 1);\n      for (j = x1; j < x2; j++) {\n        tmp =\n            h_edges[grid + j].est_usage - hCapacity_lb + h_edges[grid + j].red;\n        if (tmp > 0)\n          costL1 += tmp;\n        tmp = h_edges[grid1 + j].est_usage - hCapacity_lb +\n              h_edges[grid1 + j].red;\n        if (tmp > 0)\n          costL2 += tmp;\n        // costL1 += simpleCost (h_edges[grid+j].est_usage,\n        // h_edges[grid+j].cap); costL2 += simpleCost\n        // (h_edges[grid1+j].est_usage, h_edges[grid1+j].cap);\n      }\n\n      if (costL1 < costL2) {\n        if (treenodes[n1].status % 2 == 0) {\n          treenodes[n1].status += 1;\n        }\n        if (treenodes[n2].status < 2) {\n 
         treenodes[n2].status += 2;\n        }\n\n        if (treenodes[n1a].status % 2 == 0) {\n          treenodes[n1a].status += 1;\n        }\n        if (treenodes[n2a].status < 2) {\n          treenodes[n2a].status += 2;\n        }\n        treenodes[n2a].hID++;\n        treenodes[n1a].lID++;\n\n        // two parts (x1, y1)-(x1, y2) and (x1, y2)-(x2, y2)\n        for (j = ymin; j < ymax; j++) {\n          v_edges[j * xGrid + x1].est_usage += 1;\n        }\n        grid = y2 * (xGrid - 1);\n        for (j = x1; j < x2; j++) {\n          h_edges[grid + j].est_usage += 1;\n        }\n        treeedge->route.xFirst = FALSE;\n      } // if costL1<costL2\n      else {\n        if (treenodes[n2].status % 2 == 0) {\n          treenodes[n2].status += 1;\n        }\n        if (treenodes[n1].status < 2) {\n          treenodes[n1].status += 2;\n        }\n\n        if (treenodes[n2a].status % 2 == 0) {\n          treenodes[n2a].status += 1;\n        }\n\n        if (treenodes[n1a].status < 2) {\n          treenodes[n1a].status += 2;\n        }\n        treenodes[n1a].hID++;\n        treenodes[n2a].lID++;\n\n        // two parts (x1, y1)-(x2, y1) and (x2, y1)-(x2, y2)\n        grid = y1 * (xGrid - 1);\n        for (j = x1; j < x2; j++) {\n          h_edges[grid + j].est_usage += 1;\n        }\n        for (j = ymin; j < ymax; j++) {\n          v_edges[j * xGrid + x2].est_usage += 1;\n        }\n        treeedge->route.xFirst = TRUE;\n      }\n\n    } // else L-routing\n  }   // if non-degraded edge\n  else\n    sttrees[netID].edges[edgeID].route.type = NOROUTE;\n}\n\nvoid spiralRouteAll() {\n  int netID, d, k, edgeID, nodeID, deg, numpoints, n1, n2;\n  int na;\n  Bool redundant;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n  int quehead, quetail;\n  int edgeQueue[5000];\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    deg       = sttrees[netID].deg;\n\n    numpoints = 
0;\n\n    for (d = 0; d < 2 * deg - 2; d++) {\n      treenodes[d].topL = -1;\n      treenodes[d].botL = MAXLAYER;\n      // treenodes[d].l = 0;\n      treenodes[d].assigned   = FALSE;\n      treenodes[d].stackAlias = d;\n      treenodes[d].conCNT     = 0;\n      treenodes[d].hID        = 0;\n      treenodes[d].lID        = 0;\n      treenodes[d].status     = 0;\n\n      if (d < deg) {\n        treenodes[d].botL = treenodes[d].topL = 0;\n        // treenodes[d].l = 0;\n        treenodes[d].assigned = TRUE;\n        treenodes[d].status   = 2;\n\n        xcor[numpoints] = treenodes[d].x;\n        ycor[numpoints] = treenodes[d].y;\n        dcor[numpoints] = d;\n        numpoints++;\n      } else {\n        redundant = FALSE;\n        for (k = 0; k < numpoints; k++) {\n          if ((treenodes[d].x == xcor[k]) && (treenodes[d].y == ycor[k])) {\n            treenodes[d].stackAlias = dcor[k];\n            redundant               = TRUE;\n            break;\n          }\n        }\n        if (!redundant) {\n          xcor[numpoints] = treenodes[d].x;\n          ycor[numpoints] = treenodes[d].y;\n          dcor[numpoints] = d;\n          numpoints++;\n        }\n      }\n    }\n  }\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    deg       = sttrees[netID].deg;\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n      treeedge = &(treeedges[edgeID]);\n      if (treeedge->len > 0) {\n\n        n1 = treeedge->n1;\n        n2 = treeedge->n2;\n\n        treeedge->n1a = treenodes[n1].stackAlias;\n        treenodes[treeedge->n1a].eID[treenodes[treeedge->n1a].conCNT] = edgeID;\n        treenodes[treeedge->n1a].conCNT++;\n\n        treeedge->n2a = treenodes[n2].stackAlias;\n        treenodes[treeedge->n2a].eID[treenodes[treeedge->n2a].conCNT] = edgeID;\n        (treenodes[treeedge->n2a].conCNT)++;\n        treeedges[edgeID].assigned = FALSE;\n      } else {\n        
treeedges[edgeID].assigned = TRUE;\n      }\n    }\n  }\n\n  for (netID = 0; netID < numValidNets; netID++) {\n\n    newRipupNet(netID);\n\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    deg       = sttrees[netID].deg;\n    quehead = quetail = 0;\n\n    for (nodeID = 0; nodeID < deg; nodeID++) {\n      treenodes[nodeID].assigned = TRUE;\n      for (k = 0; k < treenodes[nodeID].conCNT; k++) {\n\n        edgeID = treenodes[nodeID].eID[k];\n\n        if (treeedges[edgeID].assigned == FALSE) {\n          edgeQueue[quetail]         = edgeID;\n          treeedges[edgeID].assigned = TRUE;\n          quetail++;\n        }\n      }\n    }\n\n    while (quehead != quetail) {\n      edgeID   = edgeQueue[quehead];\n      treeedge = &(treeedges[edgeID]);\n      if (treenodes[treeedge->n1a].assigned) {\n        spiralRoute(netID, edgeID);\n        treeedge->assigned = TRUE;\n        if (!treenodes[treeedge->n2a].assigned) {\n          for (k = 0; k < treenodes[treeedge->n2a].conCNT; k++) {\n            edgeID = treenodes[treeedge->n2a].eID[k];\n            if (!treeedges[edgeID].assigned) {\n              edgeQueue[quetail]         = edgeID;\n              treeedges[edgeID].assigned = TRUE;\n              quetail++;\n            }\n          }\n          treenodes[treeedge->n2a].assigned = TRUE;\n        }\n      } else {\n        spiralRoute(netID, edgeID);\n        treeedge->assigned = TRUE;\n        if (!treenodes[treeedge->n1a].assigned) {\n          for (k = 0; k < treenodes[treeedge->n1a].conCNT; k++) {\n            edgeID = treenodes[treeedge->n1a].eID[k];\n            if (!treeedges[edgeID].assigned) {\n              edgeQueue[quetail]         = edgeID;\n              treeedges[edgeID].assigned = TRUE;\n              quetail++;\n            }\n          }\n          treenodes[treeedge->n1a].assigned = TRUE;\n        }\n      }\n      quehead++;\n    }\n  }\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treenodes = 
sttrees[netID].nodes;\n    deg       = sttrees[netID].deg;\n\n    for (d = 0; d < 2 * deg - 2; d++) {\n      na = treenodes[d].stackAlias;\n\n      treenodes[d].status = treenodes[na].status;\n    }\n  }\n}\n\nvoid routeLVEnew(int netID, int edgeID, int threshold, int enlarge) {\n  int i, j, cnt, xmin, xmax, ymin, ymax, n1, n2, x1, y1, x2, y2, grid, xGrid_1,\n      deg, yminorig, ymaxorig;\n  int vedge, hedge, bestp1x = 0, bestp1y = 0;\n  int gridsX[XRANGE + YRANGE], gridsY[XRANGE + YRANGE];\n  float tmp1, tmp2, tmp3, tmp4, tmp, best;\n  Bool LH1, LH2, BL1 = false, BL2 = false;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  if (sttrees[netID].edges[edgeID].len >\n      threshold) // only route the non-degraded edges (len>0)\n  {\n    treeedges = sttrees[netID].edges;\n    treeedge  = &(treeedges[edgeID]);\n    treenodes = sttrees[netID].nodes;\n    n1        = treeedge->n1;\n    n2        = treeedge->n2;\n    x1        = treenodes[n1].x;\n    y1        = treenodes[n1].y;\n    x2        = treenodes[n2].x;\n    y2        = treenodes[n2].y;\n\n    // ripup the original routing\n    if (newRipupCheck(treeedge, threshold, netID, edgeID)) {\n\n      deg  = sttrees[netID].deg;\n      xmin = max(x1 - enlarge, 0);\n      xmax = min(xGrid - 1, x2 + enlarge);\n\n      if (y1 < y2) {\n        ymin     = max(y1 - enlarge, 0);\n        ymax     = min(yGrid - 1, y2 + enlarge);\n        yminorig = y1;\n        ymaxorig = y2;\n      } else {\n        ymin     = max(y2 - enlarge, 0);\n        ymax     = min(yGrid - 1, y1 + enlarge);\n        yminorig = y2;\n        ymaxorig = y1;\n      }\n\n      if (deg > 2) {\n        for (j = 0; j < deg * 2 - 2; j++) {\n          if (treenodes[j].x < x1) {\n            xmin = x1;\n          }\n          if (treenodes[j].x > x2) {\n            xmax = x2;\n          }\n          if (treenodes[j].y < yminorig) {\n            ymin = yminorig;\n          }\n          if (treenodes[j].y > ymaxorig) {\n            ymax = ymaxorig;\n    
      }\n        }\n      }\n\n      xGrid_1 = xGrid - 1; // tmp variable to save runtime\n\n      for (j = ymin; j <= ymax; j++) {\n        d1[j][xmin] = 0;\n      }\n      // update other columns\n      for (i = xmin; i <= xmax; i++) {\n        d2[ymin][i] = 0;\n      }\n\n      for (j = ymin; j <= ymax; j++) {\n        grid = j * xGrid_1 + xmin;\n        for (i = xmin; i < xmax; i++) {\n          tmp          = h_costTable[h_edges[grid].red + h_edges[grid].usage];\n          d1[j][i + 1] = d1[j][i] + tmp;\n          grid++;\n        }\n        // update the cost of a column of grids by v-edges\n      }\n\n      for (j = ymin; j < ymax; j++) {\n        // accumulate the v-edge costs of row j into row j + 1 of d2\n        grid = j * xGrid + xmin;\n        for (i = xmin; i <= xmax; i++) {\n          tmp          = h_costTable[v_edges[grid].red + v_edges[grid].usage];\n          d2[j + 1][i] = d2[j][i] + tmp;\n          grid++;\n        }\n        // update the cost of a column of grids by v-edges\n      }\n\n      best = BIG_INT;\n\n      for (j = ymin; j <= ymax; j++) {\n        for (i = xmin; i <= xmax; i++) {\n\n          tmp1 = ADIFF(d2[j][x1], d2[y1][x1]) +\n                 ADIFF(d1[j][i], d1[j][x1]); // yfirst for point 1\n          tmp2 = ADIFF(d2[j][i], d2[y1][i]) + ADIFF(d1[y1][i], d1[y1][x1]);\n          tmp3 = ADIFF(d2[y2][i], d2[j][i]) + ADIFF(d1[y2][i], d1[y2][x2]);\n          tmp4 = ADIFF(d2[y2][x2], d2[j][x2]) +\n                 ADIFF(d1[j][x2], d1[j][i]); // xfirst for mid point\n\n          tmp = tmp1 + tmp4;\n          LH1 = FALSE;\n          LH2 = TRUE;\n\n          if (tmp2 + tmp3 < tmp) {\n            tmp = tmp2 + tmp3;\n            LH1 = TRUE;\n            LH2 = FALSE;\n          }\n\n          if (tmp1 + tmp3 + viacost < tmp) {\n            LH1 = FALSE;\n            LH2 = FALSE;\n            tmp = tmp1 + tmp3 + viacost;\n          }\n\n          if (tmp2 + tmp4 + viacost < tmp) {\n            LH1 = TRUE;\n            LH2 = TRUE;\n            tmp = 
tmp2 + tmp4 + viacost;\n          }\n\n          if (tmp < best) {\n            bestp1x = i;\n            bestp1y = j;\n            BL1     = LH1;\n            BL2     = LH2;\n            best    = tmp;\n          }\n        }\n      }\n      cnt = 0;\n\n      if (BL1) {\n        if (bestp1x > x1) {\n          for (i = x1; i < bestp1x; i++) {\n            gridsX[cnt] = i;\n            gridsY[cnt] = y1;\n            hedge       = y1 * xGrid_1 + i;\n            h_edges[hedge].usage += 1;\n            cnt++;\n          }\n        } else {\n          for (i = x1; i > bestp1x; i--) {\n            gridsX[cnt] = i;\n            gridsY[cnt] = y1;\n            hedge       = y1 * xGrid_1 + i - 1;\n            h_edges[hedge].usage += 1;\n            cnt++;\n          }\n        }\n        if (bestp1y > y1) {\n          for (i = y1; i < bestp1y; i++) {\n            gridsX[cnt] = bestp1x;\n            gridsY[cnt] = i;\n            cnt++;\n            vedge = i * xGrid + bestp1x;\n            v_edges[vedge].usage += 1;\n          }\n        } else {\n          for (i = y1; i > bestp1y; i--) {\n            gridsX[cnt] = bestp1x;\n            gridsY[cnt] = i;\n            cnt++;\n            vedge = (i - 1) * xGrid + bestp1x;\n            v_edges[vedge].usage += 1;\n          }\n        }\n      } else {\n        if (bestp1y > y1) {\n          for (i = y1; i < bestp1y; i++) {\n            gridsX[cnt] = x1;\n            gridsY[cnt] = i;\n            cnt++;\n            vedge = i * xGrid + x1;\n            v_edges[vedge].usage += 1;\n          }\n        } else {\n          for (i = y1; i > bestp1y; i--) {\n            gridsX[cnt] = x1;\n            gridsY[cnt] = i;\n            cnt++;\n            vedge = (i - 1) * xGrid + x1;\n            v_edges[vedge].usage += 1;\n          }\n        }\n        if (bestp1x > x1) {\n          for (i = x1; i < bestp1x; i++) {\n            gridsX[cnt] = i;\n            gridsY[cnt] = bestp1y;\n            hedge       = bestp1y * xGrid_1 + i;\n      
      h_edges[hedge].usage += 1;\n            cnt++;\n          }\n        } else {\n          for (i = x1; i > bestp1x; i--) {\n            gridsX[cnt] = i;\n            gridsY[cnt] = bestp1y;\n            hedge       = bestp1y * xGrid_1 + i - 1;\n            h_edges[hedge].usage += 1;\n            cnt++;\n          }\n        }\n      }\n\n      if (BL2) {\n        if (bestp1x < x2) {\n          for (i = bestp1x; i < x2; i++) {\n            gridsX[cnt] = i;\n            gridsY[cnt] = bestp1y;\n            hedge       = bestp1y * xGrid_1 + i;\n            h_edges[hedge].usage += 1;\n            cnt++;\n          }\n        } else {\n          for (i = bestp1x; i > x2; i--) {\n            gridsX[cnt] = i;\n            gridsY[cnt] = bestp1y;\n            hedge       = bestp1y * xGrid_1 + i - 1;\n            h_edges[hedge].usage += 1;\n            cnt++;\n          }\n        }\n\n        if (y2 > bestp1y) {\n          for (i = bestp1y; i < y2; i++) {\n            gridsX[cnt] = x2;\n            gridsY[cnt] = i;\n            cnt++;\n            vedge = i * xGrid + x2;\n            v_edges[vedge].usage += 1;\n          }\n        } else {\n          for (i = bestp1y; i > y2; i--) {\n            gridsX[cnt] = x2;\n            gridsY[cnt] = i;\n            cnt++;\n            vedge = (i - 1) * xGrid + x2;\n            v_edges[vedge].usage += 1;\n          }\n        }\n      } else {\n\n        if (y2 > bestp1y) {\n          for (i = bestp1y; i < y2; i++) {\n            gridsX[cnt] = bestp1x;\n            gridsY[cnt] = i;\n            cnt++;\n            vedge = i * xGrid + bestp1x;\n            v_edges[vedge].usage += 1;\n          }\n        } else {\n          for (i = bestp1y; i > y2; i--) {\n            gridsX[cnt] = bestp1x;\n            gridsY[cnt] = i;\n            cnt++;\n            vedge = (i - 1) * xGrid + bestp1x;\n            v_edges[vedge].usage += 1;\n          }\n        }\n        if (x2 > bestp1x) {\n          for (i = bestp1x; i < x2; i++) {\n         
   gridsX[cnt] = i;\n            gridsY[cnt] = y2;\n            hedge       = y2 * xGrid_1 + i;\n            h_edges[hedge].usage += 1;\n            cnt++;\n          }\n        } else {\n          for (i = bestp1x; i > x2; i--) {\n            gridsX[cnt] = i;\n            gridsY[cnt] = y2;\n            hedge       = y2 * xGrid_1 + i - 1;\n            h_edges[hedge].usage += 1;\n            cnt++;\n          }\n        }\n      }\n\n      gridsX[cnt] = x2;\n      gridsY[cnt] = y2;\n      cnt++;\n\n      treeedge->route.routelen = cnt - 1;\n      free(treeedge->route.gridsX);\n      free(treeedge->route.gridsY);\n\n      treeedge->route.gridsX = (short*)calloc(cnt, sizeof(short));\n      treeedge->route.gridsY = (short*)calloc(cnt, sizeof(short));\n\n      for (i = 0; i < cnt; i++) {\n        treeedge->route.gridsX[i] = gridsX[i];\n        treeedge->route.gridsY[i] = gridsY[i];\n      }\n\n    } // if(x1!=x2 || y1!=y2)\n  }   // non-degraded edge\n}\n\nvoid routeLVAll(int threshold, int expand) {\n  int netID, edgeID, numEdges, i, forange;\n\n  printf(\"%d threshold, %d expand\\n\", threshold, expand);\n\n  h_costTable = (float*)calloc(10 * hCapacity, sizeof(float));\n\n  forange = 10 * hCapacity;\n  for (i = 0; i < forange; i++) {\n    h_costTable[i] =\n        costheight / (exp((float)(hCapacity - i) * LOGIS_COF) + 1) +\n        1; // /hCapacity*30));\n  }\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    numEdges = 2 * sttrees[netID].deg - 3;\n    for (edgeID = 0; edgeID < numEdges; edgeID++) {\n      routeLVEnew(netID, edgeID, threshold,\n                  expand); // ripup previous route and do Monotonic routing\n    }\n  }\n  free(h_costTable);\n  // printf(\"LV routing OK\\n\");\n}\n\nvoid newrouteLInMaze(int netID) {\n  int i, j, d, n1, n2, x1, y1, x2, y2, grid, grid1;\n  int costL1, costL2, tmp;\n  int ymin, ymax;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  d         = sttrees[netID].deg;\n  treeedges = sttrees[netID].edges;\n  
treenodes = sttrees[netID].nodes;\n\n  // loop for all the tree edges (2*d-3)\n  for (i = 0; i < 2 * d - 3; i++) {\n    if (sttrees[netID].edges[i].len >\n        0) // only route the non-degraded edges (len>0)\n    {\n\n      treeedge = &(treeedges[i]);\n\n      n1 = treeedge->n1;\n      n2 = treeedge->n2;\n      x1 = treenodes[n1].x;\n      y1 = treenodes[n1].y;\n      x2 = treenodes[n2].x;\n      y2 = treenodes[n2].y;\n\n      if (y1 < y2) {\n        ymin = y1;\n        ymax = y2;\n      } else {\n        ymin = y2;\n        ymax = y1;\n      }\n\n      treeedge->route.type = LROUTE;\n      if (x1 == x2) // V-routing\n      {\n        for (j = ymin; j < ymax; j++)\n          v_edges[j * xGrid + x1].usage++;\n        treeedge->route.xFirst = FALSE;\n\n      } else if (y1 == y2) // H-routing\n      {\n        for (j = x1; j < x2; j++)\n          h_edges[y1 * (xGrid - 1) + j].usage++;\n        treeedge->route.xFirst = TRUE;\n\n      } else // L-routing\n      {\n\n        costL1 = costL2 = 0;\n\n        for (j = ymin; j < ymax; j++) {\n          grid = j * xGrid;\n          tmp =\n              v_edges[grid + x1].usage - vCapacity_lb + v_edges[grid + x1].red;\n          if (tmp > 0)\n            costL1 += tmp;\n          tmp =\n              v_edges[grid + x2].usage - vCapacity_lb + v_edges[grid + x2].red;\n          if (tmp > 0)\n            costL2 += tmp;\n          // costL1 += simpleCost (v_edges[grid+x1].est_usage,\n          // v_edges[grid+x1].cap); costL2 += simpleCost (\n          // v_edges[grid+x2].est_usage,  v_edges[grid+x2].cap);\n        }\n        grid  = y2 * (xGrid - 1);\n        grid1 = y1 * (xGrid - 1);\n        for (j = x1; j < x2; j++) {\n          tmp = h_edges[grid + j].usage - hCapacity_lb + h_edges[grid + j].red;\n          if (tmp > 0)\n            costL1 += tmp;\n          tmp =\n              h_edges[grid1 + j].usage - hCapacity_lb + h_edges[grid1 + j].red;\n          if (tmp > 0)\n            costL2 += tmp;\n        }\n\n        if 
(costL1 < costL2) {\n\n          // two parts (x1, y1)-(x1, y2) and (x1, y2)-(x2, y2)\n          for (j = ymin; j < ymax; j++) {\n            v_edges[j * xGrid + x1].usage += 1;\n          }\n          grid = y2 * (xGrid - 1);\n          for (j = x1; j < x2; j++) {\n            h_edges[grid + j].usage += 1;\n          }\n          treeedge->route.xFirst = FALSE;\n        } // if costL1<costL2\n        else {\n\n          // two parts (x1, y1)-(x2, y1) and (x2, y1)-(x2, y2)\n          grid = y1 * (xGrid - 1);\n          for (j = x1; j < x2; j++) {\n            h_edges[grid + j].usage += 1;\n          }\n          for (j = ymin; j < ymax; j++) {\n            v_edges[j * xGrid + x2].usage += 1;\n          }\n          treeedge->route.xFirst = TRUE;\n        }\n\n      } // else L-routing\n    }   // if non-degraded edge\n    else\n      sttrees[netID].edges[i].route.type = NOROUTE;\n  } // loop i\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/eda/cpu/sproute/utility.h",
    "content": "#ifndef _UTILITY_H_\n#define _UTILITY_H_\n\n#include <stdio.h>\n#include <stdlib.h>\n\n#include \"DataType.h\"\n#include \"flute.h\"\n#include \"DataProc.h\"\n\nvoid printEdge(int netID, int edgeID) {\n  int i;\n  TreeEdge edge;\n  TreeNode* nodes;\n\n  edge  = sttrees[netID].edges[edgeID];\n  nodes = sttrees[netID].nodes;\n\n  printf(\"edge %d: (%d, %d)->(%d, %d)\\n\", edgeID, nodes[edge.n1].x,\n         nodes[edge.n1].y, nodes[edge.n2].x, nodes[edge.n2].y);\n  for (i = 0; i <= edge.route.routelen; i++) {\n    printf(\"(%d, %d) \", edge.route.gridsX[i], edge.route.gridsY[i]);\n  }\n  printf(\"\\n\");\n}\n\nvoid plotTree(int netID) {\n  short *gridsX, *gridsY;\n  int i, j, Zpoint, n1, n2, x1, x2, y1, y2, ymin, ymax, xmin, xmax;\n\n  RouteType routetype;\n  TreeEdge* treeedge;\n  TreeNode* treenodes;\n  FILE* fp;\n\n  xmin = ymin = 1e5;\n  xmax = ymax = 0;\n\n  fp = fopen(\"plottree\", \"w\");\n  if (fp == NULL) {\n    printf(\"Error in opening file plottree\\n\");\n    exit(1);\n  }\n\n  treenodes = sttrees[netID].nodes;\n  for (i = 0; i < sttrees[netID].deg; i++) {\n    x1 = treenodes[i].x;\n    y1 = treenodes[i].y;\n    fprintf(fp, \"%f %f\\n\", (float)x1 - 0.1, (float)y1);\n    fprintf(fp, \"%f %f\\n\", (float)x1, (float)y1 - 0.1);\n    fprintf(fp, \"%f %f\\n\", (float)x1 + 0.1, (float)y1);\n    fprintf(fp, \"%f %f\\n\", (float)x1, (float)y1 + 0.1);\n    fprintf(fp, \"%f %f\\n\", (float)x1 - 0.1, (float)y1);\n    fprintf(fp, \"\\n\");\n  }\n  for (i = sttrees[netID].deg; i < sttrees[netID].deg * 2 - 2; i++) {\n    x1 = treenodes[i].x;\n    y1 = treenodes[i].y;\n    fprintf(fp, \"%f %f\\n\", (float)x1 - 0.1, (float)y1 + 0.1);\n    fprintf(fp, \"%f %f\\n\", (float)x1 + 0.1, (float)y1 - 0.1);\n    fprintf(fp, \"\\n\");\n    fprintf(fp, \"%f %f\\n\", (float)x1 + 0.1, (float)y1 + 0.1);\n    fprintf(fp, \"%f %f\\n\", (float)x1 - 0.1, (float)y1 - 0.1);\n    fprintf(fp, \"\\n\");\n  }\n\n  for (i = 0; i < sttrees[netID].deg * 2 - 3; i++) {\n    if (1) // 
i!=14)\n    {\n      treeedge = &(sttrees[netID].edges[i]);\n\n      n1   = treeedge->n1;\n      n2   = treeedge->n2;\n      x1   = treenodes[n1].x;\n      y1   = treenodes[n1].y;\n      x2   = treenodes[n2].x;\n      y2   = treenodes[n2].y;\n      xmin = min(xmin, min(x1, x2));\n      xmax = max(xmax, max(x1, x2));\n      ymin = min(ymin, min(y1, y2));\n      ymax = max(ymax, max(y1, y2));\n\n      routetype = treeedge->route.type;\n\n      if (routetype == LROUTE) // remove L routing\n      {\n        if (treeedge->route.xFirst) {\n          fprintf(fp, \"%d %d\\n\", x1, y1);\n          fprintf(fp, \"%d %d\\n\", x2, y1);\n          fprintf(fp, \"%d %d\\n\", x2, y2);\n          fprintf(fp, \"\\n\");\n        } else {\n          fprintf(fp, \"%d %d\\n\", x1, y1);\n          fprintf(fp, \"%d %d\\n\", x1, y2);\n          fprintf(fp, \"%d %d\\n\", x2, y2);\n          fprintf(fp, \"\\n\");\n        }\n      } else if (routetype == ZROUTE) {\n        Zpoint = treeedge->route.Zpoint;\n        if (treeedge->route.HVH) {\n          fprintf(fp, \"%d %d\\n\", x1, y1);\n          fprintf(fp, \"%d %d\\n\", Zpoint, y1);\n          fprintf(fp, \"%d %d\\n\", Zpoint, y2);\n          fprintf(fp, \"%d %d\\n\", x2, y2);\n          fprintf(fp, \"\\n\");\n        } else {\n          fprintf(fp, \"%d %d\\n\", x1, y1);\n          fprintf(fp, \"%d %d\\n\", x1, Zpoint);\n          fprintf(fp, \"%d %d\\n\", x2, Zpoint);\n          fprintf(fp, \"%d %d\\n\", x2, y2);\n          fprintf(fp, \"\\n\");\n        }\n      } else if (routetype == MAZEROUTE) {\n        gridsX = treeedge->route.gridsX;\n        gridsY = treeedge->route.gridsY;\n        for (j = 0; j <= treeedge->route.routelen; j++) {\n          fprintf(fp, \"%d %d\\n\", gridsX[j], gridsY[j]);\n        }\n        fprintf(fp, \"\\n\");\n      }\n    }\n  }\n\n  fprintf(fp, \"%d %d\\n\", xmin - 2, ymin - 2);\n  fprintf(fp, \"\\n\");\n  fprintf(fp, \"%d %d\\n\", xmax + 2, ymax + 2);\n  fclose(fp);\n}\n\nvoid getlen() {\n  int i, edgeID, 
totlen = 0;\n  TreeEdge* treeedge;\n\n  for (i = 0; i < numValidNets; i++) {\n    for (edgeID = 0; edgeID < 2 * sttrees[i].deg - 3; edgeID++) {\n      treeedge = &(sttrees[i].edges[edgeID]);\n      if (treeedge->route.type < MAZEROUTE)\n        printf(\"wrong\\n\");\n      //                totlen += ADIFF(treenodes[treeedge->n1].x,\n      //                treenodes[treeedge->n2].x) +\n      //                ADIFF(treenodes[treeedge->n1].y,\n      //                treenodes[treeedge->n2].y);\n      else\n        totlen += treeedge->route.routelen;\n    }\n  }\n  printf(\"Routed len: %d\\n\", totlen);\n}\n\nvoid ConvertToFull3DType2() {\n  short *gridsX, *gridsY, *gridsL, tmpX[MAXLEN], tmpY[MAXLEN], tmpL[MAXLEN];\n  int k, netID, edgeID, routeLen;\n  int newCNT, numVIA, deg, j;\n  TreeEdge *treeedges, *treeedge;\n\n  numVIA = 0;\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treeedges = sttrees[netID].edges;\n    deg       = sttrees[netID].deg;\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n      treeedge = &(treeedges[edgeID]);\n      if (treeedge->len > 0) {\n\n        newCNT   = 0;\n        routeLen = treeedge->route.routelen;\n        //\t\t\t\tprintf(\"netID %d, edgeID %d, len %d\\n\",netID, edgeID,\n        // routeLen);\n        gridsX = treeedge->route.gridsX;\n        gridsY = treeedge->route.gridsY;\n        gridsL = treeedge->route.gridsL;\n        /*\n                        if (edgeID == treenodes[n1a].hID) {\n                            for (k = treenodes[n1a].botL; k <\n           treenodes[n1a].topL; k++) { tmpX[newCNT] = gridsX[0]; tmpY[newCNT] =\n           gridsY[0]; tmpL[newCNT] = k; newCNT++; numVIA++;\n                            }\n                        }\n                        */\n\n        // finish from n1->real route\n\n        for (j = 0; j < routeLen; j++) {\n          tmpX[newCNT] = gridsX[j];\n          tmpY[newCNT] = gridsY[j];\n          tmpL[newCNT] = gridsL[j];\n          newCNT++;\n\n          if 
(gridsL[j] > gridsL[j + 1]) {\n            for (k = gridsL[j]; k > gridsL[j + 1]; k--) {\n              tmpX[newCNT] = gridsX[j + 1];\n              tmpY[newCNT] = gridsY[j + 1];\n              tmpL[newCNT] = k;\n              newCNT++;\n              numVIA++;\n            }\n          } else if (gridsL[j] < gridsL[j + 1]) {\n            for (k = gridsL[j]; k < gridsL[j + 1]; k++) {\n              tmpX[newCNT] = gridsX[j + 1];\n              tmpY[newCNT] = gridsY[j + 1];\n              tmpL[newCNT] = k;\n              newCNT++;\n              numVIA++;\n            }\n          }\n        }\n        tmpX[newCNT] = gridsX[j];\n        tmpY[newCNT] = gridsY[j];\n        tmpL[newCNT] = gridsL[j];\n        newCNT++;\n\n        /*\n        if (edgeID == treenodes[n2a].hID) {\n            if (treenodes[n2a].topL != treenodes[n2a].botL)\n            for (k = treenodes[n2a].topL-1; k >= treenodes[n2a].botL; k--) {\n                tmpX[newCNT] = gridsX[routeLen];\n                tmpY[newCNT] = gridsY[routeLen];\n                tmpL[newCNT] = k;\n                newCNT++;\n                numVIA++;\n            }\n        }\n        */\n        // last grid -> node2 finished\n\n        if (treeedges[edgeID].route.type == MAZEROUTE) {\n          free(treeedges[edgeID].route.gridsX);\n          free(treeedges[edgeID].route.gridsY);\n          free(treeedges[edgeID].route.gridsL);\n        }\n        treeedge->route.gridsX   = (short*)calloc(newCNT, sizeof(short));\n        treeedge->route.gridsY   = (short*)calloc(newCNT, sizeof(short));\n        treeedge->route.gridsL   = (short*)calloc(newCNT, sizeof(short));\n        treeedge->route.type     = MAZEROUTE;\n        treeedge->route.routelen = newCNT - 1;\n\n        for (k = 0; k < newCNT; k++) {\n          treeedge->route.gridsX[k] = tmpX[k];\n          treeedge->route.gridsY[k] = tmpY[k];\n          treeedge->route.gridsL[k] = tmpL[k];\n        }\n      }\n      //\t\t\tprintEdge3D(netID, edgeID);\n    }\n  }\n  
//\tprintf(\"Total number of via %d\\n\",numVIA);\n}\n\nstatic int comparePVMINX(const void* a, const void* b) {\n  if (((OrderNetPin*)a)->minX > ((OrderNetPin*)b)->minX)\n    return 1;\n  else if (((OrderNetPin*)a)->minX == ((OrderNetPin*)b)->minX)\n    return 0;\n  else\n    return -1;\n}\n\nstatic int comparePVPV(const void* a, const void* b) {\n  if (((OrderNetPin*)a)->npv > ((OrderNetPin*)b)->npv)\n    return 1;\n  else if (((OrderNetPin*)a)->npv == ((OrderNetPin*)b)->npv)\n    return 0;\n  else\n    return -1;\n}\n\nvoid netpinOrderInc() {\n  int j, d, ind, totalLength, xmin;\n  TreeNode* treenodes;\n  StTree* stree;\n\n  float npvalue;\n\n  numTreeedges = 0;\n  for (j = 0; j < numValidNets; j++) {\n    d = sttrees[j].deg;\n    numTreeedges += 2 * d - 3;\n  }\n\n  if (treeOrderPV != NULL) {\n    free(treeOrderPV);\n  }\n\n  treeOrderPV = (OrderNetPin*)malloc(numValidNets * sizeof(OrderNetPin));\n\n  for (j = 0; j < numValidNets; j++) {\n    xmin        = BIG_INT;\n    totalLength = 0;\n    treenodes   = sttrees[j].nodes;\n    stree       = &(sttrees[j]);\n    d           = stree->deg;\n    for (ind = 0; ind < 2 * d - 3; ind++) {\n      totalLength += stree->edges[ind].len;\n      if (xmin < treenodes[stree->edges[ind].n1].x) {\n        xmin = treenodes[stree->edges[ind].n1].x;\n      }\n    }\n\n    npvalue = (float)totalLength / d;\n\n    treeOrderPV[j].npv       = npvalue;\n    treeOrderPV[j].treeIndex = j;\n    treeOrderPV[j].minX      = xmin;\n  }\n\n  qsort(treeOrderPV, numValidNets, sizeof(OrderNetPin), comparePVMINX);\n  qsort(treeOrderPV, numValidNets, sizeof(OrderNetPin), comparePVPV);\n}\n\nvoid fillVIA() {\n  short tmpX[MAXLEN], tmpY[MAXLEN], *gridsX, *gridsY, *gridsL, tmpL[MAXLEN];\n  int k, netID, edgeID, routeLen, n1a, n2a;\n  int n1, n2, newCNT, numVIAT1, numVIAT2, deg, j;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  numVIAT1 = 0;\n  numVIAT2 = 0;\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treeedges = 
sttrees[netID].edges;\n    deg       = sttrees[netID].deg;\n    treenodes = sttrees[netID].nodes;\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n      treeedge = &(treeedges[edgeID]);\n      if (treeedge->len > 0) {\n\n        newCNT   = 0;\n        routeLen = treeedge->route.routelen;\n        //\t\t\t\tprintf(\"netID %d, edgeID %d, len %d\\n\",netID, edgeID,\n        // routeLen);\n        n1     = treeedge->n1;\n        n2     = treeedge->n2;\n        gridsX = treeedge->route.gridsX;\n        gridsY = treeedge->route.gridsY;\n        gridsL = treeedge->route.gridsL;\n\n        n1a = treenodes[n1].stackAlias;\n\n        n2a = treenodes[n2].stackAlias;\n\n        n1a = treeedge->n1a;\n        n2a = treeedge->n2a;\n\n        if (edgeID == treenodes[n1a].hID || edgeID == treenodes[n2a].hID) {\n\n          if (edgeID == treenodes[n1a].hID) {\n\n            for (k = treenodes[n1a].botL; k < treenodes[n1a].topL; k++) {\n              tmpX[newCNT] = gridsX[0];\n              tmpY[newCNT] = gridsY[0];\n              tmpL[newCNT] = k;\n              newCNT++;\n              if (n1a < deg) {\n                numVIAT1++;\n              } else {\n                numVIAT2++;\n              }\n            }\n          }\n\n          // finish from n1->real route\n\n          for (j = 0; j < routeLen; j++) {\n            tmpX[newCNT] = gridsX[j];\n            tmpY[newCNT] = gridsY[j];\n            tmpL[newCNT] = gridsL[j];\n            newCNT++;\n\n            /*\t\t\t\t\t\tif (gridsL[j] > gridsL[j+1]) {\n                                        printf(\"fill via should not entered\\n\");\n                                        for (k = gridsL[j]; k > gridsL[j+1];\n               k--) { tmpX[newCNT] = gridsX[j+1]; tmpY[newCNT] = gridsY[j+1];\n                                            tmpL[newCNT] = k;\n                                            newCNT++;\n                                            numVIA++;\n                                        }\n            
                        } else if (gridsL[j] < gridsL[j+1]){\n                                        printf(\"fill via should not entered\\n\");\n                                        for (k = gridsL[j]; k < gridsL[j+1];\n               k++) { tmpX[newCNT] = gridsX[j+1]; tmpY[newCNT] = gridsY[j+1];\n                                            tmpL[newCNT] = k;\n                                            newCNT++;\n                                            numVIA++;\n                                        }\n                                    }\n                                    */\n          }\n          tmpX[newCNT] = gridsX[j];\n          tmpY[newCNT] = gridsY[j];\n          tmpL[newCNT] = gridsL[j];\n          newCNT++;\n\n          if (edgeID == treenodes[n2a].hID) {\n            if (treenodes[n2a].topL != treenodes[n2a].botL)\n              for (k = treenodes[n2a].topL - 1; k >= treenodes[n2a].botL; k--) {\n                tmpX[newCNT] = gridsX[routeLen];\n                tmpY[newCNT] = gridsY[routeLen];\n                tmpL[newCNT] = k;\n                newCNT++;\n                if (n2a < deg) {\n                  numVIAT1++;\n                } else {\n                  numVIAT2++;\n                }\n              }\n          }\n          // last grid -> node2 finished\n\n          if (treeedges[edgeID].route.type == MAZEROUTE) {\n            free(treeedges[edgeID].route.gridsX);\n            free(treeedges[edgeID].route.gridsY);\n            free(treeedges[edgeID].route.gridsL);\n          }\n          treeedge->route.gridsX   = (short*)calloc(newCNT, sizeof(short));\n          treeedge->route.gridsY   = (short*)calloc(newCNT, sizeof(short));\n          treeedge->route.gridsL   = (short*)calloc(newCNT, sizeof(short));\n          treeedge->route.type     = MAZEROUTE;\n          treeedge->route.routelen = newCNT - 1;\n\n          for (k = 0; k < newCNT; k++) {\n            treeedge->route.gridsX[k] = tmpX[k];\n            
treeedge->route.gridsY[k] = tmpY[k];\n            treeedge->route.gridsL[k] = tmpL[k];\n          }\n        } // if edgeID == treenodes[n1a].hID || edgeID == treenodes[n2a].hID\n      }\n      //\t\t\tprintEdge3D(netID, edgeID);\n    }\n  }\n  printf(\"via related to pin nodes %d\\n\", numVIAT1);\n  printf(\"via related stiner nodes %d\\n\", numVIAT2);\n}\n\nint threeDVIA() {\n  short* gridsL;\n  int netID, edgeID, deg;\n  int routeLen, numVIA, j;\n  TreeEdge *treeedges, *treeedge;\n\n  numVIA = 0;\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treeedges = sttrees[netID].edges;\n    deg       = sttrees[netID].deg;\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n\n      treeedge = &(treeedges[edgeID]);\n\n      if (treeedge->len > 0) {\n\n        routeLen = treeedge->route.routelen;\n        gridsL   = treeedge->route.gridsL;\n\n        for (j = 0; j < routeLen; j++) {\n          if (gridsL[j] != gridsL[j + 1]) {\n            numVIA++;\n          }\n        }\n      }\n    }\n  }\n\n  // printf(\"num of vias %d\\n\",numVIA);\n  return (numVIA);\n}\n\nvoid assignEdge(int netID, int edgeID, Bool processDIR) {\n\n  short *gridsX, *gridsY, *gridsL;\n  int i, k, l, grid, min_x, min_y, routelen, n1a, n2a, last_layer;\n  int min_result, endLayer = 0;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  treeedges = sttrees[netID].edges;\n  treenodes = sttrees[netID].nodes;\n  treeedge  = &(treeedges[edgeID]);\n\n  gridsX = treeedge->route.gridsX;\n  gridsY = treeedge->route.gridsY;\n  gridsL = treeedge->route.gridsL;\n\n  routelen = treeedge->route.routelen;\n  n1a      = treeedge->n1a;\n  n2a      = treeedge->n2a;\n\n  for (l = 0; l < numLayers; l++) {\n    for (k = 0; k <= routelen; k++) {\n      gridD[l][k]   = BIG_INT;\n      viaLink[l][k] = BIG_INT;\n    }\n  }\n\n  for (k = 0; k < routelen; k++) {\n    if (gridsX[k] == gridsX[k + 1]) {\n      min_y = min(gridsY[k], gridsY[k + 1]);\n      for (l = 0; l < numLayers; l++) {\n        
grid            = l * gridV + min_y * xGrid + gridsX[k];\n        layerGrid[l][k] = v_edges3D[grid].cap - v_edges3D[grid].usage;\n      }\n    } else {\n      min_x = min(gridsX[k], gridsX[k + 1]);\n      for (l = 0; l < numLayers; l++) {\n        grid            = l * gridH + gridsY[k] * (xGrid - 1) + min_x;\n        layerGrid[l][k] = h_edges3D[grid].cap - h_edges3D[grid].usage;\n      }\n    }\n  }\n\n  if (processDIR) {\n    if (treenodes[n1a].assigned) {\n      for (l = treenodes[n1a].botL; l <= treenodes[n1a].topL; l++) {\n        gridD[l][0] = 0;\n      }\n    } else {\n      printf(\"warning, start point not assigned\\n\");\n      fflush(stdout);\n    }\n\n    for (k = 0; k < routelen; k++) {\n      for (l = 0; l < numLayers; l++) {\n        for (i = 0; i < numLayers; i++) {\n          if (k == 0) {\n            if (l != i) {\n              if (gridD[i][k] > gridD[l][k] + ADIFF(i, l) * 2) {\n                gridD[i][k]   = gridD[l][k] + ADIFF(i, l) * 2;\n                viaLink[i][k] = l;\n              }\n            }\n          } else {\n            if (l != i) {\n              if (gridD[i][k] > gridD[l][k] + ADIFF(i, l) * 3) {\n                gridD[i][k]   = gridD[l][k] + ADIFF(i, l) * 3;\n                viaLink[i][k] = l;\n              }\n            }\n          }\n        }\n      }\n      for (l = 0; l < numLayers; l++) {\n        if (layerGrid[l][k] > 0) {\n          gridD[l][k + 1] = gridD[l][k] + 1;\n        } else {\n          gridD[l][k + 1] = gridD[l][k] + BIG_INT;\n        }\n      }\n    }\n\n    for (l = 0; l < numLayers; l++) {\n      for (i = 0; i < numLayers; i++) {\n        if (l != i) {\n          if (gridD[i][k] >\n              gridD[l][k] + ADIFF(i, l) * 1) { //+ ADIFF(i,l) * 3 ) {\n            gridD[i][k]   = gridD[l][k] + ADIFF(i, l) * 1; //+ ADIFF(i,l) * 3 ;\n            viaLink[i][k] = l;\n          }\n        }\n      }\n    }\n\n    k = routelen;\n\n    if (treenodes[n2a].assigned) {\n      min_result = BIG_INT;\n      for 
(i = treenodes[n2a].topL; i >= treenodes[n2a].botL; i--) {\n        if (gridD[i][routelen] < min_result) {\n          min_result = gridD[i][routelen];\n          endLayer   = i;\n        }\n      }\n    } else {\n      min_result = gridD[0][routelen];\n      endLayer   = 0;\n      for (i = 0; i < numLayers; i++) {\n        if (gridD[i][routelen] < min_result) {\n          min_result = gridD[i][routelen];\n          endLayer   = i;\n        }\n      }\n    }\n\n    k = routelen;\n\n    if (viaLink[endLayer][routelen] == BIG_INT) {\n\n      last_layer = endLayer;\n      // printf(\"endlayer: %d\\n\", last_layer);\n    } else {\n      last_layer = viaLink[endLayer][routelen];\n      // printf(\"vialink last layer: %d\\n\", last_layer);\n    }\n\n    for (k = routelen; k >= 0; k--) {\n      gridsL[k] = last_layer;\n      if (viaLink[last_layer][k] == BIG_INT) {\n        // last_layer = last_layer;\n      } else {\n        last_layer = viaLink[last_layer][k];\n      }\n    }\n\n    if (gridsL[0] < treenodes[n1a].botL) {\n      treenodes[n1a].botL = gridsL[0];\n      treenodes[n1a].lID  = edgeID;\n    }\n    if (gridsL[0] > treenodes[n1a].topL) {\n      treenodes[n1a].topL = gridsL[0];\n      treenodes[n1a].hID  = edgeID;\n    }\n\n    k = routelen;\n    if (treenodes[n2a].assigned) {\n\n      if (gridsL[routelen] < treenodes[n2a].botL) {\n        treenodes[n2a].botL = gridsL[routelen];\n        treenodes[n2a].lID  = edgeID;\n      }\n      if (gridsL[routelen] > treenodes[n2a].topL) {\n        treenodes[n2a].topL = gridsL[routelen];\n        treenodes[n2a].hID  = edgeID;\n      }\n\n    } else {\n      // treenodes[n2a].assigned = TRUE;\n      treenodes[n2a].topL = gridsL[routelen]; // max(endLayer,\n                                              // gridsL[routelen]);\n      treenodes[n2a].botL = gridsL[routelen]; // min(endLayer,\n                                              // gridsL[routelen]);\n      treenodes[n2a].lID = treenodes[n2a].hID = edgeID;\n    }\n\n    if 
(treenodes[n2a].assigned) {\n      if (gridsL[routelen] > treenodes[n2a].topL ||\n          gridsL[routelen] < treenodes[n2a].botL) {\n        printf(\"target ending layer out of range\\n\");\n      }\n    }\n\n  } else {\n\n    if (treenodes[n2a].assigned) {\n      for (l = treenodes[n2a].botL; l <= treenodes[n2a].topL; l++) {\n        gridD[l][routelen] = 0;\n      }\n    }\n\n    for (k = routelen; k > 0; k--) {\n      for (l = 0; l < numLayers; l++) {\n        for (i = 0; i < numLayers; i++) {\n          if (k == routelen) {\n            if (l != i) {\n              if (gridD[i][k] > gridD[l][k] + ADIFF(i, l) * 2) {\n                gridD[i][k]   = gridD[l][k] + ADIFF(i, l) * 2;\n                viaLink[i][k] = l;\n              }\n            }\n          } else {\n            if (l != i) {\n              if (gridD[i][k] > gridD[l][k] + ADIFF(i, l) * 3) {\n                gridD[i][k]   = gridD[l][k] + ADIFF(i, l) * 3;\n                viaLink[i][k] = l;\n              }\n            }\n          }\n        }\n      }\n      for (l = 0; l < numLayers; l++) {\n        if (layerGrid[l][k - 1] > 0) {\n          gridD[l][k - 1] = gridD[l][k] + 1;\n        } else {\n          gridD[l][k - 1] = gridD[l][k] + BIG_INT;\n        }\n      }\n    }\n\n    for (l = 0; l < numLayers; l++) {\n      for (i = 0; i < numLayers; i++) {\n        if (l != i) {\n          if (gridD[i][0] > gridD[l][0] + ADIFF(i, l) * 1) {\n            gridD[i][0]   = gridD[l][0] + ADIFF(i, l) * 1;\n            viaLink[i][0] = l;\n          }\n        }\n      }\n    }\n\n    if (treenodes[n1a].assigned) {\n      min_result = BIG_INT;\n      for (i = treenodes[n1a].topL; i >= treenodes[n1a].botL; i--) {\n        if (gridD[i][k] < min_result) {\n          min_result = gridD[i][0];\n          endLayer   = i;\n        }\n      }\n\n    } else {\n      min_result = gridD[0][k];\n      endLayer   = 0;\n      for (i = 0; i < numLayers; i++) {\n        if (gridD[i][k] < min_result) {\n          min_result 
= gridD[i][k];\n          endLayer   = i;\n        }\n      }\n    }\n\n    last_layer = endLayer;\n\n    for (k = 0; k <= routelen; k++) {\n      if (viaLink[last_layer][k] == BIG_INT) {\n        // last_layer = last_layer;\n      } else {\n        last_layer = viaLink[last_layer][k];\n      }\n      gridsL[k] = last_layer;\n    }\n\n    gridsL[routelen] = gridsL[routelen - 1];\n\n    if (gridsL[routelen] < treenodes[n2a].botL) {\n      treenodes[n2a].botL = gridsL[routelen];\n      treenodes[n2a].lID  = edgeID;\n    }\n    if (gridsL[routelen] > treenodes[n2a].topL) {\n      treenodes[n2a].topL = gridsL[routelen];\n      treenodes[n2a].hID  = edgeID;\n    }\n\n    if (treenodes[n1a].assigned) {\n\n      if (gridsL[0] < treenodes[n1a].botL) {\n        treenodes[n1a].botL = gridsL[0];\n        treenodes[n1a].lID  = edgeID;\n      }\n      if (gridsL[0] > treenodes[n1a].topL) {\n        treenodes[n1a].topL = gridsL[0];\n        treenodes[n1a].hID  = edgeID;\n      }\n\n    } else {\n      // treenodes[n1a].assigned = TRUE;\n      treenodes[n1a].topL = gridsL[0]; // max(endLayer, gridsL[0]);\n      treenodes[n1a].botL = gridsL[0]; // min(endLayer, gridsL[0]);\n      treenodes[n1a].lID = treenodes[n1a].hID = edgeID;\n    }\n  }\n  treeedge->assigned = TRUE;\n\n  for (k = 0; k < routelen; k++) {\n    if (gridsX[k] == gridsX[k + 1]) {\n      min_y = min(gridsY[k], gridsY[k + 1]);\n      grid  = gridsL[k] * gridV + min_y * xGrid + gridsX[k];\n\n      if (v_edges3D[grid].usage < v_edges3D[grid].cap) {\n        v_edges3D[grid].usage++;\n\n      } else {\n        v_edges3D[grid].usage++;\n      }\n\n    } else {\n      min_x = min(gridsX[k], gridsX[k + 1]);\n      grid  = gridsL[k] * gridH + gridsY[k] * (xGrid - 1) + min_x;\n\n      if (h_edges3D[grid].usage < h_edges3D[grid].cap) {\n        h_edges3D[grid].usage++;\n      } else {\n        h_edges3D[grid].usage++;\n      }\n    }\n  }\n}\n\nvoid newLayerAssignmentV4() {\n  short* gridsL;\n  int i, k, netID, edgeID, nodeID, 
routeLen;\n  int n1, n2, connectionCNT, deg;\n\n  int n1a, n2a;\n  int quehead, quetail;\n  int edgeQueue[5000];\n  int sumcheck = 0;\n\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    deg       = sttrees[netID].deg;\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n\n      treeedge = &(treeedges[edgeID]);\n      if (treeedge->len > 0) {\n\n        routeLen               = treeedge->route.routelen;\n        treeedge->route.gridsL = (short*)calloc((routeLen + 1), sizeof(short));\n        treeedge->assigned     = FALSE;\n      }\n    }\n  }\n  netpinOrderInc();\n\n  for (i = 0; i < numValidNets; i++) {\n\n    netID     = treeOrderPV[i].treeIndex;\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    deg       = sttrees[netID].deg;\n    quehead = quetail = 0;\n\n    for (nodeID = 0; nodeID < deg; nodeID++) {\n      for (k = 0; k < treenodes[nodeID].conCNT; k++) {\n        edgeID = treenodes[nodeID].eID[k];\n        if (!treeedges[edgeID].assigned) {\n          edgeQueue[quetail]         = edgeID;\n          treeedges[edgeID].assigned = TRUE;\n          quetail++;\n        }\n      }\n    }\n\n    while (quehead != quetail) {\n      edgeID   = edgeQueue[quehead];\n      treeedge = &(treeedges[edgeID]);\n      sumcheck += treeedge->route.routelen;\n      if (treenodes[treeedge->n1a].assigned) {\n        assignEdge(netID, edgeID, 1);\n        treeedge->assigned = TRUE;\n        if (!treenodes[treeedge->n2a].assigned) {\n          for (k = 0; k < treenodes[treeedge->n2a].conCNT; k++) {\n            edgeID = treenodes[treeedge->n2a].eID[k];\n            if (!treeedges[edgeID].assigned) {\n              edgeQueue[quetail]         = edgeID;\n              treeedges[edgeID].assigned = TRUE;\n              quetail++;\n            }\n          }\n          treenodes[treeedge->n2a].assigned = 
TRUE;\n        }\n      } else {\n        assignEdge(netID, edgeID, 0);\n        treeedge->assigned = TRUE;\n        if (!treenodes[treeedge->n1a].assigned) {\n          for (k = 0; k < treenodes[treeedge->n1a].conCNT; k++) {\n            edgeID = treenodes[treeedge->n1a].eID[k];\n            if (!treeedges[edgeID].assigned) {\n              edgeQueue[quetail]         = edgeID;\n              treeedges[edgeID].assigned = TRUE;\n              quetail++;\n            }\n          }\n          treenodes[treeedge->n1a].assigned = TRUE;\n        }\n      }\n      quehead++;\n    }\n\n    deg = sttrees[netID].deg;\n\n    for (nodeID = 0; nodeID < 2 * deg - 2; nodeID++) {\n      treenodes[nodeID].topL     = -1;\n      treenodes[nodeID].botL     = numLayers;\n      treenodes[nodeID].conCNT   = 0;\n      treenodes[nodeID].hID      = BIG_INT;\n      treenodes[nodeID].lID      = BIG_INT;\n      treenodes[nodeID].status   = 0;\n      treenodes[nodeID].assigned = FALSE;\n\n      if (nodeID < deg) {\n        treenodes[nodeID].botL     = 0;\n        treenodes[nodeID].assigned = TRUE;\n        treenodes[nodeID].status   = 1;\n      }\n    }\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n\n      treeedge = &(treeedges[edgeID]);\n\n      if (treeedge->len > 0) {\n\n        routeLen = treeedge->route.routelen;\n\n        n1     = treeedge->n1;\n        n2     = treeedge->n2;\n        gridsL = treeedge->route.gridsL;\n\n        n1a                                   = treenodes[n1].stackAlias;\n        n2a                                   = treenodes[n2].stackAlias;\n        connectionCNT                         = treenodes[n1a].conCNT;\n        treenodes[n1a].heights[connectionCNT] = gridsL[0];\n        treenodes[n1a].eID[connectionCNT]     = edgeID;\n        treenodes[n1a].conCNT++;\n\n        if (gridsL[0] > treenodes[n1a].topL) {\n          treenodes[n1a].hID  = edgeID;\n          treenodes[n1a].topL = gridsL[0];\n        }\n        if (gridsL[0] < treenodes[n1a].botL) 
{\n          treenodes[n1a].lID  = edgeID;\n          treenodes[n1a].botL = gridsL[0];\n        }\n\n        treenodes[n1a].assigned = TRUE;\n\n        connectionCNT                         = treenodes[n2a].conCNT;\n        treenodes[n2a].heights[connectionCNT] = gridsL[routeLen];\n        treenodes[n2a].eID[connectionCNT]     = edgeID;\n        treenodes[n2a].conCNT++;\n        if (gridsL[routeLen] > treenodes[n2a].topL) {\n          treenodes[n2a].hID  = edgeID;\n          treenodes[n2a].topL = gridsL[routeLen];\n        }\n        if (gridsL[routeLen] < treenodes[n2a].botL) {\n          treenodes[n2a].lID  = edgeID;\n          treenodes[n2a].botL = gridsL[routeLen];\n        }\n\n        treenodes[n2a].assigned = TRUE;\n\n      } // edge len > 0\n    }   // eunmerating edges\n  }\n\n  // printf(\"sum check number 2 %d\\n\",sumcheck);\n}\n\nvoid newLA() {\n  int netID, d, k, edgeID, deg, numpoints, n1, n2;\n  Bool redundant;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    deg       = sttrees[netID].deg;\n\n    numpoints = 0;\n\n    for (d = 0; d < 2 * deg - 2; d++) {\n      treenodes[d].topL = -1;\n      treenodes[d].botL = numLayers;\n      // treenodes[d].l = 0;\n      treenodes[d].assigned   = FALSE;\n      treenodes[d].stackAlias = d;\n      treenodes[d].conCNT     = 0;\n      treenodes[d].hID        = BIG_INT;\n      treenodes[d].lID        = BIG_INT;\n      treenodes[d].status     = 0;\n\n      if (d < deg) {\n        treenodes[d].botL = treenodes[d].topL = 0;\n        // treenodes[d].l = 0;\n        treenodes[d].assigned = TRUE;\n        treenodes[d].status   = 1;\n\n        xcor[numpoints] = treenodes[d].x;\n        ycor[numpoints] = treenodes[d].y;\n        dcor[numpoints] = d;\n        numpoints++;\n      } else {\n        redundant = FALSE;\n        for (k = 0; k < numpoints; k++) {\n          if 
((treenodes[d].x == xcor[k]) && (treenodes[d].y == ycor[k])) {\n            treenodes[d].stackAlias = dcor[k];\n\n            redundant = TRUE;\n            break;\n          }\n        }\n        if (!redundant) {\n          xcor[numpoints] = treenodes[d].x;\n          ycor[numpoints] = treenodes[d].y;\n          dcor[numpoints] = d;\n          numpoints++;\n        }\n      }\n    }\n  }\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treeedges = sttrees[netID].edges;\n    treenodes = sttrees[netID].nodes;\n    deg       = sttrees[netID].deg;\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n      treeedge = &(treeedges[edgeID]);\n      if (treeedge->len > 0) {\n\n        n1 = treeedge->n1;\n        n2 = treeedge->n2;\n\n        treeedge->n1a = treenodes[n1].stackAlias;\n        treenodes[treeedge->n1a].eID[treenodes[treeedge->n1a].conCNT] = edgeID;\n        treenodes[treeedge->n1a].conCNT++;\n        treeedge->n2a = treenodes[n2].stackAlias;\n        treenodes[treeedge->n2a].eID[treenodes[treeedge->n2a].conCNT] = edgeID;\n        treenodes[treeedge->n2a].conCNT++;\n      }\n    }\n  }\n\n  printf(\"node processing\\n\");\n  newLayerAssignmentV4();\n  printf(\"layer assignment\\n\");\n  ConvertToFull3DType2();\n}\n\nvoid printEdge3D(int netID, int edgeID) {\n  int i;\n  TreeEdge edge;\n  TreeNode* nodes;\n\n  edge  = sttrees[netID].edges[edgeID];\n  nodes = sttrees[netID].nodes;\n\n  printf(\"edge %d: n1 %d (%d, %d)-> n2 %d(%d, %d)\\n\", edgeID, edge.n1,\n         nodes[edge.n1].x, nodes[edge.n1].y, edge.n2, nodes[edge.n2].x,\n         nodes[edge.n2].y);\n  if (edge.len > 0) {\n    for (i = 0; i <= edge.route.routelen; i++) {\n      printf(\"(%d, %d,%d) \", edge.route.gridsX[i], edge.route.gridsY[i],\n             edge.route.gridsL[i]);\n    }\n    printf(\"\\n\");\n  }\n}\n\nvoid printTree3D(int netID) {\n  int edgeID, nodeID;\n  for (nodeID = 0; nodeID < 2 * sttrees[netID].deg - 2; nodeID++) {\n    printf(\"nodeID %d,  [%d, %d]\\n\", nodeID, 
sttrees[netID].nodes[nodeID].y,\n           sttrees[netID].nodes[nodeID].x);\n  }\n\n  for (edgeID = 0; edgeID < 2 * sttrees[netID].deg - 3; edgeID++) {\n    printEdge3D(netID, edgeID);\n  }\n}\n\nvoid checkRoute3D() {\n  short *gridsX, *gridsY, *gridsL;\n  int i, netID, edgeID, nodeID, edgelength;\n  int n1, n2, x1, y1, x2, y2, deg;\n  int distance;\n  Bool gridFlag;\n  TreeEdge* treeedge;\n  TreeNode* treenodes;\n\n  for (netID = 0; netID < numValidNets; netID++) {\n\n    treenodes = sttrees[netID].nodes;\n    deg       = sttrees[netID].deg;\n\n    for (nodeID = 0; nodeID < 2 * deg - 2; nodeID++) {\n      if (nodeID < deg) {\n        if (treenodes[nodeID].botL != 0) {\n          printf(\"causing pin node floating\\n\");\n        }\n\n        if (treenodes[nodeID].botL > treenodes[nodeID].topL) {\n          // printf(\"pin node l %d h %d wrong lid %d hid %d\\n\",\n          // treenodes[nodeID].botL, treenodes[nodeID].topL,\n          // treenodes[nodeID].lID, treenodes[nodeID].hID);\n        }\n      }\n    }\n    for (edgeID = 0; edgeID < 2 * sttrees[netID].deg - 3; edgeID++) {\n      if (sttrees[netID].edges[edgeID].len == 0) {\n        continue;\n      }\n      treeedge   = &(sttrees[netID].edges[edgeID]);\n      edgelength = treeedge->route.routelen;\n      n1         = treeedge->n1;\n      n2         = treeedge->n2;\n      x1         = treenodes[n1].x;\n      y1         = treenodes[n1].y;\n      x2         = treenodes[n2].x;\n      y2         = treenodes[n2].y;\n      gridsX     = treeedge->route.gridsX;\n      gridsY     = treeedge->route.gridsY;\n      gridsL     = treeedge->route.gridsL;\n\n      gridFlag = FALSE;\n\n      if (gridsX[0] != x1 || gridsY[0] != y1) {\n        printf(\"net[%d] edge[%d] start node wrong, net deg %d, n1 %d\\n\", netID,\n               edgeID, deg, n1);\n        printEdge3D(netID, edgeID);\n      }\n      if (gridsX[edgelength] != x2 || gridsY[edgelength] != y2) {\n        printf(\"net[%d] edge[%d] end node wrong, net deg %d, n2 
%d\\n\", netID,\n               edgeID, deg, n2);\n        printEdge3D(netID, edgeID);\n      }\n      for (i = 0; i < treeedge->route.routelen; i++) {\n        distance = ADIFF(gridsX[i + 1], gridsX[i]) +\n                   ADIFF(gridsY[i + 1], gridsY[i]) +\n                   ADIFF(gridsL[i + 1], gridsL[i]);\n        if (distance > 1 || distance < 0) {\n          gridFlag = TRUE;\n          printf(\"net[%d] edge[%d] maze route wrong, distance %d, i %d\\n\",\n                 netID, edgeID, distance, i);\n          printf(\"current [%d, %d, %d], next [%d, %d, %d]\", gridsL[i],\n                 gridsY[i], gridsX[i], gridsL[i + 1], gridsY[i + 1],\n                 gridsX[i + 1]);\n        }\n      }\n\n      for (i = 0; i <= treeedge->route.routelen; i++) {\n        if (gridsL[i] < 0) {\n          printf(\"gridsL less than 0, %d\\n\", gridsL[i]);\n        }\n      }\n      if (gridFlag) {\n        printEdge3D(netID, edgeID);\n      }\n    }\n  }\n}\n\nvoid write3D() {\n  short *gridsX, *gridsY, *gridsL;\n  int netID, i, edgeID, deg, lastX, lastY, lastL, xreal, yreal, routeLen;\n  TreeEdge *treeedges, *treeedge;\n  FILE* fp;\n  TreeEdge edge;\n\n  fp = fopen(\"output.out\", \"w\");\n  if (fp == NULL) {\n    printf(\"Error in opening %s\\n\", \"output.out\");\n    exit(1);\n  }\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    fprintf(fp, \"%s %d\\n\", nets[netID]->name, netID);\n    treeedges = sttrees[netID].edges;\n    deg       = sttrees[netID].deg;\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n      edge     = sttrees[netID].edges[edgeID];\n      treeedge = &(treeedges[edgeID]);\n      if (treeedge->len > 0) {\n\n        routeLen = treeedge->route.routelen;\n        gridsX   = treeedge->route.gridsX;\n        gridsY   = treeedge->route.gridsY;\n        gridsL   = treeedge->route.gridsL;\n        lastX    = wTile * (gridsX[0] + 0.5) + xcorner;\n        lastY    = hTile * (gridsY[0] + 0.5) + ycorner;\n        lastL    = gridsL[0];\n        
for (i = 1; i <= routeLen; i++) {\n          xreal = wTile * (gridsX[i] + 0.5) + xcorner;\n          yreal = hTile * (gridsY[i] + 0.5) + ycorner;\n\n          fprintf(fp, \"(%d,%d,%d)-(%d,%d,%d)\\n\", lastX, lastY, lastL + 1, xreal,\n                  yreal, gridsL[i] + 1);\n          lastX = xreal;\n          lastY = yreal;\n          lastL = gridsL[i];\n        }\n      }\n    }\n    fprintf(fp, \"!\\n\");\n  }\n  fclose(fp);\n}\n\nstatic int compareTEL(const void* a, const void* b) {\n  if (((OrderTree*)a)->xmin < ((OrderTree*)b)->xmin)\n    return 1;\n  else if (((OrderTree*)a)->xmin == ((OrderTree*)b)->xmin)\n    return 0;\n  else\n    return -1;\n}\n\nvoid StNetOrder() {\n  short *gridsX, *gridsY;\n  int i, j, d, ind, grid, min_x, min_y;\n  TreeEdge *treeedges, *treeedge;\n  StTree* stree;\n\n  numTreeedges = 0;\n\n  if (treeOrderCong != NULL) {\n    free(treeOrderCong);\n  }\n\n  treeOrderCong = (OrderTree*)malloc(numValidNets * sizeof(OrderTree));\n\n  i = 0;\n  for (j = 0; j < numValidNets; j++) {\n    stree                      = &(sttrees[j]);\n    d                          = stree->deg;\n    treeOrderCong[j].xmin      = 0;\n    treeOrderCong[j].treeIndex = j;\n    for (ind = 0; ind < 2 * d - 3; ind++) {\n      treeedges = stree->edges;\n      treeedge  = &(treeedges[ind]);\n\n      gridsX = treeedge->route.gridsX;\n      gridsY = treeedge->route.gridsY;\n      for (i = 0; i < treeedge->route.routelen; i++) {\n        if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n        {\n          min_y = min(gridsY[i], gridsY[i + 1]);\n          grid  = min_y * xGrid + gridsX[i];\n          treeOrderCong[j].xmin +=\n              max(0, v_edges[grid].usage - v_edges[grid].cap);\n        } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n        {\n          min_x = min(gridsX[i], gridsX[i + 1]);\n          grid  = gridsY[i] * (xGrid - 1) + min_x;\n          treeOrderCong[j].xmin +=\n              max(0, h_edges[grid].usage - h_edges[grid].cap);\n       
 }\n      }\n    }\n  }\n\n  qsort(treeOrderCong, numValidNets, sizeof(OrderTree), compareTEL);\n}\n\nvoid recoverEdge(int netID, int edgeID) {\n  short *gridsX, *gridsY, *gridsL;\n  int i, grid, ymin, xmin, n1a, n2a;\n  int connectionCNT, routeLen;\n  TreeEdge *treeedges, *treeedge;\n  TreeNode* treenodes;\n\n  treeedges = sttrees[netID].edges;\n  treeedge  = &(treeedges[edgeID]);\n\n  routeLen = treeedge->route.routelen;\n\n  if (treeedge->len == 0) {\n    printf(\"trying to recover an 0 length edge\\n\");\n    exit(0);\n  }\n\n  treenodes = sttrees[netID].nodes;\n\n  gridsX = treeedge->route.gridsX;\n  gridsY = treeedge->route.gridsY;\n  gridsL = treeedge->route.gridsL;\n\n  n1a = treeedge->n1a;\n  n2a = treeedge->n2a;\n\n  connectionCNT                         = treenodes[n1a].conCNT;\n  treenodes[n1a].heights[connectionCNT] = gridsL[0];\n  treenodes[n1a].eID[connectionCNT]     = edgeID;\n  treenodes[n1a].conCNT++;\n\n  if (gridsL[0] > treenodes[n1a].topL) {\n    treenodes[n1a].hID  = edgeID;\n    treenodes[n1a].topL = gridsL[0];\n  }\n  if (gridsL[0] < treenodes[n1a].botL) {\n    treenodes[n1a].lID  = edgeID;\n    treenodes[n1a].botL = gridsL[0];\n  }\n\n  treenodes[n1a].assigned = TRUE;\n\n  connectionCNT                         = treenodes[n2a].conCNT;\n  treenodes[n2a].heights[connectionCNT] = gridsL[routeLen];\n  treenodes[n2a].eID[connectionCNT]     = edgeID;\n  treenodes[n2a].conCNT++;\n  if (gridsL[routeLen] > treenodes[n2a].topL) {\n    treenodes[n2a].hID  = edgeID;\n    treenodes[n2a].topL = gridsL[routeLen];\n  }\n  if (gridsL[routeLen] < treenodes[n2a].botL) {\n    treenodes[n2a].lID  = edgeID;\n    treenodes[n2a].botL = gridsL[routeLen];\n  }\n\n  treenodes[n2a].assigned = TRUE;\n\n  for (i = 0; i < treeedge->route.routelen; i++) {\n    if (gridsL[i] == gridsL[i + 1]) {\n      if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n      {\n        ymin = min(gridsY[i], gridsY[i + 1]);\n        grid = gridsL[i] * gridV + ymin * xGrid + gridsX[i];\n     
   v_edges3D[grid].usage += 1;\n      } else if (gridsY[i] == gridsY[i + 1]) // a horizontal edge\n      {\n        xmin = min(gridsX[i], gridsX[i + 1]);\n        grid = gridsL[i] * gridH + gridsY[i] * (xGrid - 1) + xmin;\n        h_edges3D[grid].usage += 1;\n      }\n    }\n  }\n}\n\nvoid checkUsage() {\n  short *gridsX, *gridsY;\n  int netID, i, k, edgeID, deg;\n  int j, cnt;\n  Bool redsus;\n  TreeEdge *treeedges, *treeedge;\n  TreeEdge edge;\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    treeedges = sttrees[netID].edges;\n    deg       = sttrees[netID].deg;\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n      edge     = sttrees[netID].edges[edgeID];\n      treeedge = &(treeedges[edgeID]);\n      if (treeedge->len > 0) {\n\n        gridsX = treeedge->route.gridsX;\n        gridsY = treeedge->route.gridsY;\n\n        redsus = TRUE;\n\n        while (redsus) {\n          redsus = FALSE;\n\n          for (i = 0; i <= treeedge->route.routelen; i++) {\n            for (j = 0; j < i; j++) {\n              if (gridsX[i] == gridsX[j] &&\n                  gridsY[i] == gridsY[j]) // a vertical edge\n              {\n                cnt = 1;\n                for (k = i + 1; k <= treeedge->route.routelen; k++) {\n                  gridsX[j + cnt] = gridsX[k];\n                  gridsY[j + cnt] = gridsY[k];\n                  cnt++;\n                }\n                treeedge->route.routelen -= i - j;\n                redsus = TRUE;\n                i      = 0;\n                j      = 0;\n                printf(\"redundant edge component discovered\\n\");\n              }\n            }\n          }\n        }\n      }\n    }\n  }\n  printf(\"usage checked\\n\");\n}\n\nstatic int compareEdgeLen(const void* a, const void* b) {\n  if (((OrderNetEdge*)a)->length < ((OrderNetEdge*)b)->length)\n    return 1;\n  else if (((OrderNetEdge*)a)->length == ((OrderNetEdge*)b)->length)\n    return 0;\n  else\n    return -1;\n}\n\nvoid netedgeOrderDec(int netID, 
OrderNetEdge* netEO) {\n  int j, d, numTreeedges;\n\n  d            = sttrees[netID].deg;\n  numTreeedges = 2 * d - 3;\n\n  for (j = 0; j < numTreeedges; j++) {\n    netEO[j].length = sttrees[netID].edges[j].route.routelen;\n    netEO[j].edgeID = j;\n  }\n\n  qsort(netEO, numTreeedges, sizeof(OrderNetEdge), compareEdgeLen);\n}\n\nvoid printEdge2D(int netID, int edgeID) {\n  int i;\n  TreeEdge edge;\n  TreeNode* nodes;\n\n  edge  = sttrees[netID].edges[edgeID];\n  nodes = sttrees[netID].nodes;\n\n  printf(\"edge %d: n1 %d (%d, %d)-> n2 %d(%d, %d), routeType %d\\n\", edgeID,\n         edge.n1, nodes[edge.n1].x, nodes[edge.n1].y, edge.n2, nodes[edge.n2].x,\n         nodes[edge.n2].y, edge.route.type);\n  if (edge.len > 0) {\n    for (i = 0; i <= edge.route.routelen; i++) {\n      printf(\"(%d, %d) \", edge.route.gridsX[i], edge.route.gridsY[i]);\n    }\n    printf(\"\\n\");\n  }\n}\n\nvoid printTree2D(int netID) {\n  int edgeID, nodeID;\n  for (nodeID = 0; nodeID < 2 * sttrees[netID].deg - 2; nodeID++) {\n    printf(\"nodeID %d,  [%d, %d]\\n\", nodeID, sttrees[netID].nodes[nodeID].y,\n           sttrees[netID].nodes[nodeID].x);\n  }\n\n  for (edgeID = 0; edgeID < 2 * sttrees[netID].deg - 3; edgeID++) {\n    printEdge2D(netID, edgeID);\n  }\n}\n\nBool checkRoute2DTree(int netID) {\n  Bool STHwrong, gridFlag;\n  short *gridsX, *gridsY;\n  int i, edgeID, edgelength;\n  int n1, n2, x1, y1, x2, y2;\n  int distance;\n  TreeEdge* treeedge;\n  TreeNode* treenodes;\n\n  STHwrong = FALSE;\n\n  treenodes = sttrees[netID].nodes;\n  // if(netID == 2nnn/b52163) return false;\n  for (edgeID = 0; edgeID < 2 * sttrees[netID].deg - 3; edgeID++) {\n    treeedge   = &(sttrees[netID].edges[edgeID]);\n    edgelength = treeedge->route.routelen;\n    n1         = treeedge->n1;\n    n2         = treeedge->n2;\n    x1         = treenodes[n1].x;\n    y1         = treenodes[n1].y;\n    x2         = treenodes[n2].x;\n    y2         = treenodes[n2].y;\n    gridsX     = treeedge->route.gridsX;\n    
gridsY     = treeedge->route.gridsY;\n\n    gridFlag = FALSE;\n\n    if (treeedge->len < 0) {\n      printf(\"rip upped edge without edge len re assignment\\n\");\n      STHwrong = TRUE;\n    }\n\n    if (treeedge->len > 0) {\n\n      if (treeedge->route.routelen < 1) {\n        // printf(\"%d %d .routelen %d len  %d\\n\",netID, edgeID,\n        // treeedge->route.routelen, treeedge->len);\n        STHwrong = TRUE;\n        printf(\"checking failed %d roulen = 0\\n\", netID);\n        return (TRUE);\n      }\n      // if(netID == 252163 && edgeID == 10)\n      //\tprintf(\"checking src: %d %d gridstart: %d %d dst: %d %d gridend: %d\n      //%d\\n\", y1, x1, gridsY[0],gridsX[0], y2, x2,\n      // gridsY[edgelength],gridsX[edgelength]);\n      if (gridsX[0] != x1 || gridsY[0] != y1) {\n        printf(\"%d %d initial grid wrong y1 x1 [%d %d] , net start [%d %d] \"\n               \"routelen %d\\n \",\n               netID, edgeID, y1, x1, gridsY[0], gridsX[0],\n               treeedge->route.routelen);\n        STHwrong = TRUE;\n      }\n      if (gridsX[edgelength] != x2 || gridsY[edgelength] != y2) {\n        printf(\"%d %d end grid wrong y2 x2 [%d %d] , net start [%d %d] \"\n               \"routelen %d\\n \",\n               netID, edgeID, y2, x2, gridsY[edgelength], gridsX[edgelength],\n               treeedge->route.routelen);\n        STHwrong = TRUE;\n      }\n      for (i = 0; i < treeedge->route.routelen; i++) {\n\n        distance =\n            ADIFF(gridsX[i + 1], gridsX[i]) + ADIFF(gridsY[i + 1], gridsY[i]);\n        if (distance != 1) {\n          printf(\"net[%d] edge[%d] maze route wrong, distance %d, i %d\\n\",\n                 netID, edgeID, distance, i);\n          gridFlag = TRUE;\n          STHwrong = TRUE;\n        }\n      }\n\n      if (gridFlag) {\n        printEdge2D(netID, edgeID);\n      }\n      if (STHwrong) {\n        printf(\"checking failed %d STHwrong\\n\", netID);\n        return (TRUE);\n      }\n    }\n  }\n\n  return 
(STHwrong);\n}\n\nvoid writeRoute3D(const char* routingfile3D) {\n  short *gridsX, *gridsY, *gridsL;\n  int netID, i, edgeID, deg, routeLen;\n  TreeEdge *treeedges, *treeedge;\n  FILE* fp;\n  TreeEdge edge;\n\n  fp = fopen(routingfile3D, \"w\");\n  if (fp == NULL) {\n    printf(\"Error in opening %s\\n\", routingfile3D);\n    exit(1);\n  }\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    fprintf(fp, \"%s\\n\", nets[netID]->name);\n    fprintf(fp, \"(\\n\");\n    treeedges = sttrees[netID].edges;\n    deg       = sttrees[netID].deg;\n\n    for (edgeID = 0; edgeID < 2 * deg - 3; edgeID++) {\n      edge     = sttrees[netID].edges[edgeID];\n      treeedge = &(treeedges[edgeID]);\n      if (treeedge->len > 0) {\n\n        routeLen = treeedge->route.routelen;\n        gridsX   = treeedge->route.gridsX;\n        gridsY   = treeedge->route.gridsY;\n        gridsL   = treeedge->route.gridsL;\n        /*lastX = wTile*(gridsX[0]+0.5)+xcorner;\n        lastY = hTile*(gridsY[0]+0.5)+ycorner;\n        lastL = gridsL[0];\n        for (i = 1; i <= routeLen; i ++) {\n            xreal = wTile*(gridsX[i]+0.5)+xcorner;\n            yreal = hTile*(gridsY[i]+0.5)+ycorner;\n\n                fprintf(fp, \"(%d,%d,%d)-(%d,%d,%d)\\n\",\n        lastX,lastY,lastL+1,xreal,yreal,gridsL[i]+1); lastX = xreal; lastY =\n        yreal; lastL = gridsL[i];\n        }*/\n        for (i = 0; i <= routeLen; i++) {\n          int llx   = wTile * gridsX[i] + xcorner;\n          int lly   = hTile * gridsY[i] + ycorner;\n          int urx   = wTile * (gridsX[i] + 1) + xcorner;\n          int ury   = hTile * (gridsY[i] + 1) + ycorner;\n          int layer = gridsL[i] + 1;\n\n          fprintf(fp, \"%d %d %d %d Metal%d\\n\", llx, lly, urx, ury, layer);\n        }\n      }\n    }\n    fprintf(fp, \")\\n\");\n  }\n\n  for (netID = 0; netID < numInvalidNets; netID++) {\n    fprintf(fp, \"%s\\n\", invalid_nets[netID]->name);\n    fprintf(fp, \"(\\n\");\n\n    int llx   = wTile * 
invalid_nets[netID]->pinX[0] + xcorner;\n    int lly   = hTile * invalid_nets[netID]->pinY[0] + ycorner;\n    int urx   = wTile * (invalid_nets[netID]->pinX[0] + 1) + xcorner;\n    int ury   = hTile * (invalid_nets[netID]->pinY[0] + 1) + ycorner;\n    int layer = invalid_nets[netID]->pinL[0];\n\n    fprintf(fp, \"%d %d %d %d Metal%d\\n\", llx, lly, urx, ury, layer);\n    fprintf(fp, \"%d %d %d %d Metal%d\\n\", llx, lly, urx, ury, layer + 1);\n\n    fprintf(fp, \")\\n\");\n  }\n\n  fclose(fp);\n}\n\nfloat* pH;\nfloat* pV;\nstruct BBox* netBox;\nstruct BBox** pnetBox;\n\nstruct TD {\n  int id;\n  float cost;\n};\n\nstruct BBox {\n  int xmin;\n  int ymin;\n  int xmax;\n  int ymax;\n  int hSpan;\n  int vSpan;\n}; // lower_left corner and upper_right corner\n\nstruct wire {\n  int x1, y1, x2, y2;\n  int netID;\n};\n\n/*static int ordercost(const void *a,  const void *b)\n{\n    struct TD *pa, *pb;\n\n    pa = *(struct TD**)a;\n    pb = *(struct TD**)b;\n\n    if (pa->cost < pb->cost) return 1;\n    if (pa->cost > pb->cost) return -1;\n    return 0;\n   // return ((struct Segment*)a->x1-(struct Segment*)b->x1);\n}//decreasing order\n\nstatic int ordervSpan(const void *a,  const void *b)\n{\n    struct BBox *pa, *pb;\n\n    pa = *(struct BBox**)a;\n    pb = *(struct BBox**)b;\n\n    if (pa->vSpan < pb->vSpan) return -1;\n    if (pa->vSpan > pb->vSpan) return 1;\n    return 0;\n   // return ((struct Segment*)a->x1-(struct Segment*)b->x1);\n}\n\nstatic int orderhSpan(const void *a,  const void *b)\n{\n    struct BBox *pa, *pb;\n\n    pa = *(struct BBox**)a;\n    pb = *(struct BBox**)b;\n\n    if (pa->hSpan < pb->hSpan) return -1;\n    if (pa->hSpan > pb->hSpan) return 1;\n    return 0;\n   // return ((struct Segment*)a->x1-(struct Segment*)b->x1);\n}*/\n\n// binary search to map the new coordinates to original coordinates\n\n// Copy Routing Solution for the best routing solution so far\nvoid copyRS(void) {\n  int i, j, netID, edgeID, numEdges, numNodes;\n\n  if (sttreesBK 
!= NULL) {\n    for (netID = 0; netID < numValidNets; netID++) {\n\n      numEdges = 2 * sttreesBK[netID].deg - 3;\n      for (edgeID = 0; edgeID < numEdges; edgeID++) {\n        if (sttreesBK[netID].edges[edgeID].len > 0) {\n          free(sttreesBK[netID].edges[edgeID].route.gridsX);\n          free(sttreesBK[netID].edges[edgeID].route.gridsY);\n        }\n      }\n      free(sttreesBK[netID].nodes);\n      free(sttreesBK[netID].edges);\n    }\n    free(sttreesBK);\n  }\n\n  sttreesBK = (StTree*)malloc(numValidNets * sizeof(StTree));\n\n  for (netID = 0; netID < numValidNets; netID++) {\n    numNodes = 2 * sttrees[netID].deg - 2;\n    numEdges = 2 * sttrees[netID].deg - 3;\n\n    sttreesBK[netID].nodes = (TreeNode*)malloc(numNodes * sizeof(TreeNode));\n\n    for (i = 0; i < numNodes; i++) {\n      sttreesBK[netID].nodes[i].x = sttrees[netID].nodes[i].x;\n      sttreesBK[netID].nodes[i].y = sttrees[netID].nodes[i].y;\n      for (j = 0; j < 3; j++) {\n        sttreesBK[netID].nodes[i].nbr[j]  = sttrees[netID].nodes[i].nbr[j];\n        sttreesBK[netID].nodes[i].edge[j] = sttrees[netID].nodes[i].edge[j];\n      }\n    }\n    sttreesBK[netID].deg = sttrees[netID].deg;\n\n    sttreesBK[netID].edges = (TreeEdge*)malloc(numEdges * sizeof(TreeEdge));\n\n    for (edgeID = 0; edgeID < numEdges; edgeID++) {\n      sttreesBK[netID].edges[edgeID].len = sttrees[netID].edges[edgeID].len;\n      sttreesBK[netID].edges[edgeID].n1  = sttrees[netID].edges[edgeID].n1;\n      sttreesBK[netID].edges[edgeID].n2  = sttrees[netID].edges[edgeID].n2;\n\n      if (sttrees[netID].edges[edgeID].len >\n          0) // only route the non-degraded edges (len>0)\n      {\n        sttreesBK[netID].edges[edgeID].route.routelen =\n            sttrees[netID].edges[edgeID].route.routelen;\n        sttreesBK[netID].edges[edgeID].route.gridsX = (short*)calloc(\n            (sttrees[netID].edges[edgeID].route.routelen + 1), sizeof(short));\n        sttreesBK[netID].edges[edgeID].route.gridsY = 
(short*)calloc(\n            (sttrees[netID].edges[edgeID].route.routelen + 1), sizeof(short));\n\n        for (i = 0; i <= sttrees[netID].edges[edgeID].route.routelen; i++) {\n          sttreesBK[netID].edges[edgeID].route.gridsX[i] =\n              sttrees[netID].edges[edgeID].route.gridsX[i];\n          sttreesBK[netID].edges[edgeID].route.gridsY[i] =\n              sttrees[netID].edges[edgeID].route.gridsY[i];\n        }\n      }\n    }\n  }\n}\n\nvoid copyBR() {\n  short *gridsX, *gridsY;\n  int i, j, netID, edgeID, numEdges, numNodes, grid, min_y, min_x;\n\n  if (sttreesBK != NULL) {\n\n    printf(\"copy BR working\\n\");\n\n    for (netID = 0; netID < numValidNets; netID++) {\n      numEdges = 2 * sttrees[netID].deg - 3;\n      for (edgeID = 0; edgeID < numEdges; edgeID++) {\n        if (sttrees[netID].edges[edgeID].len > 0) {\n          free(sttrees[netID].edges[edgeID].route.gridsX);\n          free(sttrees[netID].edges[edgeID].route.gridsY);\n        }\n      }\n      free(sttrees[netID].nodes);\n      free(sttrees[netID].edges);\n    }\n    free(sttrees);\n\n    sttrees = (StTree*)malloc(numValidNets * sizeof(StTree));\n\n    for (netID = 0; netID < numValidNets; netID++) {\n      numNodes = 2 * sttreesBK[netID].deg - 2;\n      numEdges = 2 * sttreesBK[netID].deg - 3;\n\n      sttrees[netID].nodes = (TreeNode*)malloc(numNodes * sizeof(TreeNode));\n\n      for (i = 0; i < numNodes; i++) {\n        sttrees[netID].nodes[i].x = sttreesBK[netID].nodes[i].x;\n        sttrees[netID].nodes[i].y = sttreesBK[netID].nodes[i].y;\n        for (j = 0; j < 3; j++) {\n          sttrees[netID].nodes[i].nbr[j]  = sttreesBK[netID].nodes[i].nbr[j];\n          sttrees[netID].nodes[i].edge[j] = sttreesBK[netID].nodes[i].edge[j];\n        }\n      }\n\n      sttrees[netID].edges = (TreeEdge*)malloc(numEdges * sizeof(TreeEdge));\n\n      sttrees[netID].deg = sttreesBK[netID].deg;\n\n      for (edgeID = 0; edgeID < numEdges; edgeID++) {\n        sttrees[netID].edges[edgeID].len 
= sttreesBK[netID].edges[edgeID].len;\n        sttrees[netID].edges[edgeID].n1  = sttreesBK[netID].edges[edgeID].n1;\n        sttrees[netID].edges[edgeID].n2  = sttreesBK[netID].edges[edgeID].n2;\n\n        sttrees[netID].edges[edgeID].route.type = MAZEROUTE;\n        sttrees[netID].edges[edgeID].route.routelen =\n            sttreesBK[netID].edges[edgeID].route.routelen;\n\n        if (sttreesBK[netID].edges[edgeID].len >\n            0) // only route the non-degraded edges (len>0)\n        {\n          sttrees[netID].edges[edgeID].route.type = MAZEROUTE;\n          sttrees[netID].edges[edgeID].route.routelen =\n              sttreesBK[netID].edges[edgeID].route.routelen;\n          sttrees[netID].edges[edgeID].route.gridsX = (short*)calloc(\n              (sttreesBK[netID].edges[edgeID].route.routelen + 1),\n              sizeof(short));\n          sttrees[netID].edges[edgeID].route.gridsY = (short*)calloc(\n              (sttreesBK[netID].edges[edgeID].route.routelen + 1),\n              sizeof(short));\n\n          for (i = 0; i <= sttreesBK[netID].edges[edgeID].route.routelen; i++) {\n            sttrees[netID].edges[edgeID].route.gridsX[i] =\n                sttreesBK[netID].edges[edgeID].route.gridsX[i];\n            sttrees[netID].edges[edgeID].route.gridsY[i] =\n                sttreesBK[netID].edges[edgeID].route.gridsY[i];\n            // printf(\"x %d y %d\n            // \",sttrees[netID].edges[edgeID].route.gridsX[i],sttrees[netID].edges[edgeID].route.gridsY[i]);\n          }\n          // printf(\"\\n\");\n        }\n      }\n    }\n    /*\n            for(netID=0; netID<numValidNets; netID++) {\n                numEdges = 2 * sttreesBK[netID].deg -3;\n                treeedges = sttreesBK[netID].edges;\n                for(edgeID=0; edgeID<numEdges; edgeID++)\n                {\n                    if(sttrees[netID].edges[edgeID].len>0)\n                    {\n                        free (sttreesBK[netID].edges[edgeID].route.gridsX);\n              
          free (sttreesBK[netID].edges[edgeID].route.gridsY);\n                    }\n                }\n                free(sttreesBK[netID].nodes);\n                free(sttreesBK[netID].edges);\n            }\n            free(sttreesBK); */\n\n    for (i = 0; i < yGrid; i++) {\n      for (j = 0; j < xGrid - 1; j++) {\n\n        grid                = i * (xGrid - 1) + j;\n        h_edges[grid].usage = 0;\n      }\n    }\n    for (i = 0; i < yGrid - 1; i++) {\n      for (j = 0; j < xGrid; j++) {\n\n        grid                = i * xGrid + j;\n        v_edges[grid].usage = 0;\n      }\n    }\n    for (netID = 0; netID < numValidNets; netID++) {\n      numEdges = 2 * sttrees[netID].deg - 3;\n      for (edgeID = 0; edgeID < numEdges; edgeID++) {\n        if (sttrees[netID].edges[edgeID].len > 0) {\n          gridsX = sttrees[netID].edges[edgeID].route.gridsX;\n          gridsY = sttrees[netID].edges[edgeID].route.gridsY;\n          for (i = 0; i < sttrees[netID].edges[edgeID].route.routelen; i++) {\n            if (gridsX[i] == gridsX[i + 1]) // a vertical edge\n            {\n              min_y = min(gridsY[i], gridsY[i + 1]);\n              v_edges[min_y * xGrid + gridsX[i]].usage += 1;\n            } else /// if(gridsY[i]==gridsY[i+1])// a horizontal edge\n            {\n              min_x = min(gridsX[i], gridsX[i + 1]);\n              h_edges[gridsY[i] * (xGrid - 1) + min_x].usage += 1;\n            }\n          }\n        }\n      }\n    }\n  }\n}\n\nvoid freeRR(void) {\n  int netID, edgeID, numEdges;\n\n  if (sttreesBK != NULL) {\n    for (netID = 0; netID < numValidNets; netID++) {\n\n      numEdges = 2 * sttreesBK[netID].deg - 3;\n      for (edgeID = 0; edgeID < numEdges; edgeID++) {\n        if (sttreesBK[netID].edges[edgeID].len > 0) {\n          free(sttreesBK[netID].edges[edgeID].route.gridsX);\n          free(sttreesBK[netID].edges[edgeID].route.gridsY);\n        }\n      }\n      free(sttreesBK[netID].nodes);\n      free(sttreesBK[netID].edges);\n 
   }\n    free(sttreesBK);\n  }\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/libdistbench/CMakeLists.txt",
    "content": "add_library(distbench STATIC src/Start.cpp src/Input.cpp src/Output.cpp)\ntarget_include_directories(distbench PUBLIC\n  \"${CMAKE_CURRENT_SOURCE_DIR}/include\"\n)\n\ntarget_link_libraries(distbench Galois::cusp Galois::gluon LLVMSupport)\n"
  },
  {
    "path": "lonestar/libdistbench/include/DistBench/Input.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file Reader.h\n *\n * Contains definitions for the common distributed graph loading functionality\n * of benchmark applications.\n */\n#ifndef GALOIS_DISTBENCH_INPUT_H\n#define GALOIS_DISTBENCH_INPUT_H\n\n#include \"galois/graphs/CuSPPartitioner.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n/*******************************************************************************\n * Supported partitioning schemes\n ******************************************************************************/\n\n//! 
enums of partitioning schemes supported\nenum PARTITIONING_SCHEME {\n  OEC,           //!< outgoing edge cut\n  IEC,           //!< incoming edge cut\n  HOVC,          //!< outgoing hybrid vertex cut\n  HIVC,          //!< incoming hybrid vertex cut\n  CART_VCUT,     //!< cartesian vertex cut\n  CART_VCUT_IEC, //!< cartesian vertex cut using iec\n  // CEC,                   //!< custom edge cut\n  GINGER_O, //!< Ginger, outgoing\n  GINGER_I, //!< Ginger, incoming\n  FENNEL_O, //!< Fennel, oec\n  FENNEL_I, //!< Fennel, iec\n  SUGAR_O   //!< Sugar, oec\n};\n\n/**\n * Turns a PARTITIONING_SCHEME enum to a string\n *\n * @param e partitioning scheme enum\n * @return string version of e\n */\ninline const char* EnumToString(PARTITIONING_SCHEME e) {\n  switch (e) {\n  case OEC:\n    return \"oec\";\n  case IEC:\n    return \"iec\";\n  case HOVC:\n    return \"hovc\";\n  case HIVC:\n    return \"hivc\";\n  case CART_VCUT:\n    return \"cvc\";\n  case CART_VCUT_IEC:\n    return \"cvc_iec\";\n  // case CEC:\n  //  return \"cec\";\n  case GINGER_O:\n    return \"ginger-oec\";\n  case GINGER_I:\n    return \"ginger-iec\";\n  case FENNEL_O:\n    return \"fennel-oec\";\n  case FENNEL_I:\n    return \"fennel-iec\";\n  case SUGAR_O:\n    return \"sugar-oec\";\n  default:\n    GALOIS_DIE(\"unsupported partition scheme: \", e);\n  }\n}\n\n/*******************************************************************************\n * Graph-loading-related command line arguments\n ******************************************************************************/\nnamespace cll = llvm::cl;\n\n//! input graph file\nextern cll::opt<std::string> inputFile;\n//! input graph file, but transposed\nextern cll::opt<std::string> inputFileTranspose;\n//! symmetric input graph file\nextern cll::opt<bool> symmetricGraph;\n//! partitioning scheme to use\nextern cll::opt<PARTITIONING_SCHEME> partitionScheme;\n////! 
path to vertex id map for custom edge cut\n// extern cll::opt<std::string> vertexIDMapFileName;\n//! true if you want to read graph structure from a file\nextern cll::opt<bool> readFromFile;\n//! path to local graph structure to read\nextern cll::opt<std::string> localGraphFileName;\n//! if true, the local graph structure will be saved to disk after partitioning\nextern cll::opt<bool> saveLocalGraph;\n//! file specifying blocking of masters\nextern cll::opt<std::string> mastersFile;\n\n// @todo command line argument for read balancing across hosts\n\n/*******************************************************************************\n * Graph-loading functions\n ******************************************************************************/\n\ntemplate <typename NodeData, typename EdgeData>\nusing DistGraphPtr =\n    std::unique_ptr<galois::graphs::DistGraph<NodeData, EdgeData>>;\n\n/**\n * Loads a symmetric graph file (i.e. directed graph with edges in both\n * directions)\n *\n * @tparam NodeData node data to store in graph\n * @tparam EdgeData edge data to store in graph\n * @param scaleFactor How to split nodes among hosts\n * @returns a pointer to a newly allocated DistGraph based on the command line\n * loaded based on command line arguments\n */\ntemplate <typename NodeData, typename EdgeData>\nDistGraphPtr<NodeData, EdgeData>\nconstructSymmetricGraph(std::vector<unsigned>& GALOIS_UNUSED(scaleFactor)) {\n  if (!symmetricGraph) {\n    GALOIS_DIE(\"application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph\");\n  }\n\n  switch (partitionScheme) {\n  case OEC:\n  case IEC:\n    return galois::cuspPartitionGraph<NoCommunication, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose,\n        mastersFile);\n  case HOVC:\n  case HIVC:\n    return galois::cuspPartitionGraph<GenericHVC, NodeData, EdgeData>(\n       
 inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true,\n        inputFileTranspose);\n\n  case CART_VCUT:\n  case CART_VCUT_IEC:\n    return galois::cuspPartitionGraph<GenericCVC, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true,\n        inputFileTranspose);\n\n    // case CEC:\n    //  return new Graph_customEdgeCut(inputFile, \"\", net.ID, net.Num,\n    //                                 scaleFactor, vertexIDMapFileName, false);\n\n  case GINGER_O:\n  case GINGER_I:\n    return galois::cuspPartitionGraph<GingerP, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true,\n        inputFileTranspose);\n\n  case FENNEL_O:\n  case FENNEL_I:\n    return galois::cuspPartitionGraph<FennelP, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true,\n        inputFileTranspose);\n\n  case SUGAR_O:\n    return galois::cuspPartitionGraph<SugarP, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true,\n        inputFileTranspose);\n  default:\n    GALOIS_DIE(\"partition scheme specified is invalid: \", partitionScheme);\n    return DistGraphPtr<NodeData, EdgeData>(nullptr);\n  }\n}\n\n/**\n * Loads a graph file with the purpose of iterating over the out edges\n * of the graph.\n *\n * @tparam NodeData node data to store in graph\n * @tparam EdgeData edge data to store in graph\n * @tparam iterateOut says if you want to iterate over out edges or not; if\n * false, will iterate over in edges\n * @tparam enable_if this function will only be enabled if iterateOut is true\n * @param scaleFactor How to split nodes among hosts\n * @returns a pointer to a newly allocated DistGraph loaded based on the\n * command line arguments\n */\ntemplate <typename NodeData, typename EdgeData, bool iterateOut = true,\n          typename std::enable_if<iterateOut>::type* = nullptr>\nDistGraphPtr<NodeData, EdgeData>\nconstructGraph(std::vector<unsigned>& 
GALOIS_UNUSED(scaleFactor)) {\n  // 1 host = no concept of cut; just load from edgeCut, no transpose\n  auto& net = galois::runtime::getSystemNetworkInterface();\n  if (net.Num == 1) {\n    return galois::cuspPartitionGraph<NoCommunication, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false,\n        inputFileTranspose);\n  }\n\n  switch (partitionScheme) {\n  case OEC:\n    return galois::cuspPartitionGraph<NoCommunication, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false,\n        inputFileTranspose, mastersFile);\n  case IEC:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<NoCommunication, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false,\n          inputFileTranspose, mastersFile);\n    } else {\n      GALOIS_DIE(\"incoming edge cut requires transpose graph\");\n      break;\n    }\n\n  case HOVC:\n    return galois::cuspPartitionGraph<GenericHVC, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false,\n        inputFileTranspose);\n  case HIVC:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<GenericHVC, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false,\n          inputFileTranspose);\n    } else {\n      GALOIS_DIE(\"incoming hybrid cut requires transpose graph\");\n      break;\n    }\n\n  case CART_VCUT:\n    return galois::cuspPartitionGraph<GenericCVC, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false,\n        inputFileTranspose);\n\n  case CART_VCUT_IEC:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<GenericCVC, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false,\n          inputFileTranspose);\n    } else {\n      GALOIS_DIE(\"cvc incoming cut requires transpose graph\");\n      break;\n    }\n\n    // case CEC:\n    //  return new 
Graph_customEdgeCut(inputFile, \"\", net.ID, net.Num,\n    //                                 scaleFactor, vertexIDMapFileName, false);\n\n  case GINGER_O:\n    return galois::cuspPartitionGraph<GingerP, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false,\n        inputFileTranspose);\n  case GINGER_I:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<GingerP, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false,\n          inputFileTranspose);\n    } else {\n      GALOIS_DIE(\"Ginger requires transpose graph\");\n      break;\n    }\n\n  case FENNEL_O:\n    return galois::cuspPartitionGraph<FennelP, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false,\n        inputFileTranspose);\n  case FENNEL_I:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<FennelP, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false,\n          inputFileTranspose);\n    } else {\n      GALOIS_DIE(\"Fennel requires transpose graph\");\n      break;\n    }\n\n  case SUGAR_O:\n    return galois::cuspPartitionGraph<SugarP, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false,\n        inputFileTranspose);\n\n  default:\n    GALOIS_DIE(\"partition scheme specified is invalid: \", partitionScheme);\n    return DistGraphPtr<NodeData, EdgeData>(nullptr);\n  }\n}\n\n/**\n * Loads a graph file with the purpose of iterating over the in edges\n * of the graph.\n *\n * @tparam NodeData node data to store in graph\n * @tparam EdgeData edge data to store in graph\n * @tparam iterateOut says if you want to iterate over out edges or not; if\n * false, will iterate over in edges\n * @tparam enable_if this function  will only be enabled if iterateOut is false\n * (i.e. 
iterate over in-edges)\n * @param scaleFactor How to split nodes among hosts\n * @returns a pointer to a newly allocated DistGraph loaded based on the\n * command line arguments\n */\ntemplate <typename NodeData, typename EdgeData, bool iterateOut = true,\n          typename std::enable_if<!iterateOut>::type* = nullptr>\nDistGraphPtr<NodeData, EdgeData> constructGraph(std::vector<unsigned>&) {\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  // 1 host = no concept of cut; just load from edgeCut\n  if (net.Num == 1) {\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<NoCommunication, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false,\n          inputFileTranspose);\n    } else {\n      fprintf(stderr, \"WARNING: Loading transpose graph through in-memory \"\n                      \"transpose to iterate over in-edges: pass in transpose \"\n                      \"graph with -graphTranspose to avoid unnecessary \"\n                      \"overhead.\\n\");\n      return galois::cuspPartitionGraph<NoCommunication, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false,\n          inputFileTranspose);\n    }\n  }\n\n  switch (partitionScheme) {\n  case OEC:\n    return galois::cuspPartitionGraph<NoCommunication, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false,\n        inputFileTranspose, mastersFile);\n  case IEC:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<NoCommunication, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false,\n          inputFileTranspose, mastersFile);\n    } else {\n      GALOIS_DIE(\"iec requires transpose graph\");\n      break;\n    }\n\n  case HOVC:\n    return galois::cuspPartitionGraph<GenericHVC, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false,\n        inputFileTranspose);\n  
case HIVC:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<GenericHVC, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false,\n          inputFileTranspose);\n    } else {\n      GALOIS_DIE(\"hivc requires transpose graph\");\n      break;\n    }\n\n  case CART_VCUT:\n    return galois::cuspPartitionGraph<GenericCVCColumnFlip, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false,\n        inputFileTranspose);\n  case CART_VCUT_IEC:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<GenericCVCColumnFlip, NodeData,\n                                        EdgeData>(inputFile, galois::CUSP_CSC,\n                                                  galois::CUSP_CSC, false,\n                                                  inputFileTranspose);\n    } else {\n      GALOIS_DIE(\"cvc requires transpose graph\");\n      break;\n    }\n\n  case GINGER_O:\n    return galois::cuspPartitionGraph<GingerP, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false,\n        inputFileTranspose);\n  case GINGER_I:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<GingerP, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false,\n          inputFileTranspose);\n    } else {\n      GALOIS_DIE(\"Ginger requires transpose graph\");\n      break;\n    }\n\n  case FENNEL_O:\n    return galois::cuspPartitionGraph<FennelP, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false,\n        inputFileTranspose);\n  case FENNEL_I:\n    if (inputFileTranspose.size()) {\n      return galois::cuspPartitionGraph<FennelP, NodeData, EdgeData>(\n          inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false,\n          inputFileTranspose);\n    } else {\n      GALOIS_DIE(\"Fennel requires transpose graph\");\n      break;\n    }\n\n  case SUGAR_O:\n    return 
galois::cuspPartitionGraph<SugarColumnFlipP, NodeData, EdgeData>(\n        inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false,\n        inputFileTranspose);\n\n  default:\n    GALOIS_DIE(\"partition scheme specified is invalid: \", partitionScheme);\n    return DistGraphPtr<NodeData, EdgeData>(nullptr);\n  }\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/libdistbench/include/DistBench/MiningStart.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// TODO document + merge with regular bench start? have to figure out how to\n// have both substrates coexist\n\n#ifndef GALOIS_DISTBENCH_MININGSTART_H\n#define GALOIS_DISTBENCH_MININGSTART_H\n\n#include \"DistBench/Input.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/GenericPartitioners.h\"\n#include \"galois/graphs/GluonEdgeSubstrate.h\"\n#include \"galois/graphs/MiningPartitioner.h\"\n#include \"galois/Version.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"galois/cuda/EdgeHostDecls.h\"\n#else\n// dummy struct declaration to allow non-het code to compile without\n// having to include cuda_context_decl\nstruct CUDA_Context;\n#endif\n\n//! 
standard global options to the benchmarks\nnamespace cll = llvm::cl;\n\n// note these come from distbenchstart rather than mining bench\nextern cll::opt<int> numThreads;\nextern cll::opt<int> numRuns;\nextern cll::opt<std::string> statFile;\n//! Set method for metadata sends\nextern cll::opt<DataCommMode> commMetadata;\nextern cll::opt<bool> output;\n\n#ifdef GALOIS_ENABLE_GPU\nenum Personality { CPU, GPU_CUDA };\n\nstd::string personality_str(Personality p);\n\nextern int gpudevice;\nextern Personality personality;\nextern cll::opt<unsigned> scalegpu;\nextern cll::opt<unsigned> scalecpu;\nextern cll::opt<int> num_nodes;\nextern cll::opt<std::string> personality_set;\n#endif\n\n/**\n * Initialize Galois runtime for distributed benchmarks and print/report various\n * information.\n *\n * @param argc argument count\n * @param argv list of arguments\n * @param app Name of the application\n * @param desc Description of the application\n * @param url URL to the application\n */\nvoid DistBenchStart(int argc, char** argv, const char* app,\n                    const char* desc = nullptr, const char* url = nullptr);\n\ntemplate <typename NodeData, typename EdgeData>\nusing MiningGraphPtr = std::unique_ptr<\n    galois::graphs::MiningGraph<NodeData, EdgeData, MiningPolicyDegrees>>;\ntemplate <typename NodeData, typename EdgeData>\nusing MiningSubstratePtr = std::unique_ptr<galois::graphs::GluonEdgeSubstrate<\n    galois::graphs::MiningGraph<NodeData, EdgeData, MiningPolicyDegrees>>>;\n\n#ifdef GALOIS_ENABLE_GPU\n// in internal namespace because this function shouldn't be called elsewhere\nnamespace internal {\nvoid heteroSetup(std::vector<unsigned>& scaleFactor);\n}; // namespace internal\n\n/**\n * Given a loaded graph, marshal it over to the GPU device for use\n * on the GPU.\n *\n * @param GluonEdgeSubstrate Gluon substrate containing info needed to marshal\n * to GPU\n * @param cuda_ctx the CUDA context of the currently running program\n */\ntemplate <typename NodeData, 
typename EdgeData>\nstatic void\nmarshalGPUGraph(MiningSubstratePtr<NodeData, EdgeData>& GluonEdgeSubstrate,\n                struct CUDA_Context** cuda_ctx, bool LoadProxyEdges = true) {\n  auto& net                 = galois::runtime::getSystemNetworkInterface();\n  const unsigned my_host_id = galois::runtime::getHostID();\n\n  galois::StatTimer marshalTimer(\"TIMER_GRAPH_MARSHAL\", \"DistBench\");\n\n  marshalTimer.start();\n\n  if (personality == GPU_CUDA) {\n    *cuda_ctx = get_CUDA_context(my_host_id);\n\n    if (!init_CUDA_context(*cuda_ctx, gpudevice)) {\n      GALOIS_DIE(\"failed to initialize CUDA context\");\n    }\n\n    EdgeMarshalGraph m;\n    (*GluonEdgeSubstrate).getEdgeMarshalGraph(m, LoadProxyEdges);\n    load_graph_CUDA(*cuda_ctx, m, net.Num);\n  }\n  marshalTimer.stop();\n}\n#endif\n\n/**\n */\ntemplate <typename NodeData, typename EdgeData, bool iterateOutEdges = true>\nstatic MiningGraphPtr<NodeData, EdgeData> loadDGraph(bool loadProxyEdges) {\n  using Graph =\n      galois::graphs::MiningGraph<NodeData, EdgeData, MiningPolicyDegrees>;\n  galois::StatTimer dGraphTimer(\"GraphConstructTime\", \"DistBench\");\n\n  dGraphTimer.start();\n  const auto& net = galois::runtime::getSystemNetworkInterface();\n  MiningGraphPtr<NodeData, EdgeData> loadedGraph = std::make_unique<Graph>(\n      inputFile, net.ID, net.Num, loadProxyEdges, loadProxyEdges);\n  assert(loadedGraph != nullptr);\n  dGraphTimer.stop();\n\n  return loadedGraph;\n}\n/**\n * Loads a graph into memory, setting up heterogeneous execution if\n * necessary. 
Unlike the dGraph load functions above, this is meant\n * to be exposed to the user.\n *\n * @tparam NodeData struct specifying what kind of data the node contains\n * @tparam EdgeData type specifying the type of the edge data\n * @tparam iterateOutEdges Boolean specifying if the graph should be iterating\n * over outgoing or incoming edges\n *\n * @param cuda_ctx CUDA context of the currently running program; only matters\n * if using GPU\n *\n * @returns Pointer to the loaded graph and Gluon substrate\n */\ntemplate <typename NodeData, typename EdgeData, bool iterateOutEdges = true>\nstd::pair<MiningGraphPtr<NodeData, EdgeData>,\n          MiningSubstratePtr<NodeData, EdgeData>>\n#ifdef GALOIS_ENABLE_GPU\ndistGraphInitialization(struct CUDA_Context** cuda_ctx,\n#else\ndistGraphInitialization(\n#endif\n                        bool loadProxyEdges = true) {\n  galois::StatTimer initTimer(\"DistGraphInitialization\", \"DistMiningBench\");\n  using Graph =\n      galois::graphs::MiningGraph<NodeData, EdgeData, MiningPolicyDegrees>;\n  using Substrate = galois::graphs::GluonEdgeSubstrate<Graph>;\n\n  initTimer.start();\n  std::vector<unsigned> scaleFactor;\n  MiningGraphPtr<NodeData, EdgeData> g;\n  MiningSubstratePtr<NodeData, EdgeData> s;\n\n#ifdef GALOIS_ENABLE_GPU\n  internal::heteroSetup(scaleFactor);\n#endif\n  g = loadDGraph<NodeData, EdgeData, iterateOutEdges>(loadProxyEdges);\n\n  // load substrate\n  const auto& net = galois::runtime::getSystemNetworkInterface();\n  // if you want to load proxy edges (true), then do nothing should be false\n  // hence the use of ! 
to negate\n  s = std::make_unique<Substrate>(*g, net.ID, net.Num, !loadProxyEdges,\n                                  commMetadata);\n\n// marshal graph to GPU as necessary\n#ifdef GALOIS_ENABLE_GPU\n  if (net.ID == 0) {\n    galois::gPrint(\"Beginning to marshal graph to GPU\\n\");\n  }\n  marshalGPUGraph(s, cuda_ctx, loadProxyEdges);\n#endif\n\n  initTimer.stop();\n\n  return std::make_pair(std::move(g), std::move(s));\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/libdistbench/include/DistBench/Output.h",
    "content": "#ifndef GALOIS_DISTBENCH_OUTPUT_H\n#define GALOIS_DISTBENCH_OUTPUT_H\n\n#include <stdint.h>\n#include <string>\n#include <fstream>\n#include \"galois/gIO.h\"\n\nstd::string makeOutputFilename(const std::string& outputDir);\n\ntemplate <typename T>\nvoid writeOutput(const std::string& outputDir, const std::string& /*fieldName*/,\n                 T* values, size_t length, uint64_t* IDs) {\n  std::string filename = makeOutputFilename(outputDir);\n\n  std::ofstream outputFile(filename.c_str());\n\n  for (size_t i = 0; i < length; i++) {\n    outputFile << *(IDs++) << \" \" << *(values++) << \"\\n\";\n  }\n\n  galois::gPrint(\"Output written to: \", filename, \"\\n\");\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/libdistbench/include/DistBench/Start.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GALOIS_DISTBENCH_START_H\n#define GALOIS_DISTBENCH_START_H\n\n#include \"DistBench/Input.h\"\n#include \"galois/AtomicHelpers.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/GluonSubstrate.h\"\n#include \"galois/Version.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n#ifdef GALOIS_ENABLE_GPU\n#include \"galois/cuda/HostDecls.h\"\n#else\n// dummy struct declaration to allow non-het code to compile without\n// having to include cuda_context_decl\nstruct CUDA_Context;\n#endif\n\n//! standard global options to the benchmarks\nnamespace cll = llvm::cl;\n\nextern cll::opt<int> numThreads;\nextern cll::opt<int> numRuns;\nextern cll::opt<std::string> statFile;\n//! If set, ignore partitioning comm optimizations\nextern cll::opt<bool> partitionAgnostic;\n//! 
Set method for metadata sends\nextern cll::opt<DataCommMode> commMetadata;\n//! Where to write output if output is set\nextern cll::opt<std::string> outputLocation;\nextern cll::opt<bool> output;\n\n#ifdef GALOIS_ENABLE_GPU\nenum Personality { CPU, GPU_CUDA };\n\nstd::string personality_str(Personality p);\n\nextern int gpudevice;\nextern Personality personality;\nextern cll::opt<unsigned> scalegpu;\nextern cll::opt<unsigned> scalecpu;\nextern cll::opt<int> num_nodes;\nextern cll::opt<std::string> personality_set;\n#endif\n\n/**\n * Initialize Galois runtime for distributed benchmarks and print/report various\n * information.\n *\n * @param argc argument count\n * @param argv list of arguments\n * @param app Name of the application\n * @param desc Description of the application\n * @param url URL to the application\n */\nvoid DistBenchStart(int argc, char** argv, const char* app,\n                    const char* desc = nullptr, const char* url = nullptr);\n\ntemplate <typename NodeData, typename EdgeData>\nusing DistGraphPtr =\n    std::unique_ptr<galois::graphs::DistGraph<NodeData, EdgeData>>;\ntemplate <typename NodeData, typename EdgeData>\nusing DistSubstratePtr = std::unique_ptr<galois::graphs::GluonSubstrate<\n    galois::graphs::DistGraph<NodeData, EdgeData>>>;\n\n#ifdef GALOIS_ENABLE_GPU\n// in internal namespace because this function shouldn't be called elsewhere\nnamespace internal {\nvoid heteroSetup(std::vector<unsigned>& scaleFactor);\n}; // namespace internal\n\n/**\n * Given a loaded graph, marshal it over to the GPU device for use\n * on the GPU.\n *\n * @param gluonSubstrate Gluon substrate containing info needed to marshal\n * to GPU\n * @param cuda_ctx the CUDA context of the currently running program\n */\ntemplate <typename NodeData, typename EdgeData>\nstatic void\nmarshalGPUGraph(DistSubstratePtr<NodeData, EdgeData>& gluonSubstrate,\n                struct CUDA_Context** cuda_ctx) {\n  auto& net                 = 
galois::runtime::getSystemNetworkInterface();\n  const unsigned my_host_id = galois::runtime::getHostID();\n\n  galois::StatTimer marshalTimer(\"TIMER_GRAPH_MARSHAL\", \"DistBench\");\n\n  marshalTimer.start();\n\n  if (personality == GPU_CUDA) {\n    *cuda_ctx = get_CUDA_context(my_host_id);\n\n    if (!init_CUDA_context(*cuda_ctx, gpudevice)) {\n      GALOIS_DIE(\"failed to initialize CUDA context\");\n    }\n\n    MarshalGraph m;\n    (*gluonSubstrate).getMarshalGraph(m);\n    load_graph_CUDA(*cuda_ctx, m, net.Num);\n  }\n\n  marshalTimer.stop();\n}\n#endif\n\n/**\n * Loads a graph into memory. Details/partitioning will be handled in the\n * construct graph call.\n *\n * The user should NOT call this function.\n *\n * @tparam NodeData struct specifying what kind of data the node contains\n * @tparam EdgeData type specifying the type of the edge data\n * @tparam iterateOutEdges Boolean specifying if the graph should be iterating\n * over outgoing or incoming edges\n *\n * @param scaleFactor Vector that specifies how much of the graph each\n * host should get\n *\n * @returns Pointer to the loaded graph\n */\ntemplate <typename NodeData, typename EdgeData, bool iterateOutEdges = true>\nstatic DistGraphPtr<NodeData, EdgeData>\nloadDistGraph(std::vector<unsigned>& scaleFactor) {\n  galois::StatTimer dGraphTimer(\"GraphConstructTime\", \"DistBench\");\n  dGraphTimer.start();\n\n  DistGraphPtr<NodeData, EdgeData> loadedGraph =\n      constructGraph<NodeData, EdgeData, iterateOutEdges>(scaleFactor);\n  assert(loadedGraph != nullptr);\n\n  dGraphTimer.stop();\n\n  // Save local graph structure\n  // if (saveLocalGraph)\n  //  (*loadedGraph).save_local_graph_to_file(localGraphFileName);\n\n  return loadedGraph;\n}\n\n/**\n * Loads a symmetric graph into memory.\n * Details/partitioning will be handled in the construct graph call.\n *\n * The user should NOT call this function.\n *\n * @tparam NodeData struct specifying what kind of data the node contains\n * @tparam 
EdgeData type specifying the type of the edge data\n *\n * @param scaleFactor Vector that specifies how much of the graph each\n * host should get\n *\n * @returns Pointer to the loaded symmetric graph\n */\ntemplate <typename NodeData, typename EdgeData>\nstatic DistGraphPtr<NodeData, EdgeData>\nloadSymmetricDistGraph(std::vector<unsigned>& scaleFactor) {\n  galois::StatTimer dGraphTimer(\"GraphConstructTime\", \"DistBench\");\n  dGraphTimer.start();\n\n  DistGraphPtr<NodeData, EdgeData> loadedGraph = nullptr;\n\n  // make sure that the symmetric graph flag was passed in\n  if (symmetricGraph) {\n    loadedGraph = constructSymmetricGraph<NodeData, EdgeData>(scaleFactor);\n  } else {\n    GALOIS_DIE(\"This application requires a symmetric graph input;\"\n               \" please use the -symmetricGraph flag \"\n               \" to indicate the input is a symmetric graph.\");\n  }\n\n  assert(loadedGraph != nullptr);\n\n  dGraphTimer.stop();\n\n  // Save local graph structure\n  // if (saveLocalGraph)\n  //  (*loadedGraph).save_local_graph_to_file(localGraphFileName);\n\n  return loadedGraph;\n}\n\n/**\n * Loads a graph into memory, setting up heterogeneous execution if\n * necessary. 
Unlike the dGraph load functions above, this is meant\n * to be exposed to the user.\n *\n * @tparam NodeData struct specifying what kind of data the node contains\n * @tparam EdgeData type specifying the type of the edge data\n * @tparam iterateOutEdges Boolean specifying if the graph should be iterating\n * over outgoing or incoming edges\n *\n * @param cuda_ctx CUDA context of the currently running program; only matters\n * if using GPU\n *\n * @returns Pointer to the loaded graph and Gluon substrate\n */\ntemplate <typename NodeData, typename EdgeData, bool iterateOutEdges = true>\nstd::pair<DistGraphPtr<NodeData, EdgeData>,\n          DistSubstratePtr<NodeData, EdgeData>>\n#ifdef GALOIS_ENABLE_GPU\ndistGraphInitialization(struct CUDA_Context** cuda_ctx) {\n#else\ndistGraphInitialization() {\n#endif\n  using Graph     = galois::graphs::DistGraph<NodeData, EdgeData>;\n  using Substrate = galois::graphs::GluonSubstrate<Graph>;\n  std::vector<unsigned> scaleFactor;\n  DistGraphPtr<NodeData, EdgeData> g;\n  DistSubstratePtr<NodeData, EdgeData> s;\n\n#ifdef GALOIS_ENABLE_GPU\n  internal::heteroSetup(scaleFactor);\n#endif\n  g = loadDistGraph<NodeData, EdgeData, iterateOutEdges>(scaleFactor);\n  // load substrate\n  const auto& net = galois::runtime::getSystemNetworkInterface();\n  s = std::make_unique<Substrate>(*g, net.ID, net.Num, g->isTransposed(),\n                                  g->cartesianGrid(), partitionAgnostic,\n                                  commMetadata);\n\n// marshal graph to GPU as necessary\n#ifdef GALOIS_ENABLE_GPU\n  marshalGPUGraph(s, cuda_ctx);\n#endif\n\n  return std::make_pair(std::move(g), std::move(s));\n}\n\n/**\n * Loads a symmetric graph into memory, setting up heterogeneous execution if\n * necessary. 
Unlike the dGraph load functions above, this is meant\n * to be exposed to the user.\n *\n * @tparam NodeData struct specifying what kind of data the node contains\n * @tparam EdgeData type specifying the type of the edge data\n *\n * @param cuda_ctx CUDA context of the currently running program; only matters\n * if using GPU\n *\n * @returns Pointer to the loaded symmetric graph\n */\ntemplate <typename NodeData, typename EdgeData>\nstd::pair<DistGraphPtr<NodeData, EdgeData>,\n          DistSubstratePtr<NodeData, EdgeData>>\n#ifdef GALOIS_ENABLE_GPU\nsymmetricDistGraphInitialization(struct CUDA_Context** cuda_ctx) {\n#else\nsymmetricDistGraphInitialization() {\n#endif\n  using Graph     = galois::graphs::DistGraph<NodeData, EdgeData>;\n  using Substrate = galois::graphs::GluonSubstrate<Graph>;\n  std::vector<unsigned> scaleFactor;\n  DistGraphPtr<NodeData, EdgeData> g;\n  DistSubstratePtr<NodeData, EdgeData> s;\n\n#ifdef GALOIS_ENABLE_GPU\n  internal::heteroSetup(scaleFactor);\n#endif\n  g = loadSymmetricDistGraph<NodeData, EdgeData>(scaleFactor);\n  // load substrate\n  const auto& net = galois::runtime::getSystemNetworkInterface();\n  s = std::make_unique<Substrate>(*g, net.ID, net.Num, g->isTransposed(),\n                                  g->cartesianGrid(), partitionAgnostic,\n                                  commMetadata);\n\n// marshal graph to GPU as necessary\n#ifdef GALOIS_ENABLE_GPU\n  marshalGPUGraph(s, cuda_ctx);\n#endif\n\n  return std::make_pair(std::move(g), std::move(s));\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/libdistbench/src/Input.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n/**\n * @file Reader.cpp\n *\n * Contains definitions for command line arguments related to distributed\n * graph loading.\n */\n\n#include \"DistBench/Input.h\"\n\nusing namespace galois::graphs;\n\nnamespace cll = llvm::cl;\n\ncll::opt<std::string> inputFile(cll::Positional, cll::desc(\"<input file>\"),\n                                cll::Required);\ncll::opt<std::string> inputFileTranspose(\"graphTranspose\",\n                                         cll::desc(\"<input file, transposed>\"),\n                                         cll::init(\"\"));\ncll::opt<bool>\n    symmetricGraph(\"symmetricGraph\",\n                   cll::desc(\"Specify that the input graph is symmetric\"),\n                   cll::init(false));\n\ncll::opt<PARTITIONING_SCHEME> partitionScheme(\n    \"partition\", cll::desc(\"Type of 
partitioning.\"),\n    cll::values(\n        clEnumValN(OEC, \"oec\", \"Outgoing Edge-Cut (default)\"),\n        clEnumValN(IEC, \"iec\", \"Incoming Edge-Cut\"),\n        clEnumValN(HOVC, \"hovc\", \"Outgoing Hybrid Vertex-Cut\"),\n        clEnumValN(HIVC, \"hivc\", \"Incoming Hybrid Vertex-Cut\"),\n        clEnumValN(CART_VCUT, \"cvc\", \"Cartesian Vertex-Cut of oec\"),\n        clEnumValN(CART_VCUT_IEC, \"cvc-iec\", \"Cartesian Vertex-Cut of iec\"),\n        // clEnumValN(CEC, \"cec\", \"Custom edge cut from vertexID mapping\"),\n        clEnumValN(GINGER_O, \"ginger-o\", \"ginger, outgiong edges, using CuSP\"),\n        clEnumValN(GINGER_I, \"ginger-i\", \"ginger, incoming edges, using CuSP\"),\n        clEnumValN(FENNEL_O, \"fennel-o\",\n                   \"fennel, outgoing edge cut, using CuSP\"),\n        clEnumValN(FENNEL_I, \"fennel-i\",\n                   \"fennel, incoming edge cut, using CuSP\"),\n        clEnumValN(SUGAR_O, \"sugar-o\",\n                   \"fennel, incoming edge cut, using CuSP\")),\n    cll::init(OEC));\n\ncll::opt<bool> readFromFile(\"readFromFile\",\n                            cll::desc(\"Set this flag if graph is to be \"\n                                      \"constructed from file (file must be \"\n                                      \"created by Abelian CSR)\"),\n                            cll::init(false), cll::Hidden);\n\ncll::opt<std::string>\n    localGraphFileName(\"localGraphFileName\",\n                       cll::desc(\"Name of the local file to construct \"\n                                 \"local graph (file must be created by \"\n                                 \"Abelian CSR)\"),\n                       cll::init(\"local_graph\"), cll::Hidden);\n\ncll::opt<bool> saveLocalGraph(\"saveLocalGraph\",\n                              cll::desc(\"Set to save the local CSR graph\"),\n                              cll::init(false), cll::Hidden);\n\ncll::opt<std::string> mastersFile(\"mastersFile\",\n                    
              cll::desc(\"File specifying masters blocking\"),\n                                  cll::init(\"\"), cll::Hidden);\n"
  },
  {
    "path": "lonestar/libdistbench/src/Output.cpp",
    "content": "#include \"DistBench/Output.h\"\n#include \"galois/runtime/Network.h\"\n\n#include <iomanip>\n\nnamespace {\nstd::string zeroPad(int num, int width) {\n  std::ostringstream out;\n\n  out << std::setw(width) << std::setfill('0') << num;\n\n  return out.str();\n}\n\n} // namespace\n\nstd::string makeOutputFilename(const std::string& outputDir) {\n  std::string filename = zeroPad(galois::runtime::getHostID(), 8);\n\n  std::string output{outputDir};\n  if (output.empty() || output.compare(output.size() - 1, 1, \"/\") == 0) {\n    output += filename;\n  } else {\n    output += \"/\" + filename;\n  }\n\n  return output;\n}\n"
  },
  {
    "path": "lonestar/libdistbench/src/Start.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"DistBench/Start.h\"\n#include \"galois/Version.h\"\n#include \"galois/runtime/Network.h\"\n#include \"galois/runtime/DistStats.h\"\n#include \"galois/runtime/DataCommMode.h\"\n\n#include <sstream>\n\n////////////////////////////////////////////////////////////////////////////////\n// Command line args\n////////////////////////////////////////////////////////////////////////////////\n\ncll::opt<int> numThreads(\"t\", cll::desc(\"Number of threads (default 1)\"),\n                         cll::init(1));\ncll::opt<int> numRuns(\"runs\", cll::desc(\"Number of runs (default 3)\"),\n                      cll::init(3));\ncll::opt<std::string>\n    statFile(\"statFile\", cll::desc(\"Optional output file to print stats to\"));\n\ncll::opt<bool>\n    partitionAgnostic(\"partitionAgnostic\",\n                      
cll::desc(\"Do not use partition-aware optimizations\"),\n                      cll::init(false), cll::Hidden);\n\n// TODO: use enums\ncll::opt<DataCommMode> commMetadata(\n    \"metadata\", cll::desc(\"Communication metadata\"),\n    cll::values(clEnumValN(noData, \"auto\", \"Dynamically choose the metadata\"),\n                clEnumValN(bitsetData, \"bitset\", \"Use bitset metadata always\"),\n                clEnumValN(offsetsData, \"offsets\",\n                           \"Use offsets metadata always\"),\n                clEnumValN(gidsData, \"gids\", \"Use global IDs metadata always\"),\n                clEnumValN(onlyData, \"none\",\n                           \"Do not use any metadata (sends \"\n                           \"non-updated values)\")),\n    cll::init(noData), cll::Hidden);\n\ncll::opt<std::string> outputLocation(\n    \"outputLocation\",\n    cll::desc(\"Location (directory) to write results to when output is true\"));\n\ncll::opt<bool> output(\"output\", cll::desc(\"Write result (default false)\"),\n                      cll::init(false));\n\n#ifdef GALOIS_ENABLE_GPU\nstd::string personality_str(Personality p) {\n  switch (p) {\n  case CPU:\n    return \"CPU\";\n  case GPU_CUDA:\n    return \"GPU_CUDA\";\n  }\n\n  assert(false && \"Invalid personality\");\n  return \"\";\n}\n\nint gpudevice;\nPersonality personality = CPU;\n\ncll::opt<unsigned> scalegpu(\n    \"scalegpu\",\n    cll::desc(\"Scale GPU workload w.r.t. CPU, default is proportionally \"\n              \"equal workload to CPU and GPU (1)\"),\n    cll::init(1));\ncll::opt<unsigned> scalecpu(\n    \"scalecpu\",\n    cll::desc(\"Scale CPU workload w.r.t. 
GPU, default is proportionally \"\n              \"equal workload to CPU and GPU (1)\"),\n    cll::init(1));\ncll::opt<int> num_nodes(\n    \"num_nodes\",\n    cll::desc(\"Num of physical nodes with devices (default = num of hosts): \"\n              \"detect GPU to use for each host automatically\"),\n    cll::init(-1));\ncll::opt<std::string> personality_set(\n    \"pset\",\n    cll::desc(\"String specifying personality for hosts on each physical \"\n              \"node. 'c'=CPU, 'g'=GPU (default 'c')\"),\n    cll::init(\"c\"));\n#endif\n\nstatic void PrintVersion(llvm::raw_ostream& out) {\n  out << \"D-Galois Benchmark Suite v\" << galois::getVersion() << \" (\"\n      << galois::getRevision() << \")\\n\";\n  out.flush();\n}\n\n////////////////////////////////////////////////////////////////////////////////\n//! initialize benchmark + functions to help initialization\n////////////////////////////////////////////////////////////////////////////////\n\nvoid DistBenchStart(int argc, char** argv, const char* app, const char* desc,\n                    const char* url) {\n  llvm::cl::SetVersionPrinter(PrintVersion);\n  llvm::cl::ParseCommandLineOptions(argc, argv);\n  numThreads = galois::setActiveThreads(numThreads);\n  galois::runtime::setStatFile(statFile);\n\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  if (net.ID == 0) {\n    PrintVersion(llvm::outs());\n    llvm::outs() << \"Copyright (C) \" << galois::getCopyrightYear()\n                 << \" The University of Texas at Austin\\n\";\n    llvm::outs() << \"http://iss.ices.utexas.edu/galois/\\n\\n\";\n    llvm::outs() << \"application: \" << (app ? 
app : \"unspecified\") << \"\\n\";\n\n    if (desc) {\n      llvm::outs() << desc << \"\\n\";\n    }\n    if (url) {\n      llvm::outs()\n          << \"http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/\" << url\n          << \"\\n\";\n    }\n    llvm::outs() << \"\\n\";\n    llvm::outs().flush();\n\n    std::ostringstream cmdout;\n\n    for (int i = 0; i < argc; ++i) {\n      cmdout << argv[i];\n      if (i != argc - 1)\n        cmdout << \" \";\n    }\n\n    galois::runtime::reportParam(\"DistBench\", \"CommandLine\", cmdout.str());\n    galois::runtime::reportParam(\"DistBench\", \"Threads\", numThreads);\n    galois::runtime::reportParam(\"DistBench\", \"Hosts\", net.Num);\n    galois::runtime::reportParam(\"DistBench\", \"Runs\", numRuns);\n    galois::runtime::reportParam(\"DistBench\", \"Run_UUID\",\n                                 galois::runtime::getRandUUID());\n    galois::runtime::reportParam(\"DistBench\", \"Input\", inputFile);\n    galois::runtime::reportParam(\"DistBench\", \"PartitionScheme\",\n                                 EnumToString(partitionScheme));\n  }\n\n  char name[256];\n  gethostname(name, 256);\n  galois::runtime::reportParam(\"DistBench\", \"Hostname\", name);\n}\n\n#ifdef GALOIS_ENABLE_GPU\n/**\n * Processes/setups the specified heterogeneous configuration (the pset\n * command line option) and sets up the scale factor vector for\n * graph partitioning.\n *\n * @param scaleFactor input and output: an empty vector that will hold\n * the scale factor (i.e. 
how much each host will get relative to\n * other hosts) at the end of the function\n */\nvoid internal::heteroSetup(std::vector<unsigned>& scaleFactor) {\n  const unsigned my_host_id = galois::runtime::getHostID();\n\n  // Parse arg string when running on multiple hosts and update\n  // personality with corresponding value.\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  if (num_nodes == -1)\n    num_nodes = net.Num;\n\n  assert((net.Num % num_nodes) == 0);\n\n  if (personality_set.length() == (net.Num / num_nodes)) {\n    switch (personality_set.c_str()[my_host_id % (net.Num / num_nodes)]) {\n    case 'g':\n      personality = GPU_CUDA;\n      break;\n    case 'c':\n    default:\n      personality = CPU;\n      break;\n    }\n\n    if (personality == GPU_CUDA) {\n      gpudevice = get_gpu_device_id(personality_set, num_nodes);\n    } else {\n      gpudevice = -1;\n    }\n\n    // scale factor setup\n    if ((scalecpu > 1) || (scalegpu > 1)) {\n      for (unsigned i = 0; i < net.Num; ++i) {\n        if (personality_set.c_str()[i % num_nodes] == 'c') {\n          scaleFactor.push_back(scalecpu);\n        } else {\n          scaleFactor.push_back(scalegpu);\n        }\n      }\n    }\n  } else {\n    galois::gWarn(\n        \"Command line option -pset ignored because its string length is not \"\n        \"equal to the number of processes/hosts on each physical node\");\n  }\n}\n#endif\n"
  },
  {
    "path": "lonestar/liblonestar/CMakeLists.txt",
    "content": "add_library(lonestar STATIC src/BoilerPlate.cpp)\n\ntarget_include_directories(lonestar PUBLIC\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  $<INSTALL_INTERFACE:include>\n)\n\ntarget_link_libraries(lonestar Galois::shmem LLVMSupport)\n"
  },
  {
    "path": "lonestar/liblonestar/include/Lonestar/BFS_SSSP.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef LONESTAR_BFS_SSSP_H\n#define LONESTAR_BFS_SSSP_H\n#include <iostream>\n#include <cstdlib>\n\ntemplate <typename Graph, typename _DistLabel, bool USE_EDGE_WT,\n          ptrdiff_t EDGE_TILE_SIZE = 256>\nstruct BFS_SSSP {\n\n  using Dist = _DistLabel;\n\n  constexpr static const Dist DIST_INFINITY =\n      std::numeric_limits<Dist>::max() / 2 - 1;\n\n  using GNode = typename Graph::GraphNode;\n  using EI    = typename Graph::edge_iterator;\n\n  struct UpdateRequest {\n    GNode src;\n    Dist dist;\n    UpdateRequest(const GNode& N, Dist W) : src(N), dist(W) {}\n    UpdateRequest() : src(), dist(0) {}\n\n    friend bool operator<(const UpdateRequest& left,\n                          const UpdateRequest& right) {\n      return left.dist == right.dist ? 
left.src < right.src\n                                     : left.dist < right.dist;\n    }\n  };\n\n  struct UpdateRequestIndexer {\n    unsigned shift;\n\n    template <typename R>\n    unsigned int operator()(const R& req) const {\n      unsigned int t = req.dist >> shift;\n      return t;\n    }\n  };\n\n  struct SrcEdgeTile {\n    GNode src;\n    Dist dist;\n    EI beg;\n    EI end;\n\n    friend bool operator<(const SrcEdgeTile& left, const SrcEdgeTile& right) {\n      return left.dist == right.dist ? left.src < right.src\n                                     : left.dist < right.dist;\n    }\n  };\n\n  struct SrcEdgeTileMaker {\n    GNode src;\n    Dist dist;\n\n    SrcEdgeTile operator()(const EI& beg, const EI& end) const {\n      return SrcEdgeTile{src, dist, beg, end};\n    }\n  };\n\n  template <typename WL, typename TileMaker>\n  static void pushEdgeTiles(WL& wl, EI beg, const EI end, const TileMaker& f) {\n    assert(beg <= end);\n\n    if ((end - beg) > EDGE_TILE_SIZE) {\n      for (; beg + EDGE_TILE_SIZE < end;) {\n        auto ne = beg + EDGE_TILE_SIZE;\n        assert(ne < end);\n        wl.push(f(beg, ne));\n        beg = ne;\n      }\n    }\n\n    if ((end - beg) > 0) {\n      wl.push(f(beg, end));\n    }\n  }\n\n  template <typename WL, typename TileMaker>\n  static void pushEdgeTiles(WL& wl, Graph& graph, GNode src,\n                            const TileMaker& f) {\n    auto beg       = graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n    const auto end = graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n\n    pushEdgeTiles(wl, beg, end, f);\n  }\n\n  template <typename WL, typename TileMaker>\n  static void pushEdgeTilesParallel(WL& wl, Graph& graph, GNode src,\n                                    const TileMaker& f) {\n\n    auto beg       = graph.edge_begin(src);\n    const auto end = graph.edge_end(src);\n\n    if ((end - beg) > EDGE_TILE_SIZE) {\n\n      galois::on_each(\n          [&](const unsigned tid, const unsigned numT) 
{\n            auto p = galois::block_range(beg, end, tid, numT);\n\n            auto b       = p.first;\n            const auto e = p.second;\n\n            pushEdgeTiles(wl, b, e, f);\n          },\n          galois::loopname(\"Init-Tiling\"));\n\n    } else if ((end - beg) > 0) {\n      wl.push(f(beg, end));\n    }\n  }\n\n  struct ReqPushWrap {\n    template <typename C>\n    void operator()(C& cont, const GNode& n, const Dist& dist,\n                    const char* const) const {\n      (*this)(cont, n, dist);\n    }\n\n    template <typename C>\n    void operator()(C& cont, const GNode& n, const Dist& dist) const {\n      cont.push(UpdateRequest(n, dist));\n    }\n  };\n\n  struct SrcEdgeTilePushWrap {\n\n    Graph& graph;\n\n    template <typename C>\n    void operator()(C& cont, const GNode& n, const Dist& dist,\n                    const char* const) const {\n      pushEdgeTilesParallel(cont, graph, n, SrcEdgeTileMaker{n, dist});\n    }\n\n    template <typename C>\n    void operator()(C& cont, const GNode& n, const Dist& dist) const {\n      pushEdgeTiles(cont, graph, n, SrcEdgeTileMaker{n, dist});\n    }\n  };\n\n  struct OutEdgeRangeFn {\n    Graph& graph;\n    auto operator()(const GNode& n) const {\n      return graph.edges(n, galois::MethodFlag::UNPROTECTED);\n    }\n\n    auto operator()(const UpdateRequest& req) const {\n      return graph.edges(req.src, galois::MethodFlag::UNPROTECTED);\n    }\n  };\n\n  struct TileRangeFn {\n    template <typename T>\n    auto operator()(const T& tile) const {\n      return galois::makeIterRange(tile.beg, tile.end);\n    }\n  };\n\n  struct not_consistent {\n    Graph& g;\n    std::atomic<bool>& refb;\n    not_consistent(Graph& g, std::atomic<bool>& refb) : g(g), refb(refb) {}\n\n    template <bool useWt, typename iiTy>\n    Dist getEdgeWeight(iiTy,\n                       typename std::enable_if<!useWt>::type* = nullptr) const {\n      return 1;\n    }\n\n    template <bool useWt, typename iiTy>\n    Dist 
getEdgeWeight(iiTy ii,\n                       typename std::enable_if<useWt>::type* = nullptr) const {\n      return g.getEdgeData(ii);\n    }\n\n    void operator()(typename Graph::GraphNode node) const {\n      Dist sd = g.getData(node);\n      if (sd == DIST_INFINITY)\n        return;\n\n      for (auto ii : g.edges(node)) {\n        auto dst = g.getEdgeDst(ii);\n        Dist dd  = g.getData(dst);\n        Dist ew  = getEdgeWeight<USE_EDGE_WT>(ii);\n        if (dd > sd + ew) {\n          std::cout << \"Wrong label: \" << dd << \", on node: \" << dst\n                    << \", correct label from src node \" << node << \" is \"\n                    << sd + ew << \"\\n\"; // XXX\n          refb = true;\n          // return;\n        }\n      }\n    }\n  };\n\n  struct max_dist {\n    Graph& g;\n    galois::GReduceMax<Dist>& m;\n\n    max_dist(Graph& g, galois::GReduceMax<Dist>& m) : g(g), m(m) {}\n\n    void operator()(typename Graph::GraphNode node) const {\n      Dist d = g.getData(node);\n      if (d == DIST_INFINITY)\n        return;\n      m.update(d);\n    }\n  };\n\n  static bool verify(Graph& graph, GNode source) {\n    if (graph.getData(source) != 0) {\n      std::cerr << \"ERROR: source has non-zero dist value == \"\n                << graph.getData(source) << std::endl;\n      return false;\n    }\n\n    std::atomic<size_t> notVisited(0);\n    galois::do_all(galois::iterate(graph), [&notVisited, &graph](GNode node) {\n      if (graph.getData(node) >= DIST_INFINITY)\n        ++notVisited;\n    });\n\n    if (notVisited)\n      std::cerr << notVisited\n                << \" unvisited nodes; this is an error if the graph is \"\n                   \"strongly connected\\n\";\n\n    std::atomic<bool> not_c(false);\n    galois::do_all(galois::iterate(graph), not_consistent(graph, not_c));\n\n    if (not_c) {\n      std::cerr << \"node found with incorrect distance\\n\";\n      return false;\n    }\n\n    galois::GReduceMax<Dist> m;\n    
galois::do_all(galois::iterate(graph), max_dist(graph, m));\n\n    std::cout << \"max dist: \" << m.reduce() << \"\\n\";\n\n    return true;\n  }\n};\n\ntemplate <typename T, typename BucketFunc, size_t MAX_BUCKETS = 543210ul>\nclass SerialBucketWL {\n\n  using Bucket      = std::deque<T>;\n  using BucketsCont = std::vector<Bucket>;\n\n  size_t m_minBucket;\n  BucketFunc m_func;\n  BucketsCont m_buckets;\n  Bucket m_lastBucket;\n\n  static_assert(MAX_BUCKETS > 0, \"MAX_BUCKETS must be > 0\");\n\npublic:\n  explicit SerialBucketWL(const BucketFunc& f) : m_minBucket(0ul), m_func(f) {\n    // reserve enough so that resize never reallocates memory\n    // otherwise, minBucket may return an invalid reference\n    m_buckets.reserve(MAX_BUCKETS);\n  }\n\n  void push(const T& item) {\n    size_t b = m_func(item);\n    assert(b >= m_minBucket && \"can't push below m_minBucket\");\n\n    if (b < m_buckets.size()) {\n      m_buckets[b].push_back(item);\n      return;\n    } else {\n      if (b >= MAX_BUCKETS) {\n        std::cerr << \"Increase MAX_BUCKETS limit\" << std::endl;\n        m_lastBucket.push_back(item);\n      } else {\n        m_buckets.resize(b + 1);\n        m_buckets[b].push_back(item);\n      }\n    }\n  }\n\n  void goToNextBucket(void) {\n    while (m_minBucket < m_buckets.size() && m_buckets[m_minBucket].empty()) {\n      ++m_minBucket;\n    }\n  }\n\n  Bucket& minBucket(void) {\n    if (m_minBucket < m_buckets.size()) {\n      return m_buckets[m_minBucket];\n    } else {\n      return m_lastBucket;\n    }\n  }\n\n  bool empty(void) const { return emptyImpl(m_minBucket); }\n\n  bool allEmpty(void) const { return emptyImpl(0ul); }\n\nprivate:\n  bool emptyImpl(size_t start) const {\n    for (size_t i = start; i < m_buckets.size(); ++i) {\n      if (!m_buckets[i].empty()) {\n        return false;\n      }\n    }\n\n    return m_lastBucket.empty();\n  }\n};\n\n#endif //  LONESTAR_BFS_SSSP_H\n"
  },
  {
    "path": "lonestar/liblonestar/include/Lonestar/BoilerPlate.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef LONESTAR_BOILERPLATE_H\n#define LONESTAR_BOILERPLATE_H\n\n#include \"galois/Galois.h\"\n#include \"galois/Version.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n//! standard global options to the benchmarks\nextern llvm::cl::opt<bool> skipVerify;\nextern llvm::cl::opt<int> numThreads;\nextern llvm::cl::opt<std::string> statFile;\nextern llvm::cl::opt<bool> symmetricGraph;\n\n//! initialize lonestar benchmark\nvoid LonestarStart(int argc, char** argv, const char* app, const char* desc,\n                   const char* url, llvm::cl::opt<std::string>* input);\nvoid LonestarStart(int argc, char** argv);\n#endif\n"
  },
  {
    "path": "lonestar/liblonestar/include/Lonestar/Utils.h",
    "content": "\n\n#pragma once\n#include <random>\n#include <vector>\n#include <algorithm>\n\n//! Used to pick random non-zero degree starting points for search algorithms\n//! This code has been copied from GAP benchmark suite\n//! (https://github.com/sbeamer/gapbs/blob/master/src/benchmark.h)\ntemplate <typename Graph>\nclass SourcePicker {\n  static const uint32_t kRandSeed;\n  std::mt19937 rng;\n  std::uniform_int_distribution<typename Graph::GraphNode> udist;\n  const Graph& graph;\n\npublic:\n  explicit SourcePicker(const Graph& g)\n      : rng(kRandSeed), udist(0, g.size() - 1), graph(g) {}\n\n  auto PickNext() {\n    typename Graph::GraphNode source;\n    do {\n      source = udist(rng);\n    } while (graph.getDegree(source) == 0);\n    return source;\n  }\n};\ntemplate <typename Graph>\nconst uint32_t SourcePicker<Graph>::kRandSeed = 27491095;\n\n//! Used to determine if a graph has power-law degree distribution or not\n//! by sampling some of the vertices in the graph randomly\n//! This code has been copied from GAP benchmark suite\n//! (https://github.com/sbeamer/gapbs/blob/master/src/tc.cc WorthRelabelling())\ntemplate <typename Graph>\nbool isApproximateDegreeDistributionPowerLaw(const Graph& graph) {\n  uint32_t averageDegree = graph.sizeEdges() / graph.size();\n  if (averageDegree < 10)\n    return false;\n  SourcePicker<Graph> sp(graph);\n  uint32_t num_samples = 1000;\n  if (num_samples > graph.size())\n    num_samples = graph.size();\n  uint32_t sample_total = 0;\n  std::vector<uint32_t> samples(num_samples);\n  for (uint32_t trial = 0; trial < num_samples; trial++) {\n    typename Graph::GraphNode node = sp.PickNext();\n    samples[trial]                 = graph.getDegree(node);\n    sample_total += samples[trial];\n  }\n  std::sort(samples.begin(), samples.end());\n  double sample_average = static_cast<double>(sample_total) / num_samples;\n  double sample_median  = samples[num_samples / 2];\n  return sample_average / 1.25 > sample_median;\n}\n"
  },
  {
    "path": "lonestar/liblonestar/src/BoilerPlate.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <sstream>\n\n//! standard global options to the benchmarks\nllvm::cl::opt<bool>\n    skipVerify(\"noverify\",\n               llvm::cl::desc(\"Skip verification step (default value false)\"),\n               llvm::cl::init(false));\nllvm::cl::opt<int>\n    numThreads(\"t\", llvm::cl::desc(\"Number of threads (default value 1)\"),\n               llvm::cl::init(1));\nllvm::cl::opt<std::string> statFile(\n    \"statFile\",\n    llvm::cl::desc(\"ouput file to print stats to (default value empty)\"),\n    llvm::cl::init(\"\"));\n\n//! Flag that forces user to be aware that they should be passing in a\n//! 
symmetric graph.\nllvm::cl::opt<bool>\n    symmetricGraph(\"symmetricGraph\",\n                   llvm::cl::desc(\"Specify that the input graph is symmetric\"),\n                   llvm::cl::init(false));\n\nstatic void LonestarPrintVersion(llvm::raw_ostream& out) {\n  out << \"LoneStar Benchmark Suite v\" << galois::getVersion() << \" (\"\n      << galois::getRevision() << \")\\n\";\n  out.flush();\n}\n\n//! initialize lonestar benchmark\nvoid LonestarStart(int argc, char** argv) {\n  LonestarStart(argc, argv, nullptr, nullptr, nullptr, nullptr);\n}\n\n//! initialize lonestar benchmark\nvoid LonestarStart(int argc, char** argv, const char* app, const char* desc,\n                   const char* url, llvm::cl::opt<std::string>* input) {\n  llvm::cl::SetVersionPrinter(LonestarPrintVersion);\n  llvm::cl::ParseCommandLineOptions(argc, argv);\n  numThreads = galois::setActiveThreads(numThreads);\n\n  galois::runtime::setStatFile(statFile);\n\n  LonestarPrintVersion(llvm::outs());\n  llvm::outs() << \"Copyright (C) \" << galois::getCopyrightYear()\n               << \" The University of Texas at Austin\\n\";\n  llvm::outs() << \"http://iss.ices.utexas.edu/galois/\\n\\n\";\n  llvm::outs() << \"application: \" << (app ? 
app : \"unspecified\") << \"\\n\";\n  if (desc) {\n    llvm::outs() << desc << \"\\n\";\n  }\n  if (url) {\n    llvm::outs() << \"http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/\"\n                 << url << \"\\n\";\n  }\n  llvm::outs() << \"\\n\";\n  llvm::outs().flush();\n\n  std::ostringstream cmdout;\n  for (int i = 0; i < argc; ++i) {\n    cmdout << argv[i];\n    if (i != argc - 1) {\n      cmdout << \" \";\n    }\n  }\n\n  galois::runtime::reportParam(\"(NULL)\", \"CommandLine\", cmdout.str());\n  galois::runtime::reportParam(\"(NULL)\", \"Threads\", numThreads);\n  galois::runtime::reportParam(\"(NULL)\", \"Hosts\", 1);\n  if (input) {\n    galois::runtime::reportParam(\"(NULL)\", \"Input\", input->getValue());\n  }\n\n  char name[256];\n  gethostname(name, 256);\n  galois::runtime::reportParam(\"(NULL)\", \"Hostname\", name);\n}\n"
  },
  {
    "path": "lonestar/mining/CMakeLists.txt",
    "content": "include_directories(${CMAKE_SOURCE_DIR}/libgalois/include)\n\nadd_subdirectory(libminingbench)\n\nadd_subdirectory(cpu)\n\nif(GALOIS_ENABLE_GPU)\n  add_subdirectory(gpu)\nendif()\n"
  },
  {
    "path": "lonestar/mining/README.md",
    "content": "Overview of Graph Pattern Mining (GPM) in Galois\n================================================================================\n\nThis directory contains benchmarks for efficient and flexible graph mining \nthat run using the Pangolin framework [1] on a multi-core CPU or a GPU. \nIt uses the bliss [2][3] library v0.73 for graph isomorphism test. \nThe license for this library is in the bliss\ndirectory: note that **it does not use the same license as the rest of Galois**.  \n\n[1] Xuhao Chen, Roshan Dathathri, Gurbinder Gill, Keshav Pingali, \nPangolin: An Efficient and Flexible Graph Pattern Mining System on CPU and GPU, VLDB 2020\n\n[2] Bliss: A tool for computing automorphism groups and canonical \nlabelings of graphs. http://www.tcs.hut.fi/Software/bliss/, 2017.\n\n[3] Tommi Junttila and Petteri Kaski. 2007. Engineering an efficient \ncanonical labeling tool for large and sparse graphs. In Proceedings \nof the Meeting on Algorithm Engineering & Experiments, 135-149.\n\nCompiling GPM Applications Through CMake \n================================================================================\n\nThe dependencies for LonestarGPU suite are the same as shared-memory.\nNote that LonestarGPU requires CUDA 8.0 and above.\n\nNote that heterogeneous Galois requires the cub and moderngpu git submodules,\nwhich can be cloned using the following commands.\n\n```Shell\ncd $GALOIS_ROOT\ngit submodule init\ngit submodule update\n```\nThese modules will be cloned in the ${GALOIS\\_ROOT}/external directory\n\nMining applications for CPU are enabled by default.\nTo build the mining applications for GPU, first, create a build directory and\nrun CMake with -DGALOIS\\_CUDA\\_CAPABILITY=\\<insert CUDA capability here\\> flag\nin the build directory. The CUDA capability should be one that your\nGPU supports. For example, if you wanted to build for a GTX 1080 and a K80,\nthe commands would look like this:\n\n```Shell\ncd ${GALOIS_ROOT}\nmkdir build\ncd build\ncmake ${GALOIS_ROOT} -DGALOIS_CUDA_CAPABILITY=\"3.7;6.1\"\n```\n\nAfter compiling through CMake, the system will create the 'lonestar/mining/cpu' \nand 'lonestar/mining/gpu' directories in ${GALOIS\\_ROOT}/build directory. \n\nCompiling Mining Applications\n================================================================================\n\nOnce CMake is completed, compile the provided mining apps by executing the \nfollowing command in the ${GALOIS\\_ROOT}/build/lonestar/mining directory.\n\n```Shell\nmake -j\n```\n\nYou can compile a specific app by executing the following commands (shown for motif-counting on CPU).\n\n```Shell\ncd cpu/motif-counting\nmake -j\n```\n\nINPUT\n================================================================================\n\nWe support four input graph formats: **gr**, **txt**, **adj**, **mtx**.\nFor unlabeled graphs, we use the gr graph format, same as other Galois benchmarks.\n**Make sure that the graph is symmetric and contains no self-loop or redundant edges**.\nIf not, use the convert tool in tools/graph-convert/ to convert the graph.\nWe use **adj** format for labeled graphs as also used by Arabesque and RStream.\nThe **adj** format takes as input graphs with the following formats (vertex labeled):\n\n```\n# <num vertices> <num edges>\n<vertex id> <vertex label> [<neighbour id1> <neighbour id2> ... <neighbour id n>]\n<vertex id> <vertex label> [<neighbour id1> <neighbour id2> ... <neighbour id n>]\n...\n```\n\nWe currently do not support graphs with labels on edges.\n\nVertex ids are expected to be sequential integers between 0 and (total number of vertices - 1).\nFor testing, we have prepared a test graph **citeseer**. After running `make input`,\nthe needed input files can be built in \"$BUILD_DIR/inputs/Mining\".\n\nRunning Provided Apps\n================================================================================\n\nThe following are a few example command lines.\n\n- `$ ./triangle-counting-mining-cpu -symmetricGraph -simpleGraph $BUILD_DIR/inputs/Mining/citeseer.csgr -t 28`\n- `$ ./k-clique-listing-cpu -symmetricGraph -simpleGraph $BUILD_DIR/inputs/Mining/citeseer.csgr -k=3 -t 28`\n- `$ ./motif-counting-cpu -symmetricGraph -simpleGraph $BUILD_DIR/inputs/Mining/citeseer.csgr -k=3 -t 56`\n- `$ ./frequent-subgraph-mining-cpu -symmetricGraph -simpleGraph $BUILD_DIR/inputs/Mining/citeseer.sadj -ft adj -k=2 -ms=300 -t 28`\n\nPERFORMANCE\n================================================================================\n\nPlease see details in the paper.\n\nCITATION\n================================================================================\n\nPlease cite the following paper if you use Pangolin:\n\n```\n@article{Pangolin,\n\ttitle={Pangolin: An Efficient and Flexible Graph Mining System on CPU and GPU},\n\tauthor={Xuhao Chen and Roshan Dathathri and Gurbinder Gill and Keshav Pingali},\n\tyear={2020},\n\tjournal = {Proc. VLDB Endow.},\n\tissue_date = {August 2020},\n\tvolume = {13},\n\tnumber = {8},\n\tmonth = aug,\n\tyear = {2020},\n\tnumpages = {12},\n\tpublisher = {VLDB Endowment},\n}\n```\n"
  },
  {
    "path": "lonestar/mining/cpu/CMakeLists.txt",
    "content": "function(add_test_mine type app)\n  set(options NOT_QUICK)\n  set(one_value_args)\n  set(multi_value_args REQUIRES COMMAND_PREFIX)\n  cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n\n  set(threads)\n  set(thr \"${GALOIS_NUM_TEST_THREADS}\")\n  while (${thr} GREATER 1)\n    list(APPEND threads ${thr})\n    math(EXPR thr \"${thr} / 2\")\n  endwhile()\n  list(APPEND threads \"1\")\n\n  foreach (thr ${threads})\n    set(name run-${type}-${app}-${thr})\n    add_test(NAME ${name} COMMAND ${app} ${X_UNPARSED_ARGUMENTS} -t ${thr})\n    if (NOT ${X_NOT_QUICK})\n      # Allow parallel tests\n      set_tests_properties(${name}\n        PROPERTIES ENVIRONMENT GALOIS_DO_NOT_BIND_THREADS=1 LABELS quick)\n    endif()\n  endforeach()\nendfunction(add_test_mine)\n\nadd_subdirectory(frequent-subgraph-mining)\nadd_subdirectory(k-clique-listing)\nadd_subdirectory(motif-counting)\nadd_subdirectory(triangle-counting)\n#add_subdirectory(subgraph-listing)\n"
  },
  {
    "path": "lonestar/mining/cpu/frequent-subgraph-mining/CMakeLists.txt",
    "content": "add_executable(frequent-subgraph-mining-cpu fsm.cpp)\nadd_dependencies(apps frequent-subgraph-mining-cpu)\ntarget_link_libraries(frequent-subgraph-mining-cpu PRIVATE Galois::pangolin miningbench)\ninstall(TARGETS frequent-subgraph-mining-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_mine(small1 frequent-subgraph-mining-cpu -symmetricGraph -simpleGraph \"${BASEINPUT}/Mining/citeseer.sadj\" \"-ft=adj\" NOT_QUICK)\n"
  },
  {
    "path": "lonestar/mining/cpu/frequent-subgraph-mining/README.md",
    "content": "Frequent Subgraph Mining\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis application does frequent subgraph mining in a graph using BFS \nexpansion. It uses the bliss library [1][2] for graph isomorphism check.\n\n[1] Bliss: A tool for computing automorphism groups and canonical \nlabelings of graphs. http://www.tcs.hut.fi/Software/bliss/, 2017.\n[2] Tommi Junttila and Petteri Kaski. 2007. Engineering an efficient \ncanonical labeling tool for large and sparse graphs. In Proceedings \nof the Meeting on Algorithm Engineering & Experiments, 135-149.\n\nINPUT\n--------------------------------------------------------------------------------\n\nWe support the following input graph formats: **txt**, **adj**.\n\nWe mostly use **adj** format as it is also used by Arabesque and RStream.\nThe **adj** format takes as input graphs with the following formats:\n\n* **Labels on vertices (default)**\n```\n# <num vertices> <num edges>\n<vertex id> <vertex label> [<neighbour id1> <neighbour id2> ... <neighbour id n>]\n<vertex id> <vertex label> [<neighbour id1> <neighbour id2> ... <neighbour id n>]\n...\n```\n\nWe currently do not support graph labels on edges.\nVertex ids are expected to be sequential integers from 0 to (total number of vertices - 1).\n\nThis application takes in symmetric and simple graphs.\nYou must specify both the -symmetricGraph and the -simpleGraph flags when\nrunning this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/mining/cpu/frequent-subgraph-mining; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following is an example command line.\n\n- `$ ./frequent-subgraph-mining-cpu <path-to-graph> -symmetricGraph -simpleGraph -k=3 -minsup=300 -t 40`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nPlease see details in the paper.\n\n"
  },
  {
    "path": "lonestar/mining/cpu/frequent-subgraph-mining/fsm.cpp",
    "content": "#include \"MiningBench/Start.h\"\n#include \"pangolin/BfsMining/edge_miner.h\"\n\nconst char* name = \"FSM\";\nconst char* desc = \"Frequent subgraph mining in a graph using BFS extension\";\nconst char* url  = nullptr;\n\n#include \"pangolin/BfsMining/edge_miner_api.h\"\nclass MyAPI : public EdgeMinerAPI<EdgeEmbedding> {\npublic:\n};\n\nclass AppMiner : public EdgeMiner<LabeledElement, EdgeEmbedding, MyAPI, true> {\npublic:\n  AppMiner(unsigned ms, int nt)\n      : EdgeMiner<LabeledElement, EdgeEmbedding, MyAPI, true>(ms, nt) {\n    if (ms <= 1) {\n      printf(\"ERROR: command line argument k must be 2 or greater\\n\");\n      exit(1);\n    }\n    if (filetype == \"gr\") {\n      printf(\"ERROR: gr file is not acceptable for FSM. Add -ft=adj and use \"\n             \"adj file instead.\\n\");\n      exit(1);\n    }\n    set_threshold(minsup);\n    total_num = 0;\n  }\n  ~AppMiner() {}\n  void print_output() {\n    std::cout << \"\\n\\ttotal_num_frequent_patterns = \" << this->total_num\n              << \"\\n\";\n  }\n};\n\n#include \"pangolin/BfsMining/engine.h\"\n"
  },
  {
    "path": "lonestar/mining/cpu/frequent-subgraph-mining/fsm.h",
    "content": "#pragma once\n#include <string>\n#include <iostream>\n#include \"pangolin/types.cuh\"\n\nvoid fsm_gpu_solver(std::string fname, unsigned k, unsigned minsup,\n                    AccType& total);\n"
  },
  {
    "path": "lonestar/mining/cpu/k-clique-listing/CMakeLists.txt",
    "content": "add_executable(k-clique-listing-cpu kcl.cpp)\nadd_dependencies(apps k-clique-listing-cpu)\ntarget_link_libraries(k-clique-listing-cpu PRIVATE Galois::pangolin miningbench)\ninstall(TARGETS k-clique-listing-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_mine(small1 k-clique-listing-cpu -symmetricGraph -simpleGraph \"${BASEINPUT}/Mining/citeseer.csgr\" NOT_QUICK)\n"
  },
  {
    "path": "lonestar/mining/cpu/k-clique-listing/README.md",
    "content": "k-Clique Listing\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis application counts the k-Cliques in a graph. \n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric and simple Galois .gr graphs.\nYou must specify both the -symmetricGraph and the -simpleGraph flags when\nrunning this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/mining/cpu/k-clique-listing; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following is an example command line.\n\n- `$ ./k-clique-listing-cpu -symmetricGraph -simpleGraph <path-to-graph> -k=3 -t 40`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nPlease see details in the paper.\n\n"
  },
  {
    "path": "lonestar/mining/cpu/k-clique-listing/kcl.cpp",
    "content": "#include \"MiningBench/Start.h\"\n#include \"pangolin/BfsMining/vertex_miner.h\"\n\nconst char* name = \"Kcl\";\nconst char* desc = \"Listing cliques of size k in a graph using BFS extension\";\nconst char* url  = nullptr;\n\n#include \"pangolin/BfsMining/vertex_miner_api.h\"\nclass MyAPI : public VertexMinerAPI<BaseEmbedding> {\npublic:\n  // toExtend (only extend the last vertex in the embedding)\n  static bool toExtend(unsigned n, const BaseEmbedding&, unsigned pos) {\n    return pos == n - 1;\n  }\n  // toAdd (only add vertex connected to all the vertices in the embedding)\n  static bool toAdd(unsigned n, PangolinGraph& g, const BaseEmbedding& emb,\n                    unsigned, VertexId dst) {\n    return is_all_connected_dag(g, dst, emb, n - 1);\n  }\n};\n\nclass AppMiner : public VertexMiner<SimpleElement, BaseEmbedding, MyAPI, true> {\npublic:\n  AppMiner(unsigned ms, int nt)\n      : VertexMiner<SimpleElement, BaseEmbedding, MyAPI, true>(ms, nt,\n                                                               nblocks) {\n    if (ms <= 2) {\n      printf(\"ERROR: command line argument k must be 3 or greater\\n\");\n      exit(1);\n    }\n    set_num_patterns(1);\n  }\n  ~AppMiner() {}\n  void print_output() {\n    std::cout << \"\\n\\ttotal_num_cliques = \" << get_total_count() << \"\\n\";\n  }\n};\n\n#include \"pangolin/BfsMining/engine.h\"\n"
  },
  {
    "path": "lonestar/mining/cpu/k-clique-listing/kcl.h",
    "content": "#pragma once\n#include <string>\n#include <iostream>\n#include \"pangolin/types.cuh\"\n\nvoid kcl_gpu_solver(std::string filename, unsigned k, AccType& total,\n                    size_t N_CHUNK = 1);\n"
  },
  {
    "path": "lonestar/mining/cpu/motif-counting/CMakeLists.txt",
    "content": "add_executable(motif-counting-cpu motif.cpp)\nadd_dependencies(apps motif-counting-cpu)\ntarget_link_libraries(motif-counting-cpu PRIVATE Galois::pangolin miningbench)\ninstall(TARGETS motif-counting-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_mine(small1 motif-counting-cpu -symmetricGraph -simpleGraph \"${BASEINPUT}/Mining/citeseer.csgr\" NOT_QUICK)\n"
  },
  {
    "path": "lonestar/mining/cpu/motif-counting/README.md",
    "content": "Motif Counting\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis application counts the motifs in a graph using BFS \nexpansion. It uses the bliss library [1][2] for graph isomorphism tests.\n\n[1] Bliss: A tool for computing automorphism groups and canonical \nlabelings of graphs. http://www.tcs.hut.fi/Software/bliss/, 2017.\n[2] Tommi Junttila and Petteri Kaski. 2007. Engineering an efficient \ncanonical labeling tool for large and sparse graphs. In Proceedings \nof the Meeting on Algorithm Engineering & Experiments, 135-149.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric and simple Galois .gr graphs.\nYou must specify both the -symmetricGraph and the -simpleGraph flags when\nrunning this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/mining/cpu/motif-counting; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following is an example command line.\n\n-`$ ./motif-counting-cpu -symmetricGraph -simpleGraph <path-to-graph> -k=3 -t 28`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nPlease see details in the paper.\n\n"
  },
  {
    "path": "lonestar/mining/cpu/motif-counting/motif.cpp",
    "content": "#include \"MiningBench/Start.h\"\n#include \"pangolin/BfsMining/vertex_miner.h\"\n\nconst char* name = \"Motif Counting\";\nconst char* desc =\n    \"Counts the vertex-induced motifs in a graph using BFS extension\";\nconst char* url     = nullptr;\nint num_patterns[3] = {2, 6, 21};\n\n#include \"pangolin/BfsMining/vertex_miner_api.h\"\nclass MyAPI : public VertexMinerAPI<VertexEmbedding> {\npublic:\n  // customized pattern classification method\n  static unsigned getPattern(unsigned n, PangolinGraph& g, unsigned i,\n                             VertexId dst, const VertexEmbedding& emb,\n                             BYTE* pre_pid, unsigned pos) {\n    return find_motif_pattern_id(n, g, i, dst, emb, pre_pid, pos);\n  }\n};\n\nclass AppMiner : public VertexMiner<SimpleElement, VertexEmbedding, MyAPI,\n                                    false, false, true> {\npublic:\n  AppMiner(unsigned ms, int nt)\n      : VertexMiner<SimpleElement, VertexEmbedding, MyAPI, false, false, true>(\n            ms, nt, nblocks) {\n    if (ms <= 2) {\n      printf(\"ERROR: command line argument k must be 3 or greater\\n\");\n      exit(1);\n    }\n    set_num_patterns(num_patterns[k - 3]);\n  }\n  ~AppMiner() {}\n  void print_output() { printout_motifs(); }\n};\n\n#include \"pangolin/BfsMining/engine.h\"\n"
  },
  {
    "path": "lonestar/mining/cpu/motif-counting/motif.h",
    "content": "#pragma once\n#include <vector>\n#include <string>\n#include <iostream>\n#include \"pangolin/types.cuh\"\n\nvoid motif_gpu_solver(std::string fname, unsigned k, std::vector<AccType>& acc,\n                      size_t N_CHUNK = 1);\n"
  },
  {
    "path": "lonestar/mining/cpu/subgraph-listing/CMakeLists.txt",
    "content": "add_executable(sgl_cycle sgl_cycle.cpp)\nadd_executable(sgl_diamond sgl_diamond.cpp)\nadd_dependencies(apps sgl_cycle)\nadd_dependencies(apps sgl_diamond)\ntarget_link_libraries(sgl_cycle PRIVATE Galois::pangolin miningbench)\ntarget_link_libraries(sgl_diamond PRIVATE Galois::pangolin miningbench)\ninstall(TARGETS sgl_cycle DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\ninstall(TARGETS sgl_diamond DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n"
  },
  {
    "path": "lonestar/mining/cpu/subgraph-listing/README.mb",
    "content": "Subgraph Listing\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis application counts the occurrences of a given subgraph in a graph. \nCurrently only two patterns are supported: diamond and 4-cycle.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric and simple Galois .gr graphs.\nYou must specify both the -symmetricGraph and the -simpleGraph flags when\nrunning this benchmark.\nYou must also specify the query graph (i.e. pattern) using -p.\nCurrently you need to pass the 4-cycle and diamond query graphs\nto the sgl_cycle and sgl_diamond executables respectively.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/mining/cpu/subgraph-listing; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following is an example command line.\n\n-`$ ./sgl_cycle -symmetricGraph -simpleGraph <path-to-graph> -k 4 -p query/p0.graph -t 16`\n-`$ ./sgl_diamond -symmetricGraph -simpleGraph <path-to-graph> -k 4 -p query/p1.graph -t 16`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nPlease see details in the paper.\n\n"
  },
  {
    "path": "lonestar/mining/cpu/subgraph-listing/sgl_cycle.cpp",
    "content": "#include \"MiningBench/Start.h\"\n#include \"pangolin/BfsMining/vertex_miner.h\"\n\nconst char* name = \"sgl\";\nconst char* desc = \"listing edge-induced subgraphs of a given pattern in a \"\n                   \"graph using bfs extension\";\nconst char* url = nullptr;\n\n#include \"pangolin/BfsMining/vertex_miner_api.h\"\nclass MyAPI : public VertexMinerAPI<BaseEmbedding> {\npublic:\n  // matching order of the pattern\n  static inline unsigned getExtendableVertex(unsigned n) {\n    if (n == 2)\n      return 0;   // u2 extended from u0\n    return n - 1; // u[i] extended from u[i-1]\n  }\n\n  static inline bool toAdd(unsigned n, PangolinGraph& g,\n                           const BaseEmbedding& emb, unsigned, VertexId dst) {\n    // std::cout << \"\\t emb: \" << emb << \", dst=\" << dst << \"\\n\";\n    if (n == 3) {\n      if (dst <= emb.get_vertex(0))\n        return false;\n      if (!is_connected(g, dst, emb.get_vertex(1)))\n        return false;\n    } else {\n      if (dst <= emb.get_vertex(n - 1))\n        return false;\n    }\n    // if (g.get_degree(dst) < pattern.get_degree(n)) return false;\n    // for (unsigned i = 1; i < n; ++i)\n    //  if (dst == emb.get_vertex(i)) return false;\n\n    // u3 (extended from u2) connected to u1\n    return true;\n  }\n};\n\nclass AppMiner\n    : public VertexMiner<SimpleElement, BaseEmbedding, MyAPI, 0, 1, 0, 1> {\npublic:\n  AppMiner(unsigned ms, int nt)\n      : VertexMiner<SimpleElement, BaseEmbedding, MyAPI, 0, 1, 0, 1>(ms, nt,\n                                                                     nblocks) {}\n  ~AppMiner() {}\n  void print_output() {\n    std::cout << \"\\n\\ttotal_num_subgraphs = \" << get_total_count() << \"\\n\";\n  }\n};\n\n#include \"pangolin/BfsMining/engine.h\"\n"
  },
  {
    "path": "lonestar/mining/cpu/subgraph-listing/sgl_diamond.cpp",
    "content": "#include \"MiningBench/Start.h\"\n#include \"pangolin/BfsMining/vertex_miner.h\"\n\nconst char* name = \"sgl\";\nconst char* desc = \"listing edge-induced subgraphs of a given pattern in a \"\n                   \"graph using bfs extension\";\nconst char* url = nullptr;\n\n#include \"pangolin/BfsMining/vertex_miner_api.h\"\nclass MyAPI : public VertexMinerAPI<BaseEmbedding> {\npublic:\n  // matching order of the pattern\n  static inline unsigned getExtendableVertex(unsigned n) {\n    if (n == 3)\n      return 1;   // u3 extended from u1\n    return n - 1; // u[i] extended from u[i-1]\n  }\n\n  static inline bool toAdd(unsigned n, PangolinGraph& g,\n                           const BaseEmbedding& emb, unsigned, VertexId dst) {\n    // std::cout << \"\\t emb: \" << emb << \", dst=\" << dst << \", pos=\" << pos <<\n    // \"\\n\";\n    // u3 > u2\n    if (n == 3) {\n      if (dst <= emb.get_vertex(2))\n        return false;\n    }\n    // both u2 and u3 (extended from u1) connected to u0\n    if (!is_connected(g, dst, emb.get_vertex(0)))\n      return false;\n    return true;\n  }\n};\n\nclass AppMiner\n    : public VertexMiner<SimpleElement, BaseEmbedding, MyAPI, 0, 1, 0, 1> {\npublic:\n  AppMiner(unsigned ms, int nt)\n      : VertexMiner<SimpleElement, BaseEmbedding, MyAPI, 0, 1, 0, 1>(ms, nt,\n                                                                     nblocks) {}\n  ~AppMiner() {}\n  void print_output() {\n    std::cout << \"\\n\\ttotal_num_subgraphs = \" << get_total_count() << \"\\n\";\n  }\n};\n\n#include \"pangolin/BfsMining/engine.h\"\n"
  },
  {
    "path": "lonestar/mining/cpu/triangle-counting/CMakeLists.txt",
    "content": "add_executable(triangle-counting-mining-cpu tc_mine.cpp)\nadd_dependencies(apps triangle-counting-mining-cpu)\ntarget_link_libraries(triangle-counting-mining-cpu PRIVATE Galois::pangolin miningbench)\ninstall(TARGETS triangle-counting-mining-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_test_mine(small1 triangle-counting-mining-cpu -symmetricGraph -simpleGraph \"${BASEINPUT}/Mining/citeseer.csgr\" NOT_QUICK)\n"
  },
  {
    "path": "lonestar/mining/cpu/triangle-counting/README.md",
    "content": "Triangle Counting\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program counts the number of triangles in a given undirected graph.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric and simple Galois .gr graphs.\nYou must specify both the -symmetricGraph and the -simpleGraph flags when\nrunning this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/mining/cpu/triangle-counting; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following is an example command line.\n\n-`$ ./triangle-counting-mining-cpu -symmetricGraph -simpleGraph <path-to-graph> -k=3 -t 40`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nPlease see details in the paper.\n"
  },
  {
    "path": "lonestar/mining/cpu/triangle-counting/tc.h",
    "content": "#pragma once\n#include <string>\n#include <iostream>\n#include \"pangolin/types.cuh\"\n\nvoid tc_gpu_solver(std::string filename, AccType& total, size_t N_CHUNK = 1);\n"
  },
  {
    "path": "lonestar/mining/cpu/triangle-counting/tc_mine.cpp",
    "content": "#include \"MiningBench/Start.h\"\n#include \"pangolin/BfsMining/vertex_miner.h\"\n#define TRIANGLE\n\nconst char* name = \"TC\";\nconst char* desc =\n    \"Counts the triangles in a graph (inputs do NOT need to be symmetrized)\";\nconst char* url = nullptr;\n\n#include \"pangolin/BfsMining/vertex_miner_api.h\"\nclass MyAPI : public VertexMinerAPI<BaseEmbedding> {\npublic:\n  // toExtend (only extend the last vertex in the embedding)\n  static bool toExtend(unsigned n, const BaseEmbedding&, unsigned pos) {\n    return pos == n - 1;\n  }\n  // toAdd (only add vertex connected to all the vertices in the embedding)\n  static bool toAdd(unsigned, PangolinGraph&, const BaseEmbedding&, unsigned,\n                    VertexId) {\n    return true;\n  }\n};\n\nclass AppMiner : public VertexMiner<SimpleElement, BaseEmbedding, MyAPI, true> {\npublic:\n  AppMiner(unsigned ms, int nt)\n      : VertexMiner<SimpleElement, BaseEmbedding, MyAPI, true>(ms, nt,\n                                                               nblocks) {\n    if (ms != 3) {\n      printf(\"ERROR: command line argument k must be 3\\n\");\n      exit(1);\n    }\n    set_num_patterns(1);\n  }\n  ~AppMiner() {}\n  void print_output() {\n    std::cout << \"\\n\\ttotal_num_triangles = \" << get_total_count() << \"\\n\";\n  }\n};\n#include \"pangolin/BfsMining/engine.h\"\n"
  },
  {
    "path": "lonestar/mining/gpu/CMakeLists.txt",
    "content": "include_directories(include)\n\nfunction(add_test_mine type app)\n  set(options NOT_QUICK)\n  set(one_value_args)\n  set(multi_value_args REQUIRES COMMAND_PREFIX)\n  cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n\n  set(threads)\n  set(thr \"${GALOIS_NUM_TEST_THREADS}\")\n  while (${thr} GREATER 1)\n    list(APPEND threads ${thr})\n    math(EXPR thr \"${thr} / 2\")\n  endwhile()\n  list(APPEND threads \"1\")\n\n  foreach (thr ${threads})\n    set(name run-${type}-${app}-${thr})\n    add_test(NAME ${name} COMMAND ${app} ${X_UNPARSED_ARGUMENTS} -t ${thr})\n    if (NOT ${X_NOT_QUICK})\n      # Allow parallel tests\n      set_tests_properties(${name}\n        PROPERTIES ENVIRONMENT GALOIS_DO_NOT_BIND_THREADS=1 LABELS quick)\n    endif()\n  endforeach()\nendfunction(add_test_mine)\n\nadd_subdirectory(frequent-subgraph-mining)\nadd_subdirectory(k-clique-listing)\nadd_subdirectory(motif-counting)\nadd_subdirectory(triangle-counting)\n"
  },
  {
    "path": "lonestar/mining/gpu/frequent-subgraph-mining/CMakeLists.txt",
    "content": "add_executable(frequent-subgraph-mining-gpu fsm_gpu.cpp fsm.cu)\nadd_dependencies(apps frequent-subgraph-mining-gpu)\ntarget_link_libraries(frequent-subgraph-mining-gpu PRIVATE Galois::pangolin_gpu miningbench_gpu)\ntarget_compile_definitions(frequent-subgraph-mining-gpu PRIVATE GALOIS_ENABLE_GPU=1)\nset_property(TARGET frequent-subgraph-mining-gpu PROPERTY CUDA_STANDARD 14)\ninstall(TARGETS frequent-subgraph-mining-gpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_mine(small1 frequent-subgraph-mining-gpu -symmetricGraph -simpleGraph \"${BASEINPUT}/Mining/citeseer.sadj\" \"-ft=adj\")\n"
  },
  {
    "path": "lonestar/mining/gpu/frequent-subgraph-mining/README.md",
    "content": "Frequent Subgraph Mining\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis application does frequent subgraph mining in a graph using BFS \nexpansion. It uses the bliss library [1][2] for graph isomorphism check.\n\n[1] Bliss: A tool for computing automorphism groups and canonical \nlabelings of graphs. http://www.tcs.hut.fi/Software/bliss/, 2017.\n[2] Tommi Junttila and Petteri Kaski. 2007. Engineering an efficient \ncanonical labeling tool for large and sparse graphs. In Proceedings \nof the Meeting on Algorithm Engineering & Experiments, 135-149.\n\nINPUT\n--------------------------------------------------------------------------------\n\nWe support the following input graph formats: **txt**, **adj**.\n\nWe mostly use **adj** format as also used by Arabesque and RStream.\nThe **adj** format takes as input graphs with the following formats:\n\n* **Labels on vertices (default)**\n```\n# <num vertices> <num edges>\n<vertex id> <vertex label> [<neighbour id1> <neighbour id2> ... <neighbour id n>]\n<vertex id> <vertex label> [<neighbour id1> <neighbour id2> ... <neighbour id n>]\n...\n```\n\nWe currently do not support graph labels on edges.\nVertex ids are expected to be sequential integers from 0 to (total number of vertices - 1).\n\nThis application takes in symmetric and simple graphs.\nYou must specify both the -symmetricGraph and the -simpleGraph flags when\nrunning this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/mining/gpu/frequent-subgraph-mining; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following is an example command line.\n\n-`$ ./frequent-subgraph-mining-gpu -symmetricGraph -simpleGraph <path-to-graph> -k=3 -minsup=300 -t 40`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nPlease see details in the paper.\n\n"
  },
  {
    "path": "lonestar/mining/gpu/frequent-subgraph-mining/fsm.cu",
    "content": "// Copyright (c) 2019, Xuhao Chen\n#include \"fsm.h\"\n#include \"pangolin/timer.h\"\n#include \"pangolin/cutils.h\"\n#define USE_PID\n#define USE_DOMAIN\n#define EDGE_INDUCED\n#define ENABLE_LABEL\n#include <cub/cub.cuh>\n#include \"pangolin/miner.cuh\"\n#include \"pangolin/bitsets.h\"\n#include <thrust/scan.h>\n#include <thrust/extrema.h>\n#include <thrust/execution_policy.h>\n#define MAX_NUM_PATTERNS 21251\n\nstruct OrderedEdge {\n  IndexT src;\n  IndexT dst;\n};\n\ninline __device__ int get_init_pattern_id(node_data_type src_label,\n                                          node_data_type dst_label,\n                                          int nlabels) {\n  return (int)src_label * nlabels + (int)dst_label;\n}\n\ninline __device__ unsigned get_pattern_id(node_data_type label0,\n                                          node_data_type label1,\n                                          node_data_type label2, int nlabels) {\n  return nlabels * (nlabels * label0 + label1) + label2;\n}\n\ninline __device__ bool is_quick_automorphism(unsigned size, IndexT* vids,\n                                             history_type his2,\n                                             history_type his, IndexT src,\n                                             IndexT dst) {\n  if (dst <= vids[0])\n    return true;\n  if (dst == vids[1])\n    return true;\n  if (his == 0 && dst < vids[1])\n    return true;\n  if (size == 2) {\n  } else if (size == 3) {\n    if (his == 0 && his2 == 0 && dst <= vids[2])\n      return true;\n    if (his == 0 && his2 == 1 && dst == vids[2])\n      return true;\n    if (his == 1 && his2 == 1 && dst <= vids[2])\n      return true;\n  } else {\n  }\n  return false;\n}\n\ninline __device__ void swap(IndexT first, IndexT second) {\n  if (first > second) {\n    IndexT tmp = first;\n    first      = second;\n    second     = tmp;\n  }\n}\n\ninline __device__ int compare(OrderedEdge oneEdge, OrderedEdge otherEdge) {\n  swap(oneEdge.src, 
oneEdge.dst);\n  swap(otherEdge.src, otherEdge.dst);\n  if (oneEdge.src == otherEdge.src)\n    return oneEdge.dst - otherEdge.dst;\n  else\n    return oneEdge.src - otherEdge.src;\n}\n\ninline __device__ bool is_edge_automorphism(unsigned size, IndexT* vids,\n                                            history_type* hiss,\n                                            history_type his, IndexT src,\n                                            IndexT dst) {\n  if (size < 3)\n    return is_quick_automorphism(size, vids, hiss[2], his, src, dst);\n  if (dst <= vids[0])\n    return true;\n  if (his == 0 && dst <= vids[1])\n    return true;\n  if (dst == vids[hiss[his]])\n    return true;\n  OrderedEdge added_edge;\n  added_edge.src = src;\n  added_edge.dst = dst;\n  for (unsigned index = his + 1; index < size; ++index) {\n    OrderedEdge edge;\n    edge.src = vids[hiss[index]];\n    edge.dst = vids[index];\n    int cmp  = compare(added_edge, edge);\n    if (cmp <= 0)\n      return true;\n  }\n  return false;\n}\n\n__global__ void extend_alloc(unsigned m, unsigned level, CSRGraph graph,\n                             EmbeddingList emb_list, IndexT* num_new_emb) {\n  unsigned tid = threadIdx.x;\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  __shared__ IndexT vid[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n  __shared__ history_type his[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n  if (pos < m) {\n    emb_list.get_edge_embedding(level, pos, vid[tid], his[tid]);\n    num_new_emb[pos] = 0;\n    // if (pos == 1) printf(\"src=%d, dst=%d\\n\", vid[tid][0], vid[tid][1]);\n    for (unsigned i = 0; i < level + 1; ++i) {\n      IndexT src       = vid[tid][i];\n      IndexT row_begin = graph.edge_begin(src);\n      IndexT row_end   = graph.edge_end(src);\n      for (IndexT e = row_begin; e < row_end; e++) {\n        IndexT dst = graph.getEdgeDst(e);\n        if (!is_edge_automorphism(level + 1, vid[tid], his[tid], i, src, dst))\n          num_new_emb[pos]++;\n      }\n    }\n  }\n}\n\n__global__ 
void extend_insert(unsigned m, unsigned level, CSRGraph graph,\n                              EmbeddingList emb_list, IndexT* indices) {\n  unsigned tid = threadIdx.x;\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  __shared__ IndexT vids[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n  __shared__ history_type his[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n  if (pos < m) {\n    emb_list.get_edge_embedding(level, pos, vids[tid], his[tid]);\n    IndexT start = indices[pos];\n    for (unsigned i = 0; i < level + 1; ++i) {\n      IndexT src       = vids[tid][i];\n      IndexT row_begin = graph.edge_begin(src);\n      IndexT row_end   = graph.edge_end(src);\n      for (IndexT e = row_begin; e < row_end; e++) {\n        IndexT dst = graph.getEdgeDst(e);\n        if (!is_edge_automorphism(level + 1, vids[tid], his[tid], i, src,\n                                  dst)) {\n          emb_list.set_idx(level + 1, start, pos);\n          emb_list.set_his(level + 1, start, i);\n          emb_list.set_vid(level + 1, start++, dst);\n        }\n      }\n    }\n  }\n}\n\n__global__ void init_aggregate(unsigned m, unsigned num_emb, CSRGraph graph,\n                               EmbeddingList emb_list, unsigned* pids,\n                               int nlabels, unsigned threshold,\n                               Bitsets small_sets, Bitsets large_sets) {\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  if (pos < num_emb) {\n    IndexT src               = emb_list.get_idx(1, pos);\n    IndexT dst               = emb_list.get_vid(1, pos);\n    node_data_type src_label = graph.getData(src);\n    node_data_type dst_label = graph.getData(dst);\n    int pid                  = 0;\n    if (src_label <= dst_label)\n      pid = get_init_pattern_id(src_label, dst_label, nlabels);\n    else\n      pid = get_init_pattern_id(dst_label, src_label, nlabels);\n    pids[pos] = pid;\n    if (src_label < dst_label) {\n      small_sets.set(pid, src);\n      large_sets.set(pid, dst);\n    } else if 
(src_label > dst_label) {\n      small_sets.set(pid, dst);\n      large_sets.set(pid, src);\n    } else {\n      small_sets.set(pid, src);\n      small_sets.set(pid, dst);\n      large_sets.set(pid, src);\n      large_sets.set(pid, dst);\n    }\n  }\n}\n\n__global__ void count_ones(int id, Bitsets sets, int* count) {\n  typedef cub::BlockReduce<int, BLOCK_SIZE> BlockReduce;\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  __shared__ typename BlockReduce::TempStorage temp_storage;\n  int num = 0;\n  if (pos < sets.vec_size())\n    num = sets.count_num_ones(id, pos);\n  int block_total = BlockReduce(temp_storage).Sum(num);\n  if (threadIdx.x == 0)\n    atomicAdd(count, block_total);\n}\n\nint init_support_count(unsigned m, int npatterns, unsigned threshold,\n                       Bitsets small_sets, Bitsets large_sets,\n                       bool* init_support_map) {\n  int num_freq_patterns = 0;\n  for (int i = 0; i < npatterns; i++) {\n    int a, b, *d_count;\n    CUDA_SAFE_CALL(cudaMalloc((void**)&d_count, sizeof(int)));\n    CUDA_SAFE_CALL(cudaMemset(d_count, 0, sizeof(int)));\n    count_ones<<<(m - 1) / 256 + 1, 256>>>(i, small_sets, d_count);\n    CudaTest(\"solving count_ones `failed\");\n    CUDA_SAFE_CALL(\n        cudaMemcpy(&a, d_count, sizeof(int), cudaMemcpyDeviceToHost));\n    CUDA_SAFE_CALL(cudaMemset(d_count, 0, sizeof(int)));\n    count_ones<<<(m - 1) / 256 + 1, 256>>>(i, large_sets, d_count);\n    CUDA_SAFE_CALL(\n        cudaMemcpy(&b, d_count, sizeof(int), cudaMemcpyDeviceToHost));\n    unsigned support = a < b ? 
a : b;\n    if (support >= threshold) {\n      init_support_map[i] = 1;\n      num_freq_patterns++;\n    } else\n      init_support_map[i] = 0;\n  }\n  return num_freq_patterns;\n}\n\nint support_count(unsigned m, unsigned npatterns, unsigned threshold,\n                  Bitsets small_sets, Bitsets middle_sets, Bitsets large_sets,\n                  bool* support_map) {\n  int num_freq_patterns = 0;\n  for (int i = 0; i < npatterns; i++) {\n    int a, b, c, *d_count;\n    CUDA_SAFE_CALL(cudaMalloc((void**)&d_count, sizeof(int)));\n    CUDA_SAFE_CALL(cudaMemset(d_count, 0, sizeof(int)));\n    count_ones<<<(m - 1) / 256 + 1, 256>>>(i, small_sets, d_count);\n    CUDA_SAFE_CALL(\n        cudaMemcpy(&a, d_count, sizeof(int), cudaMemcpyDeviceToHost));\n    CUDA_SAFE_CALL(cudaMemset(d_count, 0, sizeof(int)));\n    count_ones<<<(m - 1) / 256 + 1, 256>>>(i, large_sets, d_count);\n    CUDA_SAFE_CALL(\n        cudaMemcpy(&b, d_count, sizeof(int), cudaMemcpyDeviceToHost));\n    CUDA_SAFE_CALL(cudaMemset(d_count, 0, sizeof(int)));\n    count_ones<<<(m - 1) / 256 + 1, 256>>>(i, middle_sets, d_count);\n    CUDA_SAFE_CALL(\n        cudaMemcpy(&c, d_count, sizeof(int), cudaMemcpyDeviceToHost));\n    unsigned small   = a < b ? a : b;\n    unsigned support = small < c ? 
small : c;\n    if (support >= threshold) {\n      support_map[i] = 1;\n      num_freq_patterns++;\n    } else\n      support_map[i] = 0;\n  }\n  return num_freq_patterns;\n}\n\n__global__ void init_filter_check(unsigned m, unsigned* pids,\n                                  bool* init_support_map,\n                                  IndexT* is_frequent_emb) {\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  if (pos < m) {\n    unsigned pid     = pids[pos];\n    bool is_frequent = init_support_map[pid];\n    if (is_frequent)\n      is_frequent_emb[pos] = 1;\n  }\n}\n\n__global__ void copy_vids(unsigned m, EmbeddingList emb_list, IndexT* vid_list0,\n                          IndexT* vid_list1) {\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  if (pos < m) {\n    vid_list0[pos] = emb_list.get_idx(1, pos);\n    vid_list1[pos] = emb_list.get_vid(1, pos);\n  }\n}\n\n__global__ void init_filter(unsigned m, EmbeddingList emb_list,\n                            IndexT* vid_list0, IndexT* vid_list1,\n                            IndexT* indices, IndexT* is_frequent_emb) {\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  if (pos < m) {\n    if (is_frequent_emb[pos]) {\n      IndexT src     = vid_list0[pos];\n      IndexT dst     = vid_list1[pos];\n      unsigned start = indices[pos];\n      emb_list.set_vid(1, start, dst);\n      emb_list.set_idx(1, start, src);\n    }\n  }\n}\n\n__global__ void aggregate_check(unsigned num_emb, unsigned level,\n                                CSRGraph graph, EmbeddingList emb_list,\n                                unsigned* pids, int nlabels, unsigned threshold,\n                                unsigned* ne) {\n  unsigned tid = threadIdx.x;\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  __shared__ IndexT vids[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n  __shared__ history_type his[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n  if (pos < num_emb) {\n    emb_list.get_edge_embedding(level, pos, vids[tid], his[tid]);\n    
unsigned n = level + 1;\n    assert(n < 4);\n    IndexT first      = vids[tid][0];\n    IndexT second     = vids[tid][1];\n    IndexT third      = vids[tid][2];\n    node_data_type l0 = graph.getData(first);\n    node_data_type l1 = graph.getData(second);\n    node_data_type l2 = graph.getData(third);\n    history_type h2   = his[tid][2];\n    unsigned pid      = 0;\n    if (n == 3) {\n      if (h2 == 0) {\n        if (l1 < l2) {\n          pid = get_pattern_id(l0, l2, l1, nlabels);\n        } else {\n          pid = get_pattern_id(l0, l1, l2, nlabels);\n        }\n      } else {\n        assert(h2 == 1);\n        if (l0 < l2) {\n          pid = get_pattern_id(l1, l2, l0, nlabels);\n        } else {\n          pid = get_pattern_id(l1, l0, l2, nlabels);\n        }\n      }\n    } else {\n    }\n    pids[pos] = pid;\n    atomicAdd(&ne[pid], 1);\n  }\n}\n\n__global__ void find_candidate_patterns(unsigned num_patterns, unsigned* ne,\n                                        unsigned minsup, unsigned* id_map,\n                                        unsigned* num_new_patterns) {\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  if (pos < num_patterns) {\n    if (ne[pos] >= minsup) {\n      unsigned new_id = atomicAdd(num_new_patterns, 1);\n      id_map[pos]     = new_id;\n    }\n  }\n}\n\n__global__ void aggregate(unsigned m, unsigned num_emb, unsigned level,\n                          CSRGraph graph, EmbeddingList emb_list,\n                          unsigned* pids, unsigned* ne, unsigned* id_map,\n                          int nlabels, unsigned threshold, Bitsets small_sets,\n                          Bitsets middle_sets, Bitsets large_sets) {\n  unsigned tid = threadIdx.x;\n  unsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n  __shared__ IndexT vids[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n  __shared__ history_type his[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n  if (pos < num_emb) {\n    emb_list.get_edge_embedding(level, pos, vids[tid], his[tid]);\n    assert(level == 
2);\n    IndexT first      = vids[tid][0];\n    IndexT second     = vids[tid][1];\n    IndexT third      = vids[tid][2];\n    node_data_type l0 = graph.getData(first);\n    node_data_type l1 = graph.getData(second);\n    node_data_type l2 = graph.getData(third);\n    history_type h2   = his[tid][2];\n    IndexT small, middle, large;\n    unsigned pid = pids[pos];\n    if (ne[pid] >= threshold) {\n      pid = id_map[pid];\n      if (h2 == 0) {\n        middle = first;\n        if (l1 < l2) {\n          small = second;\n          large = third;\n        } else {\n          small = third;\n          large = second;\n        }\n        small_sets.set(pid, small);\n        middle_sets.set(pid, middle);\n        large_sets.set(pid, large);\n        if (l1 == l2) {\n          small_sets.set(pid, large);\n          large_sets.set(pid, small);\n        }\n      } else {\n        assert(h2 == 1);\n        middle = second;\n        if (l0 < l2) {\n          small = first;\n          large = third;\n        } else {\n          small = third;\n          large = first;\n        }\n        small_sets.set(pid, small);\n        middle_sets.set(pid, middle);\n        large_sets.set(pid, large);\n        if (l0 == l2) {\n          small_sets.set(pid, large);\n          large_sets.set(pid, small);\n        }\n      }\n    }\n  }\n}\n\nvoid parallel_prefix_sum(int n, IndexT* in, IndexT* out) {\n  IndexT total = 0;\n  for (size_t i = 0; i < n; i++) {\n    out[i] = total;\n    total += in[i];\n  }\n  out[n] = total;\n}\n\nvoid fsm_gpu_solver(std::string fname, unsigned k, unsigned minsup,\n                    AccType& total_num) {\n  CSRGraph graph_cpu, graph_gpu;\n  int nlabels = graph_cpu.read(fname); // read graph into CPU memoryA\n  int m       = graph_cpu.get_nnodes();\n  int nnz     = graph_cpu.get_nedges();\n  graph_cpu.copy_to_gpu(graph_gpu); // copy graph to GPU memory\n  EmbeddingList emb_list;\n  emb_list.init(nnz, k + 1, false);\n  emb_list.init_cpu(&graph_cpu);\n\n  int 
nthreads          = BLOCK_SIZE;\n  int nblocks           = DIVIDE_INTO(nnz, nthreads);\n  int num_init_patterns = (nlabels + 1) * (nlabels + 1);\n  std::cout << \"Number of init patterns: \" << num_init_patterns << std::endl;\n  unsigned num_emb = emb_list.size();\n  std::cout << \"number of single-edge embeddings: \" << num_emb << \"\\n\";\n  unsigned* pids;\n  CUDA_SAFE_CALL(cudaMalloc((void**)&pids, sizeof(unsigned) * num_emb));\n  bool* h_init_support_map = (bool*)malloc(sizeof(bool) * num_init_patterns);\n  bool* d_init_support_map;\n  CUDA_SAFE_CALL(cudaMalloc((void**)&d_init_support_map,\n                            sizeof(bool) * num_init_patterns));\n  IndexT* is_frequent_emb;\n  CUDA_SAFE_CALL(\n      cudaMalloc((void**)&is_frequent_emb, sizeof(IndexT) * (num_emb + 1)));\n  CUDA_SAFE_CALL(\n      cudaMemset(is_frequent_emb, 0, sizeof(IndexT) * (num_emb + 1)));\n  IndexT *vid_list0, *vid_list1;\n  CUDA_SAFE_CALL(cudaMalloc((void**)&vid_list0, sizeof(IndexT) * num_emb));\n  CUDA_SAFE_CALL(cudaMalloc((void**)&vid_list1, sizeof(IndexT) * num_emb));\n  Bitsets small_sets, large_sets, middle_sets;\n  small_sets.alloc(MAX_NUM_PATTERNS, m);\n  large_sets.alloc(MAX_NUM_PATTERNS, m);\n  middle_sets.alloc(MAX_NUM_PATTERNS, m);\n  small_sets.set_size(num_init_patterns, m);\n  large_sets.set_size(num_init_patterns, m);\n  middle_sets.set_size(num_init_patterns, m);\n\n  IndexT *num_new_emb, *indices;\n  CUDA_SAFE_CALL(cudaMalloc((void**)&indices, sizeof(IndexT) * (num_emb + 1)));\n  CUDA_SAFE_CALL(cudaDeviceSynchronize());\n  nblocks = (num_emb - 1) / nthreads + 1;\n  unsigned* d_num_new_patterns;\n  unsigned h_num_new_patterns = 0;\n  CUDA_SAFE_CALL(cudaMalloc((void**)&d_num_new_patterns, sizeof(unsigned)));\n  printf(\"Launching CUDA TC solver (%d CTAs, %d threads/CTA) ...\\n\", nblocks,\n         nthreads);\n\n  Timer t;\n  t.Start();\n  unsigned level = 1;\n  init_aggregate<<<nblocks, nthreads>>>(m, num_emb, graph_gpu, emb_list, pids,\n                             
           nlabels, minsup, small_sets,\n                                        large_sets);\n  CudaTest(\"solving init_aggregate `failed\");\n  std::cout << \"Init_aggregate Done\\n\";\n  int num_freq_patterns = init_support_count(\n      m, num_init_patterns, minsup, small_sets, large_sets, h_init_support_map);\n  total_num += num_freq_patterns;\n  if (num_freq_patterns == 0) {\n    std::cout << \"No frequent pattern found\\n\\n\";\n    return;\n  }\n  std::cout << \"Number of frequent single-edge patterns: \" << num_freq_patterns\n            << \"\\n\";\n  CUDA_SAFE_CALL(cudaMemcpy(d_init_support_map, h_init_support_map,\n                            sizeof(bool) * num_init_patterns,\n                            cudaMemcpyHostToDevice));\n  init_filter_check<<<nblocks, nthreads>>>(num_emb, pids, d_init_support_map,\n                                           is_frequent_emb);\n  CudaTest(\"solving init_filter_check `failed\");\n  thrust::exclusive_scan(thrust::device, is_frequent_emb,\n                         is_frequent_emb + num_emb + 1, indices);\n  IndexT new_size;\n  CUDA_SAFE_CALL(cudaMemcpy(&new_size, &indices[num_emb], sizeof(IndexT),\n                            cudaMemcpyDeviceToHost));\n  std::cout << \"number of embeddings after pruning: \" << new_size << \"\\n\";\n  copy_vids<<<nblocks, nthreads>>>(num_emb, emb_list, vid_list0, vid_list1);\n  CudaTest(\"solving copy_vids `failed\");\n  init_filter<<<nblocks, nthreads>>>(num_emb, emb_list, vid_list0, vid_list1,\n                                     indices, is_frequent_emb);\n  CudaTest(\"solving init_filter `failed\");\n  CUDA_SAFE_CALL(cudaFree(indices));\n  CUDA_SAFE_CALL(cudaFree(is_frequent_emb));\n  CUDA_SAFE_CALL(cudaFree(pids));\n  // small_sets.clean();\n  // large_sets.clean();\n  small_sets.clear();\n  large_sets.clear();\n  CUDA_SAFE_CALL(cudaFree(vid_list0));\n  CUDA_SAFE_CALL(cudaFree(vid_list1));\n  CUDA_SAFE_CALL(cudaFree(d_init_support_map));\n  emb_list.remove_tail(new_size);\n\n  
while (1) {\n    num_emb = emb_list.size();\n    std::cout << \"number of embeddings in level \" << level << \": \" << num_emb\n              << \"\\n\";\n    CUDA_SAFE_CALL(\n        cudaMalloc((void**)&num_new_emb, sizeof(IndexT) * (num_emb + 1)));\n    CUDA_SAFE_CALL(\n        cudaMalloc((void**)&indices, sizeof(IndexT) * (num_emb + 1)));\n    std::cout << \"Done allocating memory for embeddings in level \" << level\n              << \"\\n\";\n    nblocks = (num_emb - 1) / nthreads + 1;\n    extend_alloc<<<nblocks, nthreads>>>(num_emb, level, graph_gpu, emb_list,\n                                        num_new_emb);\n    CudaTest(\"solving extend_alloc failed\");\n    thrust::exclusive_scan(thrust::device, num_new_emb,\n                           num_new_emb + num_emb + 1, indices);\n    CudaTest(\"Scan failed\");\n    CUDA_SAFE_CALL(cudaMemcpy(&new_size, &indices[num_emb], sizeof(IndexT),\n                              cudaMemcpyDeviceToHost));\n    std::cout << \"number of new embeddings: \" << new_size << \"\\n\";\n    emb_list.add_level(new_size);\n    extend_insert<<<nblocks, nthreads>>>(num_emb, level, graph_gpu, emb_list,\n                                         indices);\n    CudaTest(\"solving extend_insert failed\");\n    std::cout << \"Extend_insert Done\\n\";\n    num_emb = emb_list.size();\n    CUDA_SAFE_CALL(cudaFree(num_new_emb));\n    CUDA_SAFE_CALL(cudaFree(indices));\n    level++;\n\n    int num_patterns = nlabels * num_init_patterns;\n    nblocks          = (num_emb - 1) / nthreads + 1;\n    std::cout << \"Number of patterns in level \" << level << \": \" << num_patterns\n              << std::endl;\n    std::cout << \"number of embeddings in level \" << level << \": \" << num_emb\n              << \"\\n\";\n    unsigned *ne, *id_map;\n    CUDA_SAFE_CALL(cudaMalloc((void**)&ne, sizeof(unsigned) * num_patterns));\n    CUDA_SAFE_CALL(\n        cudaMalloc((void**)&id_map, sizeof(unsigned) * num_patterns));\n    CUDA_SAFE_CALL(cudaMemset(ne, 0, 
sizeof(unsigned) * num_patterns));\n    CUDA_SAFE_CALL(cudaMalloc((void**)&pids, sizeof(unsigned) * num_emb));\n    std::cout << \"Done allocating memory for aggregation in level \" << level\n              << \"\\n\";\n    aggregate_check<<<nblocks, nthreads>>>(num_emb, level, graph_gpu, emb_list,\n                                           pids, nlabels, minsup, ne);\n    CudaTest(\"solving aggregate_check failed\");\n    CUDA_SAFE_CALL(cudaMemset(d_num_new_patterns, 0, sizeof(unsigned)));\n    find_candidate_patterns<<<(num_patterns - 1) / nthreads + 1, nthreads>>>(\n        num_patterns, ne, minsup, id_map, d_num_new_patterns);\n    CudaTest(\"solving find_candidate_patterns failed\");\n    CUDA_SAFE_CALL(cudaMemcpy(&h_num_new_patterns, d_num_new_patterns,\n                              sizeof(unsigned), cudaMemcpyDeviceToHost));\n    std::cout << \"Number of candidate patterns in level \" << level << \": \"\n              << h_num_new_patterns << std::endl;\n\n    // small_sets.alloc(h_num_new_patterns, m);\n    // large_sets.alloc(h_num_new_patterns, m);\n    // middle_sets.alloc(h_num_new_patterns, m);\n    small_sets.set_size(h_num_new_patterns, m);\n    large_sets.set_size(h_num_new_patterns, m);\n    middle_sets.set_size(h_num_new_patterns, m);\n    std::cout << \"Done allocating sets\\n\";\n    aggregate<<<nblocks, nthreads>>>(m, num_emb, level, graph_gpu, emb_list,\n                                     pids, ne, id_map, nlabels, minsup,\n                                     small_sets, middle_sets, large_sets);\n    CudaTest(\"solving aggregate failed\");\n    bool* h_support_map = (bool*)malloc(sizeof(bool) * h_num_new_patterns);\n    num_freq_patterns = support_count(m, h_num_new_patterns, minsup, small_sets,\n                                      middle_sets, large_sets, h_support_map);\n    CudaTest(\"solving support_count failed\");\n    CUDA_SAFE_CALL(cudaFree(ne));\n    CUDA_SAFE_CALL(cudaFree(id_map));\n    std::cout << \"num_frequent_patterns: 
\" << num_freq_patterns << \"\\n\";\n    total_num += num_freq_patterns;\n    if (num_freq_patterns == 0)\n      break;\n    if (level == k)\n      break;\n    // filter<<<nblocks, nthreads>>>(level, emb_list);\n  }\n  CUDA_SAFE_CALL(cudaDeviceSynchronize());\n  t.Stop();\n\n  printf(\"\\truntime = %f ms.\\n\", t.Millisecs());\n}\n"
  },
  {
    "path": "lonestar/mining/gpu/frequent-subgraph-mining/fsm.h",
    "content": "#pragma once\n#include <string>\n#include <iostream>\n#include \"pangolin/types.cuh\"\n\nvoid fsm_gpu_solver(std::string fname, unsigned k, unsigned minsup,\n                    AccType& total);\n"
  },
  {
    "path": "lonestar/mining/gpu/frequent-subgraph-mining/fsm_gpu.cpp",
    "content": "// Copyright 2019, University of Texas at Austin\n// Authors: Xuhao Chen <cxh@utexas.edu>\n#define EDGE_INDUCED\n#include \"galois/Galois.h\"\n#include \"fsm.h\"\n#include \"MiningBench/Start.h\"\n#include \"llvm/Support/CommandLine.h\"\n\nconst char* name = \"FSM\";\nconst char* desc = \"Frequent subgraph mining in an undirected graph\";\nconst char* url  = 0;\n\nint main(int argc, char** argv) {\n  LonestarMineStart(argc, argv, name, desc, url);\n\n  if (!simpleGraph || !symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric simple graph input \"\n               \" which is symmetric and has no multiple edges or self-loops;\"\n               \" please use both -symmetricGraph and -simpleGraph flag \"\n               \" to indicate the input is a symmetric simple graph\");\n  }\n\n  if (filetype != \"adj\") {\n    galois::gError(\"This application only supports adj format for FSM\\n\"\n                   \"Please add the -ft=adj flag\\n\");\n    exit(1);\n  }\n  AccType total = 0;\n  fsm_gpu_solver(inputFile, k, minsup, total);\n  std::cout << \"\\n\\ttotal_num_frequent_patterns = \" << total << \"\\n\\n\";\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/mining/gpu/k-clique-listing/CMakeLists.txt",
    "content": "add_executable(k-clique-listing-gpu kcl_gpu.cpp kcl.cu)\nadd_dependencies(apps k-clique-listing-gpu)\ntarget_link_libraries(k-clique-listing-gpu PRIVATE Galois::pangolin_gpu miningbench_gpu)\ntarget_compile_definitions(k-clique-listing-gpu PRIVATE GALOIS_ENABLE_GPU=1)\nset_property(TARGET k-clique-listing-gpu PROPERTY CUDA_STANDARD 14)\ninstall(TARGETS k-clique-listing-gpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_mine(small1 k-clique-listing-gpu -symmetricGraph -simpleGraph \"${BASEINPUT}/Mining/citeseer.csgr\")\n"
  },
  {
    "path": "lonestar/mining/gpu/k-clique-listing/README.md",
    "content": "K-Clique\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis application counts the K-Cliques in a graph.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric and simple Galois .gr graphs.\nYou must specify both the -symmetricGraph and the -simpleGraph flags when\nrunning this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/mining/gpu/k-clique-listing; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following is an example command line.\n\n-`$ ./k-clique-listing-gpu -symmetricGraph -simpleGraph <path-to-graph> -k=3 -t 40`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nPlease see details in the paper.\n"
  },
  {
    "path": "lonestar/mining/gpu/k-clique-listing/kcl.cu",
    "content": "// Copyright (c) 2019, Xuhao Chen\n#include \"kcl.h\"\n#include \"pangolin/timer.h\"\n#include \"pangolin/cutils.h\"\n#define USE_SIMPLE\n#define USE_BASE_TYPES\n#include \"pangolin/miner.cuh\"\n#include <cub/cub.cuh>\n#include <thrust/scan.h>\n#include <thrust/execution_policy.h>\n\n#define USE_SHM\ntypedef cub::BlockScan<int, BLOCK_SIZE> BlockScan;\ntypedef cub::BlockReduce<AccType, BLOCK_SIZE> BlockReduce;\n\n__global__ void extend_alloc(size_t begin, size_t end, unsigned level, unsigned max_size, CSRGraph graph, EmbeddingList emb_list, size_t *num_new_emb, AccType *total) {\n\tunsigned tid = threadIdx.x;\n\tunsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n\t__shared__ typename BlockReduce::TempStorage temp_storage;\n#ifdef USE_SHM\n\t__shared__ IndexT emb[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n#else\n\tIndexT emb[PANGOLIN_MAX_SIZE];\n#endif\n\tAccType local_num = 0;\n\tif(pos < end - begin) {\n#ifdef USE_SHM\n\t\temb_list.get_embedding(level, begin + pos, emb[tid]);\n#else\n\t\temb_list.get_embedding(level, begin + pos, emb);\n#endif\n\t\tIndexT vid = emb_list.get_vid(level, begin + pos);\n\t\tnum_new_emb[pos] = 0;\n\t\tIndexT row_begin = graph.edge_begin(vid);\n\t\tIndexT row_end = graph.edge_end(vid);\n\t\tfor (IndexT e = row_begin; e < row_end; e++) {\n\t\t\tIndexT dst = graph.getEdgeDst(e);\n#ifdef USE_SHM\n\t\t\tif (is_all_connected_dag(dst, emb[tid], level, graph)) {\n#else\n\t\t\tif (is_all_connected_dag(dst, emb, level, graph)) {\n#endif\n\t\t\t\tif (level < max_size-2) num_new_emb[pos] ++;\n\t\t\t\telse local_num += 1;\n\t\t\t}\n\t\t}\n\t}\n\tAccType block_num = BlockReduce(temp_storage).Sum(local_num);\n\tif(threadIdx.x == 0) atomicAdd(total, block_num);\n}\n\n__global__ void extend_alloc_lb(size_t begin, size_t end, unsigned level, unsigned max_size, CSRGraph graph, EmbeddingList emb_list, unsigned long long *num_new_emb, AccType *total) {\n\tunsigned tid = threadIdx.x;\n\tunsigned base_id = blockIdx.x * blockDim.x;\n\tunsigned pos = 
blockIdx.x * blockDim.x + threadIdx.x;\n\t__shared__ typename BlockReduce::TempStorage reduce_storage;\n\n\tconst unsigned SCRATCHSIZE = BLOCK_SIZE;\n\t__shared__ BlockScan::TempStorage temp_storage;\n\t__shared__ int gather_offsets[SCRATCHSIZE];\n\t__shared__ unsigned src[SCRATCHSIZE];\n\t__shared__ IndexT emb[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n\t//IndexT emb[MAX_SIZE];\n\n\tgather_offsets[threadIdx.x] = 0;\n\tint neighbor_size = 0;\n\tint neighbor_offset = 0;\n\tint scratch_offset = 0;\n\tint total_edges = 0;\n\tIndexT row_begin = 0;\n\tIndexT row_end = 0;\n\n\tIndexT vid;\n\tAccType local_num = 0;\n\tif (pos < end - begin) {\n\t\t//emb_list.get_embedding(level, begin + pos, emb);\n\t\temb_list.get_embedding(level, begin + pos, emb[tid]);\n\t\tvid = emb_list.get_vid(level, begin + pos);\n\t\tnum_new_emb[pos] = 0;\n\t\trow_begin = graph.edge_begin(vid);\n\t\trow_end = graph.edge_end(vid);\n\t\tneighbor_offset = row_begin;\n\t\tneighbor_size = row_end - row_begin;\n\t}\n\tBlockScan(temp_storage).ExclusiveSum(neighbor_size, scratch_offset, total_edges);\n\tint done = 0;\n\tint neighbors_done = 0;\n\twhile(total_edges > 0) {\n\t\t__syncthreads();\n\t\tint i;\n\t\tfor(i = 0; neighbors_done + i < neighbor_size && (scratch_offset + i - done) < SCRATCHSIZE; i++) {\n\t\t\tgather_offsets[scratch_offset + i - done] = neighbor_offset + neighbors_done + i;\n\t\t\tsrc[scratch_offset + i - done] = tid;\n\t\t}\n\t\tneighbors_done += i;\n\t\tscratch_offset += i;\n\t\t__syncthreads();\n\t\tif(tid < total_edges) {\n\t\t\tint e = gather_offsets[tid];\n\t\t\tIndexT dst = graph.getEdgeDst(e);\n\t\t\tunsigned idx = src[tid];\n\t\t\tif (is_all_connected_dag(dst, emb[idx], level, graph)) {\n\t\t\t\tif (level < max_size-2) atomicAdd(num_new_emb+base_id+idx, 1);\n\t\t\t\telse local_num += 1;\n\t\t\t}\n\t\t}\n\t\ttotal_edges -= BLOCK_SIZE;\n\t\tdone += BLOCK_SIZE;\n\t}\n\tAccType block_num = BlockReduce(reduce_storage).Sum(local_num);\n\tif (tid == 0) atomicAdd(total, 
block_num);\n}\n\n\n__global__ void extend_insert(size_t begin, size_t end, unsigned level, CSRGraph graph, EmbeddingList emb_list, size_t *indices) {\n\tunsigned tid = threadIdx.x;\n\tunsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n#ifdef USE_SHM\n\t__shared__ IndexT emb[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n#else\n\tIndexT emb[PANGOLIN_MAX_SIZE];\n#endif\n\tif(pos < end - begin) {\n#ifdef USE_SHM\n\t\temb_list.get_embedding(level, begin + pos, emb[tid]);\n#else\n\t\temb_list.get_embedding(level, begin + pos, emb);\n#endif\n\t\tIndexT vid = emb_list.get_vid(level, begin + pos);\n\t\tIndexT start = indices[pos];\n\t\tIndexT row_begin = graph.edge_begin(vid);\n\t\tIndexT row_end = graph.edge_end(vid);\n\t\tfor (IndexT e = row_begin; e < row_end; e++) {\n\t\t\tIndexT dst = graph.getEdgeDst(e);\n#ifdef USE_SHM\n\t\t\tif (is_all_connected_dag(dst, emb[tid], level, graph)) {\n#else\n\t\t\tif (is_all_connected_dag(dst, emb, level, graph)) {\n#endif\n\t\t\t\temb_list.set_idx(level+1, start, begin + pos);\n\t\t\t\temb_list.set_vid(level+1, start++, dst);\n\t\t\t}\n\t\t}\n\t}\n}\n\nvoid kcl_gpu_solver(std::string fname, unsigned k, AccType &total, size_t N_CHUNK) {\n\tCSRGraph graph_cpu, graph_gpu;\n\tgraph_cpu.read(fname, false, true); // read graph into CPU memory, use DAG\n\tint m = graph_cpu.get_nnodes();\n\tint nnz = graph_cpu.get_nedges();\n\tgraph_cpu.copy_to_gpu(graph_gpu); // copy graph to GPU memory\n\tEmbeddingList emb_list;\n\temb_list.init(nnz, k);\n\n\tint nthreads = BLOCK_SIZE;\n\tint nblocks = DIVIDE_INTO(m, nthreads);\n\tinit_gpu_dag<<<nblocks, nthreads>>>(m, graph_gpu, emb_list);\n\tcheck_cuda(cudaDeviceSynchronize());\n\n\tAccType h_total = 0, *d_total;\n\tAccType zero = 0;\n\tsize_t chunk_length = (nnz - 1) / N_CHUNK + 1;\n\tcheck_cuda(cudaMalloc((void **)&d_total, sizeof(AccType)));\n\tprintf(\"Launching CUDA k-clique solver (%d CTAs, %d threads/CTA) ...\\n\", nblocks, nthreads);\n\n\tTimer t;\n\tt.Start();\n\tstd::cout << \"number of single-edge 
embeddings: \" << nnz << \"\\n\";\n\tfor (size_t cid = 0; cid < N_CHUNK; cid ++) {\n\t\tsize_t chunk_begin = cid * chunk_length;\n\t\tsize_t chunk_end = std::min((cid+1) * chunk_length, (size_t)nnz);\n\t\tsize_t cur_size = chunk_end-chunk_begin;\n\t\tstd::cout << \"Processing the \" << cid << \" chunk of \" << cur_size << \" edges\\n\";\n\n\t\tunsigned level = 1;\n\t\twhile (1) {\n\t\t\tsize_t *num_new_emb;\n\t\t\tsize_t num_emb = emb_list.size();\n\t\t\tsize_t begin = 0, end = num_emb;\n\t\t\tif (level == 1) { begin = chunk_begin; end = chunk_end; num_emb = end - begin; }\n\t\t\tstd::cout << \"\\t number of embeddings in level \" << level << \": \" << num_emb << \"\\n\";\n\t\t\tcheck_cuda(cudaMalloc((void **)&num_new_emb, sizeof(size_t) * (num_emb+1)));\n\t\t\tcheck_cuda(cudaMemset(num_new_emb, 0, sizeof(size_t) * (num_emb+1)));\n\t\t\tnblocks = (num_emb-1)/nthreads+1;\n\t\t\tcheck_cuda(cudaMemcpy(d_total, &zero, sizeof(AccType), cudaMemcpyHostToDevice));\n\t\t\textend_alloc<<<nblocks, nthreads>>>(begin, end, level, k, graph_gpu, emb_list, num_new_emb, d_total);\n\t\t\tcheck_cuda(cudaMemcpy(&h_total, d_total, sizeof(AccType), cudaMemcpyDeviceToHost));\n\t\t\ttotal += h_total;\n\t\t\tCudaTest(\"solving extend alloc failed\");\n\t\t\tif (level == k-2) {\n\t\t\t\tcheck_cuda(cudaFree(num_new_emb));\n\t\t\t\tbreak; \n\t\t\t}\n\t\t\tsize_t *indices;\n\t\t\tcheck_cuda(cudaMalloc((void **)&indices, sizeof(size_t) * (num_emb+1)));\n\t\t\tthrust::exclusive_scan(thrust::device, num_new_emb, num_new_emb+num_emb+1, indices);\n\t\t\tcheck_cuda(cudaFree(num_new_emb));\n\t\t\tsize_t new_size;\n\t\t\tcheck_cuda(cudaMemcpy(&new_size, &indices[num_emb], sizeof(unsigned), cudaMemcpyDeviceToHost));\n\t\t\tstd::cout << \"\\t number of new embeddings: \" << new_size << \"\\n\";\n\t\t\temb_list.add_level(new_size);\n\t\t\textend_insert<<<nblocks, nthreads>>>(begin, end, level, graph_gpu, emb_list, indices);\n\t\t\tCudaTest(\"solving extend insert 
failed\");\n\t\t\tcheck_cuda(cudaFree(indices));\n\t\t\tlevel ++;\n\t\t}\n\t\temb_list.reset_level();\n\t}\n\tcheck_cuda(cudaDeviceSynchronize());\n\tt.Stop();\n\n\tprintf(\"\\truntime = %f ms.\\n\", t.Millisecs());\n\tcheck_cuda(cudaFree(d_total));\n}\n\n"
  },
  {
    "path": "lonestar/mining/gpu/k-clique-listing/kcl.h",
    "content": "#pragma once\n#include <string>\n#include <iostream>\n#include \"pangolin/types.cuh\"\n\nvoid kcl_gpu_solver(std::string filename, unsigned k, AccType& total,\n                    size_t N_CHUNK = 1);\n"
  },
  {
    "path": "lonestar/mining/gpu/k-clique-listing/kcl_gpu.cpp",
    "content": "// Copyright 2019, University of Texas at Austin\n// Authors: Xuhao Chen <cxh@utexas.edu>\n#include \"galois/Galois.h\"\n#include \"kcl.h\"\n#include \"MiningBench/Start.h\"\n#include \"llvm/Support/CommandLine.h\"\n\nconst char* name = \"k-cliques\";\nconst char* desc = \"Listing all k-cliques in an undirected graph\";\nconst char* url  = 0;\n\nint main(int argc, char** argv) {\n  LonestarMineStart(argc, argv, name, desc, url);\n\n  if (!simpleGraph || !symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric simple graph input \"\n               \" which is symmetric and has no multiple edges or self-loops;\"\n               \" please use both -symmetricGraph and -simpleGraph flag \"\n               \" to indicate the input is a symmetric simple graph\");\n  }\n\n  if (filetype != \"gr\") {\n    galois::gError(\"This application only supports gr format\\n\"\n                   \"Please add the -ft=gr flag\\n\");\n    exit(1);\n  }\n\n  AccType total = 0;\n  kcl_gpu_solver(inputFile, k, total);\n  std::cout << \"\\n\\ttotal_num_cliques = \" << total << \"\\n\\n\";\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/mining/gpu/motif-counting/CMakeLists.txt",
    "content": "add_executable(motif-counting-gpu motif_gpu.cpp motif.cu)\nadd_dependencies(apps motif-counting-gpu)\ntarget_link_libraries(motif-counting-gpu PRIVATE Galois::pangolin_gpu miningbench_gpu)\ntarget_compile_definitions(motif-counting-gpu PRIVATE GALOIS_ENABLE_GPU=1)\nset_property(TARGET motif-counting-gpu PROPERTY CUDA_STANDARD 14)\ninstall(TARGETS motif-counting-gpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n#add_test_mine(small1 motif \"${BASEINPUT}/Mining/citeseer.csgr\")\nadd_test_mine(small1 motif-counting-gpu -symmetricGraph -simpleGraph \"${BASEINPUT}/Mining/citeseer.csgr\")\n"
  },
  {
    "path": "lonestar/mining/gpu/motif-counting/README.md",
    "content": "Motif Counting\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis application counts the motifs in a graph using BFS \nexpansion. It uses the bliss library [1][2] for graph isomorphism test.\n\n[1] Bliss: A tool for computing automorphism groups and canonical \nlabelings of graphs. http://www.tcs.hut.fi/Software/bliss/, 2017.\n[2] Tommi Junttila and Petteri Kaski. 2007. Engineering an efficient \ncanonical labeling tool for large and sparse graphs. In Proceedings \nof the Meeting on Algorithm Engineering & Expermiments, 135-149.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric and simple Galois .gr graphs.\nYou must specify both the -symmetricGraph and the -simpleGraph flags when\nrunning this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/mining/gpu/motif-counting; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following is an example command line.\n\n-`$ ./motif-counting-gpu -symmetricGraph -simpleGraph <path-to-graph> -k=3 -t 28`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nPlease see details in the paper.\n\n"
  },
  {
    "path": "lonestar/mining/gpu/motif-counting/motif.cu",
    "content": "// Copyright (c) 2019, Xuhao Chen\n#include \"motif.h\"\n#include \"pangolin/timer.h\"\n#include \"pangolin/cutils.h\"\n#define USE_PID\n#define USE_SIMPLE\n#define VERTEX_INDUCED\n#include \"pangolin/miner.cuh\"\n#include <cub/cub.cuh>\n#include <thrust/scan.h>\n#include <thrust/execution_policy.h>\ntypedef cub::BlockReduce<AccType, BLOCK_SIZE> BlockReduce;\n\nvoid printout_motifs(int npatterns, AccType *accumulators) {\n\tstd::cout << std::endl;\n\tif (npatterns == 2) {\n\t\tstd::cout << \"\\ttriangles\\t\" << accumulators[0] << std::endl;\n\t\tstd::cout << \"\\t3-chains\\t\" << accumulators[1] << std::endl;\n\t} else if (npatterns == 6) {\n\t\tstd::cout << \"\\t4-paths --> \" << accumulators[0] << std::endl;\n\t\tstd::cout << \"\\t3-stars --> \" << accumulators[1] << std::endl;\n\t\tstd::cout << \"\\t4-cycles --> \" << accumulators[2] << std::endl;\n\t\tstd::cout << \"\\ttailed-triangles --> \" << accumulators[3] << std::endl;\n\t\tstd::cout << \"\\tdiamonds --> \" << accumulators[4] << std::endl;\n\t\tstd::cout << \"\\t4-cliques --> \" << accumulators[5] << std::endl;\n\t} else {\n\t\tstd::cout << \"\\ttoo many patterns to show\\n\";\n\t}\n\tstd::cout << std::endl;\n}\n\n__global__ void extend_alloc(unsigned m, unsigned level, CSRGraph graph, EmbeddingList emb_list, IndexT *num_new_emb) {\n\tunsigned tid = threadIdx.x;\n\tunsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n\t__shared__ IndexT emb[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n\tif(pos < m) {\n\t\tIndexT num = 0;\n\t\temb_list.get_embedding(level, pos, emb[tid]);\n\t\tfor (unsigned i = 0; i < level+1; ++i) {\n\t\t\tIndexT src = emb[tid][i];\n\t\t\tIndexT row_begin = graph.edge_begin(src);\n\t\t\tIndexT row_end = graph.edge_end(src);\n\t\t\tfor (IndexT e = row_begin; e < row_end; e++) {\n\t\t\t\tIndexT dst = graph.getEdgeDst(e);\n\t\t\t\tif (!is_vertexInduced_automorphism(level+1, emb[tid], i, src, dst, graph))\n\t\t\t\t\tnum ++;\n\t\t\t}\n\t\t}\n\t\tnum_new_emb[pos] = 
num;\n\t}\n}\n\n__global__ void extend_insert(unsigned m, unsigned max_size, unsigned level, CSRGraph graph, EmbeddingList emb_list, IndexT *indices) {\n\tunsigned tid = threadIdx.x;\n\tunsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n\t__shared__ IndexT emb[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n\tif(pos < m) {\n\t\temb_list.get_embedding(level, pos, emb[tid]);\n\t\tIndexT start = indices[pos];\n\t\tfor (unsigned i = 0; i < level+1; ++i) {\n\t\t\tIndexT src = emb[tid][i];\n\t\t\tIndexT row_begin = graph.edge_begin(src);\n\t\t\tIndexT row_end = graph.edge_end(src);\n\t\t\tfor (IndexT e = row_begin; e < row_end; e++) {\n\t\t\t\tIndexT dst = graph.getEdgeDst(e);\n\t\t\t\tif (!is_vertexInduced_automorphism(level+1, emb[tid], i, src, dst, graph)) {\n\t\t\t\t\tif (level == 1 && max_size == 4)\n\t\t\t\t\t\temb_list.set_pid(start, find_3motif_pattern_id(i, dst, emb[tid], graph, start));\n\t\t\t\t\temb_list.set_idx(level+1, start, pos);\n\t\t\t\t\temb_list.set_vid(level+1, start++, dst);\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n}\n\n__global__ void aggregate(unsigned m, unsigned level, unsigned npatterns, CSRGraph graph, EmbeddingList emb_list, AccType *accumulators) {\n\tunsigned tid = threadIdx.x;\n\tunsigned pos = blockIdx.x * blockDim.x + threadIdx.x;\n\t//__shared__ typename BlockReduce::TempStorage temp_storage;\n\t__shared__ IndexT emb[BLOCK_SIZE][PANGOLIN_MAX_SIZE];\n\tAccType local_num[6];\n\tfor (int i = 0; i < npatterns; i++) local_num[i] = 0;\n\tif(pos < m) {\n\t\tunsigned pattern = 0;\n\t\temb_list.get_embedding(level, pos, emb[tid]);\n\t\t//if (pos == 0) printout_embedding(level, emb[tid]);\n\t\tunsigned n = level+1;\n\t\tassert(n < 4);\n\t\tif (n == 3) pattern = emb_list.get_pid(pos);\n\t\tfor (unsigned i = 0; i < n; ++i) {\n\t\t\tIndexT src = emb[tid][i];\n\t\t\tIndexT row_begin = graph.edge_begin(src);\n\t\t\tIndexT row_end = graph.edge_end(src);\n\t\t\tfor (IndexT e = row_begin; e < row_end; e++) {\n\t\t\t\tIndexT dst = graph.getEdgeDst(e);\n\t\t\t\tif 
(!is_vertexInduced_automorphism(n, emb[tid], i, src, dst, graph)) {\n\t\t\t\t\tunsigned pid = 1; // 3-chain\n\t\t\t\t\t//if (i == 0 && is_connected(emb[tid][1], dst, graph)) pid = 0; // triangle\n\t\t\t\t\tif (n == 2) pid = find_3motif_pattern_id(i, dst, emb[tid], graph, pos);\n\t\t\t\t\telse pid = find_4motif_pattern_id(n, i, dst, emb[tid], pattern, graph, pos);\n\t\t\t\t\t//printf(\"pid = %u\\n\", pid);\n\t\t\t\t\tlocal_num[pid] += 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\t//AccType block_num;\n\tfor (int i = 0; i < npatterns; i++) {\n\t\t//block_num = BlockReduce(temp_storage).Sum(local_num[i]);\n\t\t//if(threadIdx.x == 0) atomicAdd(&accumulators[i], block_num);\n\t\tatomicAdd(&accumulators[i], local_num[i]);\n\t}\n}\n\n__global__ void clear(AccType *accumulators) {\n\tunsigned i = blockIdx.x * blockDim.x + threadIdx.x;\n\taccumulators[i] = 0;\n}\n\nvoid parallel_prefix_sum(int n, IndexT *in, IndexT *out) {\n\tIndexT total = 0;\n\tfor (size_t i = 0; i < n; i++) {\n\t\tout[i] = total;\n\t\ttotal += in[i];\n\t}\n\tout[n] = total;\n}\n\nvoid motif_gpu_solver(std::string fname, unsigned k, std::vector<AccType> &acc, size_t N_CHUNK) {\n\tsize_t npatterns = acc.size();\n\tAccType *h_accumulators = (AccType *)malloc(sizeof(AccType) * npatterns);\n\tfor (int i = 0; i < npatterns; i++) h_accumulators[i] = 0;\n\tAccType *d_accumulators;\n\tCUDA_SAFE_CALL(cudaMalloc((void **)&d_accumulators, sizeof(AccType) * npatterns));\n\tclear<<<1, npatterns>>>(d_accumulators);\n\tCudaTest(\"clear accumulator failed\");\n\n\tCSRGraph graph_cpu, graph_gpu;\n\tgraph_cpu.read(fname, false); // read graph into CPU memoryA\n\tint m = graph_cpu.get_nnodes();\n\tint nnz = graph_cpu.get_nedges();\n\tgraph_cpu.copy_to_gpu(graph_gpu); // copy graph to GPU memory\n\n\tint nthreads = BLOCK_SIZE;\n\tint nblocks = DIVIDE_INTO(m, nthreads);\n\tprintf(\"Launching CUDA TC solver (%d CTAs, %d threads/CTA) ...\\n\", nblocks, nthreads);\n\tEmbeddingList emb_list;\n\temb_list.init(nnz, k, 
false);\n\temb_list.init_cpu(&graph_cpu);\n\tCUDA_SAFE_CALL(cudaDeviceSynchronize());\n\n\tTimer t;\n\tt.Start();\n\tunsigned level = 1;\n\tunsigned num_emb = emb_list.size();\n\twhile (level < k-2) {\n\t\tIndexT *num_new_emb, *indices;\n\t\tCUDA_SAFE_CALL(cudaMalloc((void **)&num_new_emb, sizeof(IndexT) * (num_emb+1)));\n\t\tCUDA_SAFE_CALL(cudaMalloc((void **)&indices, sizeof(IndexT) * (num_emb+1)));\n\t\tnblocks = (num_emb-1)/nthreads+1;\n\t\textend_alloc<<<nblocks, nthreads>>>(num_emb, level, graph_gpu, emb_list, num_new_emb);\n\t\tCudaTest(\"solving extend_alloc failed\");\n\t\tthrust::exclusive_scan(thrust::device, num_new_emb, num_new_emb+num_emb+1, indices);\n\t\tCudaTest(\"Scan failed\");\n\t\tIndexT new_size;\n\t\tCUDA_SAFE_CALL(cudaMemcpy(&new_size, &indices[num_emb], sizeof(IndexT), cudaMemcpyDeviceToHost));\n\t\tassert(new_size < 4294967296); // TODO: currently do not support vector size larger than 2^32\n\t\temb_list.add_level(new_size);\n\t\t#ifdef USE_WEDGE\n\t\t//if (level == 1 && max_size == 4) {\n\t\t//\tis_wedge.resize(emb_list.size());\n\t\t//\tstd::fill(is_wedge.begin(), is_wedge.end(), 0);\n\t\t//}\n\t\t#endif\n\t\textend_insert<<<nblocks, nthreads>>>(num_emb, k, level, graph_gpu, emb_list, indices);\n\t\tCudaTest(\"solving extend_insert failed\");\n\t\tstd::cout << \"Extend_insert Done\\n\";\n\t\tnum_emb = emb_list.size();\n\t\tCUDA_SAFE_CALL(cudaFree(num_new_emb));\n\t\tCUDA_SAFE_CALL(cudaFree(indices));\n\t\tlevel ++;\n\t}\n\tif (k < 5) {\n\t\tnblocks = (num_emb-1)/nthreads+1;\n\t\taggregate<<<nblocks, nthreads>>>(num_emb, level, npatterns, graph_gpu, emb_list, d_accumulators);\n\t\tCudaTest(\"solving aggregate failed\");\n\t} else {\n\t\tprintf(\"Not supported\\n\");\n\t}\n\tCUDA_SAFE_CALL(cudaDeviceSynchronize());\n\tt.Stop();\n\n\tprintf(\"\\truntime = %f ms.\\n\", t.Millisecs());\n\tCUDA_SAFE_CALL(cudaMemcpy(h_accumulators, d_accumulators, sizeof(AccType) * npatterns, cudaMemcpyDeviceToHost));\n\tprintout_motifs(npatterns, 
h_accumulators);\n\tCUDA_SAFE_CALL(cudaFree(d_accumulators));\n}\n\n"
  },
  {
    "path": "lonestar/mining/gpu/motif-counting/motif.h",
    "content": "#pragma once\n#include <vector>\n#include <string>\n#include <iostream>\n#include \"pangolin/types.cuh\"\n\nvoid motif_gpu_solver(std::string fname, unsigned k, std::vector<AccType>& acc,\n                      size_t N_CHUNK = 1);\n"
  },
  {
    "path": "lonestar/mining/gpu/motif-counting/motif_gpu.cpp",
    "content": "// Copyright 2019, University of Texas at Austin\n// Authors: Xuhao Chen <cxh@utexas.edu>\n#include \"galois/Galois.h\"\n#include \"motif.h\"\n#include \"MiningBench/Start.h\"\n#include \"llvm/Support/CommandLine.h\"\n\nconst char* name           = \"k-cliques\";\nconst char* desc           = \"Listing all k-cliques in an undirected graph\";\nconst char* url            = 0;\nstatic int num_patterns[3] = {2, 6, 21};\n\nint main(int argc, char** argv) {\n  LonestarMineStart(argc, argv, name, desc, url);\n\n  if (!simpleGraph || !symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric simple graph input \"\n               \" which is symmetric and has no multiple edges or self-loops;\"\n               \" please use both -symmetricGraph and -simpleGraph flag \"\n               \" to indicate the input is a symmetric simple graph\");\n  }\n\n  if (filetype != \"gr\") {\n    galois::gError(\"This application only supports gr format\\n\"\n                   \"Please add the -ft=gr flag\\n\");\n    exit(1);\n  }\n  int npatterns = num_patterns[k - 3];\n  std::cout << k << \"-motif has \" << npatterns << \" patterns in total\\n\";\n  std::vector<AccType> accumulators(npatterns);\n  for (int i = 0; i < npatterns; i++)\n    accumulators[i] = 0;\n\n  motif_gpu_solver(inputFile, k, accumulators);\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/mining/gpu/triangle-counting/CMakeLists.txt",
    "content": "add_executable(triangle-counting-mining-gpu tc_mine_gpu.cpp tc_mine.cu)\nadd_dependencies(apps triangle-counting-mining-gpu)\ntarget_link_libraries(triangle-counting-mining-gpu PRIVATE Galois::pangolin_gpu miningbench_gpu)\ntarget_compile_definitions(triangle-counting-mining-gpu PRIVATE GALOIS_ENABLE_GPU=1)\nset_property(TARGET triangle-counting-mining-gpu PROPERTY CUDA_STANDARD 14)\ninstall(TARGETS triangle-counting-mining-gpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_mine(small1 triangle-counting-mining-gpu -symmetricGraph -simpleGraph \"${BASEINPUT}/Mining/citeseer.csgr\")\n"
  },
  {
    "path": "lonestar/mining/gpu/triangle-counting/README.md",
    "content": "Triangle Counting\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program counts the number of triangles in a given undirected graph.\n\nINPUT\n--------------------------------------------------------------------------------\n\nThis application takes in symmetric and simple Galois .gr graphs.\nYou must specify both the -symmetricGraph and the -simpleGraph flags when\nrunning this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/mining/gpu/triangle-counting; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following is an example command line.\n\n-`$ ./triangle-counting-gpu -symmetricGraph -simpleGraph <path-to-graph> -k=3 -t 40`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\nPlease see details in the paper.\n"
  },
  {
    "path": "lonestar/mining/gpu/triangle-counting/tc.h",
    "content": "#pragma once\n#include <string>\n#include <iostream>\n#include \"pangolin/types.cuh\"\n\nvoid tc_gpu_solver(std::string filename, AccType& total, size_t N_CHUNK = 1);\n"
  },
  {
    "path": "lonestar/mining/gpu/triangle-counting/tc_mine.cu",
    "content": "// Copyright (c) 2019, Xuhao Chen\n#include \"tc.h\"\n#include \"pangolin/timer.h\"\n#include \"pangolin/cutils.h\"\n#define USE_SIMPLE\n#define USE_BASE_TYPES\n#include <cub/cub.cuh>\n#include \"pangolin/miner.cuh\"\ntypedef cub::BlockReduce<unsigned long long, BLOCK_SIZE> BlockReduce;\n\n__global__ void warp_edge(int m, CSRGraph graph, EmbeddingList emb_list, unsigned long long *total) {\n\t__shared__ typename BlockReduce::TempStorage temp_storage;\n\tunsigned thread_id   = blockIdx.x * blockDim.x + threadIdx.x;\n\tunsigned thread_lane = threadIdx.x & (WARP_SIZE-1);            // thread index within the warp\n\tunsigned warp_id     = thread_id   / WARP_SIZE;                // global warp index\n\t//unsigned warp_lane   = threadIdx.x / WARP_SIZE;                // warp index within the CTA\n\tunsigned num_warps   = (BLOCK_SIZE / WARP_SIZE) * gridDim.x;      // total number of active warps\n\n\tunsigned long long local_num = 0;\n\tfor (IndexT tid = warp_id; tid < m; tid += num_warps) {\n\t\tIndexT src = emb_list.get_idx(1, tid);\n\t\tIndexT dst = emb_list.get_vid(1, tid);\n\t\tassert(src != dst);\n\t\tIndexT src_size = graph.getOutDegree(src);\n\t\tIndexT dst_size = graph.getOutDegree(dst);\n\t\tIndexT lookup = src;\n\t\tIndexT search = dst;\n\t\tif (src_size > dst_size) {\n\t\t\tlookup = dst;\n\t\t\tsearch = src;\n\t\t}\n\t\tIndexT lookup_begin = graph.edge_begin(lookup);\n\t\tIndexT lookup_size = graph.getOutDegree(lookup);\n\t\tIndexT search_size = graph.getOutDegree(search);\n\t\tif (lookup_size > 0 && search_size > 0) {\n\t\t\tfor (IndexT i = thread_lane; i < lookup_size; i += WARP_SIZE) {\n\t\t\t\tIndexT index = lookup_begin + i;\n\t\t\t\tIndexT key = graph.getEdgeDst(index);\n\t\t\t\tIndexT search_begin = graph.edge_begin(search);\n\t\t\t\tif (binary_search(graph, key, search_begin, search_begin+search_size))\n\t\t\t\t\tlocal_num += 1;\n\t\t\t}\n\t\t}\n\t}\n\tunsigned long long block_num = 
BlockReduce(temp_storage).Sum(local_num);\n\tif(threadIdx.x == 0) atomicAdd(total, block_num);\n}\n\n__global__ void warp(int m, IndexT *row_offsets, IndexT *column_indices, int *degrees, unsigned long long *total) {\n\t__shared__ typename BlockReduce::TempStorage temp_storage;\n\tunsigned thread_id   = blockIdx.x * blockDim.x + threadIdx.x;\n\tunsigned thread_lane = threadIdx.x & (WARP_SIZE-1);            // thread index within the warp\n\tunsigned warp_id     = thread_id   / WARP_SIZE;                // global warp index\n\t//unsigned warp_lane   = threadIdx.x / WARP_SIZE;                // warp index within the CTA\n\tunsigned num_warps   = (BLOCK_SIZE / WARP_SIZE) * gridDim.x;      // total number of active warps\n\n\tunsigned long long local_num = 0;\n\t// each warp takes one vertex\n\tfor (IndexT src = warp_id; src < m; src += num_warps) {\n\t\tIndexT row_begin = row_offsets[src];\n\t\tIndexT row_end = row_offsets[src+1];\n\t\tIndexT src_size = degrees[src];\n\t\t// take one edge\n\t\tfor (IndexT offset = row_begin; offset < row_end; offset ++) {\n\t\t\tIndexT dst = column_indices[offset];\n\t\t\tassert(src != dst);\n\t\t\tIndexT dst_size = degrees[dst];\n\t\t\tIndexT lookup = src;\n\t\t\tIndexT search = dst;\n\t\t\tif (src_size > dst_size) {\n\t\t\t\tlookup = dst;\n\t\t\t\tsearch = src;\n\t\t\t}\n\t\t\tIndexT lookup_begin = row_offsets[lookup];\n\t\t\tIndexT lookup_size = degrees[lookup];\n\t\t\tIndexT search_size = degrees[search];\n\t\t\tif (lookup_size > 0 && search_size > 0) {\n\t\t\t\tfor (IndexT i = thread_lane; i < lookup_size; i += WARP_SIZE) {\n\t\t\t\t\tIndexT index = lookup_begin + i;\n\t\t\t\t\tIndexT key = column_indices[index];\n\t\t\t\t\tIndexT search_begin = row_offsets[search];\n\t\t\t\t\tif (binary_search(column_indices, key, search_begin, search_begin+search_size))\n\t\t\t\t\t\tlocal_num += 1;\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\tunsigned long long block_num = BlockReduce(temp_storage).Sum(local_num);\n\tif(threadIdx.x == 0) atomicAdd(total, 
block_num);\n}\n\n\nvoid tc_gpu_solver(std::string fname, AccType &total, size_t N_CHUNK) {\n\tCSRGraph graph_cpu, graph_gpu;\n\tgraph_cpu.read(fname, false, true); // read graph into CPU memory, use DAG\n\tint m = graph_cpu.get_nnodes();\n\tint nnz = graph_cpu.get_nedges();\n\tgraph_cpu.copy_to_gpu(graph_gpu); // copy graph to GPU memory\n\tEmbeddingList emb_list;\n\temb_list.init(nnz);\n\n\tint nthreads = BLOCK_SIZE;\n\tint nblocks = DIVIDE_INTO(m, WARPS_PER_BLOCK);\n\tinit_gpu_dag<<<nblocks, nthreads>>>(m, graph_gpu, emb_list);\n\n\tunsigned long long h_total = 0, *d_total;\n\tunsigned long long  zero = 0;\n\tCUDA_SAFE_CALL(cudaMalloc((void **)&d_total, sizeof(unsigned long long)));\n\tCUDA_SAFE_CALL(cudaMemcpy(d_total, &zero, sizeof(unsigned long long), cudaMemcpyHostToDevice));\n\tprintf(\"Launching CUDA TC solver (%d CTAs, %d threads/CTA) ...\\n\", nblocks, nthreads);\n\n\tTimer t;\n\tt.Start();\n\t//warp<<<nblocks, nthreads>>>(m, d_row_offsets, d_column_indices, d_degrees, d_total);\n\twarp_edge<<<nblocks, nthreads>>>(nnz, graph_gpu, emb_list, d_total);\n\tCudaTest(\"solving failed\");\n\tCUDA_SAFE_CALL(cudaDeviceSynchronize());\n\tt.Stop();\n\n\tprintf(\"\\truntime = %f ms.\\n\", t.Millisecs());\n\tCUDA_SAFE_CALL(cudaMemcpy(&h_total, d_total, sizeof(unsigned long long), cudaMemcpyDeviceToHost));\n\ttotal = h_total;\n\tCUDA_SAFE_CALL(cudaFree(d_total));\n}\n\n"
  },
  {
    "path": "lonestar/mining/gpu/triangle-counting/tc_mine_gpu.cpp",
    "content": "// Copyright 2019, University of Texas at Austin\n// Authors: Xuhao Chen <cxh@utexas.edu>\n#include \"galois/Galois.h\"\n#include \"tc.h\"\n#include \"MiningBench/Start.h\"\n#include \"llvm/Support/CommandLine.h\"\n\nconst char* name = \"Triangle counting\";\nconst char* desc = \"Counting triangles in an undirected graph\";\nconst char* url  = 0;\n\nint main(int argc, char** argv) {\n  LonestarMineStart(argc, argv, name, desc, url);\n\n  if (!simpleGraph || !symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric simple graph input \"\n               \" which is symmetric and has no multiple edges or self-loops;\"\n               \" please use both -symmetricGraph and -simpleGraph flag \"\n               \" to indicate the input is a symmetric simple graph\");\n  }\n\n  if (filetype != \"gr\") {\n    galois::gError(\"This application only supports gr format\\n\"\n                   \"Please add the -ft=gr flag\\n\");\n    exit(1);\n  }\n  AccType total = 0;\n  tc_gpu_solver(inputFile, total);\n  std::cout << \"\\n\\ttotal_num_triangles = \" << total << \"\\n\\n\";\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/mining/libminingbench/CMakeLists.txt",
    "content": "add_library(miningbench STATIC src/Start.cpp src/Input.cpp)\ntarget_include_directories(miningbench PUBLIC\n  \"${CMAKE_CURRENT_SOURCE_DIR}/include\"\n)\n\ntarget_link_libraries(miningbench Galois::shmem LLVMSupport lonestar)\n\nif(GALOIS_ENABLE_DIST)\n  add_library(miningbench_dist STATIC src/Start.cpp)\n  target_include_directories(miningbench_dist PUBLIC\n    \"${CMAKE_CURRENT_SOURCE_DIR}/include\"\n  )\n  target_compile_definitions(miningbench_dist PRIVATE GALOIS_ENABLE_DIST=1) \n  target_link_libraries(miningbench_dist Galois::shmem LLVMSupport distbench)\nendif()\n\nif(GALOIS_ENABLE_GPU)\n  add_library(miningbench_gpu STATIC src/Start.cpp src/Input.cpp)\n  target_include_directories(miningbench_gpu PUBLIC\n    \"${CMAKE_CURRENT_SOURCE_DIR}/include\"\n  )\n  target_compile_definitions(miningbench_gpu PRIVATE GALOIS_ENABLE_GPU=1) \n  target_link_libraries(miningbench_gpu Galois::shmem LLVMSupport)\nendif()\n"
  },
  {
    "path": "lonestar/mining/libminingbench/include/MiningBench/Start.h",
    "content": "#pragma once\n#include \"llvm/Support/CommandLine.h\"\n\nnamespace cll = llvm::cl;\nextern cll::opt<std::string> inputFile;\nextern cll::opt<std::string> filetype;\nextern cll::opt<unsigned> num_trials;\nextern cll::opt<unsigned> nblocks;\nextern cll::opt<std::string> pattern_filename;\nextern cll::opt<std::string> morder_filename;\nextern cll::opt<unsigned> fv;\nextern cll::opt<unsigned> k;\nextern cll::opt<unsigned> show;\nextern cll::opt<unsigned> debug;\nextern cll::opt<unsigned> minsup;\nextern cll::opt<std::string> preset_filename;\n\nextern cll::opt<bool> simpleGraph;\n\n// note these may come from uplevel liblonestar or libdistbench\nextern cll::opt<int> numThreads; // locally defined for gpu apps (necessary?)\nextern cll::opt<bool> verify;    // TODO use skipVerify from liblonestar\n#ifndef GALOIS_ENABLE_GPU\nextern cll::opt<std::string> statFile;\n#endif\nextern cll::opt<bool> symmetricGraph; // locally defined for gpu apps\n\nvoid LonestarMineStart(int argc, char** argv, const char* app, const char* desc,\n                       const char* url);\n"
  },
  {
    "path": "lonestar/mining/libminingbench/src/Input.cpp",
    "content": "#include \"MiningBench/Start.h\"\n\ncll::opt<std::string> inputFile(cll::Positional,\n                                cll::desc(\"<filename: symmetrized graph>\"),\n                                cll::Required);\n"
  },
  {
    "path": "lonestar/mining/libminingbench/src/Start.cpp",
    "content": "#include <string>\n#include <sstream>\n#include <iostream>\n#ifndef GALOIS_ENABLE_GPU\n#include \"galois/Galois.h\"\n#endif\n#include \"galois/gIO.h\"\n#include \"MiningBench/Start.h\"\n\nnamespace cll = llvm::cl;\ncll::opt<std::string> filetype(\"ft\", cll::desc(\"<filetype: txt,adj,mtx,gr>\"),\n                               cll::init(\"gr\"));\ncll::opt<unsigned> num_trials(\"n\",\n                              cll::desc(\"perform n trials (default value 1)\"),\n                              cll::init(1));\ncll::opt<unsigned>\n    nblocks(\"b\", cll::desc(\"edge blocking to b blocks (default value 1)\"),\n            cll::init(1));\ncll::opt<std::string>\n    pattern_filename(\"p\",\n                     cll::desc(\"<pattern graph filename: symmetrized graph>\"),\n                     cll::init(\"\"));\ncll::opt<std::string>\n    morder_filename(\"mo\", cll::desc(\"<filename: pre-defined matching order>\"),\n                    cll::init(\"\"));\ncll::opt<unsigned> fv(\"fv\", cll::desc(\"first vertex is special\"), cll::init(0));\ncll::opt<unsigned>\n    k(\"k\", cll::desc(\"max number of vertices in k-clique (default value 3)\"),\n      cll::init(3));\ncll::opt<unsigned> show(\"s\", cll::desc(\"print out the details\"), cll::init(0));\ncll::opt<unsigned>\n    debug(\"d\", cll::desc(\"print out the frequent patterns for debugging\"),\n          cll::init(0));\ncll::opt<unsigned> minsup(\"ms\",\n                          cll::desc(\"minimum support (default value 300)\"),\n                          cll::init(300));\ncll::opt<std::string>\n    preset_filename(\"pf\", cll::desc(\"<filename: preset matching order>\"),\n                    cll::init(\"\"));\n// TODO use skipVerify from liblonestar\ncll::opt<bool>\n    verify(\"v\", llvm::cl::desc(\"do verification step (default value false)\"),\n           llvm::cl::init(false));\n\ncll::opt<bool>\n    simpleGraph(\"simpleGraph\",\n                cll::desc(\"Specify that the input graph is \"\n       
                   \"simple (has no multiple edges or self-loops)\"),\n                cll::init(false));\n\n#ifdef GALOIS_ENABLE_GPU\n// TODO is numThreads necessary for gpu apps? remove it if not.\ncll::opt<int> numThreads(\"t\",\n                         llvm::cl::desc(\"Number of threads (default value 1)\"),\n                         llvm::cl::init(1));\ncll::opt<bool>\n    symmetricGraph(\"symmetricGraph\",\n                   cll::desc(\"Specify that the input graph is symmetric\"),\n                   cll::init(false));\n#endif\n\n// TODO merge LonestarStart for cpu apps\nvoid LonestarMineStart(int argc, char** argv, const char* app, const char* desc,\n                       const char* url) {\n  llvm::cl::ParseCommandLineOptions(argc, argv);\n\n  if (!simpleGraph || !symmetricGraph) {\n    GALOIS_DIE(\"This application requires a symmetric simple graph input \"\n               \" which is symmetric and has no multiple edges or self-loops;\"\n               \" please use both -symmetricGraph and -simpleGraph flag \"\n               \" to indicate the input is a symmetric simple graph\");\n  }\n\n#ifndef GALOIS_ENABLE_GPU\n  numThreads = galois::setActiveThreads(numThreads);\n  galois::runtime::setStatFile(statFile);\n#endif\n  std::cout << \"Copyright (C) 2020 The University of Texas at Austin\\n\";\n  std::cout << \"http://iss.ices.utexas.edu/galois/\\n\\n\";\n  std::cout << \"application: \" << (app ? 
app : \"unspecified\") << \"\\n\";\n  if (desc)\n    std::cout << desc << \"\\n\";\n  if (url)\n    std::cout << \"http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/\"\n              << url << \"\\n\";\n  std::cout << \"\\n\";\n  std::ostringstream cmdout;\n  for (int i = 0; i < argc; ++i) {\n    cmdout << argv[i];\n    if (i != argc - 1)\n      cmdout << \" \";\n  }\n#ifndef GALOIS_ENABLE_GPU\n  galois::runtime::reportParam(\"(NULL)\", \"CommandLine\", cmdout.str());\n  galois::runtime::reportParam(\"(NULL)\", \"Threads\", numThreads);\n  galois::runtime::reportParam(\"(NULL)\", \"Runs\", num_trials);\n  galois::runtime::reportParam(\"(NULL)\", \"Input\", inputFile);\n  galois::runtime::reportParam(\"(NULL)\", \"Hosts\", 1);\n#endif\n}\n"
  },
  {
    "path": "lonestar/scientific/CMakeLists.txt",
    "content": "add_subdirectory(cpu)\nif(GALOIS_ENABLE_GPU)\n  add_subdirectory(gpu)\nendif()\n"
  },
  {
    "path": "lonestar/scientific/cpu/CMakeLists.txt",
    "content": "add_subdirectory(barneshut)\nadd_subdirectory(delaunayrefinement)\nadd_subdirectory(delaunaytriangulation)\nadd_subdirectory(longestedge)\n"
  },
  {
    "path": "lonestar/scientific/cpu/barneshut/Barneshut.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Reduction.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include \"galois/runtime/Profile.h\"\n\n#include <boost/math/constants/constants.hpp>\n#include <boost/iterator/transform_iterator.hpp>\n\n#include <array>\n#include <limits>\n#include <iostream>\n#include <fstream>\n#include <random>\n#include <deque>\n\n#include <strings.h>\n\n#include \"Point.h\"\n\nconst char* name = \"Barnes-Hut N-Body Simulator\";\nconst char* desc =\n    \"Simulates gravitational forces in a galactic cluster using the \"\n    \"Barnes-Hut n-body algorithm\";\nconst char* url = \"barneshut\";\n\nstatic llvm::cl::opt<int>\n    nbodies(\"n\", llvm::cl::desc(\"Number of bodies (default value 10000)\"),\n            
llvm::cl::init(10000));\nstatic llvm::cl::opt<int>\n    ntimesteps(\"steps\", llvm::cl::desc(\"Number of steps (default value 1)\"),\n               llvm::cl::init(1));\nstatic llvm::cl::opt<int> seed(\"seed\",\n                               llvm::cl::desc(\"Random seed (default value 7)\"),\n                               llvm::cl::init(7));\n\nstruct Node {\n  Point pos;\n  double mass;\n  bool Leaf;\n};\n\nstruct Body : public Node {\n  Point vel;\n  Point acc;\n};\n\n/**\n * A node in an octree is either an internal node or a leaf.\n */\nstruct Octree : public Node {\n  std::array<galois::substrate::PtrLock<Node>, 8> child;\n  char cLeafs;\n  char nChildren;\n\n  Octree(const Point& p) {\n    Node::pos  = p;\n    Node::Leaf = false;\n    cLeafs     = 0;\n    nChildren  = 0;\n  }\n};\n\nstd::ostream& operator<<(std::ostream& os, const Body& b) {\n  os << \"(pos:\" << b.pos << \" vel:\" << b.vel << \" acc:\" << b.acc\n     << \" mass:\" << b.mass << \")\";\n  return os;\n}\n\nstruct BoundingBox {\n  Point min;\n  Point max;\n  explicit BoundingBox(const Point& p) : min(p), max(p) {}\n  BoundingBox()\n      : min(std::numeric_limits<double>::max()),\n        max(std::numeric_limits<double>::min()) {}\n\n  BoundingBox merge(const BoundingBox& other) const {\n    BoundingBox copy(*this);\n\n    copy.min.pairMin(other.min);\n    copy.max.pairMax(other.max);\n    return copy;\n  }\n\n  double diameter() const { return (max - min).minDim(); }\n  double radius() const { return diameter() * 0.5; }\n  Point center() const { return (min + max) * 0.5; }\n};\n\nstd::ostream& operator<<(std::ostream& os, const BoundingBox& b) {\n  os << \"(min:\" << b.min << \" max:\" << b.max << \")\";\n  return os;\n}\n\nstruct Config {\n  const double dtime; // length of one time step\n  const double eps;   // potential softening parameter\n  const double tol;   // tolerance for stopping recursion, <0.57 to bound error\n  const double dthf, epssq, itolsq;\n  Config()\n      : dtime(0.5), 
eps(0.05), tol(0.05), // 0.025),\n        dthf(dtime * 0.5), epssq(eps * eps), itolsq(1.0 / (tol * tol)) {}\n};\n\nstd::ostream& operator<<(std::ostream& os, const Config& c) {\n  os << \"Barnes-Hut configuration:\"\n     << \" dtime: \" << c.dtime << \" eps: \" << c.eps << \" tol: \" << c.tol;\n  return os;\n}\n\nConfig config;\n\ninline int getIndex(const Point& a, const Point& b) {\n  int index = 0;\n  for (int i = 0; i < 3; ++i)\n    if (a[i] < b[i])\n      index += (1 << i);\n  return index;\n}\n\ninline Point updateCenter(Point v, int index, double radius) {\n  for (int i = 0; i < 3; i++)\n    v[i] += (index & (1 << i)) > 0 ? radius : -radius;\n  return v;\n}\n\ntypedef galois::InsertBag<Body> Bodies;\ntypedef galois::InsertBag<Body*> BodyPtrs;\n// FIXME: reclaim memory for multiple steps\ntypedef galois::InsertBag<Octree> Tree;\n\nstruct BuildOctree {\n\n  Tree& T;\n\n  void insert(Body* b, Octree* node, double radius) const {\n    int index   = getIndex(node->pos, b->pos);\n    Node* child = node->child[index].getValue();\n\n    // go through the tree lock-free while we can\n    if (child && !child->Leaf) {\n      insert(b, static_cast<Octree*>(child), radius);\n      return;\n    }\n\n    node->child[index].lock();\n    child = node->child[index].getValue();\n\n    if (child == NULL) {\n      node->child[index].unlock_and_set(b);\n      return;\n    }\n\n    radius *= 0.5;\n    if (child->Leaf) {\n      // Expand leaf\n\n      Octree* new_node = &T.emplace(updateCenter(node->pos, index, radius));\n      if (b->pos == child->pos) {\n        // Jitter point to gaurantee uniqueness.\n        double jitter = config.tol / 2;\n        assert(jitter < radius);\n        b->pos += (new_node->pos - b->pos) * jitter;\n      }\n\n      // assert(node->pos != b->pos);\n      // node->child[index].unlock_and_set(new_node);\n      insert(b, new_node, radius);\n      insert(static_cast<Body*>(child), new_node, radius);\n      node->child[index].unlock_and_set(new_node);\n 
   } else {\n      node->child[index].unlock();\n      insert(b, static_cast<Octree*>(child), radius);\n    }\n  }\n};\n\nunsigned computeCenterOfMass(Octree* node) {\n  double mass = 0.0;\n  Point accum;\n  unsigned num = 1;\n\n  // Reorganize leaves to be dense\n  // remove copies values\n  int index = 0;\n  for (int i = 0; i < 8; ++i)\n    if (node->child[i].getValue())\n      node->child[index++].setValue(node->child[i].getValue());\n  for (int i = index; i < 8; ++i)\n    node->child[i].setValue(NULL);\n  node->nChildren = index;\n\n  for (int i = 0; i < index; i++) {\n    Node* child = node->child[i].getValue();\n    if (!child->Leaf) {\n      num += computeCenterOfMass(static_cast<Octree*>(child));\n    } else {\n      node->cLeafs |= (1 << i);\n      ++num;\n    }\n    mass += child->mass;\n    accum += child->pos * child->mass;\n  }\n\n  node->mass = mass;\n\n  if (mass > 0.0)\n    node->pos = accum / mass;\n  return num;\n}\n\n/*\nvoid printRec(std::ofstream& file, Node* node, unsigned level) {\n  static const char* ct[] = {\n    \"blue\", \"cyan\", \"aquamarine\", \"chartreuse\",\n    \"darkorchid\", \"darkorange\",\n    \"deeppink\", \"gold\", \"chocolate\"\n  };\n  if (!node) return;\n  file << \"\\\"\" << node << \"\\\" [color=\" << ct[node->owner / 4] << (node->owner %\n4 + 1) << (level ? \"\" : \" style=filled\") << \" label = \\\"\" << (node->Leaf ? 
\"L\" :\n\"N\") << \"\\\"];\\n\"; if (!node->Leaf) { Octree* node2 = static_cast<Octree*>(node);\n    for (int i = 0; i < 8 && node2->child[i]; ++i) {\n      if (level == 3 || level == 6)\n        file << \"subgraph cluster_\" << level << \"_\" << i << \" {\\n\";\n      file << \"\\\"\" << node << \"\\\" -> \\\"\" << node2->child[i] << \"\\\"\n[weight=0.01]\\n\"; printRec(file, node2->child[i], level + 1); if (level == 3 ||\nlevel == 6) file << \"}\\n\";\n    }\n  }\n}\n\nvoid printTree(Octree* node) {\n  std::ofstream file(\"out.txt\");\n  file << \"digraph octree {\\n\";\n  file << \"ranksep = 2\\n\";\n  file << \"root = \\\"\" << node << \"\\\"\\n\";\n  //  file << \"overlap = scale\\n\";\n  printRec(file, node, 0);\n  file << \"}\\n\";\n}\n*/\n\nPoint updateForce(Point delta, double psq, double mass) {\n  // Computing force += delta * mass * (|delta|^2 + eps^2)^{-3/2}\n  double idr   = 1 / sqrt((float)(psq + config.epssq));\n  double scale = mass * idr * idr * idr;\n  return delta * scale;\n}\n\nstruct ComputeForces {\n  // Optimize runtime for no conflict case\n\n  Octree* top;\n  double root_dsq;\n\n  ComputeForces(Octree* _top, double diameter) : top(_top) {\n    assert(diameter > 0.0 && \"non positive diameter of bb\");\n    root_dsq = diameter * diameter * config.itolsq;\n  }\n\n  template <typename Context>\n  void computeForce(Body* b, Context& cnx) {\n    Point p = b->acc;\n    b->acc  = Point(0.0, 0.0, 0.0);\n    iterate(*b, cnx);\n    b->vel += (b->acc - p) * config.dthf;\n  }\n\n  struct Frame {\n    double dsq;\n    Octree* node;\n    Frame(Octree* _node, double _dsq) : dsq(_dsq), node(_node) {}\n  };\n\n  template <typename Context>\n  void iterate(Body& b, Context& cnx) {\n    std::deque<Frame, galois::PerIterAllocTy::rebind<Frame>::other> stack(\n        cnx.getPerIterAlloc());\n    stack.push_back(Frame(top, root_dsq));\n\n    while (!stack.empty()) {\n      const Frame f = stack.back();\n      stack.pop_back();\n\n      Point p    = b.pos - 
f.node->pos;\n      double psq = p.dist2();\n\n      // Node is far enough away, summarize contribution\n      if (psq >= f.dsq) {\n        b.acc += updateForce(p, psq, f.node->mass);\n        continue;\n      }\n\n      double dsq = f.dsq * 0.25;\n      for (int i = 0; i < f.node->nChildren; i++) {\n        Node* n = f.node->child[i].getValue();\n        assert(n);\n        if (f.node->cLeafs & (1 << i)) {\n          assert(n->Leaf);\n          if (static_cast<const Node*>(&b) != n) {\n            Point p = b.pos - n->pos;\n            b.acc += updateForce(p, p.dist2(), n->mass);\n          }\n        } else {\n#ifndef GALOIS_CXX11_DEQUE_HAS_NO_EMPLACE\n          stack.emplace_back(static_cast<Octree*>(n), dsq);\n#else\n          stack.push_back(Frame(static_cast<Octree*>(n), dsq));\n#endif\n          __builtin_prefetch(n);\n        }\n      }\n    }\n  }\n};\n\nstruct centerXCmp {\n  template <typename T>\n  bool operator()(const T& lhs, const T& rhs) const {\n    return lhs.pos[0] < rhs.pos[0];\n  }\n};\n\nstruct centerYCmp {\n  template <typename T>\n  bool operator()(const T& lhs, const T& rhs) const {\n    return lhs.pos[1] < rhs.pos[1];\n  }\n};\n\nstruct centerYCmpInv {\n  template <typename T>\n  bool operator()(const T& lhs, const T& rhs) const {\n    return rhs.pos[1] < lhs.pos[1];\n  }\n};\n\ntemplate <typename Iter, typename Gen>\nvoid divide(const Iter& b, const Iter& e, Gen& gen) {\n  if (std::distance(b, e) > 32) {\n    std::sort(b, e, centerXCmp());\n    Iter m = galois::split_range(b, e);\n    std::sort(b, m, centerYCmpInv());\n    std::sort(m, e, centerYCmp());\n    divide(b, galois::split_range(b, m), gen);\n    divide(galois::split_range(b, m), m, gen);\n    divide(m, galois::split_range(m, e), gen);\n    divide(galois::split_range(m, e), e, gen);\n  } else {\n    std::shuffle(b, e, gen);\n  }\n}\n\n/**\n * Generates random input according to the Plummer model, which is more\n * realistic but perhaps not so much so according to 
astrophysicists\n */\nvoid generateInput(Bodies& bodies, BodyPtrs& pBodies, int nbodies, int seed) {\n  double v, sq, scale;\n  Point p;\n  double PI = boost::math::constants::pi<double>();\n\n  std::mt19937 gen(seed);\n#if __cplusplus >= 201103L || defined(HAVE_CXX11_UNIFORM_INT_DISTRIBUTION)\n  std::uniform_real_distribution<double> dist(0, 1);\n#else\n  std::uniform_real<double> dist(0, 1);\n#endif\n\n  double rsc = (3 * PI) / 16;\n  double vsc = sqrt(1.0 / rsc);\n\n  std::vector<Body> tmp;\n\n  for (int body = 0; body < nbodies; body++) {\n    double r = 1.0 / sqrt(pow(dist(gen) * 0.999, -2.0 / 3.0) - 1);\n    do {\n      for (int i = 0; i < 3; i++)\n        p[i] = dist(gen) * 2.0 - 1.0;\n      sq = p.dist2();\n    } while (sq > 1.0);\n    scale = rsc * r / sqrt(sq);\n\n    Body b;\n    b.mass = 1.0 / nbodies;\n    b.pos  = p * scale;\n    do {\n      p[0] = dist(gen);\n      p[1] = dist(gen) * 0.1;\n    } while (p[1] > p[0] * p[0] * pow(1 - p[0] * p[0], 3.5));\n    v = p[0] * sqrt(2.0 / sqrt(1 + r * r));\n    do {\n      for (int i = 0; i < 3; i++)\n        p[i] = dist(gen) * 2.0 - 1.0;\n      sq = p.dist2();\n    } while (sq > 1.0);\n    scale  = vsc * v / sqrt(sq);\n    b.vel  = p * scale;\n    b.Leaf = true;\n    tmp.push_back(b);\n    // pBodies.push_back(&bodies.push_back(b));\n  }\n\n  // sort and copy out\n  divide(tmp.begin(), tmp.end(), gen);\n\n  galois::do_all(\n      galois::iterate(tmp),\n      [&pBodies, &bodies](const Body& b) {\n        pBodies.push_back(&(bodies.push_back(b)));\n      },\n      galois::loopname(\"InsertBody\"));\n}\n\nstruct CheckAllPairs {\n  Bodies& bodies;\n\n  CheckAllPairs(Bodies& b) : bodies(b) {}\n\n  double operator()(const Body& body) const {\n    const Body* me = &body;\n    Point acc;\n    for (Bodies::iterator ii = bodies.begin(), ei = bodies.end(); ii != ei;\n         ++ii) {\n      Body* b = &*ii;\n      if (me == b)\n        continue;\n      Point delta = me->pos - b->pos;\n      double psq  = delta.dist2();\n   
   acc += updateForce(delta, psq, b->mass);\n    }\n\n    double dist2 = acc.dist2();\n    acc -= me->acc;\n    double retval = acc.dist2() / dist2;\n    return retval;\n  }\n};\n\ndouble checkAllPairs(Bodies& bodies, int N) {\n  Bodies::iterator end(bodies.begin());\n  std::advance(end, N);\n\n  return galois::ParallelSTL::map_reduce(bodies.begin(), end,\n                                         CheckAllPairs(bodies),\n                                         std::plus<double>(), 0.0) /\n         N;\n}\n\nvoid run(Bodies& bodies, BodyPtrs& pBodies, size_t nbodies) {\n  typedef galois::worklists::StableIterator<true> WLL;\n\n  galois::preAlloc(galois::getActiveThreads() +\n                   (3 * sizeof(Octree) + 2 * sizeof(Body)) * nbodies /\n                       galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  for (int step = 0; step < ntimesteps; step++) {\n\n    auto mergeBoxes = [](const BoundingBox& lhs, const BoundingBox& rhs) {\n      return lhs.merge(rhs);\n    };\n\n    auto identity = []() { return BoundingBox(); };\n\n    // Do tree building sequentially\n    auto boxes = galois::make_reducible(mergeBoxes, identity);\n\n    galois::do_all(\n        galois::iterate(pBodies),\n        [&boxes](const Body* b) { boxes.update(BoundingBox(b->pos)); },\n        galois::loopname(\"reduceBoxes\"));\n\n    BoundingBox box = boxes.reduce();\n\n    Tree t;\n    BuildOctree treeBuilder{t};\n    Octree& top = t.emplace(box.center());\n\n    galois::StatTimer T_build(\"BuildTime\");\n    T_build.start();\n    galois::do_all(\n        galois::iterate(pBodies),\n        [&](Body* body) { treeBuilder.insert(body, &top, box.radius()); },\n        galois::loopname(\"BuildTree\"));\n    T_build.stop();\n\n    // update centers of mass in tree\n    galois::timeThis(\n        [&](void) {\n          unsigned size = computeCenterOfMass(&top);\n          // printTree(&top);\n          std::cout << \"Tree Size: \" << size << \"\\n\";\n        
},\n        \"summarize-Serial\");\n\n    ComputeForces cf(&top, box.diameter());\n\n    galois::StatTimer T_compute(\"ComputeTime\");\n    T_compute.start();\n    galois::for_each(\n        galois::iterate(pBodies),\n        [&](Body* b, auto& cnx) { cf.computeForce(b, cnx); },\n        galois::loopname(\"compute\"), galois::wl<WLL>(),\n        galois::disable_conflict_detection(), galois::no_pushes(),\n        galois::per_iter_alloc());\n    T_compute.stop();\n\n    if (!skipVerify) {\n      galois::timeThis(\n          [&](void) {\n            std::cout << \"MSE (sampled) \"\n                      << checkAllPairs(bodies, std::min((int)nbodies, 100))\n                      << \"\\n\";\n          },\n          \"checkAllPairs\");\n    }\n    // Done in compute forces\n    galois::do_all(\n        galois::iterate(pBodies),\n        [](Body* b) {\n          Point dvel(b->acc);\n          dvel *= config.dthf;\n          Point velh(b->vel);\n          velh += dvel;\n          b->pos += velh * config.dtime;\n          b->vel = velh + dvel;\n        },\n        galois::loopname(\"advance\"));\n\n    std::cout << \"Timestep \" << step << \" Center of Mass = \";\n    std::ios::fmtflags flags =\n        std::cout.setf(std::ios::showpos | std::ios::right |\n                       std::ios::scientific | std::ios::showpoint);\n    std::cout << top.pos;\n    std::cout.flags(flags);\n    std::cout << \"\\n\";\n  }\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, nullptr);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  std::cout << config << \"\\n\";\n  std::cout << nbodies << \" bodies, \" << ntimesteps << \" time steps\\n\";\n\n  Bodies bodies;\n  BodyPtrs pBodies;\n  generateInput(bodies, pBodies, nbodies, seed);\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  run(bodies, pBodies, nbodies);\n  execTime.stop();\n\n  
totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/barneshut/CMakeLists.txt",
    "content": "add_executable(barneshut-cpu Barneshut.cpp)\nadd_dependencies(apps barneshut-cpu)\ntarget_link_libraries(barneshut-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS barneshut-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nif(CMAKE_COMPILER_IS_GNUCC)\n  target_compile_options(barneshut-cpu PRIVATE -ffast-math)\nendif()\n\nadd_test_scale(small barneshut-cpu -n 10000 -steps 1 -seed 0)\n"
  },
  {
    "path": "lonestar/scientific/cpu/barneshut/Point.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\nstruct Point {\n  double val[3];\n  Point() { val[0] = val[1] = val[2] = 0.0; }\n  // Point(double _x, double _y, double _z) : val{_x,_y,_z} {}\n  Point(double _x, double _y, double _z) {\n    val[0] = _x;\n    val[1] = _y;\n    val[2] = _z;\n  }\n  // explicit Point(double v) : val{v,v,v} {}\n  explicit Point(double v) {\n    val[0] = v;\n    val[1] = v;\n    val[2] = v;\n  }\n\n  double operator[](const int index) const { return val[index]; }\n\n  double& operator[](const int index) { return val[index]; }\n\n  double x() const { return val[0]; }\n\n  double y() const { return val[1]; }\n\n  double z() const { return val[2]; }\n\n  bool operator==(const Point& other) const {\n    return val[0] == other.val[0] && val[1] == other.val[1] &&\n           val[2] == other.val[2];\n  }\n\n  bool operator!=(const Point& 
other) const { return !operator==(other); }\n\n  Point& operator+=(const Point& other) {\n    for (int i = 0; i < 3; ++i)\n      val[i] += other.val[i];\n    return *this;\n  }\n\n  Point& operator-=(const Point& other) {\n    for (int i = 0; i < 3; ++i)\n      val[i] -= other.val[i];\n    return *this;\n  }\n\n  Point& operator*=(double value) {\n    for (int i = 0; i < 3; ++i)\n      val[i] *= value;\n    return *this;\n  }\n\n  Point operator-(const Point& other) const {\n    return Point(val[0] - other.val[0], val[1] - other.val[1],\n                 val[2] - other.val[2]);\n  }\n\n  Point operator+(const Point& other) const {\n    return Point(val[0] + other.val[0], val[1] + other.val[1],\n                 val[2] + other.val[2]);\n  }\n\n  Point operator*(double d) const {\n    return Point(val[0] * d, val[1] * d, val[2] * d);\n  }\n\n  Point operator/(double d) const {\n    return Point(val[0] / d, val[1] / d, val[2] / d);\n  }\n\n  double dist2() const { return dot(*this); }\n\n  double dot(const Point& p2) const {\n    return val[0] * p2.val[0] + val[1] * p2.val[1] + val[2] * p2.val[2];\n  }\n\n  void pairMin(const Point& p2) {\n    for (int i = 0; i < 3; ++i)\n      if (p2.val[i] < val[i])\n        val[i] = p2.val[i];\n  }\n\n  void pairMax(const Point& p2) {\n    for (int i = 0; i < 3; ++i)\n      if (p2.val[i] > val[i])\n        val[i] = p2.val[i];\n  }\n\n  double minDim() const { return std::min(val[0], std::min(val[1], val[2])); }\n};\n\nstd::ostream& operator<<(std::ostream& os, const Point& p) {\n  os << \"(\" << p[0] << \",\" << p[1] << \",\" << p[2] << \")\";\n  return os;\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/barneshut/README.md",
    "content": "Barnes Hut\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program performs an N-body simulation using the Barnes-Hut algorithm.\n\nThe simulation proceeds in rounds (specified via -steps), where in every round,\nit creates an Oct-Tree of the bodies (specified via -n) and performs force\ncomputation between all pairs of bodies while traversing the Oct-Tree. \n\nINPUT\n--------------------------------------------------------------------------------\n\nThe input is a randomly generated set of bodies (using the Plummer model),\ncreated when the program runs.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake in the BUILD directory (refer to the top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/scientific/cpu/barneshut; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./barneshut-cpu -n 12345 -t 40`\n- `$ ./barneshut-cpu -n 12345 -steps 100 -t 40`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\n* CHUNK_SIZE needs to be tuned for the machine and input. \n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/CMakeLists.txt",
    "content": "add_executable(delaunayrefinement-cpu DelaunayRefinement.cpp)\nadd_dependencies(apps delaunayrefinement-cpu)\ntarget_link_libraries(delaunayrefinement-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS delaunayrefinement-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nif(CMAKE_COMPILER_IS_GNUCC)\n  target_compile_options(delaunayrefinement-cpu PRIVATE -ffast-math)\nendif()\n\nadd_test_scale(small1 delaunayrefinement-cpu -meshGraph \"${BASEINPUT}/reference/meshes/r10k.1\")\nadd_test_scale(small2 delaunayrefinement-cpu -meshGraph \"${BASEINPUT}/meshes/250k.2\" NOT_QUICK)\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/Cavity.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include <vector>\n#include <algorithm>\n\nclass Cavity {\n  //! [STL vector using PerIterAllocTy]\n  typedef std::vector<EdgeTuple,\n                      galois::PerIterAllocTy::rebind<EdgeTuple>::other>\n      ConnTy;\n  //! 
[STL vector using PerIterAllocTy]\n\n  Tuple center;\n  GNode centerNode;\n  std::vector<GNode, galois::PerIterAllocTy::rebind<GNode>::other> frontier;\n  // !the cavity itself\n  PreGraph pre;\n  // !what the new elements should look like\n  PostGraph post;\n  // the edge-relations that connect the boundary to the cavity\n  ConnTy connections;\n  Element* centerElement;\n  Graph* graph;\n  int dim;\n\n  /**\n   * find the node that is opposite the obtuse angle of the element\n   */\n  GNode getOpposite(GNode node) {\n    assert(std::distance(graph->edge_begin(node), graph->edge_end(node)) == 3);\n    Element& element   = graph->getData(node, galois::MethodFlag::WRITE);\n    Tuple elementTuple = element.getObtuse();\n    for (Graph::edge_iterator\n             ii = graph->edge_begin(node, galois::MethodFlag::WRITE),\n             ee = graph->edge_end(node, galois::MethodFlag::WRITE);\n         ii != ee; ++ii) {\n      GNode neighbor = graph->getEdgeDst(ii);\n      // Edge& edgeData = graph->getEdgeData(node, neighbor);\n      Edge edgeData = element.getRelatedEdge(\n          graph->getData(neighbor, galois::MethodFlag::WRITE));\n      if (elementTuple != edgeData.getPoint(0) &&\n          elementTuple != edgeData.getPoint(1)) {\n        return neighbor;\n      }\n    }\n    GALOIS_DIE(\"unreachable\");\n    return node;\n  }\n\n  void expand(GNode node, GNode next) {\n    Element& nextElement = graph->getData(next, galois::MethodFlag::WRITE);\n    if ((!(dim == 2 && nextElement.dim() == 2 && next != centerNode)) &&\n        nextElement.inCircle(center)) {\n      // isMember says next is part of the cavity, and we're not the second\n      // segment encroaching on this cavity\n      if ((nextElement.dim() == 2) && (dim != 2)) {\n        // is segment, and we are encroaching\n        initialize(next);\n        build();\n      } else {\n        if (!pre.containsNode(next)) {\n          pre.addNode(next);\n          frontier.push_back(next);\n        }\n      }\n    } 
else {\n      // not a member\n      // Edge& edgeData = graph->getEdgeData(node, next);\n      Edge edgeData = nextElement.getRelatedEdge(\n          graph->getData(node, galois::MethodFlag::WRITE));\n      EdgeTuple edge(node, next, edgeData);\n      if (std::find(connections.begin(), connections.end(), edge) ==\n          connections.end()) {\n        connections.push_back(edge);\n      }\n    }\n  }\n\npublic:\n  Cavity(Graph* g, galois::PerIterAllocTy& cnx)\n      : frontier(cnx), pre(cnx), post(cnx), connections(cnx), graph(g) {}\n\n  void initialize(GNode node) {\n    pre.reset();\n    post.reset();\n    connections.clear();\n    frontier.clear();\n    centerNode    = node;\n    centerElement = &graph->getData(centerNode, galois::MethodFlag::WRITE);\n    while (graph->containsNode(centerNode, galois::MethodFlag::WRITE) &&\n           centerElement->isObtuse()) {\n      centerNode    = getOpposite(centerNode);\n      centerElement = &graph->getData(centerNode, galois::MethodFlag::WRITE);\n    }\n    center = centerElement->getCenter();\n    dim    = centerElement->dim();\n    pre.addNode(centerNode);\n    frontier.push_back(centerNode);\n  }\n\n  void build() {\n    while (!frontier.empty()) {\n      GNode curr = frontier.back();\n      frontier.pop_back();\n      for (Graph::edge_iterator\n               ii = graph->edge_begin(curr, galois::MethodFlag::WRITE),\n               ee = graph->edge_end(curr, galois::MethodFlag::WRITE);\n           ii != ee; ++ii) {\n        GNode neighbor = graph->getEdgeDst(ii);\n        expand(curr, neighbor);\n      }\n    }\n  }\n\n  /**\n   * Create the new cavity based on the data of the old one\n   */\n  void computePost() {\n    if (centerElement->dim() == 2) { // we built around a segment\n      GNode n1 = graph->createNode(Element(center, centerElement->getPoint(0)));\n      GNode n2 = graph->createNode(Element(center, centerElement->getPoint(1)));\n\n      post.addNode(n1);\n      post.addNode(n2);\n    }\n\n    for 
(ConnTy::iterator ii = connections.begin(), ee = connections.end();\n         ii != ee; ++ii) {\n      EdgeTuple tuple = *ii;\n      Element newElement(center, tuple.data.getPoint(0),\n                         tuple.data.getPoint(1));\n      GNode other = pre.containsNode(tuple.dst) ? tuple.src : tuple.dst;\n      Element& otherElement = graph->getData(other, galois::MethodFlag::WRITE);\n\n      GNode newNode         = graph->createNode(newElement); // XXX\n      const Edge& otherEdge = newElement.getRelatedEdge(otherElement);\n      post.addEdge(newNode, other, otherEdge);\n\n      for (PostGraph::iterator ii = post.begin(), ee = post.end(); ii != ee;\n           ++ii) {\n        GNode node       = *ii;\n        Element& element = graph->getData(node, galois::MethodFlag::WRITE);\n        if (element.isRelated(newElement)) {\n          const Edge& edge = newElement.getRelatedEdge(element);\n          post.addEdge(newNode, node, edge);\n        }\n      }\n      post.addNode(newNode);\n    }\n  }\n\n  void update(GNode node, galois::UserContext<GNode>& ctx) {\n    for (PreGraph::iterator ii = pre.begin(), ee = pre.end(); ii != ee; ++ii)\n      graph->removeNode(*ii, galois::MethodFlag::UNPROTECTED);\n\n    // add new data\n    for (PostGraph::iterator ii = post.begin(), ee = post.end(); ii != ee;\n         ++ii) {\n      GNode n = *ii;\n      graph->addNode(n, galois::MethodFlag::UNPROTECTED);\n      Element& element = graph->getData(n, galois::MethodFlag::UNPROTECTED);\n      if (element.isBad()) {\n        ctx.push(n);\n      }\n    }\n\n    for (PostGraph::edge_iterator ii = post.edge_begin(), ee = post.edge_end();\n         ii != ee; ++ii) {\n      EdgeTuple edge = *ii;\n      graph->addEdge(edge.src, edge.dst, galois::MethodFlag::UNPROTECTED);\n    }\n\n    if (graph->containsNode(node, galois::MethodFlag::UNPROTECTED)) {\n      ctx.push(node);\n    }\n  }\n};\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/DelaunayRefinement.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Mesh.h\"\n#include \"Cavity.h\"\n#include \"Verifier.h\"\n\n#include \"galois/Galois.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Timer.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <iostream>\n#include <string.h>\n#include <cassert>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Delaunay Mesh Refinement\";\nstatic const char* desc = \"Refines a Delaunay triangulation mesh such that no \"\n                          \"angle in the mesh is less than 30 degrees\";\nstatic const char* url = \"delaunay_mesh_refinement\";\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\n\nenum DetAlgo { nondet, detBase, detPrefix, detDisjoint };\n\nstatic 
cll::opt<DetAlgo>\n    detAlgo(cll::desc(\"Deterministic schedule (default value nondet):\"),\n            cll::values(clEnumVal(nondet, \"Non-deterministic\"),\n                        clEnumVal(detBase, \"Base execution\"),\n                        clEnumVal(detPrefix, \"Prefix execution\"),\n                        clEnumVal(detDisjoint, \"Disjoint execution\")),\n            cll::init(nondet));\n\n//! Flag that forces user to be aware that they should be passing in a\n//! mesh graph.\nstatic cll::opt<bool>\n    meshGraph(\"meshGraph\", cll::desc(\"Specify that the input graph is a mesh\"),\n              cll::init(false));\n\ntemplate <typename WL, int Version = detBase>\nvoid refine(galois::InsertBag<GNode>& initialBad, Graph& graph) {\n\n  struct LocalState {\n    Cavity cav;\n    LocalState(Graph& graph, galois::PerIterAllocTy& alloc)\n        : cav(&graph, alloc) {}\n  };\n\n  //! [for_each example]\n  galois::for_each(\n      galois::iterate(initialBad),\n      [&](GNode item, auto& ctx) {\n        if (!graph.containsNode(item, galois::MethodFlag::WRITE))\n          return;\n\n        if (Version == detDisjoint) {\n\n          if (ctx.isFirstPass()) {\n            LocalState* localState = ctx.template createLocalState<LocalState>(\n                graph, ctx.getPerIterAlloc());\n            localState->cav.initialize(item);\n            localState->cav.build();\n            localState->cav.computePost();\n          } else {\n            LocalState* localState = ctx.template getLocalState<LocalState>();\n            localState->cav.update(item, ctx);\n          }\n\n          return;\n        } else {\n          //! [Accessing Per Iteration Allocator in DMR]\n          Cavity cav(&graph, ctx.getPerIterAlloc());\n          //! 
[Accessing Per Iteration Allocator in DMR]\n          cav.initialize(item);\n          cav.build();\n          cav.computePost();\n          if (Version == detPrefix)\n            return;\n          ctx.cautiousPoint();\n          cav.update(item, ctx);\n        }\n      },\n      galois::loopname(\"refine\"), galois::wl<WL>(), galois::per_iter_alloc(),\n      galois::local_state<LocalState>());\n\n  //! [for_each example]\n}\n\ntemplate <typename Loop>\nvoid findBad(Graph& graph, galois::InsertBag<GNode>& initialBad,\n             const Loop& loop) {\n  loop(\n      galois::iterate(graph),\n      [&](GNode item) {\n        if (graph.getData(item, galois::MethodFlag::UNPROTECTED).isBad()) {\n          initialBad.push(item);\n        }\n      },\n      galois::loopname(\"findBad\"));\n}\n\n/*\nstruct DetLessThan {\n  bool operator()(const GNode& a, const GNode& b) const {\n    int idA = graph.getData(a, galois::MethodFlag::UNPROTECTED).getId();\n    int idB = graph.getData(b, galois::MethodFlag::UNPROTECTED).getId();\n    if (idA == 0 || idB == 0) abort();\n    return idA < idB;\n  }\n};\n*/\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (!meshGraph) {\n    GALOIS_DIE(\"This application requires a mesh graph input;\"\n               \" please use the -meshGraph flag \"\n               \" to indicate the input is a mesh graph.\");\n  }\n\n  Graph graph;\n  {\n    Mesh m;\n    m.read(graph, inputFile.c_str(), detAlgo == nondet);\n    Verifier v;\n    if (!skipVerify && !v.verify(graph)) {\n      GALOIS_DIE(\"bad input mesh\");\n    }\n  }\n  std::cout << \"configuration: \" << std::distance(graph.begin(), graph.end())\n            << \" total triangles, \"\n            << std::count_if(graph.begin(), graph.end(), is_bad(graph))\n            << \" bad triangles\\n\";\n\n  galois::reportPageAlloc(\"MeminfoPre1\");\n  
// Tighter upper bound for pre-alloc, useful for machines with limited memory,\n  // e.g., Intel MIC. May not be enough for deterministic execution\n  constexpr size_t NODE_SIZE = sizeof(**graph.begin());\n  galois::preAlloc(5 * galois::getActiveThreads() +\n                   NODE_SIZE * 32 * graph.size() /\n                       galois::runtime::pagePoolSize());\n\n  galois::reportPageAlloc(\"MeminfoPre2\");\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n\n  galois::InsertBag<GNode> initialBad;\n\n  if (detAlgo == nondet) {\n    findBad(graph, initialBad, galois::DoAll());\n  } else {\n    findBad(graph, initialBad, galois::StdForEach());\n  }\n\n  galois::reportPageAlloc(\"MeminfoMid\");\n\n  galois::StatTimer Trefine(\"refine\");\n  Trefine.start();\n  using namespace galois::worklists;\n\n  typedef Deterministic<> DWL;\n  typedef PerThreadChunkLIFO<32> Chunk;\n\n  switch (detAlgo) {\n  case nondet:\n    refine<Chunk>(initialBad, graph);\n    break;\n  case detBase:\n    refine<DWL>(initialBad, graph);\n    break;\n  case detPrefix:\n    refine<DWL, detPrefix>(initialBad, graph);\n    break;\n  case detDisjoint:\n    refine<DWL, detDisjoint>(initialBad, graph);\n    break;\n  default:\n    std::cerr << \"Unknown algorithm\" << detAlgo << \"\\n\";\n    abort();\n  }\n  Trefine.stop();\n  execTime.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  if (!skipVerify) {\n    int size = galois::ParallelSTL::count_if(graph.begin(), graph.end(),\n                                             is_bad(graph));\n    if (size != 0) {\n      GALOIS_DIE(\"bad triangles remaining\");\n    }\n    Verifier v;\n    if (!v.verify(graph)) {\n      GALOIS_DIE(\"refinement failed\");\n    }\n    std::cout << std::distance(graph.begin(), graph.end())\n              << \" total triangles\\n\";\n    std::cout << \"Refinement OK\\n\";\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/Edge.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef EDGE_H\n#define EDGE_H\n\n#include \"Tuple.h\"\n\nclass Element;\n\nclass Edge {\n  Tuple p[2];\n\npublic:\n  Edge() {}\n  Edge(const Tuple& a, const Tuple& b) {\n    if (a < b) {\n      p[0] = a;\n      p[1] = b;\n    } else {\n      p[0] = b;\n      p[1] = a;\n    }\n  }\n  Edge(const Edge& rhs) {\n    p[0] = rhs.p[0];\n    p[1] = rhs.p[1];\n  }\n\n  bool operator==(const Edge& rhs) const {\n    return p[0] == rhs.p[0] && p[1] == rhs.p[1];\n  }\n  bool operator!=(const Edge& rhs) const { return !(*this == rhs); }\n  bool operator<(const Edge& rhs) const {\n    return (p[0] < rhs.p[0]) || ((p[0] == rhs.p[0]) && (p[1] < rhs.p[1]));\n  }\n\n  bool operator>(const Edge& rhs) const {\n    return (p[0] > rhs.p[0]) || ((p[0] == rhs.p[0]) && (p[1] > rhs.p[1]));\n  }\n\n  Tuple getPoint(int i) const { return p[i]; 
}\n};\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/Element.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef _ELEMENT_H\n#define _ELEMENT_H\n\n#include \"galois/gIO.h\"\n\n#include <cassert>\n#include <stdlib.h>\n\n#include \"Edge.h\"\n\n#define MINANGLE 30.0\n\nclass Element {\n  Tuple coords[3]; // The three endpoints of the triangle\n  // if the triangle has an obtuse angle\n  // obtuse - 1 is which one\n  signed char obtuse;\n  bool bDim; // true == 3, false == 2\n  int id;\n\npublic:\n  //! 
Constructor for Triangles\n  Element(const Tuple& a, const Tuple& b, const Tuple& c, int _id = 0)\n      : obtuse(0), bDim(true), id(_id) {\n    coords[0] = a;\n    coords[1] = b;\n    coords[2] = c;\n    if (b < a || c < a) {\n      if (b < c) {\n        coords[0] = b;\n        coords[1] = c;\n        coords[2] = a;\n      } else {\n        coords[0] = c;\n        coords[1] = a;\n        coords[2] = b;\n      }\n    }\n    //    edges[0] = Edge(coords[0], coords[1]);\n    //    edges[1] = Edge(coords[1], coords[2]);\n    //    edges[2] = Edge(coords[2], coords[0]);\n    for (int i = 0; i < 3; i++)\n      if (angleOBCheck(i))\n        obtuse = i + 1;\n    // computeCenter();\n  }\n\n  //! Constructor for segments\n  Element(const Tuple& a, const Tuple& b, int _id = 0)\n      : obtuse(0), bDim(false), id(_id) {\n    coords[0] = a;\n    coords[1] = b;\n    if (b < a) {\n      coords[0] = b;\n      coords[1] = a;\n    }\n    // computeCenter();\n  }\n\n  Tuple getCenter() const {\n    if (dim() == 2) {\n      return (coords[0] + coords[1]) * 0.5;\n    } else {\n      const Tuple& a = coords[0];\n      const Tuple& b = coords[1];\n      const Tuple& c = coords[2];\n      Tuple x        = b - a;\n      Tuple y        = c - a;\n      double xlen    = a.distance(b);\n      double ylen    = a.distance(c);\n      double cosine  = (x * y) / (xlen * ylen);\n      double sine_sq = 1.0 - cosine * cosine;\n      double plen    = ylen / xlen;\n      double s       = plen * cosine;\n      double t       = plen * sine_sq;\n      double wp      = (plen - cosine) / (2 * t);\n      double wb      = 0.5 - (wp * s);\n      Tuple tmpval   = a * (1 - wb - wp);\n      tmpval         = tmpval + (b * wb);\n      return tmpval + (c * wp);\n    }\n  }\n\n  double get_radius_squared() const { return get_radius_squared(getCenter()); }\n\n  double get_radius_squared(const Tuple& center) const {\n    return center.distance_squared(coords[0]);\n  }\n\n  bool operator<(const Element& rhs) const {\n  
  // apparently a triangle is less than a line\n    if (dim() < rhs.dim())\n      return false;\n    if (dim() > rhs.dim())\n      return true;\n    for (int i = 0; i < dim(); i++) {\n      if (coords[i] < rhs.coords[i])\n        return true;\n      else if (coords[i] > rhs.coords[i])\n        return false;\n    }\n    return false;\n  }\n\n  /// @return if the current triangle has a common edge with e\n  bool isRelated(const Element& rhs) const {\n    int num_eq = 0;\n    for (int i = 0; i < dim(); ++i)\n      for (int j = 0; j < rhs.dim(); ++j)\n        if (coords[i] == rhs.coords[j])\n          ++num_eq;\n    return num_eq == 2;\n  }\n\n  bool inCircle(Tuple p) const {\n    Tuple center = getCenter();\n    double ds    = center.distance_squared(p);\n    return ds <= get_radius_squared(center);\n  }\n\n  void angleCheck(int i, bool& ob, bool& sm, double M) const {\n    int j = (i + 1) % dim();\n    int k = (i + 2) % dim();\n    Tuple::angleCheck(coords[j], coords[i], coords[k], ob, sm, M);\n  }\n\n  bool angleGTCheck(int i, double M) const {\n    int j = (i + 1) % dim();\n    int k = (i + 2) % dim();\n    return Tuple::angleGTCheck(coords[j], coords[i], coords[k], M);\n  }\n\n  bool angleOBCheck(int i) const {\n    int j = (i + 1) % dim();\n    int k = (i + 2) % dim();\n    return Tuple::angleOBCheck(coords[j], coords[i], coords[k]);\n  }\n\n  // Virtualize the Edges array\n  // Used only by Mesh now\n  Edge getEdge(int i) const {\n    if (i == 0)\n      return Edge(coords[0], coords[1]);\n    if (!bDim) {\n      if (i == 1)\n        return Edge(coords[1], coords[0]);\n    } else {\n      if (i == 1)\n        return Edge(coords[1], coords[2]);\n      else if (i == 2)\n        return Edge(coords[2], coords[0]);\n    }\n    GALOIS_DIE(\"unknown edge\");\n    return Edge(coords[0], coords[0]);\n  }\n\n  Edge getOppositeObtuse() const {\n    // The edge opposite the obtuse angle is the edge formed by\n    // the other indexes\n    switch (obtuse) {\n    case 1:\n     
 return getEdge(1);\n    case 2:\n      return getEdge(2);\n    case 3:\n      return getEdge(0);\n    }\n    GALOIS_DIE(\"no obtuse edge\");\n    return getEdge(0);\n  }\n\n  //! Should the node be processed?\n  bool isBad() const {\n    if (!bDim)\n      return false;\n    for (int i = 0; i < 3; i++)\n      if (angleGTCheck(i, MINANGLE))\n        return true;\n    return false;\n  }\n\n  const Tuple& getPoint(int i) const { return coords[i]; }\n\n  const Tuple& getObtuse() const { return coords[obtuse - 1]; }\n\n  int dim() const { return bDim ? 3 : 2; }\n\n  int numEdges() const { return dim() + dim() - 3; }\n\n  bool isObtuse() const { return obtuse != 0; }\n\n  int getId() const { return id; }\n\n  /**\n   * Scans all the edges of the two elements and if it finds one that is\n   * equal, then sets this as the Edge of the EdgeRelation\n   */\n  Edge getRelatedEdge(const Element& e) const {\n    int at = 0;\n    Tuple d[2];\n    for (int i = 0; i < dim(); ++i)\n      for (int j = 0; j < e.dim(); ++j)\n        if (coords[i] == e.coords[j])\n          d[at++] = coords[i];\n    assert(at == 2);\n    return Edge(d[0], d[1]);\n  }\n\n  std::ostream& print(std::ostream& s) const {\n    s << '[';\n    for (int i = 0; i < dim(); ++i)\n      s << coords[i] << (i < (dim() - 1) ? \", \" : \"\");\n    s << ']';\n    return s;\n  }\n};\n\nstatic std::ostream& operator<<(std::ostream& s, const Element& E) {\n  return E.print(s);\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/Mesh.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef MESH_H\n#define MESH_H\n\n#include \"Subgraph.h\"\n\n#include <vector>\n#include <string>\n#include <map>\n#include <iostream>\n#include <cstdio>\n\nstruct is_bad {\n  Graph& g;\n  is_bad(Graph& _g) : g(_g) {}\n  bool operator()(const GNode& n) const {\n    return g.getData(n, galois::MethodFlag::UNPROTECTED).isBad();\n  }\n};\n\nstruct centerXCmp {\n  bool operator()(const Element& lhs, const Element& rhs) const {\n    // return lhs.getCenter() < rhs.getCenter();\n    return lhs.getPoint(0)[0] < rhs.getPoint(0)[0];\n  }\n};\n\nstruct centerYCmp {\n  bool operator()(const Element& lhs, const Element& rhs) const {\n    // return lhs.getCenter() < rhs.getCenter();\n    return lhs.getPoint(0)[1] < rhs.getPoint(0)[1];\n  }\n};\n\nstruct centerYCmpInv {\n  bool operator()(const Element& lhs, const Element& rhs) 
const {\n    // return lhs.getCenter() < rhs.getCenter();\n    return rhs.getPoint(0)[1] < lhs.getPoint(0)[1];\n  }\n};\n\n/**\n * Helper class used providing methods to read in information and create the\n * graph\n *\n */\nclass Mesh {\n  std::vector<Element> elements;\n  size_t id;\n\nprivate:\n  void checkResults(int act, int exp, std::string& str) {\n    if (act != exp) {\n      std::cerr << \"Failed read in \" << str << \"\\n\";\n      abort();\n    }\n  }\n\n  bool readNodesBin(std::string filename, std::vector<Tuple>& tuples) {\n    FILE* pFile = fopen(filename.append(\".node.bin\").c_str(), \"r\");\n    if (!pFile) {\n      return false;\n    }\n    std::cout << \"Using bin for node\\n\";\n    uint32_t ntups[4];\n    if (fread(&ntups[0], sizeof(uint32_t), 4, pFile) < 4) {\n      std::cerr << \"Malformed binary file\\n\";\n      abort();\n    }\n    tuples.resize(ntups[0]);\n    for (size_t i = 0; i < ntups[0]; i++) {\n      struct record {\n        uint32_t index;\n        double x, y, z;\n      };\n      record R;\n      if (fread(&R, sizeof(record), 1, pFile) < 1) {\n        std::cerr << \"Malformed binary file\\n\";\n        abort();\n      }\n      tuples[R.index] = Tuple(R.x, R.y);\n    }\n    fclose(pFile);\n    return true;\n  }\n\n  void readNodes(std::string filename, std::vector<Tuple>& tuples) {\n    if (readNodesBin(filename, tuples))\n      return;\n    else\n      writeNodes(filename);\n    FILE* pFile = fopen(filename.append(\".node\").c_str(), \"r\");\n    if (!pFile) {\n      std::cerr << \"Failed to load file \" << filename << \"\\n\";\n      abort();\n    }\n    unsigned ntups;\n    int r = fscanf(pFile, \"%u %*u %*u %*u\", &ntups);\n    checkResults(r, 1, filename);\n    tuples.resize(ntups);\n    for (size_t i = 0; i < ntups; i++) {\n      unsigned index;\n      double x, y;\n      r = fscanf(pFile, \"%u %lf %lf %*f\", &index, &x, &y);\n      checkResults(r, 3, filename);\n      tuples[index] = Tuple(x, y);\n    }\n    fclose(pFile);\n 
 }\n\n  void writeNodes(std::string filename) {\n    std::string filename2 = filename;\n    FILE* pFile           = fopen(filename.append(\".node\").c_str(), \"r\");\n    FILE* oFile           = fopen(filename2.append(\".node.bin\").c_str(), \"w\");\n    if (!pFile) {\n      std::cerr << \"Failed to load file \" << filename << \" (continuing)\\n\";\n      return;\n    }\n    if (!oFile) {\n      std::cerr << \"Failed to open file \" << filename2 << \" (continuing)\\n\";\n      return;\n    }\n    unsigned ntups[4];\n    int r = fscanf(pFile, \"%u %u %u %u\", &ntups[0], &ntups[1], &ntups[2],\n                   &ntups[3]);\n    checkResults(r, 4, filename);\n    uint32_t ntups32[4] = {ntups[0], ntups[1], ntups[2], ntups[3]};\n    fwrite(&ntups32[0], sizeof(uint32_t), 4, oFile);\n\n    for (size_t i = 0; i < ntups[0]; i++) {\n      struct record {\n        unsigned index;\n        double x, y, z;\n      };\n      struct recordOut {\n        uint32_t index;\n        double x, y, z;\n      };\n      record R;\n      r = fscanf(pFile, \"%u %lf %lf %lf\", &R.index, &R.x, &R.y, &R.z);\n      checkResults(r, 4, filename);\n      recordOut R2 = {R.index, R.x, R.y, R.z};\n      fwrite(&R2, sizeof(recordOut), 1, oFile);\n    }\n    fclose(pFile);\n    fclose(oFile);\n  }\n\n  bool readElementsBin(std::string filename, std::vector<Tuple>& tuples) {\n    FILE* pFile = fopen(filename.append(\".ele.bin\").c_str(), \"r\");\n    if (!pFile) {\n      return false;\n    }\n    std::cout << \"Using bin for ele\\n\";\n    uint32_t nels[3];\n    if (fread(&nels[0], sizeof(uint32_t), 3, pFile) < 3) {\n      std::cerr << \"Malformed binary file\\n\";\n      abort();\n    }\n    for (size_t i = 0; i < nels[0]; i++) {\n      uint32_t r[4];\n      if (fread(&r[0], sizeof(uint32_t), 4, pFile) < 4) {\n        std::cerr << \"Malformed binary file\\n\";\n        abort();\n      }\n      assert(r[1] < tuples.size());\n      assert(r[2] < tuples.size());\n      assert(r[3] < tuples.size());\n      
Element e(tuples[r[1]], tuples[r[2]], tuples[r[3]], ++id);\n      elements.push_back(e);\n    }\n    fclose(pFile);\n    return true;\n  }\n\n  void readElements(std::string filename, std::vector<Tuple>& tuples) {\n    if (readElementsBin(filename, tuples))\n      return;\n    else\n      writeElements(filename);\n    FILE* pFile = fopen(filename.append(\".ele\").c_str(), \"r\");\n    if (!pFile) {\n      std::cerr << \"Failed to load file \" << filename << \"\\n\";\n      abort();\n    }\n    unsigned nels;\n    int r = fscanf(pFile, \"%u %*u %*u\", &nels);\n    checkResults(r, 1, filename);\n    for (size_t i = 0; i < nels; i++) {\n      unsigned index;\n      unsigned n1, n2, n3;\n      r = fscanf(pFile, \"%u %u %u %u\", &index, &n1, &n2, &n3);\n      checkResults(r, 4, filename);\n      assert(n1 < tuples.size());\n      assert(n2 < tuples.size());\n      assert(n3 < tuples.size());\n      Element e(tuples[n1], tuples[n2], tuples[n3], ++id);\n      elements.push_back(e);\n    }\n    fclose(pFile);\n  }\n\n  void writeElements(std::string filename) {\n    std::string filename2 = filename;\n    FILE* pFile           = fopen(filename.append(\".ele\").c_str(), \"r\");\n    FILE* oFile           = fopen(filename2.append(\".ele.bin\").c_str(), \"w\");\n    if (!pFile) {\n      std::cerr << \"Failed to load file \" << filename << \" (continuing)\\n\";\n      return;\n    }\n    if (!oFile) {\n      std::cerr << \"Failed to open file \" << filename2 << \" (continuing)\\n\";\n      return;\n    }\n    unsigned nels[3];\n    int r = fscanf(pFile, \"%u %u %u\", &nels[0], &nels[1], &nels[2]);\n    checkResults(r, 3, filename);\n    uint32_t nels32[3] = {nels[0], nels[1], nels[2]};\n    fwrite(&nels32[0], sizeof(uint32_t), 3, oFile);\n\n    for (size_t i = 0; i < nels[0]; i++) {\n      unsigned index;\n      unsigned n1, n2, n3;\n      r = fscanf(pFile, \"%u %u %u %u\", &index, &n1, &n2, &n3);\n      checkResults(r, 4, filename);\n      uint32_t vals[4] = {index, n1, n2, 
n3};\n      fwrite(&vals[0], sizeof(uint32_t), 4, oFile);\n    }\n    fclose(pFile);\n    fclose(oFile);\n  }\n\n  bool readPolyBin(std::string filename, std::vector<Tuple>& tuples) {\n    FILE* pFile = fopen(filename.append(\".poly.bin\").c_str(), \"r\");\n    if (!pFile) {\n      return false;\n    }\n    std::cout << \"Using bin for poly\\n\";\n    uint32_t nsegs[4];\n    if (fread(&nsegs[0], sizeof(uint32_t), 4, pFile) < 4) {\n      std::cerr << \"Malformed binary file\\n\";\n      abort();\n    }\n    if (fread(&nsegs[0], sizeof(uint32_t), 2, pFile) < 2) {\n      std::cerr << \"Malformed binary file\\n\";\n      abort();\n    }\n    for (size_t i = 0; i < nsegs[0]; i++) {\n      uint32_t r[4];\n      if (fread(&r[0], sizeof(uint32_t), 4, pFile) < 4) {\n        std::cerr << \"Malformed binary file\\n\";\n        abort();\n      }\n      assert(r[1] < tuples.size());\n      assert(r[2] < tuples.size());\n      Element e(tuples[r[1]], tuples[r[2]], ++id);\n      elements.push_back(e);\n    }\n    fclose(pFile);\n    return true;\n  }\n\n  void readPoly(std::string filename, std::vector<Tuple>& tuples) {\n    if (readPolyBin(filename, tuples))\n      return;\n    else\n      writePoly(filename);\n    FILE* pFile = fopen(filename.append(\".poly\").c_str(), \"r\");\n    if (!pFile) {\n      std::cerr << \"Failed to load file \" << filename << \"\\n\";\n      abort();\n    }\n    unsigned nsegs;\n    int r = fscanf(pFile, \"%*u %*u %*u %*u\");\n    checkResults(r, 0, filename);\n    r = fscanf(pFile, \"%u %*u\", &nsegs);\n    checkResults(r, 1, filename);\n    for (size_t i = 0; i < nsegs; i++) {\n      unsigned index, n1, n2;\n      r = fscanf(pFile, \"%u %u %u %*u\", &index, &n1, &n2);\n      checkResults(r, 3, filename);\n      assert(n1 < tuples.size());\n      assert(n2 < tuples.size());\n      Element e(tuples[n1], tuples[n2], ++id);\n      elements.push_back(e);\n    }\n    fclose(pFile);\n  }\n\n  void writePoly(std::string filename) {\n    std::string 
filename2 = filename;\n    FILE* pFile           = fopen(filename.append(\".poly\").c_str(), \"r\");\n    FILE* oFile           = fopen(filename2.append(\".poly.bin\").c_str(), \"w\");\n    if (!pFile) {\n      std::cerr << \"Failed to load file \" << filename << \" (continuing)\\n\";\n      return;\n    }\n    if (!oFile) {\n      std::cerr << \"Failed to open file \" << filename2 << \" (continuing)\\n\";\n      return;\n    }\n    unsigned nsegs[4];\n    int r = fscanf(pFile, \"%u %u %u %u\", &nsegs[0], &nsegs[1], &nsegs[2],\n                   &nsegs[3]);\n    checkResults(r, 4, filename);\n    uint32_t nsegs32[4] = {nsegs[0], nsegs[1], nsegs[2], nsegs[3]};\n    fwrite(&nsegs32[0], sizeof(uint32_t), 4, oFile);\n    r = fscanf(pFile, \"%u %u\", &nsegs[0], &nsegs[1]);\n    checkResults(r, 2, filename);\n    nsegs32[0] = nsegs[0];\n    nsegs32[1] = nsegs[1];\n    fwrite(&nsegs32[0], sizeof(uint32_t), 2, oFile);\n    for (size_t i = 0; i < nsegs[0]; i++) {\n      unsigned index, n1, n2, n3;\n      r = fscanf(pFile, \"%u %u %u %u\", &index, &n1, &n2, &n3);\n      checkResults(r, 4, filename);\n      uint32_t r[4] = {index, n1, n2, n3};\n      fwrite(&r[0], sizeof(uint32_t), 4, oFile);\n    }\n    fclose(pFile);\n    fclose(oFile);\n  }\n\n  void addElement(Graph& mesh, GNode node, std::map<Edge, GNode>& edge_map) {\n    Element& element = mesh.getData(node);\n    for (int i = 0; i < element.numEdges(); i++) {\n      Edge edge = element.getEdge(i);\n      if (edge_map.find(edge) == edge_map.end()) {\n        edge_map[edge] = node;\n      } else {\n        mesh.addEdge(node, edge_map[edge], galois::MethodFlag::UNPROTECTED);\n        edge_map.erase(edge);\n      }\n    }\n  }\n\n  template <typename Iter>\n  void divide(const Iter& b, const Iter& e) {\n    if (std::distance(b, e) > 16) {\n      std::sort(b, e, centerXCmp());\n      Iter m = galois::split_range(b, e);\n      std::sort(b, m, centerYCmpInv());\n      std::sort(m, e, centerYCmp());\n      divide(b, 
galois::split_range(b, m));\n      divide(galois::split_range(b, m), m);\n      divide(m, galois::split_range(m, e));\n      divide(galois::split_range(m, e), e);\n    }\n  }\n\n  template <typename L>\n  void createNodes(Graph& g, const L& loop) {\n\n    loop(\n        galois::iterate(elements),\n        [&](const Element& item) {\n          GNode n = g.createNode(item);\n          g.addNode(n);\n        },\n        galois::loopname(\"allocate\"));\n  }\n  void makeGraph(Graph& mesh, bool parallelAllocate) {\n    // std::sort(elements.begin(), elements.end(), centerXCmp());\n    divide(elements.begin(), elements.end());\n\n    if (parallelAllocate)\n      createNodes(mesh, galois::DoAll());\n    else\n      createNodes(mesh, galois::StdForEach());\n\n    std::map<Edge, GNode> edge_map;\n    for (auto ii = mesh.begin(), ee = mesh.end(); ii != ee; ++ii)\n      addElement(mesh, *ii, edge_map);\n  }\n\npublic:\n  Mesh() : id(0) {}\n\n  void read(Graph& mesh, std::string basename, bool parallelAllocate) {\n    std::vector<Tuple> tuples;\n    readNodes(basename, tuples);\n    readElements(basename, tuples);\n    readPoly(basename, tuples);\n    makeGraph(mesh, parallelAllocate);\n  }\n};\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/README.md",
    "content": "Delaunayrefinement\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program refines a 2D Delaunay Mesh such that no angle in any triangle is less\nthan a certain value (30 deg in this implementation).\n\nThis implementation contains both non-deterministic and deterministic parallel\nschedules for refining the mesh. \n\nINPUT\n--------------------------------------------------------------------------------\n\nThe user specifies a *basename* of 3 files read by delaunayrefinement:\n  1. basename.node contains positions of vertices/points\n  2. basename.ele contains info about vertices of triangles\n  3. basename.poly contains info about the boundary segments (pairs of vertices) of the mesh\n\nYou must specify the -meshGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake in the BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/scientific/cpu/delaunayrefinement; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./delaunayrefinement-cpu <input-basename> -meshGraph -t 40`\n- `$ ./delaunayrefinement-cpu <input-basename> -meshGraph -detPrefix -t 40` for one of the\n  available deterministic schedules\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\n* In our experience, the non-deterministic schedule in delaunayrefinement outperforms the deterministic schedules, because determinism incurs a performance cost\n* Performance is sensitive to CHUNK_SIZE for the worklist, whose optimal value is input- and\n  machine-dependent\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/Subgraph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef SUBGRAPH_H\n#define SUBGRAPH_H\n\n#include \"Element.h\"\n\n#include \"galois/Galois.h\"\n#include \"galois/graphs/Graph.h\"\n\n#include <vector>\n#include <algorithm>\n\ntypedef galois::graphs::MorphGraph<Element, void, false> Graph;\ntypedef Graph::GraphNode GNode;\n\nstruct EdgeTuple {\n  GNode src;\n  GNode dst;\n  Edge data;\n  EdgeTuple(GNode s, GNode d, const Edge& _d) : src(s), dst(d), data(_d) {}\n\n  bool operator==(const EdgeTuple& rhs) const {\n    return src == rhs.src && dst == rhs.dst && data == data;\n  }\n};\n\n/**\n *  A sub-graph of the mesh. 
Used to store information about the original\n *  cavity\n */\nclass PreGraph {\n  typedef std::vector<GNode, galois::PerIterAllocTy::rebind<GNode>::other>\n      NodesTy;\n  NodesTy nodes;\n\npublic:\n  typedef NodesTy::iterator iterator;\n\n  explicit PreGraph(galois::PerIterAllocTy& cnx) : nodes(cnx) {}\n\n  bool containsNode(GNode N) {\n    return std::find(nodes.begin(), nodes.end(), N) != nodes.end();\n  }\n\n  void addNode(GNode n) { return nodes.push_back(n); }\n  void reset() { nodes.clear(); }\n  iterator begin() { return nodes.begin(); }\n  iterator end() { return nodes.end(); }\n};\n\n/**\n *  A sub-graph of the mesh. Used to store information about the original\n *  and updated cavity\n */\nclass PostGraph {\n  struct TempEdge {\n    size_t src;\n    GNode dst;\n    Edge edge;\n    TempEdge(size_t s, GNode d, const Edge& e) : src(s), dst(d), edge(e) {}\n  };\n\n  typedef std::vector<GNode, galois::PerIterAllocTy::rebind<GNode>::other>\n      NodesTy;\n  typedef std::vector<EdgeTuple,\n                      galois::PerIterAllocTy::rebind<EdgeTuple>::other>\n      EdgesTy;\n\n  //! the nodes in the graph before updating\n  NodesTy nodes;\n  //! the edges that connect the subgraph to the rest of the graph\n  EdgesTy edges;\n\npublic:\n  typedef NodesTy::iterator iterator;\n  typedef EdgesTy::iterator edge_iterator;\n\n  explicit PostGraph(galois::PerIterAllocTy& cnx) : nodes(cnx), edges(cnx) {}\n\n  void addNode(GNode n) { nodes.push_back(n); }\n\n  void addEdge(GNode src, GNode dst, const Edge& e) {\n    edges.push_back(EdgeTuple(src, dst, e));\n  }\n\n  void reset() {\n    nodes.clear();\n    edges.clear();\n  }\n\n  iterator begin() { return nodes.begin(); }\n  iterator end() { return nodes.end(); }\n  edge_iterator edge_begin() { return edges.begin(); }\n  edge_iterator edge_end() { return edges.end(); }\n};\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/Tuple.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef TUPLE_H\n#define TUPLE_H\n\n#include <ostream>\n#include <cmath>\n\nclass Tuple {\n  double _t[2];\n\npublic:\n  Tuple(double a, double b) {\n    _t[0] = a;\n    _t[1] = b;\n  }\n\n  Tuple(){};\n  ~Tuple(){};\n\n  bool operator==(const Tuple& rhs) const {\n    for (int x = 0; x < 2; ++x) {\n      if (_t[x] != rhs._t[x])\n        return false;\n    }\n    return true;\n  }\n\n  bool operator!=(const Tuple& rhs) const { return !(*this == rhs); }\n\n  bool operator<(const Tuple& rhs) const {\n    for (int i = 0; i < 2; ++i) {\n      if (_t[i] < rhs._t[i])\n        return true;\n      else if (_t[i] > rhs._t[i])\n        return false;\n    }\n    return false;\n  }\n\n  bool operator>(const Tuple& rhs) const {\n    for (int i = 0; i < 2; ++i) {\n      if (_t[i] > rhs._t[i])\n        return true;\n      else if 
(_t[i] < rhs._t[i])\n        return false;\n    }\n    return false;\n  }\n\n  Tuple operator+(const Tuple& rhs) const {\n    return Tuple(_t[0] + rhs._t[0], _t[1] + rhs._t[1]);\n  }\n\n  Tuple operator-(const Tuple& rhs) const {\n    return Tuple(_t[0] - rhs._t[0], _t[1] - rhs._t[1]);\n  }\n\n  Tuple operator*(double d) const { // scalar product\n    return Tuple(_t[0] * d, _t[1] * d);\n  }\n\n  double operator*(const Tuple& rhs) const { // dot product\n    return _t[0] * rhs._t[0] + _t[1] * rhs._t[1];\n  }\n\n  double operator[](int i) const { return _t[i]; };\n\n  int cmp(const Tuple& x) const {\n    if (*this == x)\n      return 0;\n    if (*this > x)\n      return 1;\n    return -1;\n  }\n\n  double distance_squared(\n      const Tuple& p) const { // squared distance between current tuple and x\n    double sum = 0.0;\n    for (int i = 0; i < 2; ++i) {\n      double d = _t[i] - p._t[i];\n      sum += d * d;\n    }\n    return sum;\n  }\n\n  double distance(const Tuple& p) const { // distance between current tuple and\n                                          // x\n    return sqrt(distance_squared(p));\n  }\n\n  double angle(const Tuple& a,\n               const Tuple& b) const { // angle formed by a, current tuple, b\n    Tuple vb  = a - *this;\n    Tuple vc  = b - *this;\n    double dp = vb * vc;\n    double c  = dp / sqrt(distance_squared(a) * distance_squared(b));\n    return (180 / M_PI) * acos(c);\n  }\n\n  void angleCheck(const Tuple& a, const Tuple& b, bool& ob, bool& sm,\n                  double M) const { // angle formed by a, current tuple, b\n    Tuple vb  = a - *this;\n    Tuple vc  = b - *this;\n    double dp = vb * vc;\n\n    if (dp < 0) {\n      ob = true;\n      return;\n    }\n\n    double c = dp / sqrt(distance_squared(b) * distance_squared(a));\n    if (c > cos(M * M_PI / 180)) {\n      sm = true;\n      return;\n    }\n    return;\n  }\n\n  bool angleGTCheck(const Tuple& a, const Tuple& b,\n                    double M) const { // angle 
formed by a, current tuple, b\n    Tuple vb  = a - *this;\n    Tuple vc  = b - *this;\n    double dp = vb * vc;\n\n    if (dp < 0)\n      return false;\n\n    double c = dp / sqrt(distance_squared(b) * distance_squared(a));\n    return c > cos(M * M_PI / 180);\n  }\n\n  bool\n  angleOBCheck(const Tuple& a,\n               const Tuple& b) const { // angle formed by a, current tuple, b\n    Tuple vb  = a - *this;\n    Tuple vc  = b - *this;\n    double dp = vb * vc;\n\n    return dp < 0;\n  }\n\n  void print(std::ostream& os) const {\n    os << \"(\" << _t[0] << \", \" << _t[1] << \")\";\n  }\n\n  static int cmp(Tuple a, Tuple b) { return a.cmp(b); }\n  static double distance(Tuple a, Tuple b) { return a.distance(b); }\n  static double angle(const Tuple& a, const Tuple& b, const Tuple& c) {\n    return b.angle(a, c);\n  }\n  static void angleCheck(const Tuple& a, const Tuple& b, const Tuple& c,\n                         bool& ob, bool& sm, double M) {\n    b.angleCheck(a, c, ob, sm, M);\n  }\n  static bool angleGTCheck(const Tuple& a, const Tuple& b, const Tuple& c,\n                           double M) {\n    return b.angleGTCheck(a, c, M);\n  }\n  static bool angleOBCheck(const Tuple& a, const Tuple& b, const Tuple& c) {\n    return b.angleOBCheck(a, c);\n  }\n};\n\nstatic inline std::ostream& operator<<(std::ostream& os, const Tuple& rhs) {\n  rhs.print(os);\n  return os;\n}\n\nstatic inline Tuple operator*(double d, Tuple rhs) { return rhs * d; }\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunayrefinement/Verifier.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef VERIFIER_H\n#define VERIFIER_H\n\n#include \"galois/Galois.h\"\n#include \"galois/ParallelSTL.h\"\n\n#include <stack>\n#include <set>\n#include <iostream>\n\nclass Verifier {\n  struct inconsistent {\n    Graph& graph;\n    inconsistent(Graph& g) : graph(g) {}\n\n    bool operator()(const GNode& node) const {\n      Element& e = graph.getData(node);\n\n      size_t dist = std::distance(graph.edge_begin(node), graph.edge_end(node));\n      if (e.dim() == 2) {\n        if (dist != 1) {\n          std::cerr << \"Error: Segment \" << e << \" has \" << dist\n                    << \" relation(s)\\n\";\n          return true;\n        }\n      } else if (e.dim() == 3) {\n        if (dist != 3) {\n          std::cerr << \"Error: Triangle \" << e << \" has \" << dist\n                    << \" relation(s)\\n\";\n      
    return true;\n        }\n      } else {\n        std::cerr << \"Error: Element with \" << e.dim() << \" edges\\n\";\n        return true;\n      }\n      return false;\n    }\n  };\n\n  struct not_delaunay {\n    Graph& graph;\n    not_delaunay(Graph& g) : graph(g) {}\n\n    bool operator()(const GNode& node) {\n      Element& e1 = graph.getData(node);\n\n      for (Graph::edge_iterator jj = graph.edge_begin(node),\n                                ej = graph.edge_end(node);\n           jj != ej; ++jj) {\n        const GNode& n = graph.getEdgeDst(jj);\n        Element& e2    = graph.getData(n);\n        if (e1.dim() == 3 && e2.dim() == 3) {\n          Tuple t2;\n          if (!getTupleT2OfRelatedEdge(e1, e2, t2)) {\n            std::cerr << \"missing tuple\\n\";\n            return true;\n          }\n          if (e1.inCircle(t2)) {\n            std::cerr << \"Delaunay property violated: point \" << t2\n                      << \" in element \" << e1 << \"\\n\";\n            return true;\n          }\n        }\n      }\n      return false;\n    }\n\n    bool getTupleT2OfRelatedEdge(const Element& e1, const Element& e2,\n                                 Tuple& t) {\n      int e2_0  = -1;\n      int e2_1  = -1;\n      int phase = 0;\n\n      for (int i = 0; i < e1.dim(); i++) {\n        for (int j = 0; j < e2.dim(); j++) {\n          if (e1.getPoint(i) != e2.getPoint(j))\n            continue;\n\n          if (phase == 0) {\n            e2_0  = j;\n            phase = 1;\n            break;\n          }\n\n          e2_1 = j;\n          for (int k = 0; k < 3; k++) {\n            if (k != e2_0 && k != e2_1) {\n              t = e2.getPoint(k);\n              return true;\n            }\n          }\n        }\n      }\n      return false;\n    }\n  };\n\n  bool checkReachability(Graph& graph) {\n    std::stack<GNode> remaining;\n    std::set<GNode> found;\n    remaining.push(*(graph.begin()));\n\n    while (!remaining.empty()) {\n      GNode node = 
remaining.top();\n      remaining.pop();\n      if (!found.count(node)) {\n        if (!graph.containsNode(node)) {\n          std::cerr << \"Reachable node was removed from graph\\n\";\n        }\n        found.insert(node);\n        int i = 0;\n        for (Graph::edge_iterator ii = graph.edge_begin(node),\n                                  ei = graph.edge_end(node);\n             ii != ei; ++ii) {\n          GNode n = graph.getEdgeDst(ii);\n          assert(i < 3);\n          assert(graph.containsNode(n));\n          assert(node != n);\n          ++i;\n          remaining.push(n);\n        }\n      }\n    }\n\n    if (found.size() != graph.size()) {\n      std::cerr << \"Error: Not all elements are reachable. \";\n      std::cerr << \"Found: \" << found.size() << \" needed: \" << graph.size()\n                << \".\\n\";\n      return false;\n    }\n    return true;\n  }\n\npublic:\n  bool verify(Graph& g) {\n    return galois::ParallelSTL::find_if(g.begin(), g.end(), inconsistent(g)) ==\n               g.end() &&\n           galois::ParallelSTL::find_if(g.begin(), g.end(), not_delaunay(g)) ==\n               g.end() &&\n           checkReachability(g);\n  }\n};\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/CMakeLists.txt",
    "content": "add_executable(delaunaytriangulation-cpu DelaunayTriangulation.cpp Element.cpp)\nadd_dependencies(apps delaunaytriangulation-cpu)\ntarget_link_libraries(delaunaytriangulation-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS delaunaytriangulation-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small1 delaunaytriangulation-cpu -meshGraph \"${BASEINPUT}/reference/meshes/r10k.node\")\nadd_test_scale(small2 delaunaytriangulation-cpu -meshGraph \"${BASEINPUT}/meshes/250k.2.node\" NOT_QUICK)\n\nif(CMAKE_COMPILER_IS_GNUCC)\n  target_compile_options(delaunaytriangulation-cpu PRIVATE -ffast-math)\nendif()\n\nadd_executable(delaunaytriangulation-deterministic-cpu DelaunayTriangulationDet.cpp Element.cpp)\nadd_dependencies(apps delaunaytriangulation-deterministic-cpu)\ntarget_link_libraries(delaunaytriangulation-deterministic-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS delaunaytriangulation-deterministic-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nadd_test_scale(small1 delaunaytriangulation-deterministic-cpu -meshGraph \"${BASEINPUT}/reference/meshes/r10k.node\")\nadd_test_scale(small2 delaunaytriangulation-deterministic-cpu -meshGraph \"${BASEINPUT}/meshes/250k.2.node\" NOT_QUICK)\n\nif(CMAKE_COMPILER_IS_GNUCC)\n  target_compile_options(delaunaytriangulation-deterministic-cpu PRIVATE -ffast-math)\nendif()\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/Cavity.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef CAVITY_H\n#define CAVITY_H\n\n#include \"Graph.h\"\n\n#include <vector>\n\n//! 
A cavity which will be retriangulated\ntemplate <typename Alloc = std::allocator<char>>\nclass Cavity : private boost::noncopyable {\n  typedef typename Alloc::template rebind<GNode>::other GNodeVectorAlloc;\n  typedef std::vector<GNode, GNodeVectorAlloc> GNodeVector;\n  typedef typename Alloc::template rebind<std::pair<GNode, int>>::other\n      GNodeIntPairVectorAlloc;\n  typedef std::vector<std::pair<GNode, int>, GNodeIntPairVectorAlloc>\n      GNodeIntPairVector;\n\n  struct InCircumcenter {\n    const Graph& graph;\n    Tuple tuple;\n    InCircumcenter(const Graph& g, const Tuple& t) : graph(g), tuple(t) {}\n    bool operator()(const GNode& n) const {\n      Element& e = graph.getData(n, galois::MethodFlag::UNPROTECTED);\n      return e.inCircle(tuple);\n    }\n  };\n\n  Searcher<Alloc> searcher;\n  GNodeVector newNodes;\n  GNodeIntPairVector outside;\n  GNode center;\n  Point* point;\n  Graph& graph;\n  const Alloc& alloc;\n\n  //! Find triangles that border cavity but are not in the cavity\n  void findOutside() {\n    for (const auto& ii : searcher.inside) {\n      for (auto jj : graph.edges(ii, galois::MethodFlag::UNPROTECTED)) {\n        GNode n = graph.getEdgeDst(jj);\n        // i.e., if (!e.boundary() && e.inCircle(point->t()))\n        if (std::find(searcher.matches.begin(), searcher.matches.end(), n) !=\n            searcher.matches.end())\n          continue;\n\n        int index = graph.getEdgeData(\n            graph.findEdge(n, ii, galois::MethodFlag::UNPROTECTED));\n        outside.push_back(std::make_pair(n, index));\n\n        Element& e = graph.getData(n, galois::MethodFlag::UNPROTECTED);\n        Point* p2  = e.getPoint(index);\n        Point* p3  = e.getPoint((index + 1) % 3);\n\n        p2->get(galois::MethodFlag::WRITE);\n        p3->get(galois::MethodFlag::WRITE);\n      }\n    }\n  }\n\n  void addElements() {\n    GNodeVector newNodes(alloc);\n\n    // Create new nodes\n    for (auto& ii : outside) {\n      const GNode& n = ii.first;\n    
  int& index     = ii.second;\n\n      Element& e = graph.getData(n, galois::MethodFlag::UNPROTECTED);\n\n      Point* p2 = e.getPoint(index);\n      Point* p3 = e.getPoint((index + 1) % 3);\n\n      Element newE(point, p2, p3);\n      GNode newNode = graph.createNode(newE);\n      graph.addNode(newNode, galois::MethodFlag::UNPROTECTED);\n\n      point->addElement(newNode);\n      p2->addElement(newNode);\n      p3->addElement(newNode);\n\n      graph.getEdgeData(\n          graph.addEdge(newNode, n, galois::MethodFlag::UNPROTECTED)) = 1;\n      graph.getEdgeData(\n          graph.addEdge(n, newNode, galois::MethodFlag::UNPROTECTED)) = index;\n\n      newNodes.push_back(newNode);\n    }\n\n    // Update new node connectivity\n    for (unsigned i = 0; i < newNodes.size(); ++i) {\n      const GNode& n1   = newNodes[i];\n      const Element& e1 = graph.getData(n1, galois::MethodFlag::UNPROTECTED);\n      for (unsigned j = i + 1; j < newNodes.size(); ++j) {\n        if (i != j) {\n          const GNode& n2 = newNodes[j];\n          const Element& e2 =\n              graph.getData(n2, galois::MethodFlag::UNPROTECTED);\n\n          for (int x = 2; x >= 1; --x) {\n            for (int y = 2; y >= 1; --y) {\n              if (e1.getPoint(x) == e2.getPoint(y)) {\n                int indexForNewNode                           = x & 2;\n                int indexForNode                              = y & 2;\n                graph.getEdgeData(graph.addEdge(\n                    n1, n2, galois::MethodFlag::UNPROTECTED)) = indexForNewNode;\n                graph.getEdgeData(graph.addEdge(\n                    n2, n1, galois::MethodFlag::UNPROTECTED)) = indexForNode;\n              }\n            }\n          }\n        }\n      }\n    }\n  }\n\n  void removeElements() {\n    for (auto ii : searcher.matches) {\n      graph.removeNode(ii, galois::MethodFlag::UNPROTECTED);\n    }\n  }\n\npublic:\n  Cavity(Graph& g, const Alloc& a = Alloc())\n      : searcher(g, a), newNodes(a), 
outside(a), graph(g), alloc(a) {}\n\n  void init(const GNode& c, Point* p) {\n    center = c;\n    point  = p;\n  }\n\n  void build() {\n    assert(graph.getData(center).inCircle(point->t()));\n    searcher.findAll(center, InCircumcenter(graph, point->t()));\n    assert(!searcher.inside.empty());\n    findOutside();\n  }\n\n  void update() {\n    removeElements();\n    addElements();\n  }\n};\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/DelaunayTriangulation.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Point.h\"\n#include \"Cavity.h\"\n#include \"Verifier.h\"\n\n#include \"galois/Galois.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/SpatialTree.h\"\n#include \"Lonestar/BoilerPlate.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n#include \"galois/runtime/Profile.h\"\n\n#include <boost/iterator/transform_iterator.hpp>\n#include <boost/iterator/counting_iterator.hpp>\n\n#include <algorithm>\n#include <deque>\n#include <fstream>\n#include <iostream>\n#include <limits>\n#include <vector>\n\n#include <string.h>\n#include <unistd.h>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Delaunay Triangulation\";\nstatic const char* desc =\n    \"Produces a Delaunay triangulation for a set of points\";\nstatic const char* url = \"delaunay_triangulation\";\n\nstatic 
cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<std::string>\n    doWriteMesh(\"writemesh\",\n                cll::desc(\"Write the mesh out to files with basename\"),\n                cll::value_desc(\"basename\"));\n\n//! Flag that forces user to be aware that they should be passing in a\n//! mesh graph.\nstatic cll::opt<bool>\n    meshGraph(\"meshGraph\", cll::desc(\"Specify that the input graph is a mesh\"),\n              cll::init(false));\n\nusing Tree = typename galois::graphs::SpatialTree2d<Point*>;\n\n//! All Point* refer to elements in this bag\nusing basePointBag = typename galois::InsertBag<Point>;\n\n//! [Define Insert Bag]\nusing ptrPointBag = typename galois::InsertBag<Point*>;\n\n//! Our main functor\nstruct Process {\n  Graph& graph;\n  Tree& tree;\n  ptrPointBag& ptrPoints;\n\n  Process(Graph& g, Tree& t, ptrPointBag& p)\n      : graph(g), tree(t), ptrPoints(p) {}\n\n  typedef galois::PerIterAllocTy Alloc;\n\n  struct ContainsTuple {\n    const Graph& graph;\n    Tuple tuple;\n    ContainsTuple(const Graph& g, const Tuple& t) : graph(g), tuple(t) {}\n    bool operator()(const GNode& n) const {\n      assert(!graph.getData(n, galois::MethodFlag::UNPROTECTED).boundary());\n      return graph.getData(n, galois::MethodFlag::UNPROTECTED)\n          .inTriangle(tuple);\n    }\n  };\n\n  void computeCenter(const Element& e, Tuple& t) const {\n    for (int i = 0; i < 3; ++i) {\n      const Tuple& o = e.getPoint(i)->t();\n      for (int j = 0; j < 2; ++j) {\n        t[j] += o[j];\n      }\n    }\n    for (int j = 0; j < 2; ++j) {\n      t[j] *= 1 / 3.0;\n    }\n  }\n\n  void findBestNormal(const Element& element, const Point* p,\n                      const Point*& bestP1, const Point*& bestP2) {\n    Tuple center(0);\n    computeCenter(element, center);\n    int scale = element.clockwise() ? 
1 : -1;\n\n    Tuple origin = p->t() - center;\n    //        double length2 = origin.x() * origin.x() + origin.y() *\n    //        origin.y();\n    bestP1 = bestP2 = NULL;\n    double bestVal  = 0.0;\n    for (int i = 0; i < 3; ++i) {\n      int next = i + 1;\n      if (next > 2)\n        next -= 3;\n\n      const Point* p1 = element.getPoint(i);\n      const Point* p2 = element.getPoint(next);\n      double dx       = p2->t().x() - p1->t().x();\n      double dy       = p2->t().y() - p1->t().y();\n      Tuple normal(scale * -dy, scale * dx);\n      double val = normal.dot(origin); // / length2;\n      if (bestP1 == NULL || val > bestVal) {\n        bestVal = val;\n        bestP1  = p1;\n        bestP2  = p2;\n      }\n    }\n    assert(bestP1 != NULL && bestP2 != NULL && bestVal > 0);\n  }\n\n  GNode findCorrespondingNode(GNode start, const Point* p1, const Point* p2) {\n    for (auto ii : graph.edges(start)) {\n      GNode dst  = graph.getEdgeDst(ii);\n      Element& e = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n      int count  = 0;\n      for (int i = 0; i < e.dim(); ++i) {\n        if (e.getPoint(i) == p1 || e.getPoint(i) == p2) {\n          if (++count == 2)\n            return dst;\n        }\n      }\n    }\n    GALOIS_DIE(\"unreachable\");\n    return start;\n  }\n\n  bool planarSearch(const Point* p, GNode start, GNode& node) {\n    // Try simple hill climbing instead\n    ContainsTuple contains(graph, p->t());\n    while (!contains(start)) {\n      Element& element = graph.getData(start, galois::MethodFlag::WRITE);\n      if (element.boundary()) {\n        // Should only happen when quad tree returns a boundary point which is\n        // rare There's only one way to go from here\n        assert(std::distance(graph.edge_begin(start), graph.edge_end(start)) ==\n               1);\n        start = graph.getEdgeDst(\n            graph.edge_begin(start, galois::MethodFlag::WRITE));\n      } else {\n        // Find which neighbor will get us to 
point fastest by computing normal\n        // vectors\n        const Point *p1, *p2;\n        findBestNormal(element, p, p1, p2);\n        start = findCorrespondingNode(start, p1, p2);\n      }\n    }\n\n    node = start;\n    return true;\n  }\n\n  bool findContainingElement(const Point* p, GNode& node) {\n    Point** rp = tree.find(p->t().x(), p->t().y());\n    if (!rp)\n      return false;\n\n    (*rp)->get(galois::MethodFlag::WRITE);\n\n    GNode someNode = (*rp)->someElement();\n\n    // Not in mesh yet\n    if (!someNode) {\n      GALOIS_DIE(\"unreachable\");\n      return false;\n    }\n\n    return planarSearch(p, someNode, node);\n  }\n\n  void generateMesh() {\n    typedef galois::worklists::PerThreadChunkLIFO<32> CA;\n    galois::for_each(\n        galois::iterate(ptrPoints),\n        [&, self = this](Point* p, auto& ctx) {\n          p->get(galois::MethodFlag::WRITE);\n          assert(!p->inMesh());\n\n          GNode node;\n          if (!self->findContainingElement(p, node)) {\n            // Someone updated an element while we were searching,\n            // producing a semi-consistent state ctx.push(p);\n            // Current version is safe with locking so this\n            // shouldn't happen\n            GALOIS_DIE(\"unreachable\");\n            return;\n          }\n\n          assert(self->graph.getData(node).inTriangle(p->t()));\n          assert(self->graph.containsNode(node));\n\n          Cavity<Alloc> cav(self->graph, ctx.getPerIterAlloc());\n          cav.init(node, p);\n          cav.build();\n          cav.update();\n          self->tree.insert(p->t().x(), p->t().y(), p);\n        },\n        galois::no_pushes(), galois::per_iter_alloc(), galois::loopname(\"Main\"),\n        galois::wl<CA>());\n  }\n};\n\ntypedef std::vector<Point> PointList;\n\nclass ReadPoints {\n  void addBoundaryPoints() {\n    double minX, maxX, minY, maxY;\n\n    minX = minY = std::numeric_limits<double>::max();\n    maxX = maxY = 
std::numeric_limits<double>::min();\n\n    for (const auto& p : points) {\n      double x = p.t().x();\n      double y = p.t().y();\n      if (x < minX)\n        minX = x;\n      else if (x > maxX)\n        maxX = x;\n      if (y < minY)\n        minY = y;\n      else if (y > maxY)\n        maxY = y;\n    }\n\n    tree.init(minX, minY, maxX, maxY);\n\n    size_t size      = points.size();\n    double width     = maxX - minX;\n    double height    = maxY - minY;\n    double maxLength = std::max(width, height);\n    double centerX   = minX + width / 2.0;\n    double centerY   = minY + height / 2.0;\n    double radius =\n        maxLength * 3.0; // radius of circle that should cover all points\n\n    for (int i = 0; i < 3; ++i) {\n      double dX = radius * cos(2 * M_PI * (i / 3.0));\n      double dY = radius * sin(2 * M_PI * (i / 3.0));\n      points.push_back(Point(centerX + dX, centerY + dY, size + i));\n    }\n  }\n\n  void nextLine(std::ifstream& scanner) {\n    scanner.ignore(std::numeric_limits<std::streamsize>::max(), '\\n');\n  }\n\n  void fromTriangle(std::ifstream& scanner) {\n    double x, y;\n    long numPoints;\n\n    scanner >> numPoints;\n\n    int dim;\n    scanner >> dim;\n    assert(dim == 2);\n    int k;\n    scanner >> k; // number of attributes\n    assert(k == 0);\n    scanner >> k; // has boundary markers?\n\n    for (long id = 0; id < numPoints; ++id) {\n      scanner >> k; // point id\n      scanner >> x >> y;\n      nextLine(scanner);\n      points.push_back(Point(x, y, id));\n    }\n  }\n\n  void fromPointList(std::ifstream& scanner) {\n    double x, y;\n\n    // comment line\n    nextLine(scanner);\n    size_t id = 0;\n    while (!scanner.eof()) {\n      scanner >> x >> y;\n      if (x == 0 && y == 0)\n        break;\n      points.push_back(Point(x, y, id++));\n      x = y = 0;\n      nextLine(scanner);\n    }\n  }\n\n  PointList& points;\n  Tree& tree;\n\npublic:\n  ReadPoints(PointList& p, Tree& t) : points(p), tree(t) {}\n\n  void 
from(const std::string& name) {\n    std::ifstream scanner(name.c_str());\n    if (!scanner.good()) {\n      GALOIS_DIE(\"could not open file: \", name);\n    }\n    if (name.find(\".node\") == name.size() - 5) {\n      fromTriangle(scanner);\n    } else {\n      fromPointList(scanner);\n    }\n    scanner.close();\n\n    if (points.size())\n      addBoundaryPoints();\n    else {\n      GALOIS_DIE(\"no points found in file: \", name);\n    }\n  }\n};\n\nstruct ReadInput {\n  Graph& graph;\n  Tree& tree;\n  basePointBag& basePoints;\n  ptrPointBag& ptrPoints;\n  std::random_device rng;\n  std::mt19937 urng;\n\n  ReadInput(Graph& g, Tree& t, basePointBag& b, ptrPointBag& p)\n      : graph(g), tree(t), basePoints(b), ptrPoints(p), urng(rng()) {}\n\n  void addBoundaryNodes(Point* p1, Point* p2, Point* p3) {\n    Element large_triangle(p1, p2, p3);\n    GNode large_node = graph.createNode(large_triangle);\n    graph.addNode(large_node);\n\n    p1->addElement(large_node);\n    p2->addElement(large_node);\n    p3->addElement(large_node);\n\n    tree.insert(p1->t().x(), p1->t().y(), p1);\n\n    Element border_ele1(p1, p2);\n    Element border_ele2(p2, p3);\n    Element border_ele3(p3, p1);\n\n    GNode border_node1 = graph.createNode(border_ele1);\n    GNode border_node2 = graph.createNode(border_ele2);\n    GNode border_node3 = graph.createNode(border_ele3);\n\n    graph.addNode(border_node1);\n    graph.addNode(border_node2);\n    graph.addNode(border_node3);\n\n    graph.getEdgeData(graph.addEdge(large_node, border_node1)) = 0;\n    graph.getEdgeData(graph.addEdge(large_node, border_node2)) = 1;\n    graph.getEdgeData(graph.addEdge(large_node, border_node3)) = 2;\n\n    graph.getEdgeData(graph.addEdge(border_node1, large_node)) = 0;\n    graph.getEdgeData(graph.addEdge(border_node2, large_node)) = 0;\n    graph.getEdgeData(graph.addEdge(border_node3, large_node)) = 0;\n  }\n\n  struct centerXCmp {\n    template <typename T>\n    bool operator()(const T& lhs, const T& 
rhs) const {\n      return lhs.t().x() < rhs.t().x();\n    }\n  };\n\n  struct centerYCmp {\n    template <typename T>\n    bool operator()(const T& lhs, const T& rhs) const {\n      return lhs.t().y() < rhs.t().y();\n    }\n  };\n\n  struct centerYCmpInv {\n    template <typename T>\n    bool operator()(const T& lhs, const T& rhs) const {\n      return rhs.t().y() < lhs.t().y();\n    }\n  };\n\n  template <typename Iter>\n  void divide(const Iter& b, const Iter& e) {\n    if (std::distance(b, e) > 64) {\n      std::sort(b, e, centerXCmp());\n      Iter m = galois::split_range(b, e);\n      std::sort(b, m, centerYCmpInv());\n      std::sort(m, e, centerYCmp());\n      divide(b, galois::split_range(b, m));\n      divide(galois::split_range(b, m), m);\n      divide(m, galois::split_range(m, e));\n      divide(galois::split_range(m, e), e);\n    } else {\n      std::shuffle(b, e, urng);\n    }\n  }\n\n  void layoutPoints(PointList& points) {\n    divide(points.begin(), points.end() - 3);\n    galois::do_all(galois::iterate(points.begin(), points.end() - 3),\n                   [&](Point& p) {\n                     Point* pr = &basePoints.push(p);\n                     ptrPoints.push(pr);\n                   });\n    //! [Insert elements into InsertBag]\n    Point* p1 = &basePoints.push(*(points.end() - 1));\n    Point* p2 = &basePoints.push(*(points.end() - 2));\n    Point* p3 = &basePoints.push(*(points.end() - 3));\n    //! 
[Insert elements into InsertBag]\n    addBoundaryNodes(p1, p2, p3);\n  }\n\n  void operator()(const std::string& filename) {\n    PointList points;\n    ReadPoints(points, tree).from(filename);\n\n    std::cout << \"configuration: \" << points.size() << \" points\\n\";\n\n    galois::preAlloc(2 * numThreads // some per-thread state\n                     + 2 * points.size() *\n                           sizeof(Element) // mesh is about 2x number of points\n                                           // (for random points)\n                           * 32            // include graph node size\n                           / (galois::runtime::pagePoolSize()) // in pages\n    );\n    galois::reportPageAlloc(\"MeminfoPre\");\n\n    layoutPoints(points);\n  }\n};\n\nstatic void writePoints(const std::string& filename, const PointList& points) {\n  std::ofstream out(filename.c_str());\n  // <num vertices> <dimension> <num attributes> <has boundary markers>\n  out << points.size() << \" 2 0 0\\n\";\n  // out.setf(std::ios::fixed, std::ios::floatfield);\n  out.setf(std::ios::scientific, std::ios::floatfield);\n  out.precision(10);\n  long id = 0;\n  for (const auto& p : points) {\n    const Tuple& t = p.t();\n    out << id++ << \" \" << t.x() << \" \" << t.y() << \" 0\\n\";\n  }\n\n  out.close();\n}\n\nstatic void writeMesh(const std::string& filename, Graph& graph) {\n  long numTriangles = 0;\n  long numSegments  = 0;\n  for (auto n : graph) {\n    Element& e = graph.getData(n);\n    if (e.boundary()) {\n      numSegments++;\n    } else {\n      numTriangles++;\n    }\n  }\n\n  long tid = 0;\n  long sid = 0;\n  std::string elementName(filename);\n  std::string polyName(filename);\n\n  elementName.append(\".ele\");\n  polyName.append(\".poly\");\n\n  std::ofstream eout(elementName.c_str());\n  std::ofstream pout(polyName.c_str());\n  // <num triangles> <nodes per triangle> <num attributes>\n  eout << numTriangles << \" 3 0\\n\";\n  // <num vertices> <dimension> <num 
attributes> <has boundary markers>\n  // ...\n  // <num segments> <has boundary markers>\n  pout << \"0 2 0 0\\n\";\n  pout << numSegments << \" 1\\n\";\n  for (auto n : graph) {\n    const Element& e = graph.getData(n);\n    if (e.boundary()) {\n      // <segment id> <vertex> <vertex> <is boundary>\n      pout << sid++ << \" \" << e.getPoint(0)->id() << \" \" << e.getPoint(1)->id()\n           << \" 1\\n\";\n    } else {\n      // <triangle id> <vertex> <vertex> <vertex> [in ccw order]\n      eout << tid++ << \" \" << e.getPoint(0)->id() << \" \";\n      if (e.clockwise()) {\n        eout << e.getPoint(2)->id() << \" \" << e.getPoint(1)->id() << \"\\n\";\n      } else {\n        eout << e.getPoint(1)->id() << \" \" << e.getPoint(2)->id() << \"\\n\";\n      }\n    }\n  }\n\n  eout.close();\n  // <num holes>\n  pout << \"0\\n\";\n  pout.close();\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (!meshGraph) {\n    GALOIS_DIE(\"This application requires a mesh graph input;\"\n               \" please use the -meshGraph flag \"\n               \" to indicate the input is a mesh graph.\");\n  }\n\n  Graph graph;\n  Tree tree;\n  basePointBag basePoints;\n  ptrPointBag ptrPoints;\n\n  ReadInput(graph, tree, basePoints, ptrPoints)(inputFile);\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  galois::runtime::profileVtune(\n      [&]() { Process(graph, tree, ptrPoints).generateMesh(); },\n      \"MeshGeneration\");\n  execTime.stop();\n  std::cout << \"mesh size: \" << graph.size() << \"\\n\";\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  if (!skipVerify) {\n    Verifier verifier;\n    if (!verifier.verify(&graph)) {\n      GALOIS_DIE(\"triangulation failed\");\n    }\n    std::cout << \"Triangulation OK\\n\";\n  }\n\n  if (doWriteMesh.size()) {\n    std::string base = doWriteMesh;\n    
std::cout << \"Writing \" << base << \"\\n\";\n    writeMesh(base.c_str(), graph);\n\n    PointList points;\n    // Reordering messes up connection between id and place in pointlist\n    ReadPoints(points, tree).from(inputFile);\n    writePoints(base.append(\".node\"), points);\n  }\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/DelaunayTriangulationDet.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Point.h\"\n#include \"Cavity.h\"\n#include \"QuadTree.h\"\n#include \"Verifier.h\"\n\n#include \"galois/Galois.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Timer.h\"\n\n#include \"Lonestar/BoilerPlate.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n#include <boost/iterator/transform_iterator.hpp>\n#include <boost/iterator/counting_iterator.hpp>\n\n#include <algorithm>\n#include <deque>\n#include <fstream>\n#include <iostream>\n#include <limits>\n#include <vector>\n\n#include <string.h>\n#include <unistd.h>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Delaunay Triangulation\";\nstatic const char* desc =\n    \"Produces a Delaunay triangulation for a set of points\";\nstatic const char* url = \"delaunay_triangulation\";\n\nstatic cll::opt<std::string>\n    doWriteMesh(\"writemesh\",\n      
          cll::desc(\"Write the mesh out to files with basename\"),\n                cll::value_desc(\"basename\"));\nstatic cll::opt<std::string>\n    doWritePoints(\"writepoints\",\n                  cll::desc(\"Write the (reordered) points to filename\"),\n                  cll::value_desc(\"filename\"));\nstatic cll::opt<bool>\n    noReorderPoints(\"noreorder\",\n                    cll::desc(\"Don't reorder points to improve locality\"),\n                    cll::init(false));\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\n\nenum DetAlgo { nondet, detBase, detPrefix, detDisjoint };\n\nstatic cll::opt<DetAlgo>\n    detAlgo(cll::desc(\"Deterministic algorithm:\"),\n            cll::values(clEnumVal(nondet, \"Non-deterministic\"),\n                        clEnumVal(detBase, \"Base execution\"),\n                        clEnumVal(detPrefix, \"Prefix execution\"),\n                        clEnumVal(detDisjoint, \"Disjoint execution\")),\n            cll::init(nondet));\n\n//! Flag that forces user to be aware that they should be passing in a\n//! 
mesh graph.\nstatic cll::opt<bool>\n    meshGraph(\"meshGraph\", cll::desc(\"Specify that the input graph is a mesh\"),\n              cll::init(false));\n\nstruct GetPointer {\n  Point* operator()(Point& p) const { return &p; }\n};\n\ntypedef std::vector<Point> PointList;\n\nclass ReadPoints {\n  void addBoundaryPoints() {\n    double minX, maxX, minY, maxY;\n\n    minX = minY = std::numeric_limits<double>::max();\n    maxX = maxY = std::numeric_limits<double>::min();\n\n    for (auto& p : points) {\n      double x = p.t().x();\n      double y = p.t().y();\n      if (x < minX)\n        minX = x;\n      else if (x > maxX)\n        maxX = x;\n      if (y < minY)\n        minY = y;\n      else if (y > maxY)\n        maxY = y;\n    }\n\n    size_t size      = points.size();\n    double width     = maxX - minX;\n    double height    = maxY - minY;\n    double maxLength = std::max(width, height);\n    double centerX   = minX + width / 2.0;\n    double centerY   = minY + height / 2.0;\n    double radius =\n        maxLength * 3.0; // radius of circle that should cover all points\n\n    for (int i = 0; i < 3; ++i) {\n      double dX = radius * cos(2 * M_PI * (i / 3.0));\n      double dY = radius * sin(2 * M_PI * (i / 3.0));\n      points.push_back(Point(centerX + dX, centerY + dY, size + i));\n    }\n  }\n\n  void nextLine(std::ifstream& scanner) {\n    scanner.ignore(std::numeric_limits<std::streamsize>::max(), '\\n');\n  }\n\n  void fromTriangle(std::ifstream& scanner) {\n    double x, y;\n    long numPoints;\n\n    scanner >> numPoints;\n\n    int dim;\n    scanner >> dim;\n    assert(dim == 2);\n    int k;\n    scanner >> k; // number of attributes\n    assert(k == 0);\n    scanner >> k; // has boundary markers?\n\n    for (long id = 0; id < numPoints; ++id) {\n      scanner >> k; // point id\n      scanner >> x >> y;\n      nextLine(scanner);\n      points.push_back(Point(x, y, id));\n    }\n  }\n\n  void fromPointList(std::ifstream& scanner) {\n    double x, y;\n\n  
  // comment line\n    nextLine(scanner);\n    size_t id = 0;\n    while (!scanner.eof()) {\n      scanner >> x >> y;\n      if (x == 0 && y == 0)\n        break;\n      points.push_back(Point(x, y, id++));\n      x = y = 0;\n      nextLine(scanner);\n    }\n  }\n\n  PointList& points;\n\npublic:\n  ReadPoints(PointList& p) : points(p) {}\n\n  void from(const std::string& name) {\n    std::ifstream scanner(name.c_str());\n    if (!scanner.good()) {\n      GALOIS_DIE(\"could not open file: \", name);\n    }\n    if (name.find(\".node\") == name.size() - 5) {\n      fromTriangle(scanner);\n    } else {\n      fromPointList(scanner);\n    }\n    scanner.close();\n\n    if (points.size())\n      addBoundaryPoints();\n    else {\n      GALOIS_DIE(\"no points found in file: \", name);\n    }\n  }\n};\n\nstatic void writePoints(const std::string& filename, const PointList& points) {\n  std::ofstream out(filename.c_str());\n  // <num vertices> <dimension> <num attributes> <has boundary markers>\n  out << points.size() << \" 2 0 0\\n\";\n  // out.setf(std::ios::fixed, std::ios::floatfield);\n  out.setf(std::ios::scientific, std::ios::floatfield);\n  out.precision(10);\n  long id = 0;\n  for (const auto& p : points) {\n    const Tuple& t = p.t();\n    out << id++ << \" \" << t.x() << \" \" << t.y() << \" 0\\n\";\n  }\n\n  out.close();\n}\n\nusing BasePoints = galois::InsertBag<Point>;\nusing PtrPoints  = galois::InsertBag<Point*>;\nusing Rounds     = std::vector<PtrPoints*>;\n\nsize_t maxRounds;\nconst int roundShift = 4; //! round sizes are proportional to (1 << roundShift)\n\nstatic void copyPointsFromRounds(PointList& points, Rounds& rounds) {\n  for (int i = maxRounds - 1; i >= 0; --i) {\n    //! [Access elements of InsertBag]\n    // PtrPoints expands to galois::InsertBag<Point*>\n    // points is of type std::vector<Point>\n    PtrPoints& pptrs = *rounds[i];\n    for (auto ii : pptrs) {\n      points.push_back(*ii);\n    }\n    //! 
[Access elements of InsertBag]\n  }\n}\n\nstruct ReadInput {\n  Graph& graph;\n  BasePoints& basePoints;\n  Rounds& rounds;\n  std::random_device rng;\n  std::mt19937 urng;\n\n  ReadInput(Graph& g, BasePoints& b, Rounds& r)\n      : graph(g), basePoints(b), rounds(r), urng(rng()) {}\n\n  void addBoundaryNodes(Point* p1, Point* p2, Point* p3) {\n    Element large_triangle(p1, p2, p3);\n    GNode large_node = graph.createNode(large_triangle);\n    graph.addNode(large_node);\n\n    p1->addElement(large_node);\n    p2->addElement(large_node);\n    p3->addElement(large_node);\n\n    Element border_ele1(p1, p2);\n    Element border_ele2(p2, p3);\n    Element border_ele3(p3, p1);\n\n    GNode border_node1 = graph.createNode(border_ele1);\n    GNode border_node2 = graph.createNode(border_ele2);\n    GNode border_node3 = graph.createNode(border_ele3);\n\n    graph.addNode(border_node1);\n    graph.addNode(border_node2);\n    graph.addNode(border_node3);\n\n    graph.getEdgeData(graph.addEdge(large_node, border_node1)) = 0;\n    graph.getEdgeData(graph.addEdge(large_node, border_node2)) = 1;\n    graph.getEdgeData(graph.addEdge(large_node, border_node3)) = 2;\n\n    graph.getEdgeData(graph.addEdge(border_node1, large_node)) = 0;\n    graph.getEdgeData(graph.addEdge(border_node2, large_node)) = 0;\n    graph.getEdgeData(graph.addEdge(border_node3, large_node)) = 0;\n  }\n\n  template <typename L>\n  void generateRoundsImpl(const L& loop, size_t size, PointList& points,\n                          size_t log2) {\n    loop(\n        galois::iterate(size_t{0}, size),\n        [&, this](size_t index) {\n          const Point& p = points[index];\n\n          Point* ptr = &(basePoints.push(p));\n          int r      = 0;\n          for (size_t i = 0; i < log2; ++i) {\n            size_t mask = (1UL << (i + 1)) - 1;\n            if ((index & mask) == (1UL << i)) {\n              r = i;\n              break;\n            }\n          }\n\n          rounds[r / roundShift]->push(ptr);\n 
       },\n        galois::loopname(\"generateRoundsImpl\"));\n  }\n\n  //! Blocked point distribution (exponentially increasing block size) with\n  //! points randomized within a round\n  void generateRoundsOld(PointList& points, bool randomize) {\n    size_t counter = 0;\n    size_t round   = 0;\n    size_t next    = 1 << roundShift;\n    std::vector<Point*> buf;\n\n    PointList::iterator ii = points.begin(), ei = points.end();\n    while (ii != ei) {\n      Point* ptr = &(basePoints.push(*ii));\n      buf.push_back(ptr);\n      ++ii;\n      if (ii == ei || counter > next) {\n        next *= next;\n        int r = maxRounds - 1 - round;\n        if (randomize)\n          std::shuffle(buf.begin(), buf.end(), urng);\n        std::copy(buf.begin(), buf.end(), std::back_inserter(*rounds[r]));\n        buf.clear();\n        ++round;\n      }\n      ++counter;\n    }\n  }\n\n  void generateRounds(PointList& points, bool addBoundary) {\n    size_t size = points.size() - 3;\n\n    size_t log2 = std::max((size_t)floor(log(size) / log(2)), (size_t)1);\n    maxRounds   = log2 / roundShift;\n    for (size_t i = 0; i <= maxRounds;\n         i++) { // rounds[maxRounds+1] for boundary points\n      rounds.push_back(new galois::InsertBag<Point*>);\n    }\n\n    PointList ordered;\n    // ordered.reserve(size);\n\n    if (noReorderPoints) {\n      std::copy(points.begin(), points.begin() + size,\n                std::back_inserter(ordered));\n      generateRoundsOld(ordered, false);\n    } else {\n      // Reorganize spatially\n      QuadTree q(\n          boost::make_transform_iterator(points.begin(), GetPointer()),\n          boost::make_transform_iterator(points.begin() + size, GetPointer()));\n\n      q.output(std::back_inserter(ordered));\n\n      if (true) {\n        if (detAlgo == nondet) {\n          generateRoundsImpl(galois::DoAll(), size, ordered, log2);\n\n        } else {\n          generateRoundsImpl(galois::StdForEach(), size, ordered, log2);\n        }\n      } 
else {\n        generateRoundsOld(ordered, true);\n      }\n    }\n\n    if (!addBoundary)\n      return;\n\n    // Now, handle boundary points\n    size_t last = points.size();\n    //! [Insert elements into InsertBag]\n    // basePoints is of type galois::InsertBag<Point>\n    // points is of type std::vector<Point>\n    Point* p1 = &(basePoints.push(points[last - 1]));\n    Point* p2 = &(basePoints.push(points[last - 2]));\n    Point* p3 = &(basePoints.push(points[last - 3]));\n    //! [Insert elements into InsertBag]\n\n    rounds[maxRounds]->push(p1);\n    rounds[maxRounds]->push(p2);\n    rounds[maxRounds]->push(p3);\n\n    addBoundaryNodes(p1, p2, p3);\n  }\n\n  void operator()(const std::string& filename, bool addBoundary) {\n    PointList points;\n    ReadPoints(points).from(filename);\n\n    std::cout << \"configuration: \" << points.size() << \" points\\n\";\n\n#if 1\n    galois::preAlloc(\n        32 * points.size() * sizeof(Element) *\n        1.5 // mesh is about 2x number of points (for random points)\n        / (galois::runtime::pagePoolSize()) // in pages\n    );\n#else\n    galois::preAlloc(1 * numThreads // some per-thread state\n                     + 2 * points.size() *\n                           sizeof(Element) // mesh is about 2x number of points\n                                           // (for random points)\n                           * 32            // include graph node size\n                           / (galois::runtime::hugePageSize) // in pages\n    );\n#endif\n    galois::reportPageAlloc(\"MeminfoPre\");\n\n    galois::StatTimer T(\"generateRounds\");\n    T.start();\n    generateRounds(points, addBoundary);\n    T.stop();\n  }\n};\n\nstatic void writeMesh(const std::string& filename, Graph& graph) {\n  long numTriangles = 0;\n  long numSegments  = 0;\n  for (auto n : graph) {\n    Element& e = graph.getData(n);\n    if (e.boundary()) {\n      numSegments++;\n    } else {\n      numTriangles++;\n    }\n  }\n\n  long tid = 0;\n  
long sid = 0;\n  std::string elementName(filename);\n  std::string polyName(filename);\n\n  elementName.append(\".ele\");\n  polyName.append(\".poly\");\n\n  std::ofstream eout(elementName.c_str());\n  std::ofstream pout(polyName.c_str());\n  // <num triangles> <nodes per triangle> <num attributes>\n  eout << numTriangles << \" 3 0\\n\";\n  // <num vertices> <dimension> <num attributes> <has boundary markers>\n  // ...\n  // <num segments> <has boundary markers>\n  pout << \"0 2 0 0\\n\";\n  pout << numSegments << \" 1\\n\";\n  for (auto n : graph) {\n    const Element& e = graph.getData(n);\n    if (e.boundary()) {\n      // <segment id> <vertex> <vertex> <is boundary>\n      pout << sid++ << \" \" << e.getPoint(0)->id() << \" \" << e.getPoint(1)->id()\n           << \" 1\\n\";\n    } else {\n      // <triangle id> <vertex> <vertex> <vertex> [in ccw order]\n      eout << tid++ << \" \" << e.getPoint(0)->id() << \" \";\n      if (e.clockwise()) {\n        eout << e.getPoint(2)->id() << \" \" << e.getPoint(1)->id() << \"\\n\";\n      } else {\n        eout << e.getPoint(1)->id() << \" \" << e.getPoint(2)->id() << \"\\n\";\n      }\n    }\n  }\n\n  eout.close();\n  // <num holes>\n  pout << \"0\\n\";\n  pout.close();\n}\n\nstruct DelaunayTriangulation {\n\n  QuadTree* tree;\n  Graph& graph;\n\n  struct ContainsTuple {\n    const Graph& graph;\n    Tuple tuple;\n    ContainsTuple(const Graph& g, const Tuple& t) : graph(g), tuple(t) {}\n    bool operator()(const GNode& n) const {\n      assert(!graph.getData(n, galois::MethodFlag::UNPROTECTED).boundary());\n      return graph.getData(n, galois::MethodFlag::UNPROTECTED)\n          .inTriangle(tuple);\n    }\n  };\n\n  void computeCenter(const Element& e, Tuple& t) const {\n    for (int i = 0; i < 3; ++i) {\n      const Tuple& o = e.getPoint(i)->t();\n      for (int j = 0; j < 2; ++j) {\n        t[j] += o[j];\n      }\n    }\n    for (int j = 0; j < 2; ++j) {\n      t[j] *= 1 / 3.0;\n    }\n  }\n\n  void 
findBestNormal(const Element& element, const Point* p,\n                      const Point*& bestP1, const Point*& bestP2) {\n    Tuple center(0);\n    computeCenter(element, center);\n    int scale = element.clockwise() ? 1 : -1;\n\n    Tuple origin = p->t() - center;\n    //        double length2 = origin.x() * origin.x() + origin.y() *\n    //        origin.y();\n    bestP1 = bestP2 = NULL;\n    double bestVal  = 0.0;\n    for (int i = 0; i < 3; ++i) {\n      int next = i + 1;\n      if (next > 2)\n        next -= 3;\n\n      const Point* p1 = element.getPoint(i);\n      const Point* p2 = element.getPoint(next);\n      double dx       = p2->t().x() - p1->t().x();\n      double dy       = p2->t().y() - p1->t().y();\n      Tuple normal(scale * -dy, scale * dx);\n      double val = normal.dot(origin); // / length2;\n      if (bestP1 == NULL || val > bestVal) {\n        bestVal = val;\n        bestP1  = p1;\n        bestP2  = p2;\n      }\n    }\n    assert(bestP1 != NULL && bestP2 != NULL && bestVal > 0);\n  }\n\n  GNode findCorrespondingNode(GNode start, const Point* p1, const Point* p2) {\n    for (auto ii : graph.edges(start)) {\n      GNode dst  = graph.getEdgeDst(ii);\n      Element& e = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n      int count  = 0;\n      for (int i = 0; i < e.dim(); ++i) {\n        if (e.getPoint(i) == p1 || e.getPoint(i) == p2) {\n          if (++count == 2)\n            return dst;\n        }\n      }\n    }\n    GALOIS_DIE(\"unreachable\");\n    return start;\n  }\n\n  bool planarSearch(const Point* p, GNode start, GNode& node) {\n    // Try simple hill climbing instead\n    ContainsTuple contains(graph, p->t());\n    while (!contains(start)) {\n      Element& element = graph.getData(start, galois::MethodFlag::WRITE);\n      if (element.boundary()) {\n        // Should only happen when quad tree returns a boundary point which is\n        // rare There's only one way to go from here\n        
assert(std::distance(graph.edge_begin(start), graph.edge_end(start)) ==\n               1);\n        start = graph.getEdgeDst(\n            graph.edge_begin(start, galois::MethodFlag::WRITE));\n      } else {\n        // Find which neighbor will get us to point fastest by computing normal\n        // vectors\n        const Point *p1, *p2;\n        findBestNormal(element, p, p1, p2);\n        start = findCorrespondingNode(start, p1, p2);\n      }\n    }\n\n    node = start;\n    return true;\n  }\n\n  bool findContainingElement(const Point* p, GNode& node) {\n    Point* result;\n    if (!tree->find(p, result)) {\n      return false;\n    }\n\n    result->get(galois::MethodFlag::WRITE);\n\n    GNode someNode = result->someElement();\n\n    // Not in mesh yet\n    if (!someNode) {\n      return false;\n    }\n\n    return planarSearch(p, someNode, node);\n  }\n\n  using Alloc = galois::PerIterAllocTy;\n\n  struct LocalState {\n    Cavity<Alloc> cav;\n    LocalState(Graph& graph, Alloc& alloc) : cav(graph, alloc) {}\n  };\n\n  template <int Version, typename C>\n  void processPoint(Point* p, C& ctx) {\n    Cavity<Alloc>* cavp = NULL;\n\n    if (Version == detDisjoint) {\n\n      if (ctx.isFirstPass()) {\n        LocalState* localState = ctx.template createLocalState<LocalState>(\n            graph, ctx.getPerIterAlloc());\n        cavp = &localState->cav;\n\n      } else {\n\n        LocalState* localState = ctx.template getLocalState<LocalState>();\n        localState->cav.update();\n        return;\n      }\n    }\n\n    p->get(galois::MethodFlag::WRITE);\n    assert(!p->inMesh());\n\n    GNode node;\n    if (!findContainingElement(p, node)) {\n      // Someone updated an element while we were searching, producing\n      // a semi-consistent state\n      // ctx.push(p);\n      // Current version is safe with locking so this shouldn't happen\n      GALOIS_DIE(\"unreachable\");\n      return;\n    }\n\n    assert(graph.getData(node).inTriangle(p->t()));\n    
assert(graph.containsNode(node));\n\n    if (Version == detDisjoint && ctx.isFirstPass()) {\n      cavp->init(node, p);\n      cavp->build();\n    } else {\n      Cavity<Alloc> cav(graph, ctx.getPerIterAlloc());\n      cav.init(node, p);\n      cav.build();\n      if (Version == detPrefix)\n        return;\n      ctx.cautiousPoint();\n      cav.update();\n    }\n  }\n\n  template <int Version, typename WL, typename B, typename... Args>\n  void generateMesh(B& pptrs, Args&&... args) {\n\n    galois::for_each(\n        galois::iterate(pptrs),\n        [&, this](Point* p, auto& ctx) { this->processPoint<Version>(p, ctx); },\n        galois::wl<WL>(), galois::loopname(\"generateMesh\"),\n        galois::local_state<LocalState>(), galois::per_iter_alloc(),\n        galois::no_pushes(), std::forward<Args>(args)...);\n  }\n};\n\n/*\ntemplate<int Version=detBase>\nstruct Process {\n\n\n\n  //! Serial operator\n  void operator()(Point* p) {\n    p->get(galois::MethodFlag::WRITE);\n    assert(!p->inMesh());\n\n    GNode node;\n    if (!findContainingElement(p, node)) {\n      GALOIS_DIE(\"Could not find triangle containing point\");\n      return;\n    }\n\n    assert(graph.getData(node).inTriangle(p->t()));\n    assert(graph.containsNode(node));\n\n    Cavity<> cav(graph);\n    cav.init(node, p);\n    cav.build();\n    cav.update();\n  }\n};\n*/\n\nstatic void run(Rounds& rounds, Graph& graph) {\n  typedef galois::worklists::PerThreadChunkLIFO<32> Chunk;\n  typedef galois::worklists::Deterministic<> DWL;\n\n  for (int i = maxRounds - 1; i >= 0; --i) {\n\n    galois::StatTimer BT(\"buildtree\");\n    BT.start();\n    assert(rounds[i + 1]);\n    PtrPoints& tptrs = *(rounds[i + 1]);\n    QuadTree tree(tptrs.begin(), tptrs.end());\n    BT.stop();\n\n    galois::StatTimer PT(\"ParallelTime\");\n    PT.start();\n\n    assert(rounds[i]);\n    galois::InsertBag<Point*>& pptrs = *(rounds[i]);\n\n    DelaunayTriangulation dt{&tree, graph};\n    switch (detAlgo) {\n    case nondet:\n  
    dt.generateMesh<detBase, Chunk>(pptrs);\n      break;\n    case detBase:\n      dt.generateMesh<detBase, DWL>(pptrs);\n      break;\n    case detPrefix: {\n      auto nv = [&dt](Point* p, auto& ctx) {\n        dt.processPoint<detPrefix>(p, ctx);\n      };\n      dt.generateMesh<detBase, DWL>(\n          pptrs, galois::neighborhood_visitor<decltype(nv)>(nv));\n      break;\n    }\n    case detDisjoint:\n      dt.generateMesh<detDisjoint, DWL>(pptrs);\n      break;\n    default:\n      GALOIS_DIE(\"unknown algorithm: \", detAlgo);\n    }\n\n    PT.stop();\n  }\n}\n\nvoid deleteRounds(Rounds& rounds) {\n  for (auto r : rounds)\n    delete r;\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, url, &inputFile);\n\n  galois::StatTimer totalTime(\"TimerTotal\");\n  totalTime.start();\n\n  if (!meshGraph) {\n    GALOIS_DIE(\"This application requires a mesh graph input;\"\n               \" please use the -meshGraph flag \"\n               \" to indicate the input is a mesh graph.\");\n  }\n\n  Graph graph;\n\n  //! All Point* refer to elements in this bag\n  //! [Define InsertBag]\n  // BasePoints expands to galois::InsertBag<Point>\n  BasePoints basePoints;\n  //! 
[Define InsertBag]\n\n  Rounds rounds;\n\n  bool writepoints = doWritePoints.size() > 0;\n  ReadInput(graph, basePoints, rounds)(inputFile, !writepoints);\n  if (writepoints) {\n    std::cout << \"Writing \" << doWritePoints << \"\\n\";\n    PointList points;\n    copyPointsFromRounds(points, rounds);\n    writePoints(doWritePoints, points);\n    deleteRounds(rounds);\n    return 0;\n  }\n\n  const char* name = 0;\n  switch (detAlgo) {\n  case nondet:\n    name = \"nondet\";\n    break;\n  case detBase:\n    name = \"detBase\";\n    break;\n  case detPrefix:\n    name = \"detPrefix\";\n    break;\n  case detDisjoint:\n    name = \"detDisjoint\";\n    break;\n  default:\n    name = \"unknown\";\n    break;\n  }\n  galois::gInfo(\"Algorithm \", name);\n\n  galois::StatTimer execTime(\"Timer_0\");\n  execTime.start();\n  run(rounds, graph);\n  execTime.stop();\n  std::cout << \"mesh size: \" << graph.size() << \"\\n\";\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  if (!skipVerify) {\n    Verifier verifier;\n    if (!verifier.verify(&graph)) {\n      GALOIS_DIE(\"triangulation failed\");\n    }\n    std::cout << \"Triangulation OK\\n\";\n  }\n\n  if (doWriteMesh.size()) {\n    std::string base = doWriteMesh;\n    std::cout << \"Writing \" << base << \"\\n\";\n    writeMesh(base.c_str(), graph);\n\n    PointList points;\n    // Reordering messes up connection between id and place in pointlist\n    ReadPoints(points).from(inputFile);\n    writePoints(base.append(\".node\"), points);\n  }\n\n  deleteRounds(rounds);\n\n  totalTime.stop();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/Element.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Element.h\"\n#include \"Point.h\"\n\nstd::ostream& operator<<(std::ostream& out, const Element& e) {\n  return e.print(out);\n}\n\nbool Element::inTriangle(const Tuple& p) const {\n  if (boundary())\n    return false;\n\n  const Tuple& p1 = points[0]->t();\n  const Tuple& p2 = points[1]->t();\n  const Tuple& p3 = points[2]->t();\n\n  if ((p1 == p) || (p2 == p) || (p3 == p)) {\n    return false;\n  }\n\n  int count  = 0;\n  double px  = p.x();\n  double py  = p.y();\n  double p1x = p1.x();\n  double p1y = p1.y();\n  double p2x = p2.x();\n  double p2y = p2.y();\n  double p3x = p3.x();\n  double p3y = p3.y();\n\n  if (p2x < p1x) {\n    if ((p2x < px) && (p1x >= px)) {\n      if (((py - p2y) * (p1x - p2x)) < ((px - p2x) * (p1y - p2y))) {\n        count = 1;\n      }\n    }\n  } else {\n    if ((p1x < px) && 
(p2x >= px)) {\n      if (((py - p1y) * (p2x - p1x)) < ((px - p1x) * (p2y - p1y))) {\n        count = 1;\n      }\n    }\n  }\n\n  if (p3x < p2x) {\n    if ((p3x < px) && (p2x >= px)) {\n      if (((py - p3y) * (p2x - p3x)) < ((px - p3x) * (p2y - p3y))) {\n        if (count == 1) {\n          return false;\n        }\n        count++;\n      }\n    }\n  } else {\n    if ((p2x < px) && (p3x >= px)) {\n      if (((py - p2y) * (p3x - p2x)) < ((px - p2x) * (p3y - p2y))) {\n        if (count == 1) {\n          return false;\n        }\n        count++;\n      }\n    }\n  }\n\n  if (p1x < p3x) {\n    if ((p1x < px) && (p3x >= px)) {\n      if (((py - p1y) * (p3x - p1x)) < ((px - p1x) * (p3y - p1y))) {\n        if (count == 1) {\n          return false;\n        }\n        count++;\n      }\n    }\n  } else {\n    if ((p3x < px) && (p1x >= px)) {\n      if (((py - p3y) * (p1x - p3x)) < ((px - p3x) * (p1y - p3y))) {\n        if (count == 1) {\n          return false;\n        }\n        count++;\n      }\n    }\n  }\n\n  return count == 1;\n}\n\nbool Element::clockwise() const {\n  assert(!boundary());\n\n  double t1_x = points[0]->t().x();\n  double t1_y = points[0]->t().y();\n\n  double t2_x = points[1]->t().x();\n  double t2_y = points[1]->t().y();\n\n  double t3_x = points[2]->t().x();\n  double t3_y = points[2]->t().y();\n\n  double counter_clockwise =\n      (t2_x - t1_x) * (t3_y - t1_y) - (t3_x - t1_x) * (t2_y - t1_y);\n\n  return counter_clockwise < 0;\n}\n\nbool Element::inCircle(const Tuple& p) const {\n  if (boundary())\n    return false;\n\n  // This version computes the determinant of a matrix including the\n  // coordinates of each points + distance of these points to the origin\n  // in order to check if a point is inside a triangle or not\n  double t1_x = points[0]->t().x();\n  double t1_y = points[0]->t().y();\n\n  double t2_x = points[1]->t().x();\n  double t2_y = points[1]->t().y();\n\n  double t3_x = points[2]->t().x();\n  double t3_y = 
points[2]->t().y();\n\n  double p_x = p.x();\n  double p_y = p.y();\n\n  // Check if the points (t1,t2,t3) are sorted clockwise or\n  // counter-clockwise:\n  // -> counter_clockwise > 0 => counter clockwise\n  // -> counter_clockwise = 0 => degenerated triangle\n  // -> counter_clockwise < 0 => clockwise\n  double counter_clockwise =\n      (t2_x - t1_x) * (t3_y - t1_y) - (t3_x - t1_x) * (t2_y - t1_y);\n\n  // If the triangle is degenerate, then the triangle should be updated\n  if (counter_clockwise == 0.0) {\n    return true;\n  }\n\n  // Compute the following determinant:\n  // | t1_x-p_x  t1_y-p_y  (t1_x-p_x)^2+(t1_y-p_y)^2 |\n  // | t2_x-p_x  t2_y-p_y  (t2_x-p_x)^2+(t2_y-p_y)^2 |\n  // | t3_x-p_x  t3_y-p_y  (t3_x-p_x)^2+(t3_y-p_y)^2 |\n  //\n  // If the determinant is >0 then the point (p_x,p_y) is inside the\n  // circumcircle of the triangle (t1,t2,t3).\n\n  // Value of columns 1 and 2 of the matrix\n  double t1_p_x, t1_p_y, t2_p_x, t2_p_y, t3_p_x, t3_p_y;\n  // Determinant of minors extracted from columns 1 and 2\n  // (det_t3_t1_m corresponds to the opposite)\n  double det_t1_t2, det_t2_t3, det_t3_t1_m;\n  // Values of the column 3 of the matrix\n  double t1_col3, t2_col3, t3_col3;\n\n  t1_p_x = t1_x - p_x;\n  t1_p_y = t1_y - p_y;\n  t2_p_x = t2_x - p_x;\n  t2_p_y = t2_y - p_y;\n  t3_p_x = t3_x - p_x;\n  t3_p_y = t3_y - p_y;\n\n  det_t1_t2   = t1_p_x * t2_p_y - t2_p_x * t1_p_y;\n  det_t2_t3   = t2_p_x * t3_p_y - t3_p_x * t2_p_y;\n  det_t3_t1_m = t3_p_x * t1_p_y - t1_p_x * t3_p_y;\n  t1_col3     = t1_p_x * t1_p_x + t1_p_y * t1_p_y;\n  t2_col3     = t2_p_x * t2_p_x + t2_p_y * t2_p_y;\n  t3_col3     = t3_p_x * t3_p_x + t3_p_y * t3_p_y;\n\n  double det =\n      t1_col3 * det_t2_t3 + t2_col3 * det_t3_t1_m + t3_col3 * det_t1_t2;\n\n  // If the points are enumerated in clockwise, then negate the result\n  if (counter_clockwise < 0) {\n    return det < 0;\n  }\n  return det > 0;\n}\n\nstd::ostream& Element::print(std::ostream& out) const {\n  out << '[';\n  for 
(int i = 0; i < dim(); ++i) {\n    out << points[i]->id() << \" \";\n    points[i]->print(out);\n    out << (i < (dim() - 1) ? \", \" : \"\");\n  }\n  out << ']';\n  return out;\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/Element.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef ELEMENT_H\n#define ELEMENT_H\n\n#include \"Tuple.h\"\n\n#include <ostream>\n#include <stdlib.h>\n\nclass Point;\n\nclass Element {\n  Point* points[3];\n\npublic:\n  Element(const Element& e) {\n    points[0] = e.points[0];\n    points[1] = e.points[1];\n    points[2] = e.points[2];\n  }\n\n  Element(Point* a, Point* b, Point* c) {\n    points[0] = a;\n    points[1] = b;\n    points[2] = c;\n  }\n\n  Element(Point* a, Point* b) {\n    points[0] = a;\n    points[1] = b;\n    points[2] = NULL;\n  }\n\n  Point* getPoint(int i) { return points[i]; }\n  const Point* getPoint(int i) const { return points[i]; }\n\n  bool boundary() const { return points[2] == NULL; }\n  int dim() const { return boundary() ? 2 : 3; }\n\n  bool clockwise() const;\n\n  //! 
determine if a tuple is inside the triangle\n  bool inTriangle(const Tuple& p) const;\n\n  //! determine if the circumcircle of the triangle contains the tuple\n  bool inCircle(const Tuple& p) const;\n\n  std::ostream& print(std::ostream& out) const;\n};\n\nstd::ostream& operator<<(std::ostream& out, const Element& e);\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/Graph.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef GRAPH_H\n#define GRAPH_H\n\n#include \"Element.h\"\n\n#include \"galois/optional.h\"\n#include \"galois/graphs/Graph.h\"\n\n#include <vector>\n#include <deque>\n\ntypedef galois::graphs::MorphGraph<Element, char, true> Graph;\ntypedef Graph::GraphNode GNode;\n\n//! 
Factor out common graph traversals\ntemplate <typename Alloc = std::allocator<char>>\nstruct Searcher : private boost::noncopyable {\n  typedef Alloc allocator_type;\n  typedef typename Alloc::template rebind<GNode>::other GNodeVectorAlloc;\n  typedef std::vector<GNode, GNodeVectorAlloc> GNodeVector;\n\n  struct Marker {\n    GNodeVector seen;\n    Marker(Graph&, const Alloc& a) : seen(a) {}\n    void mark(GNode n) { seen.push_back(n); }\n    bool hasMark(GNode n) {\n      return std::find(seen.begin(), seen.end(), n) != seen.end();\n    }\n  };\n\n  Graph& graph;\n  GNodeVector matches, inside;\n  const allocator_type& alloc;\n\n  Searcher(Graph& g, const Alloc& a = allocator_type())\n      : graph(g), matches(a), inside(a), alloc(a) {}\n\n  struct DetLess {\n    Graph& g;\n    DetLess(Graph& x) : g(x) {}\n    bool operator()(GNode a, GNode b) const {\n      Element& e1 = g.getData(a, galois::MethodFlag::UNPROTECTED);\n      Element& e2 = g.getData(b, galois::MethodFlag::UNPROTECTED);\n\n      for (int i = 0; i < 3; ++i) {\n        uintptr_t v1 = (i < 2 || !e1.boundary())\n                           ? reinterpret_cast<uintptr_t>(e1.getPoint(i))\n                           : 0;\n        uintptr_t v2 = (i < 2 || !e2.boundary())\n                           ? 
reinterpret_cast<uintptr_t>(e2.getPoint(i))\n                           : 0;\n        if (v1 < v2)\n          return true;\n        else if (v1 > v2)\n          return false;\n      }\n      return false;\n    }\n  };\n\n  void removeDupes(GNodeVector& v) {\n    std::sort(v.begin(), v.end(), DetLess(graph));\n    typename GNodeVector::iterator end = std::unique(v.begin(), v.end());\n    v.resize(end - v.begin());\n  }\n\n  template <typename Pred>\n  void find_(const GNode& start, const Pred& pred, bool all) {\n    typedef galois::optional<GNode> SomeGNode;\n    typedef typename Alloc::template rebind<std::pair<GNode, SomeGNode>>::other\n        WorklistAlloc;\n    typedef std::deque<std::pair<GNode, SomeGNode>, WorklistAlloc> Worklist;\n\n    Worklist wl(alloc);\n    wl.push_back(std::make_pair(start, SomeGNode()));\n\n    Marker marker(graph, alloc);\n    while (!wl.empty()) {\n      GNode cur      = wl.front().first;\n      SomeGNode prev = wl.front().second;\n\n      wl.pop_front();\n\n      if (!graph.containsNode(cur, galois::MethodFlag::WRITE))\n        continue;\n\n      if (marker.hasMark(cur))\n        continue;\n\n      // NB(ddn): Technically this makes DelaunayTriangulation.cpp::Process not\n      // cautious\n      if (!all)\n        marker.mark(cur);\n\n      bool matched = false;\n      if (pred(cur)) {\n        matched = true;\n        matches.push_back(cur);\n        if (all) {\n          marker.mark(cur);\n        } else\n          break; // Found it\n      } else {\n        if (all && prev)\n          inside.push_back(*prev);\n      }\n\n      // Search neighbors (a) when matched and looking for all or (b) when no\n      // match and looking for first\n      if (matched == all) {\n        for (auto ii : graph.edges(cur)) {\n          GNode dst = graph.getEdgeDst(ii);\n          wl.push_back(std::make_pair(dst, SomeGNode(cur)));\n        }\n      }\n    }\n\n    if (all) {\n      removeDupes(matches);\n      removeDupes(inside);\n    }\n  }\n\n  
//! Find the first occurance of element matching pred\n  template <typename Pred>\n  void findFirst(const GNode& start, const Pred& p) {\n    find_(start, p, false);\n  }\n\n  //! Find all the elements matching pred (assuming monotonic predicate)\n  template <typename Pred>\n  void findAll(const GNode& start, const Pred& p) {\n    find_(start, p, true);\n    return;\n  }\n};\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/Point.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef POINT_H\n#define POINT_H\n\n#include \"Tuple.h\"\n#include \"Graph.h\"\n\n#include \"galois/CheckedObject.h\"\n\n#include <ostream>\n#include <algorithm>\n\nclass Point : public galois::GChecked<void> {\n  Tuple m_t;\n  GNode m_n;\n  long m_id;\n\npublic:\n  Point(double x, double y, long id) : m_t(x, y), m_n(NULL), m_id(id) {}\n\n  const Tuple& t() const { return m_t; }\n  long id() const { return m_id; }\n\n  Tuple& t() { return m_t; }\n  long& id() { return m_id; }\n\n  void addElement(const GNode& n) { m_n = n; }\n\n  void removeElement(const GNode& n) {\n    if (m_n == n)\n      m_n = NULL;\n  }\n\n  bool inMesh() const { return m_n != NULL; }\n\n  GNode someElement() const { return m_n; }\n\n  void print(std::ostream& os) const {\n    os << \"(id: \" << m_id << \" t: \";\n    m_t.print(os);\n    if (m_n 
!= NULL)\n      os << \" SOME)\";\n    else\n      os << \" NULL)\";\n  }\n};\n\nstatic inline std::ostream& operator<<(std::ostream& os, const Point& rhs) {\n  rhs.print(os);\n  return os;\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/QuadTree.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef QUADTREE_H\n#define QUADTREE_H\n\n#include \"Point.h\"\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include <boost/iterator/transform_iterator.hpp>\n#include <boost/array.hpp>\n\n#include <limits>\n\ninline int getIndex(const Tuple& a, const Tuple& b) {\n  int index = 0;\n  for (int i = 0; i < 2; ++i) {\n    if (a[i] < b[i]) {\n      index += 1 << i;\n    }\n  }\n  return index;\n}\n\ninline void makeNewCenter(int index, const Tuple& center, double radius,\n                          Tuple& newCenter) {\n  newCenter = center;\n  for (int i = 0; i < 2; ++i) {\n    newCenter[i] += (index & (1 << i)) > 0 ? 
radius : -radius;\n  }\n}\n\nstatic const int maxLeafSize = 16;\n\n/**\n * Finds points nearby a given point.\n */\nclass PQuadTree {\n  struct FindResult {\n    Point* p;\n    double best;\n  };\n\n  struct DerefPointer {\n    Point operator()(Point* p) const { return *p; }\n  };\n\n  struct Node {\n    typedef boost::array<Point*, maxLeafSize> PointsTy;\n    Node* child[4];\n    PointsTy* points;\n    int size;\n\n    //! Make internal node\n    explicit Node() {\n      memset(child, 0, sizeof(*child) * 4);\n      points = NULL;\n    }\n\n    //! Make leaf node\n    Node(Point* p, PointsTy* ps) {\n      memset(child, 0, sizeof(*child) * 4);\n      points        = ps;\n      points->at(0) = p;\n      size          = 1;\n    }\n\n    bool isLeaf() const { return points != NULL; }\n  };\n\n  void deleteNode(Node* root) {\n    if (root->isLeaf()) {\n      pointsAlloc.destroy(root->points);\n      pointsAlloc.deallocate(root->points, 1);\n    } else {\n      for (int i = 0; i < 4; ++i) {\n        if (root->child[i])\n          deleteNode(root->child[i]);\n      }\n    }\n    nodeAlloc.destroy(root);\n    nodeAlloc.deallocate(root, 1);\n  }\n\n  Node* newNode() {\n    Node* n = nodeAlloc.allocate(1);\n    nodeAlloc.construct(n, Node());\n    return n;\n  }\n\n  Node* newNode(Point* p) {\n    Node* n            = nodeAlloc.allocate(1);\n    Node::PointsTy* ps = pointsAlloc.allocate(1);\n    pointsAlloc.construct(ps, Node::PointsTy());\n    nodeAlloc.construct(n, Node(p, ps));\n    return n;\n  }\n\n  template <typename IterTy>\n  struct WorkItem {\n    IterTy begin;\n    IterTy end;\n    Tuple center;\n    double radius;\n    Node* root;\n    PQuadTree* self;\n\n    WorkItem(PQuadTree* s, IterTy b, IterTy e, Node* n, Tuple c, double r)\n        : begin(b), end(e), center(c), radius(r), root(n), self(s) {}\n\n    void operator()() {\n      for (; begin != end; ++begin) {\n        self->add(root, *begin, center, radius);\n      }\n    }\n  };\n\n  template <typename 
IterTy>\n  struct PAdd {\n    void operator()(WorkItem<IterTy>& w) { w(); }\n    void operator()(WorkItem<IterTy>& w,\n                    galois::UserContext<WorkItem<IterTy>>&) {\n      w();\n    }\n  };\n\n  struct Split {\n    int index;\n    TupleDataTy pivot;\n    Split(int i, TupleDataTy p) : index(i), pivot(p) {}\n    bool operator()(Point* p) { return p->t()[index] < pivot; }\n  };\n\n  Tuple m_center;\n  double m_radius;\n  Node* m_root;\n\n  galois::FixedSizeAllocator<Node> nodeAlloc;\n  galois::FixedSizeAllocator<Node::PointsTy> pointsAlloc;\n\n  template <typename IterTy>\n  void init(IterTy begin, IterTy end) {\n\n    galois::GReduceMin<TupleDataTy> minX;\n    galois::GReduceMin<TupleDataTy> minY;\n\n    galois::GReduceMax<TupleDataTy> maxX;\n    galois::GReduceMax<TupleDataTy> maxY;\n\n    galois::do_all(galois::iterate(begin, end), [&](const Point* p) {\n      minX.update(p->t().x());\n      minY.update(p->t().y());\n\n      maxX.update(p->t().x());\n      maxY.update(p->t().y());\n    });\n\n    Tuple mmost(maxX.reduce(), maxY.reduce());\n    Tuple lleast(minX.reduce(), minY.reduce());\n\n    m_radius = std::max(mmost.x() - lleast.x(), mmost.y() - lleast.y()) / 2.0;\n\n    m_center = lleast;\n    m_center.x() += m_radius;\n    m_center.y() += m_radius;\n  }\n\n  template <typename IterTy, typename OutIterTy>\n  void divideWork(IterTy begin, IterTy end, Node* root, Tuple center,\n                  double radius, OutIterTy& out, int depth) {\n    if (depth == 0 || std::distance(begin, end) <= 16) {\n      *out++ = WorkItem<IterTy>(this, begin, end, root, center, radius);\n      return;\n    }\n\n    IterTy its[5];\n    its[0] = begin;\n    its[4] = end;\n\n    its[2] = std::partition(its[0], its[4], Split(1, center[1]));\n    its[1] = std::partition(its[0], its[2], Split(0, center[0]));\n    its[3] = std::partition(its[2], its[4], Split(0, center[0]));\n\n    radius *= 0.5;\n    --depth;\n\n    for (int i = 0; i < 4; ++i) {\n      Tuple newC;\n      
root->child[i] = newNode();\n      makeNewCenter(i, center, radius, newC);\n      divideWork(its[i], its[i + 1], root->child[i], newC, radius, out, depth);\n    }\n  }\n\n  bool couldBeCloser(const Point* p, const Tuple& center, double radius,\n                     FindResult& result) {\n    if (result.p == NULL)\n      return true;\n\n    const Tuple& t = p->t();\n    double d       = 0;\n    for (int i = 0; i < t.dim(); ++i) {\n      double min = center[i] - radius - t[i];\n      double max = center[i] + radius - t[i];\n      d += std::min(min * min, max * max);\n    }\n    return d < result.best;\n  }\n\n  bool find(Node* root, const Point* p, const Tuple& center, double radius,\n            FindResult& result) {\n    if (root->isLeaf()) {\n      bool retval     = false;\n      const Tuple& t0 = p->t();\n      for (int i = 0; i < root->size; ++i) {\n        const Point* o = root->points->at(i);\n        if (!o->inMesh())\n          continue;\n\n        double d        = 0;\n        const Tuple& t1 = o->t();\n        for (int j = 0; j < t0.dim(); ++j) {\n          double v = t0[j] - t1[j];\n          d += v * v;\n        }\n        if (result.p == NULL || d < result.best) {\n          result.p    = root->points->at(i);\n          result.best = d;\n          retval      = true;\n        }\n      }\n      return retval;\n    }\n\n    // Search, starting at closest quadrant to p\n    radius *= 0.5;\n    int start = getIndex(center, p->t());\n    for (int i = 0; i < 4; ++i) {\n      int index = (start + i) % 4;\n      Node* kid = root->child[index];\n      if (kid != NULL) {\n        Tuple newCenter;\n        makeNewCenter(index, center, radius, newCenter);\n        if (couldBeCloser(p, newCenter, radius, result)) {\n          if (false) {\n            // exhaustive\n            find(kid, p, newCenter, radius, result);\n          } else {\n            // return only first\n            if (find(kid, p, newCenter, radius, result))\n              return true;\n          
}\n        }\n      }\n    }\n    return false;\n  }\n\n  void makeInternal(Node* root, const Tuple& center, double radius) {\n    assert(root->isLeaf());\n\n    Node::PointsTy* points = root->points;\n    root->points           = NULL;\n\n    for (Node::PointsTy::iterator ii = points->begin(),\n                                  ei = points->begin() + root->size;\n         ii != ei; ++ii) {\n      add(root, *ii, center, radius);\n    }\n    pointsAlloc.destroy(points);\n    pointsAlloc.deallocate(points, 1);\n  }\n\n  void add(Node* root, Point* p, const Tuple& center, double radius) {\n    if (root->isLeaf()) {\n      if (root->size < maxLeafSize) {\n        root->points->at(root->size++) = p;\n      } else {\n        makeInternal(root, center, radius);\n        add(root, p, center, radius);\n      }\n      return;\n    }\n\n    int index  = getIndex(center, p->t());\n    Node*& kid = root->child[index];\n    if (kid == NULL) {\n      kid = newNode(p);\n    } else {\n      radius *= 0.5;\n      assert(radius != 0.0);\n      Tuple newCenter;\n      makeNewCenter(index, center, radius, newCenter);\n      add(kid, p, newCenter, radius);\n    }\n  }\n\n  template <typename OutputTy>\n  void output(Node* root, OutputTy out) {\n    if (root->isLeaf()) {\n      std::copy(\n          boost::make_transform_iterator(root->points->begin(), DerefPointer()),\n          boost::make_transform_iterator(root->points->begin() + root->size,\n                                         DerefPointer()),\n          out);\n    } else {\n      for (int i = 0; i < 4; ++i) {\n        Node* kid = root->child[i];\n        if (kid != NULL)\n          output(kid, out);\n      }\n    }\n  }\n\npublic:\n  template <typename IterTy>\n  PQuadTree(IterTy begin, IterTy end) {\n    m_root = newNode();\n\n    init(begin, end);\n\n    typedef std::vector<Point*> PointsBufTy;\n    typedef WorkItem<PointsBufTy::iterator> WIT;\n    typedef std::vector<WIT> WorkTy;\n    typedef 
galois::worklists::PerSocketChunkLIFO<1> WL;\n    PointsBufTy points;\n    std::copy(begin, end, std::back_inserter(points));\n\n    WorkTy work;\n    std::back_insert_iterator<WorkTy> it(work);\n    divideWork(points.begin(), points.end(), m_root, m_center, m_radius, it, 4);\n    galois::for_each(galois::iterate(work), PAdd<PointsBufTy::iterator>(),\n                     galois::wl<WL>());\n  }\n\n  ~PQuadTree() { deleteNode(m_root); }\n\n  template <typename OutputTy>\n  void output(OutputTy out) {\n    if (m_root != NULL) {\n      output(m_root, out);\n    }\n  }\n\n  //! Find point nearby to p\n  bool find(const Point* p, Point*& result) {\n    FindResult r;\n    r.p = NULL;\n    if (m_root) {\n      find(m_root, p, m_center, m_radius, r);\n      if (r.p != NULL) {\n        result = r.p;\n        return true;\n      }\n    }\n    return false;\n  }\n};\n\n/**\n * Finds points nearby a given point.\n */\nclass SQuadTree {\n  struct FindResult {\n    Point* p;\n    double best;\n  };\n\n  struct DerefPointer {\n    Point operator()(Point* p) const { return *p; }\n  };\n\n  struct Node {\n    Node* child[4];\n    Point** points;\n    int size;\n\n    bool isLeaf() const { return points != NULL; }\n\n    void makeInternal(const Tuple& center, double radius) {\n      memset(child, 0, sizeof(*child) * 4);\n      Point** begin = points;\n      points        = NULL;\n\n      for (Point **p = begin, **end = begin + size; p != end; ++p) {\n        add(*p, center, radius);\n      }\n      delete[] begin;\n    }\n\n    void add(Point* p, const Tuple& center, double radius) {\n      if (isLeaf()) {\n        if (size < maxLeafSize) {\n          points[size] = p;\n          ++size;\n        } else {\n          makeInternal(center, radius);\n          add(p, center, radius);\n        }\n        return;\n      }\n\n      int index  = getIndex(center, p->t());\n      Node*& kid = child[index];\n      if (kid == NULL) {\n        kid            = new Node();\n        kid->points   
 = new Point*[maxLeafSize];\n        kid->points[0] = p;\n        kid->size      = 1;\n      } else {\n        radius *= 0.5;\n        assert(radius != 0.0);\n        Tuple newCenter;\n        makeNewCenter(index, center, radius, newCenter);\n        kid->add(p, newCenter, radius);\n      }\n    }\n\n    bool couldBeCloser(const Point* p, const Tuple& center, double radius,\n                       FindResult& result) {\n      if (result.p == NULL)\n        return true;\n\n      const Tuple& t = p->t();\n      double d       = 0;\n      for (int i = 0; i < t.dim(); ++i) {\n        double min = center[i] - radius - t[i];\n        double max = center[i] + radius - t[i];\n        d += std::min(min * min, max * max);\n      }\n      return d < result.best;\n    }\n\n    void find(const Point* p, const Tuple& center, double radius,\n              FindResult& result) {\n      if (isLeaf()) {\n        const Tuple& t0 = p->t();\n        for (int i = 0; i < size; ++i) {\n          double d       = 0;\n          const Point* o = points[i];\n          if (!o->inMesh())\n            continue;\n          const Tuple& t1 = o->t();\n          for (int j = 0; j < t0.dim(); ++j) {\n            double v = t0[j] - t1[j];\n            d += v * v;\n          }\n          if (result.p == NULL || d < result.best) {\n            result.p    = points[i];\n            result.best = d;\n          }\n        }\n        return;\n      }\n\n      // Search, starting at closest quadrant to p\n      radius *= 0.5;\n      int start = getIndex(center, p->t());\n      for (int i = 0; i < 4; ++i) {\n        int index = (start + i) % 4;\n        Node* kid = child[index];\n        if (kid != NULL) {\n          Tuple newCenter;\n          makeNewCenter(index, center, radius, newCenter);\n          if (kid->couldBeCloser(p, newCenter, radius, result))\n            kid->find(p, newCenter, radius, result);\n        }\n      }\n    }\n\n    template <typename OutputTy>\n    void output(OutputTy out) {\n      
if (isLeaf()) {\n        std::copy(boost::make_transform_iterator(points, DerefPointer()),\n                  boost::make_transform_iterator(points + size, DerefPointer()),\n                  out);\n      } else {\n        for (int i = 0; i < 4; ++i) {\n          Node* kid = child[i];\n          if (kid != NULL)\n            kid->output(out);\n        }\n      }\n    }\n  };\n\n  void deleteNode(Node*& n) {\n    if (n == NULL)\n      return;\n    if (n->isLeaf()) {\n      delete[] n->points;\n      n->points = NULL;\n    } else {\n      for (int i = 0; i < 4; ++i) {\n        deleteNode(n->child[i]);\n      }\n    }\n\n    delete n;\n    n = NULL;\n  }\n\n  template <typename Begin, typename End>\n  void computeBox(Begin begin, End end, Tuple& least, Tuple& most) {\n    least.x() = least.y() = std::numeric_limits<double>::max();\n    most.x() = most.y() = std::numeric_limits<double>::min();\n\n    for (; begin != end; ++begin) {\n      const Tuple& p = (*begin)->t();\n      for (int i = 0; i < 2; ++i) {\n        if (p[i] < least[i])\n          least[i] = p[i];\n\n        if (p[i] > most[i])\n          most[i] = p[i];\n      }\n    }\n  }\n\n  template <typename Begin, typename End>\n  void init(Begin begin, End end) {\n    Tuple least, most;\n    computeBox(begin, end, least, most);\n\n    radius = std::max(most.x() - least.x(), most.y() - least.y()) / 2.0;\n    center = least;\n    center.x() += radius;\n    center.y() += radius;\n  }\n\n  void add(Point* p) {\n    if (root == NULL) {\n      root         = new Node();\n      root->points = NULL;\n      memset(root->child, 0, sizeof(*root->child) * 4);\n    }\n    root->add(p, center, radius);\n  }\n\n  Tuple center;\n  double radius;\n  Node* root;\n\npublic:\n  template <typename Begin, typename End>\n  SQuadTree(Begin begin, End end) : root(NULL) {\n    init(begin, end);\n    for (; begin != end; ++begin)\n      add(*begin);\n  }\n\n  ~SQuadTree() { deleteNode(root); }\n\n  //! 
Find point nearby to p\n  bool find(const Point* p, Point*& result) {\n    FindResult r;\n    r.p = NULL;\n    if (root) {\n      root->find(p, center, radius, r);\n      if (r.p != NULL) {\n        result = r.p;\n        return true;\n      }\n    }\n    return false;\n  }\n\n  template <typename OutputTy>\n  void output(OutputTy out) {\n    if (root != NULL) {\n      root->output(out);\n    }\n  }\n};\n\ntypedef PQuadTree QuadTree;\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/README.md",
    "content": "Delaunay Triangulation\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThis program produces a Delaunay triangulation from a set of 2-D points. We \nimplement the algorithm proposed by Bowyer and that by Watson:\n\n1. Adrian Bowyer. Computing Dirichlet tessellations, The Computer Journal, \nVol. 24, No. 2, pp 162 - 166, 1981.\n\n2. David F. Watson. Computing the n-dimensional tessellation with application to \nVoronoi polytopes, The Computer Journal, Vol. 24, No. 2, pp 167 - 172, 1981. \n\nINPUT\n--------------------------------------------------------------------------------\n\nThe implementations expect a list of nodes with their coordinates.\n\nYou must specify the -meshGraph flag when running this benchmark.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. 
Run `cd <BUILD>/lonestar/scientific/cpu/delaunaytriangulation; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./delaunaytriangulation-cpu -meshGraph <path-to-node-list> -t 40`\n- `$ ./delaunaytriangulation-deterministic-cpu -meshGraph <path-to-node-list> -nondet -t 40`\n- `$ ./delaunaytriangulation-deterministic-cpu -meshGraph <path-to-node-list> -detBase -t 20`\n- `$ ./delaunaytriangulation-deterministic-cpu -meshGraph <path-to-node-list> -detPrefix -t 30`\n- `$ ./delaunaytriangulation-deterministic-cpu -meshGraph <path-to-node-list> -detDisjoint -t 15`\n\nPERFORMANCE\n--------------------------------------------------------------------------------\n\n* In our experience, delaunaytriangulation-cpu outperforms the deterministic variants in \n  delaunaytriangulation-deterministic-cpu.\n\n* For the for_each loop named \"Main\", the chunk size of galois::wl<CA>() should be \n  tuned. It controls the granularity of work distribution. The optimal value of the \n  constant might depend on the architecture, so you might want to evaluate the \n  performance over a range of values (say [16-4096]).\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/Tuple.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef TUPLE_H\n#define TUPLE_H\n\n#include <ostream>\n#include <cmath>\n\ntypedef double TupleDataTy;\n\nclass Tuple {\n  TupleDataTy data[2];\n\npublic:\n  Tuple() {\n    data[0] = 0;\n    data[1] = 0;\n  }\n  Tuple(TupleDataTy xy) {\n    data[0] = xy;\n    data[1] = xy;\n  }\n  Tuple(TupleDataTy x, TupleDataTy y) {\n    data[0] = x;\n    data[1] = y;\n  }\n  int dim() const { return 2; }\n  TupleDataTy x() const { return data[0]; }\n  TupleDataTy y() const { return data[1]; }\n\n  TupleDataTy& x() { return data[0]; }\n  TupleDataTy& y() { return data[1]; }\n\n  bool operator==(const Tuple& rhs) const {\n    for (int i = 0; i < 2; ++i) {\n      if (data[i] != rhs.data[i])\n        return false;\n    }\n    return true;\n  }\n\n  bool operator!=(const Tuple& rhs) const { return !(*this == rhs); }\n\n  TupleDataTy 
operator[](int index) const { return data[index]; }\n\n  TupleDataTy& operator[](int index) { return data[index]; }\n\n  Tuple operator+(const Tuple& rhs) const {\n    return Tuple(data[0] + rhs.data[0], data[1] + rhs.data[1]);\n  }\n\n  Tuple operator-(const Tuple& rhs) const {\n    return Tuple(data[0] - rhs.data[0], data[1] - rhs.data[1]);\n  }\n\n  //! scalar product\n  Tuple operator*(TupleDataTy d) const {\n    return Tuple(data[0] * d, data[1] * d);\n  }\n\n  //! dot product\n  TupleDataTy dot(const Tuple& rhs) const {\n    return data[0] * rhs.data[0] + data[1] * rhs.data[1];\n  }\n\n  TupleDataTy cross(const Tuple& rhs) const {\n    return data[0] * rhs.data[1] - data[1] * rhs.data[0];\n  }\n\n  void print(std::ostream& os) const {\n    os << \"(\" << data[0] << \", \" << data[1] << \")\";\n  }\n};\n\nstatic inline std::ostream& operator<<(std::ostream& os, const Tuple& rhs) {\n  rhs.print(os);\n  return os;\n}\n\nclass Tuple3 {\n  TupleDataTy data[3];\n\npublic:\n  Tuple3() {\n    data[0] = 0;\n    data[1] = 0;\n    data[2] = 0;\n  }\n  Tuple3(TupleDataTy xyz) {\n    data[0] = xyz;\n    data[1] = xyz;\n    data[1] = xyz;\n  }\n  Tuple3(TupleDataTy x, TupleDataTy y, TupleDataTy z) {\n    data[0] = x;\n    data[1] = y;\n    data[2] = z;\n  }\n  int dim() const { return 3; }\n  TupleDataTy x() const { return data[0]; }\n  TupleDataTy y() const { return data[1]; }\n  TupleDataTy z() const { return data[2]; }\n\n  TupleDataTy& x() { return data[0]; }\n  TupleDataTy& y() { return data[1]; }\n  TupleDataTy& z() { return data[2]; }\n\n  bool operator==(const Tuple3& rhs) const {\n    for (int i = 0; i < 3; ++i) {\n      if (data[i] != rhs.data[i])\n        return false;\n    }\n    return true;\n  }\n\n  bool operator!=(const Tuple3& rhs) const { return !(*this == rhs); }\n\n  TupleDataTy operator[](int index) const { return data[index]; }\n\n  TupleDataTy& operator[](int index) { return data[index]; }\n\n  Tuple3 operator+(const Tuple3& rhs) const {\n    return 
Tuple3(data[0] + rhs.data[0], data[1] + rhs.data[1],\n                  data[2] + rhs.data[2]);\n  }\n\n  Tuple3 operator-(const Tuple3& rhs) const {\n    return Tuple3(data[0] - rhs.data[0], data[1] - rhs.data[1],\n                  data[2] + rhs.data[2]);\n  }\n\n  //! scalar product\n  Tuple3 operator*(TupleDataTy d) const {\n    return Tuple3(data[0] * d, data[1] * d, data[2] * d);\n  }\n\n  //! dot product\n  TupleDataTy dot(const Tuple3& rhs) const {\n    return data[0] * rhs.data[0] + data[1] * rhs.data[1] +\n           data[2] * rhs.data[2];\n  }\n\n  Tuple3 cross(const Tuple3& rhs) const {\n    return Tuple3(data[1] * rhs.data[2] - data[2] * rhs.data[1],\n                  data[2] * rhs.data[0] - data[0] * rhs.data[2],\n                  data[0] * rhs.data[1] - data[1] * rhs.data[0]);\n  }\n\n  void print(std::ostream& os) const {\n    os << \"(\" << data[0] << \", \" << data[1] << \", \" << data[2] << \")\";\n  }\n};\n\nstatic inline std::ostream& operator<<(std::ostream& os, const Tuple3& rhs) {\n  rhs.print(os);\n  return os;\n}\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/delaunaytriangulation/Verifier.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef VERIFIER_H\n#define VERIFIER_H\n\n#include \"Graph.h\"\n#include \"Point.h\"\n\n#include \"galois/Galois.h\"\n#include \"galois/ParallelSTL.h\"\n\n#include <stack>\n#include <set>\n#include <iostream>\n\nclass Verifier {\n  struct inconsistent {\n    Graph* graph;\n    inconsistent(Graph* g) : graph(g) {}\n\n    bool operator()(const GNode& node) const {\n      Element& e = graph->getData(node);\n\n      size_t dist =\n          std::distance(graph->edge_begin(node), graph->edge_end(node));\n      if (e.dim() == 2) {\n        if (dist != 1) {\n          std::cerr << \"Error: Segment \" << e << \" has \" << dist\n                    << \" relation(s)\\n\";\n          return true;\n        }\n      } else if (e.dim() == 3) {\n        if (dist != 3) {\n          std::cerr << \"Error: Triangle \" << e << \" has \" 
<< dist\n                    << \" relation(s)\\n\";\n          return true;\n        }\n      } else {\n        std::cerr << \"Error: Element with \" << e.dim() << \" edges\\n\";\n        return true;\n      }\n      return false;\n    }\n  };\n\n  struct not_delaunay {\n    Graph* graph;\n    not_delaunay(Graph* g) : graph(g) {}\n\n    bool operator()(const GNode& node) {\n      Element& e1 = graph->getData(node);\n\n      for (auto jj : graph->edges(node)) {\n        const GNode& n = graph->getEdgeDst(jj);\n        Element& e2    = graph->getData(n);\n        if (e1.dim() == 3 && e2.dim() == 3) {\n          Tuple t2;\n          if (!getTupleT2OfRelatedEdge(e1, e2, t2)) {\n            std::cerr << \"missing tuple\\n\";\n            return true;\n          }\n          if (e1.inCircle(t2)) {\n            std::cerr << \"Delaunay property violated: point \" << t2\n                      << \" in element \" << e1 << \"\\n\";\n            return true;\n          }\n        }\n      }\n      return false;\n    }\n\n    bool getTupleT2OfRelatedEdge(const Element& e1, const Element& e2,\n                                 Tuple& t) {\n      int e2_0  = -1;\n      int e2_1  = -1;\n      int phase = 0;\n\n      for (int i = 0; i < e1.dim(); i++) {\n        for (int j = 0; j < e2.dim(); j++) {\n          if (e1.getPoint(i) != e2.getPoint(j))\n            continue;\n\n          if (phase == 0) {\n            e2_0  = j;\n            phase = 1;\n            break;\n          }\n\n          e2_1 = j;\n          for (int k = 0; k < 3; k++) {\n            if (k != e2_0 && k != e2_1) {\n              t = e2.getPoint(k)->t();\n              return true;\n            }\n          }\n        }\n      }\n      return false;\n    }\n  };\n\n  bool checkReachability(Graph* graph) {\n    std::stack<GNode> remaining;\n    std::set<GNode> found;\n    remaining.push(*(graph->begin()));\n\n    while (!remaining.empty()) {\n      GNode node = remaining.top();\n      remaining.pop();\n      if 
(!found.count(node)) {\n        if (!graph->containsNode(node)) {\n          std::cerr << \"Reachable node was removed from graph\\n\";\n        }\n        found.insert(node);\n        int i = 0;\n        for (auto ii : graph->edges(node)) {\n          GNode n = graph->getEdgeDst(ii);\n          assert(i < 3);\n          assert(graph->containsNode(n));\n          assert(node != n);\n          ++i;\n          remaining.push(n);\n        }\n      }\n    }\n\n    if (found.size() != graph->size()) {\n      std::cerr << \"Error: Not all elements are reachable. \";\n      std::cerr << \"Found: \" << found.size() << \" needed: \" << graph->size()\n                << \".\\n\";\n      return false;\n    }\n    return true;\n  }\n\npublic:\n  bool verify(Graph* g) {\n    return galois::ParallelSTL::find_if(g->begin(), g->end(),\n                                        inconsistent(g)) == g->end() &&\n           galois::ParallelSTL::find_if(g->begin(), g->end(),\n                                        not_delaunay(g)) == g->end() &&\n           checkReachability(g);\n  }\n};\n\n#endif\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/CMakeLists.txt",
    "content": "add_executable(longestedge-cpu src/LongestEdge.cpp src/model/Map.cpp\n               src/readers/SrtmReader.cpp src/readers/AsciiReader.cpp\n               src/libmgrs/mgrs.c src/libmgrs/polarst.c src/libmgrs/tranmerc.c\n               src/libmgrs/utm.c src/libmgrs/ups.c src/utils/Utils.cpp\n               src/readers/InpReader.cpp src/writers/InpWriter.cpp\n               src/writers/TriangleFormatWriter.cpp)\nadd_dependencies(apps longestedge-cpu)\ntarget_link_libraries(longestedge-cpu PRIVATE Galois::shmem lonestar)\ninstall(TARGETS longestedge-cpu DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\n\nadd_executable(longestedgeTest test/TestMain.cpp src/model/Map.cpp\n               src/libmgrs/mgrs.c src/libmgrs/polarst.c src/libmgrs/tranmerc.c\n               src/libmgrs/utm.c src/libmgrs/ups.c src/utils/Utils.cpp)\nadd_dependencies(apps longestedgeTest)\ntarget_link_libraries(longestedgeTest PRIVATE Galois::shmem lonestar)\n\nadd_test(test_longest_edge longestedgeTest)\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/README.md",
    "content": "Longest Edge\n============\n\nDESCRIPTION \n-----------\n\nThis program runs a variant of Rivara's mesh refinement algorithm on portions of the earth's surface.\nIt requires the data available at https://dds.cr.usgs.gov/srtm/version2_1/SRTM3/ and can generate `.node`, `.ele`, and `.poly` files that follow the same format used in https://www.cs.cmu.edu/~quake/triangle.html.\nThe command line inputs are the bounds of a box in UTM coordinates.\n\nBUILD\n-----\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/scientific/cpu/longestedge && make -j`\n\n\nRUN\n---\n\nThe following is an example command line call:\n\n - `./longestedge-cpu -l 25 -s 14 -N 52.4 -S 49. -E 23.1 -W 18.1 -data <dataDirectory> -o <outputFile> -square -altOutput`\n\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/out/.gitignore",
    "content": "# Ignore everything in this directory\n*\n# Except this file\n!.gitignore"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/LongestEdge.cpp",
    "content": "#include \"conditions/TerrainConditionChecker.h\"\n#include \"libmgrs/utm.h\"\n#include \"model/Coordinates.h\"\n#include \"model/Graph.h\"\n#include \"model/Map.h\"\n#include \"model/ProductionState.h\"\n#include \"productions/Production.h\"\n#include \"productions/Production1.h\"\n#include \"productions/Production2.h\"\n#include \"productions/Production3.h\"\n#include \"productions/Production4.h\"\n#include \"productions/Production5.h\"\n#include \"productions/Production6.h\"\n#include \"readers/InpReader.h\"\n#include \"readers/SrtmReader.h\"\n#include \"writers/InpWriter.h\"\n#include \"writers/TriangleFormatWriter.h\"\n#include \"utils/ConnectivityManager.h\"\n#include \"utils/GraphGenerator.h\"\n#include \"utils/Utils.h\"\n#include \"readers/AsciiReader.h\"\n\n#include <Lonestar/BoilerPlate.h>\n\n#include <algorithm>\n#include <cstdlib>\n#include <string>\n#include <tuple>\n#include <vector>\n\nnamespace cll = llvm::cl;\n\nstatic const char* name = \"Longest edge mesh generator\";\nstatic const char* desc = \"Implementation of Rivara's Longest Edge algorithm \"\n                          \"based on hyper-graph grammars.\";\nstatic const char* url = \"longest_edge\";\n\n// Command line arguments\nstatic cll::opt<std::string> dataDir(\"data\", cll::Positional,\n                                     cll::desc(\"Directory with data files\"));\nstatic cll::opt<std::string> output(\"o\", cll::Positional,\n                                    cll::desc(\"Basename for output file\"));\nstatic cll::opt<int>\n    tolerance(\"l\", cll::Positional,\n              cll::desc(\"Tolerance for for refinement in meters\"),\n              cll::init(5));\nstatic cll::opt<bool>\n    version2D(\"version2D\",\n              cll::desc(\"Calculate distances using only XY coordinates\"));\nstatic cll::opt<int> steps(\"s\", cll::Positional, cll::desc(\"Number of steps\"));\nstatic cll::opt<double> N(\"N\", cll::desc(\"Latitude of north border\"));\nstatic cll::opt<double> 
S(\"S\", cll::desc(\"Latitude of south border\"));\nstatic cll::opt<double> E(\"E\", cll::desc(\"Longitude of east border\"));\nstatic cll::opt<double> W(\"W\", cll::desc(\"Longitude of west border\"));\nstatic cll::opt<bool> ascii(\"a\", cll::desc(\"Read data from ascii file\"));\nstatic cll::opt<std::string>\n    asciiFile(\"asciiFile\", cll::Positional,\n              cll::desc(\"File with data in ASCII format\"));\nstatic cll::opt<bool> square(\"square\", cll::desc(\"Bind domain to square\"));\nstatic cll::opt<std::string>\n    inputMeshFile(\"imesh\", cll::desc(\"Filename of the inp mesh. It should be \"\n                                     \"inside dataDir and use UTM coordinates\"));\nstatic cll::opt<long> zone(\"zone\", cll::desc(\"UTM zone of the inputMeshFile\"));\nstatic cll::opt<char> hemisphere(\"hemisphere\",\n                                 cll::desc(\"Hemisphere of the inputMeshFile\"));\nstatic cll::opt<bool> altOutput(\n    \"altOutput\",\n    cll::desc(\"Write to .ele,.node,.poly files instead of AVS UCD (.inp)\"));\nstatic cll::opt<bool> display(\"display\",\n                              cll::desc(\"Use external visualizator.\"));\n\nvoid afterStep(int i, Graph& graph);\n\nbool basicCondition(const Graph& graph, GNode& node);\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n\n  LonestarStart(argc, argv, name, desc, url, nullptr);\n  Graph graph{};\n\n  galois::reportPageAlloc(\"MeminfoPre1\");\n  // Tighter upper bound for pre-alloc, useful for machines with limited memory,\n  // e.g., Intel MIC. 
May not be enough for deterministic execution\n  constexpr size_t NODE_SIZE = sizeof(**graph.begin());\n\n  // preallocating memory\n  galois::preAlloc(5 * galois::getActiveThreads() +\n                   NODE_SIZE * 32 * graph.size() /\n                       galois::runtime::pagePoolSize());\n\n  galois::reportPageAlloc(\"MeminfoPre2\");\n\n  galois::gInfo(\"Initial configuration set.\");\n\n  Map* map;\n\n  // creates the initial mesh using the borders and the new map\n  if (inputMeshFile.empty()) {\n    if (ascii) {\n      AsciiReader reader;\n      map = reader.read(asciiFile);\n      GraphGenerator::generateSampleGraphWithData(\n          graph, *map, 0, map->getLength() - 1, map->getWidth() - 1, 0,\n          version2D);\n    } else {\n      SrtmReader reader;\n      // terrain setup:  load terrain heights into the map object\n      map = reader.read(W, N, E, S, dataDir.c_str());\n      galois::gInfo(\"Terrain data read.\");\n      GraphGenerator::generateSampleGraphWithDataWithConversionToUtm(\n          graph, *map, W, N, E, S, version2D, square);\n    }\n    galois::gInfo(\"Initial graph generated\");\n  } else {\n\n    inpRead(dataDir + \"/\" + inputMeshFile, graph, N, S, E, W, version2D);\n    galois::gInfo(\"INP mesh read.\");\n\n    // Let's convert the four corners to geodesic coordinates\n    double x1, x2, x3, x4, y1, y2, y3, y4;\n    Convert_UTM_To_Geodetic(zone, hemisphere, E, N, &y1, &x1);\n    Convert_UTM_To_Geodetic(zone, hemisphere, E, S, &y2, &x2);\n    Convert_UTM_To_Geodetic(zone, hemisphere, W, N, &y3, &x3);\n    Convert_UTM_To_Geodetic(zone, hemisphere, W, S, &y4, &x4);\n\n    std::tie(W, E) = std::minmax(\n        {Utils::r2d(x1), Utils::r2d(x2), Utils::r2d(x3), Utils::r2d(x4)});\n    std::tie(S, N) = std::minmax(\n        {Utils::r2d(y1), Utils::r2d(y2), Utils::r2d(y3), Utils::r2d(y4)});\n\n    // Create the map\n    SrtmReader reader;\n\n    // terrain setup:  load terrain heights into the map object\n    map = reader.read(W, N, E, S, 
dataDir.c_str());\n    galois::gInfo(\"Terrain data read.\");\n\n    map->setZone(zone);\n    map->setHemisphere(hemisphere);\n\n    // Update the coordinates of all graph nodes (mesh nodes, and the interior\n    // nodes)\n    for (auto node : graph) {\n      const auto coords = node->getData().getCoords();\n\n      node->getData().setCoords(\n          Coordinates{coords.getX(), coords.getY(), *map});\n    }\n  }\n\n  // initialize wrapper over graph object (ConnManager)\n  ConnectivityManager connManager{graph};\n  //    DummyConditionChecker checker = DummyConditionChecker();\n  TerrainConditionChecker checker =\n      TerrainConditionChecker(tolerance, connManager, *map);\n  Production1 production1{connManager};\n  Production2 production2{connManager};\n  Production3 production3{connManager};\n  Production4 production4{connManager};\n  Production5 production5{connManager};\n  Production6 production6{connManager};\n  vector<Production*> productions = {&production1, &production2, &production3,\n                                     &production4, &production5, &production6};\n  galois::gInfo(\"Loop is being started...\");\n  //    afterStep(0, graph);\n  for (int j = 0; j < steps; j++) {\n    galois::for_each(galois::iterate(graph.begin(), graph.end()),\n                     [&](GNode node, auto&) {\n                       if (basicCondition(graph, node)) {\n\n                         // terrain checker to see if refinement needed\n                         // based on terrain\n                         checker.execute(node);\n                       }\n                     });\n    galois::gInfo(\"Condition chceking in step \", j, \" finished.\");\n    galois::StatTimer step((\"step\" + std::to_string(j)).c_str());\n    step.start();\n\n    auto prodExecuted = true;\n\n    while (prodExecuted) {\n      prodExecuted = false;\n\n      galois::for_each(\n          galois::iterate(graph.begin(), graph.end()),\n          [&](GNode node, auto& ctx) {\n            // only 
need to check hyperedges\n            if (!basicCondition(graph, node)) {\n              return;\n            }\n\n            // TODO does this have to be initialized for every one?\n            // may be able to optimize\n            ProductionState pState(connManager, node, version2D,\n                                   [&map](double x, double y) -> double {\n                                     return map->get_height(x, y);\n                                   });\n\n            // loop through productions and apply the first applicable\n            // one\n            for (Production* production : productions) {\n              if (production->execute(pState, ctx)) {\n                afterStep(j, graph);\n                prodExecuted = true;\n                return;\n              }\n            }\n          },\n          galois::loopname((\"step\" + std::to_string(j)).c_str()));\n    }\n\n    step.stop();\n    galois::gInfo(\"Step \", j, \" finished.\");\n  }\n  galois::gInfo(\"All steps finished.\");\n\n  // final result writing\n  if (!output.empty()) {\n    if (altOutput) {\n      triangleFormatWriter(output, graph);\n    } else {\n      inpWriter(output + \".inp\", graph);\n    }\n    galois::gInfo(\"Graph written to file \", output);\n  }\n\n  if (display) {\n    if (system((std::string(\"./display.sh \") + output).c_str()))\n      std::abort();\n  }\n\n  delete map;\n  return 0;\n}\n\n//! Checks if node exists + is hyperedge\nbool basicCondition(const Graph& graph, GNode& node) {\n  return graph.containsNode(node, galois::MethodFlag::WRITE) &&\n         node->getData().isHyperEdge();\n}\n\n//! Writes intermediate data to file\nvoid afterStep(int GALOIS_UNUSED(step), Graph& GALOIS_UNUSED(graph)) {}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/conditions/ConditionChecker.h",
    "content": "#ifndef GALOIS_CONDITIONCHECKER_H\n#define GALOIS_CONDITIONCHECKER_H\n\n#include \"../model/Graph.h\"\n\n//! An implementation of a condition checker just needs to implement\n//! the execute function to check the condition on some node\nclass ConditionChecker {\npublic:\n  virtual bool execute(GNode& node) = 0;\n};\n\n#endif // GALOIS_CONDITIONCHECKER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/conditions/DummyConditionChecker.h",
    "content": "#ifndef GALOIS_DUMMYCONDITIONCHECKER_H\n#define GALOIS_DUMMYCONDITIONCHECKER_H\n\n#include \"ConditionChecker.h\"\n\n//! This condition checker always sets a hyperedge node to be refined and\n//! returns true\nclass DummyConditionChecker : ConditionChecker {\npublic:\n  //! Sets refinement and returns true for hyperedge nodes\n  bool execute(GNode& node) override {\n    NodeData& nodeData = node->getData();\n    if (!nodeData.isHyperEdge()) {\n      return false;\n    }\n    nodeData.setToRefine(true);\n    return true;\n  }\n};\n\n#endif // GALOIS_DUMMYCONDITIONCHECKER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/conditions/TerrainConditionChecker.h",
    "content": "#ifndef GALOIS_TERRAINCONDITIONCHECKER_H\n#define GALOIS_TERRAINCONDITIONCHECKER_H\n\n#include <cmath>\n#include \"../utils/ConnectivityManager.h\"\n#include \"../utils/GaloisUtils.h\"\n#include \"../model/Map.h\"\n#include \"../model/ProductionState.h\"\n#include \"../libmgrs/utm.h\"\n#include \"ConditionChecker.h\"\n\n//! Uses terrain to determine if a triangle is to be refined.\nclass TerrainConditionChecker : public ConditionChecker {\npublic:\n  explicit TerrainConditionChecker(double tolerance,\n                                   ConnectivityManager& connManager, Map& map)\n      : tolerance(tolerance), connManager(connManager), map(map) {}\n\n  //! Only refine if meets inside_condition + is hyperedge node\n  bool execute(GNode& node) override {\n    NodeData& nodeData = node->getData();\n    if (!nodeData.isHyperEdge()) {\n      return false;\n    }\n\n    // gets coordinates of vertices connected by this hyperedge\n    vector<Coordinates> verticesCoords = connManager.getVerticesCoords(node);\n\n    if (!inside_condition(verticesCoords)) {\n      return false;\n    }\n\n    nodeData.setToRefine(true);\n    return true;\n  }\n\nprivate:\n  double tolerance;\n  ConnectivityManager& connManager;\n  Map& map;\n\n  bool inside_condition(const vector<Coordinates>& verticesCoords) {\n\n    // lowest x among 3\n    double lowest_x = verticesCoords[0].getX() < verticesCoords[1].getX()\n                          ? verticesCoords[0].getX()\n                          : verticesCoords[1].getX();\n    lowest_x = verticesCoords[2].getX() < lowest_x ? verticesCoords[2].getX()\n                                                   : lowest_x;\n\n    // highest x among 3\n    double highest_x = verticesCoords[0].getX() > verticesCoords[1].getX()\n                           ? verticesCoords[0].getX()\n                           : verticesCoords[1].getX();\n    highest_x = verticesCoords[2].getX() > highest_x ? 
verticesCoords[2].getX()\n                                                     : highest_x;\n\n    // lowest y among 3\n    double lowest_y = verticesCoords[0].getY() < verticesCoords[1].getY()\n                          ? verticesCoords[0].getY()\n                          : verticesCoords[1].getY();\n    lowest_y = verticesCoords[2].getY() < lowest_y ? verticesCoords[2].getY()\n                                                   : lowest_y;\n\n    // highest y among 3\n    double highest_y = verticesCoords[0].getY() > verticesCoords[1].getY()\n                           ? verticesCoords[0].getY()\n                           : verticesCoords[1].getY();\n    highest_y = verticesCoords[2].getY() > highest_y ? verticesCoords[2].getY()\n                                                     : highest_y;\n\n    double step = map.isUtm() ? 90 : map.getCellWidth();\n    for (double i = lowest_x; i <= highest_x; i += step) {\n      for (double j = lowest_y; j <= highest_y; j += step) {\n        Coordinates tmp{i, j, 0.};\n        double barycentric_point[3];\n        compute_barycentric_coords(barycentric_point, tmp, verticesCoords);\n        if (is_inside_triangle(barycentric_point)) {\n          double height = 0;\n          for (int k = 0; k < 3; ++k) {\n            height += barycentric_point[k] * verticesCoords[k].getZ();\n          }\n          if (fabs(height - map.get_height(i, j)) > tolerance) {\n            return true;\n          }\n        }\n      }\n    }\n    return false;\n  }\n\n  void compute_barycentric_coords(double* barycentric_coords,\n                                  Coordinates& point,\n                                  const vector<Coordinates>& verticesCoords) {\n    double triangle_area =\n        get_area(verticesCoords[0], verticesCoords[1], verticesCoords[2]);\n    barycentric_coords[2] =\n        get_area(point, verticesCoords[0], verticesCoords[1]) / triangle_area;\n    barycentric_coords[1] =\n        get_area(point, verticesCoords[2], 
verticesCoords[0]) / triangle_area;\n    barycentric_coords[0] =\n        get_area(point, verticesCoords[1], verticesCoords[2]) / triangle_area;\n  }\n\n  bool is_inside_triangle(double barycentric_coords[]) {\n    return !greater(barycentric_coords[0] + barycentric_coords[1] +\n                        barycentric_coords[2],\n                    1.);\n  }\n\n  double get_area(const Coordinates& a, const Coordinates& b,\n                  const Coordinates& c) {\n    return 0.5 * fabs((b.getX() - a.getX()) * (c.getY() - a.getY()) -\n                      (b.getY() - a.getY()) * (c.getX() - a.getX()));\n  }\n};\n\n#endif // GALOIS_TERRAINCONDITIONCHECKER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/LICENSE",
    "content": "The MIT License (MIT)\nCopyright (c) 2016 Howard Butler <howard@hobu.co>\n\nPermission is hereby granted, free of charge, to any person obtaining\na copy of this software and associated documentation files (the \"Software\"),\nto deal in the Software without restriction, including without limitation\nthe rights to use, copy, modify, merge, publish, distribute, sublicense,\nand/or sell copies of the Software, and to permit persons to whom the\nSoftware is furnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included\nin all copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\nOR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\nTHE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\nFROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS\nIN THE SOFTWARE.\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/mgrs.c",
    "content": "/***************************************************************************/\n/* RSC IDENTIFIER:  MGRS\n *\n * ABSTRACT\n *\n *    This component converts between geodetic coordinates (latitude and\n *    longitude) and Military Grid Reference System (MGRS) coordinates.\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an invalid value\n *    is found, the error code is combined with the current error code using\n *    the bitwise or.  This combining allows multiple error codes to be\n *    returned. The possible error codes are:\n *\n *          MGRS_NO_ERROR          : No errors occurred in function\n *          MGRS_LAT_ERROR         : Latitude outside of valid range\n *                                    (-90 to 90 degrees)\n *          MGRS_LON_ERROR         : Longitude outside of valid range\n *                                    (-180 to 360 degrees)\n *          MGRS_STR_ERROR         : An MGRS string error: string too long,\n *                                    too short, or badly formed\n *          MGRS_PRECISION_ERROR   : The precision must be between 0 and 5\n *                                    inclusive.\n *          MGRS_A_ERROR           : Semi-major axis less than or equal to zero\n *          MGRS_INV_F_ERROR       : Inverse flattening outside of valid range\n *\t\t\t\t\t\t\t\t\t                  (250 to 350)\n *          MGRS_EASTING_ERROR     : Easting outside of valid range\n *                                    (100,000 to 900,000 meters for UTM)\n *                                    (0 to 4,000,000 meters for UPS)\n *          MGRS_NORTHING_ERROR    : Northing outside of valid range\n *                                    (0 to 10,000,000 meters for UTM)\n *                                    (0 to 4,000,000 meters for UPS)\n *          MGRS_ZONE_ERROR        : Zone outside of valid range (1 to 60)\n *          MGRS_HEMISPHERE_ERROR  : Invalid hemisphere ('N' or 'S')\n *\n * REUSE 
NOTES\n *\n *    MGRS is intended for reuse by any application that does conversions\n *    between geodetic coordinates and MGRS coordinates.\n *\n * REFERENCES\n *\n *    Further information on MGRS can be found in the Reuse Manual.\n *\n *    MGRS originated from : U.S. Army Topographic Engineering Center\n *                           Geospatial Information Division\n *                           7701 Telegraph Road\n *                           Alexandria, VA  22310-3864\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n * RESTRICTIONS\n *\n *\n * ENVIRONMENT\n *\n *    MGRS was tested and certified in the following environments:\n *\n *    1. Solaris 2.5 with GCC version 2.8.1\n *    2. Windows 95 with MS Visual C++ version 6\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    16-11-94          Original Code\n *    15-09-99          Reengineered upper layers\n *    02-05-03          Corrected latitude band bug in GRID_UTM\n *    08-20-03          Reengineered lower layers\n */\n\n\n/***************************************************************************/\n/*\n *                               INCLUDES\n */\n#include <ctype.h>\n#include <math.h>\n#include <stdio.h>\n#include <string.h>\n#include \"ups.h\"\n#include \"utm.h\"\n#include \"mgrs.h\"\n\n/*\n *      ctype.h     - Standard C character handling library\n *      math.h      - Standard C math library\n *      stdio.h     - Standard C input/output library\n *      string.h    - Standard C string handling library\n *      ups.h       - Universal Polar Stereographic (UPS) projection\n *      utm.h       - Universal Transverse Mercator (UTM) projection\n *      mgrs.h      - function prototype error checking\n */\n\n\n/***************************************************************************/\n/*\n *                              GLOBAL DECLARATIONS\n */\n#define DEG_TO_RAD       0.017453292519943295 /* PI/180                      
*/\n#define RAD_TO_DEG       57.29577951308232087 /* 180/PI                      */\n#define LETTER_A               0   /* ARRAY INDEX FOR LETTER A               */\n#define LETTER_B               1   /* ARRAY INDEX FOR LETTER B               */\n#define LETTER_C               2   /* ARRAY INDEX FOR LETTER C               */\n#define LETTER_D               3   /* ARRAY INDEX FOR LETTER D               */\n#define LETTER_E               4   /* ARRAY INDEX FOR LETTER E               */\n#define LETTER_F               5   /* ARRAY INDEX FOR LETTER F               */\n#define LETTER_G               6   /* ARRAY INDEX FOR LETTER G               */\n#define LETTER_H               7   /* ARRAY INDEX FOR LETTER H               */\n#define LETTER_I               8   /* ARRAY INDEX FOR LETTER I               */\n#define LETTER_J               9   /* ARRAY INDEX FOR LETTER J               */\n#define LETTER_K              10   /* ARRAY INDEX FOR LETTER K               */\n#define LETTER_L              11   /* ARRAY INDEX FOR LETTER L               */\n#define LETTER_M              12   /* ARRAY INDEX FOR LETTER M               */\n#define LETTER_N              13   /* ARRAY INDEX FOR LETTER N               */\n#define LETTER_O              14   /* ARRAY INDEX FOR LETTER O               */\n#define LETTER_P              15   /* ARRAY INDEX FOR LETTER P               */\n#define LETTER_Q              16   /* ARRAY INDEX FOR LETTER Q               */\n#define LETTER_R              17   /* ARRAY INDEX FOR LETTER R               */\n#define LETTER_S              18   /* ARRAY INDEX FOR LETTER S               */\n#define LETTER_T              19   /* ARRAY INDEX FOR LETTER T               */\n#define LETTER_U              20   /* ARRAY INDEX FOR LETTER U               */\n#define LETTER_V              21   /* ARRAY INDEX FOR LETTER V               */\n#define LETTER_W              22   /* ARRAY INDEX FOR LETTER W               */\n#define LETTER_X              23   /* ARRAY INDEX 
FOR LETTER X               */\n#define LETTER_Y              24   /* ARRAY INDEX FOR LETTER Y               */\n#define LETTER_Z              25   /* ARRAY INDEX FOR LETTER Z               */\n#define MGRS_LETTERS            3  /* NUMBER OF LETTERS IN MGRS              */\n#define ONEHT          100000.e0    /* ONE HUNDRED THOUSAND                  */\n#define TWOMIL        2000000.e0    /* TWO MILLION                           */\n#define TRUE                      1  /* CONSTANT VALUE FOR TRUE VALUE  */\n#define FALSE                     0  /* CONSTANT VALUE FOR FALSE VALUE */\n#define PI    3.14159265358979323e0  /* PI                             */\n#define PI_OVER_2  (PI / 2.0e0)\n\n#define MIN_EASTING  100000\n#define MAX_EASTING  900000\n#define MIN_NORTHING 0\n#define MAX_NORTHING 10000000\n#define MAX_PRECISION           5   /* Maximum precision of easting & northing */\n#define MIN_UTM_LAT      ( (-80 * PI) / 180.0 ) /* -80 degrees in radians    */\n#define MAX_UTM_LAT      ( (84 * PI) / 180.0 )  /* 84 degrees in radians     */\n\n#define MIN_EAST_NORTH 0\n#define MAX_EAST_NORTH 4000000\n\n\n/* Ellipsoid parameters, default to WGS 84 */\ndouble MGRS_a = 6378137.0;    /* Semi-major axis of ellipsoid in meters */\ndouble MGRS_f = 1 / 298.257223563; /* Flattening of ellipsoid           */\nchar   MGRS_Ellipsoid_Code[3] = {'W','E',0};\n\n\n/*\n *    CLARKE_1866 : Ellipsoid code for CLARKE_1866\n *    CLARKE_1880 : Ellipsoid code for CLARKE_1880\n *    BESSEL_1841 : Ellipsoid code for BESSEL_1841\n *    BESSEL_1841_NAMIBIA : Ellipsoid code for BESSEL 1841 (NAMIBIA)\n */\nconst char* CLARKE_1866 = \"CC\";\nconst char* CLARKE_1880 = \"CD\";\nconst char* BESSEL_1841 = \"BR\";\nconst char* BESSEL_1841_NAMIBIA = \"BN\";\n\n\ntypedef struct Latitude_Band_Value\n{\n  long letter;            /* letter representing latitude band  */\n  double min_northing;    /* minimum northing for latitude band */\n  double north;           /* upper latitude for latitude band   */\n  
double south;           /* lower latitude for latitude band   */\n  double northing_offset; /* latitude band northing offset      */\n} Latitude_Band;\n\nstatic const Latitude_Band Latitude_Band_Table[20] =\n  {{LETTER_C, 1100000.0, -72.0, -80.5, 0.0},\n  {LETTER_D, 2000000.0, -64.0, -72.0, 2000000.0},\n  {LETTER_E, 2800000.0, -56.0, -64.0, 2000000.0},\n  {LETTER_F, 3700000.0, -48.0, -56.0, 2000000.0},\n  {LETTER_G, 4600000.0, -40.0, -48.0, 4000000.0},\n  {LETTER_H, 5500000.0, -32.0, -40.0, 4000000.0},\n  {LETTER_J, 6400000.0, -24.0, -32.0, 6000000.0},\n  {LETTER_K, 7300000.0, -16.0, -24.0, 6000000.0},\n  {LETTER_L, 8200000.0, -8.0, -16.0, 8000000.0},\n  {LETTER_M, 9100000.0, 0.0, -8.0, 8000000.0},\n  {LETTER_N, 0.0, 8.0, 0.0, 0.0},\n  {LETTER_P, 800000.0, 16.0, 8.0, 0.0},\n  {LETTER_Q, 1700000.0, 24.0, 16.0, 0.0},\n  {LETTER_R, 2600000.0, 32.0, 24.0, 2000000.0},\n  {LETTER_S, 3500000.0, 40.0, 32.0, 2000000.0},\n  {LETTER_T, 4400000.0, 48.0, 40.0, 4000000.0},\n  {LETTER_U, 5300000.0, 56.0, 48.0, 4000000.0},\n  {LETTER_V, 6200000.0, 64.0, 56.0, 6000000.0},\n  {LETTER_W, 7000000.0, 72.0, 64.0, 6000000.0},\n  {LETTER_X, 7900000.0, 84.5, 72.0, 6000000.0}};\n\n\ntypedef struct UPS_Constant_Value\n{\n  long letter;            /* letter representing latitude band      */\n  long ltr2_low_value;    /* 2nd letter range - low number         */\n  long ltr2_high_value;   /* 2nd letter range - high number          */\n  long ltr3_high_value;   /* 3rd letter range - high number (UPS)   */\n  double false_easting;   /* False easting based on 2nd letter      */\n  double false_northing;  /* False northing based on 3rd letter     */\n} UPS_Constant;\n\nstatic const UPS_Constant UPS_Constant_Table[4] =\n  {{LETTER_A, LETTER_J, LETTER_Z, LETTER_Z, 800000.0, 800000.0},\n  {LETTER_B, LETTER_A, LETTER_R, LETTER_Z, 2000000.0, 800000.0},\n  {LETTER_Y, LETTER_J, LETTER_Z, LETTER_P, 800000.0, 1300000.0},\n  {LETTER_Z, LETTER_A, LETTER_J, LETTER_P, 2000000.0, 
1300000.0}};\n\n/***************************************************************************/\n/*\n *                              FUNCTIONS\n */\n\nlong Get_Latitude_Band_Min_Northing(long letter, double* min_northing, double* northing_offset)\n/*\n * The function Get_Latitude_Band_Min_Northing receives a latitude band letter\n * and uses the Latitude_Band_Table to determine the minimum northing and northing offset\n * for that latitude band letter.\n *\n *   letter        : Latitude band letter             (input)\n *   min_northing  : Minimum northing for that letter\t(output)\n */\n{ /* Get_Latitude_Band_Min_Northing */\n  long error_code = MGRS_NO_ERROR;\n\n  if ((letter >= LETTER_C) && (letter <= LETTER_H))\n  {\n    *min_northing = Latitude_Band_Table[letter-2].min_northing;\n    *northing_offset = Latitude_Band_Table[letter-2].northing_offset;\n  }\n  else if ((letter >= LETTER_J) && (letter <= LETTER_N))\n  {\n    *min_northing = Latitude_Band_Table[letter-3].min_northing;\n    *northing_offset = Latitude_Band_Table[letter-3].northing_offset;\n  }\n  else if ((letter >= LETTER_P) && (letter <= LETTER_X))\n  {\n    *min_northing = Latitude_Band_Table[letter-4].min_northing;\n    *northing_offset = Latitude_Band_Table[letter-4].northing_offset;\n  }\n  else\n    error_code |= MGRS_STRING_ERROR;\n\n  return error_code;\n} /* Get_Latitude_Band_Min_Northing */\n\n\nlong Get_Latitude_Range(long letter, double* north, double* south)\n/*\n * The function Get_Latitude_Range receives a latitude band letter\n * and uses the Latitude_Band_Table to determine the latitude band\n * boundaries for that latitude band letter.\n *\n *   letter   : Latitude band letter                        (input)\n *   north    : Northern latitude boundary for that letter\t(output)\n *   north    : Southern latitude boundary for that letter\t(output)\n */\n{ /* Get_Latitude_Range */\n  long error_code = MGRS_NO_ERROR;\n\n  if ((letter >= LETTER_C) && (letter <= LETTER_H))\n  {\n    *north 
= Latitude_Band_Table[letter-2].north * DEG_TO_RAD;\n    *south = Latitude_Band_Table[letter-2].south * DEG_TO_RAD;\n  }\n  else if ((letter >= LETTER_J) && (letter <= LETTER_N))\n  {\n    *north = Latitude_Band_Table[letter-3].north * DEG_TO_RAD;\n    *south = Latitude_Band_Table[letter-3].south * DEG_TO_RAD;\n  }\n  else if ((letter >= LETTER_P) && (letter <= LETTER_X))\n  {\n    *north = Latitude_Band_Table[letter-4].north * DEG_TO_RAD;\n    *south = Latitude_Band_Table[letter-4].south * DEG_TO_RAD;\n  }\n  else\n    error_code |= MGRS_STRING_ERROR;\n\n  return error_code;\n} /* Get_Latitude_Range */\n\n\nlong Get_Latitude_Letter(double latitude, int* letter)\n/*\n * The function Get_Latitude_Letter receives a latitude value\n * and uses the Latitude_Band_Table to determine the latitude band\n * letter for that latitude.\n *\n *   latitude   : Latitude              (input)\n *   letter     : Latitude band letter  (output)\n */\n{ /* Get_Latitude_Letter */\n  double temp = 0.0;\n  long error_code = MGRS_NO_ERROR;\n  double lat_deg = latitude * RAD_TO_DEG;\n\n  if (lat_deg >= 72 && lat_deg < 84.5)\n    *letter = LETTER_X;\n  else if (lat_deg > -80.5 && lat_deg < 72)\n  {\n    temp = ((latitude + (80.0 * DEG_TO_RAD)) / (8.0 * DEG_TO_RAD)) + 1.0e-12;\n    *letter = Latitude_Band_Table[(int)temp].letter;\n  }\n  else\n    error_code |= MGRS_LAT_ERROR;\n\n  return error_code;\n} /* Get_Latitude_Letter */\n\n\nlong Check_Zone(char* MGRS, long* zone_exists)\n/*\n * The function Check_Zone receives an MGRS coordinate string.\n * If a zone is given, TRUE is returned. 
Otherwise, FALSE\n * is returned.\n *\n *   MGRS           : MGRS coordinate string        (input)\n *   zone_exists    : TRUE if a zone is given,\n *                    FALSE if a zone is not given  (output)\n */\n{ /* Check_Zone */\n  int i = 0;\n  int j = 0;\n  int num_digits = 0;\n  long error_code = MGRS_NO_ERROR;\n\n  /* skip any leading blanks */\n  while (MGRS[i] == ' ')\n    i++;\n  j = i;\n  while (isdigit(MGRS[i]))\n    i++;\n  num_digits = i - j;\n  if (num_digits <= 2)\n    if (num_digits > 0)\n      *zone_exists = TRUE;\n    else\n      *zone_exists = FALSE;\n  else\n    error_code |= MGRS_STRING_ERROR;\n\n  return error_code;\n} /* Check_Zone */\n\n\nlong Make_MGRS_String (char* MGRS,\n                       long Zone,\n                       int Letters[MGRS_LETTERS],\n                       double Easting,\n                       double Northing,\n                       long Precision)\n/*\n * The function Make_MGRS_String constructs an MGRS string\n * from its component parts.\n *\n *   MGRS           : MGRS coordinate string          (output)\n *   Zone           : UTM Zone                        (input)\n *   Letters        : MGRS coordinate string letters  (input)\n *   Easting        : Easting value                   (input)\n *   Northing       : Northing value                  (input)\n *   Precision      : Precision level of MGRS string  (input)\n */\n{ /* Make_MGRS_String */\n  long i;\n  long j;\n  double divisor;\n  long east;\n  long north;\n  char alphabet[] = \"ABCDEFGHIJKLMNOPQRSTUVWXYZ\";\n  long error_code = MGRS_NO_ERROR;\n\n  i = 0;\n  if (Zone)\n    i = sprintf (MGRS+i,\"%2.2ld\",Zone);\n  else\n    strncpy(MGRS, \"  \", 2);  // 2 spaces\n\n  for (j=0;j<3;j++)\n    MGRS[i++] = alphabet[Letters[j]];\n  divisor = pow (10.0, (5 - Precision));\n  Easting = fmod (Easting, 100000.0);\n  if (Easting >= 99999.5)\n    Easting = 99999.0;\n  east = (long)(Easting/divisor);\n  i += sprintf (MGRS+i, \"%*.*ld\", (int)Precision, 
(int)Precision, east);\n  Northing = fmod (Northing, 100000.0);\n  if (Northing >= 99999.5)\n    Northing = 99999.0;\n  north = (long)(Northing/divisor);\n  i += sprintf (MGRS+i, \"%*.*ld\", (int)Precision, (int)Precision, north);\n  return (error_code);\n} /* Make_MGRS_String */\n\n\nlong Break_MGRS_String (char* MGRS,\n                        long* Zone,\n                        long Letters[MGRS_LETTERS],\n                        double* Easting,\n                        double* Northing,\n                        long* Precision)\n/*\n * The function Break_MGRS_String breaks down an MGRS\n * coordinate string into its component parts.\n *\n *   MGRS           : MGRS coordinate string          (input)\n *   Zone           : UTM Zone                        (output)\n *   Letters        : MGRS coordinate string letters  (output)\n *   Easting        : Easting value                   (output)\n *   Northing       : Northing value                  (output)\n *   Precision      : Precision level of MGRS string  (output)\n */\n{ /* Break_MGRS_String */\n  long num_digits;\n  long num_letters;\n  long i = 0;\n  long j = 0;\n  long error_code = MGRS_NO_ERROR;\n\n  while (MGRS[i] == ' ')\n    i++;  /* skip any leading blanks */\n  j = i;\n  while (isdigit(MGRS[i]))\n    i++;\n  num_digits = i - j;\n  if (num_digits <= 2)\n    if (num_digits > 0)\n    {\n      char zone_string[3];\n      /* get zone */\n      strncpy (zone_string, MGRS+j, 2);\n      zone_string[2] = 0;\n      sscanf (zone_string, \"%ld\", Zone);\n      if ((*Zone < 1) || (*Zone > 60))\n        error_code |= MGRS_STRING_ERROR;\n    }\n    else\n      *Zone = 0;\n  else\n    error_code |= MGRS_STRING_ERROR;\n  j = i;\n\n  while (isalpha(MGRS[i]))\n    i++;\n  num_letters = i - j;\n  if (num_letters == 3)\n  {\n    /* get letters */\n    Letters[0] = (toupper(MGRS[j]) - (long)'A');\n    if ((Letters[0] == LETTER_I) || (Letters[0] == LETTER_O))\n      error_code |= MGRS_STRING_ERROR;\n    Letters[1] = 
(toupper(MGRS[j+1]) - (long)'A');\n    if ((Letters[1] == LETTER_I) || (Letters[1] == LETTER_O))\n      error_code |= MGRS_STRING_ERROR;\n    Letters[2] = (toupper(MGRS[j+2]) - (long)'A');\n    if ((Letters[2] == LETTER_I) || (Letters[2] == LETTER_O))\n      error_code |= MGRS_STRING_ERROR;\n  }\n  else\n    error_code |= MGRS_STRING_ERROR;\n  j = i;\n  while (isdigit(MGRS[i]))\n    i++;\n  num_digits = i - j;\n  if ((num_digits <= 10) && (num_digits%2 == 0))\n  {\n    long n;\n    char east_string[6];\n    char north_string[6];\n    long east;\n    long north;\n    double multiplier;\n    /* get easting & northing */\n    n = num_digits/2;\n    *Precision = n;\n    if (n > 0)\n    {\n      strncpy (east_string, MGRS+j, n);\n      east_string[n] = 0;\n      sscanf (east_string, \"%ld\", &east);\n      strncpy (north_string, MGRS+j+n, n);\n      north_string[n] = 0;\n      sscanf (north_string, \"%ld\", &north);\n      multiplier = pow (10.0, 5 - n);\n      *Easting = east * multiplier;\n      *Northing = north * multiplier;\n    }\n    else\n    {\n      *Easting = 0.0;\n      *Northing = 0.0;\n    }\n  }\n  else\n    error_code |= MGRS_STRING_ERROR;\n\n  return (error_code);\n} /* Break_MGRS_String */\n\n\nvoid Get_Grid_Values (long zone,\n                      long* ltr2_low_value,\n                      long* ltr2_high_value,\n                      double *pattern_offset)\n/*\n * The function getGridValues sets the letter range used for\n * the 2nd letter in the MGRS coordinate string, based on the set\n * number of the utm zone. 
It also sets the pattern offset using a\n * value of A for the second letter of the grid square, based on\n * the grid pattern and set number of the utm zone.\n *\n *    zone            : Zone number             (input)\n *    ltr2_low_value  : 2nd letter low number   (output)\n *    ltr2_high_value : 2nd letter high number  (output)\n *    pattern_offset  : Pattern offset          (output)\n */\n{ /* BEGIN Get_Grid_Values */\n  long set_number;    /* Set number (1-6) based on UTM zone number */\n  long aa_pattern;    /* Pattern based on ellipsoid code */\n\n  set_number = zone % 6;\n\n  if (!set_number)\n    set_number = 6;\n\n  if (!strcmp(MGRS_Ellipsoid_Code,CLARKE_1866) || !strcmp(MGRS_Ellipsoid_Code, CLARKE_1880) ||\n      !strcmp(MGRS_Ellipsoid_Code,BESSEL_1841) || !strcmp(MGRS_Ellipsoid_Code,BESSEL_1841_NAMIBIA))\n    aa_pattern = FALSE;\n  else\n    aa_pattern = TRUE;\n\n  if ((set_number == 1) || (set_number == 4))\n  {\n    *ltr2_low_value = LETTER_A;\n    *ltr2_high_value = LETTER_H;\n  }\n  else if ((set_number == 2) || (set_number == 5))\n  {\n    *ltr2_low_value = LETTER_J;\n    *ltr2_high_value = LETTER_R;\n  }\n  else if ((set_number == 3) || (set_number == 6))\n  {\n    *ltr2_low_value = LETTER_S;\n    *ltr2_high_value = LETTER_Z;\n  }\n\n  /* False northing at A for second letter of grid square */\n  if (aa_pattern)\n  {\n    if ((set_number % 2) ==  0)\n      *pattern_offset = 500000.0;\n    else\n      *pattern_offset = 0.0;\n  }\n  else\n  {\n    if ((set_number % 2) == 0)\n      *pattern_offset =  1500000.0;\n    else\n      *pattern_offset = 1000000.00;\n  }\n} /* END OF Get_Grid_Values */\n\n\nlong UTM_To_MGRS (long Zone,\n                  char Hemisphere,\n                  double Longitude,\n                  double Latitude,\n                  double Easting,\n                  double Northing,\n                  long Precision,\n                  char *MGRS)\n/*\n * The function UTM_To_MGRS calculates an MGRS coordinate string\n * based 
on the zone, latitude, easting and northing.\n *\n *    Zone      : Zone number             (input)\n *    Hemisphere: Hemisphere              (input)\n *    Longitude : Longitude in radians    (input)\n *    Latitude  : Latitude in radians     (input)\n *    Easting   : Easting                 (input)\n *    Northing  : Northing                (input)\n *    Precision : Precision               (input)\n *    MGRS      : MGRS coordinate string  (output)\n */\n{ /* BEGIN UTM_To_MGRS */\n  double pattern_offset;      /* Northing offset for 3rd letter               */\n  double grid_easting;        /* Easting used to derive 2nd letter of MGRS   */\n  double grid_northing;       /* Northing used to derive 3rd letter of MGRS  */\n  long ltr2_low_value;        /* 2nd letter range - low number               */\n  long ltr2_high_value;       /* 2nd letter range - high number              */\n  int letters[MGRS_LETTERS];  /* Number location of 3 letters in alphabet    */\n  long temp_error_code = MGRS_NO_ERROR;\n  long error_code = MGRS_NO_ERROR;\n\n\n\t/* Special check for rounding to (truncated) eastern edge of zone 31V */\n\tif ((Zone == 31) && (((Latitude >= 56.0 * DEG_TO_RAD) && (Latitude < 64.0 * DEG_TO_RAD)) && ((Longitude >= 3.0 * DEG_TO_RAD) || (Easting >= 500000.0))))\n\t{ /* Reconvert to UTM zone 32 */\n    Set_UTM_Parameters (MGRS_a, MGRS_f, 32);\n    temp_error_code = Convert_Geodetic_To_UTM (Latitude, Longitude, &Zone, &Hemisphere, &Easting, &Northing);\n    if(temp_error_code)\n    {\n      if(temp_error_code & UTM_LAT_ERROR)\n        error_code |= MGRS_LAT_ERROR;\n      if(temp_error_code & UTM_LON_ERROR)\n        error_code |= MGRS_LON_ERROR;\n      if(temp_error_code & UTM_ZONE_OVERRIDE_ERROR)\n        error_code |= MGRS_ZONE_ERROR;\n      if(temp_error_code & UTM_EASTING_ERROR)\n        error_code |= MGRS_EASTING_ERROR;\n      if(temp_error_code & UTM_NORTHING_ERROR)\n        error_code |= MGRS_NORTHING_ERROR;\n\n      return error_code;\n    }\n  }\n\n  
if( Latitude <= 0.0 && Northing == 1.0e7)\n  {\n    Latitude = 0.0;\n    Northing = 0.0;\n  }\n\n  Get_Grid_Values(Zone, &ltr2_low_value, &ltr2_high_value, &pattern_offset);\n\n  error_code = Get_Latitude_Letter(Latitude, &letters[0]);\n\n  if (!error_code)\n  {\n    grid_northing = Northing;\n\n    while (grid_northing >= TWOMIL)\n    {\n      grid_northing = grid_northing - TWOMIL;\n    }\n    grid_northing = grid_northing + pattern_offset;\n    if(grid_northing >= TWOMIL)\n      grid_northing = grid_northing - TWOMIL;\n\n    letters[2] = (long)(grid_northing / ONEHT);\n    if (letters[2] > LETTER_H)\n      letters[2] = letters[2] + 1;\n\n    if (letters[2] > LETTER_N)\n      letters[2] = letters[2] + 1;\n\n    grid_easting = Easting;\n    if (((letters[0] == LETTER_V) && (Zone == 31)) && (grid_easting == 500000.0))\n      grid_easting = grid_easting - 1.0; /* SUBTRACT 1 METER */\n\n    letters[1] = ltr2_low_value + ((long)(grid_easting / ONEHT) -1);\n    if ((ltr2_low_value == LETTER_J) && (letters[1] > LETTER_N))\n      letters[1] = letters[1] + 1;\n\n    Make_MGRS_String (MGRS, Zone, letters, grid_easting, Northing, Precision);\n  }\n  return error_code;\n} /* END UTM_To_MGRS */\n\n\nlong Set_MGRS_Parameters (double a,\n                          double f,\n                          char   *Ellipsoid_Code)\n/*\n * The function SET_MGRS_PARAMETERS receives the ellipsoid parameters and sets\n * the corresponding state variables. 
If any errors occur, the error code(s)\n * are returned by the function, otherwise MGRS_NO_ERROR is returned.\n *\n *   a                : Semi-major axis of ellipsoid in meters  (input)\n *   f                : Flattening of ellipsoid\t\t\t\t\t        (input)\n *   Ellipsoid_Code   : 2-letter code for ellipsoid             (input)\n */\n{ /* Set_MGRS_Parameters  */\n\n  double inv_f = 1 / f;\n  long Error_Code = MGRS_NO_ERROR;\n\n  if (a <= 0.0)\n  { /* Semi-major axis must be greater than zero */\n    Error_Code |= MGRS_A_ERROR;\n  }\n  if ((inv_f < 250) || (inv_f > 350))\n  { /* Inverse flattening must be between 250 and 350 */\n    Error_Code |= MGRS_INV_F_ERROR;\n  }\n  if (!Error_Code)\n  { /* no errors */\n    MGRS_a = a;\n    MGRS_f = f;\n    strcpy (MGRS_Ellipsoid_Code, Ellipsoid_Code);\n  }\n  return (Error_Code);\n}  /* Set_MGRS_Parameters  */\n\n\nvoid Get_MGRS_Parameters (double *a,\n                          double *f,\n                          char* Ellipsoid_Code)\n/*\n * The function Get_MGRS_Parameters returns the current ellipsoid\n * parameters.\n *\n *  a                : Semi-major axis of ellipsoid, in meters (output)\n *  f                : Flattening of ellipsoid\t\t\t\t\t       (output)\n *  Ellipsoid_Code   : 2-letter code for ellipsoid             (output)\n */\n{ /* Get_MGRS_Parameters */\n  *a = MGRS_a;\n  *f = MGRS_f;\n  strcpy (Ellipsoid_Code, MGRS_Ellipsoid_Code);\n  return;\n} /* Get_MGRS_Parameters */\n\n\nlong Convert_Geodetic_To_MGRS (double Latitude,\n                               double Longitude,\n                               long Precision,\n                               char* MGRS)\n/*\n * The function Convert_Geodetic_To_MGRS converts Geodetic (latitude and\n * longitude) coordinates to an MGRS coordinate string, according to the\n * current ellipsoid parameters.  
If any errors occur, the error code(s)\n * are returned by the function, otherwise MGRS_NO_ERROR is returned.\n *\n *    Latitude   : Latitude in radians              (input)\n *    Longitude  : Longitude in radians             (input)\n *    Precision  : Precision level of MGRS string   (input)\n *    MGRS       : MGRS coordinate string           (output)\n *\n */\n{ /* Convert_Geodetic_To_MGRS */\n  long zone;\n  char hemisphere;\n  double easting;\n  double northing;\n  long temp_error_code = MGRS_NO_ERROR;\n  long error_code = MGRS_NO_ERROR;\n\n  if ((Latitude < -PI_OVER_2) || (Latitude > PI_OVER_2))\n  { /* Latitude out of range */\n    error_code |= MGRS_LAT_ERROR;\n  }\n  if ((Longitude < -PI) || (Longitude > (2*PI)))\n  { /* Longitude out of range */\n    error_code |= MGRS_LON_ERROR;\n  }\n  if ((Precision < 0) || (Precision > MAX_PRECISION))\n    error_code |= MGRS_PRECISION_ERROR;\n  if (!error_code)\n  {\n    if ((Latitude < MIN_UTM_LAT) || (Latitude > MAX_UTM_LAT))\n    {\n      temp_error_code = Set_UPS_Parameters (MGRS_a, MGRS_f);\n      if(!temp_error_code)\n      {\n        temp_error_code = Convert_Geodetic_To_UPS (Latitude, Longitude, &hemisphere, &easting, &northing);\n        if(!temp_error_code)\n        {\n          error_code |= Convert_UPS_To_MGRS (hemisphere, easting, northing, Precision, MGRS);\n        }\n        else\n        {\n          if(temp_error_code & UPS_LAT_ERROR)\n            error_code |= MGRS_LAT_ERROR;\n          if(temp_error_code & UPS_LON_ERROR)\n            error_code |= MGRS_LON_ERROR;\n        }\n      }\n      else\n      {\n        if(temp_error_code & UPS_A_ERROR)\n          error_code |= MGRS_A_ERROR;\n        if(temp_error_code & UPS_INV_F_ERROR)\n          error_code |= MGRS_INV_F_ERROR;\n      }\n    }\n    else\n    {\n      temp_error_code = Set_UTM_Parameters (MGRS_a, MGRS_f, 0);\n      if(!temp_error_code)\n      {\n        temp_error_code = Convert_Geodetic_To_UTM (Latitude, Longitude, &zone, &hemisphere, 
&easting, &northing);\n        if(!temp_error_code)\n          error_code |= UTM_To_MGRS (zone, hemisphere, Longitude, Latitude, easting, northing, Precision, MGRS);\n        else\n        {\n          if(temp_error_code & UTM_LAT_ERROR)\n            error_code |= MGRS_LAT_ERROR;\n          if(temp_error_code & UTM_LON_ERROR)\n            error_code |= MGRS_LON_ERROR;\n          if(temp_error_code & UTM_ZONE_OVERRIDE_ERROR)\n            error_code |= MGRS_ZONE_ERROR;\n          if(temp_error_code & UTM_EASTING_ERROR)\n            error_code |= MGRS_EASTING_ERROR;\n          if(temp_error_code & UTM_NORTHING_ERROR)\n            error_code |= MGRS_NORTHING_ERROR;\n        }\n      }\n      else\n      {\n        if(temp_error_code & UTM_A_ERROR)\n          error_code |= MGRS_A_ERROR;\n        if(temp_error_code & UTM_INV_F_ERROR)\n          error_code |= MGRS_INV_F_ERROR;\n        if(temp_error_code & UTM_ZONE_OVERRIDE_ERROR)\n          error_code |= MGRS_ZONE_ERROR;\n      }\n    }\n  }\n  return (error_code);\n} /* Convert_Geodetic_To_MGRS */\n\n\nlong Convert_MGRS_To_Geodetic (char* MGRS,\n                               double *Latitude,\n                               double *Longitude)\n/*\n * The function Convert_MGRS_To_Geodetic converts an MGRS coordinate string\n * to Geodetic (latitude and longitude) coordinates\n * according to the current ellipsoid parameters.  
If any errors occur,\n * the error code(s) are returned by the function, otherwise UTM_NO_ERROR\n * is returned.\n *\n *    MGRS       : MGRS coordinate string           (input)\n *    Latitude   : Latitude in radians              (output)\n *    Longitude  : Longitude in radians             (output)\n *\n */\n{ /* Convert_MGRS_To_Geodetic */\n  long zone;\n  char hemisphere;\n  double easting;\n  double northing;\n  long zone_exists;\n  long temp_error_code = MGRS_NO_ERROR;\n  long error_code = MGRS_NO_ERROR;\n\n  error_code = Check_Zone(MGRS, &zone_exists);\n  if (!error_code)\n  {\n    if (zone_exists)\n    {\n      error_code |= Convert_MGRS_To_UTM (MGRS, &zone, &hemisphere, &easting, &northing);\n      if(!error_code || (error_code & MGRS_LAT_WARNING))\n      {\n        temp_error_code = Set_UTM_Parameters (MGRS_a, MGRS_f, 0);\n        if(!temp_error_code)\n        {\n          temp_error_code = Convert_UTM_To_Geodetic (zone, hemisphere, easting, northing, Latitude, Longitude);\n          if(temp_error_code)\n          {\n            if((temp_error_code & UTM_ZONE_ERROR) || (temp_error_code & UTM_HEMISPHERE_ERROR))\n              error_code |= MGRS_STRING_ERROR;\n            if(temp_error_code & UTM_EASTING_ERROR)\n              error_code |= MGRS_EASTING_ERROR;\n            if(temp_error_code & UTM_NORTHING_ERROR)\n              error_code |= MGRS_NORTHING_ERROR;\n          }\n        }\n        else\n        {\n          if(temp_error_code & UTM_A_ERROR)\n            error_code |= MGRS_A_ERROR;\n          if(temp_error_code & UTM_INV_F_ERROR)\n            error_code |= MGRS_INV_F_ERROR;\n          if(temp_error_code & UTM_ZONE_OVERRIDE_ERROR)\n            error_code |= MGRS_ZONE_ERROR;\n        }\n      }\n    }\n    else\n    {\n      error_code |= Convert_MGRS_To_UPS (MGRS, &hemisphere, &easting, &northing);\n      if(!error_code)\n      {\n        temp_error_code = Set_UPS_Parameters (MGRS_a, MGRS_f);\n        if(!temp_error_code)\n        {\n          
temp_error_code = Convert_UPS_To_Geodetic (hemisphere, easting, northing, Latitude, Longitude);\n          if(temp_error_code)\n          {\n            if(temp_error_code & UPS_HEMISPHERE_ERROR)\n              error_code |= MGRS_STRING_ERROR;\n            if(temp_error_code & UPS_EASTING_ERROR)\n              error_code |= MGRS_EASTING_ERROR;\n            if(temp_error_code & UPS_LAT_ERROR)\n              error_code |= MGRS_NORTHING_ERROR;\n          }\n        }\n        else\n        {\n          if(temp_error_code & UPS_A_ERROR)\n            error_code |= MGRS_A_ERROR;\n          if(temp_error_code & UPS_INV_F_ERROR)\n            error_code |= MGRS_INV_F_ERROR;\n        }\n      }\n    }\n  }\n  return (error_code);\n} /* END OF Convert_MGRS_To_Geodetic */\n\n\nlong Convert_UTM_To_MGRS (long Zone,\n                          char Hemisphere,\n                          double Easting,\n                          double Northing,\n                          long Precision,\n                          char* MGRS)\n/*\n * The function Convert_UTM_To_MGRS converts UTM (zone, easting, and\n * northing) coordinates to an MGRS coordinate string, according to the\n * current ellipsoid parameters.  
If any errors occur, the error code(s)\n * are returned by the function, otherwise MGRS_NO_ERROR is returned.\n *\n *    Zone       : UTM zone                         (input)\n *    Hemisphere : North or South hemisphere        (input)\n *    Easting    : Easting (X) in meters            (input)\n *    Northing   : Northing (Y) in meters           (input)\n *    Precision  : Precision level of MGRS string   (input)\n *    MGRS       : MGRS coordinate string           (output)\n */\n{ /* Convert_UTM_To_MGRS */\n  double latitude;           /* Latitude of UTM point */\n  double longitude;          /* Longitude of UTM point */\n  long utm_error_code = MGRS_NO_ERROR;\n  long error_code = MGRS_NO_ERROR;\n\n  if ((Zone < 1) || (Zone > 60))\n    error_code |= MGRS_ZONE_ERROR;\n  if ((Hemisphere != 'S') && (Hemisphere != 'N'))\n    error_code |= MGRS_HEMISPHERE_ERROR;\n  if ((Easting < MIN_EASTING) || (Easting > MAX_EASTING))\n    error_code |= MGRS_EASTING_ERROR;\n  if ((Northing < MIN_NORTHING) || (Northing > MAX_NORTHING))\n    error_code |= MGRS_NORTHING_ERROR;\n  if ((Precision < 0) || (Precision > MAX_PRECISION))\n    error_code |= MGRS_PRECISION_ERROR;\n  if (!error_code)\n  {\n    Set_UTM_Parameters (MGRS_a, MGRS_f, 0);\n    utm_error_code = Convert_UTM_To_Geodetic (Zone, Hemisphere, Easting, Northing, &latitude, &longitude);\n    if(utm_error_code)\n    {\n      if((utm_error_code & UTM_ZONE_ERROR) || (utm_error_code & UTM_HEMISPHERE_ERROR))\n        error_code |= MGRS_STRING_ERROR;\n      if(utm_error_code & UTM_EASTING_ERROR)\n        error_code |= MGRS_EASTING_ERROR;\n      if(utm_error_code & UTM_NORTHING_ERROR)\n        error_code |= MGRS_NORTHING_ERROR;\n    }\n\n\t  error_code = UTM_To_MGRS (Zone, Hemisphere, longitude, latitude, Easting, Northing, Precision, MGRS);\n  }\n  return (error_code);\n} /* Convert_UTM_To_MGRS */\n\n\nlong Convert_MGRS_To_UTM (char   *MGRS,\n                          long   *Zone,\n                          char   *Hemisphere,\n   
                       double *Easting,\n                          double *Northing)\n/*\n * The function Convert_MGRS_To_UTM converts an MGRS coordinate string\n * to UTM projection (zone, hemisphere, easting and northing) coordinates\n * according to the current ellipsoid parameters.  If any errors occur,\n * the error code(s) are returned by the function, otherwise UTM_NO_ERROR\n * is returned.\n *\n *    MGRS       : MGRS coordinate string           (input)\n *    Zone       : UTM zone                         (output)\n *    Hemisphere : North or South hemisphere        (output)\n *    Easting    : Easting (X) in meters            (output)\n *    Northing   : Northing (Y) in meters           (output)\n */\n{ /* Convert_MGRS_To_UTM */\n  double min_northing;\n  double northing_offset;\n  long ltr2_low_value;\n  long ltr2_high_value;\n  double pattern_offset;\n  double upper_lat_limit;     /* North latitude limits based on 1st letter  */\n  double lower_lat_limit;     /* South latitude limits based on 1st letter  */\n  double grid_easting;        /* Easting for 100,000 meter grid square      */\n  double grid_northing;       /* Northing for 100,000 meter grid square     */\n  long letters[MGRS_LETTERS];\n  long in_precision;\n  double latitude = 0.0;\n  double longitude = 0.0;\n  double divisor = 1.0;\n  long utm_error_code = MGRS_NO_ERROR;\n  long error_code = MGRS_NO_ERROR;\n\n  error_code = Break_MGRS_String (MGRS, Zone, letters, Easting, Northing, &in_precision);\n  if (!*Zone)\n    error_code |= MGRS_STRING_ERROR;\n  else\n  {\n    if (!error_code)\n    {\n      if ((letters[0] == LETTER_X) && ((*Zone == 32) || (*Zone == 34) || (*Zone == 36)))\n        error_code |= MGRS_STRING_ERROR;\n      else\n      {\n        if (letters[0] < LETTER_N)\n          *Hemisphere = 'S';\n        else\n          *Hemisphere = 'N';\n\n        Get_Grid_Values(*Zone, &ltr2_low_value, &ltr2_high_value, &pattern_offset);\n\n        /* Check that the second letter of the MGRS 
string is within\n         * the range of valid second letter values\n         * Also check that the third letter is valid */\n        if ((letters[1] < ltr2_low_value) || (letters[1] > ltr2_high_value) || (letters[2] > LETTER_V))\n          error_code |= MGRS_STRING_ERROR;\n\n        if (!error_code)\n        {\n          double row_letter_northing = (double)(letters[2]) * ONEHT;\n          grid_easting = (double)((letters[1]) - ltr2_low_value + 1) * ONEHT;\n          if ((ltr2_low_value == LETTER_J) && (letters[1] > LETTER_O))\n            grid_easting = grid_easting - ONEHT;\n\n          if (letters[2] > LETTER_O)\n            row_letter_northing = row_letter_northing - ONEHT;\n\n          if (letters[2] > LETTER_I)\n            row_letter_northing = row_letter_northing - ONEHT;\n\n          if (row_letter_northing >= TWOMIL)\n            row_letter_northing = row_letter_northing - TWOMIL;\n\n          error_code = Get_Latitude_Band_Min_Northing(letters[0], &min_northing, &northing_offset);\n          if (!error_code)\n          {\n            grid_northing = row_letter_northing - pattern_offset;\n            if(grid_northing < 0)\n              grid_northing += TWOMIL;\n\n            grid_northing += northing_offset;\n\n            if(grid_northing < min_northing)\n              grid_northing += TWOMIL;\n\n            *Easting = grid_easting + *Easting;\n            *Northing = grid_northing + *Northing;\n\n            /* check that point is within Zone Letter bounds */\n            utm_error_code = Set_UTM_Parameters(MGRS_a,MGRS_f,0);\n            if (!utm_error_code)\n            {\n              utm_error_code = Convert_UTM_To_Geodetic(*Zone,*Hemisphere,*Easting,*Northing,&latitude,&longitude);\n              if (!utm_error_code)\n              {\n                divisor = pow (10.0, in_precision);\n                error_code = Get_Latitude_Range(letters[0], &upper_lat_limit, &lower_lat_limit);\n                if (!error_code)\n                {\n           
       if (!(((lower_lat_limit - DEG_TO_RAD/divisor) <= latitude) && (latitude <= (upper_lat_limit + DEG_TO_RAD/divisor))))\n                    error_code |= MGRS_LAT_WARNING;\n                }\n              }\n              else\n              {\n                if((utm_error_code & UTM_ZONE_ERROR) || (utm_error_code & UTM_HEMISPHERE_ERROR))\n                  error_code |= MGRS_STRING_ERROR;\n                if(utm_error_code & UTM_EASTING_ERROR)\n                  error_code |= MGRS_EASTING_ERROR;\n                if(utm_error_code & UTM_NORTHING_ERROR)\n                  error_code |= MGRS_NORTHING_ERROR;\n              }\n            }\n            else\n            {\n              if(utm_error_code & UTM_A_ERROR)\n                error_code |= MGRS_A_ERROR;\n              if(utm_error_code & UTM_INV_F_ERROR)\n                error_code |= MGRS_INV_F_ERROR;\n              if(utm_error_code & UTM_ZONE_OVERRIDE_ERROR)\n                error_code |= MGRS_ZONE_ERROR;\n            }\n          }\n        }\n      }\n    }\n  }\n  return (error_code);\n} /* Convert_MGRS_To_UTM */\n\n\nlong Convert_UPS_To_MGRS (char   Hemisphere,\n                          double Easting,\n                          double Northing,\n                          long   Precision,\n                          char*  MGRS)\n/*\n *  The function Convert_UPS_To_MGRS converts UPS (hemisphere, easting,\n *  and northing) coordinates to an MGRS coordinate string according to\n *  the current ellipsoid parameters.  
If any errors occur, the error\n *  code(s) are returned by the function, otherwise MGRS_NO_ERROR is\n *  returned.\n *\n *    Hemisphere    : Hemisphere either 'N' or 'S'     (input)\n *    Easting       : Easting/X in meters              (input)\n *    Northing      : Northing/Y in meters             (input)\n *    Precision     : Precision level of MGRS string   (input)\n *    MGRS          : MGRS coordinate string           (output)\n */\n{ /* Convert_UPS_To_MGRS */\n  double false_easting;       /* False easting for 2nd letter                 */\n  double false_northing;      /* False northing for 3rd letter                */\n  double grid_easting;        /* Easting used to derive 2nd letter of MGRS    */\n  double grid_northing;       /* Northing used to derive 3rd letter of MGRS   */\n  long ltr2_low_value;        /* 2nd letter range - low number                */\n  int letters[MGRS_LETTERS];  /* Number location of 3 letters in alphabet     */\n  int index = 0;\n  long error_code = MGRS_NO_ERROR;\n\n  if ((Hemisphere != 'N') && (Hemisphere != 'S'))\n    error_code |= MGRS_HEMISPHERE_ERROR;\n  if ((Easting < MIN_EAST_NORTH) || (Easting > MAX_EAST_NORTH))\n    error_code |= MGRS_EASTING_ERROR;\n  if ((Northing < MIN_EAST_NORTH) || (Northing > MAX_EAST_NORTH))\n    error_code |= MGRS_NORTHING_ERROR;\n  if ((Precision < 0) || (Precision > MAX_PRECISION))\n    error_code |= MGRS_PRECISION_ERROR;\n  if (!error_code)\n  {\n\n    if (Hemisphere == 'N')\n    {\n      if (Easting >= TWOMIL)\n        letters[0] = LETTER_Z;\n      else\n        letters[0] = LETTER_Y;\n\n      index = letters[0] - 22;\n      ltr2_low_value = UPS_Constant_Table[index].ltr2_low_value;\n      false_easting = UPS_Constant_Table[index].false_easting;\n      false_northing = UPS_Constant_Table[index].false_northing;\n    }\n    else\n    {\n      if (Easting >= TWOMIL)\n        letters[0] = LETTER_B;\n      else\n        letters[0] = LETTER_A;\n\n      ltr2_low_value = 
UPS_Constant_Table[letters[0]].ltr2_low_value;\n      false_easting = UPS_Constant_Table[letters[0]].false_easting;\n      false_northing = UPS_Constant_Table[letters[0]].false_northing;\n    }\n\n    grid_northing = Northing;\n    grid_northing = grid_northing - false_northing;\n    letters[2] = (long)(grid_northing / ONEHT);\n\n    if (letters[2] > LETTER_H)\n      letters[2] = letters[2] + 1;\n\n    if (letters[2] > LETTER_N)\n      letters[2] = letters[2] + 1;\n\n    grid_easting = Easting;\n    grid_easting = grid_easting - false_easting;\n    letters[1] = ltr2_low_value + ((long)(grid_easting / ONEHT));\n\n    if (Easting < TWOMIL)\n    {\n      if (letters[1] > LETTER_L)\n        letters[1] = letters[1] + 3;\n\n      if (letters[1] > LETTER_U)\n        letters[1] = letters[1] + 2;\n    }\n    else\n    {\n      if (letters[1] > LETTER_C)\n        letters[1] = letters[1] + 2;\n\n      if (letters[1] > LETTER_H)\n        letters[1] = letters[1] + 1;\n\n      if (letters[1] > LETTER_L)\n        letters[1] = letters[1] + 3;\n    }\n\n    Make_MGRS_String (MGRS, 0, letters, Easting, Northing, Precision);\n  }\n  return (error_code);\n} /* Convert_UPS_To_MGRS */\n\n\nlong Convert_MGRS_To_UPS ( char   *MGRS,\n                           char   *Hemisphere,\n                           double *Easting,\n                           double *Northing)\n/*\n *  The function Convert_MGRS_To_UPS converts an MGRS coordinate string\n *  to UPS (hemisphere, easting, and northing) coordinates, according\n *  to the current ellipsoid parameters. 
If any errors occur, the error\n *  code(s) are returned by the function, otherwise MGRS_NO_ERROR is returned.\n *\n *    MGRS          : MGRS coordinate string           (input)\n *    Hemisphere    : Hemisphere either 'N' or 'S'     (output)\n *    Easting       : Easting/X in meters              (output)\n *    Northing      : Northing/Y in meters             (output)\n */\n{ /* Convert_MGRS_To_UPS */\n  long ltr2_high_value;       /* 2nd letter range - high number             */\n  long ltr3_high_value;       /* 3rd letter range - high number (UPS)       */\n  long ltr2_low_value;        /* 2nd letter range - low number              */\n  double false_easting;       /* False easting for 2nd letter               */\n  double false_northing;      /* False northing for 3rd letter              */\n  double grid_easting;        /* easting for 100,000 meter grid square      */\n  double grid_northing;       /* northing for 100,000 meter grid square     */\n  long zone;\n  long letters[MGRS_LETTERS];\n  long in_precision;\n  int index = 0;\n  long error_code = MGRS_NO_ERROR;\n\n  error_code = Break_MGRS_String (MGRS, &zone, letters, Easting, Northing, &in_precision);\n  if (zone)\n    error_code |= MGRS_STRING_ERROR;\n  else\n  {\n    if (!error_code)\n    {\n      if (letters[0] >= LETTER_Y)\n      {\n        *Hemisphere = 'N';\n\n        index = letters[0] - 22;\n        ltr2_low_value = UPS_Constant_Table[index].ltr2_low_value;\n        ltr2_high_value = UPS_Constant_Table[index].ltr2_high_value;\n        ltr3_high_value = UPS_Constant_Table[index].ltr3_high_value;\n        false_easting = UPS_Constant_Table[index].false_easting;\n        false_northing = UPS_Constant_Table[index].false_northing;\n      }\n      else\n      {\n        *Hemisphere = 'S';\n\n        ltr2_low_value = UPS_Constant_Table[letters[0]].ltr2_low_value;\n        ltr2_high_value = UPS_Constant_Table[letters[0]].ltr2_high_value;\n        ltr3_high_value = 
UPS_Constant_Table[letters[0]].ltr3_high_value;\n        false_easting = UPS_Constant_Table[letters[0]].false_easting;\n        false_northing = UPS_Constant_Table[letters[0]].false_northing;\n      }\n\n      /* Check that the second letter of the MGRS string is within\n       * the range of valid second letter values\n       * Also check that the third letter is valid */\n      if ((letters[1] < ltr2_low_value) || (letters[1] > ltr2_high_value) ||\n          ((letters[1] == LETTER_D) || (letters[1] == LETTER_E) ||\n          (letters[1] == LETTER_M) || (letters[1] == LETTER_N) ||\n          (letters[1] == LETTER_V) || (letters[1] == LETTER_W)) ||\n          (letters[2] > ltr3_high_value))\n          error_code |= MGRS_STRING_ERROR;\n\n      if (!error_code)\n      {\n        grid_northing = (double)letters[2] * ONEHT + false_northing;\n        if (letters[2] > LETTER_I)\n          grid_northing = grid_northing - ONEHT;\n\n        if (letters[2] > LETTER_O)\n          grid_northing = grid_northing - ONEHT;\n\n        grid_easting = (double)((letters[1]) - ltr2_low_value) * ONEHT + false_easting;\n        if (ltr2_low_value != LETTER_A)\n        {\n          if (letters[1] > LETTER_L)\n            grid_easting = grid_easting - 300000.0;\n\n          if (letters[1] > LETTER_U)\n            grid_easting = grid_easting - 200000.0;\n        }\n        else\n        {\n          if (letters[1] > LETTER_C)\n            grid_easting = grid_easting - 200000.0;\n\n          if (letters[1] > LETTER_I)\n            grid_easting = grid_easting - ONEHT;\n\n          if (letters[1] > LETTER_L)\n            grid_easting = grid_easting - 300000.0;\n        }\n\n        *Easting = grid_easting + *Easting;\n        *Northing = grid_northing + *Northing;\n      }\n    }\n  }\n  return (error_code);\n} /* Convert_MGRS_To_UPS */\n\nvoid initlibmgrs() {}\nvoid PyInit_libmgrs() {}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/mgrs.h",
    "content": "#ifndef MGRS_H\n#define MGRS_H\n\n#ifdef _WIN32\n#define USE_DLL __declspec(dllexport)\n#else\n#define USE_DLL\n#endif\n\n/***************************************************************************/\n/* RSC IDENTIFIER:  MGRS\n *\n * ABSTRACT\n *\n *    This component converts between geodetic coordinates (latitude and\n *    longitude) and Military Grid Reference System (MGRS) coordinates.\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an invalid value\n *    is found, the error code is combined with the current error code using\n *    the bitwise or.  This combining allows multiple error codes to be\n *    returned. The possible error codes are:\n *\n *          MGRS_NO_ERROR          : No errors occurred in function\n *          MGRS_LAT_ERROR         : Latitude outside of valid range\n *                                    (-90 to 90 degrees)\n *          MGRS_LON_ERROR         : Longitude outside of valid range\n *                                    (-180 to 360 degrees)\n *          MGRS_STR_ERROR         : An MGRS string error: string too long,\n *                                    too short, or badly formed\n *          MGRS_PRECISION_ERROR   : The precision must be between 0 and 5\n *                                    inclusive.\n *          MGRS_A_ERROR           : Semi-major axis less than or equal to zero\n *          MGRS_INV_F_ERROR       : Inverse flattening outside of valid range\n *\t\t\t\t\t\t\t\t\t                  (250 to 350)\n *          MGRS_EASTING_ERROR     : Easting outside of valid range\n *                                    (100,000 to 900,000 meters for UTM)\n *                                    (0 to 4,000,000 meters for UPS)\n *          MGRS_NORTHING_ERROR    : Northing outside of valid range\n *                                    (0 to 10,000,000 meters for UTM)\n *                                    (0 to 4,000,000 meters for UPS)\n *          MGRS_ZONE_ERROR        : 
Zone outside of valid range (1 to 60)\n *          MGRS_HEMISPHERE_ERROR  : Invalid hemisphere ('N' or 'S')\n *\n * REUSE NOTES\n *\n *    MGRS is intended for reuse by any application that does conversions\n *    between geodetic coordinates and MGRS coordinates.\n *\n * REFERENCES\n *\n *    Further information on MGRS can be found in the Reuse Manual.\n *\n *    MGRS originated from : U.S. Army Topographic Engineering Center\n *                           Geospatial Information Division\n *                           7701 Telegraph Road\n *                           Alexandria, VA  22310-3864\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n * RESTRICTIONS\n *\n *\n * ENVIRONMENT\n *\n *    MGRS was tested and certified in the following environments:\n *\n *    1. Solaris 2.5 with GCC version 2.8.1\n *    2. Windows 95 with MS Visual C++ version 6\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    16-11-94          Original Code\n *    15-09-99          Reengineered upper layers\n *\n */\n\n/***************************************************************************/\n/*\n *                              DEFINES\n */\n\n#define MGRS_NO_ERROR 0x0000\n#define MGRS_LAT_ERROR 0x0001\n#define MGRS_LON_ERROR 0x0002\n#define MGRS_STRING_ERROR 0x0004\n#define MGRS_PRECISION_ERROR 0x0008\n#define MGRS_A_ERROR 0x0010\n#define MGRS_INV_F_ERROR 0x0020\n#define MGRS_EASTING_ERROR 0x0040\n#define MGRS_NORTHING_ERROR 0x0080\n#define MGRS_ZONE_ERROR 0x0100\n#define MGRS_HEMISPHERE_ERROR 0x0200\n#define MGRS_LAT_WARNING 0x0400\n\n/***************************************************************************/\n/*\n *                              FUNCTION PROTOTYPES\n */\n\n/* ensure proper linkage to c++ programs */\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nlong USE_DLL Set_MGRS_Parameters(double a, double f, char* Ellipsoid_Code);\n/*\n * The function Set_MGRS_Parameters receives the ellipsoid parameters and 
sets\n * the corresponding state variables. If any errors occur, the error code(s)\n * are returned by the function, otherwise MGRS_NO_ERROR is returned.\n *\n *   a                : Semi-major axis of ellipsoid in meters (input)\n *   f                : Flattening of ellipsoid\t\t\t\t\t       (input)\n *   Ellipsoid_Code   : 2-letter code for ellipsoid            (input)\n */\n\nvoid USE_DLL Get_MGRS_Parameters(double* a, double* f, char* Ellipsoid_Code);\n/*\n * The function Get_MGRS_Parameters returns the current ellipsoid\n * parameters.\n *\n *  a                : Semi-major axis of ellipsoid, in meters (output)\n *  f                : Flattening of ellipsoid\t\t\t\t\t       (output)\n *  Ellipsoid_Code   : 2-letter code for ellipsoid             (output)\n */\n\nlong USE_DLL Convert_Geodetic_To_MGRS(double Latitude, double Longitude,\n                                      long Precision, char* MGRS);\n/*\n * The function Convert_Geodetic_To_MGRS converts geodetic (latitude and\n * longitude) coordinates to an MGRS coordinate string, according to the\n * current ellipsoid parameters.  If any errors occur, the error code(s)\n * are returned by the  function, otherwise MGRS_NO_ERROR is returned.\n *\n *    Latitude   : Latitude in radians              (input)\n *    Longitude  : Longitude in radians             (input)\n *    Precision  : Precision level of MGRS string   (input)\n *    MGRS       : MGRS coordinate string           (output)\n *\n */\n\nlong USE_DLL Convert_MGRS_To_Geodetic(char* MGRS, double* Latitude,\n                                      double* Longitude);\n/*\n * This function converts an MGRS coordinate string to Geodetic (latitude\n * and longitude in radians) coordinates.  
If any errors occur, the error\n * code(s) are returned by the  function, otherwise MGRS_NO_ERROR is returned.\n *\n *    MGRS       : MGRS coordinate string           (input)\n *    Latitude   : Latitude in radians              (output)\n *    Longitude  : Longitude in radians             (output)\n *\n */\n\nlong USE_DLL Convert_UTM_To_MGRS(long Zone, char Hemisphere, double Easting,\n                                 double Northing, long Precision, char* MGRS);\n/*\n * The function Convert_UTM_To_MGRS converts UTM (zone, easting, and\n * northing) coordinates to an MGRS coordinate string, according to the\n * current ellipsoid parameters.  If any errors occur, the error code(s)\n * are returned by the  function, otherwise MGRS_NO_ERROR is returned.\n *\n *    Zone       : UTM zone                         (input)\n *    Hemisphere : North or South hemisphere        (input)\n *    Easting    : Easting (X) in meters            (input)\n *    Northing   : Northing (Y) in meters           (input)\n *    Precision  : Precision level of MGRS string   (input)\n *    MGRS       : MGRS coordinate string           (output)\n */\n\nlong USE_DLL Convert_MGRS_To_UTM(char* MGRS, long* Zone, char* Hemisphere,\n                                 double* Easting, double* Northing);\n/*\n * The function Convert_MGRS_To_UTM converts an MGRS coordinate string\n * to UTM projection (zone, hemisphere, easting and northing) coordinates\n * according to the current ellipsoid parameters.  
If any errors occur,\n * the error code(s) are returned by the function, otherwise MGRS_NO_ERROR\n * is returned.\n *\n *    MGRS       : MGRS coordinate string           (input)\n *    Zone       : UTM zone                         (output)\n *    Hemisphere : North or South hemisphere        (output)\n *    Easting    : Easting (X) in meters            (output)\n *    Northing   : Northing (Y) in meters           (output)\n */\n\nlong USE_DLL Convert_UPS_To_MGRS(char Hemisphere, double Easting,\n                                 double Northing, long Precision, char* MGRS);\n\n/*\n *  The function Convert_UPS_To_MGRS converts UPS (hemisphere, easting,\n *  and northing) coordinates to an MGRS coordinate string according to\n *  the current ellipsoid parameters.  If any errors occur, the error\n *  code(s) are returned by the function, otherwise MGRS_NO_ERROR is\n *  returned.\n *\n *    Hemisphere    : Hemisphere either 'N' or 'S'     (input)\n *    Easting       : Easting/X in meters              (input)\n *    Northing      : Northing/Y in meters             (input)\n *    Precision     : Precision level of MGRS string   (input)\n *    MGRS          : MGRS coordinate string           (output)\n */\n\nlong USE_DLL Convert_MGRS_To_UPS(char* MGRS, char* Hemisphere, double* Easting,\n                                 double* Northing);\n/*\n *  The function Convert_MGRS_To_UPS converts an MGRS coordinate string\n *  to UPS (hemisphere, easting, and northing) coordinates, according\n *  to the current ellipsoid parameters. 
If any errors occur, the error\n *  code(s) are returned by the function, otherwise MGRS_NO_ERROR is returned.\n *\n *    MGRS          : MGRS coordinate string           (input)\n *    Hemisphere    : Hemisphere either 'N' or 'S'     (output)\n *    Easting       : Easting/X in meters              (output)\n *    Northing      : Northing/Y in meters             (output)\n */\n\nvoid initlibmgrs();\nvoid PyInit_libmgrs();\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* MGRS_H */\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/polarst.c",
    "content": "/***************************************************************************/\n/* RSC IDENTIFIER: POLAR STEREOGRAPHIC \n *\n *\n * ABSTRACT\n *\n *    This component provides conversions between geodetic (latitude and\n *    longitude) coordinates and Polar Stereographic (easting and northing) \n *    coordinates.\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an invalid \n *    value is found the error code is combined with the current error code \n *    using the bitwise or.  This combining allows multiple error codes to \n *    be returned. The possible error codes are:\n *\n *          POLAR_NO_ERROR           : No errors occurred in function\n *          POLAR_LAT_ERROR          : Latitude outside of valid range\n *                                      (-90 to 90 degrees)\n *          POLAR_LON_ERROR          : Longitude outside of valid range\n *                                      (-180 to 360 degrees) \n *          POLAR_ORIGIN_LAT_ERROR   : Latitude of true scale outside of valid\n *                                      range (-90 to 90 degrees)\n *          POLAR_ORIGIN_LON_ERROR   : Longitude down from pole outside of valid\n *                                      range (-180 to 360 degrees)\n *          POLAR_EASTING_ERROR      : Easting outside of valid range,\n *                                      depending on ellipsoid and\n *                                      projection parameters\n *          POLAR_NORTHING_ERROR     : Northing outside of valid range,\n *                                      depending on ellipsoid and\n *                                      projection parameters\n *          POLAR_RADIUS_ERROR       : Coordinates too far from pole,\n *                                      depending on ellipsoid and\n *                                      projection parameters\n *          POLAR_A_ERROR            : Semi-major axis less than or equal to zero\n *          
POLAR_INV_F_ERROR        : Inverse flattening outside of valid range\n *\t\t\t\t\t\t\t\t  \t                  (250 to 350)\n *\n *\n * REUSE NOTES\n *\n *    POLAR STEREOGRAPHIC is intended for reuse by any application that  \n *    performs a Polar Stereographic projection.\n *\n *\n * REFERENCES\n *\n *    Further information on POLAR STEREOGRAPHIC can be found in the\n *    Reuse Manual.\n *\n *\n *    POLAR STEREOGRAPHIC originated from :\n *                                U.S. Army Topographic Engineering Center\n *                                Geospatial Information Division\n *                                7701 Telegraph Road\n *                                Alexandria, VA  22310-3864\n *\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n *\n * RESTRICTIONS\n *\n *    POLAR STEREOGRAPHIC has no restrictions.\n *\n *\n * ENVIRONMENT\n *\n *    POLAR STEREOGRAPHIC was tested and certified in the following\n *    environments:\n *\n *    1. Solaris 2.5 with GCC, version 2.8.1\n *    2. 
Window 95 with MS Visual C++, version 6\n *\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    06-11-95          Original Code\n *    03-01-97          Original Code\n *\n *\n */\n\n\n/************************************************************************/\n/*\n *                               INCLUDES\n */\n\n#include <math.h>\n#include \"polarst.h\"\n\n/*\n *    math.h     - Standard C math library\n *    polarst.h  - Is for prototype error checking\n */\n\n\n/************************************************************************/\n/*                               DEFINES\n *\n */\n\n\n#define PI           3.14159265358979323e0       /* PI     */\n#define PI_OVER_2    (PI / 2.0)           \n#define TWO_PI       (2.0 * PI)\n#define POLAR_POW(EsSin)     pow((1.0 - EsSin) / (1.0 + EsSin), es_OVER_2)\n\n/************************************************************************/\n/*                           GLOBAL DECLARATIONS\n *\n */\n\nconst double PI_Over_4 = (PI / 4.0);\n\n/* Ellipsoid Parameters, default to WGS 84  */\nstatic double Polar_a = 6378137.0;                    /* Semi-major axis of ellipsoid in meters  */\nstatic double Polar_f = 1 / 298.257223563;            /* Flattening of ellipsoid  */\nstatic double es = 0.08181919084262188000;            /* Eccentricity of ellipsoid    */\nstatic double es_OVER_2 = .040909595421311;           /* es / 2.0 */\nstatic double Southern_Hemisphere = 0;                /* Flag variable */\nstatic double tc = 1.0;\nstatic double e4 = 1.0033565552493;\nstatic double Polar_a_mc = 6378137.0;                 /* Polar_a * mc */\nstatic double two_Polar_a = 12756274.0;               /* 2.0 * Polar_a */\n\n/* Polar Stereographic projection Parameters */\nstatic double Polar_Origin_Lat = ((PI * 90) / 180);   /* Latitude of origin in radians */\nstatic double Polar_Origin_Long = 0.0;                /* Longitude of origin in radians */\nstatic double Polar_False_Easting 
= 0.0;              /* False easting in meters */\nstatic double Polar_False_Northing = 0.0;             /* False northing in meters */\n\n/* Maximum variance for easting and northing values for WGS 84. */\nstatic double Polar_Delta_Easting = 12713601.0;\nstatic double Polar_Delta_Northing = 12713601.0;\n\n/* These state variables are for optimization purposes. The only function\n * that should modify them is Set_Polar_Stereographic_Parameters.         \n */\n\n\n/************************************************************************/\n/*                              FUNCTIONS\n *\n */\n\n\nlong Set_Polar_Stereographic_Parameters (double a,\n                                         double f,\n                                         double Latitude_of_True_Scale,\n                                         double Longitude_Down_from_Pole,\n                                         double False_Easting,\n                                         double False_Northing)\n\n{  /* BEGIN Set_Polar_Stereographic_Parameters   */\n/*  \n *  The function Set_Polar_Stereographic_Parameters receives the ellipsoid\n *  parameters and Polar Stereograpic projection parameters as inputs, and\n *  sets the corresponding state variables.  
If any errors occur, error\n *  code(s) are returned by the function, otherwise POLAR_NO_ERROR is returned.\n *\n *  a                : Semi-major axis of ellipsoid, in meters         (input)\n *  f                : Flattening of ellipsoid\t\t\t\t\t               (input)\n *  Latitude_of_True_Scale  : Latitude of true scale, in radians       (input)\n *  Longitude_Down_from_Pole : Longitude down from pole, in radians    (input)\n *  False_Easting    : Easting (X) at center of projection, in meters  (input)\n *  False_Northing   : Northing (Y) at center of projection, in meters (input)\n */\n\n  double es2;\n  double slat, clat;\n  double essin;\n  double one_PLUS_es, one_MINUS_es;\n  double pow_es;\n  double temp, temp_northing;\n  double inv_f = 1 / f;\n  double mc;                    \n//  const double  epsilon = 1.0e-2;\n  long Error_Code = POLAR_NO_ERROR;\n\n  if (a <= 0.0)\n  { /* Semi-major axis must be greater than zero */\n    Error_Code |= POLAR_A_ERROR;\n  }\n  if ((inv_f < 250) || (inv_f > 350))\n  { /* Inverse flattening must be between 250 and 350 */\n    Error_Code |= POLAR_INV_F_ERROR;\n  }\n  if ((Latitude_of_True_Scale < -PI_OVER_2) || (Latitude_of_True_Scale > PI_OVER_2))\n  { /* Origin Latitude out of range */\n    Error_Code |= POLAR_ORIGIN_LAT_ERROR;\n  }\n  if ((Longitude_Down_from_Pole < -PI) || (Longitude_Down_from_Pole > TWO_PI))\n  { /* Origin Longitude out of range */\n    Error_Code |= POLAR_ORIGIN_LON_ERROR;\n  }\n\n  if (!Error_Code)\n  { /* no errors */\n\n    Polar_a = a;\n    two_Polar_a = 2.0 * Polar_a;\n    Polar_f = f;\n\n    if (Longitude_Down_from_Pole > PI)\n      Longitude_Down_from_Pole -= TWO_PI;\n    if (Latitude_of_True_Scale < 0)\n    {\n      Southern_Hemisphere = 1;\n      Polar_Origin_Lat = -Latitude_of_True_Scale;\n      Polar_Origin_Long = -Longitude_Down_from_Pole;\n    }\n    else\n    {\n      Southern_Hemisphere = 0;\n      Polar_Origin_Lat = Latitude_of_True_Scale;\n      Polar_Origin_Long = 
Longitude_Down_from_Pole;\n    }\n    Polar_False_Easting = False_Easting;\n    Polar_False_Northing = False_Northing;\n\n    es2 = 2 * Polar_f - Polar_f * Polar_f;\n    es = sqrt(es2);\n    es_OVER_2 = es / 2.0;\n\n    if (fabs(fabs(Polar_Origin_Lat) - PI_OVER_2) > 1.0e-10)\n    {\n      slat = sin(Polar_Origin_Lat);\n      essin = es * slat;\n      pow_es = POLAR_POW(essin);\n      clat = cos(Polar_Origin_Lat);\n      mc = clat / sqrt(1.0 - essin * essin);\n      Polar_a_mc = Polar_a * mc;\n      tc = tan(PI_Over_4 - Polar_Origin_Lat / 2.0) / pow_es;\n    }\n    else\n    {\n      one_PLUS_es = 1.0 + es;\n      one_MINUS_es = 1.0 - es;\n      e4 = sqrt(pow(one_PLUS_es, one_PLUS_es) * pow(one_MINUS_es, one_MINUS_es));\n    }\n\n    /* Calculate Radius */\n    Convert_Geodetic_To_Polar_Stereographic(0, Longitude_Down_from_Pole, \n                                            &temp, &temp_northing);\n\n    Polar_Delta_Northing = temp_northing;\n    if(Polar_False_Northing)\n      Polar_Delta_Northing -= Polar_False_Northing;\n    if (Polar_Delta_Northing < 0)\n      Polar_Delta_Northing = -Polar_Delta_Northing;\n    Polar_Delta_Northing *= 1.01;\n\n    Polar_Delta_Easting = Polar_Delta_Northing;\n\n  /*  Polar_Delta_Easting = temp_northing;\n    if(Polar_False_Easting)\n      Polar_Delta_Easting -= Polar_False_Easting;\n    if (Polar_Delta_Easting < 0)\n      Polar_Delta_Easting = -Polar_Delta_Easting;\n    Polar_Delta_Easting *= 1.01;*/\n  }\n\n  return (Error_Code);\n} /* END OF Set_Polar_Stereographic_Parameters */\n\n\n\nvoid Get_Polar_Stereographic_Parameters (double *a,\n                                         double *f,\n                                         double *Latitude_of_True_Scale,\n                                         double *Longitude_Down_from_Pole,\n                                         double *False_Easting,\n                                         double *False_Northing)\n\n{ /* BEGIN Get_Polar_Stereographic_Parameters  */\n/*\n * The 
function Get_Polar_Stereographic_Parameters returns the current\n * ellipsoid parameters and Polar projection parameters.\n *\n *  a                : Semi-major axis of ellipsoid, in meters         (output)\n *  f                : Flattening of ellipsoid\t\t\t\t\t               (output)\n *  Latitude_of_True_Scale  : Latitude of true scale, in radians       (output)\n *  Longitude_Down_from_Pole : Longitude down from pole, in radians    (output)\n *  False_Easting    : Easting (X) at center of projection, in meters  (output)\n *  False_Northing   : Northing (Y) at center of projection, in meters (output)\n */\n\n  *a = Polar_a;\n  *f = Polar_f;\n  *Latitude_of_True_Scale = Polar_Origin_Lat;\n  *Longitude_Down_from_Pole = Polar_Origin_Long;\n  *False_Easting = Polar_False_Easting;\n  *False_Northing = Polar_False_Northing;\n  return;\n} /* END OF Get_Polar_Stereographic_Parameters */\n\n\nlong Convert_Geodetic_To_Polar_Stereographic (double Latitude,\n                                              double Longitude,\n                                              double *Easting,\n                                              double *Northing)\n\n{  /* BEGIN Convert_Geodetic_To_Polar_Stereographic */\n\n/*\n * The function Convert_Geodetic_To_Polar_Stereographic converts geodetic\n * coordinates (latitude and longitude) to Polar Stereographic coordinates\n * (easting and northing), according to the current ellipsoid\n * and Polar Stereographic projection parameters. 
If any errors occur, error\n * code(s) are returned by the function, otherwise POLAR_NO_ERROR is returned.\n *\n *    Latitude   :  Latitude, in radians                      (input)\n *    Longitude  :  Longitude, in radians                     (input)\n *    Easting    :  Easting (X), in meters                    (output)\n *    Northing   :  Northing (Y), in meters                   (output)\n */\n\n  double dlam;\n  double slat;\n  double essin;\n  double t;\n  double rho;\n  double pow_es;\n  long Error_Code = POLAR_NO_ERROR;\n\n  if ((Latitude < -PI_OVER_2) || (Latitude > PI_OVER_2))\n  {   /* Latitude out of range */\n    Error_Code |= POLAR_LAT_ERROR;\n  }\n  if ((Latitude < 0) && (Southern_Hemisphere == 0))\n  {   /* Latitude and Origin Latitude in different hemispheres */\n    Error_Code |= POLAR_LAT_ERROR;\n  }\n  if ((Latitude > 0) && (Southern_Hemisphere == 1))\n  {   /* Latitude and Origin Latitude in different hemispheres */\n    Error_Code |= POLAR_LAT_ERROR;\n  }\n  if ((Longitude < -PI) || (Longitude > TWO_PI))\n  {  /* Longitude out of range */\n    Error_Code |= POLAR_LON_ERROR;\n  }\n\n\n  if (!Error_Code)\n  {  /* no errors */\n\n    if (fabs(fabs(Latitude) - PI_OVER_2) < 1.0e-10)\n    {\n      *Easting = Polar_False_Easting;\n      *Northing = Polar_False_Northing;\n    }\n    else\n    {\n      if (Southern_Hemisphere != 0)\n      {\n        Longitude *= -1.0;\n        Latitude *= -1.0;\n      }\n      dlam = Longitude - Polar_Origin_Long;\n      if (dlam > PI)\n      {\n        dlam -= TWO_PI;\n      }\n      if (dlam < -PI)\n      {\n        dlam += TWO_PI;\n      }\n      slat = sin(Latitude);\n      essin = es * slat;\n      pow_es = POLAR_POW(essin);\n      t = tan(PI_Over_4 - Latitude / 2.0) / pow_es;\n\n      if (fabs(fabs(Polar_Origin_Lat) - PI_OVER_2) > 1.0e-10)\n        rho = Polar_a_mc * t / tc;\n      else\n        rho = two_Polar_a * t / e4;\n\n\n      if (Southern_Hemisphere != 0)\n      {\n        *Easting = -(rho * sin(dlam) - 
Polar_False_Easting);\n     //   *Easting *= -1.0;\n        *Northing = rho * cos(dlam) + Polar_False_Northing;\n      }\n      else\n      {\n        *Easting = rho * sin(dlam) + Polar_False_Easting;\n        *Northing = -rho * cos(dlam) + Polar_False_Northing;\n      }\n\n    }\n  }\n  return (Error_Code);\n} /* END OF Convert_Geodetic_To_Polar_Stereographic */\n\n\nlong Convert_Polar_Stereographic_To_Geodetic (double Easting,\n                                              double Northing,\n                                              double *Latitude,\n                                              double *Longitude)\n\n{ /*  BEGIN Convert_Polar_Stereographic_To_Geodetic  */\n/*\n *  The function Convert_Polar_Stereographic_To_Geodetic converts Polar\n *  Stereographic coordinates (easting and northing) to geodetic\n *  coordinates (latitude and longitude) according to the current ellipsoid\n *  and Polar Stereographic projection Parameters. If any errors occur, the\n *  code(s) are returned by the function, otherwise POLAR_NO_ERROR\n *  is returned.\n *\n *  Easting          : Easting (X), in meters                   (input)\n *  Northing         : Northing (Y), in meters                  (input)\n *  Latitude         : Latitude, in radians                     (output)\n *  Longitude        : Longitude, in radians                    (output)\n *\n */\n\n  double dy = 0, dx = 0;\n  double rho = 0;\n  double t;\n  double PHI, sin_PHI;\n  double tempPHI = 0.0;\n  double essin;\n  double pow_es;\n  double delta_radius;\n  long Error_Code = POLAR_NO_ERROR;\n  double min_easting = Polar_False_Easting - Polar_Delta_Easting;\n  double max_easting = Polar_False_Easting + Polar_Delta_Easting;\n  double min_northing = Polar_False_Northing - Polar_Delta_Northing;\n  double max_northing = Polar_False_Northing + Polar_Delta_Northing;\n\n  if (Easting > max_easting || Easting < min_easting)\n  { /* Easting out of range */\n    Error_Code |= POLAR_EASTING_ERROR;\n  }\n  if 
(Northing > max_northing || Northing < min_northing)\n  { /* Northing out of range */\n    Error_Code |= POLAR_NORTHING_ERROR;\n  }\n\n  if (!Error_Code)\n  {\n    dy = Northing - Polar_False_Northing;\n    dx = Easting - Polar_False_Easting;\n\n    /* Radius of point with origin of false easting, false northing */\n    rho = sqrt(dx * dx + dy * dy);   \n    \n    delta_radius = sqrt(Polar_Delta_Easting * Polar_Delta_Easting + Polar_Delta_Northing * Polar_Delta_Northing);\n\n    if(rho > delta_radius)\n    { /* Point is outside of projection area */\n      Error_Code |= POLAR_RADIUS_ERROR;\n    }\n\n    if (!Error_Code)\n    { /* no errors */\n      if ((dy == 0.0) && (dx == 0.0))\n      {\n        *Latitude = PI_OVER_2;\n        *Longitude = Polar_Origin_Long;\n\n      }\n      else\n      {\n        if (Southern_Hemisphere != 0)\n        {\n          dy *= -1.0;\n          dx *= -1.0;\n        }\n\n        if (fabs(fabs(Polar_Origin_Lat) - PI_OVER_2) > 1.0e-10)\n          t = rho * tc / (Polar_a_mc);\n        else\n          t = rho * e4 / (two_Polar_a);\n        PHI = PI_OVER_2 - 2.0 * atan(t);\n        while (fabs(PHI - tempPHI) > 1.0e-10)\n        {\n          tempPHI = PHI;\n          sin_PHI = sin(PHI);\n          essin =  es * sin_PHI;\n          pow_es = POLAR_POW(essin);\n          PHI = PI_OVER_2 - 2.0 * atan(t * pow_es);\n        }\n        *Latitude = PHI;\n        *Longitude = Polar_Origin_Long + atan2(dx, -dy);\n\n        if (*Longitude > PI)\n          *Longitude -= TWO_PI;\n        else if (*Longitude < -PI)\n          *Longitude += TWO_PI;\n\n\n        if (*Latitude > PI_OVER_2)  /* force distorted values to 90, -90 degrees */\n          *Latitude = PI_OVER_2;\n        else if (*Latitude < -PI_OVER_2)\n          *Latitude = -PI_OVER_2;\n\n        if (*Longitude > PI)  /* force distorted values to 180, -180 degrees */\n          *Longitude = PI;\n        else if (*Longitude < -PI)\n          *Longitude = -PI;\n\n      }\n      if 
(Southern_Hemisphere != 0)\n      {\n        *Latitude *= -1.0;\n        *Longitude *= -1.0;\n      }\n    }\n  }\n  return (Error_Code);\n} /* END OF Convert_Polar_Stereographic_To_Geodetic */\n\n\n\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/polarst.h",
    "content": "#ifndef POLARST_H\n#define POLARST_H\n/***************************************************************************/\n/* RSC IDENTIFIER: POLAR STEREOGRAPHIC\n *\n *\n * ABSTRACT\n *\n *    This component provides conversions between geodetic (latitude and\n *    longitude) coordinates and Polar Stereographic (easting and northing)\n *    coordinates.\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an invalid\n *    value is found the error code is combined with the current error code\n *    using the bitwise or.  This combining allows multiple error codes to\n *    be returned. The possible error codes are:\n *\n *          POLAR_NO_ERROR           : No errors occurred in function\n *          POLAR_LAT_ERROR          : Latitude outside of valid range\n *                                      (-90 to 90 degrees)\n *          POLAR_LON_ERROR          : Longitude outside of valid range\n *                                      (-180 to 360 degrees)\n *          POLAR_ORIGIN_LAT_ERROR   : Latitude of true scale outside of valid\n *                                      range (-90 to 90 degrees)\n *          POLAR_ORIGIN_LON_ERROR   : Longitude down from pole outside of valid\n *                                      range (-180 to 360 degrees)\n *          POLAR_EASTING_ERROR      : Easting outside of valid range,\n *                                      depending on ellipsoid and\n *                                      projection parameters\n *          POLAR_NORTHING_ERROR     : Northing outside of valid range,\n *                                      depending on ellipsoid and\n *                                      projection parameters\n *          POLAR_RADIUS_ERROR       : Coordinates too far from pole,\n *                                      depending on ellipsoid and\n *                                      projection parameters\n *          POLAR_A_ERROR            : Semi-major axis less than or equal 
to\n *                                      zero\n *          POLAR_INV_F_ERROR        : Inverse flattening outside of valid range\n *                                      (250 to 350)\n *\n *\n * REUSE NOTES\n *\n *    POLAR STEREOGRAPHIC is intended for reuse by any application that\n *    performs a Polar Stereographic projection.\n *\n *\n * REFERENCES\n *\n *    Further information on POLAR STEREOGRAPHIC can be found in the\n *    Reuse Manual.\n *\n *\n *    POLAR STEREOGRAPHIC originated from :\n *                                U.S. Army Topographic Engineering Center\n *                                Geospatial Information Division\n *                                7701 Telegraph Road\n *                                Alexandria, VA  22310-3864\n *\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n *\n * RESTRICTIONS\n *\n *    POLAR STEREOGRAPHIC has no restrictions.\n *\n *\n * ENVIRONMENT\n *\n *    POLAR STEREOGRAPHIC was tested and certified in the following\n *    environments:\n *\n *    1. Solaris 2.5 with GCC, version 2.8.1\n *    2. Windows 95 with MS Visual C++, version 6\n *\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    06-11-95          Original Code\n *    03-01-97          Original Code\n *\n *\n */\n\n/**********************************************************************/\n/*\n *                        DEFINES\n */\n\n#define POLAR_NO_ERROR 0x0000\n#define POLAR_LAT_ERROR 0x0001\n#define POLAR_LON_ERROR 0x0002\n#define POLAR_ORIGIN_LAT_ERROR 0x0004\n#define POLAR_ORIGIN_LON_ERROR 0x0008\n#define POLAR_EASTING_ERROR 0x0010\n#define POLAR_NORTHING_ERROR 0x0020\n#define POLAR_A_ERROR 0x0040\n#define POLAR_INV_F_ERROR 0x0080\n#define POLAR_RADIUS_ERROR 0x0100\n\n/**********************************************************************/\n/*\n *                        FUNCTION PROTOTYPES\n */\n\n/* ensure proper linkage to c++ programs */\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nlong Set_Polar_Stereographic_Parameters(double a, double f,\n                
                        double Latitude_of_True_Scale,\n                                        double Longitude_Down_from_Pole,\n                                        double False_Easting,\n                                        double False_Northing);\n/*\n *  The function Set_Polar_Stereographic_Parameters receives the ellipsoid\n *  parameters and Polar Stereograpic projection parameters as inputs, and\n *  sets the corresponding state variables.  If any errors occur, error\n *  code(s) are returned by the function, otherwise POLAR_NO_ERROR is returned.\n *\n *  a                : Semi-major axis of ellipsoid, in meters         (input)\n *  f                : Flattening of ellipsoid                         (input)\n *  Latitude_of_True_Scale  : Latitude of true scale, in radians       (input)\n *  Longitude_Down_from_Pole : Longitude down from pole, in radians    (input)\n *  False_Easting    : Easting (X) at center of projection, in meters  (input)\n *  False_Northing   : Northing (Y) at center of projection, in meters (input)\n */\n\nvoid Get_Polar_Stereographic_Parameters(double* a, double* f,\n                                        double* Latitude_of_True_Scale,\n                                        double* Longitude_Down_from_Pole,\n                                        double* False_Easting,\n                                        double* False_Northing);\n/*\n * The function Get_Polar_Stereographic_Parameters returns the current\n * ellipsoid parameters and Polar projection parameters.\n *\n *  a                : Semi-major axis of ellipsoid, in meters         (output)\n *  f                : Flattening of ellipsoid                         (output)\n *  Latitude_of_True_Scale  : Latitude of true scale, in radians       (output)\n *  Longitude_Down_from_Pole : Longitude down from pole, in radians    (output)\n *  False_Easting    : Easting (X) at center of projection, in meters  (output)\n *  False_Northing   : Northing (Y) at center of 
projection, in meters (output)\n */\n\nlong Convert_Geodetic_To_Polar_Stereographic(double Latitude, double Longitude,\n                                             double* Easting, double* Northing);\n/*\n * The function Convert_Geodetic_To_Polar_Stereographic converts geodetic\n * coordinates (latitude and longitude) to Polar Stereographic coordinates\n * (easting and northing), according to the current ellipsoid\n * and Polar Stereographic projection parameters. If any errors occur, error\n * code(s) are returned by the function, otherwise POLAR_NO_ERROR is returned.\n *\n *    Latitude   :  Latitude, in radians                      (input)\n *    Longitude  :  Longitude, in radians                     (input)\n *    Easting    :  Easting (X), in meters                    (output)\n *    Northing   :  Northing (Y), in meters                   (output)\n */\n\nlong Convert_Polar_Stereographic_To_Geodetic(double Easting, double Northing,\n                                             double* Latitude,\n                                             double* Longitude);\n\n/*\n *  The function Convert_Polar_Stereographic_To_Geodetic converts Polar\n *  Stereographic coordinates (easting and northing) to geodetic\n *  coordinates (latitude and longitude) according to the current ellipsoid\n *  and Polar Stereographic projection Parameters. If any errors occur, the\n *  code(s) are returned by the function, otherwise POLAR_NO_ERROR\n *  is returned.\n *\n *  Easting          : Easting (X), in meters                   (input)\n *  Northing         : Northing (Y), in meters                  (input)\n *  Latitude         : Latitude, in radians                     (output)\n *  Longitude        : Longitude, in radians                    (output)\n *\n */\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* POLARST_H  */\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/tranmerc.c",
    "content": "/***************************************************************************/\n/* RSC IDENTIFIER: TRANSVERSE MERCATOR\n *\n * ABSTRACT\n *\n *    This component provides conversions between Geodetic coordinates \n *    (latitude and longitude) and Transverse Mercator projection coordinates\n *    (easting and northing).\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an invalid value\n *    is found the error code is combined with the current error code using \n *    the bitwise or.  This combining allows multiple error codes to be\n *    returned. The possible error codes are:\n *\n *       TRANMERC_NO_ERROR           : No errors occurred in function\n *       TRANMERC_LAT_ERROR          : Latitude outside of valid range\n *                                      (-90 to 90 degrees)\n *       TRANMERC_LON_ERROR          : Longitude outside of valid range\n *                                      (-180 to 360 degrees, and within\n *                                        +/-90 of Central Meridian)\n *       TRANMERC_EASTING_ERROR      : Easting outside of valid range\n *                                      (depending on ellipsoid and\n *                                       projection parameters)\n *       TRANMERC_NORTHING_ERROR     : Northing outside of valid range\n *                                      (depending on ellipsoid and\n *                                       projection parameters)\n *       TRANMERC_ORIGIN_LAT_ERROR   : Origin latitude outside of valid range\n *                                      (-90 to 90 degrees)\n *       TRANMERC_CENT_MER_ERROR     : Central meridian outside of valid range\n *                                      (-180 to 360 degrees)\n *       TRANMERC_A_ERROR            : Semi-major axis less than or equal to zero\n *       TRANMERC_INV_F_ERROR        : Inverse flattening outside of valid range\n *\t\t\t\t\t\t\t\t  \t                  (250 to 350)\n *       
TRANMERC_SCALE_FACTOR_ERROR : Scale factor outside of valid\n *                                     range (0.3 to 3.0)\n *\t\t TM_LON_WARNING              : Distortion will result if longitude is more\n *                                       than 9 degrees from the Central Meridian\n *\n * REUSE NOTES\n *\n *    TRANSVERSE MERCATOR is intended for reuse by any application that \n *    performs a Transverse Mercator projection or its inverse.\n *    \n * REFERENCES\n *\n *    Further information on TRANSVERSE MERCATOR can be found in the \n *    Reuse Manual.\n *\n *    TRANSVERSE MERCATOR originated from :  \n *                      U.S. Army Topographic Engineering Center\n *                      Geospatial Information Division\n *                      7701 Telegraph Road\n *                      Alexandria, VA  22310-3864\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n * RESTRICTIONS\n *\n *    TRANSVERSE MERCATOR has no restrictions.\n *\n * ENVIRONMENT\n *\n *    TRANSVERSE MERCATOR was tested and certified in the following \n *    environments:\n *\n *    1. Solaris 2.5 with GCC, version 2.8.1\n *    2. 
Windows 95 with MS Visual C++, version 6\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    10-02-97          Original Code\n *    03-02-97          Re-engineered Code\n *\n */\n\n\n/***************************************************************************/\n/*\n *                               INCLUDES\n */\n\n#include <math.h>\n#include \"tranmerc.h\"\n\n/*\n *    math.h      - Standard C math library\n *    tranmerc.h  - Is for prototype error checking\n */\n\n\n/***************************************************************************/\n/*                               DEFINES \n *\n */\n\n#define PI              3.14159265358979323e0   /* PI     */\n#define PI_OVER_2         (PI/2.0e0)            /* PI over 2 */\n#define MAX_LAT         ((PI * 89.99)/180.0)    /* 89.99 degrees in radians */\n#define MAX_DELTA_LONG  ((PI * 90)/180.0)       /* 90 degrees in radians */\n#define MIN_SCALE_FACTOR  0.3\n#define MAX_SCALE_FACTOR  3.0\n\n#define SPHTMD(Latitude) ((double) (TranMerc_ap * Latitude \\\n      - TranMerc_bp * sin(2.e0 * Latitude) + TranMerc_cp * sin(4.e0 * Latitude) \\\n      - TranMerc_dp * sin(6.e0 * Latitude) + TranMerc_ep * sin(8.e0 * Latitude) ) )\n\n#define SPHSN(Latitude) ((double) (TranMerc_a / sqrt( 1.e0 - TranMerc_es * \\\n      pow(sin(Latitude), 2))))\n\n#define SPHSR(Latitude) ((double) (TranMerc_a * (1.e0 - TranMerc_es) / \\\n    pow(DENOM(Latitude), 3)))\n\n#define DENOM(Latitude) ((double) (sqrt(1.e0 - TranMerc_es * pow(sin(Latitude),2))))\n\n\n/**************************************************************************/\n/*                               GLOBAL DECLARATIONS\n *\n */\n\n/* Ellipsoid Parameters, default to WGS 84  */\nstatic double TranMerc_a = 6378137.0;              /* Semi-major axis of ellipsoid in meters */\nstatic double TranMerc_f = 1 / 298.257223563;      /* Flattening of ellipsoid  */\nstatic double TranMerc_es = 0.0066943799901413800; /* Eccentricity 
(0.08181919084262188000) squared */\nstatic double TranMerc_ebs = 0.0067394967565869;   /* Second Eccentricity squared */\n\n/* Transverse_Mercator projection Parameters */\nstatic double TranMerc_Origin_Lat = 0.0;           /* Latitude of origin in radians */\nstatic double TranMerc_Origin_Long = 0.0;          /* Longitude of origin in radians */\nstatic double TranMerc_False_Northing = 0.0;       /* False northing in meters */\nstatic double TranMerc_False_Easting = 0.0;        /* False easting in meters */\nstatic double TranMerc_Scale_Factor = 1.0;         /* Scale factor  */\n\n/* Isometric to geodetic latitude parameters, default to WGS 84 */\nstatic double TranMerc_ap = 6367449.1458008;\nstatic double TranMerc_bp = 16038.508696861;\nstatic double TranMerc_cp = 16.832613334334;\nstatic double TranMerc_dp = 0.021984404273757;\nstatic double TranMerc_ep = 3.1148371319283e-005;\n\n/* Maximum variance for easting and northing values for WGS 84. */\nstatic double TranMerc_Delta_Easting = 40000000.0;\nstatic double TranMerc_Delta_Northing = 40000000.0;\n\n/* These state variables are for optimization purposes. The only function\n * that should modify them is Set_Transverse_Mercator_Parameters.        
*/\n\n\n/************************************************************************/\n/*                              FUNCTIONS     \n *\n */\n\n\nlong Set_Transverse_Mercator_Parameters(double a,\n                                        double f,\n                                        double Origin_Latitude,\n                                        double Central_Meridian,\n                                        double False_Easting,\n                                        double False_Northing,\n                                        double Scale_Factor)\n\n{ /* BEGIN Set_Tranverse_Mercator_Parameters */\n  /*\n   * The function Set_Tranverse_Mercator_Parameters receives the ellipsoid\n   * parameters and Tranverse Mercator projection parameters as inputs, and\n   * sets the corresponding state variables. If any errors occur, the error\n   * code(s) are returned by the function, otherwise TRANMERC_NO_ERROR is\n   * returned.\n   *\n   *    a                 : Semi-major axis of ellipsoid, in meters    (input)\n   *    f                 : Flattening of ellipsoid\t\t\t\t\t\t         (input)\n   *    Origin_Latitude   : Latitude in radians at the origin of the   (input)\n   *                         projection\n   *    Central_Meridian  : Longitude in radians at the center of the  (input)\n   *                         projection\n   *    False_Easting     : Easting/X at the center of the projection  (input)\n   *    False_Northing    : Northing/Y at the center of the projection (input)\n   *    Scale_Factor      : Projection scale factor                    (input) \n   */\n\n  double tn;        /* True Meridianal distance constant  */\n  double tn2;\n  double tn3;\n  double tn4;\n  double tn5;\n  double dummy_northing;\n  double TranMerc_b; /* Semi-minor axis of ellipsoid, in meters */\n  double inv_f = 1 / f;\n  long Error_Code = TRANMERC_NO_ERROR;\n\n  if (a <= 0.0)\n  { /* Semi-major axis must be greater than zero */\n    Error_Code |= TRANMERC_A_ERROR;\n  }\n 
 if ((inv_f < 250) || (inv_f > 350))\n  { /* Inverse flattening must be between 250 and 350 */\n    Error_Code |= TRANMERC_INV_F_ERROR;\n  }\n  if ((Origin_Latitude < -PI_OVER_2) || (Origin_Latitude > PI_OVER_2))\n  { /* origin latitude out of range */\n    Error_Code |= TRANMERC_ORIGIN_LAT_ERROR;\n  }\n  if ((Central_Meridian < -PI) || (Central_Meridian > (2*PI)))\n  { /* origin longitude out of range */\n    Error_Code |= TRANMERC_CENT_MER_ERROR;\n  }\n  if ((Scale_Factor < MIN_SCALE_FACTOR) || (Scale_Factor > MAX_SCALE_FACTOR))\n  {\n    Error_Code |= TRANMERC_SCALE_FACTOR_ERROR;\n  }\n  if (!Error_Code)\n  { /* no errors */\n    TranMerc_a = a;\n    TranMerc_f = f;\n    TranMerc_Origin_Lat = Origin_Latitude;\n    if (Central_Meridian > PI)\n      Central_Meridian -= (2*PI);\n    TranMerc_Origin_Long = Central_Meridian;\n    TranMerc_False_Northing = False_Northing;\n    TranMerc_False_Easting = False_Easting; \n    TranMerc_Scale_Factor = Scale_Factor;\n\n    /* Eccentricity Squared */\n    TranMerc_es = 2 * TranMerc_f - TranMerc_f * TranMerc_f;\n    /* Second Eccentricity Squared */\n    TranMerc_ebs = (1 / (1 - TranMerc_es)) - 1;\n\n    TranMerc_b = TranMerc_a * (1 - TranMerc_f);    \n    /*True meridianal constants  */\n    tn = (TranMerc_a - TranMerc_b) / (TranMerc_a + TranMerc_b);\n    tn2 = tn * tn;\n    tn3 = tn2 * tn;\n    tn4 = tn3 * tn;\n    tn5 = tn4 * tn;\n\n    TranMerc_ap = TranMerc_a * (1.e0 - tn + 5.e0 * (tn2 - tn3)/4.e0\n                                + 81.e0 * (tn4 - tn5)/64.e0 );\n    TranMerc_bp = 3.e0 * TranMerc_a * (tn - tn2 + 7.e0 * (tn3 - tn4)\n                                       /8.e0 + 55.e0 * tn5/64.e0 )/2.e0;\n    TranMerc_cp = 15.e0 * TranMerc_a * (tn2 - tn3 + 3.e0 * (tn4 - tn5 )/4.e0) /16.0;\n    TranMerc_dp = 35.e0 * TranMerc_a * (tn3 - tn4 + 11.e0 * tn5 / 16.e0) / 48.e0;\n    TranMerc_ep = 315.e0 * TranMerc_a * (tn4 - tn5) / 512.e0;\n    Convert_Geodetic_To_Transverse_Mercator(MAX_LAT,\n                                        
    MAX_DELTA_LONG + Central_Meridian,\n                                            &TranMerc_Delta_Easting,\n                                            &TranMerc_Delta_Northing);\n    Convert_Geodetic_To_Transverse_Mercator(0,\n                                            MAX_DELTA_LONG + Central_Meridian,\n                                            &TranMerc_Delta_Easting,\n                                            &dummy_northing);\n    TranMerc_Delta_Northing++;\n    TranMerc_Delta_Easting++;\n\n  } /* END OF if(!Error_Code) */\n  return (Error_Code);\n}  /* END of Set_Transverse_Mercator_Parameters  */\n\n\nvoid Get_Transverse_Mercator_Parameters(double *a,\n                                        double *f,\n                                        double *Origin_Latitude,\n                                        double *Central_Meridian,\n                                        double *False_Easting,\n                                        double *False_Northing,\n                                        double *Scale_Factor)\n\n{ /* BEGIN Get_Tranverse_Mercator_Parameters  */\n  /*\n   * The function Get_Transverse_Mercator_Parameters returns the current\n   * ellipsoid and Transverse Mercator projection parameters.\n   *\n   *    a                 : Semi-major axis of ellipsoid, in meters    (output)\n   *    f                 : Flattening of ellipsoid\t\t\t\t\t\t         (output)\n   *    Origin_Latitude   : Latitude in radians at the origin of the   (output)\n   *                         projection\n   *    Central_Meridian  : Longitude in radians at the center of the  (output)\n   *                         projection\n   *    False_Easting     : Easting/X at the center of the projection  (output)\n   *    False_Northing    : Northing/Y at the center of the projection (output)\n   *    Scale_Factor      : Projection scale factor                    (output) \n   */\n\n  *a = TranMerc_a;\n  *f = TranMerc_f;\n  *Origin_Latitude = TranMerc_Origin_Lat;\n  
*Central_Meridian = TranMerc_Origin_Long;\n  *False_Easting = TranMerc_False_Easting;\n  *False_Northing = TranMerc_False_Northing;\n  *Scale_Factor = TranMerc_Scale_Factor;\n  return;\n} /* END OF Get_Tranverse_Mercator_Parameters */\n\n\n\nlong Convert_Geodetic_To_Transverse_Mercator (double Latitude,\n                                              double Longitude,\n                                              double *Easting,\n                                              double *Northing)\n\n{      /* BEGIN Convert_Geodetic_To_Transverse_Mercator */\n\n  /*\n   * The function Convert_Geodetic_To_Transverse_Mercator converts geodetic\n   * (latitude and longitude) coordinates to Transverse Mercator projection\n   * (easting and northing) coordinates, according to the current ellipsoid\n   * and Transverse Mercator projection coordinates.  If any errors occur, the\n   * error code(s) are returned by the function, otherwise TRANMERC_NO_ERROR is\n   * returned.\n   *\n   *    Latitude      : Latitude in radians                         (input)\n   *    Longitude     : Longitude in radians                        (input)\n   *    Easting       : Easting/X in meters                         (output)\n   *    Northing      : Northing/Y in meters                        (output)\n   */\n\n  double c;       /* Cosine of latitude                          */\n  double c2;\n  double c3;\n  double c5;\n  double c7;\n  double dlam;    /* Delta longitude - Difference in Longitude       */\n  double eta;     /* constant - TranMerc_ebs *c *c                   */\n  double eta2;\n  double eta3;\n  double eta4;\n  double s;       /* Sine of latitude                        */\n  double sn;      /* Radius of curvature in the prime vertical       */\n  double t;       /* Tangent of latitude                             */\n  double tan2;\n  double tan3;\n  double tan4;\n  double tan5;\n  double tan6;\n  double t1;      /* Term in coordinate conversion formula - GP to Y */\n  double t2;  
    /* Term in coordinate conversion formula - GP to Y */\n  double t3;      /* Term in coordinate conversion formula - GP to Y */\n  double t4;      /* Term in coordinate conversion formula - GP to Y */\n  double t5;      /* Term in coordinate conversion formula - GP to Y */\n  double t6;      /* Term in coordinate conversion formula - GP to Y */\n  double t7;      /* Term in coordinate conversion formula - GP to Y */\n  double t8;      /* Term in coordinate conversion formula - GP to Y */\n  double t9;      /* Term in coordinate conversion formula - GP to Y */\n  double tmd;     /* True Meridional distance                        */\n  double tmdo;    /* True Meridional distance for latitude of origin */\n  long    Error_Code = TRANMERC_NO_ERROR;\n  double temp_Origin;\n  double temp_Long;\n\n  if ((Latitude < -MAX_LAT) || (Latitude > MAX_LAT))\n  {  /* Latitude out of range */\n    Error_Code|= TRANMERC_LAT_ERROR;\n  }\n  if (Longitude > PI)\n    Longitude -= (2 * PI);\n  if ((Longitude < (TranMerc_Origin_Long - MAX_DELTA_LONG))\n      || (Longitude > (TranMerc_Origin_Long + MAX_DELTA_LONG)))\n  {\n    if (Longitude < 0)\n      temp_Long = Longitude + 2 * PI;\n    else\n      temp_Long = Longitude;\n    if (TranMerc_Origin_Long < 0)\n      temp_Origin = TranMerc_Origin_Long + 2 * PI;\n    else\n      temp_Origin = TranMerc_Origin_Long;\n    if ((temp_Long < (temp_Origin - MAX_DELTA_LONG))\n        || (temp_Long > (temp_Origin + MAX_DELTA_LONG)))\n      Error_Code|= TRANMERC_LON_ERROR;\n  }\n  if (!Error_Code)\n  { /* no errors */\n\n    /* \n     *  Delta Longitude\n     */\n    dlam = Longitude - TranMerc_Origin_Long;\n\n    if (fabs(dlam) > (9.0 * PI / 180))\n    { /* Distortion will result if Longitude is more than 9 degrees from the Central Meridian */\n      Error_Code |= TRANMERC_LON_WARNING;\n    }\n\n    if (dlam > PI)\n      dlam -= (2 * PI);\n    if (dlam < -PI)\n      dlam += (2 * PI);\n    if (fabs(dlam) < 2.e-10)\n      dlam = 0.0;\n\n    s = 
sin(Latitude);\n    c = cos(Latitude);\n    c2 = c * c;\n    c3 = c2 * c;\n    c5 = c3 * c2;\n    c7 = c5 * c2;\n    t = tan (Latitude);\n    tan2 = t * t;\n    tan3 = tan2 * t;\n    tan4 = tan3 * t;\n    tan5 = tan4 * t;\n    tan6 = tan5 * t;\n    eta = TranMerc_ebs * c2;\n    eta2 = eta * eta;\n    eta3 = eta2 * eta;\n    eta4 = eta3 * eta;\n\n    /* radius of curvature in prime vertical */\n    sn = SPHSN(Latitude);\n\n    /* True Meridianal Distances */\n    tmd = SPHTMD(Latitude);\n\n    /*  Origin  */\n    tmdo = SPHTMD (TranMerc_Origin_Lat);\n\n    /* northing */\n    t1 = (tmd - tmdo) * TranMerc_Scale_Factor;\n    t2 = sn * s * c * TranMerc_Scale_Factor/ 2.e0;\n    t3 = sn * s * c3 * TranMerc_Scale_Factor * (5.e0 - tan2 + 9.e0 * eta \n                                                + 4.e0 * eta2) /24.e0; \n\n    t4 = sn * s * c5 * TranMerc_Scale_Factor * (61.e0 - 58.e0 * tan2\n                                                + tan4 + 270.e0 * eta - 330.e0 * tan2 * eta + 445.e0 * eta2\n                                                + 324.e0 * eta3 -680.e0 * tan2 * eta2 + 88.e0 * eta4 \n                                                -600.e0 * tan2 * eta3 - 192.e0 * tan2 * eta4) / 720.e0;\n\n    t5 = sn * s * c7 * TranMerc_Scale_Factor * (1385.e0 - 3111.e0 * \n                                                tan2 + 543.e0 * tan4 - tan6) / 40320.e0;\n\n    *Northing = TranMerc_False_Northing + t1 + pow(dlam,2.e0) * t2\n                + pow(dlam,4.e0) * t3 + pow(dlam,6.e0) * t4\n                + pow(dlam,8.e0) * t5; \n\n    /* Easting */\n    t6 = sn * c * TranMerc_Scale_Factor;\n    t7 = sn * c3 * TranMerc_Scale_Factor * (1.e0 - tan2 + eta ) /6.e0;\n    t8 = sn * c5 * TranMerc_Scale_Factor * (5.e0 - 18.e0 * tan2 + tan4\n                                            + 14.e0 * eta - 58.e0 * tan2 * eta + 13.e0 * eta2 + 4.e0 * eta3 \n                                            - 64.e0 * tan2 * eta2 - 24.e0 * tan2 * eta3 )/ 120.e0;\n    t9 = sn * c7 * 
TranMerc_Scale_Factor * ( 61.e0 - 479.e0 * tan2\n                                             + 179.e0 * tan4 - tan6 ) /5040.e0;\n\n    *Easting = TranMerc_False_Easting + dlam * t6 + pow(dlam,3.e0) * t7 \n               + pow(dlam,5.e0) * t8 + pow(dlam,7.e0) * t9;\n  }\n  return (Error_Code);\n} /* END OF Convert_Geodetic_To_Transverse_Mercator */\n\n\nlong Convert_Transverse_Mercator_To_Geodetic (\n                                             double Easting,\n                                             double Northing,\n                                             double *Latitude,\n                                             double *Longitude)\n{      /* BEGIN Convert_Transverse_Mercator_To_Geodetic */\n\n  /*\n   * The function Convert_Transverse_Mercator_To_Geodetic converts Transverse\n   * Mercator projection (easting and northing) coordinates to geodetic\n   * (latitude and longitude) coordinates, according to the current ellipsoid\n   * and Transverse Mercator projection parameters.  
If any errors occur, the\n   * error code(s) are returned by the function, otherwise TRANMERC_NO_ERROR is\n   * returned.\n   *\n   *    Easting       : Easting/X in meters                         (input)\n   *    Northing      : Northing/Y in meters                        (input)\n   *    Latitude      : Latitude in radians                         (output)\n   *    Longitude     : Longitude in radians                        (output)\n   */\n\n  double c;       /* Cosine of latitude                          */\n  double de;      /* Delta easting - Difference in Easting (Easting-Fe)    */\n  double dlam;    /* Delta longitude - Difference in Longitude       */\n  double eta;     /* constant - TranMerc_ebs *c *c                   */\n  double eta2;\n  double eta3;\n  double eta4;\n  double ftphi;   /* Footpoint latitude                              */\n  int    i;       /* Loop iterator                   */\n  double s;       /* Sine of latitude                        */\n  double sn;      /* Radius of curvature in the prime vertical       */\n  double sr;      /* Radius of curvature in the meridian             */\n  double t;       /* Tangent of latitude                             */\n  double tan2;\n  double tan4;\n  double t10;     /* Term in coordinate conversion formula - GP to Y */\n  double t11;     /* Term in coordinate conversion formula - GP to Y */\n  double t12;     /* Term in coordinate conversion formula - GP to Y */\n  double t13;     /* Term in coordinate conversion formula - GP to Y */\n  double t14;     /* Term in coordinate conversion formula - GP to Y */\n  double t15;     /* Term in coordinate conversion formula - GP to Y */\n  double t16;     /* Term in coordinate conversion formula - GP to Y */\n  double t17;     /* Term in coordinate conversion formula - GP to Y */\n  double tmd;     /* True Meridional distance                        */\n  double tmdo;    /* True Meridional distance for latitude of origin */\n  long Error_Code = 
TRANMERC_NO_ERROR;\n\n  if ((Easting < (TranMerc_False_Easting - TranMerc_Delta_Easting))\n      ||(Easting > (TranMerc_False_Easting + TranMerc_Delta_Easting)))\n  { /* Easting out of range  */\n    Error_Code |= TRANMERC_EASTING_ERROR;\n  }\n  if ((Northing < (TranMerc_False_Northing - TranMerc_Delta_Northing))\n      || (Northing > (TranMerc_False_Northing + TranMerc_Delta_Northing)))\n  { /* Northing out of range */\n    Error_Code |= TRANMERC_NORTHING_ERROR;\n  }\n\n  if (!Error_Code)\n  {\n    /* True Meridional Distances for latitude of origin */\n    tmdo = SPHTMD(TranMerc_Origin_Lat);\n\n    /*  Origin  */\n    tmd = tmdo +  (Northing - TranMerc_False_Northing) / TranMerc_Scale_Factor; \n\n    /* First Estimate */\n    sr = SPHSR(0.e0);\n    ftphi = tmd/sr;\n\n    for (i = 0; i < 5 ; i++)\n    {\n      t10 = SPHTMD (ftphi);\n      sr = SPHSR(ftphi);\n      ftphi = ftphi + (tmd - t10) / sr;\n    }\n\n    /* Radius of Curvature in the meridian */\n    sr = SPHSR(ftphi);\n\n    /* Radius of Curvature in the prime vertical */\n    sn = SPHSN(ftphi);\n\n    /* Sine Cosine terms */\n    s = sin(ftphi);\n    c = cos(ftphi);\n\n    /* Tangent Value  */\n    t = tan(ftphi);\n    tan2 = t * t;\n    tan4 = tan2 * tan2;\n    eta = TranMerc_ebs * pow(c,2);\n    eta2 = eta * eta;\n    eta3 = eta2 * eta;\n    eta4 = eta3 * eta;\n    de = Easting - TranMerc_False_Easting;\n    if (fabs(de) < 0.0001)\n      de = 0.0;\n\n    /* Latitude */\n    t10 = t / (2.e0 * sr * sn * pow(TranMerc_Scale_Factor, 2));\n    t11 = t * (5.e0  + 3.e0 * tan2 + eta - 4.e0 * pow(eta,2)\n               - 9.e0 * tan2 * eta) / (24.e0 * sr * pow(sn,3) \n                                       * pow(TranMerc_Scale_Factor,4));\n    t12 = t * (61.e0 + 90.e0 * tan2 + 46.e0 * eta + 45.E0 * tan4\n               - 252.e0 * tan2 * eta  - 3.e0 * eta2 + 100.e0 \n               * eta3 - 66.e0 * tan2 * eta2 - 90.e0 * tan4\n               * eta + 88.e0 * eta4 + 225.e0 * tan4 * eta2\n               + 84.e0 * tan2* eta3 
- 192.e0 * tan2 * eta4)\n          / ( 720.e0 * sr * pow(sn,5) * pow(TranMerc_Scale_Factor, 6) );\n    t13 = t * ( 1385.e0 + 3633.e0 * tan2 + 4095.e0 * tan4 + 1575.e0 \n                * pow(t,6))/ (40320.e0 * sr * pow(sn,7) * pow(TranMerc_Scale_Factor,8));\n    *Latitude = ftphi - pow(de,2) * t10 + pow(de,4) * t11 - pow(de,6) * t12 \n                + pow(de,8) * t13;\n\n    t14 = 1.e0 / (sn * c * TranMerc_Scale_Factor);\n\n    t15 = (1.e0 + 2.e0 * tan2 + eta) / (6.e0 * pow(sn,3) * c * \n                                        pow(TranMerc_Scale_Factor,3));\n\n    t16 = (5.e0 + 6.e0 * eta + 28.e0 * tan2 - 3.e0 * eta2\n           + 8.e0 * tan2 * eta + 24.e0 * tan4 - 4.e0 \n           * eta3 + 4.e0 * tan2 * eta2 + 24.e0 \n           * tan2 * eta3) / (120.e0 * pow(sn,5) * c  \n                             * pow(TranMerc_Scale_Factor,5));\n\n    t17 = (61.e0 +  662.e0 * tan2 + 1320.e0 * tan4 + 720.e0 \n           * pow(t,6)) / (5040.e0 * pow(sn,7) * c \n                          * pow(TranMerc_Scale_Factor,7));\n\n    /* Difference in Longitude */\n    dlam = de * t14 - pow(de,3) * t15 + pow(de,5) * t16 - pow(de,7) * t17;\n\n    /* Longitude */\n    (*Longitude) = TranMerc_Origin_Long + dlam;\n\n    if((fabs)(*Latitude) > (90.0 * PI / 180.0))\n      Error_Code |= TRANMERC_NORTHING_ERROR;\n\n    if((*Longitude) > (PI))\n    {\n      *Longitude -= (2 * PI);\n      if((fabs)(*Longitude) > PI)\n        Error_Code |= TRANMERC_EASTING_ERROR;\n    }\n    else if((*Longitude) < (-PI))\n    {\n      *Longitude += (2 * PI);\n      if((fabs)(*Longitude) > PI)\n        Error_Code |= TRANMERC_EASTING_ERROR;\n    }\n\n    if (fabs(dlam) > (9.0 * PI / 180) * cos(*Latitude))\n    { /* Distortion will result if Longitude is more than 9 degrees from the Central Meridian at the equator */\n      /* and decreases to 0 degrees at the poles */\n      /* As you move towards the poles, distortion will become more significant */\n      Error_Code |= TRANMERC_LON_WARNING;\n    }\n  }\n  return 
(Error_Code);\n} /* END OF Convert_Transverse_Mercator_To_Geodetic */\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/tranmerc.h",
    "content": "#ifndef TRANMERC_H\n#define TRANMERC_H\n\n/***************************************************************************/\n/* RSC IDENTIFIER: TRANSVERSE MERCATOR\n *\n * ABSTRACT\n *\n *    This component provides conversions between Geodetic coordinates\n *    (latitude and longitude) and Transverse Mercator projection coordinates\n *    (easting and northing).\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an invalid value\n *    is found the error code is combined with the current error code using\n *    the bitwise or.  This combining allows multiple error codes to be\n *    returned. The possible error codes are:\n *\n *       TRANMERC_NO_ERROR           : No errors occurred in function\n *       TRANMERC_LAT_ERROR          : Latitude outside of valid range\n *                                      (-90 to 90 degrees)\n *       TRANMERC_LON_ERROR          : Longitude outside of valid range\n *                                      (-180 to 360 degrees, and within\n *                                        +/-90 of Central Meridian)\n *       TRANMERC_EASTING_ERROR      : Easting outside of valid range\n *                                      (depending on ellipsoid and\n *                                       projection parameters)\n *       TRANMERC_NORTHING_ERROR     : Northing outside of valid range\n *                                      (depending on ellipsoid and\n *                                       projection parameters)\n *       TRANMERC_ORIGIN_LAT_ERROR   : Origin latitude outside of valid range\n *                                      (-90 to 90 degrees)\n *       TRANMERC_CENT_MER_ERROR     : Central meridian outside of valid range\n *                                      (-180 to 360 degrees)\n *       TRANMERC_A_ERROR            : Semi-major axis less than or equal to\n *zero TRANMERC_INV_F_ERROR        : Inverse flattening outside of valid range\n *\t\t\t\t\t\t\t\t  \t                  
(250 to 350)\n *       TRANMERC_SCALE_FACTOR_ERROR : Scale factor outside of valid\n *                                     range (0.3 to 3.0)\n *       TRANMERC_LON_WARNING        : Distortion will result if longitude is\n *                                      more than 9 degrees from the Central\n *                                      Meridian\n *\n * REUSE NOTES\n *\n *    TRANSVERSE MERCATOR is intended for reuse by any application that\n *    performs a Transverse Mercator projection or its inverse.\n *\n * REFERENCES\n *\n *    Further information on TRANSVERSE MERCATOR can be found in the\n *    Reuse Manual.\n *\n *    TRANSVERSE MERCATOR originated from :\n *                      U.S. Army Topographic Engineering Center\n *                      Geospatial Information Division\n *                      7701 Telegraph Road\n *                      Alexandria, VA  22310-3864\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n * RESTRICTIONS\n *\n *    TRANSVERSE MERCATOR has no restrictions.\n *\n * ENVIRONMENT\n *\n *    TRANSVERSE MERCATOR was tested and certified in the following\n *    environments:\n *\n *    1. Solaris 2.5 with GCC, version 2.8.1\n *    2. 
Windows 95 with MS Visual C++, version 6\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    10-02-97          Original Code\n *    03-02-97          Re-engineered Code\n *\n */\n\n/***************************************************************************/\n/*\n *                              DEFINES\n */\n\n#define TRANMERC_NO_ERROR 0x0000\n#define TRANMERC_LAT_ERROR 0x0001\n#define TRANMERC_LON_ERROR 0x0002\n#define TRANMERC_EASTING_ERROR 0x0004\n#define TRANMERC_NORTHING_ERROR 0x0008\n#define TRANMERC_ORIGIN_LAT_ERROR 0x0010\n#define TRANMERC_CENT_MER_ERROR 0x0020\n#define TRANMERC_A_ERROR 0x0040\n#define TRANMERC_INV_F_ERROR 0x0080\n#define TRANMERC_SCALE_FACTOR_ERROR 0x0100\n#define TRANMERC_LON_WARNING 0x0200\n\n/***************************************************************************/\n/*\n *                              FUNCTION PROTOTYPES\n *                                for TRANMERC.C\n */\n\n/* ensure proper linkage to c++ programs */\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nlong Set_Transverse_Mercator_Parameters(\n    double a, double f, double Origin_Latitude, double Central_Meridian,\n    double False_Easting, double False_Northing, double Scale_Factor);\n/*\n * The function Set_Tranverse_Mercator_Parameters receives the ellipsoid\n * parameters and Tranverse Mercator projection parameters as inputs, and\n * sets the corresponding state variables. 
If any errors occur, the error\n * code(s) are returned by the function, otherwise TRANMERC_NO_ERROR is\n * returned.\n *\n *    a                 : Semi-major axis of ellipsoid, in meters    (input)\n *    f                 : Flattening of ellipsoid                    (input)\n *    Origin_Latitude   : Latitude in radians at the origin of the   (input)\n *                         projection\n *    Central_Meridian  : Longitude in radians at the center of the  (input)\n *                         projection\n *    False_Easting     : Easting/X at the center of the projection  (input)\n *    False_Northing    : Northing/Y at the center of the projection (input)\n *    Scale_Factor      : Projection scale factor                    (input)\n */\n\nvoid Get_Transverse_Mercator_Parameters(\n    double* a, double* f, double* Origin_Latitude, double* Central_Meridian,\n    double* False_Easting, double* False_Northing, double* Scale_Factor);\n/*\n * The function Get_Transverse_Mercator_Parameters returns the current\n * ellipsoid and Transverse Mercator projection parameters.\n *\n *    a                 : Semi-major axis of ellipsoid, in meters    (output)\n *    f                 : Flattening of ellipsoid                    (output)\n *    Origin_Latitude   : Latitude in radians at the origin of the   (output)\n *                         projection\n *    Central_Meridian  : Longitude in radians at the center of the  (output)\n *                         projection\n *    False_Easting     : Easting/X at the center of the projection  (output)\n *    False_Northing    : Northing/Y at the center of the projection (output)\n *    Scale_Factor      : Projection scale factor                    (output)\n */\n\nlong Convert_Geodetic_To_Transverse_Mercator(double Latitude, double Longitude,\n                                             double* Easting, double* Northing);\n\n/*\n * The function Convert_Geodetic_To_Transverse_Mercator converts geodetic\n * (latitude and longitude) 
coordinates to Transverse Mercator projection\n * (easting and northing) coordinates, according to the current ellipsoid\n * and Transverse Mercator projection coordinates.  If any errors occur, the\n * error code(s) are returned by the function, otherwise TRANMERC_NO_ERROR is\n * returned.\n *\n *    Latitude      : Latitude in radians                         (input)\n *    Longitude     : Longitude in radians                        (input)\n *    Easting       : Easting/X in meters                         (output)\n *    Northing      : Northing/Y in meters                        (output)\n */\n\nlong Convert_Transverse_Mercator_To_Geodetic(double Easting, double Northing,\n                                             double* Latitude,\n                                             double* Longitude);\n\n/*\n * The function Convert_Transverse_Mercator_To_Geodetic converts Transverse\n * Mercator projection (easting and northing) coordinates to geodetic\n * (latitude and longitude) coordinates, according to the current ellipsoid\n * and Transverse Mercator projection parameters.  If any errors occur, the\n * error code(s) are returned by the function, otherwise TRANMERC_NO_ERROR is\n * returned.\n *\n *    Easting       : Easting/X in meters                         (input)\n *    Northing      : Northing/Y in meters                        (input)\n *    Latitude      : Latitude in radians                         (output)\n *    Longitude     : Longitude in radians                        (output)\n */\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* TRANMERC_H */\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/ups.c",
    "content": "/********************************************************************/\n/* RSC IDENTIFIER: UPS\n *\n *\n * ABSTRACT\n *\n *    This component provides conversions between geodetic (latitude\n *    and longitude) coordinates and Universal Polar Stereographic (UPS)\n *    projection (hemisphere, easting, and northing) coordinates.\n *\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an \n *    invalid value is found the error code is combined with the \n *    current error code using the bitwise or.  This combining allows  \n *    multiple error codes to be returned. The possible error codes \n *    are:\n *\n *         UPS_NO_ERROR           : No errors occurred in function\n *         UPS_LAT_ERROR          : Latitude outside of valid range\n *                                   (North Pole: 83.5 to 90,\n *                                    South Pole: -79.5 to -90)\n *         UPS_LON_ERROR          : Longitude outside of valid range\n *                                   (-180 to 360 degrees)\n *         UPS_HEMISPHERE_ERROR   : Invalid hemisphere ('N' or 'S')\n *         UPS_EASTING_ERROR      : Easting outside of valid range,\n *                                   (0 to 4,000,000m)\n *         UPS_NORTHING_ERROR     : Northing outside of valid range,\n *                                   (0 to 4,000,000m)\n *         UPS_A_ERROR            : Semi-major axis less than or equal to zero\n *         UPS_INV_F_ERROR        : Inverse flattening outside of valid range\n *\t\t\t\t\t\t\t\t  \t               (250 to 350)\n *\n *\n * REUSE NOTES\n *\n *    UPS is intended for reuse by any application that performs a Universal\n *    Polar Stereographic (UPS) projection.\n *\n *\n * REFERENCES\n *\n *    Further information on UPS can be found in the Reuse Manual.\n *\n *    UPS originated from :  U.S. 
Army Topographic Engineering Center\n *                           Geospatial Information Division\n *                           7701 Telegraph Road\n *                           Alexandria, VA  22310-3864\n *\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n *\n * RESTRICTIONS\n *\n *    UPS has no restrictions.\n *\n *\n * ENVIRONMENT\n *\n *    UPS was tested and certified in the following environments:\n *\n *    1. Solaris 2.5 with GCC version 2.8.1\n *    2. Windows 95 with MS Visual C++ version 6\n *\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    06-11-95          Original Code\n *    03-01-97          Original Code\n *\n *\n */\n\n\n/************************************************************************/\n/*\n *                               INCLUDES\n */\n\n#include <math.h>\n#include \"polarst.h\"\n#include \"ups.h\"\n/*\n *    math.h     - Is needed to call the math functions.\n *    polarst.h  - Is used to convert polar stereographic coordinates\n *    ups.h      - Defines the function prototypes for the ups module.\n */\n\n\n/************************************************************************/\n/*                               GLOBAL DECLARATIONS\n *\n */\n\n#define PI       3.14159265358979323e0  /* PI     */\n#define PI_OVER    (PI/2.0e0)           /* PI over 2 */\n#define MAX_LAT    ((PI * 90)/180.0)    /* 90 degrees in radians */\n#define MAX_ORIGIN_LAT ((81.114528 * PI) / 180.0)\n#define MIN_NORTH_LAT (83.5*PI/180.0)\n#define MIN_SOUTH_LAT (-79.5*PI/180.0)\n#define MIN_EAST_NORTH 0\n#define MAX_EAST_NORTH 4000000\n\n/* Ellipsoid Parameters, default to WGS 84  */\nstatic double UPS_a = 6378137.0;          /* Semi-major axis of ellipsoid in meters   */\nstatic double UPS_f = 1 / 298.257223563;  /* Flattening of ellipsoid  */\nconst double UPS_False_Easting = 2000000;\nconst double UPS_False_Northing = 2000000;\nstatic double UPS_Origin_Latitude = MAX_ORIGIN_LAT;  /*set 
default = North Hemisphere */\nstatic double UPS_Origin_Longitude = 0.0;\n\n\n/************************************************************************/\n/*                              FUNCTIONS\n *\n */\n\n\nlong Set_UPS_Parameters( double a,\n                         double f)\n{\n/*\n * The function SET_UPS_PARAMETERS receives the ellipsoid parameters and sets\n * the corresponding state variables. If any errors occur, the error code(s)\n * are returned by the function, otherwise UPS_NO_ERROR is returned.\n *\n *   a     : Semi-major axis of ellipsoid in meters (input)\n *   f     : Flattening of ellipsoid\t\t\t\t\t      (input)\n */\n\n  double inv_f = 1 / f;\n  long Error_Code = UPS_NO_ERROR;\n\n  if (a <= 0.0)\n  { /* Semi-major axis must be greater than zero */\n    Error_Code |= UPS_A_ERROR;\n  }\n  if ((inv_f < 250) || (inv_f > 350))\n  { /* Inverse flattening must be between 250 and 350 */\n    Error_Code |= UPS_INV_F_ERROR;\n  }\n\n  if (!Error_Code)\n  { /* no errors */\n    UPS_a = a;\n    UPS_f = f;\n  }\n  return (Error_Code);\n}  /* END of Set_UPS_Parameters  */\n\n\nvoid Get_UPS_Parameters( double *a,\n                         double *f)\n{\n/*\n * The function Get_UPS_Parameters returns the current ellipsoid parameters.\n *\n *  a      : Semi-major axis of ellipsoid, in meters (output)\n *  f      : Flattening of ellipsoid\t\t\t\t\t       (output)\n */\n\n  *a = UPS_a;\n  *f = UPS_f;\n  return;\n} /* END OF Get_UPS_Parameters */\n\n\nlong Convert_Geodetic_To_UPS ( double Latitude,\n                               double Longitude,\n                               char   *Hemisphere,\n                               double *Easting,\n                               double *Northing)\n{\n/*\n *  The function Convert_Geodetic_To_UPS converts geodetic (latitude and\n *  longitude) coordinates to UPS (hemisphere, easting, and northing)\n *  coordinates, according to the current ellipsoid parameters. 
If any \n *  errors occur, the error code(s) are returned by the function, \n *  otherwise UPS_NO_ERROR is returned.\n *\n *    Latitude      : Latitude in radians                       (input)\n *    Longitude     : Longitude in radians                      (input)\n *    Hemisphere    : Hemisphere either 'N' or 'S'              (output)\n *    Easting       : Easting/X in meters                       (output)\n *    Northing      : Northing/Y in meters                      (output)\n */\n\n  double tempEasting, tempNorthing;\n  long Error_Code = UPS_NO_ERROR;\n\n  if ((Latitude < -MAX_LAT) || (Latitude > MAX_LAT))\n  {   /* latitude out of range */\n    Error_Code |= UPS_LAT_ERROR;\n  }\n  if ((Latitude < 0) && (Latitude > MIN_SOUTH_LAT))\n    Error_Code |= UPS_LAT_ERROR;\n  if ((Latitude >= 0) && (Latitude < MIN_NORTH_LAT))\n    Error_Code |= UPS_LAT_ERROR;\n  if ((Longitude < -PI) || (Longitude > (2 * PI)))\n  {  /* longitude out of range */\n    Error_Code |= UPS_LON_ERROR;\n  }\n\n  if (!Error_Code)\n  {  /* no errors */\n    if (Latitude < 0)\n    {\n      UPS_Origin_Latitude = -MAX_ORIGIN_LAT; \n      *Hemisphere = 'S';\n    }\n    else\n    {\n      UPS_Origin_Latitude = MAX_ORIGIN_LAT; \n      *Hemisphere = 'N';\n    }\n\n\n    Set_Polar_Stereographic_Parameters( UPS_a,\n                                        UPS_f,\n                                        UPS_Origin_Latitude,\n                                        UPS_Origin_Longitude,\n                                        UPS_False_Easting,\n                                        UPS_False_Northing);\n\n    Convert_Geodetic_To_Polar_Stereographic(Latitude,\n                                            Longitude,\n                                            &tempEasting,\n                                            &tempNorthing);\n\n    *Easting = tempEasting;\n    *Northing = tempNorthing;\n  }  /*  END of if(!Error_Code)   */\n\n  return Error_Code;\n}  /* END OF Convert_Geodetic_To_UPS  */\n\n\nlong Convert_UPS_To_Geodetic(char   Hemisphere,\n                             double Easting,\n                             double Northing,\n                             double *Latitude,\n                             double *Longitude)\n{\n/*\n *  The function Convert_UPS_To_Geodetic converts UPS (hemisphere, easting, \n *  and northing) coordinates to geodetic (latitude and longitude) coordinates\n *  according to the current ellipsoid parameters.  If any errors occur, the \n *  error code(s) are returned by the function, otherwise UPS_NO_ERROR is \n *  returned.\n *\n *    Hemisphere    : Hemisphere either 'N' or 'S'              (input)\n *    Easting       : Easting/X in meters                       (input)\n *    Northing      : Northing/Y in meters                      (input)\n *    Latitude      : Latitude in radians                       (output)\n *    Longitude     : Longitude in radians                      (output)\n */\n\n  long Error_Code = UPS_NO_ERROR;\n\n  if ((Hemisphere != 'N') && (Hemisphere != 'S'))\n    Error_Code |= UPS_HEMISPHERE_ERROR;\n  if ((Easting < MIN_EAST_NORTH) || (Easting > MAX_EAST_NORTH))\n    Error_Code |= UPS_EASTING_ERROR;\n  if ((Northing < MIN_EAST_NORTH) || (Northing > MAX_EAST_NORTH))\n    Error_Code |= UPS_NORTHING_ERROR;\n\n  if (Hemisphere =='N')\n  {UPS_Origin_Latitude = MAX_ORIGIN_LAT;}\n  if (Hemisphere =='S')\n  {UPS_Origin_Latitude = -MAX_ORIGIN_LAT;}\n\n  if (!Error_Code)\n  {   /*  no errors   */\n    Set_Polar_Stereographic_Parameters( UPS_a,\n                                        UPS_f,\n                                        UPS_Origin_Latitude,\n                                        UPS_Origin_Longitude,\n                                        UPS_False_Easting,\n                                        UPS_False_Northing);\n\n\n\n    Convert_Polar_Stereographic_To_Geodetic( Easting,\n                                             Northing,\n                                             Latitude,\n            
                                 Longitude); \n\n\n    if ((*Latitude < 0) && (*Latitude > MIN_SOUTH_LAT))\n      Error_Code |= UPS_LAT_ERROR;\n    if ((*Latitude >= 0) && (*Latitude < MIN_NORTH_LAT))\n      Error_Code |= UPS_LAT_ERROR;\n  }  /*  END OF if(!Error_Code) */\n  return (Error_Code);\n}  /*  END OF Convert_UPS_To_Geodetic  */ \n\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/ups.h",
    "content": "#ifndef UPS_H\n#define UPS_H\n/********************************************************************/\n/* RSC IDENTIFIER: UPS\n *\n *\n * ABSTRACT\n *\n *    This component provides conversions between geodetic (latitude\n *    and longitude) coordinates and Universal Polar Stereographic (UPS)\n *    projection (hemisphere, easting, and northing) coordinates.\n *\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an\n *    invalid value is found the error code is combined with the\n *    current error code using the bitwise or.  This combining allows\n *    multiple error codes to be returned. The possible error codes\n *    are:\n *\n *         UPS_NO_ERROR           : No errors occurred in function\n *         UPS_LAT_ERROR          : Latitude outside of valid range\n *                                   (North Pole: 83.5 to 90,\n *                                    South Pole: -79.5 to -90)\n *         UPS_LON_ERROR          : Longitude outside of valid range\n *                                   (-180 to 360 degrees)\n *         UPS_HEMISPHERE_ERROR   : Invalid hemisphere ('N' or 'S')\n *         UPS_EASTING_ERROR      : Easting outside of valid range,\n *                                   (0 to 4,000,000m)\n *         UPS_NORTHING_ERROR     : Northing outside of valid range,\n *                                   (0 to 4,000,000m)\n *         UPS_A_ERROR            : Semi-major axis less than or equal to zero\n *         UPS_INV_F_ERROR        : Inverse flattening outside of valid range\n *\t\t\t\t\t\t\t\t  \t               (250 to 350)\n *\n *\n * REUSE NOTES\n *\n *    UPS is intended for reuse by any application that performs a Universal\n *    Polar Stereographic (UPS) projection.\n *\n *\n * REFERENCES\n *\n *    Further information on UPS can be found in the Reuse Manual.\n *\n *    UPS originated from :  U.S. 
Army Topographic Engineering Center\n *                           Geospatial Information Division\n *                           7701 Telegraph Road\n *                           Alexandria, VA  22310-3864\n *\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n *\n * RESTRICTIONS\n *\n *    UPS has no restrictions.\n *\n *\n * ENVIRONMENT\n *\n *    UPS was tested and certified in the following environments:\n *\n *    1. Solaris 2.5 with GCC version 2.8.1\n *    2. Windows 95 with MS Visual C++ version 6\n *\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    06-11-95          Original Code\n *    03-01-97          Original Code\n *\n *\n */\n\n/**********************************************************************/\n/*\n *                        DEFINES\n */\n\n#define UPS_NO_ERROR 0x0000\n#define UPS_LAT_ERROR 0x0001\n#define UPS_LON_ERROR 0x0002\n#define UPS_HEMISPHERE_ERROR 0x0004\n#define UPS_EASTING_ERROR 0x0008\n#define UPS_NORTHING_ERROR 0x0010\n#define UPS_A_ERROR 0x0020\n#define UPS_INV_F_ERROR 0x0040\n\n/**********************************************************************/\n/*\n *                        FUNCTION PROTOTYPES\n *                          for UPS.C\n */\n\n/* ensure proper linkage to c++ programs */\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nlong Set_UPS_Parameters(double a, double f);\n/*\n * The function SET_UPS_PARAMETERS receives the ellipsoid parameters and sets\n * the corresponding state variables. 
If any errors occur, the error code(s)\n * are returned by the function, otherwise UPS_NO_ERROR is returned.\n *\n *   a     : Semi-major axis of ellipsoid in meters (input)\n *   f     : Flattening of ellipsoid                (input)\n */\n\nvoid Get_UPS_Parameters(double* a, double* f);\n/*\n * The function Get_UPS_Parameters returns the current ellipsoid parameters.\n *\n *  a      : Semi-major axis of ellipsoid, in meters (output)\n *  f      : Flattening of ellipsoid                 (output)\n */\n\nlong Convert_Geodetic_To_UPS(double Latitude, double Longitude,\n                             char* Hemisphere, double* Easting,\n                             double* Northing);\n/*\n *  The function Convert_Geodetic_To_UPS converts geodetic (latitude and\n *  longitude) coordinates to UPS (hemisphere, easting, and northing)\n *  coordinates, according to the current ellipsoid parameters. If any\n *  errors occur, the error code(s) are returned by the function,\n *  otherwise UPS_NO_ERROR is returned.\n *\n *    Latitude      : Latitude in radians                       (input)\n *    Longitude     : Longitude in radians                      (input)\n *    Hemisphere    : Hemisphere either 'N' or 'S'              (output)\n *    Easting       : Easting/X in meters                       (output)\n *    Northing      : Northing/Y in meters                      (output)\n */\n\nlong Convert_UPS_To_Geodetic(char Hemisphere, double Easting, double Northing,\n                             double* Latitude, double* Longitude);\n\n/*\n *  The function Convert_UPS_To_Geodetic converts UPS (hemisphere, easting,\n *  and northing) coordinates to geodetic (latitude and longitude) coordinates\n *  according to the current ellipsoid parameters.  
If any errors occur, the\n *  error code(s) are returned by the function, otherwise UPS_NO_ERROR is\n *  returned.\n *\n *    Hemisphere    : Hemisphere either 'N' or 'S'              (input)\n *    Easting       : Easting/X in meters                       (input)\n *    Northing      : Northing/Y in meters                      (input)\n *    Latitude      : Latitude in radians                       (output)\n *    Longitude     : Longitude in radians                      (output)\n */\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* UPS_H  */\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/utm.c",
    "content": "/***************************************************************************/\n/* RSC IDENTIFIER: UTM\n *\n * ABSTRACT\n *\n *    This component provides conversions between geodetic coordinates \n *    (latitude and longitudes) and Universal Transverse Mercator (UTM)\n *    projection (zone, hemisphere, easting, and northing) coordinates.\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an invalid value\n *    is found, the error code is combined with the current error code using \n *    the bitwise or.  This combining allows multiple error codes to be\n *    returned. The possible error codes are:\n *\n *          UTM_NO_ERROR           : No errors occurred in function\n *          UTM_LAT_ERROR          : Latitude outside of valid range\n *                                    (-80.5 to 84.5 degrees)\n *          UTM_LON_ERROR          : Longitude outside of valid range\n *                                    (-180 to 360 degrees)\n *          UTM_EASTING_ERROR      : Easting outside of valid range\n *                                    (100,000 to 900,000 meters)\n *          UTM_NORTHING_ERROR     : Northing outside of valid range\n *                                    (0 to 10,000,000 meters)\n *          UTM_ZONE_ERROR         : Zone outside of valid range (1 to 60)\n *          UTM_HEMISPHERE_ERROR   : Invalid hemisphere ('N' or 'S')\n *          UTM_ZONE_OVERRIDE_ERROR: Zone outside of valid range\n *                                    (1 to 60) and within 1 of 'natural' zone\n *          UTM_A_ERROR            : Semi-major axis less than or equal to zero\n *          UTM_INV_F_ERROR        : Inverse flattening outside of valid range\n *\t\t\t\t\t\t\t\t  \t                (250 to 350)\n *\n * REUSE NOTES\n *\n *    UTM is intended for reuse by any application that performs a Universal\n *    Transverse Mercator (UTM) projection or its inverse.\n *    \n * REFERENCES\n *\n *    Further information on UTM 
can be found in the Reuse Manual.\n *\n *    UTM originated from :  U.S. Army Topographic Engineering Center\n *                           Geospatial Information Division\n *                           7701 Telegraph Road\n *                           Alexandria, VA  22310-3864\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n * RESTRICTIONS\n *\n *    UTM has no restrictions.\n *\n * ENVIRONMENT\n *\n *    UTM was tested and certified in the following environments:\n *\n *    1. Solaris 2.5 with GCC, version 2.8.1\n *    2. MSDOS with MS Visual C++, version 6\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    10-02-97          Original Code\n *\n */\n\n\n/***************************************************************************/\n/*\n *                              INCLUDES\n */\n#include <stdio.h>\n#include \"tranmerc.h\"\n#include \"utm.h\"\n/*\n *    tranmerc.h    - Is used to convert transverse mercator coordinates\n *    utm.h         - Defines the function prototypes for the utm module.\n */\n\n\n/***************************************************************************/\n/*\n *                              DEFINES\n */\n\n#define PI           3.14159265358979323e0    /* PI                        */\n#define MIN_LAT      ( (-80.5 * PI) / 180.0 ) /* -80.5 degrees in radians    */\n#define MAX_LAT      ( (84.5 * PI) / 180.0 )  /* 84.5 degrees in radians     */\n#define MIN_EASTING  100000\n#define MAX_EASTING  900000\n#define MIN_NORTHING 0\n#define MAX_NORTHING 10000000\n\n/***************************************************************************/\n/*\n *                              GLOBAL DECLARATIONS\n */\n\nstatic double UTM_a = 6378137.0;         /* Semi-major axis of ellipsoid in meters  */\nstatic double UTM_f = 1 / 298.257223563; /* Flattening of ellipsoid                 */\nstatic long   UTM_Override = 0;          /* Zone override flag                      
*/\n\n\n/***************************************************************************/\n/*\n *                                FUNCTIONS\n *\n */\n\nlong Set_UTM_Parameters(double a,      \n                        double f,\n                        long   override)\n{\n/*\n * The function Set_UTM_Parameters receives the ellipsoid parameters and\n * UTM zone override parameter as inputs, and sets the corresponding state\n * variables.  If any errors occur, the error code(s) are returned by the \n * function, otherwise UTM_NO_ERROR is returned.\n *\n *    a                 : Semi-major axis of ellipsoid, in meters       (input)\n *    f                 : Flattening of ellipsoid\t\t\t\t\t\t            (input)\n *    override          : UTM override zone, zero indicates no override (input)\n */\n\n  double inv_f = 1 / f;\n  long Error_Code = UTM_NO_ERROR;\n\n  if (a <= 0.0)\n  { /* Semi-major axis must be greater than zero */\n    Error_Code |= UTM_A_ERROR;\n  }\n  if ((inv_f < 250) || (inv_f > 350))\n  { /* Inverse flattening must be between 250 and 350 */\n    Error_Code |= UTM_INV_F_ERROR;\n  }\n  if ((override < 0) || (override > 60))\n  {\n    Error_Code |= UTM_ZONE_OVERRIDE_ERROR;\n  }\n  if (!Error_Code)\n  { /* no errors */\n    UTM_a = a;\n    UTM_f = f;\n    UTM_Override = override;\n  }\n  return (Error_Code);\n} /* END OF Set_UTM_Parameters */\n\n\nvoid Get_UTM_Parameters(double *a,\n                        double *f,\n                        long   *override)\n{\n/*\n * The function Get_UTM_Parameters returns the current ellipsoid\n * parameters and UTM zone override parameter.\n *\n *    a                 : Semi-major axis of ellipsoid, in meters       (output)\n *    f                 : Flattening of ellipsoid\t\t\t\t\t\t            (output)\n *    override          : UTM override zone, zero indicates no override (output)\n */\n\n  *a = UTM_a;\n  *f = UTM_f;\n  *override = UTM_Override;\n} /* END OF Get_UTM_Parameters */\n\n\nlong Convert_Geodetic_To_UTM 
(double Latitude,\n                              double Longitude,\n                              long   *Zone,\n                              char   *Hemisphere,\n                              double *Easting,\n                              double *Northing)\n{ \n/*\n * The function Convert_Geodetic_To_UTM converts geodetic (latitude and\n * longitude) coordinates to UTM projection (zone, hemisphere, easting and\n * northing) coordinates according to the current ellipsoid and UTM zone\n * override parameters.  If any errors occur, the error code(s) are returned\n * by the function, otherwise UTM_NO_ERROR is returned.\n *\n *    Latitude          : Latitude in radians                 (input)\n *    Longitude         : Longitude in radians                (input)\n *    Zone              : UTM zone                            (output)\n *    Hemisphere        : North or South hemisphere           (output)\n *    Easting           : Easting (X) in meters               (output)\n *    Northing          : Northing (Y) in meters              (output)\n */\n\n  long Lat_Degrees;\n  long Long_Degrees;\n  long temp_zone;\n  long Error_Code = UTM_NO_ERROR;\n  double Origin_Latitude = 0;\n  double Central_Meridian = 0;\n  double False_Easting = 500000;\n  double False_Northing = 0;\n  double Scale = 0.9996;\n\n    if ((Latitude < MIN_LAT) || (Latitude > MAX_LAT))\n  { /* Latitude out of range */\n      Error_Code |= UTM_LAT_ERROR;\n  }\n    if ((Longitude < -PI) || (Longitude > (2*PI)))\n  { /* Longitude out of range */\n      Error_Code |= UTM_LON_ERROR;\n  }\n    if (!Error_Code)\n  { /* no errors */\n      if((Latitude > -1.0e-9) && (Latitude < 0)) {\n          Latitude = 0.0;\n      }\n      if (Longitude < 0) {\n          Longitude += (2*PI) + 1.0e-10;\n      }\n\n      Lat_Degrees = (long)(Latitude * 180.0 / PI);\n      Long_Degrees = (long)(Longitude * 180.0 / PI);\n\n      if (Longitude < PI) {\n          temp_zone = (long)(31 + ((Longitude * 180.0 / PI) / 6.0));\n     
 }\n    else {\n          temp_zone = (long)(((Longitude * 180.0 / PI) / 6.0) - 29);\n      }\n\n      if (temp_zone > 60) {\n          temp_zone = 1;\n      }\n      /* UTM special cases */\n      if ((Lat_Degrees > 55) && (Lat_Degrees < 64) && (Long_Degrees > -1)\n        && (Long_Degrees < 3)) {\n          temp_zone = 31;\n      }\n      if ((Lat_Degrees > 55) && (Lat_Degrees < 64) && (Long_Degrees > 2)\n        && (Long_Degrees < 12)) {\n          temp_zone = 32;\n      }\n      if ((Lat_Degrees > 71) && (Long_Degrees > -1) && (Long_Degrees < 9)) {\n          temp_zone = 31;\n      }\n      if ((Lat_Degrees > 71) && (Long_Degrees > 8) && (Long_Degrees < 21)) {\n          temp_zone = 33;\n      }\n      if ((Lat_Degrees > 71) && (Long_Degrees > 20) && (Long_Degrees < 33)) {\n          temp_zone = 35;\n      }\n      if ((Lat_Degrees > 71) && (Long_Degrees > 32) && (Long_Degrees < 42)) {\n          temp_zone = 37;\n      }\n\n      if (UTM_Override)\n    {\n        if ((temp_zone == 1) && (UTM_Override == 60)) {\n            temp_zone = UTM_Override;\n        }\n      else if ((temp_zone == 60) && (UTM_Override == 1)) {\n            temp_zone = UTM_Override;\n        }\n      else if ((Lat_Degrees > 71) && (Long_Degrees > -1) && (Long_Degrees < 42))\n      {\n          if (((temp_zone-2) <= UTM_Override) && (UTM_Override <= (temp_zone+2))) {\n              temp_zone = UTM_Override;\n          }\n        else {\n              Error_Code = UTM_ZONE_OVERRIDE_ERROR;\n          }\n      }\n      else if (((temp_zone-1) <= UTM_Override) && (UTM_Override <= (temp_zone+1))) {\n            temp_zone = UTM_Override;\n        }\n      else {\n            Error_Code = UTM_ZONE_OVERRIDE_ERROR;\n        }\n    }\n      if (!Error_Code)\n    {\n        if (temp_zone >= 31) {\n            Central_Meridian = (6 * temp_zone - 183) * PI / 180.0;\n        }\n      else {\n            Central_Meridian = (6 * temp_zone + 177) * PI / 180.0;\n        }\n        *Zone = temp_zone;\n      
  if (Latitude < 0)\n      {\n          False_Northing = 10000000;\n          *Hemisphere = 'S';\n      }\n      else {\n            *Hemisphere = 'N';\n        }\n        Set_Transverse_Mercator_Parameters(UTM_a, UTM_f, Origin_Latitude,\n                                         Central_Meridian, False_Easting, False_Northing, Scale);\n        Convert_Geodetic_To_Transverse_Mercator(Latitude, Longitude, Easting,\n                                              Northing);\n        if ((*Easting < MIN_EASTING) || (*Easting > MAX_EASTING)) {\n            Error_Code = UTM_EASTING_ERROR;\n        }\n        if ((*Northing < MIN_NORTHING) || (*Northing > MAX_NORTHING)) {\n            Error_Code |= UTM_NORTHING_ERROR;\n        }\n    }\n  } /* END OF if (!Error_Code) */\n    return (Error_Code);\n} /* END OF Convert_Geodetic_To_UTM */\n\n\nlong Convert_UTM_To_Geodetic(long   Zone,\n                             char   Hemisphere,\n                             double Easting,\n                             double Northing,\n                             double *Latitude,\n                             double *Longitude)\n{\n/*\n * The function Convert_UTM_To_Geodetic converts UTM projection (zone, \n * hemisphere, easting and northing) coordinates to geodetic(latitude\n * and  longitude) coordinates, according to the current ellipsoid\n * parameters.  
If any errors occur, the error code(s) are returned\n * by the function, otherwise UTM_NO_ERROR is returned.\n *\n *    Zone              : UTM zone                               (input)\n *    Hemisphere        : North or South hemisphere              (input)\n *    Easting           : Easting (X) in meters                  (input)\n *    Northing          : Northing (Y) in meters                 (input)\n *    Latitude          : Latitude in radians                    (output)\n *    Longitude         : Longitude in radians                   (output)\n */\n  long Error_Code = UTM_NO_ERROR;\n  long tm_error_code = UTM_NO_ERROR;\n  double Origin_Latitude = 0;\n  double Central_Meridian = 0;\n  double False_Easting = 500000;\n  double False_Northing = 0;\n  double Scale = 0.9996;\n\n  if ((Zone < 1) || (Zone > 60))\n    Error_Code |= UTM_ZONE_ERROR;\n  if ((Hemisphere != 'S') && (Hemisphere != 'N'))\n    Error_Code |= UTM_HEMISPHERE_ERROR;\n  if ((Easting < MIN_EASTING) || (Easting > MAX_EASTING))\n    Error_Code |= UTM_EASTING_ERROR;\n  if ((Northing < MIN_NORTHING) || (Northing > MAX_NORTHING))\n    Error_Code |= UTM_NORTHING_ERROR;\n  if (!Error_Code)\n  { /* no errors */\n    if (Zone >= 31)\n      Central_Meridian = ((6 * Zone - 183) * PI / 180.0 /*+ 0.00000005*/);\n    else\n      Central_Meridian = ((6 * Zone + 177) * PI / 180.0 /*+ 0.00000005*/);\n    if (Hemisphere == 'S')\n      False_Northing = 10000000;\n    Set_Transverse_Mercator_Parameters(UTM_a, UTM_f, Origin_Latitude,\n                                       Central_Meridian, False_Easting, False_Northing, Scale);\n\n    tm_error_code = Convert_Transverse_Mercator_To_Geodetic(Easting, Northing, Latitude, Longitude);\n    if(tm_error_code)\n    {\n      if(tm_error_code & TRANMERC_EASTING_ERROR)\n        Error_Code |= UTM_EASTING_ERROR;\n      if(tm_error_code & TRANMERC_NORTHING_ERROR)\n        Error_Code |= UTM_NORTHING_ERROR;\n    }\n\n    if ((*Latitude < MIN_LAT) || (*Latitude > MAX_LAT))\n    { 
/* Latitude out of range */\n      Error_Code |= UTM_NORTHING_ERROR;\n    }\n  }\n  return (Error_Code);\n} /* END OF Convert_UTM_To_Geodetic */\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/libmgrs/utm.h",
    "content": "#ifndef UTM_H\n#define UTM_H\n\n/***************************************************************************/\n/* RSC IDENTIFIER: UTM\n *\n * ABSTRACT\n *\n *    This component provides conversions between geodetic coordinates\n *    (latitude and longitudes) and Universal Transverse Mercator (UTM)\n *    projection (zone, hemisphere, easting, and northing) coordinates.\n *\n * ERROR HANDLING\n *\n *    This component checks parameters for valid values.  If an invalid value\n *    is found, the error code is combined with the current error code using\n *    the bitwise or.  This combining allows multiple error codes to be\n *    returned. The possible error codes are:\n *\n *          UTM_NO_ERROR           : No errors occurred in function\n *          UTM_LAT_ERROR          : Latitude outside of valid range\n *                                    (-80.5 to 84.5 degrees)\n *          UTM_LON_ERROR          : Longitude outside of valid range\n *                                    (-180 to 360 degrees)\n *          UTM_EASTING_ERROR      : Easting outside of valid range\n *                                    (100,000 to 900,000 meters)\n *          UTM_NORTHING_ERROR     : Northing outside of valid range\n *                                    (0 to 10,000,000 meters)\n *          UTM_ZONE_ERROR         : Zone outside of valid range (1 to 60)\n *          UTM_HEMISPHERE_ERROR   : Invalid hemisphere ('N' or 'S')\n *          UTM_ZONE_OVERRIDE_ERROR: Zone outside of valid range\n *                                    (1 to 60) and within 1 of 'natural' zone\n *          UTM_A_ERROR            : Semi-major axis less than or equal to zero\n *          UTM_INV_F_ERROR        : Inverse flattening outside of valid range\n *\t\t\t\t\t\t\t\t  \t                (250 to 350)\n *\n * REUSE NOTES\n *\n *    UTM is intended for reuse by any application that performs a Universal\n *    Transverse Mercator (UTM) projection or its inverse.\n *\n * REFERENCES\n *\n *    
Further information on UTM can be found in the Reuse Manual.\n *\n *    UTM originated from :  U.S. Army Topographic Engineering Center\n *                           Geospatial Information Division\n *                           7701 Telegraph Road\n *                           Alexandria, VA  22310-3864\n *\n * LICENSES\n *\n *    None apply to this component.\n *\n * RESTRICTIONS\n *\n *    UTM has no restrictions.\n *\n * ENVIRONMENT\n *\n *    UTM was tested and certified in the following environments:\n *\n *    1. Solaris 2.5 with GCC, version 2.8.1\n *    2. MSDOS with MS Visual C++, version 6\n *\n * MODIFICATIONS\n *\n *    Date              Description\n *    ----              -----------\n *    10-02-97          Original Code\n *\n */\n\n/***************************************************************************/\n/*\n *                              DEFINES\n */\n\n#define UTM_NO_ERROR 0x0000\n#define UTM_LAT_ERROR 0x0001\n#define UTM_LON_ERROR 0x0002\n#define UTM_EASTING_ERROR 0x0004\n#define UTM_NORTHING_ERROR 0x0008\n#define UTM_ZONE_ERROR 0x0010\n#define UTM_HEMISPHERE_ERROR 0x0020\n#define UTM_ZONE_OVERRIDE_ERROR 0x0040\n#define UTM_A_ERROR 0x0080\n#define UTM_INV_F_ERROR 0x0100\n\n/***************************************************************************/\n/*\n *                              FUNCTION PROTOTYPES\n *                                for UTM.C\n */\n\n/* ensure proper linkage to c++ programs */\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nlong Set_UTM_Parameters(double a, double f, long override);\n/*\n * The function Set_UTM_Parameters receives the ellipsoid parameters and\n * UTM zone override parameter as inputs, and sets the corresponding state\n * variables.  
If any errors occur, the error code(s) are returned by the\n * function, otherwise UTM_NO_ERROR is returned.\n *\n *    a                 : Semi-major axis of ellipsoid, in meters       (input)\n *    f                 : Flattening of ellipsoid                       (input)\n *    override          : UTM override zone, zero indicates no override (input)\n */\n\nvoid Get_UTM_Parameters(double* a, double* f, long* override);\n/*\n * The function Get_UTM_Parameters returns the current ellipsoid\n * parameters and UTM zone override parameter.\n *\n *    a                 : Semi-major axis of ellipsoid, in meters       (output)\n *    f                 : Flattening of ellipsoid                       (output)\n *    override          : UTM override zone, zero indicates no override (output)\n */\n\nlong Convert_Geodetic_To_UTM(double Latitude, double Longitude, long* Zone,\n                             char* Hemisphere, double* Easting,\n                             double* Northing);\n/*\n * The function Convert_Geodetic_To_UTM converts geodetic (latitude and\n * longitude) coordinates to UTM projection (zone, hemisphere, easting and\n * northing) coordinates according to the current ellipsoid and UTM zone\n * override parameters.  
If any errors occur, the error code(s) are returned\n * by the function, otherwise UTM_NO_ERROR is returned.\n *\n *    Latitude          : Latitude in radians                 (input)\n *    Longitude         : Longitude in radians                (input)\n *    Zone              : UTM zone                            (output)\n *    Hemisphere        : North or South hemisphere           (output)\n *    Easting           : Easting (X) in meters               (output)\n *    Northing          : Northing (Y) in meters              (output)\n */\n\nlong Convert_UTM_To_Geodetic(long Zone, char Hemisphere, double Easting,\n                             double Northing, double* Latitude,\n                             double* Longitude);\n/*\n * The function Convert_UTM_To_Geodetic converts UTM projection (zone,\n * hemisphere, easting and northing) coordinates to geodetic(latitude\n * and  longitude) coordinates, according to the current ellipsoid\n * parameters.  If any errors occur, the error code(s) are returned\n * by the function, otherwise UTM_NO_ERROR is returned.\n *\n *    Zone              : UTM zone                               (input)\n *    Hemisphere        : North or South hemisphere              (input)\n *    Easting           : Easting (X) in meters                  (input)\n *    Northing          : Northing (Y) in meters                 (input)\n *    Latitude          : Latitude in radians                    (output)\n *    Longitude         : Longitude in radians                   (output)\n */\n\n#ifdef __cplusplus\n}\n#endif\n\n#endif /* UTM_H */\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/model/Coordinates.h",
    "content": "#ifndef GALOIS_COORDINATES_H\n#define GALOIS_COORDINATES_H\n\n#include <ostream>\n#include \"../utils/GaloisUtils.h\"\n#include \"Map.h\"\n\n/**\n * Container for x, y, and z coordinates and various utitlity functions on the\n * coordinates\n */\nclass Coordinates {\nprivate:\n  double x;\n  double y;\n  double z;\n\npublic:\n  //! empty constructor: x, y, z initialized to default double\n  Coordinates() = default;\n\n  //! Initialize x/y as specified, z = 0\n  Coordinates(double x, double y) : x(x), y(y), z(0.) {}\n\n  //! Initialize all 3 as specified\n  Coordinates(double x, double y, double z) : x(x), y(y), z(z) {}\n\n  //! Determine z from given x, y based on the provided Map\n  Coordinates(double x, double y, Map& map)\n      : x(x), y(y), z(map.get_height(x, y)) {}\n\n  //! x, y in the pair, height given by Map\n  Coordinates(std::pair<double, double> coords, Map& map)\n      : x(coords.first), y(coords.second),\n        z(map.get_height(coords.first, coords.second)) {}\n\n  // what follows are self explanatory get/set functions\n\n  double getX() const { return x; }\n\n  void setX(double x) { Coordinates::x = x; }\n\n  double getY() const { return y; }\n\n  void setY(double y) { Coordinates::y = y; }\n\n  double getZ() const { return z; }\n\n  void setZ(double z) { Coordinates::z = z; }\n\n  void setXYZ(double x, double y, double z) {\n    Coordinates::x = x;\n    Coordinates::y = y;\n    Coordinates::z = z;\n  }\n\n  //! Get 2D or 3D distances given another set of coordinates\n  double dist(const Coordinates& rhs, bool version2D) const {\n    if (version2D) {\n      return dist2D(rhs);\n    } else {\n      return dist3D(rhs);\n    }\n  }\n\n  //! Take z into account for distance\n  double dist3D(const Coordinates& rhs) const {\n    return sqrt(pow(x - rhs.x, 2) + pow(y - rhs.y, 2) + pow(z - rhs.z, 2));\n  }\n\n  //! 
Distance of just x/y coordinates\n  double dist2D(const Coordinates& rhs) const {\n    return sqrt(pow(x - rhs.x, 2) + pow(y - rhs.y, 2));\n  }\n\n  bool isXYequal(const Coordinates& rhs) {\n    return equals(x, rhs.x) && equals(y, rhs.y);\n  }\n\n  std::string toString() const {\n    return std::to_string(x) + \" \" + std::to_string(y) + \" \" +\n           std::to_string(z);\n  }\n\n  //! element wise add of x,y,z\n  Coordinates operator+(const Coordinates& rhs) const {\n    return Coordinates{x + rhs.x, y + rhs.y, z + rhs.z};\n  }\n\n  //! element wise subtract of x,y,z\n  Coordinates operator-(const Coordinates& rhs) const {\n    return Coordinates{x - rhs.x, y - rhs.y, z - rhs.z};\n  }\n\n  //! element wise multiply of x,y,z\n  Coordinates operator*(double rhs) const {\n    return Coordinates{x * rhs, y * rhs, z * rhs};\n  }\n\n  //! element wise divide of x,y,z\n  Coordinates operator/(double rhs) const {\n    return Coordinates{x / rhs, y / rhs, z / rhs};\n  }\n\n  //! element wise equality check\n  bool operator==(const Coordinates& rhs) const {\n    return equals(x, rhs.x) && equals(y, rhs.y) && equals(z, rhs.z);\n  }\n\n  //! element wise inequality check\n  bool operator!=(const Coordinates& rhs) const { return !(rhs == *this); }\n\n  //! Less than check; checks x, y, z in that order\n  bool operator<(const Coordinates& rhs) const {\n    if (less(x, rhs.x))\n      return true;\n    if (less(rhs.x, x))\n      return false;\n    if (less(y, rhs.y))\n      return true;\n    if (less(rhs.y, y))\n      return false;\n    return less(z, rhs.z);\n  }\n\n  bool operator>(const Coordinates& rhs) const { return rhs < *this; }\n\n  bool operator<=(const Coordinates& rhs) const { return !(rhs < *this); }\n\n  bool operator>=(const Coordinates& rhs) const { return !(*this < rhs); }\n};\n\n#endif // GALOIS_COORDINATES_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/model/EdgeData.h",
    "content": "#ifndef GALOIS_EDGEDATA_H\n#define GALOIS_EDGEDATA_H\n\n#include \"Coordinates.h\"\n#include \"NodeData.h\"\n\nclass EdgeData {\nprivate:\n  bool border;             //!< tells if this is a border edge\n  double length;           //!< length\n  Coordinates middlePoint; //!< point at middle of this edge\n\npublic:\n  //! default: not a border, negative length, no middle point\n  EdgeData() : border(false), length(-1), middlePoint(){};\n\n  //! Initialize all fields\n  EdgeData(bool border, double length, Coordinates middlePoint)\n      : border(border), length(length), middlePoint(middlePoint) {}\n\n  // self explanatory functions below\n\n  bool isBorder() const { return border; }\n\n  void setBorder(bool isBorder) { EdgeData::border = isBorder; }\n\n  double getLength() const { return length; }\n\n  void setLength(double l) { EdgeData::length = l; }\n\n  const Coordinates& getMiddlePoint() const { return middlePoint; }\n\n  //! Explicitly set middle point given coordinate class\n  void setMiddlePoint(const Coordinates& coordinates) {\n    EdgeData::middlePoint.setXYZ(coordinates.getX(), coordinates.getY(),\n                                 coordinates.getZ());\n  }\n\n  //! Explicitly set middle point given coordinates as three vars\n  void setMiddlePoint(const double x, const double y, const double z) {\n    EdgeData::middlePoint.setXYZ(x, y, z);\n  }\n};\n#endif // GALOIS_EDGEDATA_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/model/Graph.h",
    "content": "#ifndef GALOIS_GRAPH_H\n#define GALOIS_GRAPH_H\n\n#include <galois/graphs/MorphGraph.h>\n#include \"NodeData.h\"\n#include \"EdgeData.h\"\n\nusing Graph        = galois::graphs::MorphGraph<NodeData, EdgeData, false>;\nusing GNode        = Graph::GraphNode;\nusing EdgeIterator = Graph::edge_iterator;\nusing galois::optional;\n\n#endif // GALOIS_GRAPH_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/model/Map.cpp",
    "content": "#include \"Map.h\"\n\n#include \"../utils/Utils.h\"\n#include \"../libmgrs/utm.h\"\n\n#include <cstdio>\n#include <cmath>\n#include <limits>\n#include <iostream>\n\ndouble** Map::init_map_data(size_t rows, size_t cols) {\n  double** map;\n  map = (double**)malloc(rows * sizeof(double*));\n  for (size_t i = 0; i < rows; ++i) {\n    map[i] = (double*)malloc(cols * sizeof(double));\n  }\n  return map;\n}\n\nvoid Map::print_map() {\n  for (size_t i = 0; i < this->length; ++i) {\n    for (size_t j = 0; j < this->width; ++j) {\n      fprintf(stdout, \"%5.0lf \", this->data[i][j]);\n    }\n    fprintf(stdout, \"\\n\");\n  }\n}\n\ndouble Map::get_height(double lon, double lat) {\n  return get_height(lon, lat, utm);\n}\n\ndouble Map::get_height(double lon, double lat, bool convert) {\n\n  double x, y;\n\n  // convert to geodetic if required\n  if (convert) {\n    if (Convert_UTM_To_Geodetic(zone, hemisphere, lon, lat, &y, &x)) {\n      fprintf(stderr, \"Error during conversion to geodetic.\\n\");\n      exit(18);\n    }\n    x = Utils::r2d(x);\n    y = Utils::r2d(y);\n  } else {\n    x = lon;\n    y = lat;\n  }\n\n  // Compute \"grid coordinates\".\n  // modf returns the fractional part of the number,\n  // and assigns the integral part to the second argument.\n  //\n  // The integral part let us know in which \"cell\" of the map the point is\n  // located, and the fractional part let us interpolate the heights.\n  double x_grid_int_part, y_grid_int_part;\n  const auto y_fract =\n      std::modf((north_border - y) / cell_length, &y_grid_int_part);\n  const auto x_fract =\n      std::modf((x - west_border) / cell_width, &x_grid_int_part);\n\n  // using Lagrange bilinear interpolation\n  // Compute the height of the four corners\n  double top_left_height =\n      get_height_wo_interpol(x_grid_int_part, y_grid_int_part, 1);\n  double top_right_height =\n      get_height_wo_interpol(x_grid_int_part, y_grid_int_part, 2);\n  double bottom_right_height =\n      
get_height_wo_interpol(x_grid_int_part, y_grid_int_part, 3);\n  double bottom_left_height =\n      get_height_wo_interpol(x_grid_int_part, y_grid_int_part, 4);\n\n  // Sum the contributions of each corner\n  double height = 0.;\n  height += top_left_height * (1 - x_fract) * (1 - y_fract);\n  height += top_right_height * x_fract * (1 - y_fract);\n  height += bottom_right_height * x_fract * y_fract;\n  height += bottom_left_height * (1 - x_fract) * y_fract;\n\n  return height;\n}\n\n// corner: 1 - top_left, 2 - top_right, 3 - bottom_right, 4 - bottom_left\ndouble Map::get_height_wo_interpol(const double lon_grid, const double lat_grid,\n                                   const int corner) {\n\n  auto x = (int)lon_grid;\n  auto y = (int)lat_grid;\n\n  switch (corner) {\n  case 1:\n    break;\n  case 2:\n    ++x;\n    break;\n  case 3:\n    ++x;\n    ++y;\n    break;\n  case 4:\n    ++y;\n    break;\n  default:\n    // XXX[AOS]: I think we should raise an error, unless it is used elsewhere.\n    return std::numeric_limits<double>::min();\n  }\n\n  x = std::max(0, x);\n  y = std::max(0, y);\n\n  return data[y][x];\n}\n\nMap::~Map() {\n  for (size_t i = 0; i < this->length; ++i) {\n    free((double*)this->data[i]);\n  }\n  free(this->data);\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/model/Map.h",
    "content": "#ifndef TERGEN_MAP_H\n#define TERGEN_MAP_H\n\n#include <cstdlib>\n\n/**\n * Holds the elevation of a particular point for some specified region (borders\n * and their lengths).\n */\nclass Map {\nprivate:\n  size_t width;\n\n  size_t length;\n\n  double cell_width;\n\n  double cell_length;\n\n  double** data;\n\n  double north_border;\n\n  double west_border;\n\n  bool utm;\n\n  long zone;\n\n  char hemisphere;\n\n  double get_height_wo_interpol(const double lon, const double lat,\n                                const int corner);\n\npublic:\n  Map(double** data, size_t width, size_t length, double cellWidth,\n      double cellLength)\n      : width(width), length(length), cell_width(cellWidth),\n        cell_length(cellLength), data(data), utm(true), zone(-1) {}\n\n  static double** init_map_data(size_t rows, size_t cols);\n\n  void print_map();\n\n  double get_height(double lon, double lat);\n\n  double get_height(double lon, double lat, bool convert);\n\n  size_t getWidth() const { return width; }\n\n  void setWidth(size_t width) { Map::width = width; }\n\n  size_t getLength() const { return length; }\n\n  void setLength(size_t length) { Map::length = length; }\n\n  double getCellWidth() const { return cell_width; }\n\n  void setCellWidth(double cellWidth) { cell_width = cellWidth; }\n\n  double getCellLength() const { return cell_length; }\n\n  void setCellLength(double cellLength) { cell_length = cellLength; }\n\n  double** getData() const { return data; }\n\n  void setData(double** data) { Map::data = data; }\n\n  double getNorthBorder() const { return north_border; }\n\n  void setNorthBorder(double northBorder) { north_border = northBorder; }\n\n  double getWestBorder() const { return west_border; }\n\n  void setWestBorder(double westBorder) { west_border = westBorder; }\n\n  bool isUtm() const { return utm; }\n\n  void setUtm(bool utm) { Map::utm = utm; }\n\n  long getZone() const { return zone; }\n\n  void setZone(long zone) { Map::zone = 
zone; }\n\n  char getHemisphere() const { return hemisphere; }\n\n  void setHemisphere(char hemisphere) { Map::hemisphere = hemisphere; }\n\n  ~Map();\n};\n\n#endif // TERGEN_MAP_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/model/NodeData.h",
    "content": "#ifndef GALOIS_NODEDATA_H\n#define GALOIS_NODEDATA_H\n\n#include \"Coordinates.h\"\n\nclass NodeData {\nprivate:\n  bool hyperEdge;     //!< Indicates if node is a metanode to track triangles\n  Coordinates coords; //!< Coordinates of node\n  bool toRefine;      //!< Indicates if node needs to be refined\n  bool hanging;       //!< Indicates hanging node status\n\npublic:\n  NodeData(bool isHyperEdge, const Coordinates& coords, bool hanging)\n      : hyperEdge(isHyperEdge), coords(), toRefine(false), hanging(hanging) {\n    setCoords(coords);\n  }\n\n  NodeData(bool isHyperEdge, bool toRefine)\n      : hyperEdge(isHyperEdge), coords(), toRefine(toRefine), hanging(false) {}\n\n  NodeData(bool isHyperEdge, bool toRefine, Coordinates coords)\n      : hyperEdge(isHyperEdge), coords(), toRefine(toRefine), hanging(false) {\n    setCoords(coords);\n  }\n\n  // self-explanatory set/get functions\n\n  Coordinates getCoords() const { return coords; }\n\n  void setCoords(const Coordinates& coordinates) {\n    NodeData::coords.setXYZ(coordinates.getX(), coordinates.getY(),\n                            coordinates.getZ());\n  }\n\n  void setCoords(const double x, const double y, const double z) {\n    NodeData::coords.setXYZ(x, y, z);\n  }\n\n  bool isToRefine() const { return toRefine; }\n\n  void setToRefine(bool refine) { NodeData::toRefine = refine; }\n\n  bool isHanging() const { return hanging; }\n\n  void setHanging(bool hangingNode) { NodeData::hanging = hangingNode; }\n\n  bool isHyperEdge() const { return hyperEdge; }\n\n  bool operator==(const NodeData& rhs) const {\n    return hyperEdge == rhs.hyperEdge && coords == rhs.coords &&\n           (hyperEdge ? 
toRefine == rhs.toRefine : hanging == rhs.hanging);\n  }\n\n  bool operator!=(const NodeData& rhs) const { return !(rhs == *this); }\n\n  bool operator<(const NodeData& rhs) const {\n    if (hyperEdge < rhs.hyperEdge)\n      return true;\n    if (rhs.hyperEdge < hyperEdge)\n      return false;\n    if (coords < rhs.coords)\n      return true;\n    if (rhs.coords < coords)\n      return false;\n    if (hyperEdge) {\n      return toRefine < rhs.toRefine;\n    } else {\n      return hanging < rhs.hanging;\n    }\n  }\n\n  bool operator>(const NodeData& rhs) const { return rhs < *this; }\n\n  bool operator<=(const NodeData& rhs) const { return !(rhs < *this); }\n\n  bool operator>=(const NodeData& rhs) const { return !(*this < rhs); }\n};\n\n#endif // GALOIS_NODEDATA_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/model/ProductionState.h",
    "content": "#ifndef GALOIS_PRODUCTIONSTATE_H\n#define GALOIS_PRODUCTIONSTATE_H\n\n#include \"Graph.h\"\n#include \"../utils/ConnectivityManager.h\"\n\nusing std::vector;\n\n/**\n * Wraps a hyperedge representing a triangle and provides methods to make\n * working with it easier.\n */\nclass ProductionState {\nprivate:\n  //! hyperedge ID\n  GNode& interior;\n  //! hyperedge data\n  NodeData& interiorData;\n  //! node data of the triangle the hyperedge connects\n  vector<NodeData> verticesData;\n  //! vertices connected by the hyper edge (i.e. a triangle)\n  const vector<GNode> vertices;\n  //! Edges of the triangle represented by the hyperedge\n  const vector<optional<EdgeIterator>> edgesIterators;\n  //! Edge data of all the triangle edges\n  vector<galois::optional<EdgeData>> edgesData;\n  //! lenghts of the edges of the triangle\n  vector<double> lengths;\n  //! edges indices that are supposed to exist via the hyperedge but do not\n  //! (e.g., removed by another production)\n  vector<int> brokenEdges;\n  //! Indicator of what version is being used\n  bool version2D;\n  //! 
function to get the height of a point in the terrain\n  std::function<double(double, double)> zGetter;\n\npublic:\n  /**\n   * Initialize state needed for a production given a hyperedge connecting\n   * a triangle\n   *\n   * There's an assumption from getNeighbours that the edges will be in\n   * 0->1, 1->2, 2->0 order that getTriangleEdges from connection manager relies\n   * on.\n   */\n  ProductionState(ConnectivityManager& connManager, GNode& interior,\n                  bool version2D, std::function<double(double, double)> zGetter)\n      : interior(interior), interiorData(interior->getData()),\n        vertices(connManager.getNeighbours(interior)),\n        edgesIterators(connManager.getTriangleEdges(vertices)),\n        version2D(version2D), zGetter(zGetter) {\n    Graph& graph = connManager.getGraph();\n\n    // loop over 3 nodes/edges of triangle (if they exist)\n    for (int i = 0; i < 3; ++i) {\n      auto maybeEdgeIter = edgesIterators[i];\n      edgesData.push_back(\n          maybeEdgeIter\n              ? graph.getEdgeData(maybeEdgeIter.get())\n              : galois::optional<EdgeData>()); // TODO: Look for\n                                               // possible optimization\n      lengths.push_back(maybeEdgeIter ? edgesData[i].get().getLength() : -1);\n      verticesData.push_back(\n          graph.getData(vertices[i])); // TODO: Look for possible optimization\n\n      // if an edge doesn't exist, push to broken edges\n      if (!maybeEdgeIter) {\n        brokenEdges.push_back(i);\n      }\n    }\n  }\n\n  //! 
find the longest edges (includes ties)\n  std::vector<int> getLongestEdges() const {\n    std::vector<int> longestEdges;\n    for (int i = 0; i < 3; ++i) {\n      if (!less(lengths[i], lengths[(i + 1) % 3]) &&\n          !less(lengths[i], lengths[(i + 2) % 3])) {\n        longestEdges.push_back(i);\n      }\n    }\n    return longestEdges;\n  }\n\n  int getAnyBrokenEdge() const {\n    if (!brokenEdges.empty()) {\n      return brokenEdges[0];\n    } else {\n      return -1;\n    }\n  }\n\n  //! Look at all edges, return the indices of the ones with max distance\n  //! among them.\n  //! ASSUMPTION: 0->1, 1->2, 2->0 order of edges\n  std::vector<int> getLongestEdgesIncludingBrokenOnes() const {\n    std::vector<double> verticesDistances(3);\n    for (int i = 0; i < 3; ++i) {\n      verticesDistances[i] = verticesData[i].getCoords().dist(\n          verticesData[(i + 1) % 3].getCoords(), version2D);\n    }\n    return indexesOfMaxElems(verticesDistances);\n  }\n\n  GNode& getInterior() const { return interior; }\n\n  NodeData& getInteriorData() const { return interiorData; }\n\n  const vector<galois::optional<EdgeData>>& getEdgesData() const {\n    return edgesData;\n  }\n\n  const vector<double>& getLengths() const { return lengths; }\n\n  const vector<NodeData>& getVerticesData() const { return verticesData; }\n\n  const vector<GNode>& getVertices() const { return vertices; }\n\n  const vector<optional<EdgeIterator>>& getEdgesIterators() const {\n    return edgesIterators;\n  }\n\n  const vector<int>& getBrokenEdges() const { return brokenEdges; }\n\n  bool isVersion2D() const { return version2D; }\n\n  const std::function<double(double, double)>& getZGetter() const {\n    return zGetter;\n  }\n};\n\n#endif // GALOIS_PRODUCTIONSTATE_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/productions/Production.h",
    "content": "#ifndef GALOIS_PRODUCTION_H\n#define GALOIS_PRODUCTION_H\n\n#include \"../model/ProductionState.h\"\n\nclass Production {\n\npublic:\n  //! constructor needs a connection manager wrapping the graph\n  explicit Production(const ConnectivityManager& connManager)\n      : connManager(connManager) {}\n\n  virtual bool execute(ProductionState& pState,\n                       galois::UserContext<GNode>& ctx) = 0;\n\nprotected:\n  ConnectivityManager connManager;\n\n  bool checkIfBrokenEdgeIsTheLongest(\n      int brokenEdge, const std::vector<optional<EdgeIterator>>& edgesIterators,\n      const std::vector<GNode>& vertices) const {\n    std::vector<double> lengths(4);\n    Graph& graph = connManager.getGraph();\n    for (int i = 0, j = 0; i < 3; ++i) {\n      if (i != brokenEdge) {\n        lengths[j++] = graph.getEdgeData(edgesIterators[i].get()).getLength();\n      } else {\n        std::pair<int, int> brokenEdgeVertices = getEdgeVertices(brokenEdge);\n        // Suppress warning false positive.\n        // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80635\n        GALOIS_IGNORE_MAYBE_UNINITIALIZED\n        GNode& hangingNode =\n            connManager\n                .findNodeBetween(vertices[brokenEdgeVertices.first],\n                                 vertices[brokenEdgeVertices.second])\n                .get();\n        lengths[2] = graph\n                         .getEdgeData(graph.findEdge(\n                             vertices[brokenEdgeVertices.first], hangingNode))\n                         .getLength();\n        lengths[3] = graph\n                         .getEdgeData(graph.findEdge(\n                             vertices[brokenEdgeVertices.second], hangingNode))\n                         .getLength();\n        GALOIS_END_IGNORE_MAYBE_UNINITIALIZED\n      }\n    }\n    return !less(lengths[2] + lengths[3], lengths[0]) &&\n           !less(lengths[2] + lengths[3], lengths[1]);\n  }\n\n  //! vertices of an edge\n  //! 
assumption: edges connect vertices that have adjacent IDs\n  std::pair<int, int> getEdgeVertices(int edge) const {\n    return std::pair<int, int>{edge, (edge + 1) % 3};\n  }\n\n  //! get the vertex not connected to the edge\n  //! assumption: edges are 0->1, 1->2, 2->0\n  int getNeutralVertex(int edgeToBreak) const { return (edgeToBreak + 2) % 3; }\n\n  void breakElementWithHangingNode(int edgeToBreak, ProductionState& pState,\n                                   galois::UserContext<GNode>& ctx) const {\n    GNode hangingNode = getHangingNode(edgeToBreak, pState);\n\n    breakElementUsingNode(edgeToBreak, hangingNode, pState, ctx);\n\n    hangingNode->getData().setHanging(false);\n  }\n\n  //! Break an edge that doesn't have a hanging node already on it\n  void breakElementWithoutHangingNode(int edgeToBreak, ProductionState& pState,\n                                      galois::UserContext<GNode>& ctx) const {\n    // create new node + its edges\n    GNode newNode = createNodeOnEdge(edgeToBreak, pState, ctx);\n    // create the hyperedges that result from the break\n    breakElementUsingNode(edgeToBreak, newNode, pState, ctx);\n  }\n\n  //! logging/debug function; print info to cout\n  static void logg(const NodeData& interiorData,\n                   const std::vector<NodeData>& verticesData) {\n    std::cout << \"interior: (\" << interiorData.getCoords().toString()\n              << \"), neighbours: (\";\n    for (auto vertex : verticesData) {\n      std::cout << vertex.getCoords().toString() + \", \";\n    }\n    std::cout << \") \";\n  }\n\nprivate:\n  //! Creates a new node on an edge + its endpoints; does not\n  //! 
create corresponding hyperedges\n  GNode createNodeOnEdge(int edgeToBreak, ProductionState& pState,\n                         galois::UserContext<GNode>& ctx) const {\n    Graph& graph = connManager.getGraph();\n    // edges of triangle\n    const vector<galois::optional<EdgeData>>& edgesData = pState.getEdgesData();\n    bool breakingOnBorder = edgesData[edgeToBreak].get().isBorder();\n    int neutralVertex     = getNeutralVertex(edgeToBreak);\n\n    //        const EdgeIterator &edge =\n    //        pState.getEdgesIterators()[edgeToBreak].get();\n    //        graph.removeEdge(*(graph.getEdgeData(edge).getSrc()), edge);\n\n    //        auto edgePair =\n    //        connManager.findSrc(pState.getEdgesIterators()[edgeToBreak].get());\n    //        auto edgePair = connManager.findSrc(edgesData[edgeToBreak].get());\n    //        graph.removeEdge(edgePair.first, edgePair.second);\n\n    // remove original edge from graph\n    const std::pair<int, int>& edgeVertices = getEdgeVertices(edgeToBreak);\n    connManager.removeEdge(pState.getVertices()[edgeVertices.first],\n                           pState.getVertices()[edgeVertices.second]);\n\n    // new point is midway point; height comes from terrain\n    const Coordinates& newPointCoords = getNewPointCoords(\n        pState.getVerticesData()[edgeVertices.first].getCoords(),\n        pState.getVerticesData()[edgeVertices.second].getCoords(),\n        pState.getZGetter());\n\n    // create the new node, push to graph and worklist\n    // note: border nodes are never hanging; hanging means it needs to be\n    // broken on the other end\n    NodeData newNodeData = NodeData{false, newPointCoords, !breakingOnBorder};\n    GNode newNode        = graph.createNode(newNodeData);\n    graph.addNode(newNode);\n    ctx.push(newNode);\n\n    // connect vertices in original triangle to new node\n    for (int i = 0; i < 3; ++i) {\n      auto vertexData = pState.getVerticesData()[i];\n      // addition of the new edge\n      auto 
edge = graph.addEdge(newNode, pState.getVertices()[i]);\n\n      graph.getEdgeData(edge).setBorder(i != neutralVertex ? breakingOnBorder\n                                                           : false);\n      // midpoint\n      graph.getEdgeData(edge).setMiddlePoint(\n          (newNodeData.getCoords().getX() + vertexData.getCoords().getX()) / 2.,\n          (newNodeData.getCoords().getY() + vertexData.getCoords().getY()) / 2.,\n          (newNodeData.getCoords().getZ() + vertexData.getCoords().getZ()) /\n              2.);\n      // distance\n      graph.getEdgeData(edge).setLength(newNodeData.getCoords().dist(\n          vertexData.getCoords(), pState.isVersion2D()));\n    }\n    return newNode;\n  }\n\n  //! Given a hanging node, create the hyperedges for the 2 resulting triangles\n  void breakElementUsingNode(int edgeToBreak, GNode const& hangingNode,\n                             const ProductionState& pState,\n                             galois::UserContext<GNode>& ctx) const {\n    const std::pair<int, int>& brokenEdgeVertices =\n        getEdgeVertices(edgeToBreak);\n    Graph& graph      = connManager.getGraph();\n    int neutralVertex = getNeutralVertex(edgeToBreak);\n    // newly added hangingnode\n    NodeData hNodeData = hangingNode->getData();\n    double length      = 0;\n    length             = hNodeData.getCoords().dist(\n        pState.getVerticesData()[neutralVertex].getCoords(),\n        pState.isVersion2D());\n    // add edge between hanging node and node that it doesn't connect in\n    // triangle\n    // TODO might this already done in create node on edge?\n    addEdge(graph, hangingNode, pState.getVertices()[neutralVertex], false,\n            length,\n            (hNodeData.getCoords() +\n             pState.getVerticesData()[neutralVertex].getCoords()) /\n                2);\n\n    // create the 2 hyperedges that results from the two triangles\n    connManager.createInterior(hangingNode, pState.getVertices()[neutralVertex],\n        
                       pState.getVertices()[brokenEdgeVertices.first],\n                               ctx);\n    connManager.createInterior(hangingNode, pState.getVertices()[neutralVertex],\n                               pState.getVertices()[brokenEdgeVertices.second],\n                               ctx);\n\n    // remove original hyperedge\n    graph.removeNode(pState.getInterior());\n  }\n\n  //! Get the hanging node on a broken edge (i.e. midpoint typically)\n  GNode getHangingNode(int edgeToBreak, const ProductionState& pState) const {\n    const std::pair<int, int>& brokenEdgeVertices =\n        getEdgeVertices(edgeToBreak);\n    return connManager\n        .findNodeBetween(pState.getVertices()[brokenEdgeVertices.first],\n                         pState.getVertices()[brokenEdgeVertices.second])\n        .get();\n  }\n\n  //! Adds an edge to the graph given all necessary parameters\n  void addEdge(Graph& graph, GNode const& node1, GNode const& node2,\n               bool border, double length,\n               const Coordinates& middlePoint) const {\n    const EdgeIterator& newEdge = graph.addEdge(node1, node2);\n    graph.getEdgeData(newEdge).setBorder(border);\n    graph.getEdgeData(newEdge).setLength(length);\n    graph.getEdgeData(newEdge).setMiddlePoint(middlePoint);\n  }\n\n  //! Find the halfway point of 2 coordinates + get its height using\n  //! the provided zgetter function\n  Coordinates getNewPointCoords(\n      const Coordinates& coords1, const Coordinates& coords2,\n      const std::function<double(double, double)>& zGetter) const {\n    double x = (coords1.getX() + coords2.getX()) / 2.;\n    double y = (coords1.getY() + coords2.getY()) / 2.;\n    return {x, y, zGetter(x, y)};\n  }\n};\n\n#endif // GALOIS_PRODUCTION_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/productions/Production1.h",
    "content": "#ifndef GALOIS_PRODUCTION1_H\n#define GALOIS_PRODUCTION1_H\n\n#include \"Production.h\"\n#include \"../utils/ConnectivityManager.h\"\n#include \"../utils/GaloisUtils.h\"\n\nclass Production1 : public Production {\nprivate:\n  bool checkApplicabilityCondition(\n      const NodeData& nodeData,\n      const std::vector<optional<EdgeIterator>>& edgesIterators) const {\n    return nodeData.isToRefine() && !connManager.hasBrokenEdge(edgesIterators);\n  }\n\n  int getEdgeToBreak(const ProductionState& pState) const {\n    const vector<NodeData>& verticesData = pState.getVerticesData();\n    for (int longest : pState.getLongestEdges()) {\n      if (pState.getEdgesData()[longest].get().isBorder()) {\n        return longest;\n      }\n      if (!verticesData[getEdgeVertices(longest).first].isHanging() &&\n          !verticesData[getEdgeVertices(longest).second].isHanging()) {\n\n        return longest;\n      }\n    }\n    return -1;\n  }\n\npublic:\n  using Production::Production;\n\n  bool execute(ProductionState& pState,\n               galois::UserContext<GNode>& ctx) override {\n    if (!checkApplicabilityCondition(pState.getInteriorData(),\n                                     pState.getEdgesIterators())) {\n      return false;\n    }\n\n    //        logg(pState.getInteriorData(), pState.getVerticesData());\n\n    int edgeToBreak = getEdgeToBreak(pState);\n    if (edgeToBreak == -1) {\n      return false;\n    }\n\n    breakElementWithoutHangingNode(edgeToBreak, pState, ctx);\n    //        std::cout << \"P1 executed \";\n\n    return true;\n  }\n};\n\n#endif // GALOIS_PRODUCTION1_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/productions/Production2.h",
    "content": "#ifndef GALOIS_PRODUCTION2_H\n#define GALOIS_PRODUCTION2_H\n\n#include \"Production.h\"\n#include \"../utils/ConnectivityManager.h\"\n#include \"../utils/GaloisUtils.h\"\n\nclass Production2 : public Production {\nprivate:\n  bool checkApplicabilityCondition(\n      const std::vector<optional<EdgeIterator>>& edgesIterators) const {\n    return connManager.countBrokenEdges(edgesIterators) == 1;\n  }\n\npublic:\n  using Production::Production;\n\n  bool execute(ProductionState& pState,\n               galois::UserContext<GNode>& ctx) override {\n    if (!checkApplicabilityCondition(pState.getEdgesIterators())) {\n      return false;\n    }\n\n    //        logg(pState.getInteriorData(), pState.getVerticesData());\n\n    int brokenEdge = pState.getAnyBrokenEdge();\n    assert(brokenEdge != -1);\n\n    if (!checkIfBrokenEdgeIsTheLongest(brokenEdge, pState.getEdgesIterators(),\n                                       pState.getVertices())) {\n      return false;\n    }\n\n    breakElementWithHangingNode(brokenEdge, pState, ctx);\n    //        std::cout << \"P2 executed \";\n    return true;\n  }\n};\n\n#endif // GALOIS_PRODUCTION2_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/productions/Production3.h",
    "content": "#ifndef GALOIS_PRODUCTION3_H\n#define GALOIS_PRODUCTION3_H\n\n#include \"Production.h\"\n#include \"../utils/ConnectivityManager.h\"\n#include \"../utils/GaloisUtils.h\"\n\nclass Production3 : public Production {\nprivate:\n  bool checkApplicabilityCondition(\n      const std::vector<optional<EdgeIterator>>& edgesIterators) const {\n    return connManager.countBrokenEdges(edgesIterators) == 1;\n  }\n\npublic:\n  using Production::Production;\n\n  bool execute(ProductionState& pState,\n               galois::UserContext<GNode>& ctx) override {\n    if (!checkApplicabilityCondition(pState.getEdgesIterators())) {\n      return false;\n    }\n\n    int brokenEdge = pState.getAnyBrokenEdge();\n    assert(brokenEdge != -1);\n\n    if (checkIfBrokenEdgeIsTheLongest(brokenEdge, pState.getEdgesIterators(),\n                                      pState.getVertices())) {\n      return false;\n    }\n\n    //        logg(pState.getInteriorData(), pState.getVerticesData());\n\n    const vector<int>& longestEdges = pState.getLongestEdges();\n\n    for (int longest : longestEdges) {\n      if (pState.getEdgesData()[longest].get().isBorder()) {\n        breakElementWithoutHangingNode(longest, pState, ctx);\n        //                std::cout << \"P3 executed \";\n        return true;\n      }\n    }\n    for (int longest : longestEdges) {\n      if (!pState.getVerticesData()[getEdgeVertices(longest).first]\n               .isHanging() &&\n          !pState.getVerticesData()[getEdgeVertices(longest).second]\n               .isHanging()) {\n\n        breakElementWithoutHangingNode(longest, pState, ctx);\n        //                std::cout << \"P3 executed \";\n        return true;\n      }\n    }\n    return false;\n  }\n};\n\n#endif // GALOIS_PRODUCTION3_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/productions/Production4.h",
    "content": "#ifndef GALOIS_PRODUCTION4_H\n#define GALOIS_PRODUCTION4_H\n\n#include \"Production.h\"\n#include \"../utils/ConnectivityManager.h\"\n#include \"../utils/GaloisUtils.h\"\n\nclass Production4 : public Production {\nprivate:\n  bool checkApplicabilityCondition(\n      const std::vector<optional<EdgeIterator>>& edgesIterators) const {\n    return connManager.countBrokenEdges(edgesIterators) == 2;\n  }\n\npublic:\n  using Production::Production;\n\n  bool execute(ProductionState& pState,\n               galois::UserContext<GNode>& ctx) override {\n    if (!checkApplicabilityCondition(pState.getEdgesIterators())) {\n      return false;\n    }\n    //        logg(pState.getInteriorData(), pState.getVerticesData());\n\n    const vector<int>& longestEdges =\n        pState.getLongestEdgesIncludingBrokenOnes();\n    for (int longest : longestEdges) {\n      const vector<int>& brokenEdges = pState.getBrokenEdges();\n      if (std::find(brokenEdges.begin(), brokenEdges.end(), longest) !=\n          brokenEdges.end()) {\n        breakElementWithHangingNode(longest, pState, ctx);\n        //                std::cout << \"P4 executed \";\n        return true;\n      }\n    }\n    return false;\n  }\n};\n\n#endif // GALOIS_PRODUCTION4_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/productions/Production5.h",
    "content": "#ifndef GALOIS_PRODUCTION5_H\n#define GALOIS_PRODUCTION5_H\n\n#include \"Production.h\"\n#include \"../utils/ConnectivityManager.h\"\n#include \"../utils/GaloisUtils.h\"\n\nclass Production5 : public Production {\nprivate:\n  bool checkApplicabilityCondition(\n      const std::vector<optional<EdgeIterator>>& edgesIterators) const {\n    return connManager.countBrokenEdges(edgesIterators) == 2;\n  }\n\npublic:\n  using Production::Production;\n\n  bool execute(ProductionState& pState,\n               galois::UserContext<GNode>& ctx) override {\n    if (!checkApplicabilityCondition(pState.getEdgesIterators())) {\n      return false;\n    }\n\n    const vector<int>& longestEdges =\n        pState.getLongestEdgesIncludingBrokenOnes();\n    if (longestEdges.size() > 1) {\n      return false;\n    }\n\n    //        logg(pState.getInteriorData(), pState.getVerticesData());\n    const vector<int>& brokenEdges = pState.getBrokenEdges();\n    if (std::find(brokenEdges.begin(), brokenEdges.end(), longestEdges[0]) ==\n        brokenEdges.end()) {\n      breakElementWithoutHangingNode(longestEdges[0], pState, ctx);\n      //            std::cout << \"P5 executed \";\n      return true;\n    }\n    return false;\n  }\n};\n\n#endif // GALOIS_PRODUCTION2_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/productions/Production6.h",
    "content": "#ifndef GALOIS_PRODUCTION6_H\n#define GALOIS_PRODUCTION6_H\n\n#include \"Production.h\"\n#include \"../utils/ConnectivityManager.h\"\n#include \"../utils/GaloisUtils.h\"\n\nclass Production6 : public Production {\nprivate:\n  bool checkApplicabilityCondition(\n      const std::vector<optional<EdgeIterator>>& edgesIterators) const {\n    return connManager.countBrokenEdges(edgesIterators) == 3;\n  }\n\npublic:\n  using Production::Production;\n\n  bool execute(ProductionState& pState,\n               galois::UserContext<GNode>& ctx) override {\n    if (!checkApplicabilityCondition(pState.getEdgesIterators())) {\n      return false;\n    }\n\n    const vector<int>& longestEdges =\n        pState.getLongestEdgesIncludingBrokenOnes();\n\n    //        logg(pState.getInteriorData(), pState.getVerticesData());\n\n    breakElementWithHangingNode(longestEdges[0], pState, ctx);\n    //        std::cout << \"P5 executed \";\n    return true;\n  }\n};\n\n#endif // GALOIS_PRODUCTION2_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/readers/AsciiReader.cpp",
    "content": "#include \"AsciiReader.h\"\n#include \"../model/Map.h\"\n\n#include <cmath>\n#include <cstddef>\n#include <cstdio>\n#include <cstring>\n\nint AsciiReader::readLine(FILE* f, char* buffer, const size_t buffersize,\n                          size_t* line_number) {\n\n  do {\n    if (fgets(buffer, buffersize, f) != NULL) {\n      ++(*line_number);\n      char* p = strchr(buffer, '\\n');\n      if (p) {\n        *p = '\\0';\n      } else {\n        fprintf(stderr,\n                \"Line %zu longer than buffer\\n\"\n                \"line content: %s\\n\",\n                *line_number, buffer);\n        return 1;\n      }\n    } else {\n      *buffer = '\\0';\n      return 2;\n    }\n  } while (((buffer[0] == '#') || (buffer[0] == '\\0')));\n\n  return 0;\n}\n\nMap* AsciiReader::read(const std::string filename) {\n  const size_t tambuf = 256;\n  char buf[tambuf];\n  size_t line_number = 0;\n\n  FILE* fp = fopen(filename.c_str(), \"r\");\n  if (fp == NULL) {\n    fprintf(stderr, \"Cannot open file %s\\n\", filename.c_str());\n    exit(EXIT_FAILURE);\n  }\n\n  double dtm_data[6] = {INFINITY, INFINITY, INFINITY,\n                        INFINITY, INFINITY, INFINITY};\n\n  for (size_t data_idx = 0; data_idx < 6; ++data_idx) {\n    if (readLine(fp, buf, tambuf, &line_number) == 0) {\n      const char space   = ' ';\n      const char* result = strchr(buf, space);\n      if (result == NULL) {\n        fprintf(stderr,\n                \"Header line has no space\\n\"\n                \"%s:%zu\\n\",\n                filename.c_str(), line_number);\n        exit(EXIT_FAILURE);\n      }\n      char* p;\n      dtm_data[data_idx] = strtod(result, &p);\n      if (buf == p) {\n        fprintf(stderr,\n                \"%s: not a decimal number\\n\"\n                \"%s:%zu\\n\",\n                buf, filename.c_str(), line_number);\n        exit(EXIT_FAILURE);\n      }\n\n    } else {\n      fprintf(stderr,\n              \"Problem reading ASC file\\n\"\n              
\"%s:%zu\\n\",\n              filename.c_str(), line_number);\n      exit(EXIT_FAILURE);\n    }\n  }\n\n  const size_t nCols    = dtm_data[0];\n  const size_t nRows    = dtm_data[1];\n  const double xMin     = dtm_data[2];\n  const double yMin     = dtm_data[3];\n  const double cellSize = dtm_data[4];\n  // const double noData   = dtm_data[5];\n\n  const size_t numOfPoints = nCols * nRows;\n\n  double** coords = (double**)malloc(sizeof(double*) * numOfPoints);\n\n  for (size_t i = 0; i < numOfPoints; ++i) {\n    coords[i] = (double*)malloc(sizeof(double) * 3);\n  }\n\n  for (size_t j = 0; j < nRows; ++j) {\n    const double y = yMin + (cellSize * (nRows - (j + 1)));\n\n    if (readLine(fp, buf, tambuf, &line_number) != 0) {\n      fprintf(stderr,\n              \"Problem reading ASC file\\n\"\n              \"%s:%zu\\n\",\n              filename.c_str(), line_number);\n      exit(EXIT_FAILURE);\n    }\n\n    char* buf_dummy = buf;\n    for (size_t i = 0; i < nCols; ++i) {\n\n      const double x = xMin + (cellSize * i);\n\n      char* p;\n      const double z = strtod(buf_dummy, &p);\n      if (buf_dummy == p) {\n        fprintf(stderr,\n                \"%s: not a decimal number\\n\"\n                \"%s:%zu\\n\",\n                buf_dummy, filename.c_str(), line_number);\n        exit(EXIT_FAILURE);\n      } else {\n        buf_dummy = p + 1;\n      }\n\n      coords[i + (nCols * j)][0] = x;\n      coords[i + (nCols * j)][1] = y;\n      coords[i + (nCols * j)][2] = z;\n    }\n  }\n\n  Map* map = convert(coords, nRows, nCols);\n\n  for (size_t k = 0; k < numOfPoints; ++k) {\n    free(coords[k]);\n  }\n  free(coords);\n  return map;\n}\n\nMap* AsciiReader::convert(double** coords, size_t nRows, size_t nCols) {\n  double** map_data = Map::init_map_data(nRows, nCols);\n  for (size_t k = 0; k < nRows; ++k) {\n    for (size_t i = 0; i < nCols; ++i) {\n      map_data[k][i] = coords[k * nCols + i][2];\n    }\n  }\n  Map* map = new Map(map_data, nCols, nRows, 1, 1);\n  
return map;\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/readers/AsciiReader.h",
    "content": "#ifndef ASC_READER_H\n#define ASC_READER_H\n\n#include <stddef.h>\n#include <stdio.h>\n#include <string>\n#include \"../model/Map.h\"\n\nclass AsciiReader {\npublic:\n  Map* read(const std::string filename);\n\nprivate:\n  static int readLine(FILE* f, char* buffer, const size_t buffersize,\n                      size_t* line_number);\n\n  static Map* convert(double** coords, const size_t nRows, const size_t nCols);\n};\n\n#endif // ASC_READER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/readers/InpReader.cpp",
    "content": "#include \"InpReader.h\"\n\n#include \"../utils/ConnectivityManager.h\"\n\n#include <cstdlib>\n#include <fstream>\n#include <iostream>\n#include <limits>\n#include <map>\n#include <set>\n#include <string>\n#include <utility>\n#include <vector>\n\nusing inpEdge = std::pair<size_t, size_t>;\n\nvoid inpRead(const std::string& filename, Graph& graph, double N, double S,\n             double E, double W, bool version2D) {\n\n  auto connManager = ConnectivityManager{graph};\n\n  // First, assign corners to max and min values\n  E = std::numeric_limits<double>::lowest();\n  N = std::numeric_limits<double>::lowest();\n  S = std::numeric_limits<double>::max();\n  W = std::numeric_limits<double>::max();\n\n  std::ifstream file(filename, std::ios_base::in);\n\n  if (!file.is_open()) {\n    std::cerr << \"File \" << filename << \" cannot be opened!\" << std::endl;\n    exit(EXIT_FAILURE);\n  }\n\n  // Read the header (Number of nodes and number of elements)\n  size_t numberOfNodes, numberOfElements, dummy;\n\n  file >> numberOfNodes;\n  file >> numberOfElements;\n  file >> dummy;\n  file >> dummy;\n  file >> dummy;\n\n  // Vector to store the nodes in order to create the interiors\n  auto nodes = std::vector<GNode>(numberOfNodes);\n\n  // Read all coordinates and generating the nodes\n  for (auto i = 0u; i < numberOfNodes; ++i) {\n    file >> dummy;\n    double x, y, z;\n    file >> x;\n    file >> y;\n    file >> z;\n    const auto& coordinates = Coordinates{x, y, z};\n\n    nodes[i] = connManager.createNode(NodeData{false, coordinates, false});\n\n    // Update the four corners\n    N = std::max(y, N);\n    S = std::min(y, S);\n    E = std::max(x, E);\n    W = std::min(x, W);\n  }\n\n  // Containers for edges\n  // Since the same edge can be in two triangles, we need to make sure to create\n  // only one\n  auto edgeSet = std::set<inpEdge>{};\n  // We will keep track if one edge is on the boundary or not\n  auto isEdgeBoundary = std::map<inpEdge, bool>{};\n\n 
 // Read elements, create interiors, and populate edgeSet and isEdgeBoundary\n  for (auto i = 0u; i < numberOfElements; ++i) {\n    file >> dummy;\n    file >> dummy;\n    std::string dummy_str;\n    file >> dummy_str;\n    size_t conc1, conc2, conc3;\n    file >> conc1;\n    file >> conc2;\n    file >> conc3;\n\n    connManager.createInterior(nodes[conc1], nodes[conc2], nodes[conc3]);\n\n    const auto edges = std::vector<inpEdge>{\n        inpEdge{std::min(conc1, conc2), std::max(conc1, conc2)},\n        inpEdge{std::min(conc2, conc3), std::max(conc2, conc3)},\n        inpEdge{std::min(conc3, conc1), std::max(conc3, conc1)}};\n\n    for (const auto& edge : edges) {\n      if (edgeSet.insert(edge).second) {\n        // First time it's inserted we assume it's on the boundary.\n        isEdgeBoundary[edge] = true;\n      } else {\n        // If we try to insert it again, it means that the edge is shared by two\n        // elements, therefore is not on the boundary\n        isEdgeBoundary[edge] = false;\n      }\n    }\n  }\n\n  // Finally, generate edges\n  for (const auto& edge : edgeSet) {\n    auto node1 = nodes[edge.first];\n    auto node2 = nodes[edge.second];\n\n    const auto node1Coords = node1->getData().getCoords();\n    const auto node2Coords = node2->getData().getCoords();\n\n    auto midPointCoords = (node1Coords + node2Coords) / 2.;\n\n    const auto length = node1Coords.dist(node2Coords, version2D);\n\n    connManager.createEdge(node1, node2, isEdgeBoundary.at(edge),\n                           midPointCoords, length);\n  }\n\n  file.close();\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/readers/InpReader.h",
    "content": "#ifndef INP_READER_H\n#define INP_READER_H\n\n#include \"../model/Graph.h\"\n\n#include <string>\n\n//!\n//! \\brief inpRead Function that reads a mesh in INP format generates the graph,\n//! and computes the four corners of the mesh. \\param filename Path of the INP\n//! file. \\param graph Galois hypergraph representing the mesh. \\param config\n//! Config object. The four corners E, W, N, S are populated in the function.\n//! version2D is used for the edge length\n//!\n\nvoid inpRead(const std::string& filename, Graph& graph, double N, double S,\n             double E, double W, bool version2D);\n\n#endif // INP_READER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/readers/SrtmReader.cpp",
    "content": "#include <cstdio>\n#include <cerrno>\n#include <cstring>\n#include <cmath>\n#include <galois/gIO.h>\n#include \"SrtmReader.h\"\n#include \"../utils/Utils.h\"\n\nMap* SrtmReader::read(\n    const double west_border, const double north_border,\n    const double east_border, const double south_border,\n    const char* map_dir) { // data[row][column] - it's array of rows\n  Utils::swap_if_required((double*)&south_border, (double*)&north_border);\n  Utils::swap_if_required((double*)&west_border, (double*)&east_border);\n\n  // Rounding to avoid problems with numerical errors\n  // Adding some margin to make sure that the mesh is always inside the map\n  int west_border_int  = border_to_int(west_border) - MARGIN;\n  int north_border_int = border_to_int(north_border) + MARGIN;\n  int east_border_int  = border_to_int(east_border) + MARGIN;\n  int south_border_int = border_to_int(south_border) - MARGIN;\n\n  // Update the map vertices\n  const auto map_N_border = (double)north_border_int / VALUES_IN_DEGREE;\n  const auto map_S_border = (double)south_border_int / VALUES_IN_DEGREE;\n  const auto map_E_border = (double)east_border_int / VALUES_IN_DEGREE;\n  const auto map_W_border = (double)west_border_int / VALUES_IN_DEGREE;\n\n  size_t cols = (size_t)(east_border_int - west_border_int);\n  size_t rows = (size_t)(north_border_int - south_border_int);\n  // mallocs an entire grid of doubles based on borders; to be read in later\n  // from disk\n  double** map_data = Map::init_map_data(rows, cols);\n  Map* map          = new Map(map_data, cols, rows, 1. / VALUES_IN_DEGREE,\n                     1. 
/ VALUES_IN_DEGREE);\n\n  map->setNorthBorder(map_N_border);\n  map->setWestBorder(map_W_border);\n\n  // read in points within specified borders\n  read_from_multiple_files(map_W_border, map_N_border, map_E_border,\n                           map_S_border, map_dir, map_data);\n\n  skip_outliers(map_data, map->getLength(), map->getWidth());\n\n  return map;\n}\n\nvoid SrtmReader::read_from_multiple_files(const double west_border,\n                                          const double north_border,\n                                          const double east_border,\n                                          const double south_border,\n                                          const char* map_dir,\n                                          double** map_data) {\n  int first_free_row = 0;\n  double north_ptr   = north_border;\n  double south_ptr = Utils::is_lesser(Utils::floor2(north_border), south_border)\n                         ? south_border\n                         : Utils::floor2(north_border);\n  if (Utils::equals(north_ptr, south_ptr)) {\n    south_ptr = Utils::is_lesser(north_border - 1, south_border)\n                    ? south_border\n                    : north_border - 1;\n  }\n\n  // loop over y-axis\n  while (Utils::is_greater(north_ptr, south_border)) {\n    int north_ptr_int = border_to_int(north_ptr);\n    size_t rows_here =\n        (size_t)std::abs(north_ptr_int - border_to_int(south_ptr));\n\n    int first_free_col = 0;\n    double west_ptr    = west_border;\n    double east_ptr = Utils::is_greater(Utils::ceil2(west_border), east_border)\n                          ? 
east_border\n                          : Utils::ceil2(west_border);\n\n    // loop over x-axis\n    while (Utils::is_lesser(west_ptr, east_border)) {\n      int west_ptr_int = border_to_int(west_ptr);\n      size_t cols_here =\n          (size_t)std::abs(border_to_int(east_ptr) - west_ptr_int);\n      // determine file to read and actual do read\n      read_from_file(north_ptr_int, west_ptr_int, rows_here, cols_here,\n                     first_free_row, first_free_col, map_data, map_dir);\n\n      first_free_col += cols_here;\n      west_ptr = Utils::floor2(west_ptr + 1);\n      east_ptr = Utils::is_greater(east_ptr + 1, east_border) ? east_border\n                                                              : east_ptr + 1;\n    }\n\n    first_free_row += rows_here;\n    north_ptr = Utils::equals(Utils::floor2(north_ptr), north_ptr)\n                    ? north_ptr - 1\n                    : Utils::floor2(north_ptr);\n    south_ptr = Utils::is_lesser(south_ptr - 1, south_border) ? south_border\n                                                              : south_ptr - 1;\n  }\n}\n\nvoid SrtmReader::read_from_file(int north_border_int, int west_border_int,\n                                size_t rows, size_t cols, int first_row,\n                                int first_col, double** map_data,\n                                const char* map_dir) {\n  char file_to_open[256];\n  get_filename(file_to_open, map_dir, west_border_int, north_border_int);\n\n  FILE* map_file;\n  if ((map_file = fopen(file_to_open, \"rb\")) == NULL) {\n    fprintf(stderr, \"%s\\n\", strerror(errno));\n    exit(1);\n  }\n  int cells_in_degree = VALUES_IN_DEGREE + 1;\n  if (fseek(map_file,\n            (((VALUES_IN_DEGREE - (north_border_int % VALUES_IN_DEGREE)) %\n              VALUES_IN_DEGREE) *\n                 cells_in_degree +\n             (west_border_int % VALUES_IN_DEGREE)) *\n                PIXEL_SIZE,\n            SEEK_SET) == -1) {\n    fprintf(stderr, \"%s\\n\", 
strerror(errno));\n    exit(1);\n  }\n  uint16_t* buffer = (uint16_t*)malloc(PIXEL_SIZE * cols);\n  for (size_t i = 0; i < rows; ++i) {\n    if (cols != fread(buffer, PIXEL_SIZE, cols, map_file))\n      std::abort();\n    if (fseek(map_file, (cells_in_degree - cols) * PIXEL_SIZE, SEEK_CUR) ==\n        -1) {\n      fprintf(stderr, \"%s\\n\", strerror(errno));\n      exit(1);\n    }\n    for (size_t j = 0; j < cols; ++j) {\n      Utils::change_bytes_order(&(buffer[j]));\n      // here is where data is being written after being read\n      map_data[first_row + i][first_col + j] = buffer[j];\n    }\n  }\n  free(buffer);\n  if (fclose(map_file) != 0) {\n    fprintf(stderr, \"%s\\n\", strerror(errno));\n    exit(1);\n  }\n}\n\nvoid SrtmReader::skip_outliers(double* const* map_data, size_t length,\n                               size_t width) {\n  bool outlierFound = false;\n  for (size_t i = 0; i < length; ++i) {\n    for (size_t j = 0; j < width; ++j) {\n      // smooth out outlier point here\n      if (map_data[i][j] > 3000 || map_data[i][j] < 10) {\n        outlierFound = true;\n        if (i > 0) {\n          map_data[i][j] = map_data[i - 1][j];\n        } else {\n          map_data[i][j] = map_data[i + 1][j];\n        }\n      }\n    }\n  }\n\n  if (outlierFound) {\n    galois::gInfo(\"Outliers in input data detected.\");\n  }\n}\n\nvoid SrtmReader::get_filename(char* filename, const char* map_dir,\n                              int west_border_int, int north_border_int) {\n  int first_long_to_read;\n  int first_lat_to_read;\n\n  if (west_border_int < 0) {\n    if (west_border_int % VALUES_IN_DEGREE != 0) {\n      first_long_to_read = west_border_int / VALUES_IN_DEGREE + 1;\n    } else {\n      first_long_to_read = west_border_int / VALUES_IN_DEGREE;\n    }\n  } else {\n    first_long_to_read = west_border_int / VALUES_IN_DEGREE;\n  }\n\n  if (north_border_int < 0) {\n    if (north_border_int % VALUES_IN_DEGREE != 0) {\n      first_lat_to_read = north_border_int / 
VALUES_IN_DEGREE + 1;\n    } else {\n      first_lat_to_read = north_border_int / VALUES_IN_DEGREE;\n    }\n  } else {\n    if (north_border_int % VALUES_IN_DEGREE != 0) {\n      first_lat_to_read = north_border_int / VALUES_IN_DEGREE;\n    } else {\n      first_lat_to_read = north_border_int / VALUES_IN_DEGREE - 1;\n    }\n  }\n\n  sprintf(filename, \"%s/%s%d%s%.3d.hgt\", map_dir,\n          first_lat_to_read < 0 ? \"S\" : \"N\", first_lat_to_read,\n          first_long_to_read < 0 ? \"W\" : \"E\", first_long_to_read);\n}\n\nint SrtmReader::border_to_int(const double border) {\n  return (int)round(border * SrtmReader::VALUES_IN_DEGREE);\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/readers/SrtmReader.h",
    "content": "#ifndef TERGEN_SRTMREADER_H\n#define TERGEN_SRTMREADER_H\n\n#include <cstdlib>\n#include \"../model/Map.h\"\n\nclass SrtmReader {\nprivate:\n  static const int RESOLUTION = 3;\n\n  static const unsigned short PIXEL_SIZE = 2;\n\n  /**\n   * Given the border of the coordinates to read, read files corresponding to\n   * the points within the border\n   *\n   * @param map_dir directory storing the files to read\n   * @param map_data 2D malloc'd array representing the map\n   */\n  void read_from_multiple_files(const double west_border,\n                                const double north_border,\n                                const double east_border,\n                                const double south_border, const char* map_dir,\n                                double** map_data);\n\n  /**\n   * Given north and west starting point as well as rows/columns to read,\n   * find the file to read and read it into the map\n   */\n  void read_from_file(int north_border_int, int west_border_int, size_t rows,\n                      size_t cols, int first_row, int first_col,\n                      double** map_data, const char* map_dir);\n\n  /**\n   * Smooth out read outliers by making them take a nearby point.\n   */\n  void skip_outliers(double* const* map_data, size_t length, size_t width);\n\n  //! Given north and west points, determine file name to read\n  void get_filename(char* filename, const char* map_dir, int west_border_int,\n                    int north_border_int);\n\n  //! Convert a border point into an int\n  int border_to_int(const double border);\n\npublic:\n  static const int VALUES_IN_DEGREE = 60 * 60 / RESOLUTION;\n  static const int MARGIN           = 3;\n\n  Map* read(const double west_border, const double north_border,\n            const double east_border, const double south_border,\n            const char* map_dir);\n};\n\n#endif // TERGEN_SRTMREADER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/utils/ConnectivityManager.h",
    "content": "#ifndef GALOIS_CONNECTIVITYMANAGER_H\n#define GALOIS_CONNECTIVITYMANAGER_H\n\n#include <galois/optional.h>\n#include \"../model/Graph.h\"\n\nclass ConnectivityManager {\nprivate:\n  Graph& graph;\n\npublic:\n  ConnectivityManager(Graph& graph) : graph(graph) {}\n\n  //! Return a vector of neighbors given some vertex\n  std::vector<GNode> getNeighbours(GNode node) const {\n    std::vector<GNode> vertices;\n    for (Graph::edge_iterator ii = graph.edge_begin(node),\n                              ee = graph.edge_end(node);\n         ii != ee; ++ii) {\n      vertices.push_back(graph.getEdgeDst(ii));\n    }\n    return vertices;\n  }\n\n  //! Given 3 nodes that comprise a triangle, return the triangle's edges\n  //! Key is that edges will be returned such that it is 0->1, 1->2,\n  //! and 2->0\n  //! Assumption for this to work is that the triangle order is 0->1,\n  //! 1->2, and 2->0 else you will get an empty edge\n  std::vector<optional<EdgeIterator>>\n  getTriangleEdges(std::vector<GNode> vertices) {\n    std::vector<optional<EdgeIterator>> edges;\n    for (int i = 0; i < 3; i++) {\n      edges.emplace_back(getEdge(vertices[i], vertices[(i + 1) % 3]));\n    }\n    return edges;\n  }\n\n  //! Return an edge (if it exists; may have been broken into 2)\n  optional<EdgeIterator> getEdge(const GNode& v1, const GNode& v2) const {\n    EdgeIterator edge = graph.findEdge(v1, v2);\n    return convertToOptionalEdge(edge);\n  }\n\n  //! See if an edge exists and return optional if necessary\n  optional<EdgeIterator> convertToOptionalEdge(const EdgeIterator& edge) const {\n    if (edge.base() == edge.end()) {\n      return galois::optional<EdgeIterator>();\n    } else {\n      return galois::optional<EdgeIterator>(edge);\n    }\n  }\n\n  //! True if there's a broken edge in the vector of edges\n  bool hasBrokenEdge(const std::vector<optional<EdgeIterator>>& edges) const {\n    return countBrokenEdges(edges) > 0;\n  }\n\n  //! 
Count the number of edges that don't exist (i.e. broken) in a vector\n  //! of edges\n  int countBrokenEdges(const std::vector<optional<EdgeIterator>>& edges) const {\n    int counter = 0;\n    for (const optional<EdgeIterator>& edge : edges) {\n      if (!edge) {\n        counter++;\n      }\n    }\n    return counter;\n  }\n\n  //! Attempts to find a node between two nodes (i.e. find midpoint, if it\n  //! exists)\n  optional<GNode> findNodeBetween(const GNode& node1,\n                                  const GNode& node2) const {\n    Coordinates expectedLocation =\n        (node1->getData().getCoords() + node2->getData().getCoords()) / 2.;\n    std::vector<GNode> neighbours1 = getNeighbours(node1);\n    std::vector<GNode> neighbours2 = getNeighbours(node2);\n    for (GNode& iNode : neighbours1) {\n      auto iNodeData = graph.getData(iNode);\n      for (GNode& jNode : neighbours2) {\n        if (iNode == jNode &&\n            iNodeData.getCoords().isXYequal(expectedLocation)) {\n          return optional<GNode>(iNode);\n        }\n      }\n    }\n    return optional<GNode>();\n  }\n\n  //! Creates a node and adds to specified worklist; returns it as well\n  GNode createNode(NodeData& nodeData, galois::UserContext<GNode>& ctx) const {\n    GNode node = createNode(nodeData);\n    ctx.push(node);\n    return std::move(node);\n  }\n\n  //! 
Adds a new node to the graph; returns node id\n  GNode createNode(NodeData nodeData) const {\n    auto node = graph.createNode(nodeData);\n    graph.addNode(node);\n    return node;\n  }\n\n  /**\n   * Create a new edge; need to specify if border, the middle point, and\n   * its length\n   *\n   * NOTE: can theoretically calculate middle + length given just the\n   * two nodes\n   */\n  void createEdge(GNode& node1, GNode& node2, bool border,\n                  const Coordinates& middlePoint, double length) {\n    // add the edge\n    graph.addEdge(node1, node2);\n    // get the edge\n    const EdgeIterator& edge = graph.findEdge(node1, node2);\n    // edit its edge data\n    graph.getEdgeData(edge).setBorder(border);\n    graph.getEdgeData(edge).setMiddlePoint(middlePoint);\n    graph.getEdgeData(edge).setLength(length);\n  }\n\n  /**\n   * Connects 3 nodes with a hyperedge; should be a triangle.\n   *\n   * Adds the new node to a worklist as well.\n   */\n  void createInterior(const GNode& node1, const GNode& node2,\n                      const GNode& node3,\n                      galois::UserContext<GNode>& ctx) const {\n    // args: is a hyper edge + do not need to refine\n    NodeData interiorData = NodeData{true, false};\n    auto interior         = createNode(interiorData, ctx);\n\n    // connect hyperedge to triangle\n    graph.addEdge(interior, node1);\n    graph.addEdge(interior, node2);\n    graph.addEdge(interior, node3);\n    // located in center of triangle\n    interior->getData().setCoords((node1->getData().getCoords() +\n                                   node2->getData().getCoords() +\n                                   node3->getData().getCoords()) /\n                                  3.);\n  }\n\n  /**\n   * Connects 3 nodes with a hyperedge; should be a triangle. 
Returns\n   * the new node ID.\n   *\n   * For consistency, node1->node2->node3 edge order is probably\n   * preferred.\n   */\n  GNode createInterior(const GNode& node1, const GNode& node2,\n                       const GNode& node3) const {\n    // args: is a hyper edge + do not need to refine\n    NodeData interiorData = NodeData{true, false};\n    auto interior         = createNode(interiorData);\n\n    // connect hyperedge to triangle\n    graph.addEdge(interior, node1);\n    graph.addEdge(interior, node2);\n    graph.addEdge(interior, node3);\n    // located in center of triangle\n    interior->getData().setCoords((node1->getData().getCoords() +\n                                   node2->getData().getCoords() +\n                                   node3->getData().getCoords()) /\n                                  3.);\n    return std::move(interior);\n  }\n\n  //! Return reference underlying graph\n  Graph& getGraph() const { return graph; }\n\n  //! Get the coordinates of all neighbors of specified vertex and return them.\n  const std::vector<Coordinates> getVerticesCoords(const GNode& node) const {\n    std::vector<Coordinates> result;\n    for (auto neighbour : getNeighbours(node)) {\n      result.push_back(neighbour->getData().getCoords());\n    }\n    return result;\n  }\n\n  //! Remove edge node1->node2 or node2->node1 (whichever is found)\n  void removeEdge(const GNode& node1, const GNode& node2) const {\n    const EdgeIterator& edge1 = graph.findEdge(node1, node2);\n    if (edge1.base() != edge1.end()) {\n      graph.removeEdge(node1, edge1);\n      return;\n    }\n    const EdgeIterator& edge2 = graph.findEdge(node2, node1);\n    if (edge2.base() != edge2.end()) {\n      graph.removeEdge(node2, edge2);\n      return;\n    }\n    std::cerr << \"Problem in removing an edge.\" << std::endl;\n  }\n};\n\n#endif // GALOIS_CONNECTIVITYMANAGER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/utils/GaloisUtils.h",
    "content": "#ifndef GALOIS_UTILS_H\n#define GALOIS_UTILS_H\n\n#include <cmath>\n#include <vector>\n\nstatic const double EPS = 1e-4;\n\ninline bool equals(double a, double b) { return fabs(a - b) < EPS; }\n\ninline bool greater(double a, double b) { return a - b >= EPS; }\n\ninline bool less(double a, double b) { return a - b <= -EPS; }\n\ninline std::vector<int> indexesOfMaxElems(std::vector<double> elems) {\n  std::vector<int> result;\n  if (elems.empty()) {\n    return result;\n  }\n  double currentMax = elems[0];\n  result.push_back(0);\n  for (unsigned long i = 1; i < elems.size(); ++i) {\n    if (greater(elems[i], currentMax)) {\n      result.clear();\n      result.push_back(i);\n      currentMax = elems[i];\n    } else if (equals(elems[i], currentMax)) {\n      result.push_back(i);\n    }\n  }\n  return result;\n}\n\n#endif // GALOIS_UTILS_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/utils/GraphGenerator.h",
    "content": "#ifndef GALOIS_GRAPHGENERATOR_H\n#define GALOIS_GRAPHGENERATOR_H\n\n#include \"Utils.h\"\n\nclass GraphGenerator {\npublic:\n  static void generateSampleGraphWithDataWithConversionToUtm(\n      Graph& graph, Map& map, const double west_border,\n      const double north_border, const double east_border,\n      const double south_border, bool version2D, bool square) {\n    // temp storage for nodes we care about for this function\n    vector<GNode> nodes;\n    // wrapper around graph to edit it\n    ConnectivityManager connManager{graph};\n\n    // note the following coordinates should be the same ones used to load\n    // terrain data into the map; the height (z coordinate) is retrieved from\n    // said map\n\n    // SW\n    const Coordinates& coordinates0 =\n        Coordinates{Utils::convertToUtm(south_border, west_border, map), map};\n    // NW\n    const Coordinates& coordinates1 =\n        Coordinates{Utils::convertToUtm(north_border, west_border, map), map};\n    // SE\n    const Coordinates& coordinates2 =\n        Coordinates{Utils::convertToUtm(south_border, east_border, map), map};\n    // NE\n    const Coordinates& coordinates3 =\n        Coordinates{Utils::convertToUtm(north_border, east_border, map), map};\n\n    std::vector<Coordinates> coords;\n    if (!square) {\n      coords.push_back(coordinates0);\n      coords.push_back(coordinates1);\n      coords.push_back(coordinates2);\n      coords.push_back(coordinates3);\n    } else {\n      double north = std::min(coordinates1.getY(), coordinates3.getY());\n      double south = std::max(coordinates0.getY(), coordinates2.getY());\n      double east  = std::min(coordinates2.getX(), coordinates3.getX());\n      double west  = std::max(coordinates0.getX(), coordinates1.getX());\n      double diff  = std::min(fabs(north - south), fabs(east - west));\n      north        = south + diff;\n      east         = west + diff;\n      coords.emplace_back(west, south, map);\n      
coords.emplace_back(west, north, map);\n      coords.emplace_back(east, south, map);\n      coords.emplace_back(east, north, map);\n    }\n\n    // create the node points for the border intersections\n    // NOT a hyperedge or a hanging node (because border points)\n    nodes.push_back(connManager.createNode(NodeData{false, coords[0], false}));\n    nodes.push_back(connManager.createNode(NodeData{false, coords[1], false}));\n    nodes.push_back(connManager.createNode(NodeData{false, coords[2], false}));\n    nodes.push_back(connManager.createNode(NodeData{false, coords[3], false}));\n    galois::gInfo(\"Nodes created.\");\n\n    // nodes.push_back(connManager.createNode(NodeData{false,\n    // Coordinates{west_border, south_border}, false}));\n    // nodes.push_back(connManager.createNode(NodeData{false,\n    // Coordinates{west_border, north_border}, false}));\n    // nodes.push_back(connManager.createNode(NodeData{false,\n    // Coordinates{east_border, south_border}, false}));\n    // nodes.push_back(connManager.createNode(NodeData{false,\n    // Coordinates{east_border, north_border}, false}));\n\n    // 0 = SW, 1 = NW, 2 = SE, 3 = NE\n    double leftBorderLength = nodes[0]->getData().getCoords().dist(\n        nodes[1]->getData().getCoords(), version2D);\n    double topBorderLength = nodes[1]->getData().getCoords().dist(\n        nodes[3]->getData().getCoords(), version2D);\n    double rightBorderLength = nodes[2]->getData().getCoords().dist(\n        nodes[3]->getData().getCoords(), version2D);\n    double bottomBorderLength = nodes[0]->getData().getCoords().dist(\n        nodes[2]->getData().getCoords(), version2D);\n    double SWtoNELength = nodes[3]->getData().getCoords().dist(\n        nodes[0]->getData().getCoords(), version2D);\n\n    // @todo can refactor some of the below to make less redundant\n\n    // create 5 edges\n    // left border creation\n    connManager.createEdge(\n        nodes[0], nodes[1], true,\n        Coordinates{west_border, 
(north_border + south_border) / 2.,\n                    map.get_height(west_border,\n                                   (north_border + south_border) / 2., false)},\n        leftBorderLength);\n    // top border creation\n    connManager.createEdge(\n        nodes[1], nodes[3], true,\n        Coordinates{(west_border + east_border) / 2., north_border,\n                    map.get_height((west_border + east_border) / 2.,\n                                   north_border, false)},\n        topBorderLength);\n    // right border creation\n    connManager.createEdge(\n        nodes[2], nodes[3], true,\n        Coordinates{east_border, (north_border + south_border) / 2.,\n                    map.get_height(east_border,\n                                   (north_border + south_border) / 2., false)},\n        rightBorderLength);\n    // bottom border creation\n    connManager.createEdge(\n        nodes[0], nodes[2], true,\n        Coordinates{(west_border + east_border) / 2., south_border,\n                    map.get_height((west_border + east_border) / 2.,\n                                   south_border, false)},\n        bottomBorderLength);\n    // this edge is diagonal from SW to NE with middle point being the center\n    // of the map; this is the common edge of the first 2 triangles\n    connManager.createEdge(\n        nodes[3], nodes[0], false,\n        Coordinates{(west_border + east_border) / 2.,\n                    (north_border + south_border) / 2.,\n                    map.get_height((west_border + east_border) / 2.,\n                                   (north_border + south_border) / 2., false)},\n        SWtoNELength);\n\n    // creates the hyperedge for the 2 initial triangles of the graph\n    connManager.createInterior(nodes[0], nodes[1], nodes[3]);\n    connManager.createInterior(nodes[0], nodes[3], nodes[2]);\n    galois::gInfo(\"Graph generated.\");\n  }\n\n  static void generateSampleGraphWithData(Graph& graph, Map& map,\n                             
             const double west_border,\n                                          const double north_border,\n                                          const double east_border,\n                                          const double south_border,\n                                          bool version2D) {\n    vector<GNode> nodes;\n    ConnectivityManager connManager{graph};\n\n    Utils::convertToUtm(south_border, west_border, map);\n\n    nodes.push_back(connManager.createNode(\n        NodeData{false, Coordinates{south_border, west_border, map}, false}));\n    nodes.push_back(connManager.createNode(\n        NodeData{false, Coordinates{north_border, west_border, map}, false}));\n    nodes.push_back(connManager.createNode(\n        NodeData{false, Coordinates{south_border, east_border, map}, false}));\n    nodes.push_back(connManager.createNode(\n        NodeData{false, Coordinates{north_border, east_border, map}, false}));\n\n    //        nodes.push_back(connManager.createNode(NodeData{false,\n    //        Coordinates{west_border, south_border}, false}));\n    //        nodes.push_back(connManager.createNode(NodeData{false,\n    //        Coordinates{west_border, north_border}, false}));\n    //        nodes.push_back(connManager.createNode(NodeData{false,\n    //        Coordinates{east_border, south_border}, false}));\n    //        nodes.push_back(connManager.createNode(NodeData{false,\n    //        Coordinates{east_border, north_border}, false}));\n\n    double length1 = nodes[0]->getData().getCoords().dist(\n        nodes[1]->getData().getCoords(), version2D);\n    double length2 = nodes[1]->getData().getCoords().dist(\n        nodes[3]->getData().getCoords(), version2D);\n    double length3 = nodes[2]->getData().getCoords().dist(\n        nodes[3]->getData().getCoords(), version2D);\n    double length4 = nodes[0]->getData().getCoords().dist(\n        nodes[2]->getData().getCoords(), version2D);\n    double length5 = 
nodes[3]->getData().getCoords().dist(\n        nodes[0]->getData().getCoords(), version2D);\n\n    connManager.createEdge(\n        nodes[0], nodes[1], true,\n        Coordinates{\n            west_border, (north_border + south_border) / 2.,\n            map.get_height(west_border, (north_border + south_border) / 2.)},\n        length1);\n    connManager.createEdge(\n        nodes[1], nodes[3], true,\n        Coordinates{\n            (west_border + east_border) / 2., north_border,\n            map.get_height((west_border + east_border) / 2., north_border)},\n        length2);\n    connManager.createEdge(\n        nodes[2], nodes[3], true,\n        Coordinates{\n            east_border, (north_border + south_border) / 2.,\n            map.get_height(east_border, (north_border + south_border) / 2.)},\n        length3);\n    connManager.createEdge(\n        nodes[0], nodes[2], true,\n        Coordinates{\n            (west_border + east_border) / 2., south_border,\n            map.get_height((west_border + east_border) / 2., south_border)},\n        length4);\n    connManager.createEdge(\n        nodes[3], nodes[0], false,\n        Coordinates{(west_border + east_border) / 2.,\n                    (north_border + south_border) / 2.,\n                    map.get_height((west_border + east_border) / 2.,\n                                   (north_border + south_border) / 2.)},\n        length5);\n\n    connManager.createInterior(nodes[0], nodes[1], nodes[3]);\n    connManager.createInterior(nodes[0], nodes[3], nodes[2]);\n  }\n};\n\n#endif // GALOIS_GRAPHGENERATOR_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/utils/MyGraphFormatWriter.h",
    "content": "#ifndef GALOIS_MYGRAPHFORMATWRITER_H\n#define GALOIS_MYGRAPHFORMATWRITER_H\n\n#include <fstream>\n#include <utility>\n\nusing std::set;\nusing std::string;\ntypedef std::tuple<string, string, bool> Edge;\ntypedef std::pair<int, NodeData> Node;\n\nclass MyGraphFormatWriter {\nprivate:\n  static void writeToMyGraphFormat(const set<Node>& vertices,\n                                   const set<Node>& interiors,\n                                   const set<Edge>& edges, const string& path) {\n    std::ofstream file;\n    file.open(path);\n    printNodes(\n        vertices, \"N,n\", [](NodeData n) { return n.isHanging(); }, file);\n    printNodes(\n        interiors, \"H,h\", [](NodeData n) { return n.isToRefine(); }, file);\n    printEdges(edges, file);\n    file.close();\n  }\n\n  static void printEdges(const set<Edge>& edges, std::ofstream& file) {\n    int i = 0;\n    for (auto edge : edges) {\n      file << \"E,e\" << i++ << \",\" << std::get<0>(edge) << \",\"\n           << std::get<1>(edge) << \",\" << (std::get<2>(edge) ? \"true\" : \"false\")\n           << std::endl;\n    }\n  }\n\n  static void printNodes(const set<Node>& nodes, const string preambule,\n                         bool (*attributeChecker)(NodeData),\n                         std::ofstream& file) {\n    for (auto node : nodes) {\n      file << preambule << node.first << \",\" << node.second.getCoords().getX()\n           << \",\" << node.second.getCoords().getY() << \",\"\n           << node.second.getCoords().getZ() << \",\"\n           << (attributeChecker(node.second) ? 
\"true\" : \"false\") << std::endl;\n    }\n  }\n\n  static void addEdge(set<Edge>& edges, const string& firstNodeId,\n                      const string& secondNodeId, bool border) {\n    if (!findEdge(firstNodeId, secondNodeId, edges).is_initialized()) {\n      edges.emplace(std::make_tuple(firstNodeId, secondNodeId, border));\n    }\n  }\n\n  static optional<Edge> findEdge(const string& first, const string& second,\n                                 const set<Edge>& edges) {\n    for (auto edge : edges) {\n      if ((std::get<0>(edge) == first && std::get<1>(edge) == second) ||\n          (std::get<0>(edge) == second && std::get<1>(edge) == first)) {\n        return galois::optional<Edge>(edge);\n      }\n    }\n    return galois::optional<Edge>();\n  }\n\n  static string getNodeId(set<Node>& nodes, int& nodesIter, NodeData& data) {\n    return getNodeId(nodes, nodesIter, data, optional<set<Node>>());\n  }\n\n  static string getNodeId(set<Node>& nodes, int& nodesIter, NodeData& data,\n                          optional<set<Node>> additionalNodesSet) {\n    optional<string> maybeId = findNode(data, nodes);\n    if (maybeId) {\n      return maybeId.get();\n    }\n    if (additionalNodesSet) {\n      optional<string> maybeId2 = findNode(data, additionalNodesSet.get());\n      if (maybeId2) {\n        return maybeId2.get();\n      }\n    }\n    nodes.emplace(Node(nodesIter, data));\n    return (data.isHyperEdge() ? \"h\" : \"n\") + std::to_string(nodesIter++);\n  }\n\n  static optional<string> findNode(const NodeData& node,\n                                   const set<Node>& nodesSet) {\n    for (auto pair : nodesSet) {\n      if (pair.second == node) {\n        return optional<string>((node.isHyperEdge() ? 
\"h\" : \"n\") +\n                                std::__cxx11::to_string(pair.first));\n      }\n    }\n    return optional<string>();\n  }\n\npublic:\n  static void writeToFile(Graph& graph, const string& path) {\n    set<Node> vertices;\n    set<Node> interiors;\n    set<Edge> edges;\n    int nodesIter     = 0;\n    int interiorsIter = 0;\n    for (auto node : graph) {\n      NodeData& data = graph.getData(node);\n      if (!data.isHyperEdge()) {\n        string firstNodeId = getNodeId(vertices, nodesIter, data);\n        for (const EdgeIterator& e : graph.edges(node)) {\n          GNode dstNode        = graph.getEdgeDst(e);\n          NodeData dstNodeData = graph.getData(dstNode);\n          if (!dstNodeData.isHyperEdge()) {\n            string secondNodeId = getNodeId(vertices, nodesIter, dstNodeData);\n            addEdge(\n                edges, firstNodeId, secondNodeId,\n                graph.getEdgeData(graph.findEdge(node, dstNode)).isBorder());\n          }\n        }\n      } else {\n        string firstInteriorId = getNodeId(interiors, interiorsIter, data,\n                                           optional<set<Node>>(vertices));\n        for (const EdgeIterator& e : graph.edges(node)) {\n          GNode dstNode        = graph.getEdgeDst(e);\n          NodeData dstNodeData = graph.getData(dstNode);\n          string secondNodeId = getNodeId(interiors, interiorsIter, dstNodeData,\n                                          optional<set<Node>>(vertices));\n          addEdge(edges, firstInteriorId, secondNodeId,\n                  graph.getEdgeData(graph.findEdge(node, dstNode)).isBorder());\n        }\n      }\n    }\n    writeToMyGraphFormat(vertices, interiors, edges, path);\n  }\n};\n\n#endif // GALOIS_MYGRAPHFORMATWRITER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/utils/Utils.cpp",
    "content": "#include <cmath>\n#include <cstdio>\n#include \"Utils.h\"\n\n#ifndef PI\n#define PI 3.14159265358979323846\n#endif\n\nbool Utils::is_lesser(double a, double b) { return a - b < -2 * EPSILON; }\n\nbool Utils::is_greater(double a, double b) { return a - b > 2 * EPSILON; }\n\nbool Utils::equals(const double a, const double b) {\n  return fabs(a - b) < EPSILON;\n}\n\ndouble Utils::floor2(double a) {\n  double b = (int)a;\n  if (!(!is_greater(b, a) && is_greater(b + 1, a))) {\n    ++b;\n  }\n  return b;\n}\n\ndouble Utils::ceil2(double a) { return floor2(a) + 1; }\n\nvoid Utils::change_bytes_order(uint16_t* var_ptr) {\n  uint16_t tmp = *var_ptr;\n  tmp <<= 8;\n  (*var_ptr) >>= 8;\n  (*var_ptr) |= tmp;\n}\n\nvoid Utils::swap_if_required(double* should_be_lower,\n                             double* should_be_bigger) {\n  if ((*should_be_lower) > (*should_be_bigger)) {\n    double tmp          = (*should_be_lower);\n    (*should_be_lower)  = (*should_be_bigger);\n    (*should_be_bigger) = tmp;\n  }\n}\n\nsize_t Utils::gcd(size_t a, size_t b) {\n  do {\n    if (b > a) {\n      size_t tmp = a;\n      a          = b;\n      b          = tmp;\n    }\n    a -= b;\n  } while (a != 0);\n  return b;\n}\n\ndouble Utils::d2r(double degrees) { return degrees * PI / 180; }\n\ndouble Utils::r2d(double radians) { return radians * 180 / PI; }\n\nvoid Utils::shift(int from, int to, size_t* array) {\n  for (int i = to; i > from; --i) {\n    array[i] = array[i - 1];\n  }\n}\n\nstd::pair<double, double> Utils::convertToUtm(double latitude, double longitude,\n                                              Map& map) {\n  long zone;\n  char hemisphere;\n  double easting;\n  double northing;\n  if (Convert_Geodetic_To_UTM(d2r(latitude), d2r(longitude), &zone, &hemisphere,\n                              &easting, &northing)) {\n    fprintf(stderr, \"Error during conversion to UTM.\\n\");\n    exit(13);\n  }\n  if (map.getZone() != -1) {\n    if (map.getZone() != zone || 
map.getHemisphere() != hemisphere) {\n      fprintf(stderr,\n              \"Error: All the points must be within the same UTM zone.\\n\");\n      exit(14);\n    }\n  } else {\n    map.setZone(zone);\n    map.setHemisphere(hemisphere);\n  }\n  return std::pair<double, double>(easting, northing);\n}"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/utils/Utils.h",
    "content": "\n#ifndef TERGEN_UTILS_H\n#define TERGEN_UTILS_H\n\n#include <cstdint>\n#include <cstdlib>\n#include <utility>\n#include \"../libmgrs/utm.h\"\n#include \"../model/Map.h\"\n\nclass Utils {\npublic:\n  constexpr static const double EPSILON = 1e-6;\n\n  static bool is_lesser(double a, double b);\n\n  static bool is_greater(double a, double b);\n\n  static bool equals(const double a, const double b);\n\n  static double floor2(double a);\n\n  static double ceil2(double a);\n\n  static void change_bytes_order(uint16_t* var_ptr);\n\n  static void swap_if_required(double* should_be_lower,\n                               double* should_be_bigger);\n\n  static size_t gcd(size_t a, size_t b);\n\n  static double d2r(double degrees);\n\n  static double r2d(double radians);\n\n  static void shift(int from, int to, size_t* array);\n\n  static std::pair<double, double> convertToUtm(double latitude,\n                                                double longitude, Map& map);\n};\n\n#endif // TERGEN_UTILS_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/writers/InpWriter.cpp",
    "content": "#include \"InpWriter.h\"\n\n#include \"../model/Graph.h\"\n#include \"../model/NodeData.h\"\n#include \"../utils/ConnectivityManager.h\"\n\n#include <algorithm>\n#include <cstdlib>\n#include <fstream>\n#include <iostream>\n#include <map>\n#include <set>\n#include <string>\n#include <utility>\n#include <vector>\n\nusing inpEdge = std::pair<size_t, size_t>;\n\nvoid inpWriter(const std::string filename, Graph& graph) {\n\n  auto nodeVector  = std::vector<InpNodeInfo>{};\n  auto conecVector = std::vector<InpConecInfo>{};\n\n  // Process the graph and get the vectors to write the inp file\n  processGraph(graph, nodeVector, conecVector);\n\n  // Write the file\n  auto file = std::ofstream(filename);\n\n  if (!file.is_open()) {\n    std::cerr << \"Cannot open output file \" << filename << std::endl;\n    exit(EXIT_FAILURE);\n  }\n\n  // Write header\n  file << nodeVector.size() << \" \" << nodeVector.size() + conecVector.size()\n       << \" 0 0 0\" << std::endl;\n\n  // Write nodes\n  auto counter = 0;\n  for (const auto node : nodeVector) {\n    file << counter << \" \" << node.coods.getX() << \" \" << node.coods.getY()\n         << \" \" << node.coods.getZ() << std::endl;\n\n    ++counter;\n  }\n\n  // Write elements\n  // First elements related to nodes (points)\n  counter = 0;\n  for (const auto node : nodeVector) {\n    file << counter << \" \" << node.mat << \" pt \" << node.id << std::endl;\n\n    ++counter;\n  }\n\n  // Then elements related to nodes (interior, edges, and triangles)\n  for (const auto& conec : conecVector) {\n    file << counter << \" \" << conec.mat << \" \" << conec.type;\n\n    for (const auto id : conec.conec) {\n      file << \" \" << id;\n    }\n\n    file << std::endl;\n    ++counter;\n  }\n\n  file.close();\n}\n\nvoid processGraph(Graph& graph, std::vector<InpNodeInfo>& nodeVector,\n                  std::vector<InpConecInfo>& conecVector) {\n  size_t nodeCounter = 0;\n\n  auto connManager = ConnectivityManager{graph};\n  
auto nodeMap     = std::map<GNode, size_t>{};\n\n  // First, process mesh nodes\n  for (const auto graphNode : graph) {\n    if (!graphNode->getData().isHyperEdge()) { // Only mesh nodes\n      const auto coords = graphNode->getData().getCoords();\n      const auto mat    = (graphNode->getData().isHanging()) ? 1u : 0u;\n      const auto id     = nodeCounter;\n      nodeVector.push_back(InpNodeInfo{coords, mat, id});\n      nodeMap.insert({graphNode, id});\n      ++nodeCounter;\n    }\n  }\n\n  auto edgeSet = std::set<inpEdge>{};\n\n  // Then, we process interiors\n  for (const auto graphNode : graph) {\n    if (!graphNode->getData().isHyperEdge()) {\n      continue;\n    }\n\n    const auto coords = graphNode->getData().getCoords();\n    const auto mat    = (graphNode->getData().isToRefine()) ? 3u : 2u;\n    const auto id     = nodeCounter;\n    nodeVector.push_back(InpNodeInfo{coords, mat, id});\n    nodeMap.insert({graphNode, id});\n    ++nodeCounter;\n\n    // Get the three mesh node Ids\n\n    const auto intNodes = connManager.getNeighbours(graphNode);\n    const auto intNodesID =\n        std::vector<size_t>{nodeMap.at(intNodes[0]), nodeMap.at(intNodes[1]),\n                            nodeMap.at(intNodes[2])};\n\n    // Now we generate the mesh triangle\n    conecVector.push_back(InpConecInfo{intNodesID, 7, \"tri\"});\n\n    // Now we generate the three edges that join the interior and the mesh\n    // nodes\n    conecVector.push_back(\n        InpConecInfo{std::vector<size_t>{id, intNodesID.at(0)}, 4, \"line\"});\n\n    conecVector.push_back(\n        InpConecInfo{std::vector<size_t>{id, intNodesID.at(1)}, 4, \"line\"});\n\n    conecVector.push_back(\n        InpConecInfo{std::vector<size_t>{id, intNodesID.at(2)}, 4, \"line\"});\n\n    // Finally, we generate the triangle edges\n    for (auto i = 0u; i < intNodesID.size(); ++i) {\n\n      // Create the inp edge\n      const auto edge =\n          inpEdge{std::min(intNodesID[i], intNodesID[(i + 1) % 3]),\n    
              std::max(intNodesID[i], intNodesID[(i + 1) % 3])};\n\n      // Check if it has been created before\n      if (edgeSet.insert(edge).second) {\n\n        // Get the graph edge to see if it's on the boundary\n        const auto graphEdge =\n            connManager.getEdge(intNodes[i], intNodes[(i + 1) % 3]);\n\n        const auto mat =\n            connManager.getGraph().getEdgeData(graphEdge.get()).isBorder() ? 6u\n                                                                           : 5u;\n\n        conecVector.push_back(InpConecInfo{\n            std::vector<size_t>{edge.first, edge.second}, mat, \"line\"});\n      }\n    }\n  }\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/writers/InpWriter.h",
    "content": "#ifndef INP_WRITER_H\n#define INP_WRITER_H\n\n#include \"../model/Coordinates.h\"\n#include \"../model/Graph.h\"\n\n#include <cstddef>\n#include <string>\n#include <vector>\n\nstruct InpNodeInfo {\n  Coordinates coods;\n  size_t mat;\n  size_t id;\n};\n\nstruct InpConecInfo {\n  std::vector<size_t> conec;\n  size_t mat;\n  std::string type;\n};\n\nvoid inpWriter(const std::string filename, Graph& graph);\n\nvoid processGraph(Graph& graph, std::vector<InpNodeInfo>& nodeVector,\n                  std::vector<InpConecInfo>& conecVector);\n\n#endif // INP_WRITER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/writers/TriangleFormatWriter.cpp",
    "content": "#include \"TriangleFormatWriter.h\"\n\n#include \"../model/Graph.h\"\n#include \"../model/NodeData.h\"\n#include \"../utils/ConnectivityManager.h\"\n\n#include <algorithm>\n#include <cstdlib>\n#include <fstream>\n#include <iostream>\n#include <map>\n#include <set>\n#include <string>\n#include <utility>\n#include <vector>\n\nusing triEdge = std::pair<size_t, size_t>;\n\nvoid triangleFormatWriter(const std::string& filename, Graph& graph) {\n\n  auto nodeVector  = std::vector<TriNodeInfo>{};\n  auto segmVector  = std::vector<TriSegmInfo>{};\n  auto conecVector = std::vector<TriConecInfo>{};\n\n  // Process the graph and get the vectors to write the tri file\n  trProcessGraph(graph, nodeVector, segmVector, conecVector);\n\n  // Write the file\n  auto nodeFile = std::ofstream(filename + \".node\");\n\n  if (!nodeFile.is_open()) {\n    std::cerr << \"Cannot open output file \" << filename << std::endl;\n    exit(EXIT_FAILURE);\n  }\n\n  // Write header\n  nodeFile << nodeVector.size() << \" 2 1 0\" << std::endl;\n\n  // Write nodes\n  auto counter = 0;\n  for (const auto node : nodeVector) {\n    nodeFile << counter++ << \" \" << node.coods.getX() << \" \"\n             << node.coods.getY() << \" \" << node.coods.getZ() << std::endl;\n  }\n  nodeFile.close();\n\n  // Write elements\n  auto eleFile = std::ofstream(filename + \".ele\");\n\n  if (!eleFile.is_open()) {\n    std::cerr << \"Cannot open output file \" << filename << std::endl;\n    exit(EXIT_FAILURE);\n  }\n\n  // Write header\n  eleFile << conecVector.size() << \" 3 0\" << std::endl;\n\n  // First elements related to nodes (points)\n  counter = 0;\n\n  // Then elements related to eles (edges and triangles)\n  for (const auto& conec : conecVector) {\n    eleFile << counter++;\n\n    for (const auto id : conec.conec) {\n      eleFile << \" \" << id;\n    }\n\n    eleFile << std::endl;\n  }\n\n  eleFile.close();\n\n  // Write elements\n  auto polyFile = std::ofstream(filename + \".poly\");\n\n  
if (!polyFile.is_open()) {\n    std::cerr << \"Cannot open output file \" << filename << std::endl;\n    exit(EXIT_FAILURE);\n  }\n\n  polyFile << nodeVector.size() << \" 2 1 0\" << std::endl;\n\n  //.poly file\n  // Write nodes\n  counter = 0;\n  for (const auto node : nodeVector) {\n    polyFile << counter++ << \" \" << node.coods.getX() << \" \"\n             << node.coods.getY() << \" \" << node.coods.getZ() << std::endl;\n  }\n\n  counter = 0;\n  polyFile << segmVector.size() << \" 1\" << std::endl;\n  for (const auto& segm : segmVector) {\n    polyFile << counter++ << \" \" << segm.points[0] << \" \" << segm.points[1]\n             << \" \" << (segm.border ? 1 : 0) << std::endl;\n  }\n\n  polyFile << \"0\" << std::endl;\n\n  polyFile.close();\n}\nvoid trProcessGraph(Graph& graph, std::vector<TriNodeInfo>& nodeVector,\n                    std::vector<TriSegmInfo>& segmVector,\n                    std::vector<TriConecInfo>& conecVector) {\n  size_t nodeCounter = 0;\n\n  auto connManager = ConnectivityManager{graph};\n  auto nodeMap     = std::map<GNode, size_t>{};\n\n  // First, process mesh nodes\n  for (const auto graphNode : graph) {\n    if (!graphNode->getData().isHyperEdge()) { // Only mesh nodes\n      const auto coords = graphNode->getData().getCoords();\n      const auto mat    = (graphNode->getData().isHanging()) ? 
1u : 0u;\n      const auto id     = nodeCounter;\n      nodeVector.push_back(TriNodeInfo{coords, mat, id});\n      nodeMap.insert({graphNode, id});\n      ++nodeCounter;\n    }\n  }\n\n  auto edgeSet = std::set<triEdge>{};\n\n  // Then, we process interiors\n  for (const auto graphNode : graph) {\n    if (!graphNode->getData().isHyperEdge()) {\n      continue;\n    }\n\n    const auto id = nodeCounter;\n    nodeMap.insert({graphNode, id});\n    ++nodeCounter;\n\n    // Get the three mesh node Ids\n    const auto intNodes = connManager.getNeighbours(graphNode);\n    const auto intNodesID =\n        std::vector<size_t>{nodeMap.at(intNodes[0]), nodeMap.at(intNodes[1]),\n                            nodeMap.at(intNodes[2])};\n    changeOrientationIfRequired(intNodesID, nodeVector);\n\n    // Now we generate the mesh triangle\n    conecVector.push_back(TriConecInfo{intNodesID, 7});\n\n    // Finally, we generate the triangle edges\n    for (auto i = 0u; i < intNodesID.size(); ++i) {\n\n      // Create the tri edge\n      const auto edge =\n          triEdge{std::min(intNodesID[i], intNodesID[(i + 1) % 3]),\n                  std::max(intNodesID[i], intNodesID[(i + 1) % 3])};\n\n      // Check if it has been created before\n      if (edgeSet.insert(edge).second) {\n\n        // Get the graph edge to see if it's on the boundary\n        const auto graphEdge =\n            connManager.getEdge(intNodes[i], intNodes[(i + 1) % 3]);\n\n        segmVector.push_back(TriSegmInfo{\n            std::vector<size_t>{edge.first, edge.second},\n            connManager.getGraph().getEdgeData(graphEdge.get()).isBorder()});\n      }\n    }\n  }\n}\n\nvoid changeOrientationIfRequired(std::vector<unsigned long> element,\n                                 std::vector<TriNodeInfo> nodeVector) {\n  if (greater(((nodeVector[element[1]].coods.getX() -\n                nodeVector[element[0]].coods.getX()) *\n               (nodeVector[element[2]].coods.getY() -\n                
nodeVector[element[0]].coods.getY())) -\n                  ((nodeVector[element[1]].coods.getY() -\n                    nodeVector[element[0]].coods.getY()) *\n                   (nodeVector[element[2]].coods.getX() -\n                    nodeVector[element[0]].coods.getX())),\n              0.)) {\n    std::iter_swap(element.begin() + 1, element.begin() + 2);\n  }\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/src/writers/TriangleFormatWriter.h",
    "content": "#ifndef TRI_WRITER_H\n#define TRI_WRITER_H\n\n#include \"../model/Coordinates.h\"\n#include \"../model/Graph.h\"\n\n#include <cstddef>\n#include <string>\n#include <vector>\n\nstruct TriNodeInfo {\n  Coordinates coods;\n  size_t mat;\n  size_t id;\n};\n\nstruct TriConecInfo {\n  std::vector<size_t> conec;\n  size_t mat;\n};\n\nstruct TriSegmInfo {\n  std::vector<size_t> points;\n  bool border;\n};\n\nvoid triangleFormatWriter(const std::string& filename, Graph& graph);\n\nvoid trProcessGraph(Graph& graph, std::vector<TriNodeInfo>& nodeVector,\n                    std::vector<TriSegmInfo>& segmVector,\n                    std::vector<TriConecInfo>& conecVector);\n\nvoid changeOrientationIfRequired(std::vector<unsigned long> element,\n                                 std::vector<TriNodeInfo> nodeVector);\n\n#endif // TRI_WRITER_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/test/TestMain.cpp",
    "content": "#ifndef GALOIS_TESTMAIN_H\n#define GALOIS_TESTMAIN_H\n\n#define CATCH_CONFIG_MAIN\n#include \"catch.hpp\"\n#include \"productions/Production1Test.cpp\"\n#include \"utils/ConnectivityManagerTest.cpp\"\n#include \"model/ProductionStateTest.cpp\"\n#include \"utils/UtilsTest.cpp\"\n#include \"model/MapTest.cpp\"\n\n#endif // GALOIS_TESTMAIN_H\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/test/catch.hpp",
    "content": "/*\n *  Catch v2.13.5\n *  Generated: 2021-04-10 23:43:17.560525\n *  ----------------------------------------------------------\n *  This file has been merged from multiple headers. Please don't edit it\n * directly Copyright (c) 2021 Two Blue Cubes Ltd. All rights reserved.\n *\n *  Distributed under the Boost Software License, Version 1.0. (See accompanying\n *  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n */\n#ifndef TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED\n#define TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED\n// start catch.hpp\n\n#define CATCH_VERSION_MAJOR 2\n#define CATCH_VERSION_MINOR 13\n#define CATCH_VERSION_PATCH 5\n\n#ifdef __clang__\n#pragma clang system_header\n#elif defined __GNUC__\n#pragma GCC system_header\n#endif\n\n// start catch_suppress_warnings.h\n\n#ifdef __clang__\n#ifdef __ICC // icpc defines the __clang__ macro\n#pragma warning(push)\n#pragma warning(disable : 161 1682)\n#else // __ICC\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wpadded\"\n#pragma clang diagnostic ignored \"-Wswitch-enum\"\n#pragma clang diagnostic ignored \"-Wcovered-switch-default\"\n#endif\n#elif defined __GNUC__\n// Because REQUIREs trigger GCC's -Wparentheses, and because still\n// supported version of g++ have only buggy support for _Pragmas,\n// Wparentheses have to be suppressed globally.\n#pragma GCC diagnostic ignored \"-Wparentheses\" // See #674 for details\n\n#pragma GCC diagnostic push\n#pragma GCC diagnostic ignored \"-Wunused-variable\"\n#pragma GCC diagnostic ignored \"-Wpadded\"\n#endif\n// end catch_suppress_warnings.h\n#if defined(CATCH_CONFIG_MAIN) || defined(CATCH_CONFIG_RUNNER)\n#define CATCH_IMPL\n#define CATCH_CONFIG_ALL_PARTS\n#endif\n\n// In the impl file, we want to have access to all parts of the headers\n// Can also be used to sanely support PCHs\n#if defined(CATCH_CONFIG_ALL_PARTS)\n#define CATCH_CONFIG_EXTERNAL_INTERFACES\n#if 
defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#undef CATCH_CONFIG_DISABLE_MATCHERS\n#endif\n#if !defined(CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER)\n#define CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER\n#endif\n#endif\n\n#if !defined(CATCH_CONFIG_IMPL_ONLY)\n// start catch_platform.h\n\n// See e.g.:\n// https://opensource.apple.com/source/CarbonHeaders/CarbonHeaders-18.1/TargetConditionals.h.auto.html\n#ifdef __APPLE__\n#include <TargetConditionals.h>\n#if (defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1) ||                          \\\n    (defined(TARGET_OS_MAC) && TARGET_OS_MAC == 1)\n#define CATCH_PLATFORM_MAC\n#elif (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE == 1)\n#define CATCH_PLATFORM_IPHONE\n#endif\n\n#elif defined(linux) || defined(__linux) || defined(__linux__)\n#define CATCH_PLATFORM_LINUX\n\n#elif defined(WIN32) || defined(__WIN32__) || defined(_WIN32) ||               \\\n    defined(_MSC_VER) || defined(__MINGW32__)\n#define CATCH_PLATFORM_WINDOWS\n#endif\n\n// end catch_platform.h\n\n#ifdef CATCH_IMPL\n#ifndef CLARA_CONFIG_MAIN\n#define CLARA_CONFIG_MAIN_NOT_DEFINED\n#define CLARA_CONFIG_MAIN\n#endif\n#endif\n\n// start catch_user_interfaces.h\n\nnamespace Catch {\nunsigned int rngSeed();\n}\n\n// end catch_user_interfaces.h\n// start catch_tag_alias_autoregistrar.h\n\n// start catch_common.h\n\n// start catch_compiler_capabilities.h\n\n// Detect a number of compiler features - by compiler\n// The following features are defined:\n//\n// CATCH_CONFIG_COUNTER : is the __COUNTER__ macro supported?\n// CATCH_CONFIG_WINDOWS_SEH : is Windows SEH supported?\n// CATCH_CONFIG_POSIX_SIGNALS : are POSIX signals supported?\n// CATCH_CONFIG_DISABLE_EXCEPTIONS : Are exceptions enabled?\n// ****************\n// Note to maintainers: if new toggles are added please document them\n// in configuration.md, too\n// ****************\n\n// In general each macro has a _NO_<feature name> form\n// (e.g. 
CATCH_CONFIG_NO_POSIX_SIGNALS) which disables the feature.\n// Many features, at point of detection, define an _INTERNAL_ macro, so they\n// can be combined, en-mass, with the _NO_ forms later.\n\n#ifdef __cplusplus\n\n#if (__cplusplus >= 201402L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)\n#define CATCH_CPP14_OR_GREATER\n#endif\n\n#if (__cplusplus >= 201703L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)\n#define CATCH_CPP17_OR_GREATER\n#endif\n\n#endif\n\n// Only GCC compiler should be used in this block, so other compilers trying to\n// mask themselves as GCC should be ignored.\n#if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) &&             \\\n    !defined(__CUDACC__) && !defined(__LCC__)\n#define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma(\"GCC diagnostic push\")\n#define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma(\"GCC diagnostic pop\")\n\n#define CATCH_INTERNAL_IGNORE_BUT_WARN(...)                                    \\\n  (void)__builtin_constant_p(__VA_ARGS__)\n\n#endif\n\n#if defined(__clang__)\n\n#define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                              \\\n  _Pragma(\"clang diagnostic push\")\n#define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma(\"clang diagnostic pop\")\n\n// As of this writing, IBM XL's implementation of __builtin_constant_p has a bug\n// which results in calls to destructors being emitted for each temporary,\n// without a matching initialization. In practice, this can result in something\n// like `std::string::~string` being called on an uninitialized value.\n//\n// For example, this code will likely segfault under IBM XL:\n// ```\n// REQUIRE(std::string(\"12\") + \"34\" == \"1234\")\n// ```\n//\n// Therefore, `CATCH_INTERNAL_IGNORE_BUT_WARN` is not implemented.\n#if !defined(__ibmxl__) && !defined(__CUDACC__)\n#define CATCH_INTERNAL_IGNORE_BUT_WARN(...)                                            
\\\n  (void)__builtin_constant_p(__VA_ARGS__) /* NOLINT(cppcoreguidelines-pro-type-vararg, \\\n                                             hicpp-vararg) */\n#endif\n\n#define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                               \\\n  _Pragma(\"clang diagnostic ignored \\\"-Wexit-time-destructors\\\"\")              \\\n      _Pragma(\"clang diagnostic ignored \\\"-Wglobal-constructors\\\"\")\n\n#define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS                           \\\n  _Pragma(\"clang diagnostic ignored \\\"-Wparentheses\\\"\")\n\n#define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS                                \\\n  _Pragma(\"clang diagnostic ignored \\\"-Wunused-variable\\\"\")\n\n#define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS                         \\\n  _Pragma(\"clang diagnostic ignored \\\"-Wgnu-zero-variadic-macro-arguments\\\"\")\n\n#define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS                       \\\n  _Pragma(\"clang diagnostic ignored \\\"-Wunused-template\\\"\")\n\n#endif // __clang__\n\n////////////////////////////////////////////////////////////////////////////////\n// Assume that non-Windows platforms support posix signals by default\n#if !defined(CATCH_PLATFORM_WINDOWS)\n#define CATCH_INTERNAL_CONFIG_POSIX_SIGNALS\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// We know some environments not to support full POSIX signals\n#if defined(__CYGWIN__) || defined(__QNX__) || defined(__EMSCRIPTEN__) ||      \\\n    defined(__DJGPP__)\n#define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS\n#endif\n\n#ifdef __OS400__\n#define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS\n#define CATCH_CONFIG_COLOUR_NONE\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// Android somehow still does not support std::to_string\n#if defined(__ANDROID__)\n#define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING\n#define 
CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// Not all Windows environments support SEH properly\n#if defined(__MINGW32__)\n#define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// PS4\n#if defined(__ORBIS__)\n#define CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// Cygwin\n#ifdef __CYGWIN__\n\n// Required for some versions of Cygwin to declare gettimeofday\n// see:\n// http://stackoverflow.com/questions/36901803/gettimeofday-not-declared-in-this-scope-cygwin\n#define _BSD_SOURCE\n// some versions of cygwin (most) do not support std::to_string. Use the libstd\n// check.\n// https://gcc.gnu.org/onlinedocs/gcc-4.8.2/libstdc++/api/a01053_source.html\n// line 2812-2813\n#if !((__cplusplus >= 201103L) && defined(_GLIBCXX_USE_C99) &&                 \\\n      !defined(_GLIBCXX_HAVE_BROKEN_VSWPRINTF))\n\n#define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING\n\n#endif\n#endif // __CYGWIN__\n\n////////////////////////////////////////////////////////////////////////////////\n// Visual C++\n#if defined(_MSC_VER)\n\n#define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION __pragma(warning(push))\n#define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION __pragma(warning(pop))\n\n// Universal Windows platform does not support SEH\n// Or console colours (or console at all...)\n#if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_APP)\n#define CATCH_CONFIG_COLOUR_NONE\n#else\n#define CATCH_INTERNAL_CONFIG_WINDOWS_SEH\n#endif\n\n// MSVC traditional preprocessor needs some workaround for __VA_ARGS__\n// _MSVC_TRADITIONAL == 0 means new conformant preprocessor\n// _MSVC_TRADITIONAL == 1 means old traditional non-conformant preprocessor\n#if !defined(__clang__) // Handle Clang masquerading for msvc\n#if !defined(_MSVC_TRADITIONAL) ||            
                                 \\\n    (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL)\n#define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#endif // MSVC_TRADITIONAL\n#endif // __clang__\n\n#endif // _MSC_VER\n\n#if defined(_REENTRANT) || defined(_MSC_VER)\n// Enable async processing, as -pthread is specified or no additional linking is\n// required\n#define CATCH_INTERNAL_CONFIG_USE_ASYNC\n#endif // _MSC_VER\n\n////////////////////////////////////////////////////////////////////////////////\n// Check if we are compiled with -fno-exceptions or equivalent\n#if defined(__EXCEPTIONS) || defined(__cpp_exceptions) || defined(_CPPUNWIND)\n#define CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n// DJGPP\n#ifdef __DJGPP__\n#define CATCH_INTERNAL_CONFIG_NO_WCHAR\n#endif // __DJGPP__\n\n////////////////////////////////////////////////////////////////////////////////\n// Embarcadero C++Build\n#if defined(__BORLANDC__)\n#define CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n\n// Use of __COUNTER__ is suppressed during code analysis in\n// CLion/AppCode 2017.2.x and former, because __COUNTER__ is not properly\n// handled by it.\n// Otherwise all supported compilers support COUNTER macro,\n// but user still might want to turn it off\n#if (!defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L)\n#define CATCH_INTERNAL_CONFIG_COUNTER\n#endif\n\n////////////////////////////////////////////////////////////////////////////////\n\n// RTX is a special version of Windows that is real time.\n// This means that it is detected as Windows, but does not provide\n// the same set of capabilities as real Windows does.\n#if defined(UNDER_RTSS) || defined(RTX64_BUILD)\n#define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH\n#define CATCH_INTERNAL_CONFIG_NO_ASYNC\n#define CATCH_CONFIG_COLOUR_NONE\n#endif\n\n#if 
!defined(_GLIBCXX_USE_C99_MATH_TR1)\n#define CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER\n#endif\n\n// Various stdlib support checks that require __has_include\n#if defined(__has_include)\n// Check if string_view is available and usable\n#if __has_include(<string_view>) && defined(CATCH_CPP17_OR_GREATER)\n#define CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW\n#endif\n\n// Check if optional is available and usable\n#if __has_include(<optional>) && defined(CATCH_CPP17_OR_GREATER)\n#define CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL\n#endif // __has_include(<optional>) && defined(CATCH_CPP17_OR_GREATER)\n\n// Check if byte is available and usable\n#if __has_include(<cstddef>) && defined(CATCH_CPP17_OR_GREATER)\n#include <cstddef>\n#if __cpp_lib_byte > 0\n#define CATCH_INTERNAL_CONFIG_CPP17_BYTE\n#endif\n#endif // __has_include(<cstddef>) && defined(CATCH_CPP17_OR_GREATER)\n\n// Check if variant is available and usable\n#if __has_include(<variant>) && defined(CATCH_CPP17_OR_GREATER)\n#if defined(__clang__) && (__clang_major__ < 8)\n// work around clang bug with libstdc++\n// https://bugs.llvm.org/show_bug.cgi?id=31852 fix should be in clang 8,\n// workaround in libstdc++ 8.2\n#include <ciso646>\n#if defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE < 9)\n#define CATCH_CONFIG_NO_CPP17_VARIANT\n#else\n#define CATCH_INTERNAL_CONFIG_CPP17_VARIANT\n#endif // defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE\n       // < 9)\n#else\n#define CATCH_INTERNAL_CONFIG_CPP17_VARIANT\n#endif // defined(__clang__) && (__clang_major__ < 8)\n#endif // __has_include(<variant>) && defined(CATCH_CPP17_OR_GREATER)\n#endif // defined(__has_include)\n\n#if defined(CATCH_INTERNAL_CONFIG_COUNTER) &&                                  \\\n    !defined(CATCH_CONFIG_NO_COUNTER) && !defined(CATCH_CONFIG_COUNTER)\n#define CATCH_CONFIG_COUNTER\n#endif\n#if defined(CATCH_INTERNAL_CONFIG_WINDOWS_SEH) &&                              \\\n    !defined(CATCH_CONFIG_NO_WINDOWS_SEH) &&  
                                 \\\n    !defined(CATCH_CONFIG_WINDOWS_SEH) &&                                      \\\n    !defined(CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH)\n#define CATCH_CONFIG_WINDOWS_SEH\n#endif\n// This is set by default, because we assume that unix compilers are\n// posix-signal-compatible by default.\n#if defined(CATCH_INTERNAL_CONFIG_POSIX_SIGNALS) &&                            \\\n    !defined(CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS) &&                        \\\n    !defined(CATCH_CONFIG_NO_POSIX_SIGNALS) &&                                 \\\n    !defined(CATCH_CONFIG_POSIX_SIGNALS)\n#define CATCH_CONFIG_POSIX_SIGNALS\n#endif\n// This is set by default, because we assume that compilers with no wchar_t\n// support are just rare exceptions.\n#if !defined(CATCH_INTERNAL_CONFIG_NO_WCHAR) &&                                \\\n    !defined(CATCH_CONFIG_NO_WCHAR) && !defined(CATCH_CONFIG_WCHAR)\n#define CATCH_CONFIG_WCHAR\n#endif\n\n#if !defined(CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING) &&                      \\\n    !defined(CATCH_CONFIG_NO_CPP11_TO_STRING) &&                               \\\n    !defined(CATCH_CONFIG_CPP11_TO_STRING)\n#define CATCH_CONFIG_CPP11_TO_STRING\n#endif\n\n#if defined(CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL) &&                           \\\n    !defined(CATCH_CONFIG_NO_CPP17_OPTIONAL) &&                                \\\n    !defined(CATCH_CONFIG_CPP17_OPTIONAL)\n#define CATCH_CONFIG_CPP17_OPTIONAL\n#endif\n\n#if defined(CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW) &&                        \\\n    !defined(CATCH_CONFIG_NO_CPP17_STRING_VIEW) &&                             \\\n    !defined(CATCH_CONFIG_CPP17_STRING_VIEW)\n#define CATCH_CONFIG_CPP17_STRING_VIEW\n#endif\n\n#if defined(CATCH_INTERNAL_CONFIG_CPP17_VARIANT) &&                            \\\n    !defined(CATCH_CONFIG_NO_CPP17_VARIANT) &&                                 \\\n    !defined(CATCH_CONFIG_CPP17_VARIANT)\n#define CATCH_CONFIG_CPP17_VARIANT\n#endif\n\n#if 
defined(CATCH_INTERNAL_CONFIG_CPP17_BYTE) &&                               \\\n    !defined(CATCH_CONFIG_NO_CPP17_BYTE) && !defined(CATCH_CONFIG_CPP17_BYTE)\n#define CATCH_CONFIG_CPP17_BYTE\n#endif\n\n#if defined(CATCH_CONFIG_EXPERIMENTAL_REDIRECT)\n#define CATCH_INTERNAL_CONFIG_NEW_CAPTURE\n#endif\n\n#if defined(CATCH_INTERNAL_CONFIG_NEW_CAPTURE) &&                              \\\n    !defined(CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE) &&                          \\\n    !defined(CATCH_CONFIG_NO_NEW_CAPTURE) &&                                   \\\n    !defined(CATCH_CONFIG_NEW_CAPTURE)\n#define CATCH_CONFIG_NEW_CAPTURE\n#endif\n\n#if !defined(CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED) &&                      \\\n    !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\n#define CATCH_CONFIG_DISABLE_EXCEPTIONS\n#endif\n\n#if defined(CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN) &&                           \\\n    !defined(CATCH_CONFIG_NO_POLYFILL_ISNAN) &&                                \\\n    !defined(CATCH_CONFIG_POLYFILL_ISNAN)\n#define CATCH_CONFIG_POLYFILL_ISNAN\n#endif\n\n#if defined(CATCH_INTERNAL_CONFIG_USE_ASYNC) &&                                \\\n    !defined(CATCH_INTERNAL_CONFIG_NO_ASYNC) &&                                \\\n    !defined(CATCH_CONFIG_NO_USE_ASYNC) && !defined(CATCH_CONFIG_USE_ASYNC)\n#define CATCH_CONFIG_USE_ASYNC\n#endif\n\n#if defined(CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE) &&                         \\\n    !defined(CATCH_CONFIG_NO_ANDROID_LOGWRITE) &&                              \\\n    !defined(CATCH_CONFIG_ANDROID_LOGWRITE)\n#define CATCH_CONFIG_ANDROID_LOGWRITE\n#endif\n\n#if defined(CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER) &&                         \\\n    !defined(CATCH_CONFIG_NO_GLOBAL_NEXTAFTER) &&                              \\\n    !defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)\n#define CATCH_CONFIG_GLOBAL_NEXTAFTER\n#endif\n\n// Even if we do not think the compiler has that warning, we still have\n// to provide a macro that can be used by the 
code.\n#if !defined(CATCH_INTERNAL_START_WARNINGS_SUPPRESSION)\n#define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION\n#endif\n#if !defined(CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION)\n#define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION\n#endif\n#if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS)\n#define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS\n#endif\n#if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS)\n#define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS\n#endif\n#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS)\n#define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS\n#endif\n#if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS)\n#define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS\n#endif\n\n// The goal of this macro is to avoid evaluation of the arguments, but\n// still have the compiler warn on problems inside...\n#if !defined(CATCH_INTERNAL_IGNORE_BUT_WARN)\n#define CATCH_INTERNAL_IGNORE_BUT_WARN(...)\n#endif\n\n#if defined(__APPLE__) && defined(__apple_build_version__) &&                  \\\n    (__clang_major__ < 10)\n#undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS\n#elif defined(__clang__) && (__clang_major__ < 5)\n#undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS\n#endif\n\n#if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS)\n#define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS\n#endif\n\n#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\n#define CATCH_TRY if ((true))\n#define CATCH_CATCH_ALL if ((false))\n#define CATCH_CATCH_ANON(type) if ((false))\n#else\n#define CATCH_TRY try\n#define CATCH_CATCH_ALL catch (...)\n#define CATCH_CATCH_ANON(type) catch (type)\n#endif\n\n#if defined(CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR) &&            \\\n    !defined(CATCH_CONFIG_NO_TRADITIONAL_MSVC_PREPROCESSOR) &&                 \\\n    !defined(CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR)\n#define CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#endif\n\n// end catch_compiler_capabilities.h\n#define 
INTERNAL_CATCH_UNIQUE_NAME_LINE2(name, line) name##line\n#define INTERNAL_CATCH_UNIQUE_NAME_LINE(name, line)                            \\\n  INTERNAL_CATCH_UNIQUE_NAME_LINE2(name, line)\n#ifdef CATCH_CONFIG_COUNTER\n#define INTERNAL_CATCH_UNIQUE_NAME(name)                                       \\\n  INTERNAL_CATCH_UNIQUE_NAME_LINE(name, __COUNTER__)\n#else\n#define INTERNAL_CATCH_UNIQUE_NAME(name)                                       \\\n  INTERNAL_CATCH_UNIQUE_NAME_LINE(name, __LINE__)\n#endif\n\n#include <iosfwd>\n#include <string>\n#include <cstdint>\n\n// We need a dummy global operator<< so we can bring it into Catch namespace\n// later\nstruct Catch_global_namespace_dummy {};\nstd::ostream& operator<<(std::ostream&, Catch_global_namespace_dummy);\n\nnamespace Catch {\n\nstruct CaseSensitive {\n  enum Choice { Yes, No };\n};\n\nclass NonCopyable {\n  NonCopyable(NonCopyable const&)            = delete;\n  NonCopyable(NonCopyable&&)                 = delete;\n  NonCopyable& operator=(NonCopyable const&) = delete;\n  NonCopyable& operator=(NonCopyable&&)      = delete;\n\nprotected:\n  NonCopyable();\n  virtual ~NonCopyable();\n};\n\nstruct SourceLineInfo {\n\n  SourceLineInfo() = delete;\n  SourceLineInfo(char const* _file, std::size_t _line) noexcept\n      : file(_file), line(_line) {}\n\n  SourceLineInfo(SourceLineInfo const& other)          = default;\n  SourceLineInfo& operator=(SourceLineInfo const&)     = default;\n  SourceLineInfo(SourceLineInfo&&) noexcept            = default;\n  SourceLineInfo& operator=(SourceLineInfo&&) noexcept = default;\n\n  bool empty() const noexcept { return file[0] == '\\0'; }\n  bool operator==(SourceLineInfo const& other) const noexcept;\n  bool operator<(SourceLineInfo const& other) const noexcept;\n\n  char const* file;\n  std::size_t line;\n};\n\nstd::ostream& operator<<(std::ostream& os, SourceLineInfo const& info);\n\n// Bring in operator<< from global namespace into Catch namespace\n// This is necessary because the 
overload of operator<< above makes\n// lookup stop at namespace Catch\nusing ::operator<<;\n\n// Use this in variadic streaming macros to allow\n//    >> +StreamEndStop\n// as well as\n//    >> stuff +StreamEndStop\nstruct StreamEndStop {\n  std::string operator+() const;\n};\ntemplate <typename T>\nT const& operator+(T const& value, StreamEndStop) {\n  return value;\n}\n} // namespace Catch\n\n#define CATCH_INTERNAL_LINEINFO                                                \\\n  ::Catch::SourceLineInfo(__FILE__, static_cast<std::size_t>(__LINE__))\n\n// end catch_common.h\nnamespace Catch {\n\nstruct RegistrarForTagAliases {\n  RegistrarForTagAliases(char const* alias, char const* tag,\n                         SourceLineInfo const& lineInfo);\n};\n\n} // end namespace Catch\n\n#define CATCH_REGISTER_TAG_ALIAS(alias, spec)                                  \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  namespace {                                                                  \\\n  Catch::RegistrarForTagAliases INTERNAL_CATCH_UNIQUE_NAME(                    \\\n      AutoRegisterTagAlias)(alias, spec, CATCH_INTERNAL_LINEINFO);             \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION\n\n// end catch_tag_alias_autoregistrar.h\n// start catch_test_registry.h\n\n// start catch_interfaces_testcase.h\n\n#include <vector>\n\nnamespace Catch {\n\nclass TestSpec;\n\nstruct ITestInvoker {\n  virtual void invoke() const = 0;\n  virtual ~ITestInvoker();\n};\n\nclass TestCase;\nstruct IConfig;\n\nstruct ITestCaseRegistry {\n  virtual ~ITestCaseRegistry();\n  virtual std::vector<TestCase> const& getAllTests() const = 0;\n  virtual std::vector<TestCase> const&\n  getAllTestsSorted(IConfig const& config) const = 0;\n};\n\nbool isThrowSafe(TestCase const& testCase, IConfig 
const& config);\nbool matchTest(TestCase const& testCase, TestSpec const& testSpec,\n               IConfig const& config);\nstd::vector<TestCase> filterTests(std::vector<TestCase> const& testCases,\n                                  TestSpec const& testSpec,\n                                  IConfig const& config);\nstd::vector<TestCase> const& getAllTestCasesSorted(IConfig const& config);\n\n} // namespace Catch\n\n// end catch_interfaces_testcase.h\n// start catch_stringref.h\n\n#include <cstddef>\n#include <string>\n#include <iosfwd>\n#include <cassert>\n\nnamespace Catch {\n\n/// A non-owning string class (similar to the forthcoming std::string_view)\n/// Note that, because a StringRef may be a substring of another string,\n/// it may not be null terminated.\nclass StringRef {\npublic:\n  using size_type      = std::size_t;\n  using const_iterator = const char*;\n\nprivate:\n  static constexpr char const* const s_empty = \"\";\n\n  char const* m_start = s_empty;\n  size_type m_size    = 0;\n\npublic: // construction\n  constexpr StringRef() noexcept = default;\n\n  StringRef(char const* rawChars) noexcept;\n\n  constexpr StringRef(char const* rawChars, size_type size) noexcept\n      : m_start(rawChars), m_size(size) {}\n\n  StringRef(std::string const& stdString) noexcept\n      : m_start(stdString.c_str()), m_size(stdString.size()) {}\n\n  explicit operator std::string() const { return std::string(m_start, m_size); }\n\npublic: // operators\n  auto operator==(StringRef const& other) const noexcept -> bool;\n  auto operator!=(StringRef const& other) const noexcept -> bool {\n    return !(*this == other);\n  }\n\n  auto operator[](size_type index) const noexcept -> char {\n    assert(index < m_size);\n    return m_start[index];\n  }\n\npublic: // named queries\n  constexpr auto empty() const noexcept -> bool { return m_size == 0; }\n  constexpr auto size() const noexcept -> size_type { return m_size; }\n\n  // Returns the current start pointer. 
If the StringRef is not\n  // null-terminated, throws std::domain_exception\n  auto c_str() const -> char const*;\n\npublic: // substrings and searches\n  // Returns a substring of [start, start + length).\n  // If start + length > size(), then the substring is [start, size()).\n  // If start > size(), then the substring is empty.\n  auto substr(size_type start, size_type length) const noexcept -> StringRef;\n\n  // Returns the current start pointer. May not be null-terminated.\n  auto data() const noexcept -> char const*;\n\n  constexpr auto isNullTerminated() const noexcept -> bool {\n    return m_start[m_size] == '\\0';\n  }\n\npublic: // iterators\n  constexpr const_iterator begin() const { return m_start; }\n  constexpr const_iterator end() const { return m_start + m_size; }\n};\n\nauto operator+=(std::string& lhs, StringRef const& sr) -> std::string&;\nauto operator<<(std::ostream& os, StringRef const& sr) -> std::ostream&;\n\nconstexpr auto operator\"\" _sr(char const* rawChars, std::size_t size) noexcept\n    -> StringRef {\n  return StringRef(rawChars, size);\n}\n} // namespace Catch\n\nconstexpr auto operator\"\" _catch_sr(char const* rawChars,\n                                    std::size_t size) noexcept\n    -> Catch::StringRef {\n  return Catch::StringRef(rawChars, size);\n}\n\n// end catch_stringref.h\n// start catch_preprocessor.hpp\n\n#define CATCH_RECURSION_LEVEL0(...) __VA_ARGS__\n#define CATCH_RECURSION_LEVEL1(...)                                            \\\n  CATCH_RECURSION_LEVEL0(                                                      \\\n      CATCH_RECURSION_LEVEL0(CATCH_RECURSION_LEVEL0(__VA_ARGS__)))\n#define CATCH_RECURSION_LEVEL2(...)                                            \\\n  CATCH_RECURSION_LEVEL1(                                                      \\\n      CATCH_RECURSION_LEVEL1(CATCH_RECURSION_LEVEL1(__VA_ARGS__)))\n#define CATCH_RECURSION_LEVEL3(...)                                            
\\\n  CATCH_RECURSION_LEVEL2(                                                      \\\n      CATCH_RECURSION_LEVEL2(CATCH_RECURSION_LEVEL2(__VA_ARGS__)))\n#define CATCH_RECURSION_LEVEL4(...)                                            \\\n  CATCH_RECURSION_LEVEL3(                                                      \\\n      CATCH_RECURSION_LEVEL3(CATCH_RECURSION_LEVEL3(__VA_ARGS__)))\n#define CATCH_RECURSION_LEVEL5(...)                                            \\\n  CATCH_RECURSION_LEVEL4(                                                      \\\n      CATCH_RECURSION_LEVEL4(CATCH_RECURSION_LEVEL4(__VA_ARGS__)))\n\n#ifdef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_EXPAND_VARGS(...) __VA_ARGS__\n// MSVC needs more evaluations\n#define CATCH_RECURSION_LEVEL6(...)                                            \\\n  CATCH_RECURSION_LEVEL5(                                                      \\\n      CATCH_RECURSION_LEVEL5(CATCH_RECURSION_LEVEL5(__VA_ARGS__)))\n#define CATCH_RECURSE(...)                                                     \\\n  CATCH_RECURSION_LEVEL6(CATCH_RECURSION_LEVEL6(__VA_ARGS__))\n#else\n#define CATCH_RECURSE(...) CATCH_RECURSION_LEVEL5(__VA_ARGS__)\n#endif\n\n#define CATCH_REC_END(...)\n#define CATCH_REC_OUT\n\n#define CATCH_EMPTY()\n#define CATCH_DEFER(id) id CATCH_EMPTY()\n\n#define CATCH_REC_GET_END2() 0, CATCH_REC_END\n#define CATCH_REC_GET_END1(...) CATCH_REC_GET_END2\n#define CATCH_REC_GET_END(...) CATCH_REC_GET_END1\n#define CATCH_REC_NEXT0(test, next, ...) next CATCH_REC_OUT\n#define CATCH_REC_NEXT1(test, next) CATCH_DEFER(CATCH_REC_NEXT0)(test, next, 0)\n#define CATCH_REC_NEXT(test, next) CATCH_REC_NEXT1(CATCH_REC_GET_END test, next)\n\n#define CATCH_REC_LIST0(f, x, peek, ...)                                       \\\n  , f(x) CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST1))(f, peek,           \\\n                                                            __VA_ARGS__)\n#define CATCH_REC_LIST1(f, x, peek, ...) 
                                      \\\n  , f(x) CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST0))(f, peek,           \\\n                                                            __VA_ARGS__)\n#define CATCH_REC_LIST2(f, x, peek, ...)                                       \\\n  f(x) CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST1))(f, peek, __VA_ARGS__)\n\n#define CATCH_REC_LIST0_UD(f, userdata, x, peek, ...)                          \\\n  , f(userdata, x) CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD))(      \\\n        f, userdata, peek, __VA_ARGS__)\n#define CATCH_REC_LIST1_UD(f, userdata, x, peek, ...)                          \\\n  , f(userdata, x) CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST0_UD))(      \\\n        f, userdata, peek, __VA_ARGS__)\n#define CATCH_REC_LIST2_UD(f, userdata, x, peek, ...)                          \\\n  f(userdata, x) CATCH_DEFER(CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD))(        \\\n      f, userdata, peek, __VA_ARGS__)\n\n// Applies the function macro `f` to each of the remaining parameters, inserts\n// commas between the results, and passes userdata as the first parameter to\n// each invocation, e.g. CATCH_REC_LIST_UD(f, x, a, b, c) evaluates to f(x, a),\n// f(x, b), f(x, c)\n#define CATCH_REC_LIST_UD(f, userdata, ...)                                    \\\n  CATCH_RECURSE(                                                               \\\n      CATCH_REC_LIST2_UD(f, userdata, __VA_ARGS__, ()()(), ()()(), ()()(), 0))\n\n#define CATCH_REC_LIST(f, ...)                                                 \\\n  CATCH_RECURSE(CATCH_REC_LIST2(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0))\n\n#define INTERNAL_CATCH_EXPAND1(param) INTERNAL_CATCH_EXPAND2(param)\n#define INTERNAL_CATCH_EXPAND2(...) INTERNAL_CATCH_NO##__VA_ARGS__\n#define INTERNAL_CATCH_DEF(...) INTERNAL_CATCH_DEF __VA_ARGS__\n#define INTERNAL_CATCH_NOINTERNAL_CATCH_DEF\n#define INTERNAL_CATCH_STRINGIZE(...) 
INTERNAL_CATCH_STRINGIZE2(__VA_ARGS__)\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_STRINGIZE2(...) #__VA_ARGS__\n#define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param)                         \\\n  INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param))\n#else\n// MSVC is adding extra space and needs another indirection to expand\n// INTERNAL_CATCH_NOINTERNAL_CATCH_DEF\n#define INTERNAL_CATCH_STRINGIZE2(...) INTERNAL_CATCH_STRINGIZE3(__VA_ARGS__)\n#define INTERNAL_CATCH_STRINGIZE3(...) #__VA_ARGS__\n#define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param)                         \\\n  (INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param)) + 1)\n#endif\n\n#define INTERNAL_CATCH_MAKE_NAMESPACE2(...) ns_##__VA_ARGS__\n#define INTERNAL_CATCH_MAKE_NAMESPACE(name) INTERNAL_CATCH_MAKE_NAMESPACE2(name)\n\n#define INTERNAL_CATCH_REMOVE_PARENS(...)                                      \\\n  INTERNAL_CATCH_EXPAND1(INTERNAL_CATCH_DEF __VA_ARGS__)\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_MAKE_TYPE_LIST2(...)                                    \\\n  decltype(get_wrapper<INTERNAL_CATCH_REMOVE_PARENS_GEN(__VA_ARGS__)>())\n#define INTERNAL_CATCH_MAKE_TYPE_LIST(...)                                     \\\n  INTERNAL_CATCH_MAKE_TYPE_LIST2(INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__))\n#else\n#define INTERNAL_CATCH_MAKE_TYPE_LIST2(...)                                    \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      decltype(get_wrapper<INTERNAL_CATCH_REMOVE_PARENS_GEN(__VA_ARGS__)>()))\n#define INTERNAL_CATCH_MAKE_TYPE_LIST(...)                                     \\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_MAKE_TYPE_LIST2(                  \\\n      INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__)))\n#endif\n\n#define INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(...)                         
\\\n  CATCH_REC_LIST(INTERNAL_CATCH_MAKE_TYPE_LIST, __VA_ARGS__)\n\n#define INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_0) INTERNAL_CATCH_REMOVE_PARENS(_0)\n#define INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_0, _1)                             \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_1)\n#define INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_0, _1, _2)                         \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_1, _2)\n#define INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_0, _1, _2, _3)                     \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0),                                            \\\n      INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_1, _2, _3)\n#define INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_0, _1, _2, _3, _4)                 \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0),                                            \\\n      INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_1, _2, _3, _4)\n#define INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_0, _1, _2, _3, _4, _5)             \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0),                                            \\\n      INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_1, _2, _3, _4, _5)\n#define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6)         \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0),                                            \\\n      INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _3, _4, _5, _6)\n#define INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_0, _1, _2, _3, _4, _5, _6, _7)     \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0),                                            \\\n      INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_1, _2, _3, _4, _5, _6, _7)\n#define INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8) \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0),                                            \\\n      INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_1, _2, _3, _4, _5, _6, _7, _8)\n#define INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_0, _1, _2, _3, _4, _5, _6, _7,    \\\n                                            _8, _9)               
             \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0),                                            \\\n      INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9)\n#define INTERNAL_CATCH_REMOVE_PARENS_11_ARG(_0, _1, _2, _3, _4, _5, _6, _7,    \\\n                                            _8, _9, _10)                       \\\n  INTERNAL_CATCH_REMOVE_PARENS(_0),                                            \\\n      INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9,  \\\n                                          _10)\n\n#define INTERNAL_CATCH_VA_NARGS_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9,   \\\n                                     _10, N, ...)                              \\\n  N\n\n#define INTERNAL_CATCH_TYPE_GEN                                                \\\n  template <typename...>                                                       \\\n  struct TypeList {};                                                          \\\n  template <typename... Ts>                                                    \\\n  constexpr auto get_wrapper() noexcept->TypeList<Ts...> {                     \\\n    return {};                                                                 \\\n  }                                                                            \\\n  template <template <typename...> class...>                                   \\\n  struct TemplateTypeList {};                                                  \\\n  template <template <typename...> class... 
Cs>                                \\\n  constexpr auto get_wrapper() noexcept->TemplateTypeList<Cs...> {             \\\n    return {};                                                                 \\\n  }                                                                            \\\n  template <typename...>                                                       \\\n  struct append;                                                               \\\n  template <typename...>                                                       \\\n  struct rewrap;                                                               \\\n  template <template <typename...> class, typename...>                         \\\n  struct create;                                                               \\\n  template <template <typename...> class, typename>                            \\\n  struct convert;                                                              \\\n                                                                               \\\n  template <typename T>                                                        \\\n  struct append<T> {                                                           \\\n    using type = T;                                                            \\\n  };                                                                           \\\n  template <template <typename...> class L1, typename... E1,                   \\\n            template <typename...> class L2, typename... E2, typename... Rest> \\\n  struct append<L1<E1...>, L2<E2...>, Rest...> {                               \\\n    using type = typename append<L1<E1..., E2...>, Rest...>::type;             \\\n  };                                                                           \\\n  template <template <typename...> class L1, typename... E1, typename... 
Rest> \\\n  struct append<L1<E1...>, TypeList<mpl_::na>, Rest...> {                      \\\n    using type = L1<E1...>;                                                    \\\n  };                                                                           \\\n                                                                               \\\n  template <template <typename...> class Container,                            \\\n            template <typename...> class List, typename... elems>              \\\n  struct rewrap<TemplateTypeList<Container>, List<elems...>> {                 \\\n    using type = TypeList<Container<elems...>>;                                \\\n  };                                                                           \\\n  template <template <typename...> class Container,                            \\\n            template <typename...> class List, class... Elems,                 \\\n            typename... Elements>                                              \\\n  struct rewrap<TemplateTypeList<Container>, List<Elems...>, Elements...> {    \\\n    using type = typename append<TypeList<Container<Elems...>>,                \\\n                                 typename rewrap<TemplateTypeList<Container>,  \\\n                                                 Elements...>::type>::type;    \\\n  };                                                                           \\\n                                                                               \\\n  template <template <typename...> class Final,                                \\\n            template <typename...> class... Containers, typename... 
Types>     \\\n  struct create<Final, TemplateTypeList<Containers...>, TypeList<Types...>> {  \\\n    using type =                                                               \\\n        typename append<Final<>, typename rewrap<TemplateTypeList<Containers>, \\\n                                                 Types...>::type...>::type;    \\\n  };                                                                           \\\n  template <template <typename...> class Final,                                \\\n            template <typename...> class List, typename... Ts>                 \\\n  struct convert<Final, List<Ts...>> {                                         \\\n    using type = typename append<Final<>, TypeList<Ts>...>::type;              \\\n  };\n\n#define INTERNAL_CATCH_NTTP_1(signature, ...)                                  \\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  struct Nttp {};                                                              \\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  constexpr auto get_wrapper() noexcept->Nttp<__VA_ARGS__> {                   \\\n    return {};                                                                 \\\n  }                                                                            \\\n  template <template <INTERNAL_CATCH_REMOVE_PARENS(signature)> class...>       \\\n  struct NttpTemplateTypeList {};                                              \\\n  template <template <INTERNAL_CATCH_REMOVE_PARENS(signature)> class... 
Cs>    \\\n  constexpr auto get_wrapper() noexcept->NttpTemplateTypeList<Cs...> {         \\\n    return {};                                                                 \\\n  }                                                                            \\\n                                                                               \\\n  template <template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                 \\\n            class Container,                                                   \\\n            template <INTERNAL_CATCH_REMOVE_PARENS(signature)> class List,     \\\n            INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  struct rewrap<NttpTemplateTypeList<Container>, List<__VA_ARGS__>> {          \\\n    using type = TypeList<Container<__VA_ARGS__>>;                             \\\n  };                                                                           \\\n  template <template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                 \\\n            class Container,                                                   \\\n            template <INTERNAL_CATCH_REMOVE_PARENS(signature)> class List,     \\\n            INTERNAL_CATCH_REMOVE_PARENS(signature), typename... 
Elements>     \\\n  struct rewrap<NttpTemplateTypeList<Container>, List<__VA_ARGS__>,            \\\n                Elements...> {                                                 \\\n    using type =                                                               \\\n        typename append<TypeList<Container<__VA_ARGS__>>,                      \\\n                        typename rewrap<NttpTemplateTypeList<Container>,       \\\n                                        Elements...>::type>::type;             \\\n  };                                                                           \\\n  template <template <typename...> class Final,                                \\\n            template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                 \\\n            class... Containers,                                               \\\n            typename... Types>                                                 \\\n  struct create<Final, NttpTemplateTypeList<Containers...>,                    \\\n                TypeList<Types...>> {                                          \\\n    using type =                                                               \\\n        typename append<Final<>,                                               \\\n                        typename rewrap<NttpTemplateTypeList<Containers>,      \\\n                                        Types...>::type...>::type;             \\\n  };\n\n#define INTERNAL_CATCH_DECLARE_SIG_TEST0(TestName)\n#define INTERNAL_CATCH_DECLARE_SIG_TEST1(TestName, signature)                  \\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  static void TestName()\n#define INTERNAL_CATCH_DECLARE_SIG_TEST_X(TestName, signature, ...)            
\\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  static void TestName()\n\n#define INTERNAL_CATCH_DEFINE_SIG_TEST0(TestName)\n#define INTERNAL_CATCH_DEFINE_SIG_TEST1(TestName, signature)                   \\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  static void TestName()\n#define INTERNAL_CATCH_DEFINE_SIG_TEST_X(TestName, signature, ...)             \\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  static void TestName()\n\n#define INTERNAL_CATCH_NTTP_REGISTER0(TestFunc, signature)                     \\\n  template <typename Type>                                                     \\\n  void reg_test(TypeList<Type>, Catch::NameAndTags nameAndTags) {              \\\n    Catch::AutoReg(Catch::makeTestInvoker(&TestFunc<Type>),                    \\\n                   CATCH_INTERNAL_LINEINFO, Catch::StringRef(), nameAndTags);  \\\n  }\n\n#define INTERNAL_CATCH_NTTP_REGISTER(TestFunc, signature, ...)                 \\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  void reg_test(Nttp<__VA_ARGS__>, Catch::NameAndTags nameAndTags) {           \\\n    Catch::AutoReg(Catch::makeTestInvoker(&TestFunc<__VA_ARGS__>),             \\\n                   CATCH_INTERNAL_LINEINFO, Catch::StringRef(), nameAndTags);  \\\n  }\n\n#define INTERNAL_CATCH_NTTP_REGISTER_METHOD0(TestName, signature, ...)         \\\n  template <typename Type>                                                     \\\n  void reg_test(TypeList<Type>, Catch::StringRef className,                    \\\n                Catch::NameAndTags nameAndTags) {                              \\\n    Catch::AutoReg(Catch::makeTestInvoker(&TestName<Type>::test),              \\\n                   CATCH_INTERNAL_LINEINFO, className, nameAndTags);           \\\n  }\n\n#define INTERNAL_CATCH_NTTP_REGISTER_METHOD(TestName, signature, ...)          
\\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  void reg_test(Nttp<__VA_ARGS__>, Catch::StringRef className,                 \\\n                Catch::NameAndTags nameAndTags) {                              \\\n    Catch::AutoReg(Catch::makeTestInvoker(&TestName<__VA_ARGS__>::test),       \\\n                   CATCH_INTERNAL_LINEINFO, className, nameAndTags);           \\\n  }\n\n#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD0(TestName, ClassName)\n#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD1(TestName, ClassName,           \\\n                                                signature)                     \\\n  template <typename TestType>                                                 \\\n  struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName)<TestType> {        \\\n    void test();                                                               \\\n  }\n\n#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X(TestName, ClassName,          \\\n                                                 signature, ...)               \\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName)<__VA_ARGS__> {     \\\n    void test();                                                               \\\n  }\n\n#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD0(TestName)\n#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD1(TestName, signature)            \\\n  template <typename TestType>                                                 \\\n  void INTERNAL_CATCH_MAKE_NAMESPACE(TestName)::TestName<TestType>::test()\n#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X(TestName, signature, ...)      
\\\n  template <INTERNAL_CATCH_REMOVE_PARENS(signature)>                           \\\n  void INTERNAL_CATCH_MAKE_NAMESPACE(TestName)::TestName<__VA_ARGS__>::test()\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_NTTP_0\n#define INTERNAL_CATCH_NTTP_GEN(...)                                           \\\n  INTERNAL_CATCH_VA_NARGS_IMPL(                                                \\\n      __VA_ARGS__, INTERNAL_CATCH_NTTP_1(__VA_ARGS__),                         \\\n      INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_1(__VA_ARGS__),  \\\n      INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_1(__VA_ARGS__),  \\\n      INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_1(__VA_ARGS__),  \\\n      INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_1(__VA_ARGS__),  \\\n      INTERNAL_CATCH_NTTP_1(__VA_ARGS__), INTERNAL_CATCH_NTTP_0)\n#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(TestName, ...)                   \\\n  INTERNAL_CATCH_VA_NARGS_IMPL(\"dummy\", __VA_ARGS__,                           \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,        \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,        \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,        \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,        \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,        \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,        \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,        \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,        \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,        \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD1,         \\\n                               INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD0)         \\\n  (TestName, 
__VA_ARGS__)\n#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD(TestName, ClassName, ...)       \\\n  INTERNAL_CATCH_VA_NARGS_IMPL(\"dummy\", __VA_ARGS__,                           \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,       \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,       \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,       \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,       \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,       \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,       \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,       \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,       \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,       \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD1,        \\\n                               INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD0)        \\\n  (TestName, ClassName, __VA_ARGS__)\n#define INTERNAL_CATCH_NTTP_REG_METHOD_GEN(TestName, ...)                      
\\\n  INTERNAL_CATCH_VA_NARGS_IMPL(\"dummy\", __VA_ARGS__,                           \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD,            \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD,            \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD,            \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD,            \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD,            \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD,            \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD,            \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD,            \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD,            \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD0,           \\\n                               INTERNAL_CATCH_NTTP_REGISTER_METHOD0)           \\\n  (TestName, __VA_ARGS__)\n#define INTERNAL_CATCH_NTTP_REG_GEN(TestFunc, ...)                             \\\n  INTERNAL_CATCH_VA_NARGS_IMPL(                                                \\\n      \"dummy\", __VA_ARGS__, INTERNAL_CATCH_NTTP_REGISTER,                      \\\n      INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER,              \\\n      INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER,              \\\n      INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER,              \\\n      INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER,              \\\n      INTERNAL_CATCH_NTTP_REGISTER0, INTERNAL_CATCH_NTTP_REGISTER0)            \\\n  (TestFunc, __VA_ARGS__)\n#define INTERNAL_CATCH_DEFINE_SIG_TEST(TestName, ...)                          
\\\n  INTERNAL_CATCH_VA_NARGS_IMPL(                                                \\\n      \"dummy\", __VA_ARGS__, INTERNAL_CATCH_DEFINE_SIG_TEST_X,                  \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,      \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,      \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,      \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,      \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST1, INTERNAL_CATCH_DEFINE_SIG_TEST0)        \\\n  (TestName, __VA_ARGS__)\n#define INTERNAL_CATCH_DECLARE_SIG_TEST(TestName, ...)                         \\\n  INTERNAL_CATCH_VA_NARGS_IMPL(                                                \\\n      \"dummy\", __VA_ARGS__, INTERNAL_CATCH_DECLARE_SIG_TEST_X,                 \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X,    \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X,    \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,     \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X,    \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST1, INTERNAL_CATCH_DECLARE_SIG_TEST0)      \\\n  (TestName, __VA_ARGS__)\n#define INTERNAL_CATCH_REMOVE_PARENS_GEN(...)                                  
\\\n  INTERNAL_CATCH_VA_NARGS_IMPL(                                                \\\n      __VA_ARGS__, INTERNAL_CATCH_REMOVE_PARENS_11_ARG,                        \\\n      INTERNAL_CATCH_REMOVE_PARENS_10_ARG, INTERNAL_CATCH_REMOVE_PARENS_9_ARG, \\\n      INTERNAL_CATCH_REMOVE_PARENS_8_ARG, INTERNAL_CATCH_REMOVE_PARENS_7_ARG,  \\\n      INTERNAL_CATCH_REMOVE_PARENS_6_ARG, INTERNAL_CATCH_REMOVE_PARENS_5_ARG,  \\\n      INTERNAL_CATCH_REMOVE_PARENS_4_ARG, INTERNAL_CATCH_REMOVE_PARENS_3_ARG,  \\\n      INTERNAL_CATCH_REMOVE_PARENS_2_ARG, INTERNAL_CATCH_REMOVE_PARENS_1_ARG)  \\\n  (__VA_ARGS__)\n#else\n#define INTERNAL_CATCH_NTTP_0(signature)\n#define INTERNAL_CATCH_NTTP_GEN(...)                                           \\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(                    \\\n      __VA_ARGS__, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1,               \\\n      INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1,     \\\n      INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1,     \\\n      INTERNAL_CATCH_NTTP_1, INTERNAL_CATCH_NTTP_1,                            \\\n      INTERNAL_CATCH_NTTP_0)(__VA_ARGS__))\n#define INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(TestName, ...)                   
\\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(                    \\\n      \"dummy\", __VA_ARGS__, INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,           \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,                                 \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,                                 \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,                                 \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,                                 \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,                                 \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,                                 \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,                                 \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD_X,                                 \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD1,                                  \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD0)(TestName, __VA_ARGS__))\n#define INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD(TestName, ClassName, ...)       
\\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(                    \\\n      \"dummy\", __VA_ARGS__, INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,          \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,                                \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,                                \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,                                \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,                                \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,                                \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,                                \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,                                \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD_X,                                \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD1,                                 \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD0)(TestName, ClassName,            \\\n                                               __VA_ARGS__))\n#define INTERNAL_CATCH_NTTP_REG_METHOD_GEN(TestName, ...)                      
\\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(                    \\\n      \"dummy\", __VA_ARGS__, INTERNAL_CATCH_NTTP_REGISTER_METHOD,               \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD,                                     \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD,                                     \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD,                                     \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD,                                     \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD,                                     \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD,                                     \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD,                                     \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD,                                     \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD0,                                    \\\n      INTERNAL_CATCH_NTTP_REGISTER_METHOD0)(TestName, __VA_ARGS__))\n#define INTERNAL_CATCH_NTTP_REG_GEN(TestFunc, ...)                             \\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(                    \\\n      \"dummy\", __VA_ARGS__, INTERNAL_CATCH_NTTP_REGISTER,                      \\\n      INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER,              \\\n      INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER,              \\\n      INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER,              \\\n      INTERNAL_CATCH_NTTP_REGISTER, INTERNAL_CATCH_NTTP_REGISTER,              \\\n      INTERNAL_CATCH_NTTP_REGISTER0,                                           \\\n      INTERNAL_CATCH_NTTP_REGISTER0)(TestFunc, __VA_ARGS__))\n#define INTERNAL_CATCH_DEFINE_SIG_TEST(TestName, ...)                          
\\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(                    \\\n      \"dummy\", __VA_ARGS__, INTERNAL_CATCH_DEFINE_SIG_TEST_X,                  \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,      \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,      \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,      \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,      \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST1,                                         \\\n      INTERNAL_CATCH_DEFINE_SIG_TEST0)(TestName, __VA_ARGS__))\n#define INTERNAL_CATCH_DECLARE_SIG_TEST(TestName, ...)                         \\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(                    \\\n      \"dummy\", __VA_ARGS__, INTERNAL_CATCH_DECLARE_SIG_TEST_X,                 \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X,    \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X,    \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DEFINE_SIG_TEST_X,     \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST_X, INTERNAL_CATCH_DECLARE_SIG_TEST_X,    \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST1,                                        \\\n      INTERNAL_CATCH_DECLARE_SIG_TEST0)(TestName, __VA_ARGS__))\n#define INTERNAL_CATCH_REMOVE_PARENS_GEN(...)                                  
\\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_VA_NARGS_IMPL(                    \\\n      __VA_ARGS__, INTERNAL_CATCH_REMOVE_PARENS_11_ARG,                        \\\n      INTERNAL_CATCH_REMOVE_PARENS_10_ARG, INTERNAL_CATCH_REMOVE_PARENS_9_ARG, \\\n      INTERNAL_CATCH_REMOVE_PARENS_8_ARG, INTERNAL_CATCH_REMOVE_PARENS_7_ARG,  \\\n      INTERNAL_CATCH_REMOVE_PARENS_6_ARG, INTERNAL_CATCH_REMOVE_PARENS_5_ARG,  \\\n      INTERNAL_CATCH_REMOVE_PARENS_4_ARG, INTERNAL_CATCH_REMOVE_PARENS_3_ARG,  \\\n      INTERNAL_CATCH_REMOVE_PARENS_2_ARG,                                      \\\n      INTERNAL_CATCH_REMOVE_PARENS_1_ARG)(__VA_ARGS__))\n#endif\n\n// end catch_preprocessor.hpp\n// start catch_meta.hpp\n\n#include <type_traits>\n\nnamespace Catch {\ntemplate <typename T>\nstruct always_false : std::false_type {};\n\ntemplate <typename>\nstruct true_given : std::true_type {};\nstruct is_callable_tester {\n  template <typename Fun, typename... Args>\n  true_given<\n      decltype(std::declval<Fun>()(std::declval<Args>()...))> static test(int);\n  template <typename...>\n  std::false_type static test(...);\n};\n\ntemplate <typename T>\nstruct is_callable;\n\ntemplate <typename Fun, typename... Args>\nstruct is_callable<Fun(Args...)>\n    : decltype(is_callable_tester::test<Fun, Args...>(0)) {};\n\n#if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703\n// std::result_of is deprecated in C++17 and removed in C++20. Hence, it is\n// replaced with std::invoke_result here.\ntemplate <typename Func, typename... U>\nusing FunctionReturnType =\n    std::remove_reference_t<std::remove_cv_t<std::invoke_result_t<Func, U...>>>;\n#else\n// Keep ::type here because we still support C++11\ntemplate <typename Func, typename... 
U>\nusing FunctionReturnType =\n    typename std::remove_reference<typename std::remove_cv<\n        typename std::result_of<Func(U...)>::type>::type>::type;\n#endif\n\n} // namespace Catch\n\nnamespace mpl_ {\nstruct na;\n}\n\n// end catch_meta.hpp\nnamespace Catch {\n\ntemplate <typename C>\nclass TestInvokerAsMethod : public ITestInvoker {\n  void (C::*m_testAsMethod)();\n\npublic:\n  TestInvokerAsMethod(void (C::*testAsMethod)()) noexcept\n      : m_testAsMethod(testAsMethod) {}\n\n  void invoke() const override {\n    C obj;\n    (obj.*m_testAsMethod)();\n  }\n};\n\nauto makeTestInvoker(void (*testAsFunction)()) noexcept -> ITestInvoker*;\n\ntemplate <typename C>\nauto makeTestInvoker(void (C::*testAsMethod)()) noexcept -> ITestInvoker* {\n  return new (std::nothrow) TestInvokerAsMethod<C>(testAsMethod);\n}\n\nstruct NameAndTags {\n  NameAndTags(StringRef const& name_ = StringRef(),\n              StringRef const& tags_ = StringRef()) noexcept;\n  StringRef name;\n  StringRef tags;\n};\n\nstruct AutoReg : NonCopyable {\n  AutoReg(ITestInvoker* invoker, SourceLineInfo const& lineInfo,\n          StringRef const& classOrMethod,\n          NameAndTags const& nameAndTags) noexcept;\n  ~AutoReg();\n};\n\n} // end namespace Catch\n\n#if defined(CATCH_CONFIG_DISABLE)\n#define INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(TestName, ...)                 \\\n  static void TestName()\n#define INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(TestName, ClassName,    \\\n                                                       ...)                    
\\\n  namespace {                                                                  \\\n  struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName) {                  \\\n    void test();                                                               \\\n  };                                                                           \\\n  }                                                                            \\\n  void TestName::test()\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2(                   \\\n    TestName, TestFunc, Name, Tags, Signature, ...)                            \\\n  INTERNAL_CATCH_DEFINE_SIG_TEST(TestFunc,                                     \\\n                                 INTERNAL_CATCH_REMOVE_PARENS(Signature))\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2(            \\\n    TestNameClass, TestName, ClassName, Name, Tags, Signature, ...)            \\\n  namespace {                                                                  \\\n  namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) {                          \\\n    INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD(                                    \\\n        TestName, ClassName, INTERNAL_CATCH_REMOVE_PARENS(Signature));         \\\n  }                                                                            \\\n  }                                                                            \\\n  INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(                                       \\\n      TestName, INTERNAL_CATCH_REMOVE_PARENS(Signature))\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(Name, Tags, ...)     
\\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2(                         \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, typename TestType, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(Name, Tags, ...)     \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2(                     \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),     \\\n          Name, Tags, typename TestType, __VA_ARGS__))\n#endif\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(Name, Tags,      \\\n                                                              Signature, ...)  \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2(                         \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, Signature, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(Name, Tags,      \\\n                                                              Signature, ...)  
\\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION_2(                     \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),     \\\n          Name, Tags, Signature, __VA_ARGS__))\n#endif\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(              \\\n    ClassName, Name, Tags, ...)                                                \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2(                  \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____),       \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      ClassName, Name, Tags, typename T, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(              \\\n    ClassName, Name, Tags, ...)                                                
\\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2(              \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____),   \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                \\\n          ClassName, Name, Tags, typename T, __VA_ARGS__))\n#endif\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(          \\\n    ClassName, Name, Tags, Signature, ...)                                     \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2(                  \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____),       \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      ClassName, Name, Tags, Signature, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(          \\\n    ClassName, Name, Tags, Signature, ...)                                     
\\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION_2(              \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____),   \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                \\\n          ClassName, Name, Tags, Signature, __VA_ARGS__))\n#endif\n#endif\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_TESTCASE2(TestName, ...)                                \\\n  static void TestName();                                                      \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  namespace {                                                                  \\\n  Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME(autoRegistrar)(                    \\\n      Catch::makeTestInvoker(&TestName), CATCH_INTERNAL_LINEINFO,              \\\n      Catch::StringRef(), Catch::NameAndTags{__VA_ARGS__});                    \\\n  } /* NOLINT */                                                               \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                     \\\n  static void TestName()\n#define INTERNAL_CATCH_TESTCASE(...)                                           \\\n  INTERNAL_CATCH_TESTCASE2(                                                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____), __VA_ARGS__)\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_METHOD_AS_TEST_CASE(QualifiedMethod, ...)               
\\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  namespace {                                                                  \\\n  Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME(autoRegistrar)(                    \\\n      Catch::makeTestInvoker(&QualifiedMethod), CATCH_INTERNAL_LINEINFO,       \\\n      \"&\" #QualifiedMethod, Catch::NameAndTags{__VA_ARGS__});                  \\\n  } /* NOLINT */                                                               \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_TEST_CASE_METHOD2(TestName, ClassName, ...)             \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  namespace {                                                                  \\\n  struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName) {                  \\\n    void test();                                                               \\\n  };                                                                           \\\n  Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME(autoRegistrar)(                    \\\n      Catch::makeTestInvoker(&TestName::test), CATCH_INTERNAL_LINEINFO,        \\\n      #ClassName, Catch::NameAndTags{__VA_ARGS__}); /* NOLINT */               \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                     \\\n  void TestName::test()\n#define INTERNAL_CATCH_TEST_CASE_METHOD(ClassName, ...)                        
\\\n  INTERNAL_CATCH_TEST_CASE_METHOD2(                                            \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____), ClassName,     \\\n      __VA_ARGS__)\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_REGISTER_TESTCASE(Function, ...)                        \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  Catch::AutoReg INTERNAL_CATCH_UNIQUE_NAME(autoRegistrar)(                    \\\n      Catch::makeTestInvoker(Function), CATCH_INTERNAL_LINEINFO,               \\\n      Catch::StringRef(), Catch::NameAndTags{__VA_ARGS__}); /* NOLINT */       \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_2(TestName, TestFunc, Name, Tags,    \\\n                                            Signature, ...)                    
\\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS                               \\\n  CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS                             \\\n  INTERNAL_CATCH_DECLARE_SIG_TEST(TestFunc,                                    \\\n                                  INTERNAL_CATCH_REMOVE_PARENS(Signature));    \\\n  namespace {                                                                  \\\n  namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) {                          \\\n    INTERNAL_CATCH_TYPE_GEN                                                    \\\n    INTERNAL_CATCH_NTTP_GEN(INTERNAL_CATCH_REMOVE_PARENS(Signature))           \\\n    INTERNAL_CATCH_NTTP_REG_GEN(TestFunc,                                      \\\n                                INTERNAL_CATCH_REMOVE_PARENS(Signature))       \\\n    template <typename... 
Types>                                               \\\n    struct TestName {                                                          \\\n      TestName() {                                                             \\\n        int index                          = 0;                                \\\n        constexpr char const* tmpl_types[] = {CATCH_REC_LIST(                  \\\n            INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};            \\\n        using expander                     = int[];                            \\\n        (void)expander{                                                        \\\n            (reg_test(Types{},                                                 \\\n                      Catch::NameAndTags{                                      \\\n                          Name \" - \" + std::string(tmpl_types[index]), Tags}), \\\n             index++)...}; /* NOLINT */                                        \\\n      }                                                                        \\\n    };                                                                         \\\n    static int INTERNAL_CATCH_UNIQUE_NAME(globalRegistrar) = []() {            \\\n      TestName<INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(__VA_ARGS__)>();      \\\n      return 0;                                                                \\\n    }();                                                                       \\\n  }                                                                            \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                     \\\n  INTERNAL_CATCH_DEFINE_SIG_TEST(TestFunc,                                     \\\n                                 INTERNAL_CATCH_REMOVE_PARENS(Signature))\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE(Name, Tags, ...)           
          \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_2(                                         \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, typename TestType, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE(Name, Tags, ...)                     \\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_TEMPLATE_TEST_CASE_2(             \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, typename TestType, __VA_ARGS__))\n#endif\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(Name, Tags, Signature, ...)      \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_2(                                         \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, Signature, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(Name, Tags, Signature, ...)      
\\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_TEMPLATE_TEST_CASE_2(             \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, Signature, __VA_ARGS__))\n#endif\n\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(                            \\\n    TestName, TestFuncName, Name, Tags, Signature, TmplTypes, TypesList)       \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS                               \\\n  CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS                             \\\n  template <typename TestType>                                                 \\\n  static void TestFuncName();                                                  \\\n  namespace {                                                                  \\\n  namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) {                          \\\n    INTERNAL_CATCH_TYPE_GEN                                                    \\\n    INTERNAL_CATCH_NTTP_GEN(INTERNAL_CATCH_REMOVE_PARENS(Signature))           \\\n    template <typename... 
Types>                                               \\\n    struct TestName {                                                          \\\n      void reg_tests() {                                                       \\\n        int index                          = 0;                                \\\n        using expander                     = int[];                            \\\n        constexpr char const* tmpl_types[] = {                                 \\\n            CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS,            \\\n                           INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};          \\\n        constexpr char const* types_list[] = {                                 \\\n            CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS,            \\\n                           INTERNAL_CATCH_REMOVE_PARENS(TypesList))};          \\\n        constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]); \\\n        (void)expander{                                                        \\\n            (Catch::AutoReg(                                                   \\\n                 Catch::makeTestInvoker(&TestFuncName<Types>),                 \\\n                 CATCH_INTERNAL_LINEINFO, Catch::StringRef(),                  \\\n                 Catch::NameAndTags{                                           \\\n                     Name \" - \" + std::string(tmpl_types[index / num_types]) + \\\n                         \"<\" + std::string(types_list[index % num_types]) +    \\\n                         \">\",                                                  \\\n                     Tags}),                                                   \\\n             index++)...}; /* NOLINT */                                        \\\n      }                                                                        \\\n    };                                                                         \\\n    static int 
INTERNAL_CATCH_UNIQUE_NAME(globalRegistrar) = []() {            \\\n      using TestInit = typename create<                                        \\\n          TestName,                                                            \\\n          decltype(get_wrapper<INTERNAL_CATCH_REMOVE_PARENS(TmplTypes)>()),    \\\n          TypeList<INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(                  \\\n              INTERNAL_CATCH_REMOVE_PARENS(TypesList))>>::type;                \\\n      TestInit t;                                                              \\\n      t.reg_tests();                                                           \\\n      return 0;                                                                \\\n    }();                                                                       \\\n  }                                                                            \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                     \\\n  template <typename TestType>                                                 \\\n  static void TestFuncName()\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(Name, Tags, ...)             \\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(                                  \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, typename T, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(Name, Tags, ...)             
\\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(      \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, typename T, __VA_ARGS__))\n#endif\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(Name, Tags, Signature,   \\\n                                                      ...)                     \\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(                                  \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, Signature, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(Name, Tags, Signature,   \\\n                                                      ...)                     
\\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE2(      \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, Signature, __VA_ARGS__))\n#endif\n\n#define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2(TestName, TestFunc, Name,     \\\n                                                 Tags, TmplList)               \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS                             \\\n  template <typename TestType>                                                 \\\n  static void TestFunc();                                                      \\\n  namespace {                                                                  \\\n  namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) {                          \\\n    INTERNAL_CATCH_TYPE_GEN                                                    \\\n    template <typename... 
Types>                                               \\\n    struct TestName {                                                          \\\n      void reg_tests() {                                                       \\\n        int index      = 0;                                                    \\\n        using expander = int[];                                                \\\n        (void)expander{                                                        \\\n            (Catch::AutoReg(                                                   \\\n                 Catch::makeTestInvoker(&TestFunc<Types>),                     \\\n                 CATCH_INTERNAL_LINEINFO, Catch::StringRef(),                  \\\n                 Catch::NameAndTags{                                           \\\n                     Name \" - \" +                                              \\\n                         std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) +     \\\n                         \" - \" + std::to_string(index),                        \\\n                     Tags}),                                                   \\\n             index++)...}; /* NOLINT */                                        \\\n      }                                                                        \\\n    };                                                                         \\\n    static int INTERNAL_CATCH_UNIQUE_NAME(globalRegistrar) = []() {            \\\n      using TestInit = typename convert<TestName, TmplList>::type;             \\\n      TestInit t;                                                              \\\n      t.reg_tests();                                                           \\\n      return 0;                                                                \\\n    }();                                                                       \\\n  }                                                                            \\\n  }                           
                                                 \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                     \\\n  template <typename TestType>                                                 \\\n  static void TestFunc()\n\n#define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE(Name, Tags, TmplList)           \\\n  INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_2(                                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      Name, Tags, TmplList)\n\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2(                            \\\n    TestNameClass, TestName, ClassName, Name, Tags, Signature, ...)            \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS                               \\\n  CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS                             \\\n  namespace {                                                                  \\\n  namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) {                          \\\n    INTERNAL_CATCH_TYPE_GEN                                                    \\\n    INTERNAL_CATCH_NTTP_GEN(INTERNAL_CATCH_REMOVE_PARENS(Signature))           \\\n    INTERNAL_CATCH_DECLARE_SIG_TEST_METHOD(                                    \\\n        TestName, ClassName, INTERNAL_CATCH_REMOVE_PARENS(Signature));         \\\n    INTERNAL_CATCH_NTTP_REG_METHOD_GEN(                                        \\\n        TestName, INTERNAL_CATCH_REMOVE_PARENS(Signature))                     \\\n    template <typename... 
Types>                                               \\\n    struct TestNameClass {                                                     \\\n      TestNameClass() {                                                        \\\n        int index                          = 0;                                \\\n        constexpr char const* tmpl_types[] = {CATCH_REC_LIST(                  \\\n            INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS, __VA_ARGS__)};            \\\n        using expander                     = int[];                            \\\n        (void)expander{                                                        \\\n            (reg_test(Types{}, #ClassName,                                     \\\n                      Catch::NameAndTags{                                      \\\n                          Name \" - \" + std::string(tmpl_types[index]), Tags}), \\\n             index++)...}; /* NOLINT */                                        \\\n      }                                                                        \\\n    };                                                                         \\\n    static int INTERNAL_CATCH_UNIQUE_NAME(globalRegistrar) = []() {            \\\n      TestNameClass<INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(__VA_ARGS__)>(); \\\n      return 0;                                                                \\\n    }();                                                                       \\\n  }                                                                            \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                     \\\n  INTERNAL_CATCH_DEFINE_SIG_TEST_METHOD(                                       \\\n      TestName, INTERNAL_CATCH_REMOVE_PARENS(Signature))\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD(ClassName, Name, Tags, ...)   
\\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2(                                  \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____),       \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      ClassName, Name, Tags, typename T, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD(ClassName, Name, Tags, ...)   \\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2(      \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____),       \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      ClassName, Name, Tags, typename T, __VA_ARGS__))\n#endif\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(ClassName, Name, Tags,    \\\n                                                     Signature, ...)           \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2(                                  \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____),       \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      ClassName, Name, Tags, Signature, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(ClassName, Name, Tags,    \\\n                                                     Signature, ...)           
\\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_2(      \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____C_L_A_S_S____),       \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      ClassName, Name, Tags, Signature, __VA_ARGS__))\n#endif\n\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2(                    \\\n    TestNameClass, TestName, ClassName, Name, Tags, Signature, TmplTypes,      \\\n    TypesList)                                                                 \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS                               \\\n  CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS                             \\\n  template <typename TestType>                                                 \\\n  struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName<TestType>) {        \\\n    void test();                                                               \\\n  };                                                                           \\\n  namespace {                                                                  \\\n  namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestNameClass) {                     \\\n    INTERNAL_CATCH_TYPE_GEN                                                    \\\n    INTERNAL_CATCH_NTTP_GEN(INTERNAL_CATCH_REMOVE_PARENS(Signature))           \\\n    template <typename... 
Types>                                               \\\n    struct TestNameClass {                                                     \\\n      void reg_tests() {                                                       \\\n        int index                          = 0;                                \\\n        using expander                     = int[];                            \\\n        constexpr char const* tmpl_types[] = {                                 \\\n            CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS,            \\\n                           INTERNAL_CATCH_REMOVE_PARENS(TmplTypes))};          \\\n        constexpr char const* types_list[] = {                                 \\\n            CATCH_REC_LIST(INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS,            \\\n                           INTERNAL_CATCH_REMOVE_PARENS(TypesList))};          \\\n        constexpr auto num_types = sizeof(types_list) / sizeof(types_list[0]); \\\n        (void)expander{                                                        \\\n            (Catch::AutoReg(                                                   \\\n                 Catch::makeTestInvoker(&TestName<Types>::test),               \\\n                 CATCH_INTERNAL_LINEINFO, #ClassName,                          \\\n                 Catch::NameAndTags{                                           \\\n                     Name \" - \" + std::string(tmpl_types[index / num_types]) + \\\n                         \"<\" + std::string(types_list[index % num_types]) +    \\\n                         \">\",                                                  \\\n                     Tags}),                                                   \\\n             index++)...}; /* NOLINT */                                        \\\n      }                                                                        \\\n    };                                                                         \\\n    static int 
INTERNAL_CATCH_UNIQUE_NAME(globalRegistrar) = []() {            \\\n      using TestInit = typename create<                                        \\\n          TestNameClass,                                                       \\\n          decltype(get_wrapper<INTERNAL_CATCH_REMOVE_PARENS(TmplTypes)>()),    \\\n          TypeList<INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(                  \\\n              INTERNAL_CATCH_REMOVE_PARENS(TypesList))>>::type;                \\\n      TestInit t;                                                              \\\n      t.reg_tests();                                                           \\\n      return 0;                                                                \\\n    }();                                                                       \\\n  }                                                                            \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                     \\\n  template <typename TestType>                                                 \\\n  void TestName<TestType>::test()\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(ClassName, Name,      \\\n                                                         Tags, ...)            
\\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2(                          \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      ClassName, Name, Tags, typename T, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(ClassName, Name,      \\\n                                                         Tags, ...)            \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2(                      \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),     \\\n          ClassName, Name, Tags, typename T, __VA_ARGS__))\n#endif\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(                  \\\n    ClassName, Name, Tags, Signature, ...)                                     
\\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2(                          \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      ClassName, Name, Tags, Signature, __VA_ARGS__)\n#else\n#define INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(                  \\\n    ClassName, Name, Tags, Signature, ...)                                     \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_2(                      \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                \\\n          INTERNAL_CATCH_UNIQUE_NAME(                                          \\\n              ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),     \\\n          ClassName, Name, Tags, Signature, __VA_ARGS__))\n#endif\n\n#define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2(                       \\\n    TestNameClass, TestName, ClassName, Name, Tags, TmplList)                  \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS                             \\\n  template <typename TestType>                                                 \\\n  struct TestName : INTERNAL_CATCH_REMOVE_PARENS(ClassName<TestType>) {        \\\n    void test();                                                               \\\n  };                                                                           \\\n  namespace {                           
                                       \\\n  namespace INTERNAL_CATCH_MAKE_NAMESPACE(TestName) {                          \\\n    INTERNAL_CATCH_TYPE_GEN                                                    \\\n    template <typename... Types>                                               \\\n    struct TestNameClass {                                                     \\\n      void reg_tests() {                                                       \\\n        int index      = 0;                                                    \\\n        using expander = int[];                                                \\\n        (void)expander{                                                        \\\n            (Catch::AutoReg(                                                   \\\n                 Catch::makeTestInvoker(&TestName<Types>::test),               \\\n                 CATCH_INTERNAL_LINEINFO, #ClassName,                          \\\n                 Catch::NameAndTags{                                           \\\n                     Name \" - \" +                                              \\\n                         std::string(INTERNAL_CATCH_STRINGIZE(TmplList)) +     \\\n                         \" - \" + std::to_string(index),                        \\\n                     Tags}),                                                   \\\n             index++)...}; /* NOLINT */                                        \\\n      }                                                                        \\\n    };                                                                         \\\n    static int INTERNAL_CATCH_UNIQUE_NAME(globalRegistrar) = []() {            \\\n      using TestInit = typename convert<TestNameClass, TmplList>::type;        \\\n      TestInit t;                                                              \\\n      t.reg_tests();                                                           \\\n      return 0;                             
                                   \\\n    }();                                                                       \\\n  }                                                                            \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                     \\\n  template <typename TestType>                                                 \\\n  void TestName<TestType>::test()\n\n#define INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD(ClassName, Name, Tags,   \\\n                                                      TmplList)                \\\n  INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD_2(                             \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____),                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(                                              \\\n          ____C_A_T_C_H____T_E_M_P_L_A_T_E____T_E_S_T____F_U_N_C____),         \\\n      ClassName, Name, Tags, TmplList)\n\n// end catch_test_registry.h\n// start catch_capture.hpp\n\n// start catch_assertionhandler.h\n\n// start catch_assertioninfo.h\n\n// start catch_result_type.h\n\nnamespace Catch {\n\n// ResultWas::OfType enum\nstruct ResultWas {\n  enum OfType {\n    Unknown = -1,\n    Ok      = 0,\n    Info    = 1,\n    Warning = 2,\n\n    FailureBit = 0x10,\n\n    ExpressionFailed = FailureBit | 1,\n    ExplicitFailure  = FailureBit | 2,\n\n    Exception = 0x100 | FailureBit,\n\n    ThrewException      = Exception | 1,\n    DidntThrowException = Exception | 2,\n\n    FatalErrorCondition = 0x200 | FailureBit\n\n  };\n};\n\nbool isOk(ResultWas::OfType resultType);\nbool isJustInfo(int flags);\n\n// ResultDisposition::Flags enum\nstruct ResultDisposition {\n  enum Flags {\n    Normal = 0x01,\n\n    ContinueOnFailure = 0x02, // Failures fail test, but execution continues\n    FalseTest      
   = 0x04, // Prefix expression with !\n    SuppressFail      = 0x08  // Failures are reported but do not fail the test\n  };\n};\n\nResultDisposition::Flags operator|(ResultDisposition::Flags lhs,\n                                   ResultDisposition::Flags rhs);\n\nbool shouldContinueOnFailure(int flags);\ninline bool isFalseTest(int flags) {\n  return (flags & ResultDisposition::FalseTest) != 0;\n}\nbool shouldSuppressFailure(int flags);\n\n} // end namespace Catch\n\n// end catch_result_type.h\nnamespace Catch {\n\nstruct AssertionInfo {\n  StringRef macroName;\n  SourceLineInfo lineInfo;\n  StringRef capturedExpression;\n  ResultDisposition::Flags resultDisposition;\n\n  // We want to delete this constructor but a compiler bug in 4.8 means\n  // the struct is then treated as non-aggregate\n  // AssertionInfo() = delete;\n};\n\n} // end namespace Catch\n\n// end catch_assertioninfo.h\n// start catch_decomposer.h\n\n// start catch_tostring.h\n\n#include <vector>\n#include <cstddef>\n#include <type_traits>\n#include <string>\n// start catch_stream.h\n\n#include <iosfwd>\n#include <cstddef>\n#include <ostream>\n\nnamespace Catch {\n\nstd::ostream& cout();\nstd::ostream& cerr();\nstd::ostream& clog();\n\nclass StringRef;\n\nstruct IStream {\n  virtual ~IStream();\n  virtual std::ostream& stream() const = 0;\n};\n\nauto makeStream(StringRef const& filename) -> IStream const*;\n\nclass ReusableStringStream : NonCopyable {\n  std::size_t m_index;\n  std::ostream* m_oss;\n\npublic:\n  ReusableStringStream();\n  ~ReusableStringStream();\n\n  auto str() const -> std::string;\n\n  template <typename T>\n  auto operator<<(T const& value) -> ReusableStringStream& {\n    *m_oss << value;\n    return *this;\n  }\n  auto get() -> std::ostream& { return *m_oss; }\n};\n} // namespace Catch\n\n// end catch_stream.h\n// start catch_interfaces_enum_values_registry.h\n\n#include <vector>\n\nnamespace Catch {\n\nnamespace Detail {\nstruct EnumInfo {\n  StringRef m_name;\n  
std::vector<std::pair<int, StringRef>> m_values;\n\n  ~EnumInfo();\n\n  StringRef lookup(int value) const;\n};\n} // namespace Detail\n\nstruct IMutableEnumValuesRegistry {\n  virtual ~IMutableEnumValuesRegistry();\n\n  virtual Detail::EnumInfo const&\n  registerEnum(StringRef enumName, StringRef allEnums,\n               std::vector<int> const& values) = 0;\n\n  template <typename E>\n  Detail::EnumInfo const& registerEnum(StringRef enumName, StringRef allEnums,\n                                       std::initializer_list<E> values) {\n    static_assert(sizeof(int) >= sizeof(E), \"Cannot serialize enum to int\");\n    std::vector<int> intValues;\n    intValues.reserve(values.size());\n    for (auto enumValue : values)\n      intValues.push_back(static_cast<int>(enumValue));\n    return registerEnum(enumName, allEnums, intValues);\n  }\n};\n\n} // namespace Catch\n\n// end catch_interfaces_enum_values_registry.h\n\n#ifdef CATCH_CONFIG_CPP17_STRING_VIEW\n#include <string_view>\n#endif\n\n#ifdef __OBJC__\n// start catch_objc_arc.hpp\n\n#import <Foundation/Foundation.h>\n\n#ifdef __has_feature\n#define CATCH_ARC_ENABLED __has_feature(objc_arc)\n#else\n#define CATCH_ARC_ENABLED 0\n#endif\n\nvoid arcSafeRelease(NSObject* obj);\nid performOptionalSelector(id obj, SEL sel);\n\n#if !CATCH_ARC_ENABLED\ninline void arcSafeRelease(NSObject* obj) { [obj release]; }\ninline id performOptionalSelector(id obj, SEL sel) {\n  if ([obj respondsToSelector:sel])\n    return [obj performSelector:sel];\n  return nil;\n}\n#define CATCH_UNSAFE_UNRETAINED\n#define CATCH_ARC_STRONG\n#else\ninline void arcSafeRelease(NSObject*) {}\ninline id performOptionalSelector(id obj, SEL sel) {\n#ifdef __clang__\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Warc-performSelector-leaks\"\n#endif\n  if ([obj respondsToSelector:sel])\n    return [obj performSelector:sel];\n#ifdef __clang__\n#pragma clang diagnostic pop\n#endif\n  return nil;\n}\n#define CATCH_UNSAFE_UNRETAINED 
__unsafe_unretained\n#define CATCH_ARC_STRONG __strong\n#endif\n\n// end catch_objc_arc.hpp\n#endif\n\n#ifdef _MSC_VER\n#pragma warning(push)\n#pragma warning(                                                               \\\n    disable : 4180) // We attempt to stream a function (address) by const&,\n                    // which MSVC complains about but is harmless\n#endif\n\nnamespace Catch {\nnamespace Detail {\n\nextern const std::string unprintableString;\n\nstd::string rawMemoryToString(const void* object, std::size_t size);\n\ntemplate <typename T>\nstd::string rawMemoryToString(const T& object) {\n  return rawMemoryToString(&object, sizeof(object));\n}\n\ntemplate <typename T>\nclass IsStreamInsertable {\n  template <typename Stream, typename U>\n  static auto test(int)\n      -> decltype(std::declval<Stream&>() << std::declval<U>(),\n                  std::true_type());\n\n  template <typename, typename>\n  static auto test(...) -> std::false_type;\n\npublic:\n  static const bool value = decltype(test<std::ostream, const T&>(0))::value;\n};\n\ntemplate <typename E>\nstd::string convertUnknownEnumToString(E e);\n\ntemplate <typename T>\ntypename std::enable_if<!std::is_enum<T>::value &&\n                            !std::is_base_of<std::exception, T>::value,\n                        std::string>::type\nconvertUnstreamable(T const&) {\n  return Detail::unprintableString;\n}\ntemplate <typename T>\ntypename std::enable_if<!std::is_enum<T>::value &&\n                            std::is_base_of<std::exception, T>::value,\n                        std::string>::type\nconvertUnstreamable(T const& ex) {\n  return ex.what();\n}\n\ntemplate <typename T>\ntypename std::enable_if<std::is_enum<T>::value, std::string>::type\nconvertUnstreamable(T const& value) {\n  return convertUnknownEnumToString(value);\n}\n\n#if defined(_MANAGED)\n//! 
Convert a CLR string to a utf8 std::string\ntemplate <typename T>\nstd::string clrReferenceToString(T ^ ref) {\n  if (ref == nullptr)\n    return std::string(\"null\");\n  auto bytes = System::Text::Encoding::UTF8->GetBytes(ref->ToString());\n  cli::pin_ptr<System::Byte> p = &bytes[0];\n  return std::string(reinterpret_cast<char const*>(p), bytes->Length);\n}\n#endif\n\n} // namespace Detail\n\n// If we decide for C++14, change these to enable_if_ts\ntemplate <typename T, typename = void>\nstruct StringMaker {\n  template <typename Fake = T>\n  static\n      typename std::enable_if<::Catch::Detail::IsStreamInsertable<Fake>::value,\n                              std::string>::type\n      convert(const Fake& value) {\n    ReusableStringStream rss;\n    // NB: call using the function-like syntax to avoid ambiguity with\n    // user-defined templated operator<< under clang.\n    rss.operator<<(value);\n    return rss.str();\n  }\n\n  template <typename Fake = T>\n  static\n      typename std::enable_if<!::Catch::Detail::IsStreamInsertable<Fake>::value,\n                              std::string>::type\n      convert(const Fake& value) {\n#if !defined(CATCH_CONFIG_FALLBACK_STRINGIFIER)\n    return Detail::convertUnstreamable(value);\n#else\n    return CATCH_CONFIG_FALLBACK_STRINGIFIER(value);\n#endif\n  }\n};\n\nnamespace Detail {\n\n// This function dispatches all stringification requests inside of Catch.\n// Should be preferably called fully qualified, like ::Catch::Detail::stringify\ntemplate <typename T>\nstd::string stringify(const T& e) {\n  return ::Catch::StringMaker<typename std::remove_cv<\n      typename std::remove_reference<T>::type>::type>::convert(e);\n}\n\ntemplate <typename E>\nstd::string convertUnknownEnumToString(E e) {\n  return ::Catch::Detail::stringify(\n      static_cast<typename std::underlying_type<E>::type>(e));\n}\n\n#if defined(_MANAGED)\ntemplate <typename T>\nstd::string stringify(T ^ e) {\n  return ::Catch::StringMaker<T 
^>::convert(e);\n}\n#endif\n\n} // namespace Detail\n\n// Some predefined specializations\n\ntemplate <>\nstruct StringMaker<std::string> {\n  static std::string convert(const std::string& str);\n};\n\n#ifdef CATCH_CONFIG_CPP17_STRING_VIEW\ntemplate <>\nstruct StringMaker<std::string_view> {\n  static std::string convert(std::string_view str);\n};\n#endif\n\ntemplate <>\nstruct StringMaker<char const*> {\n  static std::string convert(char const* str);\n};\ntemplate <>\nstruct StringMaker<char*> {\n  static std::string convert(char* str);\n};\n\n#ifdef CATCH_CONFIG_WCHAR\ntemplate <>\nstruct StringMaker<std::wstring> {\n  static std::string convert(const std::wstring& wstr);\n};\n\n#ifdef CATCH_CONFIG_CPP17_STRING_VIEW\ntemplate <>\nstruct StringMaker<std::wstring_view> {\n  static std::string convert(std::wstring_view str);\n};\n#endif\n\ntemplate <>\nstruct StringMaker<wchar_t const*> {\n  static std::string convert(wchar_t const* str);\n};\ntemplate <>\nstruct StringMaker<wchar_t*> {\n  static std::string convert(wchar_t* str);\n};\n#endif\n\n// TBD: Should we use `strnlen` to ensure that we don't go out of the buffer,\n//      while keeping string semantics?\ntemplate <int SZ>\nstruct StringMaker<char[SZ]> {\n  static std::string convert(char const* str) {\n    return ::Catch::Detail::stringify(std::string{str});\n  }\n};\ntemplate <int SZ>\nstruct StringMaker<signed char[SZ]> {\n  static std::string convert(signed char const* str) {\n    return ::Catch::Detail::stringify(\n        std::string{reinterpret_cast<char const*>(str)});\n  }\n};\ntemplate <int SZ>\nstruct StringMaker<unsigned char[SZ]> {\n  static std::string convert(unsigned char const* str) {\n    return ::Catch::Detail::stringify(\n        std::string{reinterpret_cast<char const*>(str)});\n  }\n};\n\n#if defined(CATCH_CONFIG_CPP17_BYTE)\ntemplate <>\nstruct StringMaker<std::byte> {\n  static std::string convert(std::byte value);\n};\n#endif // defined(CATCH_CONFIG_CPP17_BYTE)\ntemplate <>\nstruct 
StringMaker<int> {\n  static std::string convert(int value);\n};\ntemplate <>\nstruct StringMaker<long> {\n  static std::string convert(long value);\n};\ntemplate <>\nstruct StringMaker<long long> {\n  static std::string convert(long long value);\n};\ntemplate <>\nstruct StringMaker<unsigned int> {\n  static std::string convert(unsigned int value);\n};\ntemplate <>\nstruct StringMaker<unsigned long> {\n  static std::string convert(unsigned long value);\n};\ntemplate <>\nstruct StringMaker<unsigned long long> {\n  static std::string convert(unsigned long long value);\n};\n\ntemplate <>\nstruct StringMaker<bool> {\n  static std::string convert(bool b);\n};\n\ntemplate <>\nstruct StringMaker<char> {\n  static std::string convert(char c);\n};\ntemplate <>\nstruct StringMaker<signed char> {\n  static std::string convert(signed char c);\n};\ntemplate <>\nstruct StringMaker<unsigned char> {\n  static std::string convert(unsigned char c);\n};\n\ntemplate <>\nstruct StringMaker<std::nullptr_t> {\n  static std::string convert(std::nullptr_t);\n};\n\ntemplate <>\nstruct StringMaker<float> {\n  static std::string convert(float value);\n  static int precision;\n};\n\ntemplate <>\nstruct StringMaker<double> {\n  static std::string convert(double value);\n  static int precision;\n};\n\ntemplate <typename T>\nstruct StringMaker<T*> {\n  template <typename U>\n  static std::string convert(U* p) {\n    if (p) {\n      return ::Catch::Detail::rawMemoryToString(p);\n    } else {\n      return \"nullptr\";\n    }\n  }\n};\n\ntemplate <typename R, typename C>\nstruct StringMaker<R C::*> {\n  static std::string convert(R C::*p) {\n    if (p) {\n      return ::Catch::Detail::rawMemoryToString(p);\n    } else {\n      return \"nullptr\";\n    }\n  }\n};\n\n#if defined(_MANAGED)\ntemplate <typename T>\nstruct StringMaker<T ^> {\n  static std::string convert(T ^ ref) {\n    return ::Catch::Detail::clrReferenceToString(ref);\n  }\n};\n#endif\n\nnamespace Detail {\ntemplate <typename 
InputIterator, typename Sentinel = InputIterator>\nstd::string rangeToString(InputIterator first, Sentinel last) {\n  ReusableStringStream rss;\n  rss << \"{ \";\n  if (first != last) {\n    rss << ::Catch::Detail::stringify(*first);\n    for (++first; first != last; ++first)\n      rss << \", \" << ::Catch::Detail::stringify(*first);\n  }\n  rss << \" }\";\n  return rss.str();\n}\n} // namespace Detail\n\n#ifdef __OBJC__\ntemplate <>\nstruct StringMaker<NSString*> {\n  static std::string convert(NSString* nsstring) {\n    if (!nsstring)\n      return \"nil\";\n    return std::string(\"@\") + [nsstring UTF8String];\n  }\n};\ntemplate <>\nstruct StringMaker<NSObject*> {\n  static std::string convert(NSObject* nsObject) {\n    return ::Catch::Detail::stringify([nsObject description]);\n  }\n};\nnamespace Detail {\ninline std::string stringify(NSString* nsstring) {\n  return StringMaker<NSString*>::convert(nsstring);\n}\n\n} // namespace Detail\n#endif // __OBJC__\n\n} // namespace Catch\n\n//////////////////////////////////////////////////////\n// Separate std-lib types stringification, so it can be selectively enabled\n// This means that we do not bring in\n\n#if defined(CATCH_CONFIG_ENABLE_ALL_STRINGMAKERS)\n#define CATCH_CONFIG_ENABLE_PAIR_STRINGMAKER\n#define CATCH_CONFIG_ENABLE_TUPLE_STRINGMAKER\n#define CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER\n#define CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER\n#define CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER\n#endif\n\n// Separate std::pair specialization\n#if defined(CATCH_CONFIG_ENABLE_PAIR_STRINGMAKER)\n#include <utility>\nnamespace Catch {\ntemplate <typename T1, typename T2>\nstruct StringMaker<std::pair<T1, T2>> {\n  static std::string convert(const std::pair<T1, T2>& pair) {\n    ReusableStringStream rss;\n    rss << \"{ \" << ::Catch::Detail::stringify(pair.first) << \", \"\n        << ::Catch::Detail::stringify(pair.second) << \" }\";\n    return rss.str();\n  }\n};\n} // namespace Catch\n#endif // 
CATCH_CONFIG_ENABLE_PAIR_STRINGMAKER\n\n#if defined(CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER) &&                       \\\n    defined(CATCH_CONFIG_CPP17_OPTIONAL)\n#include <optional>\nnamespace Catch {\ntemplate <typename T>\nstruct StringMaker<std::optional<T>> {\n  static std::string convert(const std::optional<T>& optional) {\n    ReusableStringStream rss;\n    if (optional.has_value()) {\n      rss << ::Catch::Detail::stringify(*optional);\n    } else {\n      rss << \"{ }\";\n    }\n    return rss.str();\n  }\n};\n} // namespace Catch\n#endif // CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER\n\n// Separate std::tuple specialization\n#if defined(CATCH_CONFIG_ENABLE_TUPLE_STRINGMAKER)\n#include <tuple>\nnamespace Catch {\nnamespace Detail {\ntemplate <typename Tuple, std::size_t N = 0,\n          bool = (N < std::tuple_size<Tuple>::value)>\nstruct TupleElementPrinter {\n  static void print(const Tuple& tuple, std::ostream& os) {\n    os << (N ? \", \" : \" \") << ::Catch::Detail::stringify(std::get<N>(tuple));\n    TupleElementPrinter<Tuple, N + 1>::print(tuple, os);\n  }\n};\n\ntemplate <typename Tuple, std::size_t N>\nstruct TupleElementPrinter<Tuple, N, false> {\n  static void print(const Tuple&, std::ostream&) {}\n};\n\n} // namespace Detail\n\ntemplate <typename... Types>\nstruct StringMaker<std::tuple<Types...>> {\n  static std::string convert(const std::tuple<Types...>& tuple) {\n    ReusableStringStream rss;\n    rss << '{';\n    Detail::TupleElementPrinter<std::tuple<Types...>>::print(tuple, rss.get());\n    rss << \" }\";\n    return rss.str();\n  }\n};\n} // namespace Catch\n#endif // CATCH_CONFIG_ENABLE_TUPLE_STRINGMAKER\n\n#if defined(CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER) &&                        \\\n    defined(CATCH_CONFIG_CPP17_VARIANT)\n#include <variant>\nnamespace Catch {\ntemplate <>\nstruct StringMaker<std::monostate> {\n  static std::string convert(const std::monostate&) { return \"{ }\"; }\n};\n\ntemplate <typename... 
Elements>\nstruct StringMaker<std::variant<Elements...>> {\n  static std::string convert(const std::variant<Elements...>& variant) {\n    if (variant.valueless_by_exception()) {\n      return \"{valueless variant}\";\n    } else {\n      return std::visit(\n          [](const auto& value) { return ::Catch::Detail::stringify(value); },\n          variant);\n    }\n  }\n};\n} // namespace Catch\n#endif // CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER\n\nnamespace Catch {\n// Import begin/ end from std here\nusing std::begin;\nusing std::end;\n\nnamespace detail {\ntemplate <typename...>\nstruct void_type {\n  using type = void;\n};\n\ntemplate <typename T, typename = void>\nstruct is_range_impl : std::false_type {};\n\ntemplate <typename T>\nstruct is_range_impl<\n    T, typename void_type<decltype(begin(std::declval<T>()))>::type>\n    : std::true_type {};\n} // namespace detail\n\ntemplate <typename T>\nstruct is_range : detail::is_range_impl<T> {};\n\n#if defined(_MANAGED) // Managed types are never ranges\ntemplate <typename T>\nstruct is_range<T ^> {\n  static const bool value = false;\n};\n#endif\n\ntemplate <typename Range>\nstd::string rangeToString(Range const& range) {\n  return ::Catch::Detail::rangeToString(begin(range), end(range));\n}\n\n// Handle vector<bool> specially\ntemplate <typename Allocator>\nstd::string rangeToString(std::vector<bool, Allocator> const& v) {\n  ReusableStringStream rss;\n  rss << \"{ \";\n  bool first = true;\n  for (bool b : v) {\n    if (first)\n      first = false;\n    else\n      rss << \", \";\n    rss << ::Catch::Detail::stringify(b);\n  }\n  rss << \" }\";\n  return rss.str();\n}\n\ntemplate <typename R>\nstruct StringMaker<R,\n                   typename std::enable_if<\n                       is_range<R>::value &&\n                       !::Catch::Detail::IsStreamInsertable<R>::value>::type> {\n  static std::string convert(R const& range) { return rangeToString(range); }\n};\n\ntemplate <typename T, int SZ>\nstruct 
StringMaker<T[SZ]> {\n  static std::string convert(T const (&arr)[SZ]) { return rangeToString(arr); }\n};\n\n} // namespace Catch\n\n// Separate std::chrono::duration specialization\n#if defined(CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER)\n#include <ctime>\n#include <ratio>\n#include <chrono>\n\nnamespace Catch {\n\ntemplate <class Ratio>\nstruct ratio_string {\n  static std::string symbol();\n};\n\ntemplate <class Ratio>\nstd::string ratio_string<Ratio>::symbol() {\n  Catch::ReusableStringStream rss;\n  rss << '[' << Ratio::num << '/' << Ratio::den << ']';\n  return rss.str();\n}\ntemplate <>\nstruct ratio_string<std::atto> {\n  static std::string symbol();\n};\ntemplate <>\nstruct ratio_string<std::femto> {\n  static std::string symbol();\n};\ntemplate <>\nstruct ratio_string<std::pico> {\n  static std::string symbol();\n};\ntemplate <>\nstruct ratio_string<std::nano> {\n  static std::string symbol();\n};\ntemplate <>\nstruct ratio_string<std::micro> {\n  static std::string symbol();\n};\ntemplate <>\nstruct ratio_string<std::milli> {\n  static std::string symbol();\n};\n\n////////////\n// std::chrono::duration specializations\ntemplate <typename Value, typename Ratio>\nstruct StringMaker<std::chrono::duration<Value, Ratio>> {\n  static std::string\n  convert(std::chrono::duration<Value, Ratio> const& duration) {\n    ReusableStringStream rss;\n    rss << duration.count() << ' ' << ratio_string<Ratio>::symbol() << 's';\n    return rss.str();\n  }\n};\ntemplate <typename Value>\nstruct StringMaker<std::chrono::duration<Value, std::ratio<1>>> {\n  static std::string\n  convert(std::chrono::duration<Value, std::ratio<1>> const& duration) {\n    ReusableStringStream rss;\n    rss << duration.count() << \" s\";\n    return rss.str();\n  }\n};\ntemplate <typename Value>\nstruct StringMaker<std::chrono::duration<Value, std::ratio<60>>> {\n  static std::string\n  convert(std::chrono::duration<Value, std::ratio<60>> const& duration) {\n    ReusableStringStream rss;\n    rss 
<< duration.count() << \" m\";\n    return rss.str();\n  }\n};\ntemplate <typename Value>\nstruct StringMaker<std::chrono::duration<Value, std::ratio<3600>>> {\n  static std::string\n  convert(std::chrono::duration<Value, std::ratio<3600>> const& duration) {\n    ReusableStringStream rss;\n    rss << duration.count() << \" h\";\n    return rss.str();\n  }\n};\n\n////////////\n// std::chrono::time_point specialization\n// Generic time_point cannot be specialized, only\n// std::chrono::time_point<system_clock>\ntemplate <typename Clock, typename Duration>\nstruct StringMaker<std::chrono::time_point<Clock, Duration>> {\n  static std::string\n  convert(std::chrono::time_point<Clock, Duration> const& time_point) {\n    return ::Catch::Detail::stringify(time_point.time_since_epoch()) +\n           \" since epoch\";\n  }\n};\n// std::chrono::time_point<system_clock> specialization\ntemplate <typename Duration>\nstruct StringMaker<\n    std::chrono::time_point<std::chrono::system_clock, Duration>> {\n  static std::string\n  convert(std::chrono::time_point<std::chrono::system_clock, Duration> const&\n              time_point) {\n    auto converted = std::chrono::system_clock::to_time_t(time_point);\n\n#ifdef _MSC_VER\n    std::tm timeInfo = {};\n    gmtime_s(&timeInfo, &converted);\n#else\n    std::tm* timeInfo = std::gmtime(&converted);\n#endif\n\n    auto const timeStampSize = sizeof(\"2017-01-16T17:06:45Z\");\n    char timeStamp[timeStampSize];\n    const char* const fmt = \"%Y-%m-%dT%H:%M:%SZ\";\n\n#ifdef _MSC_VER\n    std::strftime(timeStamp, timeStampSize, fmt, &timeInfo);\n#else\n    std::strftime(timeStamp, timeStampSize, fmt, timeInfo);\n#endif\n    return std::string(timeStamp);\n  }\n};\n} // namespace Catch\n#endif // CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER\n\n#define INTERNAL_CATCH_REGISTER_ENUM(enumName, ...)                            
\\\n  namespace Catch {                                                            \\\n  template <>                                                                  \\\n  struct StringMaker<enumName> {                                               \\\n    static std::string convert(enumName value) {                               \\\n      static const auto& enumInfo =                                            \\\n          ::Catch::getMutableRegistryHub()                                     \\\n              .getMutableEnumValuesRegistry()                                  \\\n              .registerEnum(#enumName, #__VA_ARGS__, {__VA_ARGS__});           \\\n      return static_cast<std::string>(                                         \\\n          enumInfo.lookup(static_cast<int>(value)));                           \\\n    }                                                                          \\\n  };                                                                           \\\n  }\n\n#define CATCH_REGISTER_ENUM(enumName, ...)                                     
\\\n  INTERNAL_CATCH_REGISTER_ENUM(enumName, __VA_ARGS__)\n\n#ifdef _MSC_VER\n#pragma warning(pop)\n#endif\n\n// end catch_tostring.h\n#include <iosfwd>\n\n#ifdef _MSC_VER\n#pragma warning(push)\n#pragma warning(disable : 4389) // '==' : signed/unsigned mismatch\n#pragma warning(disable : 4018) // more \"signed/unsigned mismatch\"\n#pragma warning(disable : 4312) // Converting int to T* using reinterpret_cast\n                                // (issue on x64 platform)\n#pragma warning(                                                               \\\n    disable : 4180) // qualifier applied to function type has no meaning\n#pragma warning(disable : 4800) // Forcing result to true or false\n#endif\n\nnamespace Catch {\n\nstruct ITransientExpression {\n  auto isBinaryExpression() const -> bool { return m_isBinaryExpression; }\n  auto getResult() const -> bool { return m_result; }\n  virtual void streamReconstructedExpression(std::ostream& os) const = 0;\n\n  ITransientExpression(bool isBinaryExpression, bool result)\n      : m_isBinaryExpression(isBinaryExpression), m_result(result) {}\n\n  // We don't actually need a virtual destructor, but many static analysers\n  // complain if it's not here :-(\n  virtual ~ITransientExpression();\n\n  bool m_isBinaryExpression;\n  bool m_result;\n};\n\nvoid formatReconstructedExpression(std::ostream& os, std::string const& lhs,\n                                   StringRef op, std::string const& rhs);\n\ntemplate <typename LhsT, typename RhsT>\nclass BinaryExpr : public ITransientExpression {\n  LhsT m_lhs;\n  StringRef m_op;\n  RhsT m_rhs;\n\n  void streamReconstructedExpression(std::ostream& os) const override {\n    formatReconstructedExpression(os, Catch::Detail::stringify(m_lhs), m_op,\n                                  Catch::Detail::stringify(m_rhs));\n  }\n\npublic:\n  BinaryExpr(bool comparisonResult, LhsT lhs, StringRef op, RhsT rhs)\n      : ITransientExpression{true, comparisonResult}, m_lhs(lhs), m_op(op),\n        
m_rhs(rhs) {}\n\n  template <typename T>\n  auto operator&&(T) const -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<T>::value,\n                  \"chained comparisons are not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n\n  template <typename T>\n  auto operator||(T) const -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<T>::value,\n                  \"chained comparisons are not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n\n  template <typename T>\n  auto operator==(T) const -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<T>::value,\n                  \"chained comparisons are not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n\n  template <typename T>\n  auto operator!=(T) const -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<T>::value,\n                  \"chained comparisons are not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n\n  template <typename T>\n  auto operator>(T) const -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<T>::value,\n                  \"chained comparisons are not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n\n  template <typename T>\n  auto operator<(T) const -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<T>::value,\n                  \"chained comparisons are not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n\n  template <typename T>\n  auto operator>=(T) const -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<T>::value,\n            
      \"chained comparisons are not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n\n  template <typename T>\n  auto operator<=(T) const -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<T>::value,\n                  \"chained comparisons are not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n};\n\ntemplate <typename LhsT>\nclass UnaryExpr : public ITransientExpression {\n  LhsT m_lhs;\n\n  void streamReconstructedExpression(std::ostream& os) const override {\n    os << Catch::Detail::stringify(m_lhs);\n  }\n\npublic:\n  explicit UnaryExpr(LhsT lhs)\n      : ITransientExpression{false, static_cast<bool>(lhs)}, m_lhs(lhs) {}\n};\n\n// Specialised comparison functions to handle equality comparisons between ints\n// and pointers (NULL deduces as an int)\ntemplate <typename LhsT, typename RhsT>\nauto compareEqual(LhsT const& lhs, RhsT const& rhs) -> bool {\n  return static_cast<bool>(lhs == rhs);\n}\ntemplate <typename T>\nauto compareEqual(T* const& lhs, int rhs) -> bool {\n  return lhs == reinterpret_cast<void const*>(rhs);\n}\ntemplate <typename T>\nauto compareEqual(T* const& lhs, long rhs) -> bool {\n  return lhs == reinterpret_cast<void const*>(rhs);\n}\ntemplate <typename T>\nauto compareEqual(int lhs, T* const& rhs) -> bool {\n  return reinterpret_cast<void const*>(lhs) == rhs;\n}\ntemplate <typename T>\nauto compareEqual(long lhs, T* const& rhs) -> bool {\n  return reinterpret_cast<void const*>(lhs) == rhs;\n}\n\ntemplate <typename LhsT, typename RhsT>\nauto compareNotEqual(LhsT const& lhs, RhsT&& rhs) -> bool {\n  return static_cast<bool>(lhs != rhs);\n}\ntemplate <typename T>\nauto compareNotEqual(T* const& lhs, int rhs) -> bool {\n  return lhs != reinterpret_cast<void const*>(rhs);\n}\ntemplate <typename T>\nauto compareNotEqual(T* const& lhs, long rhs) -> bool {\n  return lhs != 
reinterpret_cast<void const*>(rhs);\n}\ntemplate <typename T>\nauto compareNotEqual(int lhs, T* const& rhs) -> bool {\n  return reinterpret_cast<void const*>(lhs) != rhs;\n}\ntemplate <typename T>\nauto compareNotEqual(long lhs, T* const& rhs) -> bool {\n  return reinterpret_cast<void const*>(lhs) != rhs;\n}\n\ntemplate <typename LhsT>\nclass ExprLhs {\n  LhsT m_lhs;\n\npublic:\n  explicit ExprLhs(LhsT lhs) : m_lhs(lhs) {}\n\n  template <typename RhsT>\n  auto operator==(RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {\n    return {compareEqual(m_lhs, rhs), m_lhs, \"==\", rhs};\n  }\n  auto operator==(bool rhs) -> BinaryExpr<LhsT, bool> const {\n    return {m_lhs == rhs, m_lhs, \"==\", rhs};\n  }\n\n  template <typename RhsT>\n  auto operator!=(RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {\n    return {compareNotEqual(m_lhs, rhs), m_lhs, \"!=\", rhs};\n  }\n  auto operator!=(bool rhs) -> BinaryExpr<LhsT, bool> const {\n    return {m_lhs != rhs, m_lhs, \"!=\", rhs};\n  }\n\n  template <typename RhsT>\n  auto operator>(RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {\n    return {static_cast<bool>(m_lhs > rhs), m_lhs, \">\", rhs};\n  }\n  template <typename RhsT>\n  auto operator<(RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {\n    return {static_cast<bool>(m_lhs < rhs), m_lhs, \"<\", rhs};\n  }\n  template <typename RhsT>\n  auto operator>=(RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {\n    return {static_cast<bool>(m_lhs >= rhs), m_lhs, \">=\", rhs};\n  }\n  template <typename RhsT>\n  auto operator<=(RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {\n    return {static_cast<bool>(m_lhs <= rhs), m_lhs, \"<=\", rhs};\n  }\n  template <typename RhsT>\n  auto operator|(RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {\n    return {static_cast<bool>(m_lhs | rhs), m_lhs, \"|\", rhs};\n  }\n  template <typename RhsT>\n  auto operator&(RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {\n    return 
{static_cast<bool>(m_lhs & rhs), m_lhs, \"&\", rhs};\n  }\n  template <typename RhsT>\n  auto operator^(RhsT const& rhs) -> BinaryExpr<LhsT, RhsT const&> const {\n    return {static_cast<bool>(m_lhs ^ rhs), m_lhs, \"^\", rhs};\n  }\n\n  template <typename RhsT>\n  auto operator&&(RhsT const&) -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<RhsT>::value,\n                  \"operator&& is not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n\n  template <typename RhsT>\n  auto operator||(RhsT const&) -> BinaryExpr<LhsT, RhsT const&> const {\n    static_assert(always_false<RhsT>::value,\n                  \"operator|| is not supported inside assertions, \"\n                  \"wrap the expression inside parentheses, or decompose it\");\n  }\n\n  auto makeUnaryExpr() const -> UnaryExpr<LhsT> {\n    return UnaryExpr<LhsT>{m_lhs};\n  }\n};\n\nvoid handleExpression(ITransientExpression const& expr);\n\ntemplate <typename T>\nvoid handleExpression(ExprLhs<T> const& expr) {\n  handleExpression(expr.makeUnaryExpr());\n}\n\nstruct Decomposer {\n  template <typename T>\n  auto operator<=(T const& lhs) -> ExprLhs<T const&> {\n    return ExprLhs<T const&>{lhs};\n  }\n\n  auto operator<=(bool value) -> ExprLhs<bool> { return ExprLhs<bool>{value}; }\n};\n\n} // end namespace Catch\n\n#ifdef _MSC_VER\n#pragma warning(pop)\n#endif\n\n// end catch_decomposer.h\n// start catch_interfaces_capture.h\n\n#include <string>\n#include <chrono>\n\nnamespace Catch {\n\nclass AssertionResult;\nstruct AssertionInfo;\nstruct SectionInfo;\nstruct SectionEndInfo;\nstruct MessageInfo;\nstruct MessageBuilder;\nstruct Counts;\nstruct AssertionReaction;\nstruct SourceLineInfo;\n\nstruct ITransientExpression;\nstruct IGeneratorTracker;\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\nstruct BenchmarkInfo;\ntemplate <typename Duration = std::chrono::duration<double, std::nano>>\nstruct BenchmarkStats;\n#endif 
// CATCH_CONFIG_ENABLE_BENCHMARKING\n\nstruct IResultCapture {\n\n  virtual ~IResultCapture();\n\n  virtual bool sectionStarted(SectionInfo const& sectionInfo,\n                              Counts& assertions)               = 0;\n  virtual void sectionEnded(SectionEndInfo const& endInfo)      = 0;\n  virtual void sectionEndedEarly(SectionEndInfo const& endInfo) = 0;\n\n  virtual auto acquireGeneratorTracker(StringRef generatorName,\n                                       SourceLineInfo const& lineInfo)\n      -> IGeneratorTracker& = 0;\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n  virtual void benchmarkPreparing(std::string const& name)   = 0;\n  virtual void benchmarkStarting(BenchmarkInfo const& info)  = 0;\n  virtual void benchmarkEnded(BenchmarkStats<> const& stats) = 0;\n  virtual void benchmarkFailed(std::string const& error)     = 0;\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\n  virtual void pushScopedMessage(MessageInfo const& message) = 0;\n  virtual void popScopedMessage(MessageInfo const& message)  = 0;\n\n  virtual void emplaceUnscopedMessage(MessageBuilder const& builder) = 0;\n\n  virtual void handleFatalErrorCondition(StringRef message) = 0;\n\n  virtual void handleExpr(AssertionInfo const& info,\n                          ITransientExpression const& expr,\n                          AssertionReaction& reaction)    = 0;\n  virtual void handleMessage(AssertionInfo const& info,\n                             ResultWas::OfType resultType,\n                             StringRef const& message,\n                             AssertionReaction& reaction) = 0;\n  virtual void\n  handleUnexpectedExceptionNotThrown(AssertionInfo const& info,\n                                     AssertionReaction& reaction) = 0;\n  virtual void\n  handleUnexpectedInflightException(AssertionInfo const& info,\n                                    std::string const& message,\n                                    AssertionReaction& reaction) = 0;\n  virtual void 
handleIncomplete(AssertionInfo const& info)       = 0;\n  virtual void handleNonExpr(AssertionInfo const& info,\n                             ResultWas::OfType resultType,\n                             AssertionReaction& reaction)        = 0;\n\n  virtual bool lastAssertionPassed() = 0;\n  virtual void assertionPassed()     = 0;\n\n  // Deprecated, do not use:\n  virtual std::string getCurrentTestName() const       = 0;\n  virtual const AssertionResult* getLastResult() const = 0;\n  virtual void exceptionEarlyReported()                = 0;\n};\n\nIResultCapture& getResultCapture();\n} // namespace Catch\n\n// end catch_interfaces_capture.h\nnamespace Catch {\n\nstruct TestFailureException {};\nstruct AssertionResultData;\nstruct IResultCapture;\nclass RunContext;\n\nclass LazyExpression {\n  friend class AssertionHandler;\n  friend struct AssertionStats;\n  friend class RunContext;\n\n  ITransientExpression const* m_transientExpression = nullptr;\n  bool m_isNegated;\n\npublic:\n  LazyExpression(bool isNegated);\n  LazyExpression(LazyExpression const& other);\n  LazyExpression& operator=(LazyExpression const&) = delete;\n\n  explicit operator bool() const;\n\n  friend auto operator<<(std::ostream& os, LazyExpression const& lazyExpr)\n      -> std::ostream&;\n};\n\nstruct AssertionReaction {\n  bool shouldDebugBreak = false;\n  bool shouldThrow      = false;\n};\n\nclass AssertionHandler {\n  AssertionInfo m_assertionInfo;\n  AssertionReaction m_reaction;\n  bool m_completed = false;\n  IResultCapture& m_resultCapture;\n\npublic:\n  AssertionHandler(StringRef const& macroName, SourceLineInfo const& lineInfo,\n                   StringRef capturedExpression,\n                   ResultDisposition::Flags resultDisposition);\n  ~AssertionHandler() {\n    if (!m_completed) {\n      m_resultCapture.handleIncomplete(m_assertionInfo);\n    }\n  }\n\n  template <typename T>\n  void handleExpr(ExprLhs<T> const& expr) {\n    handleExpr(expr.makeUnaryExpr());\n  }\n  void 
handleExpr(ITransientExpression const& expr);\n\n  void handleMessage(ResultWas::OfType resultType, StringRef const& message);\n\n  void handleExceptionThrownAsExpected();\n  void handleUnexpectedExceptionNotThrown();\n  void handleExceptionNotThrownAsExpected();\n  void handleThrowingCallSkipped();\n  void handleUnexpectedInflightException();\n\n  void complete();\n  void setCompleted();\n\n  // query\n  auto allowThrows() const -> bool;\n};\n\nvoid handleExceptionMatchExpr(AssertionHandler& handler, std::string const& str,\n                              StringRef const& matcherString);\n\n} // namespace Catch\n\n// end catch_assertionhandler.h\n// start catch_message.h\n\n#include <string>\n#include <vector>\n\nnamespace Catch {\n\nstruct MessageInfo {\n  MessageInfo(StringRef const& _macroName, SourceLineInfo const& _lineInfo,\n              ResultWas::OfType _type);\n\n  StringRef macroName;\n  std::string message;\n  SourceLineInfo lineInfo;\n  ResultWas::OfType type;\n  unsigned int sequence;\n\n  bool operator==(MessageInfo const& other) const;\n  bool operator<(MessageInfo const& other) const;\n\nprivate:\n  static unsigned int globalCount;\n};\n\nstruct MessageStream {\n\n  template <typename T>\n  MessageStream& operator<<(T const& value) {\n    m_stream << value;\n    return *this;\n  }\n\n  ReusableStringStream m_stream;\n};\n\nstruct MessageBuilder : MessageStream {\n  MessageBuilder(StringRef const& macroName, SourceLineInfo const& lineInfo,\n                 ResultWas::OfType type);\n\n  template <typename T>\n  MessageBuilder& operator<<(T const& value) {\n    m_stream << value;\n    return *this;\n  }\n\n  MessageInfo m_info;\n};\n\nclass ScopedMessage {\npublic:\n  explicit ScopedMessage(MessageBuilder const& builder);\n  ScopedMessage(ScopedMessage& duplicate) = delete;\n  ScopedMessage(ScopedMessage&& old);\n  ~ScopedMessage();\n\n  MessageInfo m_info;\n  bool m_moved;\n};\n\nclass Capturer {\n  std::vector<MessageInfo> m_messages;\n  
IResultCapture& m_resultCapture = getResultCapture();\n  size_t m_captured               = 0;\n\npublic:\n  Capturer(StringRef macroName, SourceLineInfo const& lineInfo,\n           ResultWas::OfType resultType, StringRef names);\n  ~Capturer();\n\n  void captureValue(size_t index, std::string const& value);\n\n  template <typename T>\n  void captureValues(size_t index, T const& value) {\n    captureValue(index, Catch::Detail::stringify(value));\n  }\n\n  template <typename T, typename... Ts>\n  void captureValues(size_t index, T const& value, Ts const&... values) {\n    captureValue(index, Catch::Detail::stringify(value));\n    captureValues(index + 1, values...);\n  }\n};\n\n} // end namespace Catch\n\n// end catch_message.h\n#if !defined(CATCH_CONFIG_DISABLE)\n\n#if !defined(CATCH_CONFIG_DISABLE_STRINGIFICATION)\n#define CATCH_INTERNAL_STRINGIFY(...) #__VA_ARGS__\n#else\n#define CATCH_INTERNAL_STRINGIFY(...)                                          \\\n  \"Disabled by CATCH_CONFIG_DISABLE_STRINGIFICATION\"\n#endif\n\n#if defined(CATCH_CONFIG_FAST_COMPILE) ||                                      \\\n    defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\n\n///////////////////////////////////////////////////////////////////////////////\n// Another way to speed-up compilation is to omit local try-catch for REQUIRE*\n// macros.\n#define INTERNAL_CATCH_TRY\n#define INTERNAL_CATCH_CATCH(capturer)\n\n#else // CATCH_CONFIG_FAST_COMPILE\n\n#define INTERNAL_CATCH_TRY try\n#define INTERNAL_CATCH_CATCH(handler)                                          \\\n  catch (...) {                                                                \\\n    handler.handleUnexpectedInflightException();                               \\\n  }\n\n#endif\n\n#define INTERNAL_CATCH_REACT(handler) handler.complete();\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_TEST(macroName, resultDisposition, ...)                 
\\\n  do {                                                                         \\\n    CATCH_INTERNAL_IGNORE_BUT_WARN(__VA_ARGS__);                               \\\n    Catch::AssertionHandler catchAssertionHandler(                             \\\n        macroName##_catch_sr, CATCH_INTERNAL_LINEINFO,                         \\\n        CATCH_INTERNAL_STRINGIFY(__VA_ARGS__), resultDisposition);             \\\n    INTERNAL_CATCH_TRY {                                                       \\\n      CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                \\\n      CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS                             \\\n      catchAssertionHandler.handleExpr(Catch::Decomposer() <= __VA_ARGS__);    \\\n      CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                 \\\n    }                                                                          \\\n    INTERNAL_CATCH_CATCH(catchAssertionHandler)                                \\\n    INTERNAL_CATCH_REACT(catchAssertionHandler)                                \\\n  } while ((void)0, (false) && static_cast<bool>(!!(__VA_ARGS__)))\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_IF(macroName, resultDisposition, ...)                   \\\n  INTERNAL_CATCH_TEST(macroName, resultDisposition, __VA_ARGS__);              \\\n  if (Catch::getResultCapture().lastAssertionPassed())\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_ELSE(macroName, resultDisposition, ...)                 \\\n  INTERNAL_CATCH_TEST(macroName, resultDisposition, __VA_ARGS__);              \\\n  if (!Catch::getResultCapture().lastAssertionPassed())\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_NO_THROW(macroName, resultDisposition, ...)             
\\\n  do {                                                                         \\\n    Catch::AssertionHandler catchAssertionHandler(                             \\\n        macroName##_catch_sr, CATCH_INTERNAL_LINEINFO,                         \\\n        CATCH_INTERNAL_STRINGIFY(__VA_ARGS__), resultDisposition);             \\\n    try {                                                                      \\\n      static_cast<void>(__VA_ARGS__);                                          \\\n      catchAssertionHandler.handleExceptionNotThrownAsExpected();              \\\n    } catch (...) {                                                            \\\n      catchAssertionHandler.handleUnexpectedInflightException();               \\\n    }                                                                          \\\n    INTERNAL_CATCH_REACT(catchAssertionHandler)                                \\\n  } while (false)\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_THROWS(macroName, resultDisposition, ...)               \\\n  do {                                                                         \\\n    Catch::AssertionHandler catchAssertionHandler(                             \\\n        macroName##_catch_sr, CATCH_INTERNAL_LINEINFO,                         \\\n        CATCH_INTERNAL_STRINGIFY(__VA_ARGS__), resultDisposition);             \\\n    if (catchAssertionHandler.allowThrows())                                   \\\n      try {                                                                    \\\n        static_cast<void>(__VA_ARGS__);                                        \\\n        catchAssertionHandler.handleUnexpectedExceptionNotThrown();            \\\n      } catch (...) 
{                                                          \\\n        catchAssertionHandler.handleExceptionThrownAsExpected();               \\\n      }                                                                        \\\n    else                                                                       \\\n      catchAssertionHandler.handleThrowingCallSkipped();                       \\\n    INTERNAL_CATCH_REACT(catchAssertionHandler)                                \\\n  } while (false)\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_THROWS_AS(macroName, exceptionType, resultDisposition,  \\\n                                 expr)                                         \\\n  do {                                                                         \\\n    Catch::AssertionHandler catchAssertionHandler(                             \\\n        macroName##_catch_sr, CATCH_INTERNAL_LINEINFO,                         \\\n        CATCH_INTERNAL_STRINGIFY(expr) \", \" CATCH_INTERNAL_STRINGIFY(          \\\n            exceptionType),                                                    \\\n        resultDisposition);                                                    \\\n    if (catchAssertionHandler.allowThrows())                                   \\\n      try {                                                                    \\\n        static_cast<void>(expr);                                               \\\n        catchAssertionHandler.handleUnexpectedExceptionNotThrown();            \\\n      } catch (exceptionType const&) {                                         \\\n        catchAssertionHandler.handleExceptionThrownAsExpected();               \\\n      } catch (...) 
{                                                          \\\n        catchAssertionHandler.handleUnexpectedInflightException();             \\\n      }                                                                        \\\n    else                                                                       \\\n      catchAssertionHandler.handleThrowingCallSkipped();                       \\\n    INTERNAL_CATCH_REACT(catchAssertionHandler)                                \\\n  } while (false)\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_MSG(macroName, messageType, resultDisposition, ...)     \\\n  do {                                                                         \\\n    Catch::AssertionHandler catchAssertionHandler(                             \\\n        macroName##_catch_sr, CATCH_INTERNAL_LINEINFO, Catch::StringRef(),     \\\n        resultDisposition);                                                    \\\n    catchAssertionHandler.handleMessage(                                       \\\n        messageType,                                                           \\\n        (Catch::MessageStream() << __VA_ARGS__ + ::Catch::StreamEndStop())     \\\n            .m_stream.str());                                                  \\\n    INTERNAL_CATCH_REACT(catchAssertionHandler)                                \\\n  } while (false)\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_CAPTURE(varName, macroName, ...)                        
\\\n  auto varName = Catch::Capturer(macroName, CATCH_INTERNAL_LINEINFO,           \\\n                                 Catch::ResultWas::Info, #__VA_ARGS__);        \\\n  varName.captureValues(0, __VA_ARGS__)\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_INFO(macroName, log)                                    \\\n  Catch::ScopedMessage INTERNAL_CATCH_UNIQUE_NAME(scopedMessage)(              \\\n      Catch::MessageBuilder(macroName##_catch_sr, CATCH_INTERNAL_LINEINFO,     \\\n                            Catch::ResultWas::Info)                            \\\n      << log);\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_UNSCOPED_INFO(macroName, log)                           \\\n  Catch::getResultCapture().emplaceUnscopedMessage(                            \\\n      Catch::MessageBuilder(macroName##_catch_sr, CATCH_INTERNAL_LINEINFO,     \\\n                            Catch::ResultWas::Info)                            \\\n      << log)\n\n///////////////////////////////////////////////////////////////////////////////\n// Although this is matcher-based, it can be used with just a string\n#define INTERNAL_CATCH_THROWS_STR_MATCHES(macroName, resultDisposition,        \\\n                                          matcher, ...)                        
\\\n  do {                                                                         \\\n    Catch::AssertionHandler catchAssertionHandler(                             \\\n        macroName##_catch_sr, CATCH_INTERNAL_LINEINFO,                         \\\n        CATCH_INTERNAL_STRINGIFY(__VA_ARGS__) \", \" CATCH_INTERNAL_STRINGIFY(   \\\n            matcher),                                                          \\\n        resultDisposition);                                                    \\\n    if (catchAssertionHandler.allowThrows())                                   \\\n      try {                                                                    \\\n        static_cast<void>(__VA_ARGS__);                                        \\\n        catchAssertionHandler.handleUnexpectedExceptionNotThrown();            \\\n      } catch (...) {                                                          \\\n        Catch::handleExceptionMatchExpr(catchAssertionHandler, matcher,        \\\n                                        #matcher##_catch_sr);                  \\\n      }                                                                        \\\n    else                                                                       \\\n      catchAssertionHandler.handleThrowingCallSkipped();                       \\\n    INTERNAL_CATCH_REACT(catchAssertionHandler)                                \\\n  } while (false)\n\n#endif // CATCH_CONFIG_DISABLE\n\n// end catch_capture.hpp\n// start catch_section.h\n\n// start catch_section_info.h\n\n// start catch_totals.h\n\n#include <cstddef>\n\nnamespace Catch {\n\nstruct Counts {\n  Counts operator-(Counts const& other) const;\n  Counts& operator+=(Counts const& other);\n\n  std::size_t total() const;\n  bool allPassed() const;\n  bool allOk() const;\n\n  std::size_t passed      = 0;\n  std::size_t failed      = 0;\n  std::size_t failedButOk = 0;\n};\n\nstruct Totals {\n\n  Totals operator-(Totals const& other) const;\n  
Totals& operator+=(Totals const& other);\n\n  Totals delta(Totals const& prevTotals) const;\n\n  int error = 0;\n  Counts assertions;\n  Counts testCases;\n};\n} // namespace Catch\n\n// end catch_totals.h\n#include <string>\n\nnamespace Catch {\n\nstruct SectionInfo {\n  SectionInfo(SourceLineInfo const& _lineInfo, std::string const& _name);\n\n  // Deprecated\n  SectionInfo(SourceLineInfo const& _lineInfo, std::string const& _name,\n              std::string const&)\n      : SectionInfo(_lineInfo, _name) {}\n\n  std::string name;\n  std::string description; // !Deprecated: this will always be empty\n  SourceLineInfo lineInfo;\n};\n\nstruct SectionEndInfo {\n  SectionInfo sectionInfo;\n  Counts prevAssertions;\n  double durationInSeconds;\n};\n\n} // end namespace Catch\n\n// end catch_section_info.h\n// start catch_timer.h\n\n#include <cstdint>\n\nnamespace Catch {\n\nauto getCurrentNanosecondsSinceEpoch() -> uint64_t;\nauto getEstimatedClockResolution() -> uint64_t;\n\nclass Timer {\n  uint64_t m_nanoseconds = 0;\n\npublic:\n  void start();\n  auto getElapsedNanoseconds() const -> uint64_t;\n  auto getElapsedMicroseconds() const -> uint64_t;\n  auto getElapsedMilliseconds() const -> unsigned int;\n  auto getElapsedSeconds() const -> double;\n};\n\n} // namespace Catch\n\n// end catch_timer.h\n#include <string>\n\nnamespace Catch {\n\nclass Section : NonCopyable {\npublic:\n  Section(SectionInfo const& info);\n  ~Section();\n\n  // This indicates whether the section should be executed or not\n  explicit operator bool() const;\n\nprivate:\n  SectionInfo m_info;\n\n  std::string m_name;\n  Counts m_assertions;\n  bool m_sectionIncluded;\n  Timer m_timer;\n};\n\n} // end namespace Catch\n\n#define INTERNAL_CATCH_SECTION(...)                                            
\\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS                                      \\\n  if (Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME(                        \\\n          catch_internal_Section) =                                            \\\n          Catch::SectionInfo(CATCH_INTERNAL_LINEINFO, __VA_ARGS__))            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION\n\n#define INTERNAL_CATCH_DYNAMIC_SECTION(...)                                    \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS                                      \\\n  if (Catch::Section const& INTERNAL_CATCH_UNIQUE_NAME(                        \\\n          catch_internal_Section) =                                            \\\n          Catch::SectionInfo(                                                  \\\n              CATCH_INTERNAL_LINEINFO,                                         \\\n              (Catch::ReusableStringStream() << __VA_ARGS__).str()))           \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION\n\n// end catch_section.h\n// start catch_interfaces_exception.h\n\n// start catch_interfaces_registry_hub.h\n\n#include <string>\n#include <memory>\n\nnamespace Catch {\n\nclass TestCase;\nstruct ITestCaseRegistry;\nstruct IExceptionTranslatorRegistry;\nstruct IExceptionTranslator;\nstruct IReporterRegistry;\nstruct IReporterFactory;\nstruct ITagAliasRegistry;\nstruct IMutableEnumValuesRegistry;\n\nclass StartupExceptionRegistry;\n\nusing IReporterFactoryPtr = std::shared_ptr<IReporterFactory>;\n\nstruct IRegistryHub {\n  virtual ~IRegistryHub();\n\n  virtual IReporterRegistry const& getReporterRegistry() const = 0;\n  virtual ITestCaseRegistry const& getTestCaseRegistry() const = 0;\n  virtual ITagAliasRegistry const& getTagAliasRegistry() const = 0;\n  virtual IExceptionTranslatorRegistry const&\n  
getExceptionTranslatorRegistry() const = 0;\n\n  virtual StartupExceptionRegistry const&\n  getStartupExceptionRegistry() const = 0;\n};\n\nstruct IMutableRegistryHub {\n  virtual ~IMutableRegistryHub();\n  virtual void registerReporter(std::string const& name,\n                                IReporterFactoryPtr const& factory)       = 0;\n  virtual void registerListener(IReporterFactoryPtr const& factory)       = 0;\n  virtual void registerTest(TestCase const& testInfo)                     = 0;\n  virtual void registerTranslator(const IExceptionTranslator* translator) = 0;\n  virtual void registerTagAlias(std::string const& alias,\n                                std::string const& tag,\n                                SourceLineInfo const& lineInfo)           = 0;\n  virtual void registerStartupException() noexcept                        = 0;\n  virtual IMutableEnumValuesRegistry& getMutableEnumValuesRegistry()      = 0;\n};\n\nIRegistryHub const& getRegistryHub();\nIMutableRegistryHub& getMutableRegistryHub();\nvoid cleanUp();\nstd::string translateActiveException();\n\n} // namespace Catch\n\n// end catch_interfaces_registry_hub.h\n#if defined(CATCH_CONFIG_DISABLE)\n#define INTERNAL_CATCH_TRANSLATE_EXCEPTION_NO_REG(translatorName, signature)   \\\n  static std::string translatorName(signature)\n#endif\n\n#include <exception>\n#include <string>\n#include <vector>\n\nnamespace Catch {\nusing exceptionTranslateFunction = std::string (*)();\n\nstruct IExceptionTranslator;\nusing ExceptionTranslators =\n    std::vector<std::unique_ptr<IExceptionTranslator const>>;\n\nstruct IExceptionTranslator {\n  virtual ~IExceptionTranslator();\n  virtual std::string\n  translate(ExceptionTranslators::const_iterator it,\n            ExceptionTranslators::const_iterator itEnd) const = 0;\n};\n\nstruct IExceptionTranslatorRegistry {\n  virtual ~IExceptionTranslatorRegistry();\n\n  virtual std::string translateActiveException() const = 0;\n};\n\nclass ExceptionTranslatorRegistrar 
{\n  template <typename T>\n  class ExceptionTranslator : public IExceptionTranslator {\n  public:\n    ExceptionTranslator(std::string (*translateFunction)(T&))\n        : m_translateFunction(translateFunction) {}\n\n    std::string\n    translate(ExceptionTranslators::const_iterator it,\n              ExceptionTranslators::const_iterator itEnd) const override {\n#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\n      return \"\";\n#else\n      try {\n        if (it == itEnd)\n          std::rethrow_exception(std::current_exception());\n        else\n          return (*it)->translate(it + 1, itEnd);\n      } catch (T& ex) {\n        return m_translateFunction(ex);\n      }\n#endif\n    }\n\n  protected:\n    std::string (*m_translateFunction)(T&);\n  };\n\npublic:\n  template <typename T>\n  ExceptionTranslatorRegistrar(std::string (*translateFunction)(T&)) {\n    getMutableRegistryHub().registerTranslator(\n        new ExceptionTranslator<T>(translateFunction));\n  }\n};\n} // namespace Catch\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_TRANSLATE_EXCEPTION2(translatorName, signature)         \\\n  static std::string translatorName(signature);                                \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  namespace {                                                                  \\\n  Catch::ExceptionTranslatorRegistrar INTERNAL_CATCH_UNIQUE_NAME(              \\\n      catch_internal_ExceptionRegistrar)(&translatorName);                     \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION                                     \\\n  static std::string translatorName(signature)\n\n#define INTERNAL_CATCH_TRANSLATE_EXCEPTION(signature)                          \\\n  
INTERNAL_CATCH_TRANSLATE_EXCEPTION2(                                         \\\n      INTERNAL_CATCH_UNIQUE_NAME(catch_internal_ExceptionTranslator),          \\\n      signature)\n\n// end catch_interfaces_exception.h\n// start catch_approx.h\n\n#include <type_traits>\n\nnamespace Catch {\nnamespace Detail {\n\nclass Approx {\nprivate:\n  bool equalityComparisonImpl(double other) const;\n  // Validates the new margin (margin >= 0)\n  // out-of-line to avoid including stdexcept in the header\n  void setMargin(double margin);\n  // Validates the new epsilon (0 < epsilon < 1)\n  // out-of-line to avoid including stdexcept in the header\n  void setEpsilon(double epsilon);\n\npublic:\n  explicit Approx(double value);\n\n  static Approx custom();\n\n  Approx operator-() const;\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  Approx operator()(T const& value) {\n    Approx approx(static_cast<double>(value));\n    approx.m_epsilon = m_epsilon;\n    approx.m_margin  = m_margin;\n    approx.m_scale   = m_scale;\n    return approx;\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  explicit Approx(T const& value) : Approx(static_cast<double>(value)) {}\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  friend bool operator==(const T& lhs, Approx const& rhs) {\n    auto lhs_v = static_cast<double>(lhs);\n    return rhs.equalityComparisonImpl(lhs_v);\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  friend bool operator==(Approx const& lhs, const T& rhs) {\n    return operator==(rhs, lhs);\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            
std::is_constructible<double, T>::value>::type>\n  friend bool operator!=(T const& lhs, Approx const& rhs) {\n    return !operator==(lhs, rhs);\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  friend bool operator!=(Approx const& lhs, T const& rhs) {\n    return !operator==(rhs, lhs);\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  friend bool operator<=(T const& lhs, Approx const& rhs) {\n    return static_cast<double>(lhs) < rhs.m_value || lhs == rhs;\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  friend bool operator<=(Approx const& lhs, T const& rhs) {\n    return lhs.m_value < static_cast<double>(rhs) || lhs == rhs;\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  friend bool operator>=(T const& lhs, Approx const& rhs) {\n    return static_cast<double>(lhs) > rhs.m_value || lhs == rhs;\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  friend bool operator>=(Approx const& lhs, T const& rhs) {\n    return lhs.m_value > static_cast<double>(rhs) || lhs == rhs;\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  Approx& epsilon(T const& newEpsilon) {\n    double epsilonAsDouble = static_cast<double>(newEpsilon);\n    setEpsilon(epsilonAsDouble);\n    return *this;\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  Approx& margin(T const& newMargin) {\n    double marginAsDouble = 
static_cast<double>(newMargin);\n    setMargin(marginAsDouble);\n    return *this;\n  }\n\n  template <typename T, typename = typename std::enable_if<\n                            std::is_constructible<double, T>::value>::type>\n  Approx& scale(T const& newScale) {\n    m_scale = static_cast<double>(newScale);\n    return *this;\n  }\n\n  std::string toString() const;\n\nprivate:\n  double m_epsilon;\n  double m_margin;\n  double m_scale;\n  double m_value;\n};\n} // end namespace Detail\n\nnamespace literals {\nDetail::Approx operator\"\" _a(long double val);\nDetail::Approx operator\"\" _a(unsigned long long val);\n} // end namespace literals\n\ntemplate <>\nstruct StringMaker<Catch::Detail::Approx> {\n  static std::string convert(Catch::Detail::Approx const& value);\n};\n\n} // end namespace Catch\n\n// end catch_approx.h\n// start catch_string_manip.h\n\n#include <string>\n#include <iosfwd>\n#include <vector>\n\nnamespace Catch {\n\nbool startsWith(std::string const& s, std::string const& prefix);\nbool startsWith(std::string const& s, char prefix);\nbool endsWith(std::string const& s, std::string const& suffix);\nbool endsWith(std::string const& s, char suffix);\nbool contains(std::string const& s, std::string const& infix);\nvoid toLowerInPlace(std::string& s);\nstd::string toLower(std::string const& s);\n//! Returns a new string without whitespace at the start/end\nstd::string trim(std::string const& str);\n//! Returns a substring of the original ref without whitespace. Beware\n//! lifetimes!\nStringRef trim(StringRef ref);\n\n// !!! 
Be aware, returns refs into original string - make sure original string\n// outlives them\nstd::vector<StringRef> splitStringRef(StringRef str, char delimiter);\nbool replaceInPlace(std::string& str, std::string const& replaceThis,\n                    std::string const& withThis);\n\nstruct pluralise {\n  pluralise(std::size_t count, std::string const& label);\n\n  friend std::ostream& operator<<(std::ostream& os,\n                                  pluralise const& pluraliser);\n\n  std::size_t m_count;\n  std::string m_label;\n};\n} // namespace Catch\n\n// end catch_string_manip.h\n#ifndef CATCH_CONFIG_DISABLE_MATCHERS\n// start catch_capture_matchers.h\n\n// start catch_matchers.h\n\n#include <string>\n#include <vector>\n\nnamespace Catch {\nnamespace Matchers {\nnamespace Impl {\n\ntemplate <typename ArgT>\nstruct MatchAllOf;\ntemplate <typename ArgT>\nstruct MatchAnyOf;\ntemplate <typename ArgT>\nstruct MatchNotOf;\n\nclass MatcherUntypedBase {\npublic:\n  MatcherUntypedBase()                                     = default;\n  MatcherUntypedBase(MatcherUntypedBase const&)            = default;\n  MatcherUntypedBase& operator=(MatcherUntypedBase const&) = delete;\n  std::string toString() const;\n\nprotected:\n  virtual ~MatcherUntypedBase();\n  virtual std::string describe() const = 0;\n  mutable std::string m_cachedToString;\n};\n\n#ifdef __clang__\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wnon-virtual-dtor\"\n#endif\n\ntemplate <typename ObjectT>\nstruct MatcherMethod {\n  virtual bool match(ObjectT const& arg) const = 0;\n};\n\n#if defined(__OBJC__)\n// Hack to fix Catch GH issue #1661. 
Could use id for generic Object support.\n// use of const for Object pointers is very uncommon and under ARC it causes\n// some kind of signature mismatch that breaks compilation\ntemplate <>\nstruct MatcherMethod<NSString*> {\n  virtual bool match(NSString* arg) const = 0;\n};\n#endif\n\n#ifdef __clang__\n#pragma clang diagnostic pop\n#endif\n\ntemplate <typename T>\nstruct MatcherBase : MatcherUntypedBase, MatcherMethod<T> {\n\n  MatchAllOf<T> operator&&(MatcherBase const& other) const;\n  MatchAnyOf<T> operator||(MatcherBase const& other) const;\n  MatchNotOf<T> operator!() const;\n};\n\ntemplate <typename ArgT>\nstruct MatchAllOf : MatcherBase<ArgT> {\n  bool match(ArgT const& arg) const override {\n    for (auto matcher : m_matchers) {\n      if (!matcher->match(arg))\n        return false;\n    }\n    return true;\n  }\n  std::string describe() const override {\n    std::string description;\n    description.reserve(4 + m_matchers.size() * 32);\n    description += \"( \";\n    bool first = true;\n    for (auto matcher : m_matchers) {\n      if (first)\n        first = false;\n      else\n        description += \" and \";\n      description += matcher->toString();\n    }\n    description += \" )\";\n    return description;\n  }\n\n  MatchAllOf<ArgT> operator&&(MatcherBase<ArgT> const& other) {\n    auto copy(*this);\n    copy.m_matchers.push_back(&other);\n    return copy;\n  }\n\n  std::vector<MatcherBase<ArgT> const*> m_matchers;\n};\ntemplate <typename ArgT>\nstruct MatchAnyOf : MatcherBase<ArgT> {\n\n  bool match(ArgT const& arg) const override {\n    for (auto matcher : m_matchers) {\n      if (matcher->match(arg))\n        return true;\n    }\n    return false;\n  }\n  std::string describe() const override {\n    std::string description;\n    description.reserve(4 + m_matchers.size() * 32);\n    description += \"( \";\n    bool first = true;\n    for (auto matcher : m_matchers) {\n      if (first)\n        first = false;\n      else\n        description 
+= \" or \";\n      description += matcher->toString();\n    }\n    description += \" )\";\n    return description;\n  }\n\n  MatchAnyOf<ArgT> operator||(MatcherBase<ArgT> const& other) {\n    auto copy(*this);\n    copy.m_matchers.push_back(&other);\n    return copy;\n  }\n\n  std::vector<MatcherBase<ArgT> const*> m_matchers;\n};\n\ntemplate <typename ArgT>\nstruct MatchNotOf : MatcherBase<ArgT> {\n\n  MatchNotOf(MatcherBase<ArgT> const& underlyingMatcher)\n      : m_underlyingMatcher(underlyingMatcher) {}\n\n  bool match(ArgT const& arg) const override {\n    return !m_underlyingMatcher.match(arg);\n  }\n\n  std::string describe() const override {\n    return \"not \" + m_underlyingMatcher.toString();\n  }\n  MatcherBase<ArgT> const& m_underlyingMatcher;\n};\n\ntemplate <typename T>\nMatchAllOf<T> MatcherBase<T>::operator&&(MatcherBase const& other) const {\n  return MatchAllOf<T>() && *this && other;\n}\ntemplate <typename T>\nMatchAnyOf<T> MatcherBase<T>::operator||(MatcherBase const& other) const {\n  return MatchAnyOf<T>() || *this || other;\n}\ntemplate <typename T>\nMatchNotOf<T> MatcherBase<T>::operator!() const {\n  return MatchNotOf<T>(*this);\n}\n\n} // namespace Impl\n\n} // namespace Matchers\n\nusing namespace Matchers;\nusing Matchers::Impl::MatcherBase;\n\n} // namespace Catch\n\n// end catch_matchers.h\n// start catch_matchers_exception.hpp\n\nnamespace Catch {\nnamespace Matchers {\nnamespace Exception {\n\nclass ExceptionMessageMatcher : public MatcherBase<std::exception> {\n  std::string m_message;\n\npublic:\n  ExceptionMessageMatcher(std::string const& message) : m_message(message) {}\n\n  bool match(std::exception const& ex) const override;\n\n  std::string describe() const override;\n};\n\n} // namespace Exception\n\nException::ExceptionMessageMatcher Message(std::string const& message);\n\n} // namespace Matchers\n} // namespace Catch\n\n// end catch_matchers_exception.hpp\n// start catch_matchers_floating.h\n\nnamespace Catch {\nnamespace 
Matchers {\n\nnamespace Floating {\n\nenum class FloatingPointKind : uint8_t;\n\nstruct WithinAbsMatcher : MatcherBase<double> {\n  WithinAbsMatcher(double target, double margin);\n  bool match(double const& matchee) const override;\n  std::string describe() const override;\n\nprivate:\n  double m_target;\n  double m_margin;\n};\n\nstruct WithinUlpsMatcher : MatcherBase<double> {\n  WithinUlpsMatcher(double target, uint64_t ulps, FloatingPointKind baseType);\n  bool match(double const& matchee) const override;\n  std::string describe() const override;\n\nprivate:\n  double m_target;\n  uint64_t m_ulps;\n  FloatingPointKind m_type;\n};\n\n// Given IEEE-754 format for floats and doubles, we can assume\n// that float -> double promotion is lossless. Given this, we can\n// assume that if we do the standard relative comparison of\n// |lhs - rhs| <= epsilon * max(fabs(lhs), fabs(rhs)), then we get\n// the same result if we do this for floats, as if we do this for\n// doubles that were promoted from floats.\nstruct WithinRelMatcher : MatcherBase<double> {\n  WithinRelMatcher(double target, double epsilon);\n  bool match(double const& matchee) const override;\n  std::string describe() const override;\n\nprivate:\n  double m_target;\n  double m_epsilon;\n};\n\n} // namespace Floating\n\n// The following functions create the actual matcher objects.\n// This allows the types to be inferred\nFloating::WithinUlpsMatcher WithinULP(double target, uint64_t maxUlpDiff);\nFloating::WithinUlpsMatcher WithinULP(float target, uint64_t maxUlpDiff);\nFloating::WithinAbsMatcher WithinAbs(double target, double margin);\nFloating::WithinRelMatcher WithinRel(double target, double eps);\n// defaults epsilon to 100*numeric_limits<double>::epsilon()\nFloating::WithinRelMatcher WithinRel(double target);\nFloating::WithinRelMatcher WithinRel(float target, float eps);\n// defaults epsilon to 100*numeric_limits<float>::epsilon()\nFloating::WithinRelMatcher WithinRel(float target);\n\n} // namespace 
Matchers\n} // namespace Catch\n\n// end catch_matchers_floating.h\n// start catch_matchers_generic.hpp\n\n#include <functional>\n#include <string>\n\nnamespace Catch {\nnamespace Matchers {\nnamespace Generic {\n\nnamespace Detail {\nstd::string finalizeDescription(const std::string& desc);\n}\n\ntemplate <typename T>\nclass PredicateMatcher : public MatcherBase<T> {\n  std::function<bool(T const&)> m_predicate;\n  std::string m_description;\n\npublic:\n  PredicateMatcher(std::function<bool(T const&)> const& elem,\n                   std::string const& descr)\n      : m_predicate(std::move(elem)),\n        m_description(Detail::finalizeDescription(descr)) {}\n\n  bool match(T const& item) const override { return m_predicate(item); }\n\n  std::string describe() const override { return m_description; }\n};\n\n} // namespace Generic\n\n// The following functions create the actual matcher objects.\n// The user has to explicitly specify type to the function, because\n// inferring std::function<bool(T const&)> is hard (but possible) and\n// requires a lot of TMP.\ntemplate <typename T>\nGeneric::PredicateMatcher<T>\nPredicate(std::function<bool(T const&)> const& predicate,\n          std::string const& description = \"\") {\n  return Generic::PredicateMatcher<T>(predicate, description);\n}\n\n} // namespace Matchers\n} // namespace Catch\n\n// end catch_matchers_generic.hpp\n// start catch_matchers_string.h\n\n#include <string>\n\nnamespace Catch {\nnamespace Matchers {\n\nnamespace StdString {\n\nstruct CasedString {\n  CasedString(std::string const& str, CaseSensitive::Choice caseSensitivity);\n  std::string adjustString(std::string const& str) const;\n  std::string caseSensitivitySuffix() const;\n\n  CaseSensitive::Choice m_caseSensitivity;\n  std::string m_str;\n};\n\nstruct StringMatcherBase : MatcherBase<std::string> {\n  StringMatcherBase(std::string const& operation,\n                    CasedString const& comparator);\n  std::string describe() const 
override;\n\n  CasedString m_comparator;\n  std::string m_operation;\n};\n\nstruct EqualsMatcher : StringMatcherBase {\n  EqualsMatcher(CasedString const& comparator);\n  bool match(std::string const& source) const override;\n};\nstruct ContainsMatcher : StringMatcherBase {\n  ContainsMatcher(CasedString const& comparator);\n  bool match(std::string const& source) const override;\n};\nstruct StartsWithMatcher : StringMatcherBase {\n  StartsWithMatcher(CasedString const& comparator);\n  bool match(std::string const& source) const override;\n};\nstruct EndsWithMatcher : StringMatcherBase {\n  EndsWithMatcher(CasedString const& comparator);\n  bool match(std::string const& source) const override;\n};\n\nstruct RegexMatcher : MatcherBase<std::string> {\n  RegexMatcher(std::string regex, CaseSensitive::Choice caseSensitivity);\n  bool match(std::string const& matchee) const override;\n  std::string describe() const override;\n\nprivate:\n  std::string m_regex;\n  CaseSensitive::Choice m_caseSensitivity;\n};\n\n} // namespace StdString\n\n// The following functions create the actual matcher objects.\n// This allows the types to be inferred\n\nStdString::EqualsMatcher\nEquals(std::string const& str,\n       CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes);\nStdString::ContainsMatcher\nContains(std::string const& str,\n         CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes);\nStdString::EndsWithMatcher\nEndsWith(std::string const& str,\n         CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes);\nStdString::StartsWithMatcher\nStartsWith(std::string const& str,\n           CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes);\nStdString::RegexMatcher\nMatches(std::string const& regex,\n        CaseSensitive::Choice caseSensitivity = CaseSensitive::Yes);\n\n} // namespace Matchers\n} // namespace Catch\n\n// end catch_matchers_string.h\n// start catch_matchers_vector.h\n\n#include <algorithm>\n\nnamespace Catch {\nnamespace Matchers 
{\n\nnamespace Vector {\ntemplate <typename T, typename Alloc>\nstruct ContainsElementMatcher : MatcherBase<std::vector<T, Alloc>> {\n\n  ContainsElementMatcher(T const& comparator) : m_comparator(comparator) {}\n\n  bool match(std::vector<T, Alloc> const& v) const override {\n    for (auto const& el : v) {\n      if (el == m_comparator) {\n        return true;\n      }\n    }\n    return false;\n  }\n\n  std::string describe() const override {\n    return \"Contains: \" + ::Catch::Detail::stringify(m_comparator);\n  }\n\n  T const& m_comparator;\n};\n\ntemplate <typename T, typename AllocComp, typename AllocMatch>\nstruct ContainsMatcher : MatcherBase<std::vector<T, AllocMatch>> {\n\n  ContainsMatcher(std::vector<T, AllocComp> const& comparator)\n      : m_comparator(comparator) {}\n\n  bool match(std::vector<T, AllocMatch> const& v) const override {\n    // !TBD: see note in EqualsMatcher\n    if (m_comparator.size() > v.size())\n      return false;\n    for (auto const& comparator : m_comparator) {\n      auto present = false;\n      for (const auto& el : v) {\n        if (el == comparator) {\n          present = true;\n          break;\n        }\n      }\n      if (!present) {\n        return false;\n      }\n    }\n    return true;\n  }\n  std::string describe() const override {\n    return \"Contains: \" + ::Catch::Detail::stringify(m_comparator);\n  }\n\n  std::vector<T, AllocComp> const& m_comparator;\n};\n\ntemplate <typename T, typename AllocComp, typename AllocMatch>\nstruct EqualsMatcher : MatcherBase<std::vector<T, AllocMatch>> {\n\n  EqualsMatcher(std::vector<T, AllocComp> const& comparator)\n      : m_comparator(comparator) {}\n\n  bool match(std::vector<T, AllocMatch> const& v) const override {\n    // !TBD: This currently works if all elements can be compared using !=\n    // - a more general approach would be via a compare template that defaults\n    // to using !=. but could be specialised for, e.g. 
std::vector<T, Alloc> etc\n    // - then just call that directly\n    if (m_comparator.size() != v.size())\n      return false;\n    for (std::size_t i = 0; i < v.size(); ++i)\n      if (m_comparator[i] != v[i])\n        return false;\n    return true;\n  }\n  std::string describe() const override {\n    return \"Equals: \" + ::Catch::Detail::stringify(m_comparator);\n  }\n  std::vector<T, AllocComp> const& m_comparator;\n};\n\ntemplate <typename T, typename AllocComp, typename AllocMatch>\nstruct ApproxMatcher : MatcherBase<std::vector<T, AllocMatch>> {\n\n  ApproxMatcher(std::vector<T, AllocComp> const& comparator)\n      : m_comparator(comparator) {}\n\n  bool match(std::vector<T, AllocMatch> const& v) const override {\n    if (m_comparator.size() != v.size())\n      return false;\n    for (std::size_t i = 0; i < v.size(); ++i)\n      if (m_comparator[i] != approx(v[i]))\n        return false;\n    return true;\n  }\n  std::string describe() const override {\n    return \"is approx: \" + ::Catch::Detail::stringify(m_comparator);\n  }\n  template <typename = typename std::enable_if<\n                std::is_constructible<double, T>::value>::type>\n  ApproxMatcher& epsilon(T const& newEpsilon) {\n    approx.epsilon(newEpsilon);\n    return *this;\n  }\n  template <typename = typename std::enable_if<\n                std::is_constructible<double, T>::value>::type>\n  ApproxMatcher& margin(T const& newMargin) {\n    approx.margin(newMargin);\n    return *this;\n  }\n  template <typename = typename std::enable_if<\n                std::is_constructible<double, T>::value>::type>\n  ApproxMatcher& scale(T const& newScale) {\n    approx.scale(newScale);\n    return *this;\n  }\n\n  std::vector<T, AllocComp> const& m_comparator;\n  mutable Catch::Detail::Approx approx = Catch::Detail::Approx::custom();\n};\n\ntemplate <typename T, typename AllocComp, typename AllocMatch>\nstruct UnorderedEqualsMatcher : MatcherBase<std::vector<T, AllocMatch>> {\n  
UnorderedEqualsMatcher(std::vector<T, AllocComp> const& target)\n      : m_target(target) {}\n  bool match(std::vector<T, AllocMatch> const& vec) const override {\n    if (m_target.size() != vec.size()) {\n      return false;\n    }\n    return std::is_permutation(m_target.begin(), m_target.end(), vec.begin());\n  }\n\n  std::string describe() const override {\n    return \"UnorderedEquals: \" + ::Catch::Detail::stringify(m_target);\n  }\n\nprivate:\n  std::vector<T, AllocComp> const& m_target;\n};\n\n} // namespace Vector\n\n// The following functions create the actual matcher objects.\n// This allows the types to be inferred\n\ntemplate <typename T, typename AllocComp = std::allocator<T>,\n          typename AllocMatch = AllocComp>\nVector::ContainsMatcher<T, AllocComp, AllocMatch>\nContains(std::vector<T, AllocComp> const& comparator) {\n  return Vector::ContainsMatcher<T, AllocComp, AllocMatch>(comparator);\n}\n\ntemplate <typename T, typename Alloc = std::allocator<T>>\nVector::ContainsElementMatcher<T, Alloc> VectorContains(T const& comparator) {\n  return Vector::ContainsElementMatcher<T, Alloc>(comparator);\n}\n\ntemplate <typename T, typename AllocComp = std::allocator<T>,\n          typename AllocMatch = AllocComp>\nVector::EqualsMatcher<T, AllocComp, AllocMatch>\nEquals(std::vector<T, AllocComp> const& comparator) {\n  return Vector::EqualsMatcher<T, AllocComp, AllocMatch>(comparator);\n}\n\ntemplate <typename T, typename AllocComp = std::allocator<T>,\n          typename AllocMatch = AllocComp>\nVector::ApproxMatcher<T, AllocComp, AllocMatch>\nApprox(std::vector<T, AllocComp> const& comparator) {\n  return Vector::ApproxMatcher<T, AllocComp, AllocMatch>(comparator);\n}\n\ntemplate <typename T, typename AllocComp = std::allocator<T>,\n          typename AllocMatch = AllocComp>\nVector::UnorderedEqualsMatcher<T, AllocComp, AllocMatch>\nUnorderedEquals(std::vector<T, AllocComp> const& target) {\n  return Vector::UnorderedEqualsMatcher<T, AllocComp, 
AllocMatch>(target);\n}\n\n} // namespace Matchers\n} // namespace Catch\n\n// end catch_matchers_vector.h\nnamespace Catch {\n\ntemplate <typename ArgT, typename MatcherT>\nclass MatchExpr : public ITransientExpression {\n  ArgT const& m_arg;\n  MatcherT m_matcher;\n  StringRef m_matcherString;\n\npublic:\n  MatchExpr(ArgT const& arg, MatcherT const& matcher,\n            StringRef const& matcherString)\n      : ITransientExpression{true, matcher.match(arg)}, m_arg(arg),\n        m_matcher(matcher), m_matcherString(matcherString) {}\n\n  void streamReconstructedExpression(std::ostream& os) const override {\n    auto matcherAsString = m_matcher.toString();\n    os << Catch::Detail::stringify(m_arg) << ' ';\n    if (matcherAsString == Detail::unprintableString)\n      os << m_matcherString;\n    else\n      os << matcherAsString;\n  }\n};\n\nusing StringMatcher = Matchers::Impl::MatcherBase<std::string>;\n\nvoid handleExceptionMatchExpr(AssertionHandler& handler,\n                              StringMatcher const& matcher,\n                              StringRef const& matcherString);\n\ntemplate <typename ArgT, typename MatcherT>\nauto makeMatchExpr(ArgT const& arg, MatcherT const& matcher,\n                   StringRef const& matcherString)\n    -> MatchExpr<ArgT, MatcherT> {\n  return MatchExpr<ArgT, MatcherT>(arg, matcher, matcherString);\n}\n\n} // namespace Catch\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CHECK_THAT(macroName, matcher, resultDisposition, arg)        \\\n  do {                                                                         \\\n    Catch::AssertionHandler catchAssertionHandler(                             \\\n        macroName##_catch_sr, CATCH_INTERNAL_LINEINFO,                         \\\n        CATCH_INTERNAL_STRINGIFY(arg) \", \" CATCH_INTERNAL_STRINGIFY(matcher),  \\\n        resultDisposition);                                                    \\\n    INTERNAL_CATCH_TRY 
{                                                       \\\n      catchAssertionHandler.handleExpr(                                        \\\n          Catch::makeMatchExpr(arg, matcher, #matcher##_catch_sr));            \\\n    }                                                                          \\\n    INTERNAL_CATCH_CATCH(catchAssertionHandler)                                \\\n    INTERNAL_CATCH_REACT(catchAssertionHandler)                                \\\n  } while (false)\n\n///////////////////////////////////////////////////////////////////////////////\n#define INTERNAL_CATCH_THROWS_MATCHES(macroName, exceptionType,                \\\n                                      resultDisposition, matcher, ...)         \\\n  do {                                                                         \\\n    Catch::AssertionHandler catchAssertionHandler(                             \\\n        macroName##_catch_sr, CATCH_INTERNAL_LINEINFO,                         \\\n        CATCH_INTERNAL_STRINGIFY(__VA_ARGS__) \", \" CATCH_INTERNAL_STRINGIFY(   \\\n            exceptionType) \", \" CATCH_INTERNAL_STRINGIFY(matcher),             \\\n        resultDisposition);                                                    \\\n    if (catchAssertionHandler.allowThrows())                                   \\\n      try {                                                                    \\\n        static_cast<void>(__VA_ARGS__);                                        \\\n        catchAssertionHandler.handleUnexpectedExceptionNotThrown();            \\\n      } catch (exceptionType const& ex) {                                      \\\n        catchAssertionHandler.handleExpr(                                      \\\n            Catch::makeMatchExpr(ex, matcher, #matcher##_catch_sr));           \\\n      } catch (...) 
{                                                          \\\n        catchAssertionHandler.handleUnexpectedInflightException();             \\\n      }                                                                        \\\n    else                                                                       \\\n      catchAssertionHandler.handleThrowingCallSkipped();                       \\\n    INTERNAL_CATCH_REACT(catchAssertionHandler)                                \\\n  } while (false)\n\n// end catch_capture_matchers.h\n#endif\n// start catch_generators.hpp\n\n// start catch_interfaces_generatortracker.h\n\n#include <memory>\n\nnamespace Catch {\n\nnamespace Generators {\nclass GeneratorUntypedBase {\npublic:\n  GeneratorUntypedBase() = default;\n  virtual ~GeneratorUntypedBase();\n  // Attempts to move the generator to the next element\n  //\n  // Returns true iff the move succeeded (and a valid element\n  // can be retrieved).\n  virtual bool next() = 0;\n};\nusing GeneratorBasePtr = std::unique_ptr<GeneratorUntypedBase>;\n\n} // namespace Generators\n\nstruct IGeneratorTracker {\n  virtual ~IGeneratorTracker();\n  virtual auto hasGenerator() const -> bool                                = 0;\n  virtual auto getGenerator() const -> Generators::GeneratorBasePtr const& = 0;\n  virtual void setGenerator(Generators::GeneratorBasePtr&& generator)      = 0;\n};\n\n} // namespace Catch\n\n// end catch_interfaces_generatortracker.h\n// start catch_enforce.h\n\n#include <exception>\n\nnamespace Catch {\n#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\ntemplate <typename Ex>\n[[noreturn]] void throw_exception(Ex const& e) {\n  throw e;\n}\n#else // ^^ Exceptions are enabled //  Exceptions are disabled vv\n[[noreturn]] void throw_exception(std::exception const& e);\n#endif\n\n[[noreturn]] void throw_logic_error(std::string const& msg);\n[[noreturn]] void throw_domain_error(std::string const& msg);\n[[noreturn]] void throw_runtime_error(std::string const& msg);\n\n} // 
namespace Catch\n\n#define CATCH_MAKE_MSG(...) (Catch::ReusableStringStream() << __VA_ARGS__).str()\n\n#define CATCH_INTERNAL_ERROR(...)                                              \\\n  Catch::throw_logic_error(CATCH_MAKE_MSG(                                     \\\n      CATCH_INTERNAL_LINEINFO << \": Internal Catch2 error: \" << __VA_ARGS__))\n\n#define CATCH_ERROR(...) Catch::throw_domain_error(CATCH_MAKE_MSG(__VA_ARGS__))\n\n#define CATCH_RUNTIME_ERROR(...)                                               \\\n  Catch::throw_runtime_error(CATCH_MAKE_MSG(__VA_ARGS__))\n\n#define CATCH_ENFORCE(condition, ...)                                          \\\n  do {                                                                         \\\n    if (!(condition))                                                          \\\n      CATCH_ERROR(__VA_ARGS__);                                                \\\n  } while (false)\n\n// end catch_enforce.h\n#include <memory>\n#include <vector>\n#include <cassert>\n\n#include <utility>\n#include <exception>\n\nnamespace Catch {\n\nclass GeneratorException : public std::exception {\n  const char* const m_msg = \"\";\n\npublic:\n  GeneratorException(const char* msg) : m_msg(msg) {}\n\n  const char* what() const noexcept override final;\n};\n\nnamespace Generators {\n\n// !TBD move this into its own location?\nnamespace pf {\ntemplate <typename T, typename... Args>\nstd::unique_ptr<T> make_unique(Args&&... 
args) {\n  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));\n}\n} // namespace pf\n\ntemplate <typename T>\nstruct IGenerator : GeneratorUntypedBase {\n  virtual ~IGenerator() = default;\n\n  // Returns the current element of the generator\n  //\n  // \\Precondition The generator is either freshly constructed,\n  // or the last call to `next()` returned true\n  virtual T const& get() const = 0;\n  using type                   = T;\n};\n\ntemplate <typename T>\nclass SingleValueGenerator final : public IGenerator<T> {\n  T m_value;\n\npublic:\n  SingleValueGenerator(T&& value) : m_value(std::move(value)) {}\n\n  T const& get() const override { return m_value; }\n  bool next() override { return false; }\n};\n\ntemplate <typename T>\nclass FixedValuesGenerator final : public IGenerator<T> {\n  static_assert(\n      !std::is_same<T, bool>::value,\n      \"FixedValuesGenerator does not support bools because of std::vector<bool>\"\n      \"specialization, use SingleValue Generator instead.\");\n  std::vector<T> m_values;\n  size_t m_idx = 0;\n\npublic:\n  FixedValuesGenerator(std::initializer_list<T> values) : m_values(values) {}\n\n  T const& get() const override { return m_values[m_idx]; }\n  bool next() override {\n    ++m_idx;\n    return m_idx < m_values.size();\n  }\n};\n\ntemplate <typename T>\nclass GeneratorWrapper final {\n  std::unique_ptr<IGenerator<T>> m_generator;\n\npublic:\n  GeneratorWrapper(std::unique_ptr<IGenerator<T>> generator)\n      : m_generator(std::move(generator)) {}\n  T const& get() const { return m_generator->get(); }\n  bool next() { return m_generator->next(); }\n};\n\ntemplate <typename T>\nGeneratorWrapper<T> value(T&& value) {\n  return GeneratorWrapper<T>(\n      pf::make_unique<SingleValueGenerator<T>>(std::forward<T>(value)));\n}\ntemplate <typename T>\nGeneratorWrapper<T> values(std::initializer_list<T> values) {\n  return GeneratorWrapper<T>(pf::make_unique<FixedValuesGenerator<T>>(values));\n}\n\ntemplate <typename 
T>\nclass Generators : public IGenerator<T> {\n  std::vector<GeneratorWrapper<T>> m_generators;\n  size_t m_current = 0;\n\n  void populate(GeneratorWrapper<T>&& generator) {\n    m_generators.emplace_back(std::move(generator));\n  }\n  void populate(T&& val) {\n    m_generators.emplace_back(value(std::forward<T>(val)));\n  }\n  template <typename U>\n  void populate(U&& val) {\n    populate(T(std::forward<U>(val)));\n  }\n  template <typename U, typename... Gs>\n  void populate(U&& valueOrGenerator, Gs&&... moreGenerators) {\n    populate(std::forward<U>(valueOrGenerator));\n    populate(std::forward<Gs>(moreGenerators)...);\n  }\n\npublic:\n  template <typename... Gs>\n  Generators(Gs&&... moreGenerators) {\n    m_generators.reserve(sizeof...(Gs));\n    populate(std::forward<Gs>(moreGenerators)...);\n  }\n\n  T const& get() const override { return m_generators[m_current].get(); }\n\n  bool next() override {\n    if (m_current >= m_generators.size()) {\n      return false;\n    }\n    const bool current_status = m_generators[m_current].next();\n    if (!current_status) {\n      ++m_current;\n    }\n    return m_current < m_generators.size();\n  }\n};\n\ntemplate <typename... Ts>\nGeneratorWrapper<std::tuple<Ts...>>\ntable(std::initializer_list<std::tuple<typename std::decay<Ts>::type...>>\n          tuples) {\n  return values<std::tuple<Ts...>>(tuples);\n}\n\n// Tag type to signal that a generator sequence should convert arguments to a\n// specific type\ntemplate <typename T>\nstruct as {};\n\ntemplate <typename T, typename... Gs>\nauto makeGenerators(GeneratorWrapper<T>&& generator, Gs&&... moreGenerators)\n    -> Generators<T> {\n  return Generators<T>(std::move(generator),\n                       std::forward<Gs>(moreGenerators)...);\n}\ntemplate <typename T>\nauto makeGenerators(GeneratorWrapper<T>&& generator) -> Generators<T> {\n  return Generators<T>(std::move(generator));\n}\ntemplate <typename T, typename... Gs>\nauto makeGenerators(T&& val, Gs&&... 
moreGenerators) -> Generators<T> {\n  return makeGenerators(value(std::forward<T>(val)),\n                        std::forward<Gs>(moreGenerators)...);\n}\ntemplate <typename T, typename U, typename... Gs>\nauto makeGenerators(as<T>, U&& val, Gs&&... moreGenerators) -> Generators<T> {\n  return makeGenerators(value(T(std::forward<U>(val))),\n                        std::forward<Gs>(moreGenerators)...);\n}\n\nauto acquireGeneratorTracker(StringRef generatorName,\n                             SourceLineInfo const& lineInfo)\n    -> IGeneratorTracker&;\n\ntemplate <typename L>\n// Note: The type after -> is weird, because VS2015 cannot parse\n//       the expression used in the typedef inside, when it is in\n//       return type. Yeah.\nauto generate(StringRef generatorName, SourceLineInfo const& lineInfo,\n              L const& generatorExpression)\n    -> decltype(std::declval<decltype(generatorExpression())>().get()) {\n  using UnderlyingType = typename decltype(generatorExpression())::type;\n\n  IGeneratorTracker& tracker = acquireGeneratorTracker(generatorName, lineInfo);\n  if (!tracker.hasGenerator()) {\n    tracker.setGenerator(\n        pf::make_unique<Generators<UnderlyingType>>(generatorExpression()));\n  }\n\n  auto const& generator =\n      static_cast<IGenerator<UnderlyingType> const&>(*tracker.getGenerator());\n  return generator.get();\n}\n\n} // namespace Generators\n} // namespace Catch\n\n#define GENERATE(...)                                                          
\\\n  Catch::Generators::generate(                                                 \\\n      INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)),         \\\n      CATCH_INTERNAL_LINEINFO, [] {                                            \\\n        using namespace Catch::Generators;                                     \\\n        return makeGenerators(__VA_ARGS__);                                    \\\n      }) // NOLINT(google-build-using-namespace)\n#define GENERATE_COPY(...)                                                     \\\n  Catch::Generators::generate(                                                 \\\n      INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)),         \\\n      CATCH_INTERNAL_LINEINFO, [=] {                                           \\\n        using namespace Catch::Generators;                                     \\\n        return makeGenerators(__VA_ARGS__);                                    \\\n      }) // NOLINT(google-build-using-namespace)\n#define GENERATE_REF(...)                                                      
\\\n  Catch::Generators::generate(                                                 \\\n      INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_UNIQUE_NAME(generator)),         \\\n      CATCH_INTERNAL_LINEINFO, [&] {                                           \\\n        using namespace Catch::Generators;                                     \\\n        return makeGenerators(__VA_ARGS__);                                    \\\n      }) // NOLINT(google-build-using-namespace)\n\n// end catch_generators.hpp\n// start catch_generators_generic.hpp\n\nnamespace Catch {\nnamespace Generators {\n\ntemplate <typename T>\nclass TakeGenerator : public IGenerator<T> {\n  GeneratorWrapper<T> m_generator;\n  size_t m_returned = 0;\n  size_t m_target;\n\npublic:\n  TakeGenerator(size_t target, GeneratorWrapper<T>&& generator)\n      : m_generator(std::move(generator)), m_target(target) {\n    assert(target != 0 && \"Empty generators are not allowed\");\n  }\n  T const& get() const override { return m_generator.get(); }\n  bool next() override {\n    ++m_returned;\n    if (m_returned >= m_target) {\n      return false;\n    }\n\n    const auto success = m_generator.next();\n    // If the underlying generator does not contain enough values\n    // then we cut short as well\n    if (!success) {\n      m_returned = m_target;\n    }\n    return success;\n  }\n};\n\ntemplate <typename T>\nGeneratorWrapper<T> take(size_t target, GeneratorWrapper<T>&& generator) {\n  return GeneratorWrapper<T>(\n      pf::make_unique<TakeGenerator<T>>(target, std::move(generator)));\n}\n\ntemplate <typename T, typename Predicate>\nclass FilterGenerator : public IGenerator<T> {\n  GeneratorWrapper<T> m_generator;\n  Predicate m_predicate;\n\npublic:\n  template <typename P = Predicate>\n  FilterGenerator(P&& pred, GeneratorWrapper<T>&& generator)\n      : m_generator(std::move(generator)), m_predicate(std::forward<P>(pred)) {\n    if (!m_predicate(m_generator.get())) {\n      // It might happen that there are no 
values that pass the\n      // filter. In that case we throw an exception.\n      auto has_initial_value = next();\n      if (!has_initial_value) {\n        Catch::throw_exception(\n            GeneratorException(\"No valid value found in filtered generator\"));\n      }\n    }\n  }\n\n  T const& get() const override { return m_generator.get(); }\n\n  bool next() override {\n    bool success = m_generator.next();\n    if (!success) {\n      return false;\n    }\n    while (!m_predicate(m_generator.get()) &&\n           (success = m_generator.next()) == true)\n      ;\n    return success;\n  }\n};\n\ntemplate <typename T, typename Predicate>\nGeneratorWrapper<T> filter(Predicate&& pred, GeneratorWrapper<T>&& generator) {\n  return GeneratorWrapper<T>(std::unique_ptr<IGenerator<T>>(\n      pf::make_unique<FilterGenerator<T, Predicate>>(\n          std::forward<Predicate>(pred), std::move(generator))));\n}\n\ntemplate <typename T>\nclass RepeatGenerator : public IGenerator<T> {\n  static_assert(!std::is_same<T, bool>::value,\n                \"RepeatGenerator currently does not support bools\"\n                \"because of std::vector<bool> specialization\");\n  GeneratorWrapper<T> m_generator;\n  mutable std::vector<T> m_returned;\n  size_t m_target_repeats;\n  size_t m_current_repeat = 0;\n  size_t m_repeat_index   = 0;\n\npublic:\n  RepeatGenerator(size_t repeats, GeneratorWrapper<T>&& generator)\n      : m_generator(std::move(generator)), m_target_repeats(repeats) {\n    assert(m_target_repeats > 0 &&\n           \"Repeat generator must repeat at least once\");\n  }\n\n  T const& get() const override {\n    if (m_current_repeat == 0) {\n      m_returned.push_back(m_generator.get());\n      return m_returned.back();\n    }\n    return m_returned[m_repeat_index];\n  }\n\n  bool next() override {\n    // There are 2 basic cases:\n    // 1) We are still reading the generator\n    // 2) We are reading our own cache\n\n    // In the first case, we need to poke the 
underlying generator.\n    // If it happily moves, we are left in that state, otherwise it is time to\n    // start reading from our cache\n    if (m_current_repeat == 0) {\n      const auto success = m_generator.next();\n      if (!success) {\n        ++m_current_repeat;\n      }\n      return m_current_repeat < m_target_repeats;\n    }\n\n    // In the second case, we need to move indices forward and check that we\n    // haven't run up against the end\n    ++m_repeat_index;\n    if (m_repeat_index == m_returned.size()) {\n      m_repeat_index = 0;\n      ++m_current_repeat;\n    }\n    return m_current_repeat < m_target_repeats;\n  }\n};\n\ntemplate <typename T>\nGeneratorWrapper<T> repeat(size_t repeats, GeneratorWrapper<T>&& generator) {\n  return GeneratorWrapper<T>(\n      pf::make_unique<RepeatGenerator<T>>(repeats, std::move(generator)));\n}\n\ntemplate <typename T, typename U, typename Func>\nclass MapGenerator : public IGenerator<T> {\n  // TBD: provide static assert for mapping function, for friendly error message\n  GeneratorWrapper<U> m_generator;\n  Func m_function;\n  // To avoid returning dangling reference, we have to save the values\n  T m_cache;\n\npublic:\n  template <typename F2 = Func>\n  MapGenerator(F2&& function, GeneratorWrapper<U>&& generator)\n      : m_generator(std::move(generator)),\n        m_function(std::forward<F2>(function)),\n        m_cache(m_function(m_generator.get())) {}\n\n  T const& get() const override { return m_cache; }\n  bool next() override {\n    const auto success = m_generator.next();\n    if (success) {\n      m_cache = m_function(m_generator.get());\n    }\n    return success;\n  }\n};\n\ntemplate <typename Func, typename U, typename T = FunctionReturnType<Func, U>>\nGeneratorWrapper<T> map(Func&& function, GeneratorWrapper<U>&& generator) {\n  return GeneratorWrapper<T>(pf::make_unique<MapGenerator<T, U, Func>>(\n      std::forward<Func>(function), std::move(generator)));\n}\n\ntemplate <typename T, typename 
U, typename Func>\nGeneratorWrapper<T> map(Func&& function, GeneratorWrapper<U>&& generator) {\n  return GeneratorWrapper<T>(pf::make_unique<MapGenerator<T, U, Func>>(\n      std::forward<Func>(function), std::move(generator)));\n}\n\ntemplate <typename T>\nclass ChunkGenerator final : public IGenerator<std::vector<T>> {\n  std::vector<T> m_chunk;\n  size_t m_chunk_size;\n  GeneratorWrapper<T> m_generator;\n  bool m_used_up = false;\n\npublic:\n  ChunkGenerator(size_t size, GeneratorWrapper<T> generator)\n      : m_chunk_size(size), m_generator(std::move(generator)) {\n    m_chunk.reserve(m_chunk_size);\n    if (m_chunk_size != 0) {\n      m_chunk.push_back(m_generator.get());\n      for (size_t i = 1; i < m_chunk_size; ++i) {\n        if (!m_generator.next()) {\n          Catch::throw_exception(GeneratorException(\n              \"Not enough values to initialize the first chunk\"));\n        }\n        m_chunk.push_back(m_generator.get());\n      }\n    }\n  }\n  std::vector<T> const& get() const override { return m_chunk; }\n  bool next() override {\n    m_chunk.clear();\n    for (size_t idx = 0; idx < m_chunk_size; ++idx) {\n      if (!m_generator.next()) {\n        return false;\n      }\n      m_chunk.push_back(m_generator.get());\n    }\n    return true;\n  }\n};\n\ntemplate <typename T>\nGeneratorWrapper<std::vector<T>> chunk(size_t size,\n                                       GeneratorWrapper<T>&& generator) {\n  return GeneratorWrapper<std::vector<T>>(\n      pf::make_unique<ChunkGenerator<T>>(size, std::move(generator)));\n}\n\n} // namespace Generators\n} // namespace Catch\n\n// end catch_generators_generic.hpp\n// start catch_generators_specific.hpp\n\n// start catch_context.h\n\n#include <memory>\n\nnamespace Catch {\n\nstruct IResultCapture;\nstruct IRunner;\nstruct IConfig;\nstruct IMutableContext;\n\nusing IConfigPtr = std::shared_ptr<IConfig const>;\n\nstruct IContext {\n  virtual ~IContext();\n\n  virtual IResultCapture* getResultCapture()  = 
0;\n  virtual IRunner* getRunner()                = 0;\n  virtual IConfigPtr const& getConfig() const = 0;\n};\n\nstruct IMutableContext : IContext {\n  virtual ~IMutableContext();\n  virtual void setResultCapture(IResultCapture* resultCapture) = 0;\n  virtual void setRunner(IRunner* runner)                      = 0;\n  virtual void setConfig(IConfigPtr const& config)             = 0;\n\nprivate:\n  static IMutableContext* currentContext;\n  friend IMutableContext& getCurrentMutableContext();\n  friend void cleanUpContext();\n  static void createContext();\n};\n\ninline IMutableContext& getCurrentMutableContext() {\n  if (!IMutableContext::currentContext)\n    IMutableContext::createContext();\n  // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)\n  return *IMutableContext::currentContext;\n}\n\ninline IContext& getCurrentContext() { return getCurrentMutableContext(); }\n\nvoid cleanUpContext();\n\nclass SimplePcg32;\nSimplePcg32& rng();\n} // namespace Catch\n\n// end catch_context.h\n// start catch_interfaces_config.h\n\n// start catch_option.hpp\n\nnamespace Catch {\n\n// An optional type\ntemplate <typename T>\nclass Option {\npublic:\n  Option() : nullableValue(nullptr) {}\n  Option(T const& _value) : nullableValue(new (storage) T(_value)) {}\n  Option(Option const& _other)\n      : nullableValue(_other ? 
new (storage) T(*_other) : nullptr) {}\n\n  ~Option() { reset(); }\n\n  Option& operator=(Option const& _other) {\n    if (&_other != this) {\n      reset();\n      if (_other)\n        nullableValue = new (storage) T(*_other);\n    }\n    return *this;\n  }\n  Option& operator=(T const& _value) {\n    reset();\n    nullableValue = new (storage) T(_value);\n    return *this;\n  }\n\n  void reset() {\n    if (nullableValue)\n      nullableValue->~T();\n    nullableValue = nullptr;\n  }\n\n  T& operator*() { return *nullableValue; }\n  T const& operator*() const { return *nullableValue; }\n  T* operator->() { return nullableValue; }\n  const T* operator->() const { return nullableValue; }\n\n  T valueOr(T const& defaultValue) const {\n    return nullableValue ? *nullableValue : defaultValue;\n  }\n\n  bool some() const { return nullableValue != nullptr; }\n  bool none() const { return nullableValue == nullptr; }\n\n  bool operator!() const { return nullableValue == nullptr; }\n  explicit operator bool() const { return some(); }\n\nprivate:\n  T* nullableValue;\n  alignas(alignof(T)) char storage[sizeof(T)];\n};\n\n} // end namespace Catch\n\n// end catch_option.hpp\n#include <chrono>\n#include <iosfwd>\n#include <string>\n#include <vector>\n#include <memory>\n\nnamespace Catch {\n\nenum class Verbosity { Quiet = 0, Normal, High };\n\nstruct WarnAbout {\n  enum What { Nothing = 0x00, NoAssertions = 0x01, NoTests = 0x02 };\n};\n\nstruct ShowDurations {\n  enum OrNot { DefaultForReporter, Always, Never };\n};\nstruct RunTests {\n  enum InWhatOrder {\n    InDeclarationOrder,\n    InLexicographicalOrder,\n    InRandomOrder\n  };\n};\nstruct UseColour {\n  enum YesOrNo { Auto, Yes, No };\n};\nstruct WaitForKeypress {\n  enum When {\n    Never,\n    BeforeStart        = 1,\n    BeforeExit         = 2,\n    BeforeStartAndExit = BeforeStart | BeforeExit\n  };\n};\n\nclass TestSpec;\n\nstruct IConfig : NonCopyable {\n\n  virtual ~IConfig();\n\n  virtual bool allowThrows() 
const                                 = 0;\n  virtual std::ostream& stream() const                             = 0;\n  virtual std::string name() const                                 = 0;\n  virtual bool includeSuccessfulResults() const                    = 0;\n  virtual bool shouldDebugBreak() const                            = 0;\n  virtual bool warnAboutMissingAssertions() const                  = 0;\n  virtual bool warnAboutNoTests() const                            = 0;\n  virtual int abortAfter() const                                   = 0;\n  virtual bool showInvisibles() const                              = 0;\n  virtual ShowDurations::OrNot showDurations() const               = 0;\n  virtual double minDuration() const                               = 0;\n  virtual TestSpec const& testSpec() const                         = 0;\n  virtual bool hasTestFilters() const                              = 0;\n  virtual std::vector<std::string> const& getTestsOrTags() const   = 0;\n  virtual RunTests::InWhatOrder runOrder() const                   = 0;\n  virtual unsigned int rngSeed() const                             = 0;\n  virtual UseColour::YesOrNo useColour() const                     = 0;\n  virtual std::vector<std::string> const& getSectionsToRun() const = 0;\n  virtual Verbosity verbosity() const                              = 0;\n\n  virtual bool benchmarkNoAnalysis() const                      = 0;\n  virtual int benchmarkSamples() const                          = 0;\n  virtual double benchmarkConfidenceInterval() const            = 0;\n  virtual unsigned int benchmarkResamples() const               = 0;\n  virtual std::chrono::milliseconds benchmarkWarmupTime() const = 0;\n};\n\nusing IConfigPtr = std::shared_ptr<IConfig const>;\n} // namespace Catch\n\n// end catch_interfaces_config.h\n// start catch_random_number_generator.h\n\n#include <cstdint>\n\nnamespace Catch {\n\n// This is a simple implementation of C++11 Uniform Random Number\n// Generator. 
It does not provide all operators, because Catch2\n// does not use it, but it should behave as expected inside stdlib's\n// distributions.\n// The implementation is based on the PCG family (http://pcg-random.org)\nclass SimplePcg32 {\n  using state_type = std::uint64_t;\n\npublic:\n  using result_type = std::uint32_t;\n  static constexpr result_type(min)() { return 0; }\n  static constexpr result_type(max)() { return static_cast<result_type>(-1); }\n\n  // Provide some default initial state for the default constructor\n  SimplePcg32() : SimplePcg32(0xed743cc4U) {}\n\n  explicit SimplePcg32(result_type seed_);\n\n  void seed(result_type seed_);\n  void discard(uint64_t skip);\n\n  result_type operator()();\n\nprivate:\n  friend bool operator==(SimplePcg32 const& lhs, SimplePcg32 const& rhs);\n  friend bool operator!=(SimplePcg32 const& lhs, SimplePcg32 const& rhs);\n\n  // In theory we also need operator<< and operator>>\n  // In practice we do not use them, so we will skip them for now\n\n  std::uint64_t m_state;\n  // This part of the state determines which \"stream\" of the numbers\n  // is chosen -- we take it as a constant for Catch2, so we only\n  // need to deal with seeding the main state.\n  // Picked by reading 8 bytes from `/dev/random` :-)\n  static const std::uint64_t s_inc = (0x13ed0cc53f939476ULL << 1ULL) | 1ULL;\n};\n\n} // end namespace Catch\n\n// end catch_random_number_generator.h\n#include <random>\n\nnamespace Catch {\nnamespace Generators {\n\ntemplate <typename Float>\nclass RandomFloatingGenerator final : public IGenerator<Float> {\n  Catch::SimplePcg32& m_rng;\n  std::uniform_real_distribution<Float> m_dist;\n  Float m_current_number;\n\npublic:\n  RandomFloatingGenerator(Float a, Float b) : m_rng(rng()), m_dist(a, b) {\n    static_cast<void>(next());\n  }\n\n  Float const& get() const override { return m_current_number; }\n  bool next() override {\n    m_current_number = m_dist(m_rng);\n    return true;\n  }\n};\n\ntemplate <typename 
Integer>\nclass RandomIntegerGenerator final : public IGenerator<Integer> {\n  Catch::SimplePcg32& m_rng;\n  std::uniform_int_distribution<Integer> m_dist;\n  Integer m_current_number;\n\npublic:\n  RandomIntegerGenerator(Integer a, Integer b) : m_rng(rng()), m_dist(a, b) {\n    static_cast<void>(next());\n  }\n\n  Integer const& get() const override { return m_current_number; }\n  bool next() override {\n    m_current_number = m_dist(m_rng);\n    return true;\n  }\n};\n\n// TODO: Ideally this would be also constrained against the various char types,\n//       but I don't expect users to run into that in practice.\ntemplate <typename T>\ntypename std::enable_if<std::is_integral<T>::value &&\n                            !std::is_same<T, bool>::value,\n                        GeneratorWrapper<T>>::type\nrandom(T a, T b) {\n  return GeneratorWrapper<T>(pf::make_unique<RandomIntegerGenerator<T>>(a, b));\n}\n\ntemplate <typename T>\ntypename std::enable_if<std::is_floating_point<T>::value,\n                        GeneratorWrapper<T>>::type\nrandom(T a, T b) {\n  return GeneratorWrapper<T>(pf::make_unique<RandomFloatingGenerator<T>>(a, b));\n}\n\ntemplate <typename T>\nclass RangeGenerator final : public IGenerator<T> {\n  T m_current;\n  T m_end;\n  T m_step;\n  bool m_positive;\n\npublic:\n  RangeGenerator(T const& start, T const& end, T const& step)\n      : m_current(start), m_end(end), m_step(step), m_positive(m_step > T(0)) {\n    assert(m_current != m_end && \"Range start and end cannot be equal\");\n    assert(m_step != T(0) && \"Step size cannot be zero\");\n    assert(((m_positive && m_current <= m_end) ||\n            (!m_positive && m_current >= m_end)) &&\n           \"Step moves away from end\");\n  }\n\n  RangeGenerator(T const& start, T const& end)\n      : RangeGenerator(start, end, (start < end) ? T(1) : T(-1)) {}\n\n  T const& get() const override { return m_current; }\n\n  bool next() override {\n    m_current += m_step;\n    return (m_positive) ? 
(m_current < m_end) : (m_current > m_end);\n  }\n};\n\ntemplate <typename T>\nGeneratorWrapper<T> range(T const& start, T const& end, T const& step) {\n  static_assert(std::is_arithmetic<T>::value && !std::is_same<T, bool>::value,\n                \"Type must be numeric\");\n  return GeneratorWrapper<T>(\n      pf::make_unique<RangeGenerator<T>>(start, end, step));\n}\n\ntemplate <typename T>\nGeneratorWrapper<T> range(T const& start, T const& end) {\n  static_assert(std::is_integral<T>::value && !std::is_same<T, bool>::value,\n                \"Type must be an integer\");\n  return GeneratorWrapper<T>(pf::make_unique<RangeGenerator<T>>(start, end));\n}\n\ntemplate <typename T>\nclass IteratorGenerator final : public IGenerator<T> {\n  static_assert(!std::is_same<T, bool>::value,\n                \"IteratorGenerator currently does not support bools\"\n                \"because of std::vector<bool> specialization\");\n\n  std::vector<T> m_elems;\n  size_t m_current = 0;\n\npublic:\n  template <typename InputIterator, typename InputSentinel>\n  IteratorGenerator(InputIterator first, InputSentinel last)\n      : m_elems(first, last) {\n    if (m_elems.empty()) {\n      Catch::throw_exception(\n          GeneratorException(\"IteratorGenerator received no valid values\"));\n    }\n  }\n\n  T const& get() const override { return m_elems[m_current]; }\n\n  bool next() override {\n    ++m_current;\n    return m_current != m_elems.size();\n  }\n};\n\ntemplate <typename InputIterator, typename InputSentinel,\n          typename ResultType =\n              typename std::iterator_traits<InputIterator>::value_type>\nGeneratorWrapper<ResultType> from_range(InputIterator from, InputSentinel to) {\n  return GeneratorWrapper<ResultType>(\n      pf::make_unique<IteratorGenerator<ResultType>>(from, to));\n}\n\ntemplate <typename Container,\n          typename ResultType = typename Container::value_type>\nGeneratorWrapper<ResultType> from_range(Container const& cnt) {\n  return 
GeneratorWrapper<ResultType>(\n      pf::make_unique<IteratorGenerator<ResultType>>(cnt.begin(), cnt.end()));\n}\n\n} // namespace Generators\n} // namespace Catch\n\n// end catch_generators_specific.hpp\n\n// These files are included here so the single_include script doesn't put them\n// in the conditionally compiled sections\n// start catch_test_case_info.h\n\n#include <string>\n#include <vector>\n#include <memory>\n\n#ifdef __clang__\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wpadded\"\n#endif\n\nnamespace Catch {\n\nstruct ITestInvoker;\n\nstruct TestCaseInfo {\n  enum SpecialProperties {\n    None        = 0,\n    IsHidden    = 1 << 1,\n    ShouldFail  = 1 << 2,\n    MayFail     = 1 << 3,\n    Throws      = 1 << 4,\n    NonPortable = 1 << 5,\n    Benchmark   = 1 << 6\n  };\n\n  TestCaseInfo(std::string const& _name, std::string const& _className,\n               std::string const& _description,\n               std::vector<std::string> const& _tags,\n               SourceLineInfo const& _lineInfo);\n\n  friend void setTags(TestCaseInfo& testCaseInfo,\n                      std::vector<std::string> tags);\n\n  bool isHidden() const;\n  bool throws() const;\n  bool okToFail() const;\n  bool expectedToFail() const;\n\n  std::string tagsAsString() const;\n\n  std::string name;\n  std::string className;\n  std::string description;\n  std::vector<std::string> tags;\n  std::vector<std::string> lcaseTags;\n  SourceLineInfo lineInfo;\n  SpecialProperties properties;\n};\n\nclass TestCase : public TestCaseInfo {\npublic:\n  TestCase(ITestInvoker* testCase, TestCaseInfo&& info);\n\n  TestCase withName(std::string const& _newName) const;\n\n  void invoke() const;\n\n  TestCaseInfo const& getTestCaseInfo() const;\n\n  bool operator==(TestCase const& other) const;\n  bool operator<(TestCase const& other) const;\n\nprivate:\n  std::shared_ptr<ITestInvoker> test;\n};\n\nTestCase makeTestCase(ITestInvoker* testCase, std::string const& className,\n       
               NameAndTags const& nameAndTags,\n                      SourceLineInfo const& lineInfo);\n} // namespace Catch\n\n#ifdef __clang__\n#pragma clang diagnostic pop\n#endif\n\n// end catch_test_case_info.h\n// start catch_interfaces_runner.h\n\nnamespace Catch {\n\nstruct IRunner {\n  virtual ~IRunner();\n  virtual bool aborting() const = 0;\n};\n} // namespace Catch\n\n// end catch_interfaces_runner.h\n\n#ifdef __OBJC__\n// start catch_objc.hpp\n\n#import <objc/runtime.h>\n\n#include <string>\n\n// NB. Any general catch headers included here must be included\n// in catch.hpp first to make sure they are included by the single\n// header for non obj-usage\n\n///////////////////////////////////////////////////////////////////////////////\n// This protocol is really only here for (self) documenting purposes, since\n// all its methods are optional.\n@protocol OcFixture\n\n@optional\n\n- (void)setUp;\n- (void)tearDown;\n\n@end\n\nnamespace Catch {\n\nclass OcMethod : public ITestInvoker {\n\npublic:\n  OcMethod(Class cls, SEL sel) : m_cls(cls), m_sel(sel) {}\n\n  virtual void invoke() const {\n    id obj = [[m_cls alloc] init];\n\n    performOptionalSelector(obj, @selector(setUp));\n    performOptionalSelector(obj, m_sel);\n    performOptionalSelector(obj, @selector(tearDown));\n\n    arcSafeRelease(obj);\n  }\n\nprivate:\n  virtual ~OcMethod() {}\n\n  Class m_cls;\n  SEL m_sel;\n};\n\nnamespace Detail {\n\ninline std::string getAnnotation(Class cls, std::string const& annotationName,\n                                 std::string const& testCaseName) {\n  NSString* selStr =\n      [[NSString alloc] initWithFormat:@\"Catch_%s_%s\", annotationName.c_str(),\n                                       testCaseName.c_str()];\n  SEL sel = NSSelectorFromString(selStr);\n  arcSafeRelease(selStr);\n  id value = performOptionalSelector(cls, sel);\n  if (value)\n    return [(NSString*)value UTF8String];\n  return \"\";\n}\n} // namespace Detail\n\ninline std::size_t 
registerTestMethods() {\n  std::size_t noTestMethods = 0;\n  int noClasses             = objc_getClassList(nullptr, 0);\n\n  Class* classes =\n      (CATCH_UNSAFE_UNRETAINED Class*)malloc(sizeof(Class) * noClasses);\n  objc_getClassList(classes, noClasses);\n\n  for (int c = 0; c < noClasses; c++) {\n    Class cls = classes[c];\n    {\n      u_int count;\n      Method* methods = class_copyMethodList(cls, &count);\n      for (u_int m = 0; m < count; m++) {\n        SEL selector           = method_getName(methods[m]);\n        std::string methodName = sel_getName(selector);\n        if (startsWith(methodName, \"Catch_TestCase_\")) {\n          std::string testCaseName = methodName.substr(15);\n          std::string name = Detail::getAnnotation(cls, \"Name\", testCaseName);\n          std::string desc =\n              Detail::getAnnotation(cls, \"Description\", testCaseName);\n          const char* className = class_getName(cls);\n\n          getMutableRegistryHub().registerTest(makeTestCase(\n              new OcMethod(cls, selector), className,\n              NameAndTags(name.c_str(), desc.c_str()), SourceLineInfo(\"\", 0)));\n          noTestMethods++;\n        }\n      }\n      free(methods);\n    }\n  }\n  return noTestMethods;\n}\n\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n\nnamespace Matchers {\nnamespace Impl {\nnamespace NSStringMatchers {\n\nstruct StringHolder : MatcherBase<NSString*> {\n  StringHolder(NSString* substr) : m_substr([substr copy]) {}\n  StringHolder(StringHolder const& other) : m_substr([other.m_substr copy]) {}\n  StringHolder() { arcSafeRelease(m_substr); }\n\n  bool match(NSString* str) const override { return false; }\n\n  NSString* CATCH_ARC_STRONG m_substr;\n};\n\nstruct Equals : StringHolder {\n  Equals(NSString* substr) : StringHolder(substr) {}\n\n  bool match(NSString* str) const override {\n    return (str != nil || m_substr == nil) && [str isEqualToString:m_substr];\n  }\n\n  std::string describe() const override {\n    return 
\"equals string: \" + Catch::Detail::stringify(m_substr);\n  }\n};\n\nstruct Contains : StringHolder {\n  Contains(NSString* substr) : StringHolder(substr) {}\n\n  bool match(NSString* str) const override {\n    return (str != nil || m_substr == nil) &&\n           [str rangeOfString:m_substr].location != NSNotFound;\n  }\n\n  std::string describe() const override {\n    return \"contains string: \" + Catch::Detail::stringify(m_substr);\n  }\n};\n\nstruct StartsWith : StringHolder {\n  StartsWith(NSString* substr) : StringHolder(substr) {}\n\n  bool match(NSString* str) const override {\n    return (str != nil || m_substr == nil) &&\n           [str rangeOfString:m_substr].location == 0;\n  }\n\n  std::string describe() const override {\n    return \"starts with: \" + Catch::Detail::stringify(m_substr);\n  }\n};\nstruct EndsWith : StringHolder {\n  EndsWith(NSString* substr) : StringHolder(substr) {}\n\n  bool match(NSString* str) const override {\n    return (str != nil || m_substr == nil) &&\n           [str rangeOfString:m_substr].location ==\n               [str length] - [m_substr length];\n  }\n\n  std::string describe() const override {\n    return \"ends with: \" + Catch::Detail::stringify(m_substr);\n  }\n};\n\n} // namespace NSStringMatchers\n} // namespace Impl\n\ninline Impl::NSStringMatchers::Equals Equals(NSString* substr) {\n  return Impl::NSStringMatchers::Equals(substr);\n}\n\ninline Impl::NSStringMatchers::Contains Contains(NSString* substr) {\n  return Impl::NSStringMatchers::Contains(substr);\n}\n\ninline Impl::NSStringMatchers::StartsWith StartsWith(NSString* substr) {\n  return Impl::NSStringMatchers::StartsWith(substr);\n}\n\ninline Impl::NSStringMatchers::EndsWith EndsWith(NSString* substr) {\n  return Impl::NSStringMatchers::EndsWith(substr);\n}\n\n} // namespace Matchers\n\nusing namespace Matchers;\n\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n\n} // namespace 
Catch\n\n///////////////////////////////////////////////////////////////////////////////\n#define OC_MAKE_UNIQUE_NAME(root, uniqueSuffix) root##uniqueSuffix\n#define OC_TEST_CASE2(name, desc, uniqueSuffix)                                \\\n  +(NSString*)OC_MAKE_UNIQUE_NAME(Catch_Name_test_, uniqueSuffix) {            \\\n    return @name;                                                              \\\n  }                                                                            \\\n  +(NSString*)OC_MAKE_UNIQUE_NAME(Catch_Description_test_, uniqueSuffix) {     \\\n    return @desc;                                                              \\\n  }                                                                            \\\n  -(void)OC_MAKE_UNIQUE_NAME(Catch_TestCase_test_, uniqueSuffix)\n\n#define OC_TEST_CASE(name, desc) OC_TEST_CASE2(name, desc, __LINE__)\n\n// end catch_objc.hpp\n#endif\n\n// Benchmarking needs the externally-facing parts of reporters to work\n#if defined(CATCH_CONFIG_EXTERNAL_INTERFACES) ||                               \\\n    defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n// start catch_external_interfaces.h\n\n// start catch_reporter_bases.hpp\n\n// start catch_interfaces_reporter.h\n\n// start catch_config.hpp\n\n// start catch_test_spec_parser.h\n\n#ifdef __clang__\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wpadded\"\n#endif\n\n// start catch_test_spec.h\n\n#ifdef __clang__\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wpadded\"\n#endif\n\n// start catch_wildcard_pattern.h\n\nnamespace Catch {\nclass WildcardPattern {\n  enum WildcardPosition {\n    NoWildcard         = 0,\n    WildcardAtStart    = 1,\n    WildcardAtEnd      = 2,\n    WildcardAtBothEnds = WildcardAtStart | WildcardAtEnd\n  };\n\npublic:\n  WildcardPattern(std::string const& pattern,\n                  CaseSensitive::Choice caseSensitivity);\n  virtual ~WildcardPattern() = default;\n  virtual bool matches(std::string const& 
str) const;\n\nprivate:\n  std::string normaliseString(std::string const& str) const;\n  CaseSensitive::Choice m_caseSensitivity;\n  WildcardPosition m_wildcard = NoWildcard;\n  std::string m_pattern;\n};\n} // namespace Catch\n\n// end catch_wildcard_pattern.h\n#include <string>\n#include <vector>\n#include <memory>\n\nnamespace Catch {\n\nstruct IConfig;\n\nclass TestSpec {\n  class Pattern {\n  public:\n    explicit Pattern(std::string const& name);\n    virtual ~Pattern();\n    virtual bool matches(TestCaseInfo const& testCase) const = 0;\n    std::string const& name() const;\n\n  private:\n    std::string const m_name;\n  };\n  using PatternPtr = std::shared_ptr<Pattern>;\n\n  class NamePattern : public Pattern {\n  public:\n    explicit NamePattern(std::string const& name,\n                         std::string const& filterString);\n    bool matches(TestCaseInfo const& testCase) const override;\n\n  private:\n    WildcardPattern m_wildcardPattern;\n  };\n\n  class TagPattern : public Pattern {\n  public:\n    explicit TagPattern(std::string const& tag,\n                        std::string const& filterString);\n    bool matches(TestCaseInfo const& testCase) const override;\n\n  private:\n    std::string m_tag;\n  };\n\n  class ExcludedPattern : public Pattern {\n  public:\n    explicit ExcludedPattern(PatternPtr const& underlyingPattern);\n    bool matches(TestCaseInfo const& testCase) const override;\n\n  private:\n    PatternPtr m_underlyingPattern;\n  };\n\n  struct Filter {\n    std::vector<PatternPtr> m_patterns;\n\n    bool matches(TestCaseInfo const& testCase) const;\n    std::string name() const;\n  };\n\npublic:\n  struct FilterMatch {\n    std::string name;\n    std::vector<TestCase const*> tests;\n  };\n  using Matches       = std::vector<FilterMatch>;\n  using vectorStrings = std::vector<std::string>;\n\n  bool hasFilters() const;\n  bool matches(TestCaseInfo const& testCase) const;\n  Matches matchesByFilter(std::vector<TestCase> const& 
testCases,\n                          IConfig const& config) const;\n  const vectorStrings& getInvalidArgs() const;\n\nprivate:\n  std::vector<Filter> m_filters;\n  std::vector<std::string> m_invalidArgs;\n  friend class TestSpecParser;\n};\n} // namespace Catch\n\n#ifdef __clang__\n#pragma clang diagnostic pop\n#endif\n\n// end catch_test_spec.h\n// start catch_interfaces_tag_alias_registry.h\n\n#include <string>\n\nnamespace Catch {\n\nstruct TagAlias;\n\nstruct ITagAliasRegistry {\n  virtual ~ITagAliasRegistry();\n  // Nullptr if not present\n  virtual TagAlias const* find(std::string const& alias) const = 0;\n  virtual std::string\n  expandAliases(std::string const& unexpandedTestSpec) const = 0;\n\n  static ITagAliasRegistry const& get();\n};\n\n} // end namespace Catch\n\n// end catch_interfaces_tag_alias_registry.h\nnamespace Catch {\n\nclass TestSpecParser {\n  enum Mode { None, Name, QuotedName, Tag, EscapedName };\n  Mode m_mode                  = None;\n  Mode lastMode                = None;\n  bool m_exclusion             = false;\n  std::size_t m_pos            = 0;\n  std::size_t m_realPatternPos = 0;\n  std::string m_arg;\n  std::string m_substring;\n  std::string m_patternName;\n  std::vector<std::size_t> m_escapeChars;\n  TestSpec::Filter m_currentFilter;\n  TestSpec m_testSpec;\n  ITagAliasRegistry const* m_tagAliases = nullptr;\n\npublic:\n  TestSpecParser(ITagAliasRegistry const& tagAliases);\n\n  TestSpecParser& parse(std::string const& arg);\n  TestSpec testSpec();\n\nprivate:\n  bool visitChar(char c);\n  void startNewMode(Mode mode);\n  bool processNoneChar(char c);\n  void processNameChar(char c);\n  bool processOtherChar(char c);\n  void endMode();\n  void escape();\n  bool isControlChar(char c) const;\n  void saveLastMode();\n  void revertBackToLastMode();\n  void addFilter();\n  bool separate();\n\n  // Handles common preprocessing of the pattern for name/tag patterns\n  std::string preprocessPattern();\n  // Adds the current pattern as 
a test name\n  void addNamePattern();\n  // Adds the current pattern as a tag\n  void addTagPattern();\n\n  inline void addCharToPattern(char c) {\n    m_substring += c;\n    m_patternName += c;\n    m_realPatternPos++;\n  }\n};\nTestSpec parseTestSpec(std::string const& arg);\n\n} // namespace Catch\n\n#ifdef __clang__\n#pragma clang diagnostic pop\n#endif\n\n// end catch_test_spec_parser.h\n// Libstdc++ doesn't like incomplete classes for unique_ptr\n\n#include <memory>\n#include <vector>\n#include <string>\n\n#ifndef CATCH_CONFIG_CONSOLE_WIDTH\n#define CATCH_CONFIG_CONSOLE_WIDTH 80\n#endif\n\nnamespace Catch {\n\nstruct IStream;\n\nstruct ConfigData {\n  bool listTests         = false;\n  bool listTags          = false;\n  bool listReporters     = false;\n  bool listTestNamesOnly = false;\n\n  bool showSuccessfulTests = false;\n  bool shouldDebugBreak    = false;\n  bool noThrow             = false;\n  bool showHelp            = false;\n  bool showInvisibles      = false;\n  bool filenamesAsTags     = false;\n  bool libIdentify         = false;\n\n  int abortAfter       = -1;\n  unsigned int rngSeed = 0;\n\n  bool benchmarkNoAnalysis                           = false;\n  unsigned int benchmarkSamples                      = 100;\n  double benchmarkConfidenceInterval                 = 0.95;\n  unsigned int benchmarkResamples                    = 100000;\n  std::chrono::milliseconds::rep benchmarkWarmupTime = 100;\n\n  Verbosity verbosity                   = Verbosity::Normal;\n  WarnAbout::What warnings              = WarnAbout::Nothing;\n  ShowDurations::OrNot showDurations    = ShowDurations::DefaultForReporter;\n  double minDuration                    = -1;\n  RunTests::InWhatOrder runOrder        = RunTests::InDeclarationOrder;\n  UseColour::YesOrNo useColour          = UseColour::Auto;\n  WaitForKeypress::When waitForKeypress = WaitForKeypress::Never;\n\n  std::string outputFilename;\n  std::string name;\n  std::string processName;\n#ifndef 
CATCH_CONFIG_DEFAULT_REPORTER\n#define CATCH_CONFIG_DEFAULT_REPORTER \"console\"\n#endif\n  std::string reporterName = CATCH_CONFIG_DEFAULT_REPORTER;\n#undef CATCH_CONFIG_DEFAULT_REPORTER\n\n  std::vector<std::string> testsOrTags;\n  std::vector<std::string> sectionsToRun;\n};\n\nclass Config : public IConfig {\npublic:\n  Config() = default;\n  Config(ConfigData const& data);\n  virtual ~Config() = default;\n\n  std::string const& getFilename() const;\n\n  bool listTests() const;\n  bool listTestNamesOnly() const;\n  bool listTags() const;\n  bool listReporters() const;\n\n  std::string getProcessName() const;\n  std::string const& getReporterName() const;\n\n  std::vector<std::string> const& getTestsOrTags() const override;\n  std::vector<std::string> const& getSectionsToRun() const override;\n\n  TestSpec const& testSpec() const override;\n  bool hasTestFilters() const override;\n\n  bool showHelp() const;\n\n  // IConfig interface\n  bool allowThrows() const override;\n  std::ostream& stream() const override;\n  std::string name() const override;\n  bool includeSuccessfulResults() const override;\n  bool warnAboutMissingAssertions() const override;\n  bool warnAboutNoTests() const override;\n  ShowDurations::OrNot showDurations() const override;\n  double minDuration() const override;\n  RunTests::InWhatOrder runOrder() const override;\n  unsigned int rngSeed() const override;\n  UseColour::YesOrNo useColour() const override;\n  bool shouldDebugBreak() const override;\n  int abortAfter() const override;\n  bool showInvisibles() const override;\n  Verbosity verbosity() const override;\n  bool benchmarkNoAnalysis() const override;\n  int benchmarkSamples() const override;\n  double benchmarkConfidenceInterval() const override;\n  unsigned int benchmarkResamples() const override;\n  std::chrono::milliseconds benchmarkWarmupTime() const override;\n\nprivate:\n  IStream const* openStream();\n  ConfigData m_data;\n\n  std::unique_ptr<IStream const> m_stream;\n  
TestSpec m_testSpec;\n  bool m_hasTestFilters = false;\n};\n\n} // end namespace Catch\n\n// end catch_config.hpp\n// start catch_assertionresult.h\n\n#include <string>\n\nnamespace Catch {\n\nstruct AssertionResultData {\n  AssertionResultData() = delete;\n\n  AssertionResultData(ResultWas::OfType _resultType,\n                      LazyExpression const& _lazyExpression);\n\n  std::string message;\n  mutable std::string reconstructedExpression;\n  LazyExpression lazyExpression;\n  ResultWas::OfType resultType;\n\n  std::string reconstructExpression() const;\n};\n\nclass AssertionResult {\npublic:\n  AssertionResult() = delete;\n  AssertionResult(AssertionInfo const& info, AssertionResultData const& data);\n\n  bool isOk() const;\n  bool succeeded() const;\n  ResultWas::OfType getResultType() const;\n  bool hasExpression() const;\n  bool hasMessage() const;\n  std::string getExpression() const;\n  std::string getExpressionInMacro() const;\n  bool hasExpandedExpression() const;\n  std::string getExpandedExpression() const;\n  std::string getMessage() const;\n  SourceLineInfo getSourceInfo() const;\n  StringRef getTestMacroName() const;\n\n  // protected:\n  AssertionInfo m_info;\n  AssertionResultData m_resultData;\n};\n\n} // end namespace Catch\n\n// end catch_assertionresult.h\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n// start catch_estimate.hpp\n\n// Statistics estimates\n\nnamespace Catch {\nnamespace Benchmark {\ntemplate <typename Duration>\nstruct Estimate {\n  Duration point;\n  Duration lower_bound;\n  Duration upper_bound;\n  double confidence_interval;\n\n  template <typename Duration2>\n  operator Estimate<Duration2>() const {\n    return {point, lower_bound, upper_bound, confidence_interval};\n  }\n};\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_estimate.hpp\n// start catch_outlier_classification.hpp\n\n// Outlier information\n\nnamespace Catch {\nnamespace Benchmark {\nstruct OutlierClassification {\n  int samples_seen = 0;\n  
int low_severe   = 0; // more than 3 times IQR below Q1\n  int low_mild     = 0; // 1.5 to 3 times IQR below Q1\n  int high_mild    = 0; // 1.5 to 3 times IQR above Q3\n  int high_severe  = 0; // more than 3 times IQR above Q3\n\n  int total() const { return low_severe + low_mild + high_mild + high_severe; }\n};\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_outlier_classification.hpp\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\n#include <string>\n#include <iosfwd>\n#include <map>\n#include <set>\n#include <memory>\n#include <algorithm>\n\nnamespace Catch {\n\nstruct ReporterConfig {\n  explicit ReporterConfig(IConfigPtr const& _fullConfig);\n\n  ReporterConfig(IConfigPtr const& _fullConfig, std::ostream& _stream);\n\n  std::ostream& stream() const;\n  IConfigPtr fullConfig() const;\n\nprivate:\n  std::ostream* m_stream;\n  IConfigPtr m_fullConfig;\n};\n\nstruct ReporterPreferences {\n  bool shouldRedirectStdOut      = false;\n  bool shouldReportAllAssertions = false;\n};\n\ntemplate <typename T>\nstruct LazyStat : Option<T> {\n  LazyStat& operator=(T const& _value) {\n    Option<T>::operator=(_value);\n    used = false;\n    return *this;\n  }\n  void reset() {\n    Option<T>::reset();\n    used = false;\n  }\n  bool used = false;\n};\n\nstruct TestRunInfo {\n  TestRunInfo(std::string const& _name);\n  std::string name;\n};\nstruct GroupInfo {\n  GroupInfo(std::string const& _name, std::size_t _groupIndex,\n            std::size_t _groupsCount);\n\n  std::string name;\n  std::size_t groupIndex;\n  std::size_t groupsCounts;\n};\n\nstruct AssertionStats {\n  AssertionStats(AssertionResult const& _assertionResult,\n                 std::vector<MessageInfo> const& _infoMessages,\n                 Totals const& _totals);\n\n  AssertionStats(AssertionStats const&)            = default;\n  AssertionStats(AssertionStats&&)                 = default;\n  AssertionStats& operator=(AssertionStats const&) = delete;\n  AssertionStats& 
operator=(AssertionStats&&)      = delete;\n  virtual ~AssertionStats();\n\n  AssertionResult assertionResult;\n  std::vector<MessageInfo> infoMessages;\n  Totals totals;\n};\n\nstruct SectionStats {\n  SectionStats(SectionInfo const& _sectionInfo, Counts const& _assertions,\n               double _durationInSeconds, bool _missingAssertions);\n  SectionStats(SectionStats const&)            = default;\n  SectionStats(SectionStats&&)                 = default;\n  SectionStats& operator=(SectionStats const&) = default;\n  SectionStats& operator=(SectionStats&&)      = default;\n  virtual ~SectionStats();\n\n  SectionInfo sectionInfo;\n  Counts assertions;\n  double durationInSeconds;\n  bool missingAssertions;\n};\n\nstruct TestCaseStats {\n  TestCaseStats(TestCaseInfo const& _testInfo, Totals const& _totals,\n                std::string const& _stdOut, std::string const& _stdErr,\n                bool _aborting);\n\n  TestCaseStats(TestCaseStats const&)            = default;\n  TestCaseStats(TestCaseStats&&)                 = default;\n  TestCaseStats& operator=(TestCaseStats const&) = default;\n  TestCaseStats& operator=(TestCaseStats&&)      = default;\n  virtual ~TestCaseStats();\n\n  TestCaseInfo testInfo;\n  Totals totals;\n  std::string stdOut;\n  std::string stdErr;\n  bool aborting;\n};\n\nstruct TestGroupStats {\n  TestGroupStats(GroupInfo const& _groupInfo, Totals const& _totals,\n                 bool _aborting);\n  TestGroupStats(GroupInfo const& _groupInfo);\n\n  TestGroupStats(TestGroupStats const&)            = default;\n  TestGroupStats(TestGroupStats&&)                 = default;\n  TestGroupStats& operator=(TestGroupStats const&) = default;\n  TestGroupStats& operator=(TestGroupStats&&)      = default;\n  virtual ~TestGroupStats();\n\n  GroupInfo groupInfo;\n  Totals totals;\n  bool aborting;\n};\n\nstruct TestRunStats {\n  TestRunStats(TestRunInfo const& _runInfo, Totals const& _totals,\n               bool _aborting);\n\n  
TestRunStats(TestRunStats const&)            = default;\n  TestRunStats(TestRunStats&&)                 = default;\n  TestRunStats& operator=(TestRunStats const&) = default;\n  TestRunStats& operator=(TestRunStats&&)      = default;\n  virtual ~TestRunStats();\n\n  TestRunInfo runInfo;\n  Totals totals;\n  bool aborting;\n};\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\nstruct BenchmarkInfo {\n  std::string name;\n  double estimatedDuration;\n  int iterations;\n  int samples;\n  unsigned int resamples;\n  double clockResolution;\n  double clockCost;\n};\n\ntemplate <class Duration>\nstruct BenchmarkStats {\n  BenchmarkInfo info;\n\n  std::vector<Duration> samples;\n  Benchmark::Estimate<Duration> mean;\n  Benchmark::Estimate<Duration> standardDeviation;\n  Benchmark::OutlierClassification outliers;\n  double outlierVariance;\n\n  template <typename Duration2>\n  operator BenchmarkStats<Duration2>() const {\n    std::vector<Duration2> samples2;\n    samples2.reserve(samples.size());\n    std::transform(samples.begin(), samples.end(), std::back_inserter(samples2),\n                   [](Duration d) { return Duration2(d); });\n    return {\n        info,     std::move(samples2), mean, standardDeviation,\n        outliers, outlierVariance,\n    };\n  }\n};\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\nstruct IStreamingReporter {\n  virtual ~IStreamingReporter() = default;\n\n  // Implementing class must also provide the following static methods:\n  // static std::string getDescription();\n  // static std::set<Verbosity> getSupportedVerbosities()\n\n  virtual ReporterPreferences getPreferences() const = 0;\n\n  virtual void noMatchingTestCases(std::string const& spec) = 0;\n\n  virtual void reportInvalidArguments(std::string const&) {}\n\n  virtual void testRunStarting(TestRunInfo const& testRunInfo) = 0;\n  virtual void testGroupStarting(GroupInfo const& groupInfo)   = 0;\n\n  virtual void testCaseStarting(TestCaseInfo const& testInfo)  = 0;\n  virtual void 
sectionStarting(SectionInfo const& sectionInfo) = 0;\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n  virtual void benchmarkPreparing(std::string const&) {}\n  virtual void benchmarkStarting(BenchmarkInfo const&) {}\n  virtual void benchmarkEnded(BenchmarkStats<> const&) {}\n  virtual void benchmarkFailed(std::string const&) {}\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\n  virtual void assertionStarting(AssertionInfo const& assertionInfo) = 0;\n\n  // The return value indicates if the messages buffer should be cleared:\n  virtual bool assertionEnded(AssertionStats const& assertionStats) = 0;\n\n  virtual void sectionEnded(SectionStats const& sectionStats)       = 0;\n  virtual void testCaseEnded(TestCaseStats const& testCaseStats)    = 0;\n  virtual void testGroupEnded(TestGroupStats const& testGroupStats) = 0;\n  virtual void testRunEnded(TestRunStats const& testRunStats)       = 0;\n\n  virtual void skipTest(TestCaseInfo const& testInfo) = 0;\n\n  // Default empty implementation provided\n  virtual void fatalErrorEncountered(StringRef name);\n\n  virtual bool isMulti() const;\n};\nusing IStreamingReporterPtr = std::unique_ptr<IStreamingReporter>;\n\nstruct IReporterFactory {\n  virtual ~IReporterFactory();\n  virtual IStreamingReporterPtr create(ReporterConfig const& config) const = 0;\n  virtual std::string getDescription() const                               = 0;\n};\nusing IReporterFactoryPtr = std::shared_ptr<IReporterFactory>;\n\nstruct IReporterRegistry {\n  using FactoryMap = std::map<std::string, IReporterFactoryPtr>;\n  using Listeners  = std::vector<IReporterFactoryPtr>;\n\n  virtual ~IReporterRegistry();\n  virtual IStreamingReporterPtr create(std::string const& name,\n                                       IConfigPtr const& config) const = 0;\n  virtual FactoryMap const& getFactories() const                       = 0;\n  virtual Listeners const& getListeners() const                        = 0;\n};\n\n} // end namespace Catch\n\n// end 
catch_interfaces_reporter.h\n#include <algorithm>\n#include <cstring>\n#include <cfloat>\n#include <cstdio>\n#include <cassert>\n#include <memory>\n#include <ostream>\n\nnamespace Catch {\nvoid prepareExpandedExpression(AssertionResult& result);\n\n// Returns double formatted as %.3f (format expected on output)\nstd::string getFormattedDuration(double duration);\n\n//! Should the reporter show\nbool shouldShowDuration(IConfig const& config, double duration);\n\nstd::string serializeFilters(std::vector<std::string> const& container);\n\ntemplate <typename DerivedT>\nstruct StreamingReporterBase : IStreamingReporter {\n\n  StreamingReporterBase(ReporterConfig const& _config)\n      : m_config(_config.fullConfig()), stream(_config.stream()) {\n    m_reporterPrefs.shouldRedirectStdOut = false;\n    if (!DerivedT::getSupportedVerbosities().count(m_config->verbosity()))\n      CATCH_ERROR(\"Verbosity level not supported by this reporter\");\n  }\n\n  ReporterPreferences getPreferences() const override {\n    return m_reporterPrefs;\n  }\n\n  static std::set<Verbosity> getSupportedVerbosities() {\n    return {Verbosity::Normal};\n  }\n\n  ~StreamingReporterBase() override = default;\n\n  void noMatchingTestCases(std::string const&) override {}\n\n  void reportInvalidArguments(std::string const&) override {}\n\n  void testRunStarting(TestRunInfo const& _testRunInfo) override {\n    currentTestRunInfo = _testRunInfo;\n  }\n\n  void testGroupStarting(GroupInfo const& _groupInfo) override {\n    currentGroupInfo = _groupInfo;\n  }\n\n  void testCaseStarting(TestCaseInfo const& _testInfo) override {\n    currentTestCaseInfo = _testInfo;\n  }\n  void sectionStarting(SectionInfo const& _sectionInfo) override {\n    m_sectionStack.push_back(_sectionInfo);\n  }\n\n  void sectionEnded(SectionStats const& /* _sectionStats */) override {\n    m_sectionStack.pop_back();\n  }\n  void testCaseEnded(TestCaseStats const& /* _testCaseStats */) override {\n    currentTestCaseInfo.reset();\n 
 }\n  void testGroupEnded(TestGroupStats const& /* _testGroupStats */) override {\n    currentGroupInfo.reset();\n  }\n  void testRunEnded(TestRunStats const& /* _testRunStats */) override {\n    currentTestCaseInfo.reset();\n    currentGroupInfo.reset();\n    currentTestRunInfo.reset();\n  }\n\n  void skipTest(TestCaseInfo const&) override {\n    // Don't do anything with this by default.\n    // It can optionally be overridden in the derived class.\n  }\n\n  IConfigPtr m_config;\n  std::ostream& stream;\n\n  LazyStat<TestRunInfo> currentTestRunInfo;\n  LazyStat<GroupInfo> currentGroupInfo;\n  LazyStat<TestCaseInfo> currentTestCaseInfo;\n\n  std::vector<SectionInfo> m_sectionStack;\n  ReporterPreferences m_reporterPrefs;\n};\n\ntemplate <typename DerivedT>\nstruct CumulativeReporterBase : IStreamingReporter {\n  template <typename T, typename ChildNodeT>\n  struct Node {\n    explicit Node(T const& _value) : value(_value) {}\n    virtual ~Node() {}\n\n    using ChildNodes = std::vector<std::shared_ptr<ChildNodeT>>;\n    T value;\n    ChildNodes children;\n  };\n  struct SectionNode {\n    explicit SectionNode(SectionStats const& _stats) : stats(_stats) {}\n    virtual ~SectionNode() = default;\n\n    bool operator==(SectionNode const& other) const {\n      return stats.sectionInfo.lineInfo == other.stats.sectionInfo.lineInfo;\n    }\n    bool operator==(std::shared_ptr<SectionNode> const& other) const {\n      return operator==(*other);\n    }\n\n    SectionStats stats;\n    using ChildSections = std::vector<std::shared_ptr<SectionNode>>;\n    using Assertions    = std::vector<AssertionStats>;\n    ChildSections childSections;\n    Assertions assertions;\n    std::string stdOut;\n    std::string stdErr;\n  };\n\n  struct BySectionInfo {\n    BySectionInfo(SectionInfo const& other) : m_other(other) {}\n    BySectionInfo(BySectionInfo const& other) : m_other(other.m_other) {}\n    bool operator()(std::shared_ptr<SectionNode> const& node) const {\n      return 
((node->stats.sectionInfo.name == m_other.name) &&\n              (node->stats.sectionInfo.lineInfo == m_other.lineInfo));\n    }\n    void operator=(BySectionInfo const&) = delete;\n\n  private:\n    SectionInfo const& m_other;\n  };\n\n  using TestCaseNode  = Node<TestCaseStats, SectionNode>;\n  using TestGroupNode = Node<TestGroupStats, TestCaseNode>;\n  using TestRunNode   = Node<TestRunStats, TestGroupNode>;\n\n  CumulativeReporterBase(ReporterConfig const& _config)\n      : m_config(_config.fullConfig()), stream(_config.stream()) {\n    m_reporterPrefs.shouldRedirectStdOut = false;\n    if (!DerivedT::getSupportedVerbosities().count(m_config->verbosity()))\n      CATCH_ERROR(\"Verbosity level not supported by this reporter\");\n  }\n  ~CumulativeReporterBase() override = default;\n\n  ReporterPreferences getPreferences() const override {\n    return m_reporterPrefs;\n  }\n\n  static std::set<Verbosity> getSupportedVerbosities() {\n    return {Verbosity::Normal};\n  }\n\n  void testRunStarting(TestRunInfo const&) override {}\n  void testGroupStarting(GroupInfo const&) override {}\n\n  void testCaseStarting(TestCaseInfo const&) override {}\n\n  void sectionStarting(SectionInfo const& sectionInfo) override {\n    SectionStats incompleteStats(sectionInfo, Counts(), 0, false);\n    std::shared_ptr<SectionNode> node;\n    if (m_sectionStack.empty()) {\n      if (!m_rootSection)\n        m_rootSection = std::make_shared<SectionNode>(incompleteStats);\n      node = m_rootSection;\n    } else {\n      SectionNode& parentNode = *m_sectionStack.back();\n      auto it                 = std::find_if(parentNode.childSections.begin(),\n                                             parentNode.childSections.end(),\n                                             BySectionInfo(sectionInfo));\n      if (it == parentNode.childSections.end()) {\n        node = std::make_shared<SectionNode>(incompleteStats);\n        parentNode.childSections.push_back(node);\n      } else\n        
node = *it;\n    }\n    m_sectionStack.push_back(node);\n    m_deepestSection = std::move(node);\n  }\n\n  void assertionStarting(AssertionInfo const&) override {}\n\n  bool assertionEnded(AssertionStats const& assertionStats) override {\n    assert(!m_sectionStack.empty());\n    // AssertionResult holds a pointer to a temporary DecomposedExpression,\n    // which getExpandedExpression() calls to build the expression string.\n    // Our section stack copy of the assertionResult will likely outlive the\n    // temporary, so it must be expanded or discarded now to avoid calling\n    // a destroyed object later.\n    prepareExpandedExpression(\n        const_cast<AssertionResult&>(assertionStats.assertionResult));\n    SectionNode& sectionNode = *m_sectionStack.back();\n    sectionNode.assertions.push_back(assertionStats);\n    return true;\n  }\n  void sectionEnded(SectionStats const& sectionStats) override {\n    assert(!m_sectionStack.empty());\n    SectionNode& node = *m_sectionStack.back();\n    node.stats        = sectionStats;\n    m_sectionStack.pop_back();\n  }\n  void testCaseEnded(TestCaseStats const& testCaseStats) override {\n    auto node = std::make_shared<TestCaseNode>(testCaseStats);\n    assert(m_sectionStack.size() == 0);\n    node->children.push_back(m_rootSection);\n    m_testCases.push_back(node);\n    m_rootSection.reset();\n\n    assert(m_deepestSection);\n    m_deepestSection->stdOut = testCaseStats.stdOut;\n    m_deepestSection->stdErr = testCaseStats.stdErr;\n  }\n  void testGroupEnded(TestGroupStats const& testGroupStats) override {\n    auto node = std::make_shared<TestGroupNode>(testGroupStats);\n    node->children.swap(m_testCases);\n    m_testGroups.push_back(node);\n  }\n  void testRunEnded(TestRunStats const& testRunStats) override {\n    auto node = std::make_shared<TestRunNode>(testRunStats);\n    node->children.swap(m_testGroups);\n    m_testRuns.push_back(node);\n    testRunEndedCumulative();\n  }\n  virtual void 
testRunEndedCumulative() = 0;\n\n  void skipTest(TestCaseInfo const&) override {}\n\n  IConfigPtr m_config;\n  std::ostream& stream;\n  std::vector<AssertionStats> m_assertions;\n  std::vector<std::vector<std::shared_ptr<SectionNode>>> m_sections;\n  std::vector<std::shared_ptr<TestCaseNode>> m_testCases;\n  std::vector<std::shared_ptr<TestGroupNode>> m_testGroups;\n\n  std::vector<std::shared_ptr<TestRunNode>> m_testRuns;\n\n  std::shared_ptr<SectionNode> m_rootSection;\n  std::shared_ptr<SectionNode> m_deepestSection;\n  std::vector<std::shared_ptr<SectionNode>> m_sectionStack;\n  ReporterPreferences m_reporterPrefs;\n};\n\ntemplate <char C>\nchar const* getLineOfChars() {\n  static char line[CATCH_CONFIG_CONSOLE_WIDTH] = {0};\n  if (!*line) {\n    std::memset(line, C, CATCH_CONFIG_CONSOLE_WIDTH - 1);\n    line[CATCH_CONFIG_CONSOLE_WIDTH - 1] = 0;\n  }\n  return line;\n}\n\nstruct TestEventListenerBase : StreamingReporterBase<TestEventListenerBase> {\n  TestEventListenerBase(ReporterConfig const& _config);\n\n  static std::set<Verbosity> getSupportedVerbosities();\n\n  void assertionStarting(AssertionInfo const&) override;\n  bool assertionEnded(AssertionStats const&) override;\n};\n\n} // end namespace Catch\n\n// end catch_reporter_bases.hpp\n// start catch_console_colour.h\n\nnamespace Catch {\n\nstruct Colour {\n  enum Code {\n    None = 0,\n\n    White,\n    Red,\n    Green,\n    Blue,\n    Cyan,\n    Yellow,\n    Grey,\n\n    Bright = 0x10,\n\n    BrightRed    = Bright | Red,\n    BrightGreen  = Bright | Green,\n    LightGrey    = Bright | Grey,\n    BrightWhite  = Bright | White,\n    BrightYellow = Bright | Yellow,\n\n    // By intention\n    FileName              = LightGrey,\n    Warning               = BrightYellow,\n    ResultError           = BrightRed,\n    ResultSuccess         = BrightGreen,\n    ResultExpectedFailure = Warning,\n\n    Error   = BrightRed,\n    Success = Green,\n\n    OriginalExpression      = Cyan,\n    ReconstructedExpression = 
BrightYellow,\n\n    SecondaryText = LightGrey,\n    Headers       = White\n  };\n\n  // Use constructed object for RAII guard\n  Colour(Code _colourCode);\n  Colour(Colour&& other) noexcept;\n  Colour& operator=(Colour&& other) noexcept;\n  ~Colour();\n\n  // Use static method for one-shot changes\n  static void use(Code _colourCode);\n\nprivate:\n  bool m_moved = false;\n};\n\nstd::ostream& operator<<(std::ostream& os, Colour const&);\n\n} // end namespace Catch\n\n// end catch_console_colour.h\n// start catch_reporter_registrars.hpp\n\nnamespace Catch {\n\ntemplate <typename T>\nclass ReporterRegistrar {\n\n  class ReporterFactory : public IReporterFactory {\n\n    IStreamingReporterPtr create(ReporterConfig const& config) const override {\n      return std::unique_ptr<T>(new T(config));\n    }\n\n    std::string getDescription() const override { return T::getDescription(); }\n  };\n\npublic:\n  explicit ReporterRegistrar(std::string const& name) {\n    getMutableRegistryHub().registerReporter(\n        name, std::make_shared<ReporterFactory>());\n  }\n};\n\ntemplate <typename T>\nclass ListenerRegistrar {\n\n  class ListenerFactory : public IReporterFactory {\n\n    IStreamingReporterPtr create(ReporterConfig const& config) const override {\n      return std::unique_ptr<T>(new T(config));\n    }\n    std::string getDescription() const override { return std::string(); }\n  };\n\npublic:\n  ListenerRegistrar() {\n    getMutableRegistryHub().registerListener(\n        std::make_shared<ListenerFactory>());\n  }\n};\n} // namespace Catch\n\n#if !defined(CATCH_CONFIG_DISABLE)\n\n#define CATCH_REGISTER_REPORTER(name, reporterType)                            \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  namespace {                                                                  \\\n  Catch::ReporterRegistrar<reporterType>                        
               \\\n      catch_internal_RegistrarFor##reporterType(name);                         \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION\n\n#define CATCH_REGISTER_LISTENER(listenerType)                                  \\\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION                                    \\\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS                                     \\\n  namespace {                                                                  \\\n  Catch::ListenerRegistrar<listenerType>                                       \\\n      catch_internal_RegistrarFor##listenerType;                               \\\n  }                                                                            \\\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION\n#else // CATCH_CONFIG_DISABLE\n\n#define CATCH_REGISTER_REPORTER(name, reporterType)\n#define CATCH_REGISTER_LISTENER(listenerType)\n\n#endif // CATCH_CONFIG_DISABLE\n\n// end catch_reporter_registrars.hpp\n// Allow users to base their work off existing reporters\n// start catch_reporter_compact.h\n\nnamespace Catch {\n\nstruct CompactReporter : StreamingReporterBase<CompactReporter> {\n\n  using StreamingReporterBase::StreamingReporterBase;\n\n  ~CompactReporter() override;\n\n  static std::string getDescription();\n\n  void noMatchingTestCases(std::string const& spec) override;\n\n  void assertionStarting(AssertionInfo const&) override;\n\n  bool assertionEnded(AssertionStats const& _assertionStats) override;\n\n  void sectionEnded(SectionStats const& _sectionStats) override;\n\n  void testRunEnded(TestRunStats const& _testRunStats) override;\n};\n\n} // end namespace Catch\n\n// end catch_reporter_compact.h\n// start catch_reporter_console.h\n\n#if defined(_MSC_VER)\n#pragma warning(push)\n#pragma warning(disable : 4061) // Not all labels are EXPLICITLY handled in\n                                // switch Note that 4062 
(not all labels are\n                                // handled and default is missing) is enabled\n#endif\n\nnamespace Catch {\n// Fwd decls\nstruct SummaryColumn;\nclass TablePrinter;\n\nstruct ConsoleReporter : StreamingReporterBase<ConsoleReporter> {\n  std::unique_ptr<TablePrinter> m_tablePrinter;\n\n  ConsoleReporter(ReporterConfig const& config);\n  ~ConsoleReporter() override;\n  static std::string getDescription();\n\n  void noMatchingTestCases(std::string const& spec) override;\n\n  void reportInvalidArguments(std::string const& arg) override;\n\n  void assertionStarting(AssertionInfo const&) override;\n\n  bool assertionEnded(AssertionStats const& _assertionStats) override;\n\n  void sectionStarting(SectionInfo const& _sectionInfo) override;\n  void sectionEnded(SectionStats const& _sectionStats) override;\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n  void benchmarkPreparing(std::string const& name) override;\n  void benchmarkStarting(BenchmarkInfo const& info) override;\n  void benchmarkEnded(BenchmarkStats<> const& stats) override;\n  void benchmarkFailed(std::string const& error) override;\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\n  void testCaseEnded(TestCaseStats const& _testCaseStats) override;\n  void testGroupEnded(TestGroupStats const& _testGroupStats) override;\n  void testRunEnded(TestRunStats const& _testRunStats) override;\n  void testRunStarting(TestRunInfo const& _testRunInfo) override;\n\nprivate:\n  void lazyPrint();\n\n  void lazyPrintWithoutClosingBenchmarkTable();\n  void lazyPrintRunInfo();\n  void lazyPrintGroupInfo();\n  void printTestCaseAndSectionHeader();\n\n  void printClosedHeader(std::string const& _name);\n  void printOpenHeader(std::string const& _name);\n\n  // if string has a : in first line will set indent to follow it on\n  // subsequent lines\n  void printHeaderString(std::string const& _string, std::size_t indent = 0);\n\n  void printTotals(Totals const& totals);\n  void printSummaryRow(std::string const& 
label,\n                       std::vector<SummaryColumn> const& cols, std::size_t row);\n\n  void printTotalsDivider(Totals const& totals);\n  void printSummaryDivider();\n  void printTestFilters();\n\nprivate:\n  bool m_headerPrinted = false;\n};\n\n} // end namespace Catch\n\n#if defined(_MSC_VER)\n#pragma warning(pop)\n#endif\n\n// end catch_reporter_console.h\n// start catch_reporter_junit.h\n\n// start catch_xmlwriter.h\n\n#include <vector>\n\nnamespace Catch {\nenum class XmlFormatting {\n  None    = 0x00,\n  Indent  = 0x01,\n  Newline = 0x02,\n};\n\nXmlFormatting operator|(XmlFormatting lhs, XmlFormatting rhs);\nXmlFormatting operator&(XmlFormatting lhs, XmlFormatting rhs);\n\nclass XmlEncode {\npublic:\n  enum ForWhat { ForTextNodes, ForAttributes };\n\n  XmlEncode(std::string const& str, ForWhat forWhat = ForTextNodes);\n\n  void encodeTo(std::ostream& os) const;\n\n  friend std::ostream& operator<<(std::ostream& os, XmlEncode const& xmlEncode);\n\nprivate:\n  std::string m_str;\n  ForWhat m_forWhat;\n};\n\nclass XmlWriter {\npublic:\n  class ScopedElement {\n  public:\n    ScopedElement(XmlWriter* writer, XmlFormatting fmt);\n\n    ScopedElement(ScopedElement&& other) noexcept;\n    ScopedElement& operator=(ScopedElement&& other) noexcept;\n\n    ~ScopedElement();\n\n    ScopedElement& writeText(std::string const& text,\n                             XmlFormatting fmt = XmlFormatting::Newline |\n                                                 XmlFormatting::Indent);\n\n    template <typename T>\n    ScopedElement& writeAttribute(std::string const& name, T const& attribute) {\n      m_writer->writeAttribute(name, attribute);\n      return *this;\n    }\n\n  private:\n    mutable XmlWriter* m_writer = nullptr;\n    XmlFormatting m_fmt;\n  };\n\n  XmlWriter(std::ostream& os = Catch::cout());\n  ~XmlWriter();\n\n  XmlWriter(XmlWriter const&)            = delete;\n  XmlWriter& operator=(XmlWriter const&) = delete;\n\n  XmlWriter& startElement(std::string 
const& name,\n                          XmlFormatting fmt = XmlFormatting::Newline |\n                                              XmlFormatting::Indent);\n\n  ScopedElement scopedElement(std::string const& name,\n                              XmlFormatting fmt = XmlFormatting::Newline |\n                                                  XmlFormatting::Indent);\n\n  XmlWriter& endElement(XmlFormatting fmt = XmlFormatting::Newline |\n                                            XmlFormatting::Indent);\n\n  XmlWriter& writeAttribute(std::string const& name,\n                            std::string const& attribute);\n\n  XmlWriter& writeAttribute(std::string const& name, bool attribute);\n\n  template <typename T>\n  XmlWriter& writeAttribute(std::string const& name, T const& attribute) {\n    ReusableStringStream rss;\n    rss << attribute;\n    return writeAttribute(name, rss.str());\n  }\n\n  XmlWriter& writeText(std::string const& text,\n                       XmlFormatting fmt = XmlFormatting::Newline |\n                                           XmlFormatting::Indent);\n\n  XmlWriter& writeComment(std::string const& text,\n                          XmlFormatting fmt = XmlFormatting::Newline |\n                                              XmlFormatting::Indent);\n\n  void writeStylesheetRef(std::string const& url);\n\n  XmlWriter& writeBlankLine();\n\n  void ensureTagClosed();\n\nprivate:\n  void applyFormatting(XmlFormatting fmt);\n\n  void writeDeclaration();\n\n  void newlineIfNecessary();\n\n  bool m_tagIsOpen    = false;\n  bool m_needsNewline = false;\n  std::vector<std::string> m_tags;\n  std::string m_indent;\n  std::ostream& m_os;\n};\n\n} // namespace Catch\n\n// end catch_xmlwriter.h\nnamespace Catch {\n\nclass JunitReporter : public CumulativeReporterBase<JunitReporter> {\npublic:\n  JunitReporter(ReporterConfig const& _config);\n\n  ~JunitReporter() override;\n\n  static std::string getDescription();\n\n  void noMatchingTestCases(std::string const& 
/*spec*/) override;\n\n  void testRunStarting(TestRunInfo const& runInfo) override;\n\n  void testGroupStarting(GroupInfo const& groupInfo) override;\n\n  void testCaseStarting(TestCaseInfo const& testCaseInfo) override;\n  bool assertionEnded(AssertionStats const& assertionStats) override;\n\n  void testCaseEnded(TestCaseStats const& testCaseStats) override;\n\n  void testGroupEnded(TestGroupStats const& testGroupStats) override;\n\n  void testRunEndedCumulative() override;\n\n  void writeGroup(TestGroupNode const& groupNode, double suiteTime);\n\n  void writeTestCase(TestCaseNode const& testCaseNode);\n\n  void writeSection(std::string const& className, std::string const& rootName,\n                    SectionNode const& sectionNode);\n\n  void writeAssertions(SectionNode const& sectionNode);\n  void writeAssertion(AssertionStats const& stats);\n\n  XmlWriter xml;\n  Timer suiteTimer;\n  std::string stdOutForSuite;\n  std::string stdErrForSuite;\n  unsigned int unexpectedExceptions = 0;\n  bool m_okToFail                   = false;\n};\n\n} // end namespace Catch\n\n// end catch_reporter_junit.h\n// start catch_reporter_xml.h\n\nnamespace Catch {\nclass XmlReporter : public StreamingReporterBase<XmlReporter> {\npublic:\n  XmlReporter(ReporterConfig const& _config);\n\n  ~XmlReporter() override;\n\n  static std::string getDescription();\n\n  virtual std::string getStylesheetRef() const;\n\n  void writeSourceInfo(SourceLineInfo const& sourceInfo);\n\npublic: // StreamingReporterBase\n  void noMatchingTestCases(std::string const& s) override;\n\n  void testRunStarting(TestRunInfo const& testInfo) override;\n\n  void testGroupStarting(GroupInfo const& groupInfo) override;\n\n  void testCaseStarting(TestCaseInfo const& testInfo) override;\n\n  void sectionStarting(SectionInfo const& sectionInfo) override;\n\n  void assertionStarting(AssertionInfo const&) override;\n\n  bool assertionEnded(AssertionStats const& assertionStats) override;\n\n  void 
sectionEnded(SectionStats const& sectionStats) override;\n\n  void testCaseEnded(TestCaseStats const& testCaseStats) override;\n\n  void testGroupEnded(TestGroupStats const& testGroupStats) override;\n\n  void testRunEnded(TestRunStats const& testRunStats) override;\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n  void benchmarkPreparing(std::string const& name) override;\n  void benchmarkStarting(BenchmarkInfo const&) override;\n  void benchmarkEnded(BenchmarkStats<> const&) override;\n  void benchmarkFailed(std::string const&) override;\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\nprivate:\n  Timer m_testCaseTimer;\n  XmlWriter m_xml;\n  int m_sectionDepth = 0;\n};\n\n} // end namespace Catch\n\n// end catch_reporter_xml.h\n\n// end catch_external_interfaces.h\n#endif\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n// start catch_benchmarking_all.hpp\n\n// A proxy header that includes all of the benchmarking headers to allow\n// concise include of the benchmarking features. You should prefer the\n// individual includes in standard use.\n\n// start catch_benchmark.hpp\n\n// Benchmark\n\n// start catch_chronometer.hpp\n\n// User-facing chronometer\n\n// start catch_clock.hpp\n\n// Clocks\n\n#include <chrono>\n#include <ratio>\n\nnamespace Catch {\nnamespace Benchmark {\ntemplate <typename Clock>\nusing ClockDuration = typename Clock::duration;\ntemplate <typename Clock>\nusing FloatDuration = std::chrono::duration<double, typename Clock::period>;\n\ntemplate <typename Clock>\nusing TimePoint = typename Clock::time_point;\n\nusing default_clock = std::chrono::steady_clock;\n\ntemplate <typename Clock>\nstruct now {\n  TimePoint<Clock> operator()() const { return Clock::now(); }\n};\n\nusing fp_seconds = std::chrono::duration<double, std::ratio<1>>;\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_clock.hpp\n// start catch_optimizer.hpp\n\n// Hinting the optimizer\n\n#if defined(_MSC_VER)\n#include <atomic> // 
atomic_thread_fence\n#endif\n\nnamespace Catch {\nnamespace Benchmark {\n#if defined(__GNUC__) || defined(__clang__)\ntemplate <typename T>\ninline void keep_memory(T* p) {\n  asm volatile(\"\" : : \"g\"(p) : \"memory\");\n}\ninline void keep_memory() { asm volatile(\"\" : : : \"memory\"); }\n\nnamespace Detail {\ninline void optimizer_barrier() { keep_memory(); }\n} // namespace Detail\n#elif defined(_MSC_VER)\n\n#pragma optimize(\"\", off)\ntemplate <typename T>\ninline void keep_memory(T* p) {\n  // thanks @milleniumbug\n  *reinterpret_cast<char volatile*>(p) =\n      *reinterpret_cast<char const volatile*>(p);\n}\n// TODO equivalent keep_memory()\n#pragma optimize(\"\", on)\n\nnamespace Detail {\ninline void optimizer_barrier() {\n  std::atomic_thread_fence(std::memory_order_seq_cst);\n}\n} // namespace Detail\n\n#endif\n\ntemplate <typename T>\ninline void deoptimize_value(T&& x) {\n  keep_memory(&x);\n}\n\ntemplate <typename Fn, typename... Args>\ninline auto invoke_deoptimized(Fn&& fn, Args&&... args) ->\n    typename std::enable_if<\n        !std::is_same<void, decltype(fn(args...))>::value>::type {\n  deoptimize_value(std::forward<Fn>(fn)(std::forward<Args...>(args...)));\n}\n\ntemplate <typename Fn, typename... Args>\ninline auto invoke_deoptimized(Fn&& fn, Args&&... 
args) ->\n    typename std::enable_if<\n        std::is_same<void, decltype(fn(args...))>::value>::type {\n  std::forward<Fn>(fn)(std::forward<Args...>(args...));\n}\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_optimizer.hpp\n// start catch_complete_invoke.hpp\n\n// Invoke with a special case for void\n\n#include <type_traits>\n#include <utility>\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\ntemplate <typename T>\nstruct CompleteType {\n  using type = T;\n};\ntemplate <>\nstruct CompleteType<void> {\n  struct type {};\n};\n\ntemplate <typename T>\nusing CompleteType_t = typename CompleteType<T>::type;\n\ntemplate <typename Result>\nstruct CompleteInvoker {\n  template <typename Fun, typename... Args>\n  static Result invoke(Fun&& fun, Args&&... args) {\n    return std::forward<Fun>(fun)(std::forward<Args>(args)...);\n  }\n};\ntemplate <>\nstruct CompleteInvoker<void> {\n  template <typename Fun, typename... Args>\n  static CompleteType_t<void> invoke(Fun&& fun, Args&&... args) {\n    std::forward<Fun>(fun)(std::forward<Args>(args)...);\n    return {};\n  }\n};\n\n// invoke and not return void :(\ntemplate <typename Fun, typename... Args>\nCompleteType_t<FunctionReturnType<Fun, Args...>>\ncomplete_invoke(Fun&& fun, Args&&... 
args) {\n  return CompleteInvoker<FunctionReturnType<Fun, Args...>>::invoke(\n      std::forward<Fun>(fun), std::forward<Args>(args)...);\n}\n\nconst std::string benchmarkErrorMsg = \"a benchmark failed to run successfully\";\n} // namespace Detail\n\ntemplate <typename Fun>\nDetail::CompleteType_t<FunctionReturnType<Fun>> user_code(Fun&& fun) {\n  CATCH_TRY { return Detail::complete_invoke(std::forward<Fun>(fun)); }\n  CATCH_CATCH_ALL {\n    getResultCapture().benchmarkFailed(translateActiveException());\n    CATCH_RUNTIME_ERROR(Detail::benchmarkErrorMsg);\n  }\n}\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_complete_invoke.hpp\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\nstruct ChronometerConcept {\n  virtual void start()          = 0;\n  virtual void finish()         = 0;\n  virtual ~ChronometerConcept() = default;\n};\ntemplate <typename Clock>\nstruct ChronometerModel final : public ChronometerConcept {\n  void start() override { started = Clock::now(); }\n  void finish() override { finished = Clock::now(); }\n\n  ClockDuration<Clock> elapsed() const { return finished - started; }\n\n  TimePoint<Clock> started;\n  TimePoint<Clock> finished;\n};\n} // namespace Detail\n\nstruct Chronometer {\npublic:\n  template <typename Fun>\n  void measure(Fun&& fun) {\n    measure(std::forward<Fun>(fun), is_callable<Fun(int)>());\n  }\n\n  int runs() const { return k; }\n\n  Chronometer(Detail::ChronometerConcept& meter, int k) : impl(&meter), k(k) {}\n\nprivate:\n  template <typename Fun>\n  void measure(Fun&& fun, std::false_type) {\n    measure([&fun](int) { return fun(); }, std::true_type());\n  }\n\n  template <typename Fun>\n  void measure(Fun&& fun, std::true_type) {\n    Detail::optimizer_barrier();\n    impl->start();\n    for (int i = 0; i < k; ++i)\n      invoke_deoptimized(fun, i);\n    impl->finish();\n    Detail::optimizer_barrier();\n  }\n\n  Detail::ChronometerConcept* impl;\n  int k;\n};\n} // namespace Benchmark\n} // 
namespace Catch\n\n// end catch_chronometer.hpp\n// start catch_environment.hpp\n\n// Environment information\n\nnamespace Catch {\nnamespace Benchmark {\ntemplate <typename Duration>\nstruct EnvironmentEstimate {\n  Duration mean;\n  OutlierClassification outliers;\n\n  template <typename Duration2>\n  operator EnvironmentEstimate<Duration2>() const {\n    return {mean, outliers};\n  }\n};\ntemplate <typename Clock>\nstruct Environment {\n  using clock_type = Clock;\n  EnvironmentEstimate<FloatDuration<Clock>> clock_resolution;\n  EnvironmentEstimate<FloatDuration<Clock>> clock_cost;\n};\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_environment.hpp\n// start catch_execution_plan.hpp\n\n// Execution plan\n\n// start catch_benchmark_function.hpp\n\n// Dumb std::function implementation for consistent call overhead\n\n#include <cassert>\n#include <type_traits>\n#include <utility>\n#include <memory>\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\ntemplate <typename T>\nusing Decay = typename std::decay<T>::type;\ntemplate <typename T, typename U>\nstruct is_related : std::is_same<Decay<T>, Decay<U>> {};\n\n/// We need to reinvent std::function because every piece of code that might add\n/// overhead in a measurement context needs to have consistent performance\n/// characteristics so that we can account for it in the measurement.\n/// Implementations of std::function with optimizations that aren't always\n/// applicable, like small buffer optimizations, are not uncommon. 
This is\n/// effectively an implementation of std::function without any such\n/// optimizations; it may be slow, but it is consistently slow.\nstruct BenchmarkFunction {\nprivate:\n  struct callable {\n    virtual void call(Chronometer meter) const = 0;\n    virtual callable* clone() const            = 0;\n    virtual ~callable()                        = default;\n  };\n  template <typename Fun>\n  struct model : public callable {\n    model(Fun&& fun) : fun(std::move(fun)) {}\n    model(Fun const& fun) : fun(fun) {}\n\n    model<Fun>* clone() const override { return new model<Fun>(*this); }\n\n    void call(Chronometer meter) const override {\n      call(meter, is_callable<Fun(Chronometer)>());\n    }\n    void call(Chronometer meter, std::true_type) const { fun(meter); }\n    void call(Chronometer meter, std::false_type) const { meter.measure(fun); }\n\n    Fun fun;\n  };\n\n  struct do_nothing {\n    void operator()() const {}\n  };\n\n  template <typename T>\n  BenchmarkFunction(model<T>* c) : f(c) {}\n\npublic:\n  BenchmarkFunction() : f(new model<do_nothing>{{}}) {}\n\n  template <typename Fun,\n            typename std::enable_if<!is_related<Fun, BenchmarkFunction>::value,\n                                    int>::type = 0>\n  BenchmarkFunction(Fun&& fun)\n      : f(new model<typename std::decay<Fun>::type>(std::forward<Fun>(fun))) {}\n\n  BenchmarkFunction(BenchmarkFunction&& that) : f(std::move(that.f)) {}\n\n  BenchmarkFunction(BenchmarkFunction const& that) : f(that.f->clone()) {}\n\n  BenchmarkFunction& operator=(BenchmarkFunction&& that) {\n    f = std::move(that.f);\n    return *this;\n  }\n\n  BenchmarkFunction& operator=(BenchmarkFunction const& that) {\n    f.reset(that.f->clone());\n    return *this;\n  }\n\n  void operator()(Chronometer meter) const { f->call(meter); }\n\nprivate:\n  std::unique_ptr<callable> f;\n};\n} // namespace Detail\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_benchmark_function.hpp\n// start 
catch_repeat.hpp\n\n// repeat algorithm\n\n#include <type_traits>\n#include <utility>\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\ntemplate <typename Fun>\nstruct repeater {\n  void operator()(int k) const {\n    for (int i = 0; i < k; ++i) {\n      fun();\n    }\n  }\n  Fun fun;\n};\ntemplate <typename Fun>\nrepeater<typename std::decay<Fun>::type> repeat(Fun&& fun) {\n  return {std::forward<Fun>(fun)};\n}\n} // namespace Detail\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_repeat.hpp\n// start catch_run_for_at_least.hpp\n\n// Run a function for a minimum amount of time\n\n// start catch_measure.hpp\n\n// Measure\n\n// start catch_timing.hpp\n\n// Timing\n\n#include <tuple>\n#include <type_traits>\n\nnamespace Catch {\nnamespace Benchmark {\ntemplate <typename Duration, typename Result>\nstruct Timing {\n  Duration elapsed;\n  Result result;\n  int iterations;\n};\ntemplate <typename Clock, typename Func, typename... Args>\nusing TimingOf =\n    Timing<ClockDuration<Clock>,\n           Detail::CompleteType_t<FunctionReturnType<Func, Args...>>>;\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_timing.hpp\n#include <utility>\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\ntemplate <typename Clock, typename Fun, typename... Args>\nTimingOf<Clock, Fun, Args...> measure(Fun&& fun, Args&&... 
args) {\n  auto start = Clock::now();\n  auto&& r   = Detail::complete_invoke(fun, std::forward<Args>(args)...);\n  auto end   = Clock::now();\n  auto delta = end - start;\n  return {delta, std::forward<decltype(r)>(r), 1};\n}\n} // namespace Detail\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_measure.hpp\n#include <utility>\n#include <type_traits>\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\ntemplate <typename Clock, typename Fun>\nTimingOf<Clock, Fun, int> measure_one(Fun&& fun, int iters, std::false_type) {\n  return Detail::measure<Clock>(fun, iters);\n}\ntemplate <typename Clock, typename Fun>\nTimingOf<Clock, Fun, Chronometer> measure_one(Fun&& fun, int iters,\n                                              std::true_type) {\n  Detail::ChronometerModel<Clock> meter;\n  auto&& result = Detail::complete_invoke(fun, Chronometer(meter, iters));\n\n  return {meter.elapsed(), std::move(result), iters};\n}\n\ntemplate <typename Clock, typename Fun>\nusing run_for_at_least_argument_t =\n    typename std::conditional<is_callable<Fun(Chronometer)>::value, Chronometer,\n                              int>::type;\n\nstruct optimized_away_error : std::exception {\n  const char* what() const noexcept override {\n    return \"could not measure benchmark, maybe it was optimized away\";\n  }\n};\n\ntemplate <typename Clock, typename Fun>\nTimingOf<Clock, Fun, run_for_at_least_argument_t<Clock, Fun>>\nrun_for_at_least(ClockDuration<Clock> how_long, int seed, Fun&& fun) {\n  auto iters = seed;\n  while (iters < (1 << 30)) {\n    auto&& Timing =\n        measure_one<Clock>(fun, iters, is_callable<Fun(Chronometer)>());\n\n    if (Timing.elapsed >= how_long) {\n      return {Timing.elapsed, std::move(Timing.result), iters};\n    }\n    iters *= 2;\n  }\n  throw optimized_away_error{};\n}\n} // namespace Detail\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_run_for_at_least.hpp\n#include <algorithm>\n\nnamespace Catch 
{\nnamespace Benchmark {\ntemplate <typename Duration>\nstruct ExecutionPlan {\n  int iterations_per_sample;\n  Duration estimated_duration;\n  Detail::BenchmarkFunction benchmark;\n  Duration warmup_time;\n  int warmup_iterations;\n\n  template <typename Duration2>\n  operator ExecutionPlan<Duration2>() const {\n    return {iterations_per_sample, estimated_duration, benchmark, warmup_time,\n            warmup_iterations};\n  }\n\n  template <typename Clock>\n  std::vector<FloatDuration<Clock>>\n  run(const IConfig& cfg, Environment<FloatDuration<Clock>> env) const {\n    // warmup a bit\n    Detail::run_for_at_least<Clock>(\n        std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time),\n        warmup_iterations, Detail::repeat(now<Clock>{}));\n\n    std::vector<FloatDuration<Clock>> times;\n    times.reserve(cfg.benchmarkSamples());\n    std::generate_n(\n        std::back_inserter(times), cfg.benchmarkSamples(), [this, env] {\n          Detail::ChronometerModel<Clock> model;\n          this->benchmark(Chronometer(model, iterations_per_sample));\n          auto sample_time = model.elapsed() - env.clock_cost.mean;\n          if (sample_time < FloatDuration<Clock>::zero())\n            sample_time = FloatDuration<Clock>::zero();\n          return sample_time / iterations_per_sample;\n        });\n    return times;\n  }\n};\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_execution_plan.hpp\n// start catch_estimate_clock.hpp\n\n// Environment measurement\n\n// start catch_stats.hpp\n\n// Statistical analysis tools\n\n#include <algorithm>\n#include <functional>\n#include <vector>\n#include <iterator>\n#include <numeric>\n#include <tuple>\n#include <cmath>\n#include <utility>\n#include <cstddef>\n#include <random>\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\nusing sample = std::vector<double>;\n\ndouble weighted_average_quantile(int k, int q,\n                                 std::vector<double>::iterator first,\n            
                     std::vector<double>::iterator last);\n\ntemplate <typename Iterator>\nOutlierClassification classify_outliers(Iterator first, Iterator last) {\n  std::vector<double> copy(first, last);\n\n  auto q1  = weighted_average_quantile(1, 4, copy.begin(), copy.end());\n  auto q3  = weighted_average_quantile(3, 4, copy.begin(), copy.end());\n  auto iqr = q3 - q1;\n  auto los = q1 - (iqr * 3.);\n  auto lom = q1 - (iqr * 1.5);\n  auto him = q3 + (iqr * 1.5);\n  auto his = q3 + (iqr * 3.);\n\n  OutlierClassification o;\n  for (; first != last; ++first) {\n    auto&& t = *first;\n    if (t < los)\n      ++o.low_severe;\n    else if (t < lom)\n      ++o.low_mild;\n    else if (t > his)\n      ++o.high_severe;\n    else if (t > him)\n      ++o.high_mild;\n    ++o.samples_seen;\n  }\n  return o;\n}\n\ntemplate <typename Iterator>\ndouble mean(Iterator first, Iterator last) {\n  auto count = last - first;\n  double sum = std::accumulate(first, last, 0.);\n  return sum / count;\n}\n\ntemplate <typename URng, typename Iterator, typename Estimator>\nsample resample(URng& rng, int resamples, Iterator first, Iterator last,\n                Estimator& estimator) {\n  auto n = last - first;\n  std::uniform_int_distribution<decltype(n)> dist(0, n - 1);\n\n  sample out;\n  out.reserve(resamples);\n  std::generate_n(\n      std::back_inserter(out), resamples, [n, first, &estimator, &dist, &rng] {\n        std::vector<double> resampled;\n        resampled.reserve(n);\n        std::generate_n(std::back_inserter(resampled), n,\n                        [first, &dist, &rng] { return first[dist(rng)]; });\n        return estimator(resampled.begin(), resampled.end());\n      });\n  std::sort(out.begin(), out.end());\n  return out;\n}\n\ntemplate <typename Estimator, typename Iterator>\nsample jackknife(Estimator&& estimator, Iterator first, Iterator last) {\n  auto n      = last - first;\n  auto second = std::next(first);\n  sample results;\n  results.reserve(n);\n\n  for (auto 
it = first; it != last; ++it) {\n    std::iter_swap(it, first);\n    results.push_back(estimator(second, last));\n  }\n\n  return results;\n}\n\ninline double normal_cdf(double x) {\n  return std::erfc(-x / std::sqrt(2.0)) / 2.0;\n}\n\ndouble erfc_inv(double x);\n\ndouble normal_quantile(double p);\n\ntemplate <typename Iterator, typename Estimator>\nEstimate<double> bootstrap(double confidence_level, Iterator first,\n                           Iterator last, sample const& resample,\n                           Estimator&& estimator) {\n  auto n_samples = last - first;\n\n  double point = estimator(first, last);\n  // Degenerate case with a single sample\n  if (n_samples == 1)\n    return {point, point, point, confidence_level};\n\n  sample jack      = jackknife(estimator, first, last);\n  double jack_mean = mean(jack.begin(), jack.end());\n  double sum_squares, sum_cubes;\n  std::tie(sum_squares, sum_cubes) =\n      std::accumulate(jack.begin(), jack.end(), std::make_pair(0., 0.),\n                      [jack_mean](std::pair<double, double> sqcb,\n                                  double x) -> std::pair<double, double> {\n                        auto d  = jack_mean - x;\n                        auto d2 = d * d;\n                        auto d3 = d2 * d;\n                        return {sqcb.first + d2, sqcb.second + d3};\n                      });\n\n  double accel  = sum_cubes / (6 * std::pow(sum_squares, 1.5));\n  int n         = static_cast<int>(resample.size());\n  double prob_n = std::count_if(resample.begin(), resample.end(),\n                                [point](double x) { return x < point; }) /\n                  (double)n;\n  // degenerate case with uniform samples\n  if (prob_n == 0)\n    return {point, point, point, confidence_level};\n\n  double bias = normal_quantile(prob_n);\n  double z1   = normal_quantile((1. 
- confidence_level) / 2.);\n\n  auto cumn = [n](double x) -> int { return std::lround(normal_cdf(x) * n); };\n  auto a    = [bias, accel](double b) { return bias + b / (1. - accel * b); };\n  double b1 = bias + z1;\n  double b2 = bias - z1;\n  double a1 = a(b1);\n  double a2 = a(b2);\n  auto lo   = (std::max)(cumn(a1), 0);\n  auto hi   = (std::min)(cumn(a2), n - 1);\n\n  return {point, resample[lo], resample[hi], confidence_level};\n}\n\ndouble outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n);\n\nstruct bootstrap_analysis {\n  Estimate<double> mean;\n  Estimate<double> standard_deviation;\n  double outlier_variance;\n};\n\nbootstrap_analysis analyse_samples(double confidence_level, int n_resamples,\n                                   std::vector<double>::iterator first,\n                                   std::vector<double>::iterator last);\n} // namespace Detail\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_stats.hpp\n#include <algorithm>\n#include <iterator>\n#include <tuple>\n#include <vector>\n#include <cmath>\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\ntemplate <typename Clock>\nstd::vector<double> resolution(int k) {\n  std::vector<TimePoint<Clock>> times;\n  times.reserve(k + 1);\n  std::generate_n(std::back_inserter(times), k + 1, now<Clock>{});\n\n  std::vector<double> deltas;\n  deltas.reserve(k);\n  std::transform(std::next(times.begin()), times.end(), times.begin(),\n                 std::back_inserter(deltas),\n                 [](TimePoint<Clock> a, TimePoint<Clock> b) {\n                   return static_cast<double>((a - b).count());\n                 });\n\n  return deltas;\n}\n\nconst auto warmup_iterations                = 10000;\nconst auto warmup_time                      = std::chrono::milliseconds(100);\nconst auto minimum_ticks                    = 1000;\nconst auto warmup_seed                      = 10000;\nconst auto clock_resolution_estimation_time = 
std::chrono::milliseconds(500);\nconst auto clock_cost_estimation_time_limit = std::chrono::seconds(1);\nconst auto clock_cost_estimation_tick_limit = 100000;\nconst auto clock_cost_estimation_time       = std::chrono::milliseconds(10);\nconst auto clock_cost_estimation_iterations = 10000;\n\ntemplate <typename Clock>\nint warmup() {\n  return run_for_at_least<Clock>(\n             std::chrono::duration_cast<ClockDuration<Clock>>(warmup_time),\n             warmup_seed, &resolution<Clock>)\n      .iterations;\n}\ntemplate <typename Clock>\nEnvironmentEstimate<FloatDuration<Clock>>\nestimate_clock_resolution(int iterations) {\n  auto r =\n      run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(\n                                  clock_resolution_estimation_time),\n                              iterations, &resolution<Clock>)\n          .result;\n  return {\n      FloatDuration<Clock>(mean(r.begin(), r.end())),\n      classify_outliers(r.begin(), r.end()),\n  };\n}\ntemplate <typename Clock>\nEnvironmentEstimate<FloatDuration<Clock>>\nestimate_clock_cost(FloatDuration<Clock> resolution) {\n  auto time_limit =\n      (std::min)(resolution * clock_cost_estimation_tick_limit,\n                 FloatDuration<Clock>(clock_cost_estimation_time_limit));\n  auto time_clock = [](int k) {\n    return Detail::measure<Clock>([k] {\n             for (int i = 0; i < k; ++i) {\n               volatile auto ignored = Clock::now();\n               (void)ignored;\n             }\n           })\n        .elapsed;\n  };\n  time_clock(1);\n  int iters = clock_cost_estimation_iterations;\n  auto&& r =\n      run_for_at_least<Clock>(std::chrono::duration_cast<ClockDuration<Clock>>(\n                                  clock_cost_estimation_time),\n                              iters, time_clock);\n  std::vector<double> times;\n  int nsamples = static_cast<int>(std::ceil(time_limit / r.elapsed));\n  times.reserve(nsamples);\n  std::generate_n(std::back_inserter(times), 
nsamples, [time_clock, &r] {\n    return static_cast<double>(\n        (time_clock(r.iterations) / r.iterations).count());\n  });\n  return {\n      FloatDuration<Clock>(mean(times.begin(), times.end())),\n      classify_outliers(times.begin(), times.end()),\n  };\n}\n\ntemplate <typename Clock>\nEnvironment<FloatDuration<Clock>> measure_environment() {\n  static Environment<FloatDuration<Clock>>* env = nullptr;\n  if (env) {\n    return *env;\n  }\n\n  auto iters      = Detail::warmup<Clock>();\n  auto resolution = Detail::estimate_clock_resolution<Clock>(iters);\n  auto cost       = Detail::estimate_clock_cost<Clock>(resolution.mean);\n\n  env = new Environment<FloatDuration<Clock>>{resolution, cost};\n  return *env;\n}\n} // namespace Detail\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_estimate_clock.hpp\n// start catch_analyse.hpp\n\n// Run and analyse one benchmark\n\n// start catch_sample_analysis.hpp\n\n// Benchmark results\n\n#include <algorithm>\n#include <vector>\n#include <string>\n#include <iterator>\n\nnamespace Catch {\nnamespace Benchmark {\ntemplate <typename Duration>\nstruct SampleAnalysis {\n  std::vector<Duration> samples;\n  Estimate<Duration> mean;\n  Estimate<Duration> standard_deviation;\n  OutlierClassification outliers;\n  double outlier_variance;\n\n  template <typename Duration2>\n  operator SampleAnalysis<Duration2>() const {\n    std::vector<Duration2> samples2;\n    samples2.reserve(samples.size());\n    std::transform(samples.begin(), samples.end(), std::back_inserter(samples2),\n                   [](Duration d) { return Duration2(d); });\n    return {\n        std::move(samples2), mean, standard_deviation, outliers,\n        outlier_variance,\n    };\n  }\n};\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_sample_analysis.hpp\n#include <algorithm>\n#include <iterator>\n#include <vector>\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\ntemplate <typename Duration, typename 
Iterator>\nSampleAnalysis<Duration> analyse(const IConfig& cfg, Environment<Duration>,\n                                 Iterator first, Iterator last) {\n  if (!cfg.benchmarkNoAnalysis()) {\n    std::vector<double> samples;\n    samples.reserve(last - first);\n    std::transform(first, last, std::back_inserter(samples),\n                   [](Duration d) { return d.count(); });\n\n    auto analysis = Catch::Benchmark::Detail::analyse_samples(\n        cfg.benchmarkConfidenceInterval(), cfg.benchmarkResamples(),\n        samples.begin(), samples.end());\n    auto outliers = Catch::Benchmark::Detail::classify_outliers(samples.begin(),\n                                                                samples.end());\n\n    auto wrap_estimate = [](Estimate<double> e) {\n      return Estimate<Duration>{\n          Duration(e.point),\n          Duration(e.lower_bound),\n          Duration(e.upper_bound),\n          e.confidence_interval,\n      };\n    };\n    std::vector<Duration> samples2;\n    samples2.reserve(samples.size());\n    std::transform(samples.begin(), samples.end(), std::back_inserter(samples2),\n                   [](double d) { return Duration(d); });\n    return {\n        std::move(samples2),\n        wrap_estimate(analysis.mean),\n        wrap_estimate(analysis.standard_deviation),\n        outliers,\n        analysis.outlier_variance,\n    };\n  } else {\n    std::vector<Duration> samples;\n    samples.reserve(last - first);\n\n    Duration mean = Duration(0);\n    int i         = 0;\n    for (auto it = first; it < last; ++it, ++i) {\n      samples.push_back(Duration(*it));\n      mean += Duration(*it);\n    }\n    mean /= i;\n\n    return {std::move(samples), Estimate<Duration>{mean, mean, mean, 0.0},\n            Estimate<Duration>{Duration(0), Duration(0), Duration(0), 0.0},\n            OutlierClassification{}, 0.0};\n  }\n}\n} // namespace Detail\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_analyse.hpp\n#include 
<algorithm>\n#include <functional>\n#include <string>\n#include <vector>\n#include <cmath>\n\nnamespace Catch {\nnamespace Benchmark {\nstruct Benchmark {\n  Benchmark(std::string&& name) : name(std::move(name)) {}\n\n  template <class FUN>\n  Benchmark(std::string&& name, FUN&& func)\n      : fun(std::move(func)), name(std::move(name)) {}\n\n  template <typename Clock>\n  ExecutionPlan<FloatDuration<Clock>>\n  prepare(const IConfig& cfg, Environment<FloatDuration<Clock>> env) const {\n    auto min_time = env.clock_resolution.mean * Detail::minimum_ticks;\n    auto run_time =\n        std::max(min_time, std::chrono::duration_cast<decltype(min_time)>(\n                               cfg.benchmarkWarmupTime()));\n    auto&& test = Detail::run_for_at_least<Clock>(\n        std::chrono::duration_cast<ClockDuration<Clock>>(run_time), 1, fun);\n    int new_iters =\n        static_cast<int>(std::ceil(min_time * test.iterations / test.elapsed));\n    return {new_iters,\n            test.elapsed / test.iterations * new_iters * cfg.benchmarkSamples(),\n            fun,\n            std::chrono::duration_cast<FloatDuration<Clock>>(\n                cfg.benchmarkWarmupTime()),\n            Detail::warmup_iterations};\n  }\n\n  template <typename Clock = default_clock>\n  void run() {\n    IConfigPtr cfg = getCurrentContext().getConfig();\n\n    auto env = Detail::measure_environment<Clock>();\n\n    getResultCapture().benchmarkPreparing(name);\n    CATCH_TRY {\n      auto plan = user_code([&] { return prepare<Clock>(*cfg, env); });\n\n      BenchmarkInfo info{name,\n                         plan.estimated_duration.count(),\n                         plan.iterations_per_sample,\n                         cfg->benchmarkSamples(),\n                         cfg->benchmarkResamples(),\n                         env.clock_resolution.mean.count(),\n                         env.clock_cost.mean.count()};\n\n      getResultCapture().benchmarkStarting(info);\n\n      auto samples =\n        
  user_code([&] { return plan.template run<Clock>(*cfg, env); });\n\n      auto analysis =\n          Detail::analyse(*cfg, env, samples.begin(), samples.end());\n      BenchmarkStats<FloatDuration<Clock>> stats{info,\n                                                 analysis.samples,\n                                                 analysis.mean,\n                                                 analysis.standard_deviation,\n                                                 analysis.outliers,\n                                                 analysis.outlier_variance};\n      getResultCapture().benchmarkEnded(stats);\n    }\n    CATCH_CATCH_ALL {\n      if (translateActiveException() !=\n          Detail::benchmarkErrorMsg) // benchmark errors have been reported,\n                                     // otherwise rethrow.\n        std::rethrow_exception(std::current_exception());\n    }\n  }\n\n  // sets lambda to be used in fun *and* executes benchmark!\n  template <typename Fun,\n            typename std::enable_if<!Detail::is_related<Fun, Benchmark>::value,\n                                    int>::type = 0>\n  Benchmark& operator=(Fun func) {\n    fun = Detail::BenchmarkFunction(func);\n    run();\n    return *this;\n  }\n\n  explicit operator bool() { return true; }\n\nprivate:\n  Detail::BenchmarkFunction fun;\n  std::string name;\n};\n} // namespace Benchmark\n} // namespace Catch\n\n#define INTERNAL_CATCH_GET_1_ARG(arg1, arg2, ...) arg1\n#define INTERNAL_CATCH_GET_2_ARG(arg1, arg2, ...) 
arg2\n\n#define INTERNAL_CATCH_BENCHMARK(BenchmarkName, name, benchmarkIndex)          \\\n  if (Catch::Benchmark::Benchmark BenchmarkName{name})                         \\\n  BenchmarkName = [&](int benchmarkIndex)\n\n#define INTERNAL_CATCH_BENCHMARK_ADVANCED(BenchmarkName, name)                 \\\n  if (Catch::Benchmark::Benchmark BenchmarkName{name})                         \\\n  BenchmarkName = [&]\n\n// end catch_benchmark.hpp\n// start catch_constructor.hpp\n\n// Constructor and destructor helpers\n\n#include <type_traits>\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\ntemplate <typename T, bool Destruct>\nstruct ObjectStorage {\n  using TStorage =\n      typename std::aligned_storage<sizeof(T),\n                                    std::alignment_of<T>::value>::type;\n\n  ObjectStorage() : data() {}\n\n  ObjectStorage(const ObjectStorage& other) {\n    new (&data) T(other.stored_object());\n  }\n\n  ObjectStorage(ObjectStorage&& other) {\n    new (&data) T(std::move(other.stored_object()));\n  }\n\n  ~ObjectStorage() { destruct_on_exit<T>(); }\n\n  template <typename... Args>\n  void construct(Args&&... 
args) {\n    new (&data) T(std::forward<Args>(args)...);\n  }\n\n  template <bool AllowManualDestruction = !Destruct>\n  typename std::enable_if<AllowManualDestruction>::type destruct() {\n    stored_object().~T();\n  }\n\nprivate:\n  // If this is a constructor benchmark, destruct the underlying object\n  template <typename U>\n  void destruct_on_exit(typename std::enable_if<Destruct, U>::type* = 0) {\n    destruct<true>();\n  }\n  // Otherwise, don't\n  template <typename U>\n  void destruct_on_exit(typename std::enable_if<!Destruct, U>::type* = 0) {}\n\n  T& stored_object() { return *static_cast<T*>(static_cast<void*>(&data)); }\n\n  T const& stored_object() const {\n    return *static_cast<T*>(static_cast<void*>(&data));\n  }\n\n  TStorage data;\n};\n} // namespace Detail\n\ntemplate <typename T>\nusing storage_for = Detail::ObjectStorage<T, true>;\n\ntemplate <typename T>\nusing destructable_object = Detail::ObjectStorage<T, false>;\n} // namespace Benchmark\n} // namespace Catch\n\n// end catch_constructor.hpp\n// end catch_benchmarking_all.hpp\n#endif\n\n#endif // ! 
CATCH_CONFIG_IMPL_ONLY\n\n#ifdef CATCH_IMPL\n// start catch_impl.hpp\n\n#ifdef __clang__\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wweak-vtables\"\n#endif\n\n// Keep these here for external reporters\n// start catch_test_case_tracker.h\n\n#include <string>\n#include <vector>\n#include <memory>\n\nnamespace Catch {\nnamespace TestCaseTracking {\n\nstruct NameAndLocation {\n  std::string name;\n  SourceLineInfo location;\n\n  NameAndLocation(std::string const& _name, SourceLineInfo const& _location);\n  friend bool operator==(NameAndLocation const& lhs,\n                         NameAndLocation const& rhs) {\n    return lhs.name == rhs.name && lhs.location == rhs.location;\n  }\n};\n\nclass ITracker;\n\nusing ITrackerPtr = std::shared_ptr<ITracker>;\n\nclass ITracker {\n  NameAndLocation m_nameAndLocation;\n\npublic:\n  ITracker(NameAndLocation const& nameAndLoc) : m_nameAndLocation(nameAndLoc) {}\n\n  // static queries\n  NameAndLocation const& nameAndLocation() const { return m_nameAndLocation; }\n\n  virtual ~ITracker();\n\n  // dynamic queries\n  virtual bool isComplete() const = 0; // Successfully completed or failed\n  virtual bool isSuccessfullyCompleted() const = 0;\n  virtual bool isOpen() const                  = 0; // Started but not complete\n  virtual bool hasChildren() const             = 0;\n  virtual bool hasStarted() const              = 0;\n\n  virtual ITracker& parent() = 0;\n\n  // actions\n  virtual void close()                   = 0; // Successfully complete\n  virtual void fail()                    = 0;\n  virtual void markAsNeedingAnotherRun() = 0;\n\n  virtual void addChild(ITrackerPtr const& child)                       = 0;\n  virtual ITrackerPtr findChild(NameAndLocation const& nameAndLocation) = 0;\n  virtual void openChild()                                              = 0;\n\n  // Debug/ checking\n  virtual bool isSectionTracker() const   = 0;\n  virtual bool isGeneratorTracker() const = 0;\n};\n\nclass 
TrackerContext {\n\n  enum RunState { NotStarted, Executing, CompletedCycle };\n\n  ITrackerPtr m_rootTracker;\n  ITracker* m_currentTracker = nullptr;\n  RunState m_runState        = NotStarted;\n\npublic:\n  ITracker& startRun();\n  void endRun();\n\n  void startCycle();\n  void completeCycle();\n\n  bool completedCycle() const;\n  ITracker& currentTracker();\n  void setCurrentTracker(ITracker* tracker);\n};\n\nclass TrackerBase : public ITracker {\nprotected:\n  enum CycleState {\n    NotStarted,\n    Executing,\n    ExecutingChildren,\n    NeedsAnotherRun,\n    CompletedSuccessfully,\n    Failed\n  };\n\n  using Children = std::vector<ITrackerPtr>;\n  TrackerContext& m_ctx;\n  ITracker* m_parent;\n  Children m_children;\n  CycleState m_runState = NotStarted;\n\npublic:\n  TrackerBase(NameAndLocation const& nameAndLocation, TrackerContext& ctx,\n              ITracker* parent);\n\n  bool isComplete() const override;\n  bool isSuccessfullyCompleted() const override;\n  bool isOpen() const override;\n  bool hasChildren() const override;\n  bool hasStarted() const override { return m_runState != NotStarted; }\n\n  void addChild(ITrackerPtr const& child) override;\n\n  ITrackerPtr findChild(NameAndLocation const& nameAndLocation) override;\n  ITracker& parent() override;\n\n  void openChild() override;\n\n  bool isSectionTracker() const override;\n  bool isGeneratorTracker() const override;\n\n  void open();\n\n  void close() override;\n  void fail() override;\n  void markAsNeedingAnotherRun() override;\n\nprivate:\n  void moveToParent();\n  void moveToThis();\n};\n\nclass SectionTracker : public TrackerBase {\n  std::vector<std::string> m_filters;\n  std::string m_trimmed_name;\n\npublic:\n  SectionTracker(NameAndLocation const& nameAndLocation, TrackerContext& ctx,\n                 ITracker* parent);\n\n  bool isSectionTracker() const override;\n\n  bool isComplete() const override;\n\n  static SectionTracker& acquire(TrackerContext& ctx,\n                        
         NameAndLocation const& nameAndLocation);\n\n  void tryOpen();\n\n  void addInitialFilters(std::vector<std::string> const& filters);\n  void addNextFilters(std::vector<std::string> const& filters);\n  //! Returns filters active in this tracker\n  std::vector<std::string> const& getFilters() const;\n  //! Returns whitespace-trimmed name of the tracked section\n  std::string const& trimmedName() const;\n};\n\n} // namespace TestCaseTracking\n\nusing TestCaseTracking::ITracker;\nusing TestCaseTracking::SectionTracker;\nusing TestCaseTracking::TrackerContext;\n\n} // namespace Catch\n\n// end catch_test_case_tracker.h\n\n// start catch_leak_detector.h\n\nnamespace Catch {\n\nstruct LeakDetector {\n  LeakDetector();\n  ~LeakDetector();\n};\n\n} // namespace Catch\n// end catch_leak_detector.h\n// Cpp files will be included in the single-header file here\n// start catch_stats.cpp\n\n// Statistical analysis tools\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n\n#include <cassert>\n#include <random>\n\n#if defined(CATCH_CONFIG_USE_ASYNC)\n#include <future>\n#endif\n\nnamespace {\ndouble erf_inv(double x) {\n  // Code accompanying the article \"Approximating the erfinv function\" in GPU\n  // Computing Gems, Volume 2\n  double w, p;\n\n  w = -log((1.0 - x) * (1.0 + x));\n\n  if (w < 6.250000) {\n    w = w - 3.125000;\n    p = -3.6444120640178196996e-21;\n    p = -1.685059138182016589e-19 + p * w;\n    p = 1.2858480715256400167e-18 + p * w;\n    p = 1.115787767802518096e-17 + p * w;\n    p = -1.333171662854620906e-16 + p * w;\n    p = 2.0972767875968561637e-17 + p * w;\n    p = 6.6376381343583238325e-15 + p * w;\n    p = -4.0545662729752068639e-14 + p * w;\n    p = -8.1519341976054721522e-14 + p * w;\n    p = 2.6335093153082322977e-12 + p * w;\n    p = -1.2975133253453532498e-11 + p * w;\n    p = -5.4154120542946279317e-11 + p * w;\n    p = 1.051212273321532285e-09 + p * w;\n    p = -4.1126339803469836976e-09 + p * w;\n    p = -2.9070369957882005086e-08 + p * w;\n 
   p = 4.2347877827932403518e-07 + p * w;\n    p = -1.3654692000834678645e-06 + p * w;\n    p = -1.3882523362786468719e-05 + p * w;\n    p = 0.0001867342080340571352 + p * w;\n    p = -0.00074070253416626697512 + p * w;\n    p = -0.0060336708714301490533 + p * w;\n    p = 0.24015818242558961693 + p * w;\n    p = 1.6536545626831027356 + p * w;\n  } else if (w < 16.000000) {\n    w = sqrt(w) - 3.250000;\n    p = 2.2137376921775787049e-09;\n    p = 9.0756561938885390979e-08 + p * w;\n    p = -2.7517406297064545428e-07 + p * w;\n    p = 1.8239629214389227755e-08 + p * w;\n    p = 1.5027403968909827627e-06 + p * w;\n    p = -4.013867526981545969e-06 + p * w;\n    p = 2.9234449089955446044e-06 + p * w;\n    p = 1.2475304481671778723e-05 + p * w;\n    p = -4.7318229009055733981e-05 + p * w;\n    p = 6.8284851459573175448e-05 + p * w;\n    p = 2.4031110387097893999e-05 + p * w;\n    p = -0.0003550375203628474796 + p * w;\n    p = 0.00095328937973738049703 + p * w;\n    p = -0.0016882755560235047313 + p * w;\n    p = 0.0024914420961078508066 + p * w;\n    p = -0.0037512085075692412107 + p * w;\n    p = 0.005370914553590063617 + p * w;\n    p = 1.0052589676941592334 + p * w;\n    p = 3.0838856104922207635 + p * w;\n  } else {\n    w = sqrt(w) - 5.000000;\n    p = -2.7109920616438573243e-11;\n    p = -2.5556418169965252055e-10 + p * w;\n    p = 1.5076572693500548083e-09 + p * w;\n    p = -3.7894654401267369937e-09 + p * w;\n    p = 7.6157012080783393804e-09 + p * w;\n    p = -1.4960026627149240478e-08 + p * w;\n    p = 2.9147953450901080826e-08 + p * w;\n    p = -6.7711997758452339498e-08 + p * w;\n    p = 2.2900482228026654717e-07 + p * w;\n    p = -9.9298272942317002539e-07 + p * w;\n    p = 4.5260625972231537039e-06 + p * w;\n    p = -1.9681778105531670567e-05 + p * w;\n    p = 7.5995277030017761139e-05 + p * w;\n    p = -0.00021503011930044477347 + p * w;\n    p = -0.00013871931833623122026 + p * w;\n    p = 1.0103004648645343977 + p * w;\n    p = 4.8499064014085844221 + 
p * w;\n  }\n  return p * x;\n}\n\ndouble standard_deviation(std::vector<double>::iterator first,\n                          std::vector<double>::iterator last) {\n  auto m          = Catch::Benchmark::Detail::mean(first, last);\n  double variance = std::accumulate(first, last, 0.,\n                                    [m](double a, double b) {\n                                      double diff = b - m;\n                                      return a + diff * diff;\n                                    }) /\n                    (last - first);\n  return std::sqrt(variance);\n}\n\n} // namespace\n\nnamespace Catch {\nnamespace Benchmark {\nnamespace Detail {\n\ndouble weighted_average_quantile(int k, int q,\n                                 std::vector<double>::iterator first,\n                                 std::vector<double>::iterator last) {\n  auto count = last - first;\n  double idx = (count - 1) * k / static_cast<double>(q);\n  int j      = static_cast<int>(idx);\n  double g   = idx - j;\n  std::nth_element(first, first + j, last);\n  auto xj = first[j];\n  if (g == 0)\n    return xj;\n\n  auto xj1 = *std::min_element(first + (j + 1), last);\n  return xj + g * (xj1 - xj);\n}\n\ndouble erfc_inv(double x) { return erf_inv(1.0 - x); }\n\ndouble normal_quantile(double p) {\n  static const double ROOT_TWO = std::sqrt(2.0);\n\n  double result = 0.0;\n  assert(p >= 0 && p <= 1);\n  if (p < 0 || p > 1) {\n    return result;\n  }\n\n  result = -erfc_inv(2.0 * p);\n  // result *= normal distribution standard deviation (1.0) * sqrt(2)\n  result *= /*sd * */ ROOT_TWO;\n  // result += normal disttribution mean (0)\n  return result;\n}\n\ndouble outlier_variance(Estimate<double> mean, Estimate<double> stddev, int n) {\n  double sb     = stddev.point;\n  double mn     = mean.point / n;\n  double mg_min = mn / 2.;\n  double sg     = (std::min)(mg_min / 4., sb / std::sqrt(n));\n  double sg2    = sg * sg;\n  double sb2    = sb * sb;\n\n  auto c_max = [n, mn, sb2, sg2](double 
x) -> double {\n    double k   = mn - x;\n    double d   = k * k;\n    double nd  = n * d;\n    double k0  = -n * nd;\n    double k1  = sb2 - n * sg2 + nd;\n    double det = k1 * k1 - 4 * sg2 * k0;\n    return (int)(-2. * k0 / (k1 + std::sqrt(det)));\n  };\n\n  auto var_out = [n, sb2, sg2](double c) {\n    double nc = n - c;\n    return (nc / n) * (sb2 - nc * sg2);\n  };\n\n  return (std::min)(var_out(1), var_out((std::min)(c_max(0.), c_max(mg_min)))) /\n         sb2;\n}\n\nbootstrap_analysis analyse_samples(double confidence_level, int n_resamples,\n                                   std::vector<double>::iterator first,\n                                   std::vector<double>::iterator last) {\n  CATCH_INTERNAL_START_WARNINGS_SUPPRESSION\n  CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS\n  static std::random_device entropy;\n  CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION\n\n  auto n = static_cast<int>(\n      last -\n      first); // seriously, one can't use integral types without hell in C++\n\n  auto mean   = &Detail::mean<std::vector<double>::iterator>;\n  auto stddev = &standard_deviation;\n\n#if defined(CATCH_CONFIG_USE_ASYNC)\n  auto Estimate = [=](double (*f)(std::vector<double>::iterator,\n                                  std::vector<double>::iterator)) {\n    auto seed = entropy();\n    return std::async(std::launch::async, [=] {\n      std::mt19937 rng(seed);\n      auto resampled = resample(rng, n_resamples, first, last, f);\n      return bootstrap(confidence_level, first, last, resampled, f);\n    });\n  };\n\n  auto mean_future   = Estimate(mean);\n  auto stddev_future = Estimate(stddev);\n\n  auto mean_estimate   = mean_future.get();\n  auto stddev_estimate = stddev_future.get();\n#else\n  auto Estimate = [=](double (*f)(std::vector<double>::iterator,\n                                  std::vector<double>::iterator)) {\n    auto seed = entropy();\n    std::mt19937 rng(seed);\n    auto resampled = resample(rng, n_resamples, first, last, f);\n    return 
bootstrap(confidence_level, first, last, resampled, f);\n  };\n\n  auto mean_estimate   = Estimate(mean);\n  auto stddev_estimate = Estimate(stddev);\n#endif // CATCH_USE_ASYNC\n\n  double outlier_variance =\n      Detail::outlier_variance(mean_estimate, stddev_estimate, n);\n\n  return {mean_estimate, stddev_estimate, outlier_variance};\n}\n} // namespace Detail\n} // namespace Benchmark\n} // namespace Catch\n\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n// end catch_stats.cpp\n// start catch_approx.cpp\n\n#include <cmath>\n#include <limits>\n\nnamespace {\n\n// Performs equivalent check of std::fabs(lhs - rhs) <= margin\n// But without the subtraction to allow for INFINITY in comparison\nbool marginComparison(double lhs, double rhs, double margin) {\n  return (lhs + margin >= rhs) && (rhs + margin >= lhs);\n}\n\n} // namespace\n\nnamespace Catch {\nnamespace Detail {\n\nApprox::Approx(double value)\n    : m_epsilon(std::numeric_limits<float>::epsilon() * 100), m_margin(0.0),\n      m_scale(0.0), m_value(value) {}\n\nApprox Approx::custom() { return Approx(0); }\n\nApprox Approx::operator-() const {\n  auto temp(*this);\n  temp.m_value = -temp.m_value;\n  return temp;\n}\n\nstd::string Approx::toString() const {\n  ReusableStringStream rss;\n  rss << \"Approx( \" << ::Catch::Detail::stringify(m_value) << \" )\";\n  return rss.str();\n}\n\nbool Approx::equalityComparisonImpl(const double other) const {\n  // First try with fixed margin, then compute margin based on epsilon, scale\n  // and Approx's value Thanks to Richard Harris for his help refining the\n  // scaled margin value\n  return marginComparison(m_value, other, m_margin) ||\n         marginComparison(\n             m_value, other,\n             m_epsilon *\n                 (m_scale + std::fabs(std::isinf(m_value) ? 
0 : m_value)));\n}\n\nvoid Approx::setMargin(double newMargin) {\n  CATCH_ENFORCE(newMargin >= 0,\n                \"Invalid Approx::margin: \"\n                    << newMargin << '.'\n                    << \" Approx::Margin has to be non-negative.\");\n  m_margin = newMargin;\n}\n\nvoid Approx::setEpsilon(double newEpsilon) {\n  CATCH_ENFORCE(newEpsilon >= 0 && newEpsilon <= 1.0,\n                \"Invalid Approx::epsilon: \"\n                    << newEpsilon << '.'\n                    << \" Approx::epsilon has to be in [0, 1]\");\n  m_epsilon = newEpsilon;\n}\n\n} // end namespace Detail\n\nnamespace literals {\nDetail::Approx operator\"\" _a(long double val) { return Detail::Approx(val); }\nDetail::Approx operator\"\" _a(unsigned long long val) {\n  return Detail::Approx(val);\n}\n} // end namespace literals\n\nstd::string StringMaker<Catch::Detail::Approx>::convert(\n    Catch::Detail::Approx const& value) {\n  return value.toString();\n}\n\n} // end namespace Catch\n// end catch_approx.cpp\n// start catch_assertionhandler.cpp\n\n// start catch_debugger.h\n\nnamespace Catch {\nbool isDebuggerActive();\n}\n\n#ifdef CATCH_PLATFORM_MAC\n\n#if defined(__i386__) || defined(__x86_64__)\n#define CATCH_TRAP() __asm__(\"int $3\\n\" : :) /* NOLINT */\n#elif defined(__aarch64__)\n#define CATCH_TRAP() __asm__(\".inst 0xd4200000\")\n#endif\n\n#elif defined(CATCH_PLATFORM_IPHONE)\n\n// use inline assembler\n#if defined(__i386__) || defined(__x86_64__)\n#define CATCH_TRAP() __asm__(\"int $3\")\n#elif defined(__aarch64__)\n#define CATCH_TRAP() __asm__(\".inst 0xd4200000\")\n#elif defined(__arm__) && !defined(__thumb__)\n#define CATCH_TRAP() __asm__(\".inst 0xe7f001f0\")\n#elif defined(__arm__) && defined(__thumb__)\n#define CATCH_TRAP() __asm__(\".inst 0xde01\")\n#endif\n\n#elif defined(CATCH_PLATFORM_LINUX)\n// If we can use inline assembler, do it because this allows us to break\n// directly at the location of the failing check instead of breaking inside\n// raise() 
called from it, i.e. one stack frame below.\n#if defined(__GNUC__) && (defined(__i386) || defined(__x86_64))\n#define CATCH_TRAP() asm volatile(\"int $3\") /* NOLINT */\n#else                                       // Fall back to the generic way.\n#include <signal.h>\n\n#define CATCH_TRAP() raise(SIGTRAP)\n#endif\n#elif defined(_MSC_VER)\n#define CATCH_TRAP() __debugbreak()\n#elif defined(__MINGW32__)\nextern \"C\" __declspec(dllimport) void __stdcall DebugBreak();\n#define CATCH_TRAP() DebugBreak()\n#endif\n\n#ifndef CATCH_BREAK_INTO_DEBUGGER\n#ifdef CATCH_TRAP\n#define CATCH_BREAK_INTO_DEBUGGER()                                            \\\n  [] {                                                                         \\\n    if (Catch::isDebuggerActive()) {                                           \\\n      CATCH_TRAP();                                                            \\\n    }                                                                          \\\n  }()\n#else\n#define CATCH_BREAK_INTO_DEBUGGER() [] {}()\n#endif\n#endif\n\n// end catch_debugger.h\n// start catch_run_context.h\n\n// start catch_fatal_condition.h\n\n#include <cassert>\n\nnamespace Catch {\n\n// Wrapper for platform-specific fatal error (signals/SEH) handlers\n//\n// Tries to be cooperative with other handlers, and not step over\n// other handlers. This means that unknown structured exceptions\n// are passed on, previous signal handlers are called, and so on.\n//\n// Can only be instantiated once, and assumes that once a signal\n// is caught, the binary will end up terminating. 
Thus, there\nclass FatalConditionHandler {\n  bool m_started = false;\n\n  // Install/disengage implementation for specific platform.\n  // Should be if-defed to work on current platform, can assume\n  // engage-disengage 1:1 pairing.\n  void engage_platform();\n  void disengage_platform();\n\npublic:\n  // Should also have platform-specific implementations as needed\n  FatalConditionHandler();\n  ~FatalConditionHandler();\n\n  void engage() {\n    assert(!m_started && \"Handler cannot be installed twice.\");\n    m_started = true;\n    engage_platform();\n  }\n\n  void disengage() {\n    assert(m_started &&\n           \"Handler cannot be uninstalled without being installed first\");\n    m_started = false;\n    disengage_platform();\n  }\n};\n\n//! Simple RAII guard for (dis)engaging the FatalConditionHandler\nclass FatalConditionHandlerGuard {\n  FatalConditionHandler* m_handler;\n\npublic:\n  FatalConditionHandlerGuard(FatalConditionHandler* handler)\n      : m_handler(handler) {\n    m_handler->engage();\n  }\n  ~FatalConditionHandlerGuard() { m_handler->disengage(); }\n};\n\n} // end namespace Catch\n\n// end catch_fatal_condition.h\n#include <string>\n\nnamespace Catch {\n\nstruct IMutableContext;\n\n///////////////////////////////////////////////////////////////////////////\n\nclass RunContext : public IResultCapture, public IRunner {\n\npublic:\n  RunContext(RunContext const&)            = delete;\n  RunContext& operator=(RunContext const&) = delete;\n\n  explicit RunContext(IConfigPtr const& _config,\n                      IStreamingReporterPtr&& reporter);\n\n  ~RunContext() override;\n\n  void testGroupStarting(std::string const& testSpec, std::size_t groupIndex,\n                         std::size_t groupsCount);\n  void testGroupEnded(std::string const& testSpec, Totals const& totals,\n                      std::size_t groupIndex, std::size_t groupsCount);\n\n  Totals runTest(TestCase const& testCase);\n\n  IConfigPtr config() const;\n  
IStreamingReporter& reporter() const;\n\npublic: // IResultCapture\n  // Assertion handlers\n  void handleExpr(AssertionInfo const& info, ITransientExpression const& expr,\n                  AssertionReaction& reaction) override;\n  void handleMessage(AssertionInfo const& info, ResultWas::OfType resultType,\n                     StringRef const& message,\n                     AssertionReaction& reaction) override;\n  void handleUnexpectedExceptionNotThrown(AssertionInfo const& info,\n                                          AssertionReaction& reaction) override;\n  void handleUnexpectedInflightException(AssertionInfo const& info,\n                                         std::string const& message,\n                                         AssertionReaction& reaction) override;\n  void handleIncomplete(AssertionInfo const& info) override;\n  void handleNonExpr(AssertionInfo const& info, ResultWas::OfType resultType,\n                     AssertionReaction& reaction) override;\n\n  bool sectionStarted(SectionInfo const& sectionInfo,\n                      Counts& assertions) override;\n\n  void sectionEnded(SectionEndInfo const& endInfo) override;\n  void sectionEndedEarly(SectionEndInfo const& endInfo) override;\n\n  auto acquireGeneratorTracker(StringRef generatorName,\n                               SourceLineInfo const& lineInfo)\n      -> IGeneratorTracker& override;\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n  void benchmarkPreparing(std::string const& name) override;\n  void benchmarkStarting(BenchmarkInfo const& info) override;\n  void benchmarkEnded(BenchmarkStats<> const& stats) override;\n  void benchmarkFailed(std::string const& error) override;\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\n  void pushScopedMessage(MessageInfo const& message) override;\n  void popScopedMessage(MessageInfo const& message) override;\n\n  void emplaceUnscopedMessage(MessageBuilder const& builder) override;\n\n  std::string getCurrentTestName() const override;\n\n  
const AssertionResult* getLastResult() const override;\n\n  void exceptionEarlyReported() override;\n\n  void handleFatalErrorCondition(StringRef message) override;\n\n  bool lastAssertionPassed() override;\n\n  void assertionPassed() override;\n\npublic:\n  // !TBD We need to do this another way!\n  bool aborting() const final;\n\nprivate:\n  void runCurrentTest(std::string& redirectedCout, std::string& redirectedCerr);\n  void invokeActiveTestCase();\n\n  void resetAssertionInfo();\n  bool testForMissingAssertions(Counts& assertions);\n\n  void assertionEnded(AssertionResult const& result);\n  void reportExpr(AssertionInfo const& info, ResultWas::OfType resultType,\n                  ITransientExpression const* expr, bool negated);\n\n  void populateReaction(AssertionReaction& reaction);\n\nprivate:\n  void handleUnfinishedSections();\n\n  TestRunInfo m_runInfo;\n  IMutableContext& m_context;\n  TestCase const* m_activeTestCase = nullptr;\n  ITracker* m_testCaseTracker      = nullptr;\n  Option<AssertionResult> m_lastResult;\n\n  IConfigPtr m_config;\n  Totals m_totals;\n  IStreamingReporterPtr m_reporter;\n  std::vector<MessageInfo> m_messages;\n  std::vector<ScopedMessage>\n      m_messageScopes; /* Keeps owners of so-called unscoped messages. 
*/\n  AssertionInfo m_lastAssertionInfo;\n  std::vector<SectionEndInfo> m_unfinishedSections;\n  std::vector<ITracker*> m_activeSections;\n  TrackerContext m_trackerContext;\n  FatalConditionHandler m_fatalConditionhandler;\n  bool m_lastAssertionPassed    = false;\n  bool m_shouldReportUnexpected = true;\n  bool m_includeSuccessfulResults;\n};\n\nvoid seedRng(IConfig const& config);\nunsigned int rngSeed();\n} // end namespace Catch\n\n// end catch_run_context.h\nnamespace Catch {\n\nnamespace {\nauto operator<<(std::ostream& os, ITransientExpression const& expr)\n    -> std::ostream& {\n  expr.streamReconstructedExpression(os);\n  return os;\n}\n} // namespace\n\nLazyExpression::LazyExpression(bool isNegated) : m_isNegated(isNegated) {}\n\nLazyExpression::LazyExpression(LazyExpression const& other)\n    : m_isNegated(other.m_isNegated) {}\n\nLazyExpression::operator bool() const {\n  return m_transientExpression != nullptr;\n}\n\nauto operator<<(std::ostream& os, LazyExpression const& lazyExpr)\n    -> std::ostream& {\n  if (lazyExpr.m_isNegated)\n    os << \"!\";\n\n  if (lazyExpr) {\n    if (lazyExpr.m_isNegated &&\n        lazyExpr.m_transientExpression->isBinaryExpression())\n      os << \"(\" << *lazyExpr.m_transientExpression << \")\";\n    else\n      os << *lazyExpr.m_transientExpression;\n  } else {\n    os << \"{** error - unchecked empty expression requested **}\";\n  }\n  return os;\n}\n\nAssertionHandler::AssertionHandler(StringRef const& macroName,\n                                   SourceLineInfo const& lineInfo,\n                                   StringRef capturedExpression,\n                                   ResultDisposition::Flags resultDisposition)\n    : m_assertionInfo{macroName, lineInfo, capturedExpression,\n                      resultDisposition},\n      m_resultCapture(getResultCapture()) {}\n\nvoid AssertionHandler::handleExpr(ITransientExpression const& expr) {\n  m_resultCapture.handleExpr(m_assertionInfo, expr, 
m_reaction);\n}\nvoid AssertionHandler::handleMessage(ResultWas::OfType resultType,\n                                     StringRef const& message) {\n  m_resultCapture.handleMessage(m_assertionInfo, resultType, message,\n                                m_reaction);\n}\n\nauto AssertionHandler::allowThrows() const -> bool {\n  return getCurrentContext().getConfig()->allowThrows();\n}\n\nvoid AssertionHandler::complete() {\n  setCompleted();\n  if (m_reaction.shouldDebugBreak) {\n\n    // If you find your debugger stopping you here then go one level up on the\n    // call-stack for the code that caused it (typically a failed assertion)\n\n    // (To go back to the test and change execution, jump over the throw, next)\n    CATCH_BREAK_INTO_DEBUGGER();\n  }\n  if (m_reaction.shouldThrow) {\n#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\n    throw Catch::TestFailureException();\n#else\n    CATCH_ERROR(\"Test failure requires aborting test!\");\n#endif\n  }\n}\nvoid AssertionHandler::setCompleted() { m_completed = true; }\n\nvoid AssertionHandler::handleUnexpectedInflightException() {\n  m_resultCapture.handleUnexpectedInflightException(\n      m_assertionInfo, Catch::translateActiveException(), m_reaction);\n}\n\nvoid AssertionHandler::handleExceptionThrownAsExpected() {\n  m_resultCapture.handleNonExpr(m_assertionInfo, ResultWas::Ok, m_reaction);\n}\nvoid AssertionHandler::handleExceptionNotThrownAsExpected() {\n  m_resultCapture.handleNonExpr(m_assertionInfo, ResultWas::Ok, m_reaction);\n}\n\nvoid AssertionHandler::handleUnexpectedExceptionNotThrown() {\n  m_resultCapture.handleUnexpectedExceptionNotThrown(m_assertionInfo,\n                                                     m_reaction);\n}\n\nvoid AssertionHandler::handleThrowingCallSkipped() {\n  m_resultCapture.handleNonExpr(m_assertionInfo, ResultWas::Ok, m_reaction);\n}\n\n// This is the overload that takes a string and infers the Equals matcher from\n// it The more general overload, that takes any string 
matcher, is in\n// catch_capture_matchers.cpp\nvoid handleExceptionMatchExpr(AssertionHandler& handler, std::string const& str,\n                              StringRef const& matcherString) {\n  handleExceptionMatchExpr(handler, Matchers::Equals(str), matcherString);\n}\n\n} // namespace Catch\n// end catch_assertionhandler.cpp\n// start catch_assertionresult.cpp\n\nnamespace Catch {\nAssertionResultData::AssertionResultData(ResultWas::OfType _resultType,\n                                         LazyExpression const& _lazyExpression)\n    : lazyExpression(_lazyExpression), resultType(_resultType) {}\n\nstd::string AssertionResultData::reconstructExpression() const {\n\n  if (reconstructedExpression.empty()) {\n    if (lazyExpression) {\n      ReusableStringStream rss;\n      rss << lazyExpression;\n      reconstructedExpression = rss.str();\n    }\n  }\n  return reconstructedExpression;\n}\n\nAssertionResult::AssertionResult(AssertionInfo const& info,\n                                 AssertionResultData const& data)\n    : m_info(info), m_resultData(data) {}\n\n// Result was a success\nbool AssertionResult::succeeded() const {\n  return Catch::isOk(m_resultData.resultType);\n}\n\n// Result was a success, or failure is suppressed\nbool AssertionResult::isOk() const {\n  return Catch::isOk(m_resultData.resultType) ||\n         shouldSuppressFailure(m_info.resultDisposition);\n}\n\nResultWas::OfType AssertionResult::getResultType() const {\n  return m_resultData.resultType;\n}\n\nbool AssertionResult::hasExpression() const {\n  return !m_info.capturedExpression.empty();\n}\n\nbool AssertionResult::hasMessage() const {\n  return !m_resultData.message.empty();\n}\n\nstd::string AssertionResult::getExpression() const {\n  // Possibly overallocating by 3 characters should be basically free\n  std::string expr;\n  expr.reserve(m_info.capturedExpression.size() + 3);\n  if (isFalseTest(m_info.resultDisposition)) {\n    expr += \"!(\";\n  }\n  expr += 
m_info.capturedExpression;\n  if (isFalseTest(m_info.resultDisposition)) {\n    expr += ')';\n  }\n  return expr;\n}\n\nstd::string AssertionResult::getExpressionInMacro() const {\n  std::string expr;\n  if (m_info.macroName.empty())\n    expr = static_cast<std::string>(m_info.capturedExpression);\n  else {\n    expr.reserve(m_info.macroName.size() + m_info.capturedExpression.size() +\n                 4);\n    expr += m_info.macroName;\n    expr += \"( \";\n    expr += m_info.capturedExpression;\n    expr += \" )\";\n  }\n  return expr;\n}\n\nbool AssertionResult::hasExpandedExpression() const {\n  return hasExpression() && getExpandedExpression() != getExpression();\n}\n\nstd::string AssertionResult::getExpandedExpression() const {\n  std::string expr = m_resultData.reconstructExpression();\n  return expr.empty() ? getExpression() : expr;\n}\n\nstd::string AssertionResult::getMessage() const { return m_resultData.message; }\nSourceLineInfo AssertionResult::getSourceInfo() const {\n  return m_info.lineInfo;\n}\n\nStringRef AssertionResult::getTestMacroName() const { return m_info.macroName; }\n\n} // end namespace Catch\n// end catch_assertionresult.cpp\n// start catch_capture_matchers.cpp\n\nnamespace Catch {\n\nusing StringMatcher = Matchers::Impl::MatcherBase<std::string>;\n\n// This is the general overload that takes a any string matcher\n// There is another overload, in catch_assertionhandler.h/.cpp, that only takes\n// a string and infers the Equals matcher (so the header does not mention\n// matchers)\nvoid handleExceptionMatchExpr(AssertionHandler& handler,\n                              StringMatcher const& matcher,\n                              StringRef const& matcherString) {\n  std::string exceptionMessage = Catch::translateActiveException();\n  MatchExpr<std::string, StringMatcher const&> expr(exceptionMessage, matcher,\n                                                    matcherString);\n  handler.handleExpr(expr);\n}\n\n} // namespace Catch\n// 
end catch_capture_matchers.cpp\n// start catch_commandline.cpp\n\n// start catch_commandline.h\n\n// start catch_clara.h\n\n// Use Catch's value for console width (store Clara's off to the side, if\n// present)\n#ifdef CLARA_CONFIG_CONSOLE_WIDTH\n#define CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH                                  \\\n  CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH\n#undef CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH\n#endif\n#define CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH CATCH_CONFIG_CONSOLE_WIDTH - 1\n\n#ifdef __clang__\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wweak-vtables\"\n#pragma clang diagnostic ignored \"-Wexit-time-destructors\"\n#pragma clang diagnostic ignored \"-Wshadow\"\n#endif\n\n// start clara.hpp\n// Copyright 2017 Two Blue Cubes Ltd. All rights reserved.\n//\n// Distributed under the Boost Software License, Version 1.0. (See accompanying\n// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n//\n// See https://github.com/philsquared/Clara for more details\n\n// Clara v1.1.5\n\n#ifndef CATCH_CLARA_CONFIG_CONSOLE_WIDTH\n#define CATCH_CLARA_CONFIG_CONSOLE_WIDTH 80\n#endif\n\n#ifndef CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH\n#define CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH                              \\\n  CATCH_CLARA_CONFIG_CONSOLE_WIDTH\n#endif\n\n#ifndef CLARA_CONFIG_OPTIONAL_TYPE\n#ifdef __has_include\n#if __has_include(<optional>) && __cplusplus >= 201703L\n#include <optional>\n#define CLARA_CONFIG_OPTIONAL_TYPE std::optional\n#endif\n#endif\n#endif\n\n// ----------- #included from clara_textflow.hpp -----------\n\n// TextFlowCpp\n//\n// A single-header library for wrapping and laying out basic text, by Phil Nash\n//\n// Distributed under the Boost Software License, Version 1.0. 
(See accompanying\n// file LICENSE.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n//\n// This project is hosted at https://github.com/philsquared/textflowcpp\n\n#include <cassert>\n#include <ostream>\n#include <sstream>\n#include <vector>\n\n#ifndef CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH\n#define CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH 80\n#endif\n\nnamespace Catch {\nnamespace clara {\nnamespace TextFlow {\n\ninline auto isWhitespace(char c) -> bool {\n  static std::string chars = \" \\t\\n\\r\";\n  return chars.find(c) != std::string::npos;\n}\ninline auto isBreakableBefore(char c) -> bool {\n  static std::string chars = \"[({<|\";\n  return chars.find(c) != std::string::npos;\n}\ninline auto isBreakableAfter(char c) -> bool {\n  static std::string chars = \"])}>.,:;*+-=&/\\\\\";\n  return chars.find(c) != std::string::npos;\n}\n\nclass Columns;\n\nclass Column {\n  std::vector<std::string> m_strings;\n  size_t m_width         = CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH;\n  size_t m_indent        = 0;\n  size_t m_initialIndent = std::string::npos;\n\npublic:\n  class iterator {\n    friend Column;\n\n    Column const& m_column;\n    size_t m_stringIndex = 0;\n    size_t m_pos         = 0;\n\n    size_t m_len  = 0;\n    size_t m_end  = 0;\n    bool m_suffix = false;\n\n    iterator(Column const& column, size_t stringIndex)\n        : m_column(column), m_stringIndex(stringIndex) {}\n\n    auto line() const -> std::string const& {\n      return m_column.m_strings[m_stringIndex];\n    }\n\n    auto isBoundary(size_t at) const -> bool {\n      assert(at > 0);\n      assert(at <= line().size());\n\n      return at == line().size() ||\n             (isWhitespace(line()[at]) && !isWhitespace(line()[at - 1])) ||\n             isBreakableBefore(line()[at]) || isBreakableAfter(line()[at - 1]);\n    }\n\n    void calcLength() {\n      assert(m_stringIndex < m_column.m_strings.size());\n\n      m_suffix   = false;\n      auto width = m_column.m_width - indent();\n 
     m_end      = m_pos;\n      if (line()[m_pos] == '\\n') {\n        ++m_end;\n      }\n      while (m_end < line().size() && line()[m_end] != '\\n')\n        ++m_end;\n\n      if (m_end < m_pos + width) {\n        m_len = m_end - m_pos;\n      } else {\n        size_t len = width;\n        while (len > 0 && !isBoundary(m_pos + len))\n          --len;\n        while (len > 0 && isWhitespace(line()[m_pos + len - 1]))\n          --len;\n\n        if (len > 0) {\n          m_len = len;\n        } else {\n          m_suffix = true;\n          m_len    = width - 1;\n        }\n      }\n    }\n\n    auto indent() const -> size_t {\n      auto initial = m_pos == 0 && m_stringIndex == 0 ? m_column.m_initialIndent\n                                                      : std::string::npos;\n      return initial == std::string::npos ? m_column.m_indent : initial;\n    }\n\n    auto addIndentAndSuffix(std::string const& plain) const -> std::string {\n      return std::string(indent(), ' ') + (m_suffix ? 
plain + \"-\" : plain);\n    }\n\n  public:\n    using difference_type   = std::ptrdiff_t;\n    using value_type        = std::string;\n    using pointer           = value_type*;\n    using reference         = value_type&;\n    using iterator_category = std::forward_iterator_tag;\n\n    explicit iterator(Column const& column) : m_column(column) {\n      assert(m_column.m_width > m_column.m_indent);\n      assert(m_column.m_initialIndent == std::string::npos ||\n             m_column.m_width > m_column.m_initialIndent);\n      calcLength();\n      if (m_len == 0)\n        m_stringIndex++; // Empty string\n    }\n\n    auto operator*() const -> std::string {\n      assert(m_stringIndex < m_column.m_strings.size());\n      assert(m_pos <= m_end);\n      return addIndentAndSuffix(line().substr(m_pos, m_len));\n    }\n\n    auto operator++() -> iterator& {\n      m_pos += m_len;\n      if (m_pos < line().size() && line()[m_pos] == '\\n')\n        m_pos += 1;\n      else\n        while (m_pos < line().size() && isWhitespace(line()[m_pos]))\n          ++m_pos;\n\n      if (m_pos == line().size()) {\n        m_pos = 0;\n        ++m_stringIndex;\n      }\n      if (m_stringIndex < m_column.m_strings.size())\n        calcLength();\n      return *this;\n    }\n    auto operator++(int) -> iterator {\n      iterator prev(*this);\n      operator++();\n      return prev;\n    }\n\n    auto operator==(iterator const& other) const -> bool {\n      return m_pos == other.m_pos && m_stringIndex == other.m_stringIndex &&\n             &m_column == &other.m_column;\n    }\n    auto operator!=(iterator const& other) const -> bool {\n      return !operator==(other);\n    }\n  };\n  using const_iterator = iterator;\n\n  explicit Column(std::string const& text) { m_strings.push_back(text); }\n\n  auto width(size_t newWidth) -> Column& {\n    assert(newWidth > 0);\n    m_width = newWidth;\n    return *this;\n  }\n  auto indent(size_t newIndent) -> Column& {\n    m_indent = newIndent;\n    
return *this;\n  }\n  auto initialIndent(size_t newIndent) -> Column& {\n    m_initialIndent = newIndent;\n    return *this;\n  }\n\n  auto width() const -> size_t { return m_width; }\n  auto begin() const -> iterator { return iterator(*this); }\n  auto end() const -> iterator { return {*this, m_strings.size()}; }\n\n  inline friend std::ostream& operator<<(std::ostream& os, Column const& col) {\n    bool first = true;\n    for (auto line : col) {\n      if (first)\n        first = false;\n      else\n        os << \"\\n\";\n      os << line;\n    }\n    return os;\n  }\n\n  auto operator+(Column const& other) -> Columns;\n\n  auto toString() const -> std::string {\n    std::ostringstream oss;\n    oss << *this;\n    return oss.str();\n  }\n};\n\nclass Spacer : public Column {\n\npublic:\n  explicit Spacer(size_t spaceWidth) : Column(\"\") { width(spaceWidth); }\n};\n\nclass Columns {\n  std::vector<Column> m_columns;\n\npublic:\n  class iterator {\n    friend Columns;\n    struct EndTag {};\n\n    std::vector<Column> const& m_columns;\n    std::vector<Column::iterator> m_iterators;\n    size_t m_activeIterators;\n\n    iterator(Columns const& columns, EndTag)\n        : m_columns(columns.m_columns), m_activeIterators(0) {\n      m_iterators.reserve(m_columns.size());\n\n      for (auto const& col : m_columns)\n        m_iterators.push_back(col.end());\n    }\n\n  public:\n    using difference_type   = std::ptrdiff_t;\n    using value_type        = std::string;\n    using pointer           = value_type*;\n    using reference         = value_type&;\n    using iterator_category = std::forward_iterator_tag;\n\n    explicit iterator(Columns const& columns)\n        : m_columns(columns.m_columns), m_activeIterators(m_columns.size()) {\n      m_iterators.reserve(m_columns.size());\n\n      for (auto const& col : m_columns)\n        m_iterators.push_back(col.begin());\n    }\n\n    auto operator==(iterator const& other) const -> bool {\n      return m_iterators == 
other.m_iterators;\n    }\n    auto operator!=(iterator const& other) const -> bool {\n      return m_iterators != other.m_iterators;\n    }\n    auto operator*() const -> std::string {\n      std::string row, padding;\n\n      for (size_t i = 0; i < m_columns.size(); ++i) {\n        auto width = m_columns[i].width();\n        if (m_iterators[i] != m_columns[i].end()) {\n          std::string col = *m_iterators[i];\n          row += padding + col;\n          if (col.size() < width)\n            padding = std::string(width - col.size(), ' ');\n          else\n            padding = \"\";\n        } else {\n          padding += std::string(width, ' ');\n        }\n      }\n      return row;\n    }\n    auto operator++() -> iterator& {\n      for (size_t i = 0; i < m_columns.size(); ++i) {\n        if (m_iterators[i] != m_columns[i].end())\n          ++m_iterators[i];\n      }\n      return *this;\n    }\n    auto operator++(int) -> iterator {\n      iterator prev(*this);\n      operator++();\n      return prev;\n    }\n  };\n  using const_iterator = iterator;\n\n  auto begin() const -> iterator { return iterator(*this); }\n  auto end() const -> iterator { return {*this, iterator::EndTag()}; }\n\n  auto operator+=(Column const& col) -> Columns& {\n    m_columns.push_back(col);\n    return *this;\n  }\n  auto operator+(Column const& col) -> Columns {\n    Columns combined = *this;\n    combined += col;\n    return combined;\n  }\n\n  inline friend std::ostream& operator<<(std::ostream& os,\n                                         Columns const& cols) {\n\n    bool first = true;\n    for (auto line : cols) {\n      if (first)\n        first = false;\n      else\n        os << \"\\n\";\n      os << line;\n    }\n    return os;\n  }\n\n  auto toString() const -> std::string {\n    std::ostringstream oss;\n    oss << *this;\n    return oss.str();\n  }\n};\n\ninline auto Column::operator+(Column const& other) -> Columns {\n  Columns cols;\n  cols += *this;\n  cols += 
other;\n  return cols;\n}\n} // namespace TextFlow\n\n} // namespace clara\n} // namespace Catch\n\n// ----------- end of #include from clara_textflow.hpp -----------\n// ........... back in clara.hpp\n\n#include <cctype>\n#include <string>\n#include <memory>\n#include <set>\n#include <algorithm>\n\n#if !defined(CATCH_PLATFORM_WINDOWS) &&                                        \\\n    (defined(WIN32) || defined(__WIN32__) || defined(_WIN32) ||                \\\n     defined(_MSC_VER))\n#define CATCH_PLATFORM_WINDOWS\n#endif\n\nnamespace Catch {\nnamespace clara {\nnamespace detail {\n\n// Traits for extracting arg and return type of lambdas (for single argument\n// lambdas)\ntemplate <typename L>\nstruct UnaryLambdaTraits : UnaryLambdaTraits<decltype(&L::operator())> {};\n\ntemplate <typename ClassT, typename ReturnT, typename... Args>\nstruct UnaryLambdaTraits<ReturnT (ClassT::*)(Args...) const> {\n  static const bool isValid = false;\n};\n\ntemplate <typename ClassT, typename ReturnT, typename ArgT>\nstruct UnaryLambdaTraits<ReturnT (ClassT::*)(ArgT) const> {\n  static const bool isValid = true;\n  using ArgType             = typename std::remove_const<\n      typename std::remove_reference<ArgT>::type>::type;\n  using ReturnType = ReturnT;\n};\n\nclass TokenStream;\n\n// Transport for raw args (copied from main args, or supplied via init list for\n// testing)\nclass Args {\n  friend TokenStream;\n  std::string m_exeName;\n  std::vector<std::string> m_args;\n\npublic:\n  Args(int argc, char const* const* argv)\n      : m_exeName(argv[0]), m_args(argv + 1, argv + argc) {}\n\n  Args(std::initializer_list<std::string> args)\n      : m_exeName(*args.begin()), m_args(args.begin() + 1, args.end()) {}\n\n  auto exeName() const -> std::string { return m_exeName; }\n};\n\n// Wraps a token coming from a token stream. 
These may not directly correspond\n// to strings as a single string may encode an option + its argument if the : or\n// = form is used\nenum class TokenType { Option, Argument };\nstruct Token {\n  TokenType type;\n  std::string token;\n};\n\ninline auto isOptPrefix(char c) -> bool {\n  return c == '-'\n#ifdef CATCH_PLATFORM_WINDOWS\n         || c == '/'\n#endif\n      ;\n}\n\n// Abstracts iterators into args as a stream of tokens, with option arguments\n// uniformly handled\nclass TokenStream {\n  using Iterator = std::vector<std::string>::const_iterator;\n  Iterator it;\n  Iterator itEnd;\n  std::vector<Token> m_tokenBuffer;\n\n  void loadBuffer() {\n    m_tokenBuffer.resize(0);\n\n    // Skip any empty strings\n    while (it != itEnd && it->empty())\n      ++it;\n\n    if (it != itEnd) {\n      auto const& next = *it;\n      if (isOptPrefix(next[0])) {\n        auto delimiterPos = next.find_first_of(\" :=\");\n        if (delimiterPos != std::string::npos) {\n          m_tokenBuffer.push_back(\n              {TokenType::Option, next.substr(0, delimiterPos)});\n          m_tokenBuffer.push_back(\n              {TokenType::Argument, next.substr(delimiterPos + 1)});\n        } else {\n          if (next[1] != '-' && next.size() > 2) {\n            std::string opt = \"- \";\n            for (size_t i = 1; i < next.size(); ++i) {\n              opt[1] = next[i];\n              m_tokenBuffer.push_back({TokenType::Option, opt});\n            }\n          } else {\n            m_tokenBuffer.push_back({TokenType::Option, next});\n          }\n        }\n      } else {\n        m_tokenBuffer.push_back({TokenType::Argument, next});\n      }\n    }\n  }\n\npublic:\n  explicit TokenStream(Args const& args)\n      : TokenStream(args.m_args.begin(), args.m_args.end()) {}\n\n  TokenStream(Iterator it, Iterator itEnd) : it(it), itEnd(itEnd) {\n    loadBuffer();\n  }\n\n  explicit operator bool() const {\n    return !m_tokenBuffer.empty() || it != itEnd;\n  }\n\n  auto count() 
const -> size_t { return m_tokenBuffer.size() + (itEnd - it); }\n\n  auto operator*() const -> Token {\n    assert(!m_tokenBuffer.empty());\n    return m_tokenBuffer.front();\n  }\n\n  auto operator->() const -> Token const* {\n    assert(!m_tokenBuffer.empty());\n    return &m_tokenBuffer.front();\n  }\n\n  auto operator++() -> TokenStream& {\n    if (m_tokenBuffer.size() >= 2) {\n      m_tokenBuffer.erase(m_tokenBuffer.begin());\n    } else {\n      if (it != itEnd)\n        ++it;\n      loadBuffer();\n    }\n    return *this;\n  }\n};\n\nclass ResultBase {\npublic:\n  enum Type { Ok, LogicError, RuntimeError };\n\nprotected:\n  ResultBase(Type type) : m_type(type) {}\n  virtual ~ResultBase() = default;\n\n  virtual void enforceOk() const = 0;\n\n  Type m_type;\n};\n\ntemplate <typename T>\nclass ResultValueBase : public ResultBase {\npublic:\n  auto value() const -> T const& {\n    enforceOk();\n    return m_value;\n  }\n\nprotected:\n  ResultValueBase(Type type) : ResultBase(type) {}\n\n  ResultValueBase(ResultValueBase const& other) : ResultBase(other) {\n    if (m_type == ResultBase::Ok)\n      new (&m_value) T(other.m_value);\n  }\n\n  ResultValueBase(Type, T const& value) : ResultBase(Ok) {\n    new (&m_value) T(value);\n  }\n\n  auto operator=(ResultValueBase const& other) -> ResultValueBase& {\n    if (m_type == ResultBase::Ok)\n      m_value.~T();\n    ResultBase::operator=(other);\n    if (m_type == ResultBase::Ok)\n      new (&m_value) T(other.m_value);\n    return *this;\n  }\n\n  ~ResultValueBase() override {\n    if (m_type == Ok)\n      m_value.~T();\n  }\n\n  union {\n    T m_value;\n  };\n};\n\ntemplate <>\nclass ResultValueBase<void> : public ResultBase {\nprotected:\n  using ResultBase::ResultBase;\n};\n\ntemplate <typename T = void>\nclass BasicResult : public ResultValueBase<T> {\npublic:\n  template <typename U>\n  explicit BasicResult(BasicResult<U> const& other)\n      : ResultValueBase<T>(other.type()), 
m_errorMessage(other.errorMessage()) {\n    assert(type() != ResultBase::Ok);\n  }\n\n  template <typename U>\n  static auto ok(U const& value) -> BasicResult {\n    return {ResultBase::Ok, value};\n  }\n  static auto ok() -> BasicResult { return {ResultBase::Ok}; }\n  static auto logicError(std::string const& message) -> BasicResult {\n    return {ResultBase::LogicError, message};\n  }\n  static auto runtimeError(std::string const& message) -> BasicResult {\n    return {ResultBase::RuntimeError, message};\n  }\n\n  explicit operator bool() const { return m_type == ResultBase::Ok; }\n  auto type() const -> ResultBase::Type { return m_type; }\n  auto errorMessage() const -> std::string { return m_errorMessage; }\n\nprotected:\n  void enforceOk() const override {\n\n    // Errors shouldn't reach this point, but if they do\n    // the actual error message will be in m_errorMessage\n    assert(m_type != ResultBase::LogicError);\n    assert(m_type != ResultBase::RuntimeError);\n    if (m_type != ResultBase::Ok)\n      std::abort();\n  }\n\n  std::string m_errorMessage; // Only populated if resultType is an error\n\n  BasicResult(ResultBase::Type type, std::string const& message)\n      : ResultValueBase<T>(type), m_errorMessage(message) {\n    assert(m_type != ResultBase::Ok);\n  }\n\n  using ResultValueBase<T>::ResultValueBase;\n  using ResultBase::m_type;\n};\n\nenum class ParseResultType {\n  Matched,\n  NoMatch,\n  ShortCircuitAll,\n  ShortCircuitSame\n};\n\nclass ParseState {\npublic:\n  ParseState(ParseResultType type, TokenStream const& remainingTokens)\n      : m_type(type), m_remainingTokens(remainingTokens) {}\n\n  auto type() const -> ParseResultType { return m_type; }\n  auto remainingTokens() const -> TokenStream { return m_remainingTokens; }\n\nprivate:\n  ParseResultType m_type;\n  TokenStream m_remainingTokens;\n};\n\nusing Result              = BasicResult<void>;\nusing ParserResult        = BasicResult<ParseResultType>;\nusing InternalParseResult = 
BasicResult<ParseState>;\n\nstruct HelpColumns {\n  std::string left;\n  std::string right;\n};\n\ntemplate <typename T>\ninline auto convertInto(std::string const& source, T& target) -> ParserResult {\n  std::stringstream ss;\n  ss << source;\n  ss >> target;\n  if (ss.fail())\n    return ParserResult::runtimeError(\"Unable to convert '\" + source +\n                                      \"' to destination type\");\n  else\n    return ParserResult::ok(ParseResultType::Matched);\n}\ninline auto convertInto(std::string const& source, std::string& target)\n    -> ParserResult {\n  target = source;\n  return ParserResult::ok(ParseResultType::Matched);\n}\ninline auto convertInto(std::string const& source, bool& target)\n    -> ParserResult {\n  std::string srcLC = source;\n  std::transform(\n      srcLC.begin(), srcLC.end(), srcLC.begin(),\n      [](unsigned char c) { return static_cast<char>(std::tolower(c)); });\n  if (srcLC == \"y\" || srcLC == \"1\" || srcLC == \"true\" || srcLC == \"yes\" ||\n      srcLC == \"on\")\n    target = true;\n  else if (srcLC == \"n\" || srcLC == \"0\" || srcLC == \"false\" || srcLC == \"no\" ||\n           srcLC == \"off\")\n    target = false;\n  else\n    return ParserResult::runtimeError(\n        \"Expected a boolean value but did not recognise: '\" + source + \"'\");\n  return ParserResult::ok(ParseResultType::Matched);\n}\n#ifdef CLARA_CONFIG_OPTIONAL_TYPE\ntemplate <typename T>\ninline auto convertInto(std::string const& source,\n                        CLARA_CONFIG_OPTIONAL_TYPE<T>& target) -> ParserResult {\n  T temp;\n  auto result = convertInto(source, temp);\n  if (result)\n    target = std::move(temp);\n  return result;\n}\n#endif // CLARA_CONFIG_OPTIONAL_TYPE\n\nstruct NonCopyable {\n  NonCopyable()                              = default;\n  NonCopyable(NonCopyable const&)            = delete;\n  NonCopyable(NonCopyable&&)                 = delete;\n  NonCopyable& operator=(NonCopyable const&) = delete;\n  NonCopyable& 
operator=(NonCopyable&&)      = delete;\n};\n\nstruct BoundRef : NonCopyable {\n  virtual ~BoundRef() = default;\n  virtual auto isContainer() const -> bool { return false; }\n  virtual auto isFlag() const -> bool { return false; }\n};\nstruct BoundValueRefBase : BoundRef {\n  virtual auto setValue(std::string const& arg) -> ParserResult = 0;\n};\nstruct BoundFlagRefBase : BoundRef {\n  virtual auto setFlag(bool flag) -> ParserResult = 0;\n  virtual auto isFlag() const -> bool { return true; }\n};\n\ntemplate <typename T>\nstruct BoundValueRef : BoundValueRefBase {\n  T& m_ref;\n\n  explicit BoundValueRef(T& ref) : m_ref(ref) {}\n\n  auto setValue(std::string const& arg) -> ParserResult override {\n    return convertInto(arg, m_ref);\n  }\n};\n\ntemplate <typename T>\nstruct BoundValueRef<std::vector<T>> : BoundValueRefBase {\n  std::vector<T>& m_ref;\n\n  explicit BoundValueRef(std::vector<T>& ref) : m_ref(ref) {}\n\n  auto isContainer() const -> bool override { return true; }\n\n  auto setValue(std::string const& arg) -> ParserResult override {\n    T temp;\n    auto result = convertInto(arg, temp);\n    if (result)\n      m_ref.push_back(temp);\n    return result;\n  }\n};\n\nstruct BoundFlagRef : BoundFlagRefBase {\n  bool& m_ref;\n\n  explicit BoundFlagRef(bool& ref) : m_ref(ref) {}\n\n  auto setFlag(bool flag) -> ParserResult override {\n    m_ref = flag;\n    return ParserResult::ok(ParseResultType::Matched);\n  }\n};\n\ntemplate <typename ReturnType>\nstruct LambdaInvoker {\n  static_assert(std::is_same<ReturnType, ParserResult>::value,\n                \"Lambda must return void or clara::ParserResult\");\n\n  template <typename L, typename ArgType>\n  static auto invoke(L const& lambda, ArgType const& arg) -> ParserResult {\n    return lambda(arg);\n  }\n};\n\ntemplate <>\nstruct LambdaInvoker<void> {\n  template <typename L, typename ArgType>\n  static auto invoke(L const& lambda, ArgType const& arg) -> ParserResult {\n    lambda(arg);\n    return 
ParserResult::ok(ParseResultType::Matched);\n  }\n};\n\ntemplate <typename ArgType, typename L>\ninline auto invokeLambda(L const& lambda, std::string const& arg)\n    -> ParserResult {\n  ArgType temp{};\n  auto result = convertInto(arg, temp);\n  return !result\n             ? result\n             : LambdaInvoker<typename UnaryLambdaTraits<L>::ReturnType>::invoke(\n                   lambda, temp);\n}\n\ntemplate <typename L>\nstruct BoundLambda : BoundValueRefBase {\n  L m_lambda;\n\n  static_assert(UnaryLambdaTraits<L>::isValid,\n                \"Supplied lambda must take exactly one argument\");\n  explicit BoundLambda(L const& lambda) : m_lambda(lambda) {}\n\n  auto setValue(std::string const& arg) -> ParserResult override {\n    return invokeLambda<typename UnaryLambdaTraits<L>::ArgType>(m_lambda, arg);\n  }\n};\n\ntemplate <typename L>\nstruct BoundFlagLambda : BoundFlagRefBase {\n  L m_lambda;\n\n  static_assert(UnaryLambdaTraits<L>::isValid,\n                \"Supplied lambda must take exactly one argument\");\n  static_assert(\n      std::is_same<typename UnaryLambdaTraits<L>::ArgType, bool>::value,\n      \"flags must be boolean\");\n\n  explicit BoundFlagLambda(L const& lambda) : m_lambda(lambda) {}\n\n  auto setFlag(bool flag) -> ParserResult override {\n    return LambdaInvoker<typename UnaryLambdaTraits<L>::ReturnType>::invoke(\n        m_lambda, flag);\n  }\n};\n\nenum class Optionality { Optional, Required };\n\nstruct Parser;\n\nclass ParserBase {\npublic:\n  virtual ~ParserBase() = default;\n  virtual auto validate() const -> Result { return Result::ok(); }\n  virtual auto parse(std::string const& exeName,\n                     TokenStream const& tokens) const\n      -> InternalParseResult = 0;\n  virtual auto cardinality() const -> size_t { return 1; }\n\n  auto parse(Args const& args) const -> InternalParseResult {\n    return parse(args.exeName(), TokenStream(args));\n  }\n};\n\ntemplate <typename DerivedT>\nclass ComposableParserImpl : 
public ParserBase {\npublic:\n  template <typename T>\n  auto operator|(T const& other) const -> Parser;\n\n  template <typename T>\n  auto operator+(T const& other) const -> Parser;\n};\n\n// Common code and state for Args and Opts\ntemplate <typename DerivedT>\nclass ParserRefImpl : public ComposableParserImpl<DerivedT> {\nprotected:\n  Optionality m_optionality = Optionality::Optional;\n  std::shared_ptr<BoundRef> m_ref;\n  std::string m_hint;\n  std::string m_description;\n\n  explicit ParserRefImpl(std::shared_ptr<BoundRef> const& ref) : m_ref(ref) {}\n\npublic:\n  template <typename T>\n  ParserRefImpl(T& ref, std::string const& hint)\n      : m_ref(std::make_shared<BoundValueRef<T>>(ref)), m_hint(hint) {}\n\n  template <typename LambdaT>\n  ParserRefImpl(LambdaT const& ref, std::string const& hint)\n      : m_ref(std::make_shared<BoundLambda<LambdaT>>(ref)), m_hint(hint) {}\n\n  auto operator()(std::string const& description) -> DerivedT& {\n    m_description = description;\n    return static_cast<DerivedT&>(*this);\n  }\n\n  auto optional() -> DerivedT& {\n    m_optionality = Optionality::Optional;\n    return static_cast<DerivedT&>(*this);\n  };\n\n  auto required() -> DerivedT& {\n    m_optionality = Optionality::Required;\n    return static_cast<DerivedT&>(*this);\n  };\n\n  auto isOptional() const -> bool {\n    return m_optionality == Optionality::Optional;\n  }\n\n  auto cardinality() const -> size_t override {\n    if (m_ref->isContainer())\n      return 0;\n    else\n      return 1;\n  }\n\n  auto hint() const -> std::string { return m_hint; }\n};\n\nclass ExeName : public ComposableParserImpl<ExeName> {\n  std::shared_ptr<std::string> m_name;\n  std::shared_ptr<BoundValueRefBase> m_ref;\n\n  template <typename LambdaT>\n  static auto makeRef(LambdaT const& lambda)\n      -> std::shared_ptr<BoundValueRefBase> {\n    return std::make_shared<BoundLambda<LambdaT>>(lambda);\n  }\n\npublic:\n  ExeName() : 
m_name(std::make_shared<std::string>(\"<executable>\")) {}\n\n  explicit ExeName(std::string& ref) : ExeName() {\n    m_ref = std::make_shared<BoundValueRef<std::string>>(ref);\n  }\n\n  template <typename LambdaT>\n  explicit ExeName(LambdaT const& lambda) : ExeName() {\n    m_ref = std::make_shared<BoundLambda<LambdaT>>(lambda);\n  }\n\n  // The exe name is not parsed out of the normal tokens, but is handled\n  // specially\n  auto parse(std::string const&, TokenStream const& tokens) const\n      -> InternalParseResult override {\n    return InternalParseResult::ok(\n        ParseState(ParseResultType::NoMatch, tokens));\n  }\n\n  auto name() const -> std::string { return *m_name; }\n  auto set(std::string const& newName) -> ParserResult {\n\n    auto lastSlash = newName.find_last_of(\"\\\\/\");\n    auto filename  = (lastSlash == std::string::npos)\n                         ? newName\n                         : newName.substr(lastSlash + 1);\n\n    *m_name = filename;\n    if (m_ref)\n      return m_ref->setValue(filename);\n    else\n      return ParserResult::ok(ParseResultType::Matched);\n  }\n};\n\nclass Arg : public ParserRefImpl<Arg> {\npublic:\n  using ParserRefImpl::ParserRefImpl;\n\n  auto parse(std::string const&, TokenStream const& tokens) const\n      -> InternalParseResult override {\n    auto validationResult = validate();\n    if (!validationResult)\n      return InternalParseResult(validationResult);\n\n    auto remainingTokens = tokens;\n    auto const& token    = *remainingTokens;\n    if (token.type != TokenType::Argument)\n      return InternalParseResult::ok(\n          ParseState(ParseResultType::NoMatch, remainingTokens));\n\n    assert(!m_ref->isFlag());\n    auto valueRef = static_cast<detail::BoundValueRefBase*>(m_ref.get());\n\n    auto result = valueRef->setValue(remainingTokens->token);\n    if (!result)\n      return InternalParseResult(result);\n    else\n      return InternalParseResult::ok(\n          
ParseState(ParseResultType::Matched, ++remainingTokens));\n  }\n};\n\ninline auto normaliseOpt(std::string const& optName) -> std::string {\n#ifdef CATCH_PLATFORM_WINDOWS\n  if (optName[0] == '/')\n    return \"-\" + optName.substr(1);\n  else\n#endif\n    return optName;\n}\n\nclass Opt : public ParserRefImpl<Opt> {\nprotected:\n  std::vector<std::string> m_optNames;\n\npublic:\n  template <typename LambdaT>\n  explicit Opt(LambdaT const& ref)\n      : ParserRefImpl(std::make_shared<BoundFlagLambda<LambdaT>>(ref)) {}\n\n  explicit Opt(bool& ref)\n      : ParserRefImpl(std::make_shared<BoundFlagRef>(ref)) {}\n\n  template <typename LambdaT>\n  Opt(LambdaT const& ref, std::string const& hint) : ParserRefImpl(ref, hint) {}\n\n  template <typename T>\n  Opt(T& ref, std::string const& hint) : ParserRefImpl(ref, hint) {}\n\n  auto operator[](std::string const& optName) -> Opt& {\n    m_optNames.push_back(optName);\n    return *this;\n  }\n\n  auto getHelpColumns() const -> std::vector<HelpColumns> {\n    std::ostringstream oss;\n    bool first = true;\n    for (auto const& opt : m_optNames) {\n      if (first)\n        first = false;\n      else\n        oss << \", \";\n      oss << opt;\n    }\n    if (!m_hint.empty())\n      oss << \" <\" << m_hint << \">\";\n    return {{oss.str(), m_description}};\n  }\n\n  auto isMatch(std::string const& optToken) const -> bool {\n    auto normalisedToken = normaliseOpt(optToken);\n    for (auto const& name : m_optNames) {\n      if (normaliseOpt(name) == normalisedToken)\n        return true;\n    }\n    return false;\n  }\n\n  using ParserBase::parse;\n\n  auto parse(std::string const&, TokenStream const& tokens) const\n      -> InternalParseResult override {\n    auto validationResult = validate();\n    if (!validationResult)\n      return InternalParseResult(validationResult);\n\n    auto remainingTokens = tokens;\n    if (remainingTokens && remainingTokens->type == TokenType::Option) {\n      auto const& token = 
*remainingTokens;\n      if (isMatch(token.token)) {\n        if (m_ref->isFlag()) {\n          auto flagRef = static_cast<detail::BoundFlagRefBase*>(m_ref.get());\n          auto result  = flagRef->setFlag(true);\n          if (!result)\n            return InternalParseResult(result);\n          if (result.value() == ParseResultType::ShortCircuitAll)\n            return InternalParseResult::ok(\n                ParseState(result.value(), remainingTokens));\n        } else {\n          auto valueRef = static_cast<detail::BoundValueRefBase*>(m_ref.get());\n          ++remainingTokens;\n          if (!remainingTokens)\n            return InternalParseResult::runtimeError(\n                \"Expected argument following \" + token.token);\n          auto const& argToken = *remainingTokens;\n          if (argToken.type != TokenType::Argument)\n            return InternalParseResult::runtimeError(\n                \"Expected argument following \" + token.token);\n          auto result = valueRef->setValue(argToken.token);\n          if (!result)\n            return InternalParseResult(result);\n          if (result.value() == ParseResultType::ShortCircuitAll)\n            return InternalParseResult::ok(\n                ParseState(result.value(), remainingTokens));\n        }\n        return InternalParseResult::ok(\n            ParseState(ParseResultType::Matched, ++remainingTokens));\n      }\n    }\n    return InternalParseResult::ok(\n        ParseState(ParseResultType::NoMatch, remainingTokens));\n  }\n\n  auto validate() const -> Result override {\n    if (m_optNames.empty())\n      return Result::logicError(\"No options supplied to Opt\");\n    for (auto const& name : m_optNames) {\n      if (name.empty())\n        return Result::logicError(\"Option name cannot be empty\");\n#ifdef CATCH_PLATFORM_WINDOWS\n      if (name[0] != '-' && name[0] != '/')\n        return Result::logicError(\"Option name must begin with '-' or '/'\");\n#else\n      if (name[0] != '-')\n   
     return Result::logicError(\"Option name must begin with '-'\");\n#endif\n    }\n    return ParserRefImpl::validate();\n  }\n};\n\nstruct Help : Opt {\n  Help(bool& showHelpFlag)\n      : Opt([&](bool flag) {\n          showHelpFlag = flag;\n          return ParserResult::ok(ParseResultType::ShortCircuitAll);\n        }) {\n    static_cast<Opt&> (*this)(\"display usage information\")[\"-?\"][\"-h\"][\"--help\"]\n        .optional();\n  }\n};\n\nstruct Parser : ParserBase {\n\n  mutable ExeName m_exeName;\n  std::vector<Opt> m_options;\n  std::vector<Arg> m_args;\n\n  auto operator|=(ExeName const& exeName) -> Parser& {\n    m_exeName = exeName;\n    return *this;\n  }\n\n  auto operator|=(Arg const& arg) -> Parser& {\n    m_args.push_back(arg);\n    return *this;\n  }\n\n  auto operator|=(Opt const& opt) -> Parser& {\n    m_options.push_back(opt);\n    return *this;\n  }\n\n  auto operator|=(Parser const& other) -> Parser& {\n    m_options.insert(m_options.end(), other.m_options.begin(),\n                     other.m_options.end());\n    m_args.insert(m_args.end(), other.m_args.begin(), other.m_args.end());\n    return *this;\n  }\n\n  template <typename T>\n  auto operator|(T const& other) const -> Parser {\n    return Parser(*this) |= other;\n  }\n\n  // Forward deprecated interface with '+' instead of '|'\n  template <typename T>\n  auto operator+=(T const& other) -> Parser& {\n    return operator|=(other);\n  }\n  template <typename T>\n  auto operator+(T const& other) const -> Parser {\n    return operator|(other);\n  }\n\n  auto getHelpColumns() const -> std::vector<HelpColumns> {\n    std::vector<HelpColumns> cols;\n    for (auto const& o : m_options) {\n      auto childCols = o.getHelpColumns();\n      cols.insert(cols.end(), childCols.begin(), childCols.end());\n    }\n    return cols;\n  }\n\n  void writeToStream(std::ostream& os) const {\n    if (!m_exeName.name().empty()) {\n      os << \"usage:\\n\"\n         << \"  \" << m_exeName.name() << \" 
\";\n      bool required = true, first = true;\n      for (auto const& arg : m_args) {\n        if (first)\n          first = false;\n        else\n          os << \" \";\n        if (arg.isOptional() && required) {\n          os << \"[\";\n          required = false;\n        }\n        os << \"<\" << arg.hint() << \">\";\n        if (arg.cardinality() == 0)\n          os << \" ... \";\n      }\n      if (!required)\n        os << \"]\";\n      if (!m_options.empty())\n        os << \" options\";\n      os << \"\\n\\nwhere options are:\" << std::endl;\n    }\n\n    auto rows           = getHelpColumns();\n    size_t consoleWidth = CATCH_CLARA_CONFIG_CONSOLE_WIDTH;\n    size_t optWidth     = 0;\n    for (auto const& cols : rows)\n      optWidth = (std::max)(optWidth, cols.left.size() + 2);\n\n    optWidth = (std::min)(optWidth, consoleWidth / 2);\n\n    for (auto const& cols : rows) {\n      auto row =\n          TextFlow::Column(cols.left).width(optWidth).indent(2) +\n          TextFlow::Spacer(4) +\n          TextFlow::Column(cols.right).width(consoleWidth - 7 - optWidth);\n      os << row << std::endl;\n    }\n  }\n\n  friend auto operator<<(std::ostream& os, Parser const& parser)\n      -> std::ostream& {\n    parser.writeToStream(os);\n    return os;\n  }\n\n  auto validate() const -> Result override {\n    for (auto const& opt : m_options) {\n      auto result = opt.validate();\n      if (!result)\n        return result;\n    }\n    for (auto const& arg : m_args) {\n      auto result = arg.validate();\n      if (!result)\n        return result;\n    }\n    return Result::ok();\n  }\n\n  using ParserBase::parse;\n\n  auto parse(std::string const& exeName, TokenStream const& tokens) const\n      -> InternalParseResult override {\n\n    struct ParserInfo {\n      ParserBase const* parser = nullptr;\n      size_t count             = 0;\n    };\n    const size_t totalParsers = m_options.size() + m_args.size();\n    assert(totalParsers < 512);\n    // ParserInfo 
parseInfos[totalParsers]; // <-- this is what we really want\n    // to do\n    ParserInfo parseInfos[512];\n\n    {\n      size_t i = 0;\n      for (auto const& opt : m_options)\n        parseInfos[i++].parser = &opt;\n      for (auto const& arg : m_args)\n        parseInfos[i++].parser = &arg;\n    }\n\n    m_exeName.set(exeName);\n\n    auto result =\n        InternalParseResult::ok(ParseState(ParseResultType::NoMatch, tokens));\n    while (result.value().remainingTokens()) {\n      bool tokenParsed = false;\n\n      for (size_t i = 0; i < totalParsers; ++i) {\n        auto& parseInfo = parseInfos[i];\n        if (parseInfo.parser->cardinality() == 0 ||\n            parseInfo.count < parseInfo.parser->cardinality()) {\n          result = parseInfo.parser->parse(exeName,\n                                           result.value().remainingTokens());\n          if (!result)\n            return result;\n          if (result.value().type() != ParseResultType::NoMatch) {\n            tokenParsed = true;\n            ++parseInfo.count;\n            break;\n          }\n        }\n      }\n\n      if (result.value().type() == ParseResultType::ShortCircuitAll)\n        return result;\n      if (!tokenParsed)\n        return InternalParseResult::runtimeError(\n            \"Unrecognised token: \" + result.value().remainingTokens()->token);\n    }\n    // !TBD Check missing required options\n    return result;\n  }\n};\n\ntemplate <typename DerivedT>\ntemplate <typename T>\nauto ComposableParserImpl<DerivedT>::operator|(T const& other) const -> Parser {\n  return Parser() | static_cast<DerivedT const&>(*this) | other;\n}\n} // namespace detail\n\n// A Combined parser\nusing detail::Parser;\n\n// A parser for options\nusing detail::Opt;\n\n// A parser for arguments\nusing detail::Arg;\n\n// Wrapper for argc, argv from main()\nusing detail::Args;\n\n// Specifies the name of the executable\nusing detail::ExeName;\n\n// Convenience wrapper for option parser that specifies the 
help option\nusing detail::Help;\n\n// enum of result types from a parse\nusing detail::ParseResultType;\n\n// Result type for parser operation\nusing detail::ParserResult;\n\n} // namespace clara\n} // namespace Catch\n\n// end clara.hpp\n#ifdef __clang__\n#pragma clang diagnostic pop\n#endif\n\n// Restore Clara's value for console width, if present\n#ifdef CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH\n#define CATCH_CLARA_TEXTFLOW_CONFIG_CONSOLE_WIDTH                              \\\n  CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH\n#undef CATCH_TEMP_CLARA_CONFIG_CONSOLE_WIDTH\n#endif\n\n// end catch_clara.h\nnamespace Catch {\n\nclara::Parser makeCommandLineParser(ConfigData& config);\n\n} // end namespace Catch\n\n// end catch_commandline.h\n#include <fstream>\n#include <ctime>\n\nnamespace Catch {\n\nclara::Parser makeCommandLineParser(ConfigData& config) {\n\n  using namespace clara;\n\n  auto const setWarning = [&](std::string const& warning) {\n    auto warningSet = [&]() {\n      if (warning == \"NoAssertions\")\n        return WarnAbout::NoAssertions;\n\n      if (warning == \"NoTests\")\n        return WarnAbout::NoTests;\n\n      return WarnAbout::Nothing;\n    }();\n\n    if (warningSet == WarnAbout::Nothing)\n      return ParserResult::runtimeError(\"Unrecognised warning: '\" + warning +\n                                        \"'\");\n    config.warnings =\n        static_cast<WarnAbout::What>(config.warnings | warningSet);\n    return ParserResult::ok(ParseResultType::Matched);\n  };\n  auto const loadTestNamesFromFile = [&](std::string const& filename) {\n    std::ifstream f(filename.c_str());\n    if (!f.is_open())\n      return ParserResult::runtimeError(\"Unable to load input file: '\" +\n                                        filename + \"'\");\n\n    std::string line;\n    while (std::getline(f, line)) {\n      line = trim(line);\n      if (!line.empty() && !startsWith(line, '#')) {\n        if (!startsWith(line, '\"'))\n          line = '\"' + line + 
'\"';\n        config.testsOrTags.push_back(line);\n        config.testsOrTags.emplace_back(\",\");\n      }\n    }\n    // Remove comma in the end\n    if (!config.testsOrTags.empty())\n      config.testsOrTags.erase(config.testsOrTags.end() - 1);\n\n    return ParserResult::ok(ParseResultType::Matched);\n  };\n  auto const setTestOrder = [&](std::string const& order) {\n    if (startsWith(\"declared\", order))\n      config.runOrder = RunTests::InDeclarationOrder;\n    else if (startsWith(\"lexical\", order))\n      config.runOrder = RunTests::InLexicographicalOrder;\n    else if (startsWith(\"random\", order))\n      config.runOrder = RunTests::InRandomOrder;\n    else\n      return clara::ParserResult::runtimeError(\"Unrecognised ordering: '\" +\n                                               order + \"'\");\n    return ParserResult::ok(ParseResultType::Matched);\n  };\n  auto const setRngSeed = [&](std::string const& seed) {\n    if (seed != \"time\")\n      return clara::detail::convertInto(seed, config.rngSeed);\n    config.rngSeed = static_cast<unsigned int>(std::time(nullptr));\n    return ParserResult::ok(ParseResultType::Matched);\n  };\n  auto const setColourUsage = [&](std::string const& useColour) {\n    auto mode = toLower(useColour);\n\n    if (mode == \"yes\")\n      config.useColour = UseColour::Yes;\n    else if (mode == \"no\")\n      config.useColour = UseColour::No;\n    else if (mode == \"auto\")\n      config.useColour = UseColour::Auto;\n    else\n      return ParserResult::runtimeError(\n          \"colour mode must be one of: auto, yes or no. 
'\" + useColour +\n          \"' not recognised\");\n    return ParserResult::ok(ParseResultType::Matched);\n  };\n  auto const setWaitForKeypress = [&](std::string const& keypress) {\n    auto keypressLc = toLower(keypress);\n    if (keypressLc == \"never\")\n      config.waitForKeypress = WaitForKeypress::Never;\n    else if (keypressLc == \"start\")\n      config.waitForKeypress = WaitForKeypress::BeforeStart;\n    else if (keypressLc == \"exit\")\n      config.waitForKeypress = WaitForKeypress::BeforeExit;\n    else if (keypressLc == \"both\")\n      config.waitForKeypress = WaitForKeypress::BeforeStartAndExit;\n    else\n      return ParserResult::runtimeError(\n          \"keypress argument must be one of: never, start, exit or both. '\" +\n          keypress + \"' not recognised\");\n    return ParserResult::ok(ParseResultType::Matched);\n  };\n  auto const setVerbosity = [&](std::string const& verbosity) {\n    auto lcVerbosity = toLower(verbosity);\n    if (lcVerbosity == \"quiet\")\n      config.verbosity = Verbosity::Quiet;\n    else if (lcVerbosity == \"normal\")\n      config.verbosity = Verbosity::Normal;\n    else if (lcVerbosity == \"high\")\n      config.verbosity = Verbosity::High;\n    else\n      return ParserResult::runtimeError(\"Unrecognised verbosity, '\" +\n                                        verbosity + \"'\");\n    return ParserResult::ok(ParseResultType::Matched);\n  };\n  auto const setReporter = [&](std::string const& reporter) {\n    IReporterRegistry::FactoryMap const& factories =\n        getRegistryHub().getReporterRegistry().getFactories();\n\n    auto lcReporter = toLower(reporter);\n    auto result     = factories.find(lcReporter);\n\n    if (factories.end() != result)\n      config.reporterName = lcReporter;\n    else\n      return ParserResult::runtimeError(\n          \"Unrecognized reporter, '\" + reporter +\n          \"'. 
Check available with --list-reporters\");\n    return ParserResult::ok(ParseResultType::Matched);\n  };\n\n  auto cli =\n      ExeName(config.processName) | Help(config.showHelp) |\n      Opt(config.listTests)[\"-l\"][\"--list-tests\"](\n          \"list all/matching test cases\") |\n      Opt(config.listTags)[\"-t\"][\"--list-tags\"](\"list all/matching tags\") |\n      Opt(config.showSuccessfulTests)[\"-s\"][\"--success\"](\n          \"include successful tests in output\") |\n      Opt(config.shouldDebugBreak)[\"-b\"][\"--break\"](\n          \"break into debugger on failure\") |\n      Opt(config.noThrow)[\"-e\"][\"--nothrow\"](\"skip exception tests\") |\n      Opt(config.showInvisibles)[\"-i\"][\"--invisibles\"](\n          \"show invisibles (tabs, newlines)\") |\n      Opt(config.outputFilename, \"filename\")[\"-o\"][\"--out\"](\"output filename\") |\n      Opt(setReporter,\n          \"name\")[\"-r\"][\"--reporter\"](\"reporter to use (defaults to console)\") |\n      Opt(config.name, \"name\")[\"-n\"][\"--name\"](\"suite name\") | Opt([&](bool) {\n        config.abortAfter = 1;\n      })[\"-a\"][\"--abort\"](\"abort at first failure\") |\n      Opt([&](int x) { config.abortAfter = x; },\n          \"no. failures\")[\"-x\"][\"--abortx\"](\"abort after x failures\") |\n      Opt(setWarning, \"warning name\")[\"-w\"][\"--warn\"](\"enable warnings\") |\n      Opt(\n          [&](bool flag) {\n            config.showDurations =\n                flag ? 
ShowDurations::Always : ShowDurations::Never;\n          },\n          \"yes|no\")[\"-d\"][\"--durations\"](\"show test durations\") |\n      Opt(config.minDuration, \"seconds\")[\"-D\"][\"--min-duration\"](\n          \"show test durations for tests taking at least the given number of \"\n          \"seconds\") |\n      Opt(loadTestNamesFromFile, \"filename\")[\"-f\"][\"--input-file\"](\n          \"load test names to run from a file\") |\n      Opt(config.filenamesAsTags)[\"-#\"][\"--filenames-as-tags\"](\n          \"adds a tag for the filename\") |\n      Opt(config.sectionsToRun,\n          \"section name\")[\"-c\"][\"--section\"](\"specify section to run\") |\n      Opt(setVerbosity,\n          \"quiet|normal|high\")[\"-v\"][\"--verbosity\"](\"set output verbosity\") |\n      Opt(config.listTestNamesOnly)[\"--list-test-names-only\"](\n          \"list all/matching test cases names only\") |\n      Opt(config.listReporters)[\"--list-reporters\"](\"list all reporters\") |\n      Opt(setTestOrder,\n          \"decl|lex|rand\")[\"--order\"](\"test case order (defaults to decl)\") |\n      Opt(setRngSeed, \"'time'|number\")[\"--rng-seed\"](\n          \"set a specific seed for random numbers\") |\n      Opt(setColourUsage,\n          \"yes|no\")[\"--use-colour\"](\"should output be colourised\") |\n      Opt(config.libIdentify)[\"--libidentify\"](\n          \"report name and version according to libidentify standard\") |\n      Opt(setWaitForKeypress, \"never|start|exit|both\")[\"--wait-for-keypress\"](\n          \"waits for a keypress before exiting\") |\n      Opt(config.benchmarkSamples, \"samples\")[\"--benchmark-samples\"](\n          \"number of samples to collect (default: 100)\") |\n      Opt(config.benchmarkResamples, \"resamples\")[\"--benchmark-resamples\"](\n          \"number of resamples for the bootstrap (default: 100000)\") |\n      Opt(config.benchmarkConfidenceInterval,\n          \"confidence interval\")[\"--benchmark-confidence-interval\"](\n 
         \"confidence interval for the bootstrap (between 0 and 1, default: \"\n          \"0.95)\") |\n      Opt(config.benchmarkNoAnalysis)[\"--benchmark-no-analysis\"](\n          \"perform only measurements; do not perform any analysis\") |\n      Opt(config.benchmarkWarmupTime,\n          \"benchmarkWarmupTime\")[\"--benchmark-warmup-time\"](\n          \"amount of time in milliseconds spent on warming up each test \"\n          \"(default: 100)\") |\n      Arg(config.testsOrTags,\n          \"test name|pattern|tags\")(\"which test or tests to use\");\n\n  return cli;\n}\n\n} // end namespace Catch\n// end catch_commandline.cpp\n// start catch_common.cpp\n\n#include <cstring>\n#include <ostream>\n\nnamespace Catch {\n\nbool SourceLineInfo::operator==(SourceLineInfo const& other) const noexcept {\n  return line == other.line &&\n         (file == other.file || std::strcmp(file, other.file) == 0);\n}\nbool SourceLineInfo::operator<(SourceLineInfo const& other) const noexcept {\n  // We can assume that the same file will usually have the same pointer.\n  // Thus, if the pointers are the same, there is no point in calling the strcmp\n  return line < other.line || (line == other.line && file != other.file &&\n                               (std::strcmp(file, other.file) < 0));\n}\n\nstd::ostream& operator<<(std::ostream& os, SourceLineInfo const& info) {\n#ifndef __GNUG__\n  os << info.file << '(' << info.line << ')';\n#else\n  os << info.file << ':' << info.line;\n#endif\n  return os;\n}\n\nstd::string StreamEndStop::operator+() const { return std::string(); }\n\nNonCopyable::NonCopyable()  = default;\nNonCopyable::~NonCopyable() = default;\n\n} // namespace Catch\n// end catch_common.cpp\n// start catch_config.cpp\n\nnamespace Catch {\n\nConfig::Config(ConfigData const& data) : m_data(data), m_stream(openStream()) {\n  // We need to trim filter specs to avoid trouble with superfluous\n  // whitespace (esp. 
important for bdd macros, as those are manually\n  // aligned with whitespace).\n\n  for (auto& elem : m_data.testsOrTags) {\n    elem = trim(elem);\n  }\n  for (auto& elem : m_data.sectionsToRun) {\n    elem = trim(elem);\n  }\n\n  TestSpecParser parser(ITagAliasRegistry::get());\n  if (!m_data.testsOrTags.empty()) {\n    m_hasTestFilters = true;\n    for (auto const& testOrTags : m_data.testsOrTags) {\n      parser.parse(testOrTags);\n    }\n  }\n  m_testSpec = parser.testSpec();\n}\n\nstd::string const& Config::getFilename() const { return m_data.outputFilename; }\n\nbool Config::listTests() const { return m_data.listTests; }\nbool Config::listTestNamesOnly() const { return m_data.listTestNamesOnly; }\nbool Config::listTags() const { return m_data.listTags; }\nbool Config::listReporters() const { return m_data.listReporters; }\n\nstd::string Config::getProcessName() const { return m_data.processName; }\nstd::string const& Config::getReporterName() const {\n  return m_data.reporterName;\n}\n\nstd::vector<std::string> const& Config::getTestsOrTags() const {\n  return m_data.testsOrTags;\n}\nstd::vector<std::string> const& Config::getSectionsToRun() const {\n  return m_data.sectionsToRun;\n}\n\nTestSpec const& Config::testSpec() const { return m_testSpec; }\nbool Config::hasTestFilters() const { return m_hasTestFilters; }\n\nbool Config::showHelp() const { return m_data.showHelp; }\n\n// IConfig interface\nbool Config::allowThrows() const { return !m_data.noThrow; }\nstd::ostream& Config::stream() const { return m_stream->stream(); }\nstd::string Config::name() const {\n  return m_data.name.empty() ? 
m_data.processName : m_data.name;\n}\nbool Config::includeSuccessfulResults() const {\n  return m_data.showSuccessfulTests;\n}\nbool Config::warnAboutMissingAssertions() const {\n  return !!(m_data.warnings & WarnAbout::NoAssertions);\n}\nbool Config::warnAboutNoTests() const {\n  return !!(m_data.warnings & WarnAbout::NoTests);\n}\nShowDurations::OrNot Config::showDurations() const {\n  return m_data.showDurations;\n}\ndouble Config::minDuration() const { return m_data.minDuration; }\nRunTests::InWhatOrder Config::runOrder() const { return m_data.runOrder; }\nunsigned int Config::rngSeed() const { return m_data.rngSeed; }\nUseColour::YesOrNo Config::useColour() const { return m_data.useColour; }\nbool Config::shouldDebugBreak() const { return m_data.shouldDebugBreak; }\nint Config::abortAfter() const { return m_data.abortAfter; }\nbool Config::showInvisibles() const { return m_data.showInvisibles; }\nVerbosity Config::verbosity() const { return m_data.verbosity; }\n\nbool Config::benchmarkNoAnalysis() const { return m_data.benchmarkNoAnalysis; }\nint Config::benchmarkSamples() const { return m_data.benchmarkSamples; }\ndouble Config::benchmarkConfidenceInterval() const {\n  return m_data.benchmarkConfidenceInterval;\n}\nunsigned int Config::benchmarkResamples() const {\n  return m_data.benchmarkResamples;\n}\nstd::chrono::milliseconds Config::benchmarkWarmupTime() const {\n  return std::chrono::milliseconds(m_data.benchmarkWarmupTime);\n}\n\nIStream const* Config::openStream() {\n  return Catch::makeStream(m_data.outputFilename);\n}\n\n} // end namespace Catch\n// end catch_config.cpp\n// start catch_console_colour.cpp\n\n#if defined(__clang__)\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wexit-time-destructors\"\n#endif\n\n// start catch_errno_guard.h\n\nnamespace Catch {\n\nclass ErrnoGuard {\npublic:\n  ErrnoGuard();\n  ~ErrnoGuard();\n\nprivate:\n  int m_oldErrno;\n};\n\n} // namespace Catch\n\n// end catch_errno_guard.h\n// start 
catch_windows_h_proxy.h\n\n#if defined(CATCH_PLATFORM_WINDOWS)\n\n#if !defined(NOMINMAX) && !defined(CATCH_CONFIG_NO_NOMINMAX)\n#define CATCH_DEFINED_NOMINMAX\n#define NOMINMAX\n#endif\n#if !defined(WIN32_LEAN_AND_MEAN) &&                                           \\\n    !defined(CATCH_CONFIG_NO_WIN32_LEAN_AND_MEAN)\n#define CATCH_DEFINED_WIN32_LEAN_AND_MEAN\n#define WIN32_LEAN_AND_MEAN\n#endif\n\n#ifdef __AFXDLL\n#include <AfxWin.h>\n#else\n#include <windows.h>\n#endif\n\n#ifdef CATCH_DEFINED_NOMINMAX\n#undef NOMINMAX\n#endif\n#ifdef CATCH_DEFINED_WIN32_LEAN_AND_MEAN\n#undef WIN32_LEAN_AND_MEAN\n#endif\n\n#endif // defined(CATCH_PLATFORM_WINDOWS)\n\n// end catch_windows_h_proxy.h\n#include <sstream>\n\nnamespace Catch {\nnamespace {\n\nstruct IColourImpl {\n  virtual ~IColourImpl()                     = default;\n  virtual void use(Colour::Code _colourCode) = 0;\n};\n\nstruct NoColourImpl : IColourImpl {\n  void use(Colour::Code) override {}\n\n  static IColourImpl* instance() {\n    static NoColourImpl s_instance;\n    return &s_instance;\n  }\n};\n\n} // namespace\n} // namespace Catch\n\n#if !defined(CATCH_CONFIG_COLOUR_NONE) &&                                      \\\n    !defined(CATCH_CONFIG_COLOUR_WINDOWS) &&                                   \\\n    !defined(CATCH_CONFIG_COLOUR_ANSI)\n#ifdef CATCH_PLATFORM_WINDOWS\n#define CATCH_CONFIG_COLOUR_WINDOWS\n#else\n#define CATCH_CONFIG_COLOUR_ANSI\n#endif\n#endif\n\n#if defined(                                                                   \\\n    CATCH_CONFIG_COLOUR_WINDOWS) /////////////////////////////////////////\n\nnamespace Catch {\nnamespace {\n\nclass Win32ColourImpl : public IColourImpl {\npublic:\n  Win32ColourImpl() : stdoutHandle(GetStdHandle(STD_OUTPUT_HANDLE)) {\n    CONSOLE_SCREEN_BUFFER_INFO csbiInfo;\n    GetConsoleScreenBufferInfo(stdoutHandle, &csbiInfo);\n    originalForegroundAttributes =\n        csbiInfo.wAttributes & ~(BACKGROUND_GREEN | BACKGROUND_RED |\n                              
   BACKGROUND_BLUE | BACKGROUND_INTENSITY);\n    originalBackgroundAttributes =\n        csbiInfo.wAttributes & ~(FOREGROUND_GREEN | FOREGROUND_RED |\n                                 FOREGROUND_BLUE | FOREGROUND_INTENSITY);\n  }\n\n  void use(Colour::Code _colourCode) override {\n    switch (_colourCode) {\n    case Colour::None:\n      return setTextAttribute(originalForegroundAttributes);\n    case Colour::White:\n      return setTextAttribute(FOREGROUND_GREEN | FOREGROUND_RED |\n                              FOREGROUND_BLUE);\n    case Colour::Red:\n      return setTextAttribute(FOREGROUND_RED);\n    case Colour::Green:\n      return setTextAttribute(FOREGROUND_GREEN);\n    case Colour::Blue:\n      return setTextAttribute(FOREGROUND_BLUE);\n    case Colour::Cyan:\n      return setTextAttribute(FOREGROUND_BLUE | FOREGROUND_GREEN);\n    case Colour::Yellow:\n      return setTextAttribute(FOREGROUND_RED | FOREGROUND_GREEN);\n    case Colour::Grey:\n      return setTextAttribute(0);\n\n    case Colour::LightGrey:\n      return setTextAttribute(FOREGROUND_INTENSITY);\n    case Colour::BrightRed:\n      return setTextAttribute(FOREGROUND_INTENSITY | FOREGROUND_RED);\n    case Colour::BrightGreen:\n      return setTextAttribute(FOREGROUND_INTENSITY | FOREGROUND_GREEN);\n    case Colour::BrightWhite:\n      return setTextAttribute(FOREGROUND_INTENSITY | FOREGROUND_GREEN |\n                              FOREGROUND_RED | FOREGROUND_BLUE);\n    case Colour::BrightYellow:\n      return setTextAttribute(FOREGROUND_INTENSITY | FOREGROUND_RED |\n                              FOREGROUND_GREEN);\n\n    case Colour::Bright:\n      CATCH_INTERNAL_ERROR(\"not a colour\");\n\n    default:\n      CATCH_ERROR(\"Unknown colour requested\");\n    }\n  }\n\nprivate:\n  void setTextAttribute(WORD _textAttribute) {\n    SetConsoleTextAttribute(stdoutHandle,\n                            _textAttribute | originalBackgroundAttributes);\n  }\n  HANDLE stdoutHandle;\n  WORD 
originalForegroundAttributes;\n  WORD originalBackgroundAttributes;\n};\n\nIColourImpl* platformColourInstance() {\n  static Win32ColourImpl s_instance;\n\n  IConfigPtr config = getCurrentContext().getConfig();\n  UseColour::YesOrNo colourMode =\n      config ? config->useColour() : UseColour::Auto;\n  if (colourMode == UseColour::Auto)\n    colourMode = UseColour::Yes;\n  return colourMode == UseColour::Yes ? &s_instance : NoColourImpl::instance();\n}\n\n} // namespace\n} // end namespace Catch\n\n#elif defined(CATCH_CONFIG_COLOUR_ANSI) //////////////////////////////////////\n\n#include <unistd.h>\n\nnamespace Catch {\nnamespace {\n\n// use POSIX/ ANSI console terminal codes\n// Thanks to Adam Strzelecki for original contribution\n// (http://github.com/nanoant)\n// https://github.com/philsquared/Catch/pull/131\nclass PosixColourImpl : public IColourImpl {\npublic:\n  void use(Colour::Code _colourCode) override {\n    switch (_colourCode) {\n    case Colour::None:\n    case Colour::White:\n      return setColour(\"[0m\");\n    case Colour::Red:\n      return setColour(\"[0;31m\");\n    case Colour::Green:\n      return setColour(\"[0;32m\");\n    case Colour::Blue:\n      return setColour(\"[0;34m\");\n    case Colour::Cyan:\n      return setColour(\"[0;36m\");\n    case Colour::Yellow:\n      return setColour(\"[0;33m\");\n    case Colour::Grey:\n      return setColour(\"[1;30m\");\n\n    case Colour::LightGrey:\n      return setColour(\"[0;37m\");\n    case Colour::BrightRed:\n      return setColour(\"[1;31m\");\n    case Colour::BrightGreen:\n      return setColour(\"[1;32m\");\n    case Colour::BrightWhite:\n      return setColour(\"[1;37m\");\n    case Colour::BrightYellow:\n      return setColour(\"[1;33m\");\n\n    case Colour::Bright:\n      CATCH_INTERNAL_ERROR(\"not a colour\");\n    default:\n      CATCH_INTERNAL_ERROR(\"Unknown colour requested\");\n    }\n  }\n  static IColourImpl* instance() {\n    static PosixColourImpl s_instance;\n    return 
&s_instance;\n  }\n\nprivate:\n  void setColour(const char* _escapeCode) {\n    getCurrentContext().getConfig()->stream() << '\\033' << _escapeCode;\n  }\n};\n\nbool useColourOnPlatform() {\n  return\n#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE)\n      !isDebuggerActive() &&\n#endif\n#if !(defined(__DJGPP__) && defined(__STRICT_ANSI__))\n      isatty(STDOUT_FILENO)\n#else\n      false\n#endif\n          ;\n}\nIColourImpl* platformColourInstance() {\n  ErrnoGuard guard;\n  IConfigPtr config = getCurrentContext().getConfig();\n  UseColour::YesOrNo colourMode =\n      config ? config->useColour() : UseColour::Auto;\n  if (colourMode == UseColour::Auto)\n    colourMode = useColourOnPlatform() ? UseColour::Yes : UseColour::No;\n  return colourMode == UseColour::Yes ? PosixColourImpl::instance()\n                                      : NoColourImpl::instance();\n}\n\n} // namespace\n} // end namespace Catch\n\n#else // not Windows or ANSI ///////////////////////////////////////////////\n\nnamespace Catch {\n\nstatic IColourImpl* platformColourInstance() {\n  return NoColourImpl::instance();\n}\n\n} // end namespace Catch\n\n#endif // Windows/ ANSI/ None\n\nnamespace Catch {\n\nColour::Colour(Code _colourCode) { use(_colourCode); }\nColour::Colour(Colour&& other) noexcept {\n  m_moved       = other.m_moved;\n  other.m_moved = true;\n}\nColour& Colour::operator=(Colour&& other) noexcept {\n  m_moved       = other.m_moved;\n  other.m_moved = true;\n  return *this;\n}\n\nColour::~Colour() {\n  if (!m_moved)\n    use(None);\n}\n\nvoid Colour::use(Code _colourCode) {\n  static IColourImpl* impl = platformColourInstance();\n  // Strictly speaking, this cannot possibly happen.\n  // However, under some conditions it does happen (see #1626),\n  // and this change is small enough that we can let practicality\n  // triumph over purity in this case.\n  if (impl != nullptr) {\n    impl->use(_colourCode);\n  }\n}\n\nstd::ostream& operator<<(std::ostream& os, 
Colour const&) { return os; }\n\n} // end namespace Catch\n\n#if defined(__clang__)\n#pragma clang diagnostic pop\n#endif\n\n// end catch_console_colour.cpp\n// start catch_context.cpp\n\nnamespace Catch {\n\nclass Context : public IMutableContext, NonCopyable {\n\npublic: // IContext\n  IResultCapture* getResultCapture() override { return m_resultCapture; }\n  IRunner* getRunner() override { return m_runner; }\n\n  IConfigPtr const& getConfig() const override { return m_config; }\n\n  ~Context() override;\n\npublic: // IMutableContext\n  void setResultCapture(IResultCapture* resultCapture) override {\n    m_resultCapture = resultCapture;\n  }\n  void setRunner(IRunner* runner) override { m_runner = runner; }\n  void setConfig(IConfigPtr const& config) override { m_config = config; }\n\n  friend IMutableContext& getCurrentMutableContext();\n\nprivate:\n  IConfigPtr m_config;\n  IRunner* m_runner               = nullptr;\n  IResultCapture* m_resultCapture = nullptr;\n};\n\nIMutableContext* IMutableContext::currentContext = nullptr;\n\nvoid IMutableContext::createContext() { currentContext = new Context(); }\n\nvoid cleanUpContext() {\n  delete IMutableContext::currentContext;\n  IMutableContext::currentContext = nullptr;\n}\nIContext::~IContext()               = default;\nIMutableContext::~IMutableContext() = default;\nContext::~Context()                 = default;\n\nSimplePcg32& rng() {\n  static SimplePcg32 s_rng;\n  return s_rng;\n}\n\n} // namespace Catch\n// end catch_context.cpp\n// start catch_debug_console.cpp\n\n// start catch_debug_console.h\n\n#include <string>\n\nnamespace Catch {\nvoid writeToDebugConsole(std::string const& text);\n}\n\n// end catch_debug_console.h\n#if defined(CATCH_CONFIG_ANDROID_LOGWRITE)\n#include <android/log.h>\n\nnamespace Catch {\nvoid writeToDebugConsole(std::string const& text) {\n  __android_log_write(ANDROID_LOG_DEBUG, \"Catch\", text.c_str());\n}\n} // namespace Catch\n\n#elif defined(CATCH_PLATFORM_WINDOWS)\n\nnamespace 
Catch {\nvoid writeToDebugConsole(std::string const& text) {\n  ::OutputDebugStringA(text.c_str());\n}\n} // namespace Catch\n\n#else\n\nnamespace Catch {\nvoid writeToDebugConsole(std::string const& text) {\n  // !TBD: Need a version for Mac/ XCode and other IDEs\n  Catch::cout() << text;\n}\n} // namespace Catch\n\n#endif // Platform\n// end catch_debug_console.cpp\n// start catch_debugger.cpp\n\n#if defined(CATCH_PLATFORM_MAC) || defined(CATCH_PLATFORM_IPHONE)\n\n#include <cassert>\n#include <sys/types.h>\n#include <unistd.h>\n#include <cstddef>\n#include <ostream>\n\n#ifdef __apple_build_version__\n// These headers will only compile with AppleClang (XCode)\n// For other compilers (Clang, GCC, ... ) we need to exclude them\n#include <sys/sysctl.h>\n#endif\n\nnamespace Catch {\n#ifdef __apple_build_version__\n// The following function is taken directly from the following technical note:\n// https://developer.apple.com/library/archive/qa/qa1361/_index.html\n\n// Returns true if the current process is being debugged (either\n// running under the debugger or has a debugger attached post facto).\nbool isDebuggerActive() {\n  int mib[4];\n  struct kinfo_proc info;\n  std::size_t size;\n\n  // Initialize the flags so that, if sysctl fails for some bizarre\n  // reason, we get a predictable result.\n\n  info.kp_proc.p_flag = 0;\n\n  // Initialize mib, which tells sysctl the info we want, in this case\n  // we're looking for information about a specific process ID.\n\n  mib[0] = CTL_KERN;\n  mib[1] = KERN_PROC;\n  mib[2] = KERN_PROC_PID;\n  mib[3] = getpid();\n\n  // Call sysctl.\n\n  size = sizeof(info);\n  if (sysctl(mib, sizeof(mib) / sizeof(*mib), &info, &size, nullptr, 0) != 0) {\n    Catch::cerr() << \"\\n** Call to sysctl failed - unable to determine if \"\n                     \"debugger is active **\\n\"\n                  << std::endl;\n    return false;\n  }\n\n  // We're being debugged if the P_TRACED flag is set.\n\n  return ((info.kp_proc.p_flag & P_TRACED) 
!= 0);\n}\n#else\nbool isDebuggerActive() {\n  // We need to find another way to determine this for non-appleclang compilers\n  // on macOS\n  return false;\n}\n#endif\n} // namespace Catch\n\n#elif defined(CATCH_PLATFORM_LINUX)\n#include <fstream>\n#include <string>\n\nnamespace Catch {\n// The standard POSIX way of detecting a debugger is to attempt to\n// ptrace() the process, but this needs to be done from a child and not\n// this process itself to still allow attaching to this process later\n// if wanted, so is rather heavy. Under Linux we have the PID of the\n// \"debugger\" (which doesn't need to be gdb, of course, it could also\n// be strace, for example) in /proc/$PID/status, so just get it from\n// there instead.\nbool isDebuggerActive() {\n  // Libstdc++ has a bug, where std::ifstream sets errno to 0\n  // This way our users can properly assert over errno values\n  ErrnoGuard guard;\n  std::ifstream in(\"/proc/self/status\");\n  for (std::string line; std::getline(in, line);) {\n    static const int PREFIX_LEN = 11;\n    if (line.compare(0, PREFIX_LEN, \"TracerPid:\\t\") == 0) {\n      // We're traced if the PID is not 0 and no other PID starts\n      // with 0 digit, so it's enough to check for just a single\n      // character.\n      return line.length() > PREFIX_LEN && line[PREFIX_LEN] != '0';\n    }\n  }\n\n  return false;\n}\n} // namespace Catch\n#elif defined(_MSC_VER)\nextern \"C\" __declspec(dllimport) int __stdcall IsDebuggerPresent();\nnamespace Catch {\nbool isDebuggerActive() { return IsDebuggerPresent() != 0; }\n} // namespace Catch\n#elif defined(__MINGW32__)\nextern \"C\" __declspec(dllimport) int __stdcall IsDebuggerPresent();\nnamespace Catch {\nbool isDebuggerActive() { return IsDebuggerPresent() != 0; }\n} // namespace Catch\n#else\nnamespace Catch {\nbool isDebuggerActive() { return false; }\n} // namespace Catch\n#endif // Platform\n// end catch_debugger.cpp\n// start catch_decomposer.cpp\n\nnamespace Catch 
{\n\nITransientExpression::~ITransientExpression() = default;\n\nvoid formatReconstructedExpression(std::ostream& os, std::string const& lhs,\n                                   StringRef op, std::string const& rhs) {\n  if (lhs.size() + rhs.size() < 40 && lhs.find('\\n') == std::string::npos &&\n      rhs.find('\\n') == std::string::npos)\n    os << lhs << \" \" << op << \" \" << rhs;\n  else\n    os << lhs << \"\\n\" << op << \"\\n\" << rhs;\n}\n} // namespace Catch\n// end catch_decomposer.cpp\n// start catch_enforce.cpp\n\n#include <stdexcept>\n\nnamespace Catch {\n#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) &&                                \\\n    !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS_CUSTOM_HANDLER)\n[[noreturn]] void throw_exception(std::exception const& e) {\n  Catch::cerr()\n      << \"Catch will terminate because it needed to throw an exception.\\n\"\n      << \"The message was: \" << e.what() << '\\n';\n  std::terminate();\n}\n#endif\n\n[[noreturn]] void throw_logic_error(std::string const& msg) {\n  throw_exception(std::logic_error(msg));\n}\n\n[[noreturn]] void throw_domain_error(std::string const& msg) {\n  throw_exception(std::domain_error(msg));\n}\n\n[[noreturn]] void throw_runtime_error(std::string const& msg) {\n  throw_exception(std::runtime_error(msg));\n}\n\n} // namespace Catch\n// end catch_enforce.cpp\n// start catch_enum_values_registry.cpp\n// start catch_enum_values_registry.h\n\n#include <vector>\n#include <memory>\n\nnamespace Catch {\n\nnamespace Detail {\n\nstd::unique_ptr<EnumInfo> makeEnumInfo(StringRef enumName,\n                                       StringRef allValueNames,\n                                       std::vector<int> const& values);\n\nclass EnumValuesRegistry : public IMutableEnumValuesRegistry {\n\n  std::vector<std::unique_ptr<EnumInfo>> m_enumInfos;\n\n  EnumInfo const& registerEnum(StringRef enumName, StringRef allEnums,\n                               std::vector<int> const& values) 
override;\n};\n\nstd::vector<StringRef> parseEnums(StringRef enums);\n\n} // namespace Detail\n\n} // namespace Catch\n\n// end catch_enum_values_registry.h\n\n#include <map>\n#include <cassert>\n\nnamespace Catch {\n\nIMutableEnumValuesRegistry::~IMutableEnumValuesRegistry() {}\n\nnamespace Detail {\n\nnamespace {\n// Extracts the actual name part of an enum instance\n// In other words, it returns the Blue part of Bikeshed::Colour::Blue\nStringRef extractInstanceName(StringRef enumInstance) {\n  // Find last occurrence of \":\"\n  size_t name_start = enumInstance.size();\n  while (name_start > 0 && enumInstance[name_start - 1] != ':') {\n    --name_start;\n  }\n  return enumInstance.substr(name_start, enumInstance.size() - name_start);\n}\n} // namespace\n\nstd::vector<StringRef> parseEnums(StringRef enums) {\n  auto enumValues = splitStringRef(enums, ',');\n  std::vector<StringRef> parsed;\n  parsed.reserve(enumValues.size());\n  for (auto const& enumValue : enumValues) {\n    parsed.push_back(trim(extractInstanceName(enumValue)));\n  }\n  return parsed;\n}\n\nEnumInfo::~EnumInfo() {}\n\nStringRef EnumInfo::lookup(int value) const {\n  for (auto const& valueToName : m_values) {\n    if (valueToName.first == value)\n      return valueToName.second;\n  }\n  return \"{** unexpected enum value **}\"_sr;\n}\n\nstd::unique_ptr<EnumInfo> makeEnumInfo(StringRef enumName,\n                                       StringRef allValueNames,\n                                       std::vector<int> const& values) {\n  std::unique_ptr<EnumInfo> enumInfo(new EnumInfo);\n  enumInfo->m_name = enumName;\n  enumInfo->m_values.reserve(values.size());\n\n  const auto valueNames = Catch::Detail::parseEnums(allValueNames);\n  assert(valueNames.size() == values.size());\n  std::size_t i = 0;\n  for (auto value : values)\n    enumInfo->m_values.emplace_back(value, valueNames[i++]);\n\n  return enumInfo;\n}\n\nEnumInfo const&\nEnumValuesRegistry::registerEnum(StringRef enumName, StringRef 
allValueNames,\n                                 std::vector<int> const& values) {\n  m_enumInfos.push_back(makeEnumInfo(enumName, allValueNames, values));\n  return *m_enumInfos.back();\n}\n\n} // namespace Detail\n} // namespace Catch\n\n// end catch_enum_values_registry.cpp\n// start catch_errno_guard.cpp\n\n#include <cerrno>\n\nnamespace Catch {\nErrnoGuard::ErrnoGuard() : m_oldErrno(errno) {}\nErrnoGuard::~ErrnoGuard() { errno = m_oldErrno; }\n} // namespace Catch\n// end catch_errno_guard.cpp\n// start catch_exception_translator_registry.cpp\n\n// start catch_exception_translator_registry.h\n\n#include <vector>\n#include <string>\n#include <memory>\n\nnamespace Catch {\n\nclass ExceptionTranslatorRegistry : public IExceptionTranslatorRegistry {\npublic:\n  ~ExceptionTranslatorRegistry();\n  virtual void registerTranslator(const IExceptionTranslator* translator);\n  std::string translateActiveException() const override;\n  std::string tryTranslators() const;\n\nprivate:\n  std::vector<std::unique_ptr<IExceptionTranslator const>> m_translators;\n};\n} // namespace Catch\n\n// end catch_exception_translator_registry.h\n#ifdef __OBJC__\n#import \"Foundation/Foundation.h\"\n#endif\n\nnamespace Catch {\n\nExceptionTranslatorRegistry::~ExceptionTranslatorRegistry() {}\n\nvoid ExceptionTranslatorRegistry::registerTranslator(\n    const IExceptionTranslator* translator) {\n  m_translators.push_back(\n      std::unique_ptr<const IExceptionTranslator>(translator));\n}\n\n#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\nstd::string ExceptionTranslatorRegistry::translateActiveException() const {\n  try {\n#ifdef __OBJC__\n    // In Objective-C try objective-c exceptions first\n    @try {\n      return tryTranslators();\n    } @catch (NSException* exception) {\n      return Catch::Detail::stringify([exception description]);\n    }\n#else\n    // Compiling a mixed mode project with MSVC means that CLR\n    // exceptions will be caught in (...) as well. 
However, these\n    // do not fill-in std::current_exception and thus lead to crash\n    // when attempting rethrow.\n    // /EHa switch also causes structured exceptions to be caught\n    // here, but they fill-in current_exception properly, so\n    // at worst the output should be a little weird, instead of\n    // causing a crash.\n    if (std::current_exception() == nullptr) {\n      return \"Non C++ exception. Possibly a CLR exception.\";\n    }\n    return tryTranslators();\n#endif\n  } catch (TestFailureException&) {\n    std::rethrow_exception(std::current_exception());\n  } catch (std::exception& ex) {\n    return ex.what();\n  } catch (std::string& msg) {\n    return msg;\n  } catch (const char* msg) {\n    return msg;\n  } catch (...) {\n    return \"Unknown exception\";\n  }\n}\n\nstd::string ExceptionTranslatorRegistry::tryTranslators() const {\n  if (m_translators.empty()) {\n    std::rethrow_exception(std::current_exception());\n  } else {\n    return m_translators[0]->translate(m_translators.begin() + 1,\n                                       m_translators.end());\n  }\n}\n\n#else // ^^ Exceptions are enabled // Exceptions are disabled vv\nstd::string ExceptionTranslatorRegistry::translateActiveException() const {\n  CATCH_INTERNAL_ERROR(\"Attempted to translate active exception under \"\n                       \"CATCH_CONFIG_DISABLE_EXCEPTIONS!\");\n}\n\nstd::string ExceptionTranslatorRegistry::tryTranslators() const {\n  CATCH_INTERNAL_ERROR(\"Attempted to use exception translators under \"\n                       \"CATCH_CONFIG_DISABLE_EXCEPTIONS!\");\n}\n#endif\n\n} // namespace Catch\n// end catch_exception_translator_registry.cpp\n// start catch_fatal_condition.cpp\n\n#include <algorithm>\n\n#if !defined(CATCH_CONFIG_WINDOWS_SEH) && !defined(CATCH_CONFIG_POSIX_SIGNALS)\n\nnamespace Catch {\n\n// If neither SEH nor signal handling is required, the handler impls\n// do not have to do anything, and can be 
empty.\nFatalConditionHandler::engage_platform() {}\nFatalConditionHandler::disengage_platform() {}\nFatalConditionHandler::FatalConditionHandler()  = default;\nFatalConditionHandler::~FatalConditionHandler() = default;\n\n} // end namespace Catch\n\n#endif // !CATCH_CONFIG_WINDOWS_SEH && !CATCH_CONFIG_POSIX_SIGNALS\n\n#if defined(CATCH_CONFIG_WINDOWS_SEH) && defined(CATCH_CONFIG_POSIX_SIGNALS)\n#error                                                                         \\\n    \"Inconsistent configuration: Windows' SEH handling and POSIX signals cannot be enabled at the same time\"\n#endif // CATCH_CONFIG_WINDOWS_SEH && CATCH_CONFIG_POSIX_SIGNALS\n\n#if defined(CATCH_CONFIG_WINDOWS_SEH) || defined(CATCH_CONFIG_POSIX_SIGNALS)\n\nnamespace {\n//! Signals fatal error message to the run context\nvoid reportFatal(char const* const message) {\n  Catch::getCurrentContext().getResultCapture()->handleFatalErrorCondition(\n      message);\n}\n\n//! Minimal size Catch2 needs for its own fatal error handling.\n//! Picked anecdotally, so it might not be sufficient on all\n//! 
platforms, and for all configurations.\nconstexpr std::size_t minStackSizeForErrors = 32 * 1024;\n} // end unnamed namespace\n\n#endif // CATCH_CONFIG_WINDOWS_SEH || CATCH_CONFIG_POSIX_SIGNALS\n\n#if defined(CATCH_CONFIG_WINDOWS_SEH)\n\nnamespace Catch {\n\nstruct SignalDefs {\n  DWORD id;\n  const char* name;\n};\n\n// There is no 1-1 mapping between signals and windows exceptions.\n// Windows can easily distinguish between SO and SigSegV,\n// but SigInt, SigTerm, etc are handled differently.\nstatic SignalDefs signalDefs[] = {\n    {static_cast<DWORD>(EXCEPTION_ILLEGAL_INSTRUCTION),\n     \"SIGILL - Illegal instruction signal\"},\n    {static_cast<DWORD>(EXCEPTION_STACK_OVERFLOW), \"SIGSEGV - Stack overflow\"},\n    {static_cast<DWORD>(EXCEPTION_ACCESS_VIOLATION),\n     \"SIGSEGV - Segmentation violation signal\"},\n    {static_cast<DWORD>(EXCEPTION_INT_DIVIDE_BY_ZERO), \"Divide by zero error\"},\n};\n\nstatic LONG CALLBACK\nhandleVectoredException(PEXCEPTION_POINTERS ExceptionInfo) {\n  for (auto const& def : signalDefs) {\n    if (ExceptionInfo->ExceptionRecord->ExceptionCode == def.id) {\n      reportFatal(def.name);\n    }\n  }\n  // If its not an exception we care about, pass it along.\n  // This stops us from eating debugger breaks etc.\n  return EXCEPTION_CONTINUE_SEARCH;\n}\n\n// Since we do not support multiple instantiations, we put these\n// into global variables and rely on cleaning them up in outlined\n// constructors/destructors\nstatic PVOID exceptionHandlerHandle = nullptr;\n\n// For MSVC, we reserve part of the stack memory for handling\n// memory overflow structured exception.\nFatalConditionHandler::FatalConditionHandler() {\n  ULONG guaranteeSize = static_cast<ULONG>(minStackSizeForErrors);\n  if (!SetThreadStackGuarantee(&guaranteeSize)) {\n    // We do not want to fully error out, because needing\n    // the stack reserve should be rare enough anyway.\n    Catch::cerr() << \"Failed to reserve piece of stack.\"\n                  << \" Stack 
overflows will not be reported successfully.\";\n  }\n}\n\n// We do not attempt to unset the stack guarantee, because\n// Windows does not support lowering the stack size guarantee.\nFatalConditionHandler::~FatalConditionHandler() = default;\n\nvoid FatalConditionHandler::engage_platform() {\n  // Register as first handler in current chain\n  exceptionHandlerHandle =\n      AddVectoredExceptionHandler(1, handleVectoredException);\n  if (!exceptionHandlerHandle) {\n    CATCH_RUNTIME_ERROR(\"Could not register vectored exception handler\");\n  }\n}\n\nvoid FatalConditionHandler::disengage_platform() {\n  if (!RemoveVectoredExceptionHandler(exceptionHandlerHandle)) {\n    CATCH_RUNTIME_ERROR(\"Could not unregister vectored exception handler\");\n  }\n  exceptionHandlerHandle = nullptr;\n}\n\n} // end namespace Catch\n\n#endif // CATCH_CONFIG_WINDOWS_SEH\n\n#if defined(CATCH_CONFIG_POSIX_SIGNALS)\n\n#include <signal.h>\n\nnamespace Catch {\n\nstruct SignalDefs {\n  int id;\n  const char* name;\n};\n\nstatic SignalDefs signalDefs[] = {\n    {SIGINT, \"SIGINT - Terminal interrupt signal\"},\n    {SIGILL, \"SIGILL - Illegal instruction signal\"},\n    {SIGFPE, \"SIGFPE - Floating point error signal\"},\n    {SIGSEGV, \"SIGSEGV - Segmentation violation signal\"},\n    {SIGTERM, \"SIGTERM - Termination request signal\"},\n    {SIGABRT, \"SIGABRT - Abort (abnormal termination) signal\"}};\n\n// Older GCCs trigger -Wmissing-field-initializers for T foo = {}\n// which is zero initialization, but not explicit. We want to avoid\n// that.\n#if defined(__GNUC__)\n#pragma GCC diagnostic push\n#pragma GCC diagnostic ignored \"-Wmissing-field-initializers\"\n#endif\n\nstatic char* altStackMem        = nullptr;\nstatic std::size_t altStackSize = 0;\nstatic stack_t oldSigStack{};\nstatic struct sigaction\n    oldSigActions[sizeof(signalDefs) / sizeof(SignalDefs)]{};\n\nstatic void restorePreviousSignalHandlers() {\n  // We set signal handlers back to the previous ones. 
Hopefully\n  // nobody overwrote them in the meantime, and doesn't expect\n  // their signal handlers to live past ours given that they\n  // installed them after ours..\n  for (std::size_t i = 0; i < sizeof(signalDefs) / sizeof(SignalDefs); ++i) {\n    sigaction(signalDefs[i].id, &oldSigActions[i], nullptr);\n  }\n  // Return the old stack\n  sigaltstack(&oldSigStack, nullptr);\n}\n\nstatic void handleSignal(int sig) {\n  char const* name = \"<unknown signal>\";\n  for (auto const& def : signalDefs) {\n    if (sig == def.id) {\n      name = def.name;\n      break;\n    }\n  }\n  // We need to restore previous signal handlers and let them do\n  // their thing, so that the users can have the debugger break\n  // when a signal is raised, and so on.\n  restorePreviousSignalHandlers();\n  reportFatal(name);\n  raise(sig);\n}\n\nFatalConditionHandler::FatalConditionHandler() {\n  assert(!altStackMem &&\n         \"Cannot initialize POSIX signal handler when one already exists\");\n  if (altStackSize == 0) {\n    altStackSize =\n        std::max(static_cast<size_t>(SIGSTKSZ), minStackSizeForErrors);\n  }\n  altStackMem = new char[altStackSize]();\n}\n\nFatalConditionHandler::~FatalConditionHandler() {\n  delete[] altStackMem;\n  // We signal that another instance can be constructed by zeroing\n  // out the pointer.\n  altStackMem = nullptr;\n}\n\nvoid FatalConditionHandler::engage_platform() {\n  stack_t sigStack;\n  sigStack.ss_sp    = altStackMem;\n  sigStack.ss_size  = altStackSize;\n  sigStack.ss_flags = 0;\n  sigaltstack(&sigStack, &oldSigStack);\n  struct sigaction sa = {};\n\n  sa.sa_handler = handleSignal;\n  sa.sa_flags   = SA_ONSTACK;\n  for (std::size_t i = 0; i < sizeof(signalDefs) / sizeof(SignalDefs); ++i) {\n    sigaction(signalDefs[i].id, &sa, &oldSigActions[i]);\n  }\n}\n\n#if defined(__GNUC__)\n#pragma GCC diagnostic pop\n#endif\n\nvoid FatalConditionHandler::disengage_platform() {\n  restorePreviousSignalHandlers();\n}\n\n} // end namespace 
Catch\n\n#endif // CATCH_CONFIG_POSIX_SIGNALS\n// end catch_fatal_condition.cpp\n// start catch_generators.cpp\n\n#include <limits>\n#include <set>\n\nnamespace Catch {\n\nIGeneratorTracker::~IGeneratorTracker() {}\n\nconst char* GeneratorException::what() const noexcept { return m_msg; }\n\nnamespace Generators {\n\nGeneratorUntypedBase::~GeneratorUntypedBase() {}\n\nauto acquireGeneratorTracker(StringRef generatorName,\n                             SourceLineInfo const& lineInfo)\n    -> IGeneratorTracker& {\n  return getResultCapture().acquireGeneratorTracker(generatorName, lineInfo);\n}\n\n} // namespace Generators\n} // namespace Catch\n// end catch_generators.cpp\n// start catch_interfaces_capture.cpp\n\nnamespace Catch {\nIResultCapture::~IResultCapture() = default;\n}\n// end catch_interfaces_capture.cpp\n// start catch_interfaces_config.cpp\n\nnamespace Catch {\nIConfig::~IConfig() = default;\n}\n// end catch_interfaces_config.cpp\n// start catch_interfaces_exception.cpp\n\nnamespace Catch {\nIExceptionTranslator::~IExceptionTranslator()                 = default;\nIExceptionTranslatorRegistry::~IExceptionTranslatorRegistry() = default;\n} // namespace Catch\n// end catch_interfaces_exception.cpp\n// start catch_interfaces_registry_hub.cpp\n\nnamespace Catch {\nIRegistryHub::~IRegistryHub()               = default;\nIMutableRegistryHub::~IMutableRegistryHub() = default;\n} // namespace Catch\n// end catch_interfaces_registry_hub.cpp\n// start catch_interfaces_reporter.cpp\n\n// start catch_reporter_listening.h\n\nnamespace Catch {\n\nclass ListeningReporter : public IStreamingReporter {\n  using Reporters = std::vector<IStreamingReporterPtr>;\n  Reporters m_listeners;\n  IStreamingReporterPtr m_reporter = nullptr;\n  ReporterPreferences m_preferences;\n\npublic:\n  ListeningReporter();\n\n  void addListener(IStreamingReporterPtr&& listener);\n  void addReporter(IStreamingReporterPtr&& reporter);\n\npublic: // IStreamingReporter\n  ReporterPreferences 
getPreferences() const override;\n\n  void noMatchingTestCases(std::string const& spec) override;\n\n  void reportInvalidArguments(std::string const& arg) override;\n\n  static std::set<Verbosity> getSupportedVerbosities();\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n  void benchmarkPreparing(std::string const& name) override;\n  void benchmarkStarting(BenchmarkInfo const& benchmarkInfo) override;\n  void benchmarkEnded(BenchmarkStats<> const& benchmarkStats) override;\n  void benchmarkFailed(std::string const&) override;\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\n  void testRunStarting(TestRunInfo const& testRunInfo) override;\n  void testGroupStarting(GroupInfo const& groupInfo) override;\n  void testCaseStarting(TestCaseInfo const& testInfo) override;\n  void sectionStarting(SectionInfo const& sectionInfo) override;\n  void assertionStarting(AssertionInfo const& assertionInfo) override;\n\n  // The return value indicates if the messages buffer should be cleared:\n  bool assertionEnded(AssertionStats const& assertionStats) override;\n  void sectionEnded(SectionStats const& sectionStats) override;\n  void testCaseEnded(TestCaseStats const& testCaseStats) override;\n  void testGroupEnded(TestGroupStats const& testGroupStats) override;\n  void testRunEnded(TestRunStats const& testRunStats) override;\n\n  void skipTest(TestCaseInfo const& testInfo) override;\n  bool isMulti() const override;\n};\n\n} // end namespace Catch\n\n// end catch_reporter_listening.h\nnamespace Catch {\n\nReporterConfig::ReporterConfig(IConfigPtr const& _fullConfig)\n    : m_stream(&_fullConfig->stream()), m_fullConfig(_fullConfig) {}\n\nReporterConfig::ReporterConfig(IConfigPtr const& _fullConfig,\n                               std::ostream& _stream)\n    : m_stream(&_stream), m_fullConfig(_fullConfig) {}\n\nstd::ostream& ReporterConfig::stream() const { return *m_stream; }\nIConfigPtr ReporterConfig::fullConfig() const { return m_fullConfig; 
}\n\nTestRunInfo::TestRunInfo(std::string const& _name) : name(_name) {}\n\nGroupInfo::GroupInfo(std::string const& _name, std::size_t _groupIndex,\n                     std::size_t _groupsCount)\n    : name(_name), groupIndex(_groupIndex), groupsCounts(_groupsCount) {}\n\nAssertionStats::AssertionStats(AssertionResult const& _assertionResult,\n                               std::vector<MessageInfo> const& _infoMessages,\n                               Totals const& _totals)\n    : assertionResult(_assertionResult), infoMessages(_infoMessages),\n      totals(_totals) {\n  assertionResult.m_resultData.lazyExpression.m_transientExpression =\n      _assertionResult.m_resultData.lazyExpression.m_transientExpression;\n\n  if (assertionResult.hasMessage()) {\n    // Copy message into messages list.\n    // !TBD This should have been done earlier, somewhere\n    MessageBuilder builder(assertionResult.getTestMacroName(),\n                           assertionResult.getSourceInfo(),\n                           assertionResult.getResultType());\n    builder << assertionResult.getMessage();\n    builder.m_info.message = builder.m_stream.str();\n\n    infoMessages.push_back(builder.m_info);\n  }\n}\n\nAssertionStats::~AssertionStats() = default;\n\nSectionStats::SectionStats(SectionInfo const& _sectionInfo,\n                           Counts const& _assertions, double _durationInSeconds,\n                           bool _missingAssertions)\n    : sectionInfo(_sectionInfo), assertions(_assertions),\n      durationInSeconds(_durationInSeconds),\n      missingAssertions(_missingAssertions) {}\n\nSectionStats::~SectionStats() = default;\n\nTestCaseStats::TestCaseStats(TestCaseInfo const& _testInfo,\n                             Totals const& _totals, std::string const& _stdOut,\n                             std::string const& _stdErr, bool _aborting)\n    : testInfo(_testInfo), totals(_totals), stdOut(_stdOut), stdErr(_stdErr),\n      aborting(_aborting) 
{}\n\nTestCaseStats::~TestCaseStats() = default;\n\nTestGroupStats::TestGroupStats(GroupInfo const& _groupInfo,\n                               Totals const& _totals, bool _aborting)\n    : groupInfo(_groupInfo), totals(_totals), aborting(_aborting) {}\n\nTestGroupStats::TestGroupStats(GroupInfo const& _groupInfo)\n    : groupInfo(_groupInfo), aborting(false) {}\n\nTestGroupStats::~TestGroupStats() = default;\n\nTestRunStats::TestRunStats(TestRunInfo const& _runInfo, Totals const& _totals,\n                           bool _aborting)\n    : runInfo(_runInfo), totals(_totals), aborting(_aborting) {}\n\nTestRunStats::~TestRunStats() = default;\n\nvoid IStreamingReporter::fatalErrorEncountered(StringRef) {}\nbool IStreamingReporter::isMulti() const { return false; }\n\nIReporterFactory::~IReporterFactory()   = default;\nIReporterRegistry::~IReporterRegistry() = default;\n\n} // end namespace Catch\n// end catch_interfaces_reporter.cpp\n// start catch_interfaces_runner.cpp\n\nnamespace Catch {\nIRunner::~IRunner() = default;\n}\n// end catch_interfaces_runner.cpp\n// start catch_interfaces_testcase.cpp\n\nnamespace Catch {\nITestInvoker::~ITestInvoker()           = default;\nITestCaseRegistry::~ITestCaseRegistry() = default;\n} // namespace Catch\n// end catch_interfaces_testcase.cpp\n// start catch_leak_detector.cpp\n\n#ifdef CATCH_CONFIG_WINDOWS_CRTDBG\n#include <crtdbg.h>\n\nnamespace Catch {\n\nLeakDetector::LeakDetector() {\n  int flag = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);\n  flag |= _CRTDBG_LEAK_CHECK_DF;\n  flag |= _CRTDBG_ALLOC_MEM_DF;\n  _CrtSetDbgFlag(flag);\n  _CrtSetReportMode(_CRT_WARN, _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);\n  _CrtSetReportFile(_CRT_WARN, _CRTDBG_FILE_STDERR);\n  // Change this to leaking allocation's number to break there\n  _CrtSetBreakAlloc(-1);\n}\n} // namespace Catch\n\n#else\n\nCatch::LeakDetector::LeakDetector() {}\n\n#endif\n\nCatch::LeakDetector::~LeakDetector() { Catch::cleanUp(); }\n// end catch_leak_detector.cpp\n// start 
catch_list.cpp\n\n// start catch_list.h\n\n#include <set>\n\nnamespace Catch {\n\nstd::size_t listTests(Config const& config);\n\nstd::size_t listTestsNamesOnly(Config const& config);\n\nstruct TagInfo {\n  void add(std::string const& spelling);\n  std::string all() const;\n\n  std::set<std::string> spellings;\n  std::size_t count = 0;\n};\n\nstd::size_t listTags(Config const& config);\n\nstd::size_t listReporters();\n\nOption<std::size_t> list(std::shared_ptr<Config> const& config);\n\n} // end namespace Catch\n\n// end catch_list.h\n// start catch_text.h\n\nnamespace Catch {\nusing namespace clara::TextFlow;\n}\n\n// end catch_text.h\n#include <limits>\n#include <algorithm>\n#include <iomanip>\n\nnamespace Catch {\n\nstd::size_t listTests(Config const& config) {\n  TestSpec const& testSpec = config.testSpec();\n  if (config.hasTestFilters())\n    Catch::cout() << \"Matching test cases:\\n\";\n  else {\n    Catch::cout() << \"All available test cases:\\n\";\n  }\n\n  auto matchedTestCases =\n      filterTests(getAllTestCasesSorted(config), testSpec, config);\n  for (auto const& testCaseInfo : matchedTestCases) {\n    Colour::Code colour =\n        testCaseInfo.isHidden() ? 
Colour::SecondaryText : Colour::None;\n    Colour colourGuard(colour);\n\n    Catch::cout() << Column(testCaseInfo.name).initialIndent(2).indent(4)\n                  << \"\\n\";\n    if (config.verbosity() >= Verbosity::High) {\n      Catch::cout()\n          << Column(Catch::Detail::stringify(testCaseInfo.lineInfo)).indent(4)\n          << std::endl;\n      std::string description = testCaseInfo.description;\n      if (description.empty())\n        description = \"(NO DESCRIPTION)\";\n      Catch::cout() << Column(description).indent(4) << std::endl;\n    }\n    if (!testCaseInfo.tags.empty())\n      Catch::cout() << Column(testCaseInfo.tagsAsString()).indent(6) << \"\\n\";\n  }\n\n  if (!config.hasTestFilters())\n    Catch::cout() << pluralise(matchedTestCases.size(), \"test case\") << '\\n'\n                  << std::endl;\n  else\n    Catch::cout() << pluralise(matchedTestCases.size(), \"matching test case\")\n                  << '\\n'\n                  << std::endl;\n  return matchedTestCases.size();\n}\n\nstd::size_t listTestsNamesOnly(Config const& config) {\n  TestSpec const& testSpec = config.testSpec();\n  std::size_t matchedTests = 0;\n  std::vector<TestCase> matchedTestCases =\n      filterTests(getAllTestCasesSorted(config), testSpec, config);\n  for (auto const& testCaseInfo : matchedTestCases) {\n    matchedTests++;\n    if (startsWith(testCaseInfo.name, '#'))\n      Catch::cout() << '\"' << testCaseInfo.name << '\"';\n    else\n      Catch::cout() << testCaseInfo.name;\n    if (config.verbosity() >= Verbosity::High)\n      Catch::cout() << \"\\t@\" << testCaseInfo.lineInfo;\n    Catch::cout() << std::endl;\n  }\n  return matchedTests;\n}\n\nvoid TagInfo::add(std::string const& spelling) {\n  ++count;\n  spellings.insert(spelling);\n}\n\nstd::string TagInfo::all() const {\n  size_t size = 0;\n  for (auto const& spelling : spellings) {\n    // Add 2 for the brackes\n    size += spelling.size() + 2;\n  }\n\n  std::string out;\n  out.reserve(size);\n 
 for (auto const& spelling : spellings) {\n    out += '[';\n    out += spelling;\n    out += ']';\n  }\n  return out;\n}\n\nstd::size_t listTags(Config const& config) {\n  TestSpec const& testSpec = config.testSpec();\n  if (config.hasTestFilters())\n    Catch::cout() << \"Tags for matching test cases:\\n\";\n  else {\n    Catch::cout() << \"All available tags:\\n\";\n  }\n\n  std::map<std::string, TagInfo> tagCounts;\n\n  std::vector<TestCase> matchedTestCases =\n      filterTests(getAllTestCasesSorted(config), testSpec, config);\n  for (auto const& testCase : matchedTestCases) {\n    for (auto const& tagName : testCase.getTestCaseInfo().tags) {\n      std::string lcaseTagName = toLower(tagName);\n      auto countIt             = tagCounts.find(lcaseTagName);\n      if (countIt == tagCounts.end())\n        countIt =\n            tagCounts.insert(std::make_pair(lcaseTagName, TagInfo())).first;\n      countIt->second.add(tagName);\n    }\n  }\n\n  for (auto const& tagCount : tagCounts) {\n    ReusableStringStream rss;\n    rss << \"  \" << std::setw(2) << tagCount.second.count << \"  \";\n    auto str     = rss.str();\n    auto wrapper = Column(tagCount.second.all())\n                       .initialIndent(0)\n                       .indent(str.size())\n                       .width(CATCH_CONFIG_CONSOLE_WIDTH - 10);\n    Catch::cout() << str << wrapper << '\\n';\n  }\n  Catch::cout() << pluralise(tagCounts.size(), \"tag\") << '\\n' << std::endl;\n  return tagCounts.size();\n}\n\nstd::size_t listReporters() {\n  Catch::cout() << \"Available reporters:\\n\";\n  IReporterRegistry::FactoryMap const& factories =\n      getRegistryHub().getReporterRegistry().getFactories();\n  std::size_t maxNameLen = 0;\n  for (auto const& factoryKvp : factories)\n    maxNameLen = (std::max)(maxNameLen, factoryKvp.first.size());\n\n  for (auto const& factoryKvp : factories) {\n    Catch::cout()\n        << Column(factoryKvp.first + \":\").indent(2).width(5 + maxNameLen) +\n               
Column(factoryKvp.second->getDescription())\n                   .initialIndent(0)\n                   .indent(2)\n                   .width(CATCH_CONFIG_CONSOLE_WIDTH - maxNameLen - 8)\n        << \"\\n\";\n  }\n  Catch::cout() << std::endl;\n  return factories.size();\n}\n\nOption<std::size_t> list(std::shared_ptr<Config> const& config) {\n  Option<std::size_t> listedCount;\n  getCurrentMutableContext().setConfig(config);\n  if (config->listTests())\n    listedCount = listedCount.valueOr(0) + listTests(*config);\n  if (config->listTestNamesOnly())\n    listedCount = listedCount.valueOr(0) + listTestsNamesOnly(*config);\n  if (config->listTags())\n    listedCount = listedCount.valueOr(0) + listTags(*config);\n  if (config->listReporters())\n    listedCount = listedCount.valueOr(0) + listReporters();\n  return listedCount;\n}\n\n} // end namespace Catch\n// end catch_list.cpp\n// start catch_matchers.cpp\n\nnamespace Catch {\nnamespace Matchers {\nnamespace Impl {\n\nstd::string MatcherUntypedBase::toString() const {\n  if (m_cachedToString.empty())\n    m_cachedToString = describe();\n  return m_cachedToString;\n}\n\nMatcherUntypedBase::~MatcherUntypedBase() = default;\n\n} // namespace Impl\n} // namespace Matchers\n\nusing namespace Matchers;\nusing Matchers::Impl::MatcherBase;\n\n} // namespace Catch\n// end catch_matchers.cpp\n// start catch_matchers_exception.cpp\n\nnamespace Catch {\nnamespace Matchers {\nnamespace Exception {\n\nbool ExceptionMessageMatcher::match(std::exception const& ex) const {\n  return ex.what() == m_message;\n}\n\nstd::string ExceptionMessageMatcher::describe() const {\n  return \"exception message matches \\\"\" + m_message + \"\\\"\";\n}\n\n} // namespace Exception\nException::ExceptionMessageMatcher Message(std::string const& message) {\n  return Exception::ExceptionMessageMatcher(message);\n}\n\n// namespace Exception\n} // namespace Matchers\n} // namespace Catch\n// end catch_matchers_exception.cpp\n// start 
catch_matchers_floating.cpp\n\n// start catch_polyfills.hpp\n\nnamespace Catch {\nbool isnan(float f);\nbool isnan(double d);\n} // namespace Catch\n\n// end catch_polyfills.hpp\n// start catch_to_string.hpp\n\n#include <string>\n\nnamespace Catch {\ntemplate <typename T>\nstd::string to_string(T const& t) {\n#if defined(CATCH_CONFIG_CPP11_TO_STRING)\n  return std::to_string(t);\n#else\n  ReusableStringStream rss;\n  rss << t;\n  return rss.str();\n#endif\n}\n} // end namespace Catch\n\n// end catch_to_string.hpp\n#include <algorithm>\n#include <cmath>\n#include <cstdlib>\n#include <cstdint>\n#include <cstring>\n#include <sstream>\n#include <type_traits>\n#include <iomanip>\n#include <limits>\n\nnamespace Catch {\nnamespace {\n\nint32_t convert(float f) {\n  static_assert(sizeof(float) == sizeof(int32_t),\n                \"Important ULP matcher assumption violated\");\n  int32_t i;\n  std::memcpy(&i, &f, sizeof(f));\n  return i;\n}\n\nint64_t convert(double d) {\n  static_assert(sizeof(double) == sizeof(int64_t),\n                \"Important ULP matcher assumption violated\");\n  int64_t i;\n  std::memcpy(&i, &d, sizeof(d));\n  return i;\n}\n\ntemplate <typename FP>\nbool almostEqualUlps(FP lhs, FP rhs, uint64_t maxUlpDiff) {\n  // Comparison with NaN should always be false.\n  // This way we can rule it out before getting into the ugly details\n  if (Catch::isnan(lhs) || Catch::isnan(rhs)) {\n    return false;\n  }\n\n  auto lc = convert(lhs);\n  auto rc = convert(rhs);\n\n  if ((lc < 0) != (rc < 0)) {\n    // Potentially we can have +0 and -0\n    return lhs == rhs;\n  }\n\n  // static cast as a workaround for IBM XLC\n  auto ulpDiff = std::abs(static_cast<FP>(lc - rc));\n  return static_cast<uint64_t>(ulpDiff) <= maxUlpDiff;\n}\n\n#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)\n\nfloat nextafter(float x, float y) { return ::nextafterf(x, y); }\n\ndouble nextafter(double x, double y) { return ::nextafter(x, y); }\n\n#endif // ^^^ CATCH_CONFIG_GLOBAL_NEXTAFTER 
^^^\n\ntemplate <typename FP>\nFP step(FP start, FP direction, uint64_t steps) {\n  for (uint64_t i = 0; i < steps; ++i) {\n#if defined(CATCH_CONFIG_GLOBAL_NEXTAFTER)\n    start = Catch::nextafter(start, direction);\n#else\n    start = std::nextafter(start, direction);\n#endif\n  }\n  return start;\n}\n\n// Performs equivalent check of std::fabs(lhs - rhs) <= margin\n// But without the subtraction to allow for INFINITY in comparison\nbool marginComparison(double lhs, double rhs, double margin) {\n  return (lhs + margin >= rhs) && (rhs + margin >= lhs);\n}\n\ntemplate <typename FloatingPoint>\nvoid write(std::ostream& out, FloatingPoint num) {\n  out << std::scientific\n      << std::setprecision(std::numeric_limits<FloatingPoint>::max_digits10 - 1)\n      << num;\n}\n\n} // end anonymous namespace\n\nnamespace Matchers {\nnamespace Floating {\n\nenum class FloatingPointKind : uint8_t { Float, Double };\n\nWithinAbsMatcher::WithinAbsMatcher(double target, double margin)\n    : m_target{target}, m_margin{margin} {\n  CATCH_ENFORCE(margin >= 0,\n                \"Invalid margin: \" << margin << '.'\n                                   << \" Margin has to be non-negative.\");\n}\n\n// Performs equivalent check of std::fabs(lhs - rhs) <= margin\n// But without the subtraction to allow for INFINITY in comparison\nbool WithinAbsMatcher::match(double const& matchee) const {\n  return (matchee + m_margin >= m_target) && (m_target + m_margin >= matchee);\n}\n\nstd::string WithinAbsMatcher::describe() const {\n  return \"is within \" + ::Catch::Detail::stringify(m_margin) + \" of \" +\n         ::Catch::Detail::stringify(m_target);\n}\n\nWithinUlpsMatcher::WithinUlpsMatcher(double target, uint64_t ulps,\n                                     FloatingPointKind baseType)\n    : m_target{target}, m_ulps{ulps}, m_type{baseType} {\n  CATCH_ENFORCE(m_type == FloatingPointKind::Double ||\n                    m_ulps < (std::numeric_limits<uint32_t>::max)(),\n                \"Provided 
ULP is impossibly large for a float comparison.\");\n}\n\n#if defined(__clang__)\n#pragma clang diagnostic push\n// Clang <3.5 reports on the default branch in the switch below\n#pragma clang diagnostic ignored \"-Wunreachable-code\"\n#endif\n\nbool WithinUlpsMatcher::match(double const& matchee) const {\n  switch (m_type) {\n  case FloatingPointKind::Float:\n    return almostEqualUlps<float>(static_cast<float>(matchee),\n                                  static_cast<float>(m_target), m_ulps);\n  case FloatingPointKind::Double:\n    return almostEqualUlps<double>(matchee, m_target, m_ulps);\n  default:\n    CATCH_INTERNAL_ERROR(\"Unknown FloatingPointKind value\");\n  }\n}\n\n#if defined(__clang__)\n#pragma clang diagnostic pop\n#endif\n\nstd::string WithinUlpsMatcher::describe() const {\n  std::stringstream ret;\n\n  ret << \"is within \" << m_ulps << \" ULPs of \";\n\n  if (m_type == FloatingPointKind::Float) {\n    write(ret, static_cast<float>(m_target));\n    ret << 'f';\n  } else {\n    write(ret, m_target);\n  }\n\n  ret << \" ([\";\n  if (m_type == FloatingPointKind::Double) {\n    write(ret, step(m_target, static_cast<double>(-INFINITY), m_ulps));\n    ret << \", \";\n    write(ret, step(m_target, static_cast<double>(INFINITY), m_ulps));\n  } else {\n    // We have to cast INFINITY to float because of MinGW, see #1782\n    write(ret, step(static_cast<float>(m_target), static_cast<float>(-INFINITY),\n                    m_ulps));\n    ret << \", \";\n    write(ret, step(static_cast<float>(m_target), static_cast<float>(INFINITY),\n                    m_ulps));\n  }\n  ret << \"])\";\n\n  return ret.str();\n}\n\nWithinRelMatcher::WithinRelMatcher(double target, double epsilon)\n    : m_target(target), m_epsilon(epsilon) {\n  CATCH_ENFORCE(m_epsilon >= 0.,\n                \"Relative comparison with epsilon <  0 does not make sense.\");\n  CATCH_ENFORCE(m_epsilon < 1.,\n                \"Relative comparison with epsilon >= 1 does not make sense.\");\n}\n\nbool 
WithinRelMatcher::match(double const& matchee) const {\n  const auto relMargin =\n      m_epsilon * (std::max)(std::fabs(matchee), std::fabs(m_target));\n  return marginComparison(matchee, m_target,\n                          std::isinf(relMargin) ? 0 : relMargin);\n}\n\nstd::string WithinRelMatcher::describe() const {\n  Catch::ReusableStringStream sstr;\n  sstr << \"and \" << m_target << \" are within \" << m_epsilon * 100.\n       << \"% of each other\";\n  return sstr.str();\n}\n\n} // namespace Floating\n\nFloating::WithinUlpsMatcher WithinULP(double target, uint64_t maxUlpDiff) {\n  return Floating::WithinUlpsMatcher(target, maxUlpDiff,\n                                     Floating::FloatingPointKind::Double);\n}\n\nFloating::WithinUlpsMatcher WithinULP(float target, uint64_t maxUlpDiff) {\n  return Floating::WithinUlpsMatcher(target, maxUlpDiff,\n                                     Floating::FloatingPointKind::Float);\n}\n\nFloating::WithinAbsMatcher WithinAbs(double target, double margin) {\n  return Floating::WithinAbsMatcher(target, margin);\n}\n\nFloating::WithinRelMatcher WithinRel(double target, double eps) {\n  return Floating::WithinRelMatcher(target, eps);\n}\n\nFloating::WithinRelMatcher WithinRel(double target) {\n  return Floating::WithinRelMatcher(\n      target, std::numeric_limits<double>::epsilon() * 100);\n}\n\nFloating::WithinRelMatcher WithinRel(float target, float eps) {\n  return Floating::WithinRelMatcher(target, eps);\n}\n\nFloating::WithinRelMatcher WithinRel(float target) {\n  return Floating::WithinRelMatcher(\n      target, std::numeric_limits<float>::epsilon() * 100);\n}\n\n} // namespace Matchers\n} // namespace Catch\n// end catch_matchers_floating.cpp\n// start catch_matchers_generic.cpp\n\nstd::string\nCatch::Matchers::Generic::Detail::finalizeDescription(const std::string& desc) {\n  if (desc.empty()) {\n    return \"matches undescribed predicate\";\n  } else {\n    return \"matches predicate: \\\"\" + desc + '\"';\n  
}\n}\n// end catch_matchers_generic.cpp\n// start catch_matchers_string.cpp\n\n#include <regex>\n\nnamespace Catch {\nnamespace Matchers {\n\nnamespace StdString {\n\nCasedString::CasedString(std::string const& str,\n                         CaseSensitive::Choice caseSensitivity)\n    : m_caseSensitivity(caseSensitivity), m_str(adjustString(str)) {}\nstd::string CasedString::adjustString(std::string const& str) const {\n  return m_caseSensitivity == CaseSensitive::No ? toLower(str) : str;\n}\nstd::string CasedString::caseSensitivitySuffix() const {\n  return m_caseSensitivity == CaseSensitive::No ? \" (case insensitive)\"\n                                                : std::string();\n}\n\nStringMatcherBase::StringMatcherBase(std::string const& operation,\n                                     CasedString const& comparator)\n    : m_comparator(comparator), m_operation(operation) {}\n\nstd::string StringMatcherBase::describe() const {\n  std::string description;\n  description.reserve(5 + m_operation.size() + m_comparator.m_str.size() +\n                      m_comparator.caseSensitivitySuffix().size());\n  description += m_operation;\n  description += \": \\\"\";\n  description += m_comparator.m_str;\n  description += \"\\\"\";\n  description += m_comparator.caseSensitivitySuffix();\n  return description;\n}\n\nEqualsMatcher::EqualsMatcher(CasedString const& comparator)\n    : StringMatcherBase(\"equals\", comparator) {}\n\nbool EqualsMatcher::match(std::string const& source) const {\n  return m_comparator.adjustString(source) == m_comparator.m_str;\n}\n\nContainsMatcher::ContainsMatcher(CasedString const& comparator)\n    : StringMatcherBase(\"contains\", comparator) {}\n\nbool ContainsMatcher::match(std::string const& source) const {\n  return contains(m_comparator.adjustString(source), m_comparator.m_str);\n}\n\nStartsWithMatcher::StartsWithMatcher(CasedString const& comparator)\n    : StringMatcherBase(\"starts with\", comparator) {}\n\nbool 
StartsWithMatcher::match(std::string const& source) const {\n  return startsWith(m_comparator.adjustString(source), m_comparator.m_str);\n}\n\nEndsWithMatcher::EndsWithMatcher(CasedString const& comparator)\n    : StringMatcherBase(\"ends with\", comparator) {}\n\nbool EndsWithMatcher::match(std::string const& source) const {\n  return endsWith(m_comparator.adjustString(source), m_comparator.m_str);\n}\n\nRegexMatcher::RegexMatcher(std::string regex,\n                           CaseSensitive::Choice caseSensitivity)\n    : m_regex(std::move(regex)), m_caseSensitivity(caseSensitivity) {}\n\nbool RegexMatcher::match(std::string const& matchee) const {\n  auto flags =\n      std::regex::ECMAScript; // ECMAScript is the default syntax option anyway\n  if (m_caseSensitivity == CaseSensitive::Choice::No) {\n    flags |= std::regex::icase;\n  }\n  auto reg = std::regex(m_regex, flags);\n  return std::regex_match(matchee, reg);\n}\n\nstd::string RegexMatcher::describe() const {\n  return \"matches \" + ::Catch::Detail::stringify(m_regex) +\n         ((m_caseSensitivity == CaseSensitive::Choice::Yes)\n              ? 
\" case sensitively\"\n              : \" case insensitively\");\n}\n\n} // namespace StdString\n\nStdString::EqualsMatcher Equals(std::string const& str,\n                                CaseSensitive::Choice caseSensitivity) {\n  return StdString::EqualsMatcher(StdString::CasedString(str, caseSensitivity));\n}\nStdString::ContainsMatcher Contains(std::string const& str,\n                                    CaseSensitive::Choice caseSensitivity) {\n  return StdString::ContainsMatcher(\n      StdString::CasedString(str, caseSensitivity));\n}\nStdString::EndsWithMatcher EndsWith(std::string const& str,\n                                    CaseSensitive::Choice caseSensitivity) {\n  return StdString::EndsWithMatcher(\n      StdString::CasedString(str, caseSensitivity));\n}\nStdString::StartsWithMatcher StartsWith(std::string const& str,\n                                        CaseSensitive::Choice caseSensitivity) {\n  return StdString::StartsWithMatcher(\n      StdString::CasedString(str, caseSensitivity));\n}\n\nStdString::RegexMatcher Matches(std::string const& regex,\n                                CaseSensitive::Choice caseSensitivity) {\n  return StdString::RegexMatcher(regex, caseSensitivity);\n}\n\n} // namespace Matchers\n} // namespace Catch\n// end catch_matchers_string.cpp\n// start catch_message.cpp\n\n// start catch_uncaught_exceptions.h\n\nnamespace Catch {\nbool uncaught_exceptions();\n} // end namespace Catch\n\n// end catch_uncaught_exceptions.h\n#include <cassert>\n#include <stack>\n\nnamespace Catch {\n\nMessageInfo::MessageInfo(StringRef const& _macroName,\n                         SourceLineInfo const& _lineInfo,\n                         ResultWas::OfType _type)\n    : macroName(_macroName), lineInfo(_lineInfo), type(_type),\n      sequence(++globalCount) {}\n\nbool MessageInfo::operator==(MessageInfo const& other) const {\n  return sequence == other.sequence;\n}\n\nbool MessageInfo::operator<(MessageInfo const& other) const {\n  return 
sequence < other.sequence;\n}\n\n// This may need protecting if threading support is added\nunsigned int MessageInfo::globalCount = 0;\n\n////////////////////////////////////////////////////////////////////////////\n\nCatch::MessageBuilder::MessageBuilder(StringRef const& macroName,\n                                      SourceLineInfo const& lineInfo,\n                                      ResultWas::OfType type)\n    : m_info(macroName, lineInfo, type) {}\n\n////////////////////////////////////////////////////////////////////////////\n\nScopedMessage::ScopedMessage(MessageBuilder const& builder)\n    : m_info(builder.m_info), m_moved() {\n  m_info.message = builder.m_stream.str();\n  getResultCapture().pushScopedMessage(m_info);\n}\n\nScopedMessage::ScopedMessage(ScopedMessage&& old)\n    : m_info(old.m_info), m_moved() {\n  old.m_moved = true;\n}\n\nScopedMessage::~ScopedMessage() {\n  if (!uncaught_exceptions() && !m_moved) {\n    getResultCapture().popScopedMessage(m_info);\n  }\n}\n\nCapturer::Capturer(StringRef macroName, SourceLineInfo const& lineInfo,\n                   ResultWas::OfType resultType, StringRef names) {\n  auto trimmed = [&](size_t start, size_t end) {\n    while (names[start] == ',' ||\n           isspace(static_cast<unsigned char>(names[start]))) {\n      ++start;\n    }\n    while (names[end] == ',' ||\n           isspace(static_cast<unsigned char>(names[end]))) {\n      --end;\n    }\n    return names.substr(start, end - start + 1);\n  };\n  auto skipq = [&](size_t start, char quote) {\n    for (auto i = start + 1; i < names.size(); ++i) {\n      if (names[i] == quote)\n        return i;\n      if (names[i] == '\\\\')\n        ++i;\n    }\n    CATCH_INTERNAL_ERROR(\"CAPTURE parsing encountered unmatched quote\");\n  };\n\n  size_t start = 0;\n  std::stack<char> openings;\n  for (size_t pos = 0; pos < names.size(); ++pos) {\n    char c = names[pos];\n    switch (c) {\n    case '[':\n    case '{':\n    case '(':\n      // It is basically 
impossible to disambiguate between\n      // comparison and start of template args in this context\n      //            case '<':\n      openings.push(c);\n      break;\n    case ']':\n    case '}':\n    case ')':\n      //           case '>':\n      openings.pop();\n      break;\n    case '\"':\n    case '\\'':\n      pos = skipq(pos, c);\n      break;\n    case ',':\n      if (start != pos && openings.empty()) {\n        m_messages.emplace_back(macroName, lineInfo, resultType);\n        m_messages.back().message =\n            static_cast<std::string>(trimmed(start, pos));\n        m_messages.back().message += \" := \";\n        start = pos;\n      }\n    }\n  }\n  assert(openings.empty() && \"Mismatched openings\");\n  m_messages.emplace_back(macroName, lineInfo, resultType);\n  m_messages.back().message =\n      static_cast<std::string>(trimmed(start, names.size() - 1));\n  m_messages.back().message += \" := \";\n}\nCapturer::~Capturer() {\n  if (!uncaught_exceptions()) {\n    assert(m_captured == m_messages.size());\n    for (size_t i = 0; i < m_captured; ++i)\n      m_resultCapture.popScopedMessage(m_messages[i]);\n  }\n}\n\nvoid Capturer::captureValue(size_t index, std::string const& value) {\n  assert(index < m_messages.size());\n  m_messages[index].message += value;\n  m_resultCapture.pushScopedMessage(m_messages[index]);\n  m_captured++;\n}\n\n} // end namespace Catch\n// end catch_message.cpp\n// start catch_output_redirect.cpp\n\n// start catch_output_redirect.h\n#ifndef TWOBLUECUBES_CATCH_OUTPUT_REDIRECT_H\n#define TWOBLUECUBES_CATCH_OUTPUT_REDIRECT_H\n\n#include <cstdio>\n#include <iosfwd>\n#include <string>\n\nnamespace Catch {\n\nclass RedirectedStream {\n  std::ostream& m_originalStream;\n  std::ostream& m_redirectionStream;\n  std::streambuf* m_prevBuf;\n\npublic:\n  RedirectedStream(std::ostream& originalStream,\n                   std::ostream& redirectionStream);\n  ~RedirectedStream();\n};\n\nclass RedirectedStdOut {\n  ReusableStringStream 
m_rss;\n  RedirectedStream m_cout;\n\npublic:\n  RedirectedStdOut();\n  auto str() const -> std::string;\n};\n\n// StdErr has two constituent streams in C++, std::cerr and std::clog\n// This means that we need to redirect 2 streams into 1 to keep proper\n// order of writes\nclass RedirectedStdErr {\n  ReusableStringStream m_rss;\n  RedirectedStream m_cerr;\n  RedirectedStream m_clog;\n\npublic:\n  RedirectedStdErr();\n  auto str() const -> std::string;\n};\n\nclass RedirectedStreams {\npublic:\n  RedirectedStreams(RedirectedStreams const&)            = delete;\n  RedirectedStreams& operator=(RedirectedStreams const&) = delete;\n  RedirectedStreams(RedirectedStreams&&)                 = delete;\n  RedirectedStreams& operator=(RedirectedStreams&&)      = delete;\n\n  RedirectedStreams(std::string& redirectedCout, std::string& redirectedCerr);\n  ~RedirectedStreams();\n\nprivate:\n  std::string& m_redirectedCout;\n  std::string& m_redirectedCerr;\n  RedirectedStdOut m_redirectedStdOut;\n  RedirectedStdErr m_redirectedStdErr;\n};\n\n#if defined(CATCH_CONFIG_NEW_CAPTURE)\n\n// Windows's implementation of std::tmpfile is terrible (it tries\n// to create a file inside system folder, thus requiring elevated\n// privileges for the binary), so we have to use tmpnam(_s) and\n// create the file ourselves there.\nclass TempFile {\npublic:\n  TempFile(TempFile const&)            = delete;\n  TempFile& operator=(TempFile const&) = delete;\n  TempFile(TempFile&&)                 = delete;\n  TempFile& operator=(TempFile&&)      = delete;\n\n  TempFile();\n  ~TempFile();\n\n  std::FILE* getFile();\n  std::string getContents();\n\nprivate:\n  std::FILE* m_file = nullptr;\n#if defined(_MSC_VER)\n  char m_buffer[L_tmpnam] = {0};\n#endif\n};\n\nclass OutputRedirect {\npublic:\n  OutputRedirect(OutputRedirect const&)            = delete;\n  OutputRedirect& operator=(OutputRedirect const&) = delete;\n  OutputRedirect(OutputRedirect&&)                 = delete;\n  OutputRedirect& 
operator=(OutputRedirect&&)      = delete;\n\n  OutputRedirect(std::string& stdout_dest, std::string& stderr_dest);\n  ~OutputRedirect();\n\nprivate:\n  int m_originalStdout = -1;\n  int m_originalStderr = -1;\n  TempFile m_stdoutFile;\n  TempFile m_stderrFile;\n  std::string& m_stdoutDest;\n  std::string& m_stderrDest;\n};\n\n#endif\n\n} // end namespace Catch\n\n#endif // TWOBLUECUBES_CATCH_OUTPUT_REDIRECT_H\n// end catch_output_redirect.h\n#include <cstdio>\n#include <cstring>\n#include <fstream>\n#include <sstream>\n#include <stdexcept>\n\n#if defined(CATCH_CONFIG_NEW_CAPTURE)\n#if defined(_MSC_VER)\n#include <io.h> //_dup and _dup2\n#define dup _dup\n#define dup2 _dup2\n#define fileno _fileno\n#else\n#include <unistd.h> // dup and dup2\n#endif\n#endif\n\nnamespace Catch {\n\nRedirectedStream::RedirectedStream(std::ostream& originalStream,\n                                   std::ostream& redirectionStream)\n    : m_originalStream(originalStream), m_redirectionStream(redirectionStream),\n      m_prevBuf(m_originalStream.rdbuf()) {\n  m_originalStream.rdbuf(m_redirectionStream.rdbuf());\n}\n\nRedirectedStream::~RedirectedStream() { m_originalStream.rdbuf(m_prevBuf); }\n\nRedirectedStdOut::RedirectedStdOut() : m_cout(Catch::cout(), m_rss.get()) {}\nauto RedirectedStdOut::str() const -> std::string { return m_rss.str(); }\n\nRedirectedStdErr::RedirectedStdErr()\n    : m_cerr(Catch::cerr(), m_rss.get()), m_clog(Catch::clog(), m_rss.get()) {}\nauto RedirectedStdErr::str() const -> std::string { return m_rss.str(); }\n\nRedirectedStreams::RedirectedStreams(std::string& redirectedCout,\n                                     std::string& redirectedCerr)\n    : m_redirectedCout(redirectedCout), m_redirectedCerr(redirectedCerr) {}\n\nRedirectedStreams::~RedirectedStreams() {\n  m_redirectedCout += m_redirectedStdOut.str();\n  m_redirectedCerr += m_redirectedStdErr.str();\n}\n\n#if defined(CATCH_CONFIG_NEW_CAPTURE)\n\n#if defined(_MSC_VER)\nTempFile::TempFile() {\n  if 
(tmpnam_s(m_buffer)) {\n    CATCH_RUNTIME_ERROR(\"Could not get a temp filename\");\n  }\n  if (fopen_s(&m_file, m_buffer, \"w+\")) {\n    char buffer[100];\n    if (strerror_s(buffer, errno)) {\n      CATCH_RUNTIME_ERROR(\"Could not translate errno to a string\");\n    }\n    CATCH_RUNTIME_ERROR(\"Could not open the temp file: '\"\n                        << m_buffer << \"' because: \" << buffer);\n  }\n}\n#else\nTempFile::TempFile() {\n  m_file = std::tmpfile();\n  if (!m_file) {\n    CATCH_RUNTIME_ERROR(\"Could not create a temp file.\");\n  }\n}\n\n#endif\n\nTempFile::~TempFile() {\n  // TBD: What to do about errors here?\n  std::fclose(m_file);\n  // We manually create the file on Windows only, on Linux\n  // it will be autodeleted\n#if defined(_MSC_VER)\n  std::remove(m_buffer);\n#endif\n}\n\nFILE* TempFile::getFile() { return m_file; }\n\nstd::string TempFile::getContents() {\n  std::stringstream sstr;\n  char buffer[100] = {};\n  std::rewind(m_file);\n  while (std::fgets(buffer, sizeof(buffer), m_file)) {\n    sstr << buffer;\n  }\n  return sstr.str();\n}\n\nOutputRedirect::OutputRedirect(std::string& stdout_dest,\n                               std::string& stderr_dest)\n    : m_originalStdout(dup(1)), m_originalStderr(dup(2)),\n      m_stdoutDest(stdout_dest), m_stderrDest(stderr_dest) {\n  dup2(fileno(m_stdoutFile.getFile()), 1);\n  dup2(fileno(m_stderrFile.getFile()), 2);\n}\n\nOutputRedirect::~OutputRedirect() {\n  Catch::cout() << std::flush;\n  fflush(stdout);\n  // Since we support overriding these streams, we flush cerr\n  // even though std::cerr is unbuffered\n  Catch::cerr() << std::flush;\n  Catch::clog() << std::flush;\n  fflush(stderr);\n\n  dup2(m_originalStdout, 1);\n  dup2(m_originalStderr, 2);\n\n  m_stdoutDest += m_stdoutFile.getContents();\n  m_stderrDest += m_stderrFile.getContents();\n}\n\n#endif // CATCH_CONFIG_NEW_CAPTURE\n\n} // namespace Catch\n\n#if defined(CATCH_CONFIG_NEW_CAPTURE)\n#if defined(_MSC_VER)\n#undef dup\n#undef 
dup2\n#undef fileno\n#endif\n#endif\n// end catch_output_redirect.cpp\n// start catch_polyfills.cpp\n\n#include <cmath>\n\nnamespace Catch {\n\n#if !defined(CATCH_CONFIG_POLYFILL_ISNAN)\nbool isnan(float f) { return std::isnan(f); }\nbool isnan(double d) { return std::isnan(d); }\n#else\n// For now we only use this for embarcadero\nbool isnan(float f) { return std::_isnan(f); }\nbool isnan(double d) { return std::_isnan(d); }\n#endif\n\n} // end namespace Catch\n// end catch_polyfills.cpp\n// start catch_random_number_generator.cpp\n\nnamespace Catch {\n\nnamespace {\n\n#if defined(_MSC_VER)\n#pragma warning(push)\n#pragma warning(disable : 4146) // we negate uint32 during the rotate\n#endif\n// Safe rotr implementation thanks to John Regehr\nuint32_t rotate_right(uint32_t val, uint32_t count) {\n  const uint32_t mask = 31;\n  count &= mask;\n  return (val >> count) | (val << (-count & mask));\n}\n\n#if defined(_MSC_VER)\n#pragma warning(pop)\n#endif\n\n} // namespace\n\nSimplePcg32::SimplePcg32(result_type seed_) { seed(seed_); }\n\nvoid SimplePcg32::seed(result_type seed_) {\n  m_state = 0;\n  (*this)();\n  m_state += seed_;\n  (*this)();\n}\n\nvoid SimplePcg32::discard(uint64_t skip) {\n  // We could implement this to run in O(log n) steps, but this\n  // should suffice for our use case.\n  for (uint64_t s = 0; s < skip; ++s) {\n    static_cast<void>((*this)());\n  }\n}\n\nSimplePcg32::result_type SimplePcg32::operator()() {\n  // prepare the output value\n  const uint32_t xorshifted =\n      static_cast<uint32_t>(((m_state >> 18u) ^ m_state) >> 27u);\n  const auto output = rotate_right(xorshifted, m_state >> 59u);\n\n  // advance state\n  m_state = m_state * 6364136223846793005ULL + s_inc;\n\n  return output;\n}\n\nbool operator==(SimplePcg32 const& lhs, SimplePcg32 const& rhs) {\n  return lhs.m_state == rhs.m_state;\n}\n\nbool operator!=(SimplePcg32 const& lhs, SimplePcg32 const& rhs) {\n  return lhs.m_state != rhs.m_state;\n}\n} // namespace Catch\n// end 
catch_random_number_generator.cpp\n// start catch_registry_hub.cpp\n\n// start catch_test_case_registry_impl.h\n\n#include <vector>\n#include <set>\n#include <algorithm>\n#include <ios>\n\nnamespace Catch {\n\nclass TestCase;\nstruct IConfig;\n\nstd::vector<TestCase> sortTests(IConfig const& config,\n                                std::vector<TestCase> const& unsortedTestCases);\n\nbool isThrowSafe(TestCase const& testCase, IConfig const& config);\nbool matchTest(TestCase const& testCase, TestSpec const& testSpec,\n               IConfig const& config);\n\nvoid enforceNoDuplicateTestCases(std::vector<TestCase> const& functions);\n\nstd::vector<TestCase> filterTests(std::vector<TestCase> const& testCases,\n                                  TestSpec const& testSpec,\n                                  IConfig const& config);\nstd::vector<TestCase> const& getAllTestCasesSorted(IConfig const& config);\n\nclass TestRegistry : public ITestCaseRegistry {\npublic:\n  virtual ~TestRegistry() = default;\n\n  virtual void registerTest(TestCase const& testCase);\n\n  std::vector<TestCase> const& getAllTests() const override;\n  std::vector<TestCase> const&\n  getAllTestsSorted(IConfig const& config) const override;\n\nprivate:\n  std::vector<TestCase> m_functions;\n  mutable RunTests::InWhatOrder m_currentSortOrder =\n      RunTests::InDeclarationOrder;\n  mutable std::vector<TestCase> m_sortedFunctions;\n  std::size_t m_unnamedCount = 0;\n  std::ios_base::Init m_ostreamInit; // Forces cout/ cerr to be initialised\n};\n\n///////////////////////////////////////////////////////////////////////////\n\nclass TestInvokerAsFunction : public ITestInvoker {\n  void (*m_testAsFunction)();\n\npublic:\n  TestInvokerAsFunction(void (*testAsFunction)()) noexcept;\n\n  void invoke() const override;\n};\n\nstd::string extractClassName(StringRef const& classOrQualifiedMethodName);\n\n///////////////////////////////////////////////////////////////////////////\n\n} // end namespace Catch\n\n// 
end catch_test_case_registry_impl.h\n// start catch_reporter_registry.h\n\n#include <map>\n\nnamespace Catch {\n\nclass ReporterRegistry : public IReporterRegistry {\n\npublic:\n  ~ReporterRegistry() override;\n\n  IStreamingReporterPtr create(std::string const& name,\n                               IConfigPtr const& config) const override;\n\n  void registerReporter(std::string const& name,\n                        IReporterFactoryPtr const& factory);\n  void registerListener(IReporterFactoryPtr const& factory);\n\n  FactoryMap const& getFactories() const override;\n  Listeners const& getListeners() const override;\n\nprivate:\n  FactoryMap m_factories;\n  Listeners m_listeners;\n};\n} // namespace Catch\n\n// end catch_reporter_registry.h\n// start catch_tag_alias_registry.h\n\n// start catch_tag_alias.h\n\n#include <string>\n\nnamespace Catch {\n\nstruct TagAlias {\n  TagAlias(std::string const& _tag, SourceLineInfo _lineInfo);\n\n  std::string tag;\n  SourceLineInfo lineInfo;\n};\n\n} // end namespace Catch\n\n// end catch_tag_alias.h\n#include <map>\n\nnamespace Catch {\n\nclass TagAliasRegistry : public ITagAliasRegistry {\npublic:\n  ~TagAliasRegistry() override;\n  TagAlias const* find(std::string const& alias) const override;\n  std::string\n  expandAliases(std::string const& unexpandedTestSpec) const override;\n  void add(std::string const& alias, std::string const& tag,\n           SourceLineInfo const& lineInfo);\n\nprivate:\n  std::map<std::string, TagAlias> m_registry;\n};\n\n} // end namespace Catch\n\n// end catch_tag_alias_registry.h\n// start catch_startup_exception_registry.h\n\n#include <vector>\n#include <exception>\n\nnamespace Catch {\n\nclass StartupExceptionRegistry {\n#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\npublic:\n  void add(std::exception_ptr const& exception) noexcept;\n  std::vector<std::exception_ptr> const& getExceptions() const noexcept;\n\nprivate:\n  std::vector<std::exception_ptr> m_exceptions;\n#endif\n};\n\n} // end 
namespace Catch\n\n// end catch_startup_exception_registry.h\n// start catch_singletons.hpp\n\nnamespace Catch {\n\nstruct ISingleton {\n  virtual ~ISingleton();\n};\n\nvoid addSingleton(ISingleton* singleton);\nvoid cleanupSingletons();\n\ntemplate <typename SingletonImplT, typename InterfaceT = SingletonImplT,\n          typename MutableInterfaceT = InterfaceT>\nclass Singleton : SingletonImplT, public ISingleton {\n\n  static auto getInternal() -> Singleton* {\n    static Singleton* s_instance = nullptr;\n    if (!s_instance) {\n      s_instance = new Singleton;\n      addSingleton(s_instance);\n    }\n    return s_instance;\n  }\n\npublic:\n  static auto get() -> InterfaceT const& { return *getInternal(); }\n  static auto getMutable() -> MutableInterfaceT& { return *getInternal(); }\n};\n\n} // namespace Catch\n\n// end catch_singletons.hpp\nnamespace Catch {\n\nnamespace {\n\nclass RegistryHub : public IRegistryHub,\n                    public IMutableRegistryHub,\n                    private NonCopyable {\n\npublic: // IRegistryHub\n  RegistryHub() = default;\n  IReporterRegistry const& getReporterRegistry() const override {\n    return m_reporterRegistry;\n  }\n  ITestCaseRegistry const& getTestCaseRegistry() const override {\n    return m_testCaseRegistry;\n  }\n  IExceptionTranslatorRegistry const&\n  getExceptionTranslatorRegistry() const override {\n    return m_exceptionTranslatorRegistry;\n  }\n  ITagAliasRegistry const& getTagAliasRegistry() const override {\n    return m_tagAliasRegistry;\n  }\n  StartupExceptionRegistry const& getStartupExceptionRegistry() const override {\n    return m_exceptionRegistry;\n  }\n\npublic: // IMutableRegistryHub\n  void registerReporter(std::string const& name,\n                        IReporterFactoryPtr const& factory) override {\n    m_reporterRegistry.registerReporter(name, factory);\n  }\n  void registerListener(IReporterFactoryPtr const& factory) override {\n    m_reporterRegistry.registerListener(factory);\n  
}\n  void registerTest(TestCase const& testInfo) override {\n    m_testCaseRegistry.registerTest(testInfo);\n  }\n  void registerTranslator(const IExceptionTranslator* translator) override {\n    m_exceptionTranslatorRegistry.registerTranslator(translator);\n  }\n  void registerTagAlias(std::string const& alias, std::string const& tag,\n                        SourceLineInfo const& lineInfo) override {\n    m_tagAliasRegistry.add(alias, tag, lineInfo);\n  }\n  void registerStartupException() noexcept override {\n#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\n    m_exceptionRegistry.add(std::current_exception());\n#else\n    CATCH_INTERNAL_ERROR(\"Attempted to register active exception under \"\n                         \"CATCH_CONFIG_DISABLE_EXCEPTIONS!\");\n#endif\n  }\n  IMutableEnumValuesRegistry& getMutableEnumValuesRegistry() override {\n    return m_enumValuesRegistry;\n  }\n\nprivate:\n  TestRegistry m_testCaseRegistry;\n  ReporterRegistry m_reporterRegistry;\n  ExceptionTranslatorRegistry m_exceptionTranslatorRegistry;\n  TagAliasRegistry m_tagAliasRegistry;\n  StartupExceptionRegistry m_exceptionRegistry;\n  Detail::EnumValuesRegistry m_enumValuesRegistry;\n};\n} // namespace\n\nusing RegistryHubSingleton =\n    Singleton<RegistryHub, IRegistryHub, IMutableRegistryHub>;\n\nIRegistryHub const& getRegistryHub() { return RegistryHubSingleton::get(); }\nIMutableRegistryHub& getMutableRegistryHub() {\n  return RegistryHubSingleton::getMutable();\n}\nvoid cleanUp() {\n  cleanupSingletons();\n  cleanUpContext();\n}\nstd::string translateActiveException() {\n  return getRegistryHub()\n      .getExceptionTranslatorRegistry()\n      .translateActiveException();\n}\n\n} // end namespace Catch\n// end catch_registry_hub.cpp\n// start catch_reporter_registry.cpp\n\nnamespace Catch {\n\nReporterRegistry::~ReporterRegistry() = default;\n\nIStreamingReporterPtr ReporterRegistry::create(std::string const& name,\n                                               IConfigPtr 
const& config) const {\n  auto it = m_factories.find(name);\n  if (it == m_factories.end())\n    return nullptr;\n  return it->second->create(ReporterConfig(config));\n}\n\nvoid ReporterRegistry::registerReporter(std::string const& name,\n                                        IReporterFactoryPtr const& factory) {\n  m_factories.emplace(name, factory);\n}\nvoid ReporterRegistry::registerListener(IReporterFactoryPtr const& factory) {\n  m_listeners.push_back(factory);\n}\n\nIReporterRegistry::FactoryMap const& ReporterRegistry::getFactories() const {\n  return m_factories;\n}\nIReporterRegistry::Listeners const& ReporterRegistry::getListeners() const {\n  return m_listeners;\n}\n\n} // namespace Catch\n// end catch_reporter_registry.cpp\n// start catch_result_type.cpp\n\nnamespace Catch {\n\nbool isOk(ResultWas::OfType resultType) {\n  return (resultType & ResultWas::FailureBit) == 0;\n}\nbool isJustInfo(int flags) { return flags == ResultWas::Info; }\n\nResultDisposition::Flags operator|(ResultDisposition::Flags lhs,\n                                   ResultDisposition::Flags rhs) {\n  return static_cast<ResultDisposition::Flags>(static_cast<int>(lhs) |\n                                               static_cast<int>(rhs));\n}\n\nbool shouldContinueOnFailure(int flags) {\n  return (flags & ResultDisposition::ContinueOnFailure) != 0;\n}\nbool shouldSuppressFailure(int flags) {\n  return (flags & ResultDisposition::SuppressFail) != 0;\n}\n\n} // end namespace Catch\n// end catch_result_type.cpp\n// start catch_run_context.cpp\n\n#include <cassert>\n#include <algorithm>\n#include <sstream>\n\nnamespace Catch {\n\nnamespace Generators {\nstruct GeneratorTracker : TestCaseTracking::TrackerBase, IGeneratorTracker {\n  GeneratorBasePtr m_generator;\n\n  GeneratorTracker(TestCaseTracking::NameAndLocation const& nameAndLocation,\n                   TrackerContext& ctx, ITracker* parent)\n      : TrackerBase(nameAndLocation, ctx, parent) {}\n  ~GeneratorTracker();\n\n  
static GeneratorTracker&\n  acquire(TrackerContext& ctx,\n          TestCaseTracking::NameAndLocation const& nameAndLocation) {\n    std::shared_ptr<GeneratorTracker> tracker;\n\n    ITracker& currentTracker = ctx.currentTracker();\n    // Under specific circumstances, the generator we want\n    // to acquire is also the current tracker. If this is\n    // the case, we have to avoid looking through current\n    // tracker's children, and instead return the current\n    // tracker.\n    // A case where this check is important is e.g.\n    //     for (int i = 0; i < 5; ++i) {\n    //         int n = GENERATE(1, 2);\n    //     }\n    //\n    // without it, the code above creates 5 nested generators.\n    if (currentTracker.nameAndLocation() == nameAndLocation) {\n      auto thisTracker = currentTracker.parent().findChild(nameAndLocation);\n      assert(thisTracker);\n      assert(thisTracker->isGeneratorTracker());\n      tracker = std::static_pointer_cast<GeneratorTracker>(thisTracker);\n    } else if (TestCaseTracking::ITrackerPtr childTracker =\n                   currentTracker.findChild(nameAndLocation)) {\n      assert(childTracker);\n      assert(childTracker->isGeneratorTracker());\n      tracker = std::static_pointer_cast<GeneratorTracker>(childTracker);\n    } else {\n      tracker = std::make_shared<GeneratorTracker>(nameAndLocation, ctx,\n                                                   &currentTracker);\n      currentTracker.addChild(tracker);\n    }\n\n    if (!tracker->isComplete()) {\n      tracker->open();\n    }\n\n    return *tracker;\n  }\n\n  // TrackerBase interface\n  bool isGeneratorTracker() const override { return true; }\n  auto hasGenerator() const -> bool override { return !!m_generator; }\n  void close() override {\n    TrackerBase::close();\n    // If a generator has a child (it is followed by a section)\n    // and none of its children have started, then we must wait\n    // until later to start consuming its values.\n    // This 
catches cases where `GENERATE` is placed between two\n    // `SECTION`s.\n    // **The check for m_children.empty cannot be removed**.\n    // doing so would break `GENERATE` _not_ followed by `SECTION`s.\n    const bool should_wait_for_child = [&]() {\n      // No children -> nobody to wait for\n      if (m_children.empty()) {\n        return false;\n      }\n      // If at least one child started executing, don't wait\n      if (std::find_if(m_children.begin(), m_children.end(),\n                       [](TestCaseTracking::ITrackerPtr tracker) {\n                         return tracker->hasStarted();\n                       }) != m_children.end()) {\n        return false;\n      }\n\n      // No children have started. We need to check if they _can_\n      // start, and thus we should wait for them, or they cannot\n      // start (due to filters), and we shouldn't wait for them\n      auto* parent = m_parent;\n      // This is safe: there is always at least one section\n      // tracker in a test case tracking tree\n      while (!parent->isSectionTracker()) {\n        parent = &(parent->parent());\n      }\n      assert(parent && \"Missing root (test case) level section\");\n\n      auto const& parentSection = static_cast<SectionTracker&>(*parent);\n      auto const& filters       = parentSection.getFilters();\n      // No filters -> no restrictions on running sections\n      if (filters.empty()) {\n        return true;\n      }\n\n      for (auto const& child : m_children) {\n        if (child->isSectionTracker() &&\n            std::find(filters.begin(), filters.end(),\n                      static_cast<SectionTracker&>(*child).trimmedName()) !=\n                filters.end()) {\n          return true;\n        }\n      }\n      return false;\n    }();\n\n    // This check is a bit tricky, because m_generator->next()\n    // has a side-effect, where it consumes generator's current\n    // value, but we do not want to invoke the side-effect if\n    // this 
generator is still waiting for any child to start.\n    if (should_wait_for_child ||\n        (m_runState == CompletedSuccessfully && m_generator->next())) {\n      m_children.clear();\n      m_runState = Executing;\n    }\n  }\n\n  // IGeneratorTracker interface\n  auto getGenerator() const -> GeneratorBasePtr const& override {\n    return m_generator;\n  }\n  void setGenerator(GeneratorBasePtr&& generator) override {\n    m_generator = std::move(generator);\n  }\n};\nGeneratorTracker::~GeneratorTracker() {}\n} // namespace Generators\n\nRunContext::RunContext(IConfigPtr const& _config,\n                       IStreamingReporterPtr&& reporter)\n    : m_runInfo(_config->name()), m_context(getCurrentMutableContext()),\n      m_config(_config), m_reporter(std::move(reporter)),\n      m_lastAssertionInfo{StringRef(), SourceLineInfo(\"\", 0), StringRef(),\n                          ResultDisposition::Normal},\n      m_includeSuccessfulResults(\n          m_config->includeSuccessfulResults() ||\n          m_reporter->getPreferences().shouldReportAllAssertions) {\n  m_context.setRunner(this);\n  m_context.setConfig(m_config);\n  m_context.setResultCapture(this);\n  m_reporter->testRunStarting(m_runInfo);\n}\n\nRunContext::~RunContext() {\n  m_reporter->testRunEnded(TestRunStats(m_runInfo, m_totals, aborting()));\n}\n\nvoid RunContext::testGroupStarting(std::string const& testSpec,\n                                   std::size_t groupIndex,\n                                   std::size_t groupsCount) {\n  m_reporter->testGroupStarting(GroupInfo(testSpec, groupIndex, groupsCount));\n}\n\nvoid RunContext::testGroupEnded(std::string const& testSpec,\n                                Totals const& totals, std::size_t groupIndex,\n                                std::size_t groupsCount) {\n  m_reporter->testGroupEnded(TestGroupStats(\n      GroupInfo(testSpec, groupIndex, groupsCount), totals, aborting()));\n}\n\nTotals RunContext::runTest(TestCase const& testCase) {\n  Totals 
prevTotals = m_totals;\n\n  std::string redirectedCout;\n  std::string redirectedCerr;\n\n  auto const& testInfo = testCase.getTestCaseInfo();\n\n  m_reporter->testCaseStarting(testInfo);\n\n  m_activeTestCase = &testCase;\n\n  ITracker& rootTracker = m_trackerContext.startRun();\n  assert(rootTracker.isSectionTracker());\n  static_cast<SectionTracker&>(rootTracker)\n      .addInitialFilters(m_config->getSectionsToRun());\n  do {\n    m_trackerContext.startCycle();\n    m_testCaseTracker = &SectionTracker::acquire(\n        m_trackerContext,\n        TestCaseTracking::NameAndLocation(testInfo.name, testInfo.lineInfo));\n    runCurrentTest(redirectedCout, redirectedCerr);\n  } while (!m_testCaseTracker->isSuccessfullyCompleted() && !aborting());\n\n  Totals deltaTotals = m_totals.delta(prevTotals);\n  if (testInfo.expectedToFail() && deltaTotals.testCases.passed > 0) {\n    deltaTotals.assertions.failed++;\n    deltaTotals.testCases.passed--;\n    deltaTotals.testCases.failed++;\n  }\n  m_totals.testCases += deltaTotals.testCases;\n  m_reporter->testCaseEnded(TestCaseStats(testInfo, deltaTotals, redirectedCout,\n                                          redirectedCerr, aborting()));\n\n  m_activeTestCase  = nullptr;\n  m_testCaseTracker = nullptr;\n\n  return deltaTotals;\n}\n\nIConfigPtr RunContext::config() const { return m_config; }\n\nIStreamingReporter& RunContext::reporter() const { return *m_reporter; }\n\nvoid RunContext::assertionEnded(AssertionResult const& result) {\n  if (result.getResultType() == ResultWas::Ok) {\n    m_totals.assertions.passed++;\n    m_lastAssertionPassed = true;\n  } else if (!result.isOk()) {\n    m_lastAssertionPassed = false;\n    if (m_activeTestCase->getTestCaseInfo().okToFail())\n      m_totals.assertions.failedButOk++;\n    else\n      m_totals.assertions.failed++;\n  } else {\n    m_lastAssertionPassed = true;\n  }\n\n  // We have no use for the return value (whether messages should be cleared),\n  // because messages were 
made scoped and should be let to clear themselves\n  // out.\n  static_cast<void>(\n      m_reporter->assertionEnded(AssertionStats(result, m_messages, m_totals)));\n\n  if (result.getResultType() != ResultWas::Warning)\n    m_messageScopes.clear();\n\n  // Reset working state\n  resetAssertionInfo();\n  m_lastResult = result;\n}\nvoid RunContext::resetAssertionInfo() {\n  m_lastAssertionInfo.macroName = StringRef();\n  m_lastAssertionInfo.capturedExpression =\n      \"{Unknown expression after the reported line}\"_sr;\n}\n\nbool RunContext::sectionStarted(SectionInfo const& sectionInfo,\n                                Counts& assertions) {\n  ITracker& sectionTracker = SectionTracker::acquire(\n      m_trackerContext, TestCaseTracking::NameAndLocation(\n                            sectionInfo.name, sectionInfo.lineInfo));\n  if (!sectionTracker.isOpen())\n    return false;\n  m_activeSections.push_back(&sectionTracker);\n\n  m_lastAssertionInfo.lineInfo = sectionInfo.lineInfo;\n\n  m_reporter->sectionStarting(sectionInfo);\n\n  assertions = m_totals.assertions;\n\n  return true;\n}\nauto RunContext::acquireGeneratorTracker(StringRef generatorName,\n                                         SourceLineInfo const& lineInfo)\n    -> IGeneratorTracker& {\n  using namespace Generators;\n  GeneratorTracker& tracker = GeneratorTracker::acquire(\n      m_trackerContext, TestCaseTracking::NameAndLocation(\n                            static_cast<std::string>(generatorName), lineInfo));\n  m_lastAssertionInfo.lineInfo = lineInfo;\n  return tracker;\n}\n\nbool RunContext::testForMissingAssertions(Counts& assertions) {\n  if (assertions.total() != 0)\n    return false;\n  if (!m_config->warnAboutMissingAssertions())\n    return false;\n  if (m_trackerContext.currentTracker().hasChildren())\n    return false;\n  m_totals.assertions.failed++;\n  assertions.failed++;\n  return true;\n}\n\nvoid RunContext::sectionEnded(SectionEndInfo const& endInfo) {\n  Counts assertions      = 
m_totals.assertions - endInfo.prevAssertions;\n  bool missingAssertions = testForMissingAssertions(assertions);\n\n  if (!m_activeSections.empty()) {\n    m_activeSections.back()->close();\n    m_activeSections.pop_back();\n  }\n\n  m_reporter->sectionEnded(SectionStats(endInfo.sectionInfo, assertions,\n                                        endInfo.durationInSeconds,\n                                        missingAssertions));\n  m_messages.clear();\n  m_messageScopes.clear();\n}\n\nvoid RunContext::sectionEndedEarly(SectionEndInfo const& endInfo) {\n  if (m_unfinishedSections.empty())\n    m_activeSections.back()->fail();\n  else\n    m_activeSections.back()->close();\n  m_activeSections.pop_back();\n\n  m_unfinishedSections.push_back(endInfo);\n}\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\nvoid RunContext::benchmarkPreparing(std::string const& name) {\n  m_reporter->benchmarkPreparing(name);\n}\nvoid RunContext::benchmarkStarting(BenchmarkInfo const& info) {\n  m_reporter->benchmarkStarting(info);\n}\nvoid RunContext::benchmarkEnded(BenchmarkStats<> const& stats) {\n  m_reporter->benchmarkEnded(stats);\n}\nvoid RunContext::benchmarkFailed(std::string const& error) {\n  m_reporter->benchmarkFailed(error);\n}\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\nvoid RunContext::pushScopedMessage(MessageInfo const& message) {\n  m_messages.push_back(message);\n}\n\nvoid RunContext::popScopedMessage(MessageInfo const& message) {\n  m_messages.erase(std::remove(m_messages.begin(), m_messages.end(), message),\n                   m_messages.end());\n}\n\nvoid RunContext::emplaceUnscopedMessage(MessageBuilder const& builder) {\n  m_messageScopes.emplace_back(builder);\n}\n\nstd::string RunContext::getCurrentTestName() const {\n  return m_activeTestCase ? 
m_activeTestCase->getTestCaseInfo().name\n                          : std::string();\n}\n\nconst AssertionResult* RunContext::getLastResult() const {\n  return &(*m_lastResult);\n}\n\nvoid RunContext::exceptionEarlyReported() { m_shouldReportUnexpected = false; }\n\nvoid RunContext::handleFatalErrorCondition(StringRef message) {\n  // First notify reporter that bad things happened\n  m_reporter->fatalErrorEncountered(message);\n\n  // Don't rebuild the result -- the stringification itself can cause more fatal\n  // errors Instead, fake a result data.\n  AssertionResultData tempResult(ResultWas::FatalErrorCondition, {false});\n  tempResult.message = static_cast<std::string>(message);\n  AssertionResult result(m_lastAssertionInfo, tempResult);\n\n  assertionEnded(result);\n\n  handleUnfinishedSections();\n\n  // Recreate section for test case (as we will lose the one that was in scope)\n  auto const& testCaseInfo = m_activeTestCase->getTestCaseInfo();\n  SectionInfo testCaseSection(testCaseInfo.lineInfo, testCaseInfo.name);\n\n  Counts assertions;\n  assertions.failed = 1;\n  SectionStats testCaseSectionStats(testCaseSection, assertions, 0, false);\n  m_reporter->sectionEnded(testCaseSectionStats);\n\n  auto const& testInfo = m_activeTestCase->getTestCaseInfo();\n\n  Totals deltaTotals;\n  deltaTotals.testCases.failed  = 1;\n  deltaTotals.assertions.failed = 1;\n  m_reporter->testCaseEnded(TestCaseStats(testInfo, deltaTotals, std::string(),\n                                          std::string(), false));\n  m_totals.testCases.failed++;\n  testGroupEnded(std::string(), m_totals, 1, 1);\n  m_reporter->testRunEnded(TestRunStats(m_runInfo, m_totals, false));\n}\n\nbool RunContext::lastAssertionPassed() { return m_lastAssertionPassed; }\n\nvoid RunContext::assertionPassed() {\n  m_lastAssertionPassed = true;\n  ++m_totals.assertions.passed;\n  resetAssertionInfo();\n  m_messageScopes.clear();\n}\n\nbool RunContext::aborting() const {\n  return m_totals.assertions.failed 
>=\n         static_cast<std::size_t>(m_config->abortAfter());\n}\n\nvoid RunContext::runCurrentTest(std::string& redirectedCout,\n                                std::string& redirectedCerr) {\n  auto const& testCaseInfo = m_activeTestCase->getTestCaseInfo();\n  SectionInfo testCaseSection(testCaseInfo.lineInfo, testCaseInfo.name);\n  m_reporter->sectionStarting(testCaseSection);\n  Counts prevAssertions    = m_totals.assertions;\n  double duration          = 0;\n  m_shouldReportUnexpected = true;\n  m_lastAssertionInfo = {\"TEST_CASE\"_sr, testCaseInfo.lineInfo, StringRef(),\n                         ResultDisposition::Normal};\n\n  seedRng(*m_config);\n\n  Timer timer;\n  CATCH_TRY {\n    if (m_reporter->getPreferences().shouldRedirectStdOut) {\n#if !defined(CATCH_CONFIG_EXPERIMENTAL_REDIRECT)\n      RedirectedStreams redirectedStreams(redirectedCout, redirectedCerr);\n\n      timer.start();\n      invokeActiveTestCase();\n#else\n      OutputRedirect r(redirectedCout, redirectedCerr);\n      timer.start();\n      invokeActiveTestCase();\n#endif\n    } else {\n      timer.start();\n      invokeActiveTestCase();\n    }\n    duration = timer.getElapsedSeconds();\n  }\n  CATCH_CATCH_ANON(TestFailureException&) {\n    // This just means the test was aborted due to failure\n  }\n  CATCH_CATCH_ALL {\n    // Under CATCH_CONFIG_FAST_COMPILE, unexpected exceptions under REQUIRE\n    // assertions are reported without translation at the point of origin.\n    if (m_shouldReportUnexpected) {\n      AssertionReaction dummyReaction;\n      handleUnexpectedInflightException(\n          m_lastAssertionInfo, translateActiveException(), dummyReaction);\n    }\n  }\n  Counts assertions      = m_totals.assertions - prevAssertions;\n  bool missingAssertions = testForMissingAssertions(assertions);\n\n  m_testCaseTracker->close();\n  handleUnfinishedSections();\n  m_messages.clear();\n  m_messageScopes.clear();\n\n  SectionStats testCaseSectionStats(testCaseSection, assertions, 
duration,\n                                    missingAssertions);\n  m_reporter->sectionEnded(testCaseSectionStats);\n}\n\nvoid RunContext::invokeActiveTestCase() {\n  FatalConditionHandlerGuard _(&m_fatalConditionhandler);\n  m_activeTestCase->invoke();\n}\n\nvoid RunContext::handleUnfinishedSections() {\n  // If sections ended prematurely due to an exception we stored their\n  // infos here so we can tear them down outside the unwind process.\n  for (auto it    = m_unfinishedSections.rbegin(),\n            itEnd = m_unfinishedSections.rend();\n       it != itEnd; ++it)\n    sectionEnded(*it);\n  m_unfinishedSections.clear();\n}\n\nvoid RunContext::handleExpr(AssertionInfo const& info,\n                            ITransientExpression const& expr,\n                            AssertionReaction& reaction) {\n  m_reporter->assertionStarting(info);\n\n  bool negated = isFalseTest(info.resultDisposition);\n  bool result  = expr.getResult() != negated;\n\n  if (result) {\n    if (!m_includeSuccessfulResults) {\n      assertionPassed();\n    } else {\n      reportExpr(info, ResultWas::Ok, &expr, negated);\n    }\n  } else {\n    reportExpr(info, ResultWas::ExpressionFailed, &expr, negated);\n    populateReaction(reaction);\n  }\n}\nvoid RunContext::reportExpr(AssertionInfo const& info,\n                            ResultWas::OfType resultType,\n                            ITransientExpression const* expr, bool negated) {\n\n  m_lastAssertionInfo = info;\n  AssertionResultData data(resultType, LazyExpression(negated));\n\n  AssertionResult assertionResult{info, data};\n  assertionResult.m_resultData.lazyExpression.m_transientExpression = expr;\n\n  assertionEnded(assertionResult);\n}\n\nvoid RunContext::handleMessage(AssertionInfo const& info,\n                               ResultWas::OfType resultType,\n                               StringRef const& message,\n                               AssertionReaction& reaction) {\n  m_reporter->assertionStarting(info);\n\n  
m_lastAssertionInfo = info;\n\n  AssertionResultData data(resultType, LazyExpression(false));\n  data.message = static_cast<std::string>(message);\n  AssertionResult assertionResult{m_lastAssertionInfo, data};\n  assertionEnded(assertionResult);\n  if (!assertionResult.isOk())\n    populateReaction(reaction);\n}\nvoid RunContext::handleUnexpectedExceptionNotThrown(\n    AssertionInfo const& info, AssertionReaction& reaction) {\n  handleNonExpr(info, Catch::ResultWas::DidntThrowException, reaction);\n}\n\nvoid RunContext::handleUnexpectedInflightException(\n    AssertionInfo const& info, std::string const& message,\n    AssertionReaction& reaction) {\n  m_lastAssertionInfo = info;\n\n  AssertionResultData data(ResultWas::ThrewException, LazyExpression(false));\n  data.message = message;\n  AssertionResult assertionResult{info, data};\n  assertionEnded(assertionResult);\n  populateReaction(reaction);\n}\n\nvoid RunContext::populateReaction(AssertionReaction& reaction) {\n  reaction.shouldDebugBreak = m_config->shouldDebugBreak();\n  reaction.shouldThrow = aborting() || (m_lastAssertionInfo.resultDisposition &\n                                        ResultDisposition::Normal);\n}\n\nvoid RunContext::handleIncomplete(AssertionInfo const& info) {\n  m_lastAssertionInfo = info;\n\n  AssertionResultData data(ResultWas::ThrewException, LazyExpression(false));\n  data.message =\n      \"Exception translation was disabled by CATCH_CONFIG_FAST_COMPILE\";\n  AssertionResult assertionResult{info, data};\n  assertionEnded(assertionResult);\n}\nvoid RunContext::handleNonExpr(AssertionInfo const& info,\n                               ResultWas::OfType resultType,\n                               AssertionReaction& reaction) {\n  m_lastAssertionInfo = info;\n\n  AssertionResultData data(resultType, LazyExpression(false));\n  AssertionResult assertionResult{info, data};\n  assertionEnded(assertionResult);\n\n  if (!assertionResult.isOk())\n    
populateReaction(reaction);\n}\n\nIResultCapture& getResultCapture() {\n  if (auto* capture = getCurrentContext().getResultCapture())\n    return *capture;\n  else\n    CATCH_INTERNAL_ERROR(\"No result capture instance\");\n}\n\nvoid seedRng(IConfig const& config) {\n  if (config.rngSeed() != 0) {\n    std::srand(config.rngSeed());\n    rng().seed(config.rngSeed());\n  }\n}\n\nunsigned int rngSeed() { return getCurrentContext().getConfig()->rngSeed(); }\n\n} // namespace Catch\n// end catch_run_context.cpp\n// start catch_section.cpp\n\nnamespace Catch {\n\nSection::Section(SectionInfo const& info)\n    : m_info(info), m_sectionIncluded(getResultCapture().sectionStarted(\n                        m_info, m_assertions)) {\n  m_timer.start();\n}\n\nSection::~Section() {\n  if (m_sectionIncluded) {\n    SectionEndInfo endInfo{m_info, m_assertions, m_timer.getElapsedSeconds()};\n    if (uncaught_exceptions())\n      getResultCapture().sectionEndedEarly(endInfo);\n    else\n      getResultCapture().sectionEnded(endInfo);\n  }\n}\n\n// This indicates whether the section should be executed or not\nSection::operator bool() const { return m_sectionIncluded; }\n\n} // end namespace Catch\n// end catch_section.cpp\n// start catch_section_info.cpp\n\nnamespace Catch {\n\nSectionInfo::SectionInfo(SourceLineInfo const& _lineInfo,\n                         std::string const& _name)\n    : name(_name), lineInfo(_lineInfo) {}\n\n} // end namespace Catch\n// end catch_section_info.cpp\n// start catch_session.cpp\n\n// start catch_session.h\n\n#include <memory>\n\nnamespace Catch {\n\nclass Session : NonCopyable {\npublic:\n  Session();\n  ~Session() override;\n\n  void showHelp() const;\n  void libIdentify();\n\n  int applyCommandLine(int argc, char const* const* argv);\n#if defined(CATCH_CONFIG_WCHAR) && defined(_WIN32) && defined(UNICODE)\n  int applyCommandLine(int argc, wchar_t const* const* argv);\n#endif\n\n  void useConfigData(ConfigData const& configData);\n\n  template 
<typename CharT>\n  int run(int argc, CharT const* const argv[]) {\n    if (m_startupExceptions)\n      return 1;\n    int returnCode = applyCommandLine(argc, argv);\n    if (returnCode == 0)\n      returnCode = run();\n    return returnCode;\n  }\n\n  int run();\n\n  clara::Parser const& cli() const;\n  void cli(clara::Parser const& newParser);\n  ConfigData& configData();\n  Config& config();\n\nprivate:\n  int runInternal();\n\n  clara::Parser m_cli;\n  ConfigData m_configData;\n  std::shared_ptr<Config> m_config;\n  bool m_startupExceptions = false;\n};\n\n} // end namespace Catch\n\n// end catch_session.h\n// start catch_version.h\n\n#include <iosfwd>\n\nnamespace Catch {\n\n// Versioning information\nstruct Version {\n  Version(Version const&)            = delete;\n  Version& operator=(Version const&) = delete;\n  Version(unsigned int _majorVersion, unsigned int _minorVersion,\n          unsigned int _patchNumber, char const* const _branchName,\n          unsigned int _buildNumber);\n\n  unsigned int const majorVersion;\n  unsigned int const minorVersion;\n  unsigned int const patchNumber;\n\n  // buildNumber is only used if branchName is not null\n  char const* const branchName;\n  unsigned int const buildNumber;\n\n  friend std::ostream& operator<<(std::ostream& os, Version const& version);\n};\n\nVersion const& libraryVersion();\n} // namespace Catch\n\n// end catch_version.h\n#include <cstdlib>\n#include <iomanip>\n#include <set>\n#include <iterator>\n\nnamespace Catch {\n\nnamespace {\nconst int MaxExitCode = 255;\n\nIStreamingReporterPtr createReporter(std::string const& reporterName,\n                                     IConfigPtr const& config) {\n  auto reporter = Catch::getRegistryHub().getReporterRegistry().create(\n      reporterName, config);\n  CATCH_ENFORCE(reporter,\n                \"No reporter registered with name: '\" << reporterName << \"'\");\n\n  return reporter;\n}\n\nIStreamingReporterPtr makeReporter(std::shared_ptr<Config> const& 
config) {\n  if (Catch::getRegistryHub().getReporterRegistry().getListeners().empty()) {\n    return createReporter(config->getReporterName(), config);\n  }\n\n  // On older platforms, returning std::unique_ptr<ListeningReporter>\n  // when the return type is std::unique_ptr<IStreamingReporter>\n  // doesn't compile without a std::move call. However, this causes\n  // a warning on newer platforms. Thus, we have to work around\n  // it a bit and downcast the pointer manually.\n  auto ret    = std::unique_ptr<IStreamingReporter>(new ListeningReporter);\n  auto& multi = static_cast<ListeningReporter&>(*ret);\n  auto const& listeners =\n      Catch::getRegistryHub().getReporterRegistry().getListeners();\n  for (auto const& listener : listeners) {\n    multi.addListener(listener->create(Catch::ReporterConfig(config)));\n  }\n  multi.addReporter(createReporter(config->getReporterName(), config));\n  return ret;\n}\n\nclass TestGroup {\npublic:\n  explicit TestGroup(std::shared_ptr<Config> const& config)\n      : m_config{config}, m_context{config, makeReporter(config)} {\n    auto const& allTestCases = getAllTestCasesSorted(*m_config);\n    m_matches = m_config->testSpec().matchesByFilter(allTestCases, *m_config);\n    auto const& invalidArgs = m_config->testSpec().getInvalidArgs();\n\n    if (m_matches.empty() && invalidArgs.empty()) {\n      for (auto const& test : allTestCases)\n        if (!test.isHidden())\n          m_tests.emplace(&test);\n    } else {\n      for (auto const& match : m_matches)\n        m_tests.insert(match.tests.begin(), match.tests.end());\n    }\n  }\n\n  Totals execute() {\n    auto const& invalidArgs = m_config->testSpec().getInvalidArgs();\n    Totals totals;\n    m_context.testGroupStarting(m_config->name(), 1, 1);\n    for (auto const& testCase : m_tests) {\n      if (!m_context.aborting())\n        totals += m_context.runTest(*testCase);\n      else\n        m_context.reporter().skipTest(*testCase);\n    }\n\n    for (auto const& match : 
m_matches) {\n      if (match.tests.empty()) {\n        m_context.reporter().noMatchingTestCases(match.name);\n        totals.error = -1;\n      }\n    }\n\n    if (!invalidArgs.empty()) {\n      for (auto const& invalidArg : invalidArgs)\n        m_context.reporter().reportInvalidArguments(invalidArg);\n    }\n\n    m_context.testGroupEnded(m_config->name(), totals, 1, 1);\n    return totals;\n  }\n\nprivate:\n  using Tests = std::set<TestCase const*>;\n\n  std::shared_ptr<Config> m_config;\n  RunContext m_context;\n  Tests m_tests;\n  TestSpec::Matches m_matches;\n};\n\nvoid applyFilenamesAsTags(Catch::IConfig const& config) {\n  auto& tests =\n      const_cast<std::vector<TestCase>&>(getAllTestCasesSorted(config));\n  for (auto& testCase : tests) {\n    auto tags = testCase.tags;\n\n    std::string filename = testCase.lineInfo.file;\n    auto lastSlash       = filename.find_last_of(\"\\\\/\");\n    if (lastSlash != std::string::npos) {\n      filename.erase(0, lastSlash);\n      filename[0] = '#';\n    }\n\n    auto lastDot = filename.find_last_of('.');\n    if (lastDot != std::string::npos) {\n      filename.erase(lastDot);\n    }\n\n    tags.push_back(std::move(filename));\n    setTags(testCase, tags);\n  }\n}\n\n} // namespace\n\nSession::Session() {\n  static bool alreadyInstantiated = false;\n  if (alreadyInstantiated) {\n    CATCH_TRY {\n      CATCH_INTERNAL_ERROR(\n          \"Only one instance of Catch::Session can ever be used\");\n    }\n    CATCH_CATCH_ALL { getMutableRegistryHub().registerStartupException(); }\n  }\n\n  // There cannot be exceptions at startup in no-exception mode.\n#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\n  const auto& exceptions =\n      getRegistryHub().getStartupExceptionRegistry().getExceptions();\n  if (!exceptions.empty()) {\n    config();\n    getCurrentMutableContext().setConfig(m_config);\n\n    m_startupExceptions = true;\n    Colour colourGuard(Colour::Red);\n    Catch::cerr() << \"Errors occurred during startup!\" 
<< '\\n';\n    // iterate over all exceptions and notify user\n    for (const auto& ex_ptr : exceptions) {\n      try {\n        std::rethrow_exception(ex_ptr);\n      } catch (std::exception const& ex) {\n        Catch::cerr() << Column(ex.what()).indent(2) << '\\n';\n      }\n    }\n  }\n#endif\n\n  alreadyInstantiated = true;\n  m_cli               = makeCommandLineParser(m_configData);\n}\nSession::~Session() { Catch::cleanUp(); }\n\nvoid Session::showHelp() const {\n  Catch::cout() << \"\\nCatch v\" << libraryVersion() << \"\\n\"\n                << m_cli << std::endl\n                << \"For more detailed usage please see the project docs\\n\"\n                << std::endl;\n}\nvoid Session::libIdentify() {\n  Catch::cout() << std::left << std::setw(16) << \"description: \"\n                << \"A Catch2 test executable\\n\"\n                << std::left << std::setw(16) << \"category: \"\n                << \"testframework\\n\"\n                << std::left << std::setw(16) << \"framework: \"\n                << \"Catch Test\\n\"\n                << std::left << std::setw(16) << \"version: \" << libraryVersion()\n                << std::endl;\n}\n\nint Session::applyCommandLine(int argc, char const* const* argv) {\n  if (m_startupExceptions)\n    return 1;\n\n  auto result = m_cli.parse(clara::Args(argc, argv));\n  if (!result) {\n    config();\n    getCurrentMutableContext().setConfig(m_config);\n    Catch::cerr() << Colour(Colour::Red) << \"\\nError(s) in input:\\n\"\n                  << Column(result.errorMessage()).indent(2) << \"\\n\\n\";\n    Catch::cerr() << \"Run with -? 
for usage\\n\" << std::endl;\n    return MaxExitCode;\n  }\n\n  if (m_configData.showHelp)\n    showHelp();\n  if (m_configData.libIdentify)\n    libIdentify();\n  m_config.reset();\n  return 0;\n}\n\n#if defined(CATCH_CONFIG_WCHAR) && defined(_WIN32) && defined(UNICODE)\nint Session::applyCommandLine(int argc, wchar_t const* const* argv) {\n\n  char** utf8Argv = new char*[argc];\n\n  for (int i = 0; i < argc; ++i) {\n    int bufSize = WideCharToMultiByte(CP_UTF8, 0, argv[i], -1, nullptr, 0,\n                                      nullptr, nullptr);\n\n    utf8Argv[i] = new char[bufSize];\n\n    WideCharToMultiByte(CP_UTF8, 0, argv[i], -1, utf8Argv[i], bufSize, nullptr,\n                        nullptr);\n  }\n\n  int returnCode = applyCommandLine(argc, utf8Argv);\n\n  for (int i = 0; i < argc; ++i)\n    delete[] utf8Argv[i];\n\n  delete[] utf8Argv;\n\n  return returnCode;\n}\n#endif\n\nvoid Session::useConfigData(ConfigData const& configData) {\n  m_configData = configData;\n  m_config.reset();\n}\n\nint Session::run() {\n  if ((m_configData.waitForKeypress & WaitForKeypress::BeforeStart) != 0) {\n    Catch::cout() << \"...waiting for enter/ return before starting\"\n                  << std::endl;\n    static_cast<void>(std::getchar());\n  }\n  int exitCode = runInternal();\n  if ((m_configData.waitForKeypress & WaitForKeypress::BeforeExit) != 0) {\n    Catch::cout() << \"...waiting for enter/ return before exiting, with code: \"\n                  << exitCode << std::endl;\n    static_cast<void>(std::getchar());\n  }\n  return exitCode;\n}\n\nclara::Parser const& Session::cli() const { return m_cli; }\nvoid Session::cli(clara::Parser const& newParser) { m_cli = newParser; }\nConfigData& Session::configData() { return m_configData; }\nConfig& Session::config() {\n  if (!m_config)\n    m_config = std::make_shared<Config>(m_configData);\n  return *m_config;\n}\n\nint Session::runInternal() {\n  if (m_startupExceptions)\n    return 1;\n\n  if (m_configData.showHelp 
|| m_configData.libIdentify) {\n    return 0;\n  }\n\n  CATCH_TRY {\n    config(); // Force config to be constructed\n\n    seedRng(*m_config);\n\n    if (m_configData.filenamesAsTags)\n      applyFilenamesAsTags(*m_config);\n\n    // Handle list request\n    if (Option<std::size_t> listed = list(m_config))\n      return static_cast<int>(*listed);\n\n    TestGroup tests{m_config};\n    auto const totals = tests.execute();\n\n    if (m_config->warnAboutNoTests() && totals.error == -1)\n      return 2;\n\n    // Note that on unices only the lower 8 bits are usually used, clamping\n    // the return value to 255 prevents false negative when some multiple\n    // of 256 tests has failed\n    return (std::min)(\n        MaxExitCode,\n        (std::max)(totals.error, static_cast<int>(totals.assertions.failed)));\n  }\n#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\n  catch (std::exception& ex) {\n    Catch::cerr() << ex.what() << std::endl;\n    return MaxExitCode;\n  }\n#endif\n}\n\n} // end namespace Catch\n// end catch_session.cpp\n// start catch_singletons.cpp\n\n#include <vector>\n\nnamespace Catch {\n\nnamespace {\nstatic auto getSingletons() -> std::vector<ISingleton*>*& {\n  static std::vector<ISingleton*>* g_singletons = nullptr;\n  if (!g_singletons)\n    g_singletons = new std::vector<ISingleton*>();\n  return g_singletons;\n}\n} // namespace\n\nISingleton::~ISingleton() {}\n\nvoid addSingleton(ISingleton* singleton) {\n  getSingletons()->push_back(singleton);\n}\nvoid cleanupSingletons() {\n  auto& singletons = getSingletons();\n  for (auto singleton : *singletons)\n    delete singleton;\n  delete singletons;\n  singletons = nullptr;\n}\n\n} // namespace Catch\n// end catch_singletons.cpp\n// start catch_startup_exception_registry.cpp\n\n#if !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\nnamespace Catch {\nvoid StartupExceptionRegistry::add(\n    std::exception_ptr const& exception) noexcept {\n  CATCH_TRY { m_exceptions.push_back(exception); }\n  CATCH_CATCH_ALL 
{\n    // If we run out of memory during start-up there's really not a lot more we\n    // can do about it\n    std::terminate();\n  }\n}\n\nstd::vector<std::exception_ptr> const&\nStartupExceptionRegistry::getExceptions() const noexcept {\n  return m_exceptions;\n}\n\n} // end namespace Catch\n#endif\n// end catch_startup_exception_registry.cpp\n// start catch_stream.cpp\n\n#include <cstdio>\n#include <iostream>\n#include <fstream>\n#include <sstream>\n#include <vector>\n#include <memory>\n\nnamespace Catch {\n\nCatch::IStream::~IStream() = default;\n\nnamespace Detail {\nnamespace {\ntemplate <typename WriterF, std::size_t bufferSize = 256>\nclass StreamBufImpl : public std::streambuf {\n  char data[bufferSize];\n  WriterF m_writer;\n\npublic:\n  StreamBufImpl() { setp(data, data + sizeof(data)); }\n\n  ~StreamBufImpl() noexcept { StreamBufImpl::sync(); }\n\nprivate:\n  int overflow(int c) override {\n    sync();\n\n    if (c != EOF) {\n      if (pbase() == epptr())\n        m_writer(std::string(1, static_cast<char>(c)));\n      else\n        sputc(static_cast<char>(c));\n    }\n    return 0;\n  }\n\n  int sync() override {\n    if (pbase() != pptr()) {\n      m_writer(std::string(\n          pbase(), static_cast<std::string::size_type>(pptr() - pbase())));\n      setp(pbase(), epptr());\n    }\n    return 0;\n  }\n};\n\n///////////////////////////////////////////////////////////////////////////\n\nstruct OutputDebugWriter {\n\n  void operator()(std::string const& str) { writeToDebugConsole(str); }\n};\n\n///////////////////////////////////////////////////////////////////////////\n\nclass FileStream : public IStream {\n  mutable std::ofstream m_ofs;\n\npublic:\n  FileStream(StringRef filename) {\n    m_ofs.open(filename.c_str());\n    CATCH_ENFORCE(!m_ofs.fail(), \"Unable to open file: '\" << filename << \"'\");\n  }\n  ~FileStream() override = default;\n\npublic: // IStream\n  std::ostream& stream() const override { return m_ofs; 
}\n};\n\n///////////////////////////////////////////////////////////////////////////\n\nclass CoutStream : public IStream {\n  mutable std::ostream m_os;\n\npublic:\n  // Store the streambuf from cout up-front because\n  // cout may get redirected when running tests\n  CoutStream() : m_os(Catch::cout().rdbuf()) {}\n  ~CoutStream() override = default;\n\npublic: // IStream\n  std::ostream& stream() const override { return m_os; }\n};\n\n///////////////////////////////////////////////////////////////////////////\n\nclass DebugOutStream : public IStream {\n  std::unique_ptr<StreamBufImpl<OutputDebugWriter>> m_streamBuf;\n  mutable std::ostream m_os;\n\npublic:\n  DebugOutStream()\n      : m_streamBuf(new StreamBufImpl<OutputDebugWriter>()),\n        m_os(m_streamBuf.get()) {}\n\n  ~DebugOutStream() override = default;\n\npublic: // IStream\n  std::ostream& stream() const override { return m_os; }\n};\n\n} // namespace\n} // namespace Detail\n\n///////////////////////////////////////////////////////////////////////////\n\nauto makeStream(StringRef const& filename) -> IStream const* {\n  if (filename.empty())\n    return new Detail::CoutStream();\n  else if (filename[0] == '%') {\n    if (filename == \"%debug\")\n      return new Detail::DebugOutStream();\n    else\n      CATCH_ERROR(\"Unrecognised stream: '\" << filename << \"'\");\n  } else\n    return new Detail::FileStream(filename);\n}\n\n// This class encapsulates the idea of a pool of ostringstreams that can be\n// reused.\nstruct StringStreams {\n  std::vector<std::unique_ptr<std::ostringstream>> m_streams;\n  std::vector<std::size_t> m_unused;\n  std::ostringstream m_referenceStream; // Used for copy state/ flags from\n\n  auto add() -> std::size_t {\n    if (m_unused.empty()) {\n      m_streams.push_back(\n          std::unique_ptr<std::ostringstream>(new std::ostringstream));\n      return m_streams.size() - 1;\n    } else {\n      auto index = m_unused.back();\n      m_unused.pop_back();\n      return 
index;\n    }\n  }\n\n  void release(std::size_t index) {\n    m_streams[index]->copyfmt(\n        m_referenceStream); // Restore initial flags and other state\n    m_unused.push_back(index);\n  }\n};\n\nReusableStringStream::ReusableStringStream()\n    : m_index(Singleton<StringStreams>::getMutable().add()),\n      m_oss(Singleton<StringStreams>::getMutable().m_streams[m_index].get()) {}\n\nReusableStringStream::~ReusableStringStream() {\n  static_cast<std::ostringstream*>(m_oss)->str(\"\");\n  m_oss->clear();\n  Singleton<StringStreams>::getMutable().release(m_index);\n}\n\nauto ReusableStringStream::str() const -> std::string {\n  return static_cast<std::ostringstream*>(m_oss)->str();\n}\n\n///////////////////////////////////////////////////////////////////////////\n\n#ifndef CATCH_CONFIG_NOSTDOUT // If you #define this you must implement these\n                              // functions\nstd::ostream& cout() { return std::cout; }\nstd::ostream& cerr() { return std::cerr; }\nstd::ostream& clog() { return std::clog; }\n#endif\n} // namespace Catch\n// end catch_stream.cpp\n// start catch_string_manip.cpp\n\n#include <algorithm>\n#include <ostream>\n#include <cstring>\n#include <cctype>\n#include <vector>\n\nnamespace Catch {\n\nnamespace {\nchar toLowerCh(char c) {\n  return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));\n}\n} // namespace\n\nbool startsWith(std::string const& s, std::string const& prefix) {\n  return s.size() >= prefix.size() &&\n         std::equal(prefix.begin(), prefix.end(), s.begin());\n}\nbool startsWith(std::string const& s, char prefix) {\n  return !s.empty() && s[0] == prefix;\n}\nbool endsWith(std::string const& s, std::string const& suffix) {\n  return s.size() >= suffix.size() &&\n         std::equal(suffix.rbegin(), suffix.rend(), s.rbegin());\n}\nbool endsWith(std::string const& s, char suffix) {\n  return !s.empty() && s[s.size() - 1] == suffix;\n}\nbool contains(std::string const& s, std::string const& infix) {\n 
 return s.find(infix) != std::string::npos;\n}\nvoid toLowerInPlace(std::string& s) {\n  std::transform(s.begin(), s.end(), s.begin(), toLowerCh);\n}\nstd::string toLower(std::string const& s) {\n  std::string lc = s;\n  toLowerInPlace(lc);\n  return lc;\n}\nstd::string trim(std::string const& str) {\n  static char const* whitespaceChars = \"\\n\\r\\t \";\n  std::string::size_type start       = str.find_first_not_of(whitespaceChars);\n  std::string::size_type end         = str.find_last_not_of(whitespaceChars);\n\n  return start != std::string::npos ? str.substr(start, 1 + end - start)\n                                    : std::string();\n}\n\nStringRef trim(StringRef ref) {\n  const auto is_ws = [](char c) {\n    return c == ' ' || c == '\\t' || c == '\\n' || c == '\\r';\n  };\n  size_t real_begin = 0;\n  while (real_begin < ref.size() && is_ws(ref[real_begin])) {\n    ++real_begin;\n  }\n  size_t real_end = ref.size();\n  while (real_end > real_begin && is_ws(ref[real_end - 1])) {\n    --real_end;\n  }\n\n  return ref.substr(real_begin, real_end - real_begin);\n}\n\nbool replaceInPlace(std::string& str, std::string const& replaceThis,\n                    std::string const& withThis) {\n  bool replaced = false;\n  std::size_t i = str.find(replaceThis);\n  while (i != std::string::npos) {\n    replaced = true;\n    str      = str.substr(0, i) + withThis + str.substr(i + replaceThis.size());\n    if (i < str.size() - withThis.size())\n      i = str.find(replaceThis, i + withThis.size());\n    else\n      i = std::string::npos;\n  }\n  return replaced;\n}\n\nstd::vector<StringRef> splitStringRef(StringRef str, char delimiter) {\n  std::vector<StringRef> subStrings;\n  std::size_t start = 0;\n  for (std::size_t pos = 0; pos < str.size(); ++pos) {\n    if (str[pos] == delimiter) {\n      if (pos - start > 1)\n        subStrings.push_back(str.substr(start, pos - start));\n      start = pos + 1;\n    }\n  }\n  if (start < str.size())\n    
subStrings.push_back(str.substr(start, str.size() - start));\n  return subStrings;\n}\n\npluralise::pluralise(std::size_t count, std::string const& label)\n    : m_count(count), m_label(label) {}\n\nstd::ostream& operator<<(std::ostream& os, pluralise const& pluraliser) {\n  os << pluraliser.m_count << ' ' << pluraliser.m_label;\n  if (pluraliser.m_count != 1)\n    os << 's';\n  return os;\n}\n\n} // namespace Catch\n// end catch_string_manip.cpp\n// start catch_stringref.cpp\n\n#include <algorithm>\n#include <ostream>\n#include <cstring>\n#include <cstdint>\n\nnamespace Catch {\nStringRef::StringRef(char const* rawChars) noexcept\n    : StringRef(rawChars,\n                static_cast<StringRef::size_type>(std::strlen(rawChars))) {}\n\nauto StringRef::c_str() const -> char const* {\n  CATCH_ENFORCE(isNullTerminated(),\n                \"Called StringRef::c_str() on a non-null-terminated instance\");\n  return m_start;\n}\nauto StringRef::data() const noexcept -> char const* { return m_start; }\n\nauto StringRef::substr(size_type start, size_type size) const noexcept\n    -> StringRef {\n  if (start < m_size) {\n    return StringRef(m_start + start, (std::min)(m_size - start, size));\n  } else {\n    return StringRef();\n  }\n}\nauto StringRef::operator==(StringRef const& other) const noexcept -> bool {\n  return m_size == other.m_size &&\n         (std::memcmp(m_start, other.m_start, m_size) == 0);\n}\n\nauto operator<<(std::ostream& os, StringRef const& str) -> std::ostream& {\n  return os.write(str.data(), str.size());\n}\n\nauto operator+=(std::string& lhs, StringRef const& rhs) -> std::string& {\n  lhs.append(rhs.data(), rhs.size());\n  return lhs;\n}\n\n} // namespace Catch\n// end catch_stringref.cpp\n// start catch_tag_alias.cpp\n\nnamespace Catch {\nTagAlias::TagAlias(std::string const& _tag, SourceLineInfo _lineInfo)\n    : tag(_tag), lineInfo(_lineInfo) {}\n} // namespace Catch\n// end catch_tag_alias.cpp\n// start 
catch_tag_alias_autoregistrar.cpp\n\nnamespace Catch {\n\nRegistrarForTagAliases::RegistrarForTagAliases(char const* alias,\n                                               char const* tag,\n                                               SourceLineInfo const& lineInfo) {\n  CATCH_TRY { getMutableRegistryHub().registerTagAlias(alias, tag, lineInfo); }\n  CATCH_CATCH_ALL {\n    // Do not throw when constructing global objects, instead register the\n    // exception to be processed later\n    getMutableRegistryHub().registerStartupException();\n  }\n}\n\n} // namespace Catch\n// end catch_tag_alias_autoregistrar.cpp\n// start catch_tag_alias_registry.cpp\n\n#include <sstream>\n\nnamespace Catch {\n\nTagAliasRegistry::~TagAliasRegistry() {}\n\nTagAlias const* TagAliasRegistry::find(std::string const& alias) const {\n  auto it = m_registry.find(alias);\n  if (it != m_registry.end())\n    return &(it->second);\n  else\n    return nullptr;\n}\n\nstd::string\nTagAliasRegistry::expandAliases(std::string const& unexpandedTestSpec) const {\n  std::string expandedTestSpec = unexpandedTestSpec;\n  for (auto const& registryKvp : m_registry) {\n    std::size_t pos = expandedTestSpec.find(registryKvp.first);\n    if (pos != std::string::npos) {\n      expandedTestSpec =\n          expandedTestSpec.substr(0, pos) + registryKvp.second.tag +\n          expandedTestSpec.substr(pos + registryKvp.first.size());\n    }\n  }\n  return expandedTestSpec;\n}\n\nvoid TagAliasRegistry::add(std::string const& alias, std::string const& tag,\n                           SourceLineInfo const& lineInfo) {\n  CATCH_ENFORCE(startsWith(alias, \"[@\") && endsWith(alias, ']'),\n                \"error: tag alias, '\" << alias\n                                      << \"' is not of the form [@alias name].\\n\"\n                                      << lineInfo);\n\n  CATCH_ENFORCE(\n      m_registry.insert(std::make_pair(alias, TagAlias(tag, lineInfo))).second,\n      \"error: tag alias, '\" << alias << 
\"' already registered.\\n\"\n                            << \"\\tFirst seen at: \" << find(alias)->lineInfo\n                            << \"\\n\"\n                            << \"\\tRedefined at: \" << lineInfo);\n}\n\nITagAliasRegistry::~ITagAliasRegistry() {}\n\nITagAliasRegistry const& ITagAliasRegistry::get() {\n  return getRegistryHub().getTagAliasRegistry();\n}\n\n} // end namespace Catch\n// end catch_tag_alias_registry.cpp\n// start catch_test_case_info.cpp\n\n#include <cctype>\n#include <exception>\n#include <algorithm>\n#include <sstream>\n\nnamespace Catch {\n\nnamespace {\nTestCaseInfo::SpecialProperties parseSpecialTag(std::string const& tag) {\n  if (startsWith(tag, '.') || tag == \"!hide\")\n    return TestCaseInfo::IsHidden;\n  else if (tag == \"!throws\")\n    return TestCaseInfo::Throws;\n  else if (tag == \"!shouldfail\")\n    return TestCaseInfo::ShouldFail;\n  else if (tag == \"!mayfail\")\n    return TestCaseInfo::MayFail;\n  else if (tag == \"!nonportable\")\n    return TestCaseInfo::NonPortable;\n  else if (tag == \"!benchmark\")\n    return static_cast<TestCaseInfo::SpecialProperties>(\n        TestCaseInfo::Benchmark | TestCaseInfo::IsHidden);\n  else\n    return TestCaseInfo::None;\n}\nbool isReservedTag(std::string const& tag) {\n  return parseSpecialTag(tag) == TestCaseInfo::None && tag.size() > 0 &&\n         !std::isalnum(static_cast<unsigned char>(tag[0]));\n}\nvoid enforceNotReservedTag(std::string const& tag,\n                           SourceLineInfo const& _lineInfo) {\n  CATCH_ENFORCE(!isReservedTag(tag),\n                \"Tag name: [\" << tag << \"] is not allowed.\\n\"\n                              << \"Tag names starting with non alphanumeric \"\n                                 \"characters are reserved\\n\"\n                              << _lineInfo);\n}\n} // namespace\n\nTestCase makeTestCase(ITestInvoker* _testCase, std::string const& _className,\n                      NameAndTags const& nameAndTags,\n             
         SourceLineInfo const& _lineInfo) {\n  bool isHidden = false;\n\n  // Parse out tags\n  std::vector<std::string> tags;\n  std::string desc, tag;\n  bool inTag = false;\n  for (char c : nameAndTags.tags) {\n    if (!inTag) {\n      if (c == '[')\n        inTag = true;\n      else\n        desc += c;\n    } else {\n      if (c == ']') {\n        TestCaseInfo::SpecialProperties prop = parseSpecialTag(tag);\n        if ((prop & TestCaseInfo::IsHidden) != 0)\n          isHidden = true;\n        else if (prop == TestCaseInfo::None)\n          enforceNotReservedTag(tag, _lineInfo);\n\n        // Merged hide tags like `[.approvals]` should be added as\n        // `[.][approvals]`. The `[.]` is added at later point, so\n        // we only strip the prefix\n        if (startsWith(tag, '.') && tag.size() > 1) {\n          tag.erase(0, 1);\n        }\n        tags.push_back(tag);\n        tag.clear();\n        inTag = false;\n      } else\n        tag += c;\n    }\n  }\n  if (isHidden) {\n    // Add all \"hidden\" tags to make them behave identically\n    tags.insert(tags.end(), {\".\", \"!hide\"});\n  }\n\n  TestCaseInfo info(static_cast<std::string>(nameAndTags.name), _className,\n                    desc, tags, _lineInfo);\n  return TestCase(_testCase, std::move(info));\n}\n\nvoid setTags(TestCaseInfo& testCaseInfo, std::vector<std::string> tags) {\n  std::sort(begin(tags), end(tags));\n  tags.erase(std::unique(begin(tags), end(tags)), end(tags));\n  testCaseInfo.lcaseTags.clear();\n\n  for (auto const& tag : tags) {\n    std::string lcaseTag    = toLower(tag);\n    testCaseInfo.properties = static_cast<TestCaseInfo::SpecialProperties>(\n        testCaseInfo.properties | parseSpecialTag(lcaseTag));\n    testCaseInfo.lcaseTags.push_back(lcaseTag);\n  }\n  testCaseInfo.tags = std::move(tags);\n}\n\nTestCaseInfo::TestCaseInfo(std::string const& _name,\n                           std::string const& _className,\n                           std::string const& 
_description,\n                           std::vector<std::string> const& _tags,\n                           SourceLineInfo const& _lineInfo)\n    : name(_name), className(_className), description(_description),\n      lineInfo(_lineInfo), properties(None) {\n  setTags(*this, _tags);\n}\n\nbool TestCaseInfo::isHidden() const { return (properties & IsHidden) != 0; }\nbool TestCaseInfo::throws() const { return (properties & Throws) != 0; }\nbool TestCaseInfo::okToFail() const {\n  return (properties & (ShouldFail | MayFail)) != 0;\n}\nbool TestCaseInfo::expectedToFail() const {\n  return (properties & (ShouldFail)) != 0;\n}\n\nstd::string TestCaseInfo::tagsAsString() const {\n  std::string ret;\n  // '[' and ']' per tag\n  std::size_t full_size = 2 * tags.size();\n  for (const auto& tag : tags) {\n    full_size += tag.size();\n  }\n  ret.reserve(full_size);\n  for (const auto& tag : tags) {\n    ret.push_back('[');\n    ret.append(tag);\n    ret.push_back(']');\n  }\n\n  return ret;\n}\n\nTestCase::TestCase(ITestInvoker* testCase, TestCaseInfo&& info)\n    : TestCaseInfo(std::move(info)), test(testCase) {}\n\nTestCase TestCase::withName(std::string const& _newName) const {\n  TestCase other(*this);\n  other.name = _newName;\n  return other;\n}\n\nvoid TestCase::invoke() const { test->invoke(); }\n\nbool TestCase::operator==(TestCase const& other) const {\n  return test.get() == other.test.get() && name == other.name &&\n         className == other.className;\n}\n\nbool TestCase::operator<(TestCase const& other) const {\n  return name < other.name;\n}\n\nTestCaseInfo const& TestCase::getTestCaseInfo() const { return *this; }\n\n} // end namespace Catch\n// end catch_test_case_info.cpp\n// start catch_test_case_registry_impl.cpp\n\n#include <algorithm>\n#include <sstream>\n\nnamespace Catch {\n\nnamespace {\nstruct TestHasher {\n  using hash_t = uint64_t;\n\n  explicit TestHasher(hash_t hashSuffix) : m_hashSuffix{hashSuffix} {}\n\n  uint32_t operator()(TestCase const& 
t) const {\n    // FNV-1a hash with multiplication fold.\n    const hash_t prime = 1099511628211u;\n    hash_t hash        = 14695981039346656037u;\n    for (const char c : t.name) {\n      hash ^= c;\n      hash *= prime;\n    }\n    hash ^= m_hashSuffix;\n    hash *= prime;\n    const uint32_t low{static_cast<uint32_t>(hash)};\n    const uint32_t high{static_cast<uint32_t>(hash >> 32)};\n    return low * high;\n  }\n\nprivate:\n  hash_t m_hashSuffix;\n};\n} // end unnamed namespace\n\nstd::vector<TestCase>\nsortTests(IConfig const& config,\n          std::vector<TestCase> const& unsortedTestCases) {\n  switch (config.runOrder()) {\n  case RunTests::InDeclarationOrder:\n    // already in declaration order\n    break;\n\n  case RunTests::InLexicographicalOrder: {\n    std::vector<TestCase> sorted = unsortedTestCases;\n    std::sort(sorted.begin(), sorted.end());\n    return sorted;\n  }\n\n  case RunTests::InRandomOrder: {\n    seedRng(config);\n    TestHasher h{config.rngSeed()};\n\n    using hashedTest = std::pair<TestHasher::hash_t, TestCase const*>;\n    std::vector<hashedTest> indexed_tests;\n    indexed_tests.reserve(unsortedTestCases.size());\n\n    for (auto const& testCase : unsortedTestCases) {\n      indexed_tests.emplace_back(h(testCase), &testCase);\n    }\n\n    std::sort(indexed_tests.begin(), indexed_tests.end(),\n              [](hashedTest const& lhs, hashedTest const& rhs) {\n                if (lhs.first == rhs.first) {\n                  return lhs.second->name < rhs.second->name;\n                }\n                return lhs.first < rhs.first;\n              });\n\n    std::vector<TestCase> sorted;\n    sorted.reserve(indexed_tests.size());\n\n    for (auto const& hashed : indexed_tests) {\n      sorted.emplace_back(*hashed.second);\n    }\n\n    return sorted;\n  }\n  }\n  return unsortedTestCases;\n}\n\nbool isThrowSafe(TestCase const& testCase, IConfig const& config) {\n  return !testCase.throws() || config.allowThrows();\n}\n\nbool 
matchTest(TestCase const& testCase, TestSpec const& testSpec,\n               IConfig const& config) {\n  return testSpec.matches(testCase) && isThrowSafe(testCase, config);\n}\n\nvoid enforceNoDuplicateTestCases(std::vector<TestCase> const& functions) {\n  std::set<TestCase> seenFunctions;\n  for (auto const& function : functions) {\n    auto prev = seenFunctions.insert(function);\n    CATCH_ENFORCE(prev.second, \"error: TEST_CASE( \\\"\"\n                                   << function.name << \"\\\" ) already defined.\\n\"\n                                   << \"\\tFirst seen at \"\n                                   << prev.first->getTestCaseInfo().lineInfo\n                                   << \"\\n\"\n                                   << \"\\tRedefined at \"\n                                   << function.getTestCaseInfo().lineInfo);\n  }\n}\n\nstd::vector<TestCase> filterTests(std::vector<TestCase> const& testCases,\n                                  TestSpec const& testSpec,\n                                  IConfig const& config) {\n  std::vector<TestCase> filtered;\n  filtered.reserve(testCases.size());\n  for (auto const& testCase : testCases) {\n    if ((!testSpec.hasFilters() && !testCase.isHidden()) ||\n        (testSpec.hasFilters() && matchTest(testCase, testSpec, config))) {\n      filtered.push_back(testCase);\n    }\n  }\n  return filtered;\n}\nstd::vector<TestCase> const& getAllTestCasesSorted(IConfig const& config) {\n  return getRegistryHub().getTestCaseRegistry().getAllTestsSorted(config);\n}\n\nvoid TestRegistry::registerTest(TestCase const& testCase) {\n  std::string name = testCase.getTestCaseInfo().name;\n  if (name.empty()) {\n    ReusableStringStream rss;\n    rss << \"Anonymous test case \" << ++m_unnamedCount;\n    return registerTest(testCase.withName(rss.str()));\n  }\n  m_functions.push_back(testCase);\n}\n\nstd::vector<TestCase> const& TestRegistry::getAllTests() const {\n  return m_functions;\n}\nstd::vector<TestCase> 
const&\nTestRegistry::getAllTestsSorted(IConfig const& config) const {\n  if (m_sortedFunctions.empty())\n    enforceNoDuplicateTestCases(m_functions);\n\n  if (m_currentSortOrder != config.runOrder() || m_sortedFunctions.empty()) {\n    m_sortedFunctions  = sortTests(config, m_functions);\n    m_currentSortOrder = config.runOrder();\n  }\n  return m_sortedFunctions;\n}\n\n///////////////////////////////////////////////////////////////////////////\nTestInvokerAsFunction::TestInvokerAsFunction(void (*testAsFunction)()) noexcept\n    : m_testAsFunction(testAsFunction) {}\n\nvoid TestInvokerAsFunction::invoke() const { m_testAsFunction(); }\n\nstd::string extractClassName(StringRef const& classOrQualifiedMethodName) {\n  std::string className(classOrQualifiedMethodName);\n  if (startsWith(className, '&')) {\n    std::size_t lastColons        = className.rfind(\"::\");\n    std::size_t penultimateColons = className.rfind(\"::\", lastColons - 1);\n    if (penultimateColons == std::string::npos)\n      penultimateColons = 1;\n    className =\n        className.substr(penultimateColons, lastColons - penultimateColons);\n  }\n  return className;\n}\n\n} // end namespace Catch\n// end catch_test_case_registry_impl.cpp\n// start catch_test_case_tracker.cpp\n\n#include <algorithm>\n#include <cassert>\n#include <stdexcept>\n#include <memory>\n#include <sstream>\n\n#if defined(__clang__)\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wexit-time-destructors\"\n#endif\n\nnamespace Catch {\nnamespace TestCaseTracking {\n\nNameAndLocation::NameAndLocation(std::string const& _name,\n                                 SourceLineInfo const& _location)\n    : name(_name), location(_location) {}\n\nITracker::~ITracker() = default;\n\nITracker& TrackerContext::startRun() {\n  m_rootTracker = std::make_shared<SectionTracker>(\n      NameAndLocation(\"{root}\", CATCH_INTERNAL_LINEINFO), *this, nullptr);\n  m_currentTracker = nullptr;\n  m_runState       = Executing;\n  
return *m_rootTracker;\n}\n\nvoid TrackerContext::endRun() {\n  m_rootTracker.reset();\n  m_currentTracker = nullptr;\n  m_runState       = NotStarted;\n}\n\nvoid TrackerContext::startCycle() {\n  m_currentTracker = m_rootTracker.get();\n  m_runState       = Executing;\n}\nvoid TrackerContext::completeCycle() { m_runState = CompletedCycle; }\n\nbool TrackerContext::completedCycle() const {\n  return m_runState == CompletedCycle;\n}\nITracker& TrackerContext::currentTracker() { return *m_currentTracker; }\nvoid TrackerContext::setCurrentTracker(ITracker* tracker) {\n  m_currentTracker = tracker;\n}\n\nTrackerBase::TrackerBase(NameAndLocation const& nameAndLocation,\n                         TrackerContext& ctx, ITracker* parent)\n    : ITracker(nameAndLocation), m_ctx(ctx), m_parent(parent) {}\n\nbool TrackerBase::isComplete() const {\n  return m_runState == CompletedSuccessfully || m_runState == Failed;\n}\nbool TrackerBase::isSuccessfullyCompleted() const {\n  return m_runState == CompletedSuccessfully;\n}\nbool TrackerBase::isOpen() const {\n  return m_runState != NotStarted && !isComplete();\n}\nbool TrackerBase::hasChildren() const { return !m_children.empty(); }\n\nvoid TrackerBase::addChild(ITrackerPtr const& child) {\n  m_children.push_back(child);\n}\n\nITrackerPtr TrackerBase::findChild(NameAndLocation const& nameAndLocation) {\n  auto it = std::find_if(m_children.begin(), m_children.end(),\n                         [&nameAndLocation](ITrackerPtr const& tracker) {\n                           return tracker->nameAndLocation().location ==\n                                      nameAndLocation.location &&\n                                  tracker->nameAndLocation().name ==\n                                      nameAndLocation.name;\n                         });\n  return (it != m_children.end()) ? 
*it : nullptr;\n}\nITracker& TrackerBase::parent() {\n  assert(m_parent); // Should always be non-null except for root\n  return *m_parent;\n}\n\nvoid TrackerBase::openChild() {\n  if (m_runState != ExecutingChildren) {\n    m_runState = ExecutingChildren;\n    if (m_parent)\n      m_parent->openChild();\n  }\n}\n\nbool TrackerBase::isSectionTracker() const { return false; }\nbool TrackerBase::isGeneratorTracker() const { return false; }\n\nvoid TrackerBase::open() {\n  m_runState = Executing;\n  moveToThis();\n  if (m_parent)\n    m_parent->openChild();\n}\n\nvoid TrackerBase::close() {\n\n  // Close any still open children (e.g. generators)\n  while (&m_ctx.currentTracker() != this)\n    m_ctx.currentTracker().close();\n\n  switch (m_runState) {\n  case NeedsAnotherRun:\n    break;\n\n  case Executing:\n    m_runState = CompletedSuccessfully;\n    break;\n  case ExecutingChildren:\n    if (std::all_of(m_children.begin(), m_children.end(),\n                    [](ITrackerPtr const& t) { return t->isComplete(); }))\n      m_runState = CompletedSuccessfully;\n    break;\n\n  case NotStarted:\n  case CompletedSuccessfully:\n  case Failed:\n    CATCH_INTERNAL_ERROR(\"Illogical state: \" << m_runState);\n\n  default:\n    CATCH_INTERNAL_ERROR(\"Unknown state: \" << m_runState);\n  }\n  moveToParent();\n  m_ctx.completeCycle();\n}\nvoid TrackerBase::fail() {\n  m_runState = Failed;\n  if (m_parent)\n    m_parent->markAsNeedingAnotherRun();\n  moveToParent();\n  m_ctx.completeCycle();\n}\nvoid TrackerBase::markAsNeedingAnotherRun() { m_runState = NeedsAnotherRun; }\n\nvoid TrackerBase::moveToParent() {\n  assert(m_parent);\n  m_ctx.setCurrentTracker(m_parent);\n}\nvoid TrackerBase::moveToThis() { m_ctx.setCurrentTracker(this); }\n\nSectionTracker::SectionTracker(NameAndLocation const& nameAndLocation,\n                               TrackerContext& ctx, ITracker* parent)\n    : TrackerBase(nameAndLocation, ctx, parent),\n      m_trimmed_name(trim(nameAndLocation.name)) 
{\n  if (parent) {\n    while (!parent->isSectionTracker())\n      parent = &parent->parent();\n\n    SectionTracker& parentSection = static_cast<SectionTracker&>(*parent);\n    addNextFilters(parentSection.m_filters);\n  }\n}\n\nbool SectionTracker::isComplete() const {\n  bool complete = true;\n\n  if (m_filters.empty() || m_filters[0] == \"\" ||\n      std::find(m_filters.begin(), m_filters.end(), m_trimmed_name) !=\n          m_filters.end()) {\n    complete = TrackerBase::isComplete();\n  }\n  return complete;\n}\n\nbool SectionTracker::isSectionTracker() const { return true; }\n\nSectionTracker&\nSectionTracker::acquire(TrackerContext& ctx,\n                        NameAndLocation const& nameAndLocation) {\n  std::shared_ptr<SectionTracker> section;\n\n  ITracker& currentTracker = ctx.currentTracker();\n  if (ITrackerPtr childTracker = currentTracker.findChild(nameAndLocation)) {\n    assert(childTracker);\n    assert(childTracker->isSectionTracker());\n    section = std::static_pointer_cast<SectionTracker>(childTracker);\n  } else {\n    section =\n        std::make_shared<SectionTracker>(nameAndLocation, ctx, &currentTracker);\n    currentTracker.addChild(section);\n  }\n  if (!ctx.completedCycle())\n    section->tryOpen();\n  return *section;\n}\n\nvoid SectionTracker::tryOpen() {\n  if (!isComplete())\n    open();\n}\n\nvoid SectionTracker::addInitialFilters(\n    std::vector<std::string> const& filters) {\n  if (!filters.empty()) {\n    m_filters.reserve(m_filters.size() + filters.size() + 2);\n    m_filters.emplace_back(\"\"); // Root - should never be consulted\n    m_filters.emplace_back(\"\"); // Test Case - not a section filter\n    m_filters.insert(m_filters.end(), filters.begin(), filters.end());\n  }\n}\nvoid SectionTracker::addNextFilters(std::vector<std::string> const& filters) {\n  if (filters.size() > 1)\n    m_filters.insert(m_filters.end(), filters.begin() + 1, filters.end());\n}\n\nstd::vector<std::string> const& 
SectionTracker::getFilters() const {\n  return m_filters;\n}\n\nstd::string const& SectionTracker::trimmedName() const {\n  return m_trimmed_name;\n}\n\n} // namespace TestCaseTracking\n\nusing TestCaseTracking::ITracker;\nusing TestCaseTracking::SectionTracker;\nusing TestCaseTracking::TrackerContext;\n\n} // namespace Catch\n\n#if defined(__clang__)\n#pragma clang diagnostic pop\n#endif\n// end catch_test_case_tracker.cpp\n// start catch_test_registry.cpp\n\nnamespace Catch {\n\nauto makeTestInvoker(void (*testAsFunction)()) noexcept -> ITestInvoker* {\n  return new (std::nothrow) TestInvokerAsFunction(testAsFunction);\n}\n\nNameAndTags::NameAndTags(StringRef const& name_,\n                         StringRef const& tags_) noexcept\n    : name(name_), tags(tags_) {}\n\nAutoReg::AutoReg(ITestInvoker* invoker, SourceLineInfo const& lineInfo,\n                 StringRef const& classOrMethod,\n                 NameAndTags const& nameAndTags) noexcept {\n  CATCH_TRY {\n    getMutableRegistryHub().registerTest(makeTestCase(\n        invoker, extractClassName(classOrMethod), nameAndTags, lineInfo));\n  }\n  CATCH_CATCH_ALL {\n    // Do not throw when constructing global objects, instead register the\n    // exception to be processed later\n    getMutableRegistryHub().registerStartupException();\n  }\n}\n\nAutoReg::~AutoReg() = default;\n} // namespace Catch\n// end catch_test_registry.cpp\n// start catch_test_spec.cpp\n\n#include <algorithm>\n#include <string>\n#include <vector>\n#include <memory>\n\nnamespace Catch {\n\nTestSpec::Pattern::Pattern(std::string const& name) : m_name(name) {}\n\nTestSpec::Pattern::~Pattern() = default;\n\nstd::string const& TestSpec::Pattern::name() const { return m_name; }\n\nTestSpec::NamePattern::NamePattern(std::string const& name,\n                                   std::string const& filterString)\n    : Pattern(filterString),\n      m_wildcardPattern(toLower(name), CaseSensitive::No) {}\n\nbool 
TestSpec::NamePattern::matches(TestCaseInfo const& testCase) const {\n  return m_wildcardPattern.matches(testCase.name);\n}\n\nTestSpec::TagPattern::TagPattern(std::string const& tag,\n                                 std::string const& filterString)\n    : Pattern(filterString), m_tag(toLower(tag)) {}\n\nbool TestSpec::TagPattern::matches(TestCaseInfo const& testCase) const {\n  return std::find(begin(testCase.lcaseTags), end(testCase.lcaseTags), m_tag) !=\n         end(testCase.lcaseTags);\n}\n\nTestSpec::ExcludedPattern::ExcludedPattern(PatternPtr const& underlyingPattern)\n    : Pattern(underlyingPattern->name()),\n      m_underlyingPattern(underlyingPattern) {}\n\nbool TestSpec::ExcludedPattern::matches(TestCaseInfo const& testCase) const {\n  return !m_underlyingPattern->matches(testCase);\n}\n\nbool TestSpec::Filter::matches(TestCaseInfo const& testCase) const {\n  return std::all_of(m_patterns.begin(), m_patterns.end(),\n                     [&](PatternPtr const& p) { return p->matches(testCase); });\n}\n\nstd::string TestSpec::Filter::name() const {\n  std::string name;\n  for (auto const& p : m_patterns)\n    name += p->name();\n  return name;\n}\n\nbool TestSpec::hasFilters() const { return !m_filters.empty(); }\n\nbool TestSpec::matches(TestCaseInfo const& testCase) const {\n  return std::any_of(m_filters.begin(), m_filters.end(),\n                     [&](Filter const& f) { return f.matches(testCase); });\n}\n\nTestSpec::Matches\nTestSpec::matchesByFilter(std::vector<TestCase> const& testCases,\n                          IConfig const& config) const {\n  Matches matches(m_filters.size());\n  std::transform(m_filters.begin(), m_filters.end(), matches.begin(),\n                 [&](Filter const& filter) {\n                   std::vector<TestCase const*> currentMatches;\n                   for (auto const& test : testCases)\n                     if (isThrowSafe(test, config) && filter.matches(test))\n                       
currentMatches.emplace_back(&test);\n                   return FilterMatch{filter.name(), currentMatches};\n                 });\n  return matches;\n}\n\nconst TestSpec::vectorStrings& TestSpec::getInvalidArgs() const {\n  return (m_invalidArgs);\n}\n\n} // namespace Catch\n// end catch_test_spec.cpp\n// start catch_test_spec_parser.cpp\n\nnamespace Catch {\n\nTestSpecParser::TestSpecParser(ITagAliasRegistry const& tagAliases)\n    : m_tagAliases(&tagAliases) {}\n\nTestSpecParser& TestSpecParser::parse(std::string const& arg) {\n  m_mode      = None;\n  m_exclusion = false;\n  m_arg       = m_tagAliases->expandAliases(arg);\n  m_escapeChars.clear();\n  m_substring.reserve(m_arg.size());\n  m_patternName.reserve(m_arg.size());\n  m_realPatternPos = 0;\n\n  for (m_pos = 0; m_pos < m_arg.size(); ++m_pos)\n    // if visitChar fails\n    if (!visitChar(m_arg[m_pos])) {\n      m_testSpec.m_invalidArgs.push_back(arg);\n      break;\n    }\n  endMode();\n  return *this;\n}\nTestSpec TestSpecParser::testSpec() {\n  addFilter();\n  return m_testSpec;\n}\nbool TestSpecParser::visitChar(char c) {\n  if ((m_mode != EscapedName) && (c == '\\\\')) {\n    escape();\n    addCharToPattern(c);\n    return true;\n  } else if ((m_mode != EscapedName) && (c == ',')) {\n    return separate();\n  }\n\n  switch (m_mode) {\n  case None:\n    if (processNoneChar(c))\n      return true;\n    break;\n  case Name:\n    processNameChar(c);\n    break;\n  case EscapedName:\n    endMode();\n    addCharToPattern(c);\n    return true;\n  default:\n  case Tag:\n  case QuotedName:\n    if (processOtherChar(c))\n      return true;\n    break;\n  }\n\n  m_substring += c;\n  if (!isControlChar(c)) {\n    m_patternName += c;\n    m_realPatternPos++;\n  }\n  return true;\n}\n// Two of the processing methods return true to signal the caller to return\n// without adding the given character to the current pattern strings\nbool TestSpecParser::processNoneChar(char c) {\n  switch (c) {\n  case ' ':\n    return 
true;\n  case '~':\n    m_exclusion = true;\n    return false;\n  case '[':\n    startNewMode(Tag);\n    return false;\n  case '\"':\n    startNewMode(QuotedName);\n    return false;\n  default:\n    startNewMode(Name);\n    return false;\n  }\n}\nvoid TestSpecParser::processNameChar(char c) {\n  if (c == '[') {\n    if (m_substring == \"exclude:\")\n      m_exclusion = true;\n    else\n      endMode();\n    startNewMode(Tag);\n  }\n}\nbool TestSpecParser::processOtherChar(char c) {\n  if (!isControlChar(c))\n    return false;\n  m_substring += c;\n  endMode();\n  return true;\n}\nvoid TestSpecParser::startNewMode(Mode mode) { m_mode = mode; }\nvoid TestSpecParser::endMode() {\n  switch (m_mode) {\n  case Name:\n  case QuotedName:\n    return addNamePattern();\n  case Tag:\n    return addTagPattern();\n  case EscapedName:\n    revertBackToLastMode();\n    return;\n  case None:\n  default:\n    return startNewMode(None);\n  }\n}\nvoid TestSpecParser::escape() {\n  saveLastMode();\n  m_mode = EscapedName;\n  m_escapeChars.push_back(m_realPatternPos);\n}\nbool TestSpecParser::isControlChar(char c) const {\n  switch (m_mode) {\n  default:\n    return false;\n  case None:\n    return c == '~';\n  case Name:\n    return c == '[';\n  case EscapedName:\n    return true;\n  case QuotedName:\n    return c == '\"';\n  case Tag:\n    return c == '[' || c == ']';\n  }\n}\n\nvoid TestSpecParser::addFilter() {\n  if (!m_currentFilter.m_patterns.empty()) {\n    m_testSpec.m_filters.push_back(m_currentFilter);\n    m_currentFilter = TestSpec::Filter();\n  }\n}\n\nvoid TestSpecParser::saveLastMode() { lastMode = m_mode; }\n\nvoid TestSpecParser::revertBackToLastMode() { m_mode = lastMode; }\n\nbool TestSpecParser::separate() {\n  if ((m_mode == QuotedName) || (m_mode == Tag)) {\n    // invalid argument, signal failure to previous scope.\n    m_mode = None;\n    m_pos  = m_arg.size();\n    m_substring.clear();\n    m_patternName.clear();\n    m_realPatternPos = 0;\n    return 
false;\n  }\n  endMode();\n  addFilter();\n  return true; // success\n}\n\nstd::string TestSpecParser::preprocessPattern() {\n  std::string token = m_patternName;\n  for (std::size_t i = 0; i < m_escapeChars.size(); ++i)\n    token = token.substr(0, m_escapeChars[i] - i) +\n            token.substr(m_escapeChars[i] - i + 1);\n  m_escapeChars.clear();\n  if (startsWith(token, \"exclude:\")) {\n    m_exclusion = true;\n    token       = token.substr(8);\n  }\n\n  m_patternName.clear();\n  m_realPatternPos = 0;\n\n  return token;\n}\n\nvoid TestSpecParser::addNamePattern() {\n  auto token = preprocessPattern();\n\n  if (!token.empty()) {\n    TestSpec::PatternPtr pattern =\n        std::make_shared<TestSpec::NamePattern>(token, m_substring);\n    if (m_exclusion)\n      pattern = std::make_shared<TestSpec::ExcludedPattern>(pattern);\n    m_currentFilter.m_patterns.push_back(pattern);\n  }\n  m_substring.clear();\n  m_exclusion = false;\n  m_mode      = None;\n}\n\nvoid TestSpecParser::addTagPattern() {\n  auto token = preprocessPattern();\n\n  if (!token.empty()) {\n    // If the tag pattern is the \"hide and tag\" shorthand (e.g. 
[.foo])\n    // we have to create a separate hide tag and shorten the real one\n    if (token.size() > 1 && token[0] == '.') {\n      token.erase(token.begin());\n      TestSpec::PatternPtr pattern =\n          std::make_shared<TestSpec::TagPattern>(\".\", m_substring);\n      if (m_exclusion) {\n        pattern = std::make_shared<TestSpec::ExcludedPattern>(pattern);\n      }\n      m_currentFilter.m_patterns.push_back(pattern);\n    }\n\n    TestSpec::PatternPtr pattern =\n        std::make_shared<TestSpec::TagPattern>(token, m_substring);\n\n    if (m_exclusion) {\n      pattern = std::make_shared<TestSpec::ExcludedPattern>(pattern);\n    }\n    m_currentFilter.m_patterns.push_back(pattern);\n  }\n  m_substring.clear();\n  m_exclusion = false;\n  m_mode      = None;\n}\n\nTestSpec parseTestSpec(std::string const& arg) {\n  return TestSpecParser(ITagAliasRegistry::get()).parse(arg).testSpec();\n}\n\n} // namespace Catch\n// end catch_test_spec_parser.cpp\n// start catch_timer.cpp\n\n#include <chrono>\n\nstatic const uint64_t nanosecondsInSecond = 1000000000;\n\nnamespace Catch {\n\nauto getCurrentNanosecondsSinceEpoch() -> uint64_t {\n  return std::chrono::duration_cast<std::chrono::nanoseconds>(\n             std::chrono::high_resolution_clock::now().time_since_epoch())\n      .count();\n}\n\nnamespace {\nauto estimateClockResolution() -> uint64_t {\n  uint64_t sum                     = 0;\n  static const uint64_t iterations = 1000000;\n\n  auto startTime = getCurrentNanosecondsSinceEpoch();\n\n  for (std::size_t i = 0; i < iterations; ++i) {\n\n    uint64_t ticks;\n    uint64_t baseTicks = getCurrentNanosecondsSinceEpoch();\n    do {\n      ticks = getCurrentNanosecondsSinceEpoch();\n    } while (ticks == baseTicks);\n\n    auto delta = ticks - baseTicks;\n    sum += delta;\n\n    // If we have been calibrating for over 3 seconds -- the clock\n    // is terrible and we should move on.\n    // TBD: How to signal that the measured resolution is probably wrong?\n   
 if (ticks > startTime + 3 * nanosecondsInSecond) {\n      return sum / (i + 1u);\n    }\n  }\n\n  // We're just taking the mean, here. To do better we could take the std. dev\n  // and exclude outliers\n  // - and potentially do more iterations if there's a high variance.\n  return sum / iterations;\n}\n} // namespace\nauto getEstimatedClockResolution() -> uint64_t {\n  static auto s_resolution = estimateClockResolution();\n  return s_resolution;\n}\n\nvoid Timer::start() { m_nanoseconds = getCurrentNanosecondsSinceEpoch(); }\nauto Timer::getElapsedNanoseconds() const -> uint64_t {\n  return getCurrentNanosecondsSinceEpoch() - m_nanoseconds;\n}\nauto Timer::getElapsedMicroseconds() const -> uint64_t {\n  return getElapsedNanoseconds() / 1000;\n}\nauto Timer::getElapsedMilliseconds() const -> unsigned int {\n  return static_cast<unsigned int>(getElapsedMicroseconds() / 1000);\n}\nauto Timer::getElapsedSeconds() const -> double {\n  return getElapsedMicroseconds() / 1000000.0;\n}\n\n} // namespace Catch\n// end catch_timer.cpp\n// start catch_tostring.cpp\n\n#if defined(__clang__)\n#pragma clang diagnostic push\n#pragma clang diagnostic ignored \"-Wexit-time-destructors\"\n#pragma clang diagnostic ignored \"-Wglobal-constructors\"\n#endif\n\n// Enable specific decls locally\n#if !defined(CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER)\n#define CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER\n#endif\n\n#include <cmath>\n#include <iomanip>\n\nnamespace Catch {\n\nnamespace Detail {\n\nconst std::string unprintableString = \"{?}\";\n\nnamespace {\nconst int hexThreshold = 255;\n\nstruct Endianness {\n  enum Arch { Big, Little };\n\n  static Arch which() {\n    int one = 1;\n    // If the lowest byte we read is non-zero, we can assume\n    // that little endian format is used.\n    auto value = *reinterpret_cast<char*>(&one);\n    return value ? 
Little : Big;\n  }\n};\n} // namespace\n\nstd::string rawMemoryToString(const void* object, std::size_t size) {\n  // Reverse order for little endian architectures\n  int i = 0, end = static_cast<int>(size), inc = 1;\n  if (Endianness::which() == Endianness::Little) {\n    i   = end - 1;\n    end = inc = -1;\n  }\n\n  unsigned char const* bytes = static_cast<unsigned char const*>(object);\n  ReusableStringStream rss;\n  rss << \"0x\" << std::setfill('0') << std::hex;\n  for (; i != end; i += inc)\n    rss << std::setw(2) << static_cast<unsigned>(bytes[i]);\n  return rss.str();\n}\n} // namespace Detail\n\ntemplate <typename T>\nstd::string fpToString(T value, int precision) {\n  if (Catch::isnan(value)) {\n    return \"nan\";\n  }\n\n  ReusableStringStream rss;\n  rss << std::setprecision(precision) << std::fixed << value;\n  std::string d = rss.str();\n  std::size_t i = d.find_last_not_of('0');\n  if (i != std::string::npos && i != d.size() - 1) {\n    if (d[i] == '.')\n      i++;\n    d = d.substr(0, i + 1);\n  }\n  return d;\n}\n\n//// ======================================================= ////\n//\n//   Out-of-line defs for full specialization of StringMaker\n//\n//// ======================================================= ////\n\nstd::string StringMaker<std::string>::convert(const std::string& str) {\n  if (!getCurrentContext().getConfig()->showInvisibles()) {\n    return '\"' + str + '\"';\n  }\n\n  std::string s(\"\\\"\");\n  for (char c : str) {\n    switch (c) {\n    case '\\n':\n      s.append(\"\\\\n\");\n      break;\n    case '\\t':\n      s.append(\"\\\\t\");\n      break;\n    default:\n      s.push_back(c);\n      break;\n    }\n  }\n  s.append(\"\\\"\");\n  return s;\n}\n\n#ifdef CATCH_CONFIG_CPP17_STRING_VIEW\nstd::string StringMaker<std::string_view>::convert(std::string_view str) {\n  return ::Catch::Detail::stringify(std::string{str});\n}\n#endif\n\nstd::string StringMaker<char const*>::convert(char const* str) {\n  if (str) {\n    return 
::Catch::Detail::stringify(std::string{str});\n  } else {\n    return {\"{null string}\"};\n  }\n}\nstd::string StringMaker<char*>::convert(char* str) {\n  if (str) {\n    return ::Catch::Detail::stringify(std::string{str});\n  } else {\n    return {\"{null string}\"};\n  }\n}\n\n#ifdef CATCH_CONFIG_WCHAR\nstd::string StringMaker<std::wstring>::convert(const std::wstring& wstr) {\n  std::string s;\n  s.reserve(wstr.size());\n  for (auto c : wstr) {\n    s += (c <= 0xff) ? static_cast<char>(c) : '?';\n  }\n  return ::Catch::Detail::stringify(s);\n}\n\n#ifdef CATCH_CONFIG_CPP17_STRING_VIEW\nstd::string StringMaker<std::wstring_view>::convert(std::wstring_view str) {\n  return StringMaker<std::wstring>::convert(std::wstring(str));\n}\n#endif\n\nstd::string StringMaker<wchar_t const*>::convert(wchar_t const* str) {\n  if (str) {\n    return ::Catch::Detail::stringify(std::wstring{str});\n  } else {\n    return {\"{null string}\"};\n  }\n}\nstd::string StringMaker<wchar_t*>::convert(wchar_t* str) {\n  if (str) {\n    return ::Catch::Detail::stringify(std::wstring{str});\n  } else {\n    return {\"{null string}\"};\n  }\n}\n#endif\n\n#if defined(CATCH_CONFIG_CPP17_BYTE)\n#include <cstddef>\nstd::string StringMaker<std::byte>::convert(std::byte value) {\n  return ::Catch::Detail::stringify(std::to_integer<unsigned long long>(value));\n}\n#endif // defined(CATCH_CONFIG_CPP17_BYTE)\n\nstd::string StringMaker<int>::convert(int value) {\n  return ::Catch::Detail::stringify(static_cast<long long>(value));\n}\nstd::string StringMaker<long>::convert(long value) {\n  return ::Catch::Detail::stringify(static_cast<long long>(value));\n}\nstd::string StringMaker<long long>::convert(long long value) {\n  ReusableStringStream rss;\n  rss << value;\n  if (value > Detail::hexThreshold) {\n    rss << \" (0x\" << std::hex << value << ')';\n  }\n  return rss.str();\n}\n\nstd::string StringMaker<unsigned int>::convert(unsigned int value) {\n  return 
::Catch::Detail::stringify(static_cast<unsigned long long>(value));\n}\nstd::string StringMaker<unsigned long>::convert(unsigned long value) {\n  return ::Catch::Detail::stringify(static_cast<unsigned long long>(value));\n}\nstd::string StringMaker<unsigned long long>::convert(unsigned long long value) {\n  ReusableStringStream rss;\n  rss << value;\n  if (value > Detail::hexThreshold) {\n    rss << \" (0x\" << std::hex << value << ')';\n  }\n  return rss.str();\n}\n\nstd::string StringMaker<bool>::convert(bool b) { return b ? \"true\" : \"false\"; }\n\nstd::string StringMaker<signed char>::convert(signed char value) {\n  if (value == '\\r') {\n    return \"'\\\\r'\";\n  } else if (value == '\\f') {\n    return \"'\\\\f'\";\n  } else if (value == '\\n') {\n    return \"'\\\\n'\";\n  } else if (value == '\\t') {\n    return \"'\\\\t'\";\n  } else if ('\\0' <= value && value < ' ') {\n    return ::Catch::Detail::stringify(static_cast<unsigned int>(value));\n  } else {\n    char chstr[] = \"' '\";\n    chstr[1]     = value;\n    return chstr;\n  }\n}\nstd::string StringMaker<char>::convert(char c) {\n  return ::Catch::Detail::stringify(static_cast<signed char>(c));\n}\nstd::string StringMaker<unsigned char>::convert(unsigned char c) {\n  return ::Catch::Detail::stringify(static_cast<char>(c));\n}\n\nstd::string StringMaker<std::nullptr_t>::convert(std::nullptr_t) {\n  return \"nullptr\";\n}\n\nint StringMaker<float>::precision = 5;\n\nstd::string StringMaker<float>::convert(float value) {\n  return fpToString(value, precision) + 'f';\n}\n\nint StringMaker<double>::precision = 10;\n\nstd::string StringMaker<double>::convert(double value) {\n  return fpToString(value, precision);\n}\n\nstd::string ratio_string<std::atto>::symbol() { return \"a\"; }\nstd::string ratio_string<std::femto>::symbol() { return \"f\"; }\nstd::string ratio_string<std::pico>::symbol() { return \"p\"; }\nstd::string ratio_string<std::nano>::symbol() { return \"n\"; }\nstd::string 
ratio_string<std::micro>::symbol() { return \"u\"; }\nstd::string ratio_string<std::milli>::symbol() { return \"m\"; }\n\n} // end namespace Catch\n\n#if defined(__clang__)\n#pragma clang diagnostic pop\n#endif\n\n// end catch_tostring.cpp\n// start catch_totals.cpp\n\nnamespace Catch {\n\nCounts Counts::operator-(Counts const& other) const {\n  Counts diff;\n  diff.passed      = passed - other.passed;\n  diff.failed      = failed - other.failed;\n  diff.failedButOk = failedButOk - other.failedButOk;\n  return diff;\n}\n\nCounts& Counts::operator+=(Counts const& other) {\n  passed += other.passed;\n  failed += other.failed;\n  failedButOk += other.failedButOk;\n  return *this;\n}\n\nstd::size_t Counts::total() const { return passed + failed + failedButOk; }\nbool Counts::allPassed() const { return failed == 0 && failedButOk == 0; }\nbool Counts::allOk() const { return failed == 0; }\n\nTotals Totals::operator-(Totals const& other) const {\n  Totals diff;\n  diff.assertions = assertions - other.assertions;\n  diff.testCases  = testCases - other.testCases;\n  return diff;\n}\n\nTotals& Totals::operator+=(Totals const& other) {\n  assertions += other.assertions;\n  testCases += other.testCases;\n  return *this;\n}\n\nTotals Totals::delta(Totals const& prevTotals) const {\n  Totals diff = *this - prevTotals;\n  if (diff.assertions.failed > 0)\n    ++diff.testCases.failed;\n  else if (diff.assertions.failedButOk > 0)\n    ++diff.testCases.failedButOk;\n  else\n    ++diff.testCases.passed;\n  return diff;\n}\n\n} // namespace Catch\n// end catch_totals.cpp\n// start catch_uncaught_exceptions.cpp\n\n// start catch_config_uncaught_exceptions.hpp\n\n//              Copyright Catch2 Authors\n// Distributed under the Boost Software License, Version 1.0.\n//   (See accompanying file LICENSE_1_0.txt or copy at\n//        https://www.boost.org/LICENSE_1_0.txt)\n\n// SPDX-License-Identifier: BSL-1.0\n\n#ifndef CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP\n#define 
CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP\n\n#if defined(_MSC_VER)\n#if _MSC_VER >= 1900 // Visual Studio 2015 or newer\n#define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS\n#endif\n#endif\n\n#include <exception>\n\n#if defined(__cpp_lib_uncaught_exceptions) &&                                  \\\n    !defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)\n\n#define CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS\n#endif // __cpp_lib_uncaught_exceptions\n\n#if defined(CATCH_INTERNAL_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS) &&                \\\n    !defined(CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS) &&                     \\\n    !defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)\n\n#define CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS\n#endif\n\n#endif // CATCH_CONFIG_UNCAUGHT_EXCEPTIONS_HPP\n// end catch_config_uncaught_exceptions.hpp\n#include <exception>\n\nnamespace Catch {\nbool uncaught_exceptions() {\n#if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS)\n  return false;\n#elif defined(CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS)\n  return std::uncaught_exceptions() > 0;\n#else\n  return std::uncaught_exception();\n#endif\n}\n} // end namespace Catch\n// end catch_uncaught_exceptions.cpp\n// start catch_version.cpp\n\n#include <ostream>\n\nnamespace Catch {\n\nVersion::Version(unsigned int _majorVersion, unsigned int _minorVersion,\n                 unsigned int _patchNumber, char const* const _branchName,\n                 unsigned int _buildNumber)\n    : majorVersion(_majorVersion), minorVersion(_minorVersion),\n      patchNumber(_patchNumber), branchName(_branchName),\n      buildNumber(_buildNumber) {}\n\nstd::ostream& operator<<(std::ostream& os, Version const& version) {\n  os << version.majorVersion << '.' << version.minorVersion << '.'\n     << version.patchNumber;\n  // branchName is never null -> 0th char is \\0 if it is empty\n  if (version.branchName[0]) {\n    os << '-' << version.branchName << '.' 
<< version.buildNumber;\n  }\n  return os;\n}\n\nVersion const& libraryVersion() {\n  static Version version(2, 13, 5, \"\", 0);\n  return version;\n}\n\n} // namespace Catch\n// end catch_version.cpp\n// start catch_wildcard_pattern.cpp\n\nnamespace Catch {\n\nWildcardPattern::WildcardPattern(std::string const& pattern,\n                                 CaseSensitive::Choice caseSensitivity)\n    : m_caseSensitivity(caseSensitivity), m_pattern(normaliseString(pattern)) {\n  if (startsWith(m_pattern, '*')) {\n    m_pattern  = m_pattern.substr(1);\n    m_wildcard = WildcardAtStart;\n  }\n  if (endsWith(m_pattern, '*')) {\n    m_pattern  = m_pattern.substr(0, m_pattern.size() - 1);\n    m_wildcard = static_cast<WildcardPosition>(m_wildcard | WildcardAtEnd);\n  }\n}\n\nbool WildcardPattern::matches(std::string const& str) const {\n  switch (m_wildcard) {\n  case NoWildcard:\n    return m_pattern == normaliseString(str);\n  case WildcardAtStart:\n    return endsWith(normaliseString(str), m_pattern);\n  case WildcardAtEnd:\n    return startsWith(normaliseString(str), m_pattern);\n  case WildcardAtBothEnds:\n    return contains(normaliseString(str), m_pattern);\n  default:\n    CATCH_INTERNAL_ERROR(\"Unknown enum\");\n  }\n}\n\nstd::string WildcardPattern::normaliseString(std::string const& str) const {\n  return trim(m_caseSensitivity == CaseSensitive::No ? 
toLower(str) : str);\n}\n} // namespace Catch\n// end catch_wildcard_pattern.cpp\n// start catch_xmlwriter.cpp\n\n#include <iomanip>\n#include <type_traits>\n\nnamespace Catch {\n\nnamespace {\n\nsize_t trailingBytes(unsigned char c) {\n  if ((c & 0xE0) == 0xC0) {\n    return 2;\n  }\n  if ((c & 0xF0) == 0xE0) {\n    return 3;\n  }\n  if ((c & 0xF8) == 0xF0) {\n    return 4;\n  }\n  CATCH_INTERNAL_ERROR(\"Invalid multibyte utf-8 start byte encountered\");\n}\n\nuint32_t headerValue(unsigned char c) {\n  if ((c & 0xE0) == 0xC0) {\n    return c & 0x1F;\n  }\n  if ((c & 0xF0) == 0xE0) {\n    return c & 0x0F;\n  }\n  if ((c & 0xF8) == 0xF0) {\n    return c & 0x07;\n  }\n  CATCH_INTERNAL_ERROR(\"Invalid multibyte utf-8 start byte encountered\");\n}\n\nvoid hexEscapeChar(std::ostream& os, unsigned char c) {\n  std::ios_base::fmtflags f(os.flags());\n  os << \"\\\\x\" << std::uppercase << std::hex << std::setfill('0') << std::setw(2)\n     << static_cast<int>(c);\n  os.flags(f);\n}\n\nbool shouldNewline(XmlFormatting fmt) {\n  return !!(static_cast<std::underlying_type<XmlFormatting>::type>(\n      fmt & XmlFormatting::Newline));\n}\n\nbool shouldIndent(XmlFormatting fmt) {\n  return !!(static_cast<std::underlying_type<XmlFormatting>::type>(\n      fmt & XmlFormatting::Indent));\n}\n\n} // anonymous namespace\n\nXmlFormatting operator|(XmlFormatting lhs, XmlFormatting rhs) {\n  return static_cast<XmlFormatting>(\n      static_cast<std::underlying_type<XmlFormatting>::type>(lhs) |\n      static_cast<std::underlying_type<XmlFormatting>::type>(rhs));\n}\n\nXmlFormatting operator&(XmlFormatting lhs, XmlFormatting rhs) {\n  return static_cast<XmlFormatting>(\n      static_cast<std::underlying_type<XmlFormatting>::type>(lhs) &\n      static_cast<std::underlying_type<XmlFormatting>::type>(rhs));\n}\n\nXmlEncode::XmlEncode(std::string const& str, ForWhat forWhat)\n    : m_str(str), m_forWhat(forWhat) {}\n\nvoid XmlEncode::encodeTo(std::ostream& os) const {\n  // Apostrophe 
escaping not necessary if we always use \" to write attributes\n  // (see: http://www.w3.org/TR/xml/#syntax)\n\n  for (std::size_t idx = 0; idx < m_str.size(); ++idx) {\n    unsigned char c = m_str[idx];\n    switch (c) {\n    case '<':\n      os << \"&lt;\";\n      break;\n    case '&':\n      os << \"&amp;\";\n      break;\n\n    case '>':\n      // See: http://www.w3.org/TR/xml/#syntax\n      if (idx > 2 && m_str[idx - 1] == ']' && m_str[idx - 2] == ']')\n        os << \"&gt;\";\n      else\n        os << c;\n      break;\n\n    case '\\\"':\n      if (m_forWhat == ForAttributes)\n        os << \"&quot;\";\n      else\n        os << c;\n      break;\n\n    default:\n      // Check for control characters and invalid utf-8\n\n      // Escape control characters in standard ascii\n      // see\n      // http://stackoverflow.com/questions/404107/why-are-control-characters-illegal-in-xml-1-0\n      if (c < 0x09 || (c > 0x0D && c < 0x20) || c == 0x7F) {\n        hexEscapeChar(os, c);\n        break;\n      }\n\n      // Plain ASCII: Write it to stream\n      if (c < 0x7F) {\n        os << c;\n        break;\n      }\n\n      // UTF-8 territory\n      // Check if the encoding is valid and if it is not, hex escape bytes.\n      // Important: We do not check the exact decoded values for validity, only\n      // the encoding format First check that this bytes is a valid lead byte:\n      // This means that it is not encoded as 1111 1XXX\n      // Or as 10XX XXXX\n      if (c < 0xC0 || c >= 0xF8) {\n        hexEscapeChar(os, c);\n        break;\n      }\n\n      auto encBytes = trailingBytes(c);\n      // Are there enough bytes left to avoid accessing out-of-bounds memory?\n      if (idx + encBytes - 1 >= m_str.size()) {\n        hexEscapeChar(os, c);\n        break;\n      }\n      // The header is valid, check data\n      // The next encBytes bytes must together be a valid utf-8\n      // This means: bitpattern 10XX XXXX and the extracted value is sane (ish)\n      bool 
valid     = true;\n      uint32_t value = headerValue(c);\n      for (std::size_t n = 1; n < encBytes; ++n) {\n        unsigned char nc = m_str[idx + n];\n        valid &= ((nc & 0xC0) == 0x80);\n        value = (value << 6) | (nc & 0x3F);\n      }\n\n      if (\n          // Wrong bit pattern of following bytes\n          (!valid) ||\n          // Overlong encodings\n          (value < 0x80) || (0x80 <= value && value < 0x800 && encBytes > 2) ||\n          (0x800 < value && value < 0x10000 && encBytes > 3) ||\n          // Encoded value out of range\n          (value >= 0x110000)) {\n        hexEscapeChar(os, c);\n        break;\n      }\n\n      // If we got here, this is in fact a valid(ish) utf-8 sequence\n      for (std::size_t n = 0; n < encBytes; ++n) {\n        os << m_str[idx + n];\n      }\n      idx += encBytes - 1;\n      break;\n    }\n  }\n}\n\nstd::ostream& operator<<(std::ostream& os, XmlEncode const& xmlEncode) {\n  xmlEncode.encodeTo(os);\n  return os;\n}\n\nXmlWriter::ScopedElement::ScopedElement(XmlWriter* writer, XmlFormatting fmt)\n    : m_writer(writer), m_fmt(fmt) {}\n\nXmlWriter::ScopedElement::ScopedElement(ScopedElement&& other) noexcept\n    : m_writer(other.m_writer), m_fmt(other.m_fmt) {\n  other.m_writer = nullptr;\n  other.m_fmt    = XmlFormatting::None;\n}\nXmlWriter::ScopedElement&\nXmlWriter::ScopedElement::operator=(ScopedElement&& other) noexcept {\n  if (m_writer) {\n    m_writer->endElement();\n  }\n  m_writer       = other.m_writer;\n  other.m_writer = nullptr;\n  m_fmt          = other.m_fmt;\n  other.m_fmt    = XmlFormatting::None;\n  return *this;\n}\n\nXmlWriter::ScopedElement::~ScopedElement() {\n  if (m_writer) {\n    m_writer->endElement(m_fmt);\n  }\n}\n\nXmlWriter::ScopedElement&\nXmlWriter::ScopedElement::writeText(std::string const& text,\n                                    XmlFormatting fmt) {\n  m_writer->writeText(text, fmt);\n  return *this;\n}\n\nXmlWriter::XmlWriter(std::ostream& os) : m_os(os) { 
writeDeclaration(); }\n\nXmlWriter::~XmlWriter() {\n  while (!m_tags.empty()) {\n    endElement();\n  }\n  newlineIfNecessary();\n}\n\nXmlWriter& XmlWriter::startElement(std::string const& name, XmlFormatting fmt) {\n  ensureTagClosed();\n  newlineIfNecessary();\n  if (shouldIndent(fmt)) {\n    m_os << m_indent;\n    m_indent += \"  \";\n  }\n  m_os << '<' << name;\n  m_tags.push_back(name);\n  m_tagIsOpen = true;\n  applyFormatting(fmt);\n  return *this;\n}\n\nXmlWriter::ScopedElement XmlWriter::scopedElement(std::string const& name,\n                                                  XmlFormatting fmt) {\n  ScopedElement scoped(this, fmt);\n  startElement(name, fmt);\n  return scoped;\n}\n\nXmlWriter& XmlWriter::endElement(XmlFormatting fmt) {\n  m_indent = m_indent.substr(0, m_indent.size() - 2);\n\n  if (m_tagIsOpen) {\n    m_os << \"/>\";\n    m_tagIsOpen = false;\n  } else {\n    newlineIfNecessary();\n    if (shouldIndent(fmt)) {\n      m_os << m_indent;\n    }\n    m_os << \"</\" << m_tags.back() << \">\";\n  }\n  m_os << std::flush;\n  applyFormatting(fmt);\n  m_tags.pop_back();\n  return *this;\n}\n\nXmlWriter& XmlWriter::writeAttribute(std::string const& name,\n                                     std::string const& attribute) {\n  if (!name.empty() && !attribute.empty())\n    m_os << ' ' << name << \"=\\\"\"\n         << XmlEncode(attribute, XmlEncode::ForAttributes) << '\"';\n  return *this;\n}\n\nXmlWriter& XmlWriter::writeAttribute(std::string const& name, bool attribute) {\n  m_os << ' ' << name << \"=\\\"\" << (attribute ? 
\"true\" : \"false\") << '\"';\n  return *this;\n}\n\nXmlWriter& XmlWriter::writeText(std::string const& text, XmlFormatting fmt) {\n  if (!text.empty()) {\n    bool tagWasOpen = m_tagIsOpen;\n    ensureTagClosed();\n    if (tagWasOpen && shouldIndent(fmt)) {\n      m_os << m_indent;\n    }\n    m_os << XmlEncode(text);\n    applyFormatting(fmt);\n  }\n  return *this;\n}\n\nXmlWriter& XmlWriter::writeComment(std::string const& text, XmlFormatting fmt) {\n  ensureTagClosed();\n  if (shouldIndent(fmt)) {\n    m_os << m_indent;\n  }\n  m_os << \"<!--\" << text << \"-->\";\n  applyFormatting(fmt);\n  return *this;\n}\n\nvoid XmlWriter::writeStylesheetRef(std::string const& url) {\n  m_os << \"<?xml-stylesheet type=\\\"text/xsl\\\" href=\\\"\" << url << \"\\\"?>\\n\";\n}\n\nXmlWriter& XmlWriter::writeBlankLine() {\n  ensureTagClosed();\n  m_os << '\\n';\n  return *this;\n}\n\nvoid XmlWriter::ensureTagClosed() {\n  if (m_tagIsOpen) {\n    m_os << '>' << std::flush;\n    newlineIfNecessary();\n    m_tagIsOpen = false;\n  }\n}\n\nvoid XmlWriter::applyFormatting(XmlFormatting fmt) {\n  m_needsNewline = shouldNewline(fmt);\n}\n\nvoid XmlWriter::writeDeclaration() {\n  m_os << \"<?xml version=\\\"1.0\\\" encoding=\\\"UTF-8\\\"?>\\n\";\n}\n\nvoid XmlWriter::newlineIfNecessary() {\n  if (m_needsNewline) {\n    m_os << std::endl;\n    m_needsNewline = false;\n  }\n}\n} // namespace Catch\n// end catch_xmlwriter.cpp\n// start catch_reporter_bases.cpp\n\n#include <cstring>\n#include <cfloat>\n#include <cstdio>\n#include <cassert>\n#include <memory>\n\nnamespace Catch {\nvoid prepareExpandedExpression(AssertionResult& result) {\n  result.getExpandedExpression();\n}\n\n// Because formatting using c++ streams is stateful, drop down to C is required\n// Alternatively we could use stringstream, but its performance is... 
not good.\nstd::string getFormattedDuration(double duration) {\n  // Max exponent + 1 is required to represent the whole part\n  // + 1 for decimal point\n  // + 3 for the 3 decimal places\n  // + 1 for null terminator\n  const std::size_t maxDoubleSize = DBL_MAX_10_EXP + 1 + 1 + 3 + 1;\n  char buffer[maxDoubleSize];\n\n  // Save previous errno, to prevent sprintf from overwriting it\n  ErrnoGuard guard;\n#ifdef _MSC_VER\n  sprintf_s(buffer, \"%.3f\", duration);\n#else\n  std::sprintf(buffer, \"%.3f\", duration);\n#endif\n  return std::string(buffer);\n}\n\nbool shouldShowDuration(IConfig const& config, double duration) {\n  if (config.showDurations() == ShowDurations::Always) {\n    return true;\n  }\n  if (config.showDurations() == ShowDurations::Never) {\n    return false;\n  }\n  const double min = config.minDuration();\n  return min >= 0 && duration >= min;\n}\n\nstd::string serializeFilters(std::vector<std::string> const& container) {\n  ReusableStringStream oss;\n  bool first = true;\n  for (auto&& filter : container) {\n    if (!first)\n      oss << ' ';\n    else\n      first = false;\n\n    oss << filter;\n  }\n  return oss.str();\n}\n\nTestEventListenerBase::TestEventListenerBase(ReporterConfig const& _config)\n    : StreamingReporterBase(_config) {}\n\nstd::set<Verbosity> TestEventListenerBase::getSupportedVerbosities() {\n  return {Verbosity::Quiet, Verbosity::Normal, Verbosity::High};\n}\n\nvoid TestEventListenerBase::assertionStarting(AssertionInfo const&) {}\n\nbool TestEventListenerBase::assertionEnded(AssertionStats const&) {\n  return false;\n}\n\n} // end namespace Catch\n// end catch_reporter_bases.cpp\n// start catch_reporter_compact.cpp\n\nnamespace {\n\n#ifdef CATCH_PLATFORM_MAC\nconst char* failedString() { return \"FAILED\"; }\nconst char* passedString() { return \"PASSED\"; }\n#else\nconst char* failedString() { return \"failed\"; }\nconst char* passedString() { return \"passed\"; }\n#endif\n\n// Colour::LightGrey\nCatch::Colour::Code 
dimColour() { return Catch::Colour::FileName; }\n\nstd::string bothOrAll(std::size_t count) {\n  return count == 1 ? std::string() : count == 2 ? \"both \" : \"all \";\n}\n\n} // namespace\n\nnamespace Catch {\nnamespace {\n// Colour, message variants:\n// - white: No tests ran.\n// -   red: Failed [both/all] N test cases, failed [both/all] M assertions.\n// - white: Passed [both/all] N test cases (no assertions).\n// -   red: Failed N tests cases, failed M assertions.\n// - green: Passed [both/all] N tests cases with M assertions.\nvoid printTotals(std::ostream& out, const Totals& totals) {\n  if (totals.testCases.total() == 0) {\n    out << \"No tests ran.\";\n  } else if (totals.testCases.failed == totals.testCases.total()) {\n    Colour colour(Colour::ResultError);\n    const std::string qualify_assertions_failed =\n        totals.assertions.failed == totals.assertions.total()\n            ? bothOrAll(totals.assertions.failed)\n            : std::string();\n    out << \"Failed \" << bothOrAll(totals.testCases.failed)\n        << pluralise(totals.testCases.failed, \"test case\")\n        << \", \"\n           \"failed \"\n        << qualify_assertions_failed\n        << pluralise(totals.assertions.failed, \"assertion\") << '.';\n  } else if (totals.assertions.total() == 0) {\n    out << \"Passed \" << bothOrAll(totals.testCases.total())\n        << pluralise(totals.testCases.total(), \"test case\")\n        << \" (no assertions).\";\n  } else if (totals.assertions.failed) {\n    Colour colour(Colour::ResultError);\n    out << \"Failed \" << pluralise(totals.testCases.failed, \"test case\")\n        << \", \"\n           \"failed \"\n        << pluralise(totals.assertions.failed, \"assertion\") << '.';\n  } else {\n    Colour colour(Colour::ResultSuccess);\n    out << \"Passed \" << bothOrAll(totals.testCases.passed)\n        << pluralise(totals.testCases.passed, \"test case\") << \" with \"\n        << pluralise(totals.assertions.passed, \"assertion\") << '.';\n 
 }\n}\n\n// Implementation of CompactReporter formatting\nclass AssertionPrinter {\npublic:\n  AssertionPrinter& operator=(AssertionPrinter const&) = delete;\n  AssertionPrinter(AssertionPrinter const&)            = delete;\n  AssertionPrinter(std::ostream& _stream, AssertionStats const& _stats,\n                   bool _printInfoMessages)\n      : stream(_stream), result(_stats.assertionResult),\n        messages(_stats.infoMessages), itMessage(_stats.infoMessages.begin()),\n        printInfoMessages(_printInfoMessages) {}\n\n  void print() {\n    printSourceInfo();\n\n    itMessage = messages.begin();\n\n    switch (result.getResultType()) {\n    case ResultWas::Ok:\n      printResultType(Colour::ResultSuccess, passedString());\n      printOriginalExpression();\n      printReconstructedExpression();\n      if (!result.hasExpression())\n        printRemainingMessages(Colour::None);\n      else\n        printRemainingMessages();\n      break;\n    case ResultWas::ExpressionFailed:\n      if (result.isOk())\n        printResultType(Colour::ResultSuccess,\n                        failedString() + std::string(\" - but was ok\"));\n      else\n        printResultType(Colour::Error, failedString());\n      printOriginalExpression();\n      printReconstructedExpression();\n      printRemainingMessages();\n      break;\n    case ResultWas::ThrewException:\n      printResultType(Colour::Error, failedString());\n      printIssue(\"unexpected exception with message:\");\n      printMessage();\n      printExpressionWas();\n      printRemainingMessages();\n      break;\n    case ResultWas::FatalErrorCondition:\n      printResultType(Colour::Error, failedString());\n      printIssue(\"fatal error condition with message:\");\n      printMessage();\n      printExpressionWas();\n      printRemainingMessages();\n      break;\n    case ResultWas::DidntThrowException:\n      printResultType(Colour::Error, failedString());\n      printIssue(\"expected exception, got none\");\n      
printExpressionWas();\n      printRemainingMessages();\n      break;\n    case ResultWas::Info:\n      printResultType(Colour::None, \"info\");\n      printMessage();\n      printRemainingMessages();\n      break;\n    case ResultWas::Warning:\n      printResultType(Colour::None, \"warning\");\n      printMessage();\n      printRemainingMessages();\n      break;\n    case ResultWas::ExplicitFailure:\n      printResultType(Colour::Error, failedString());\n      printIssue(\"explicitly\");\n      printRemainingMessages(Colour::None);\n      break;\n      // These cases are here to prevent compiler warnings\n    case ResultWas::Unknown:\n    case ResultWas::FailureBit:\n    case ResultWas::Exception:\n      printResultType(Colour::Error, \"** internal error **\");\n      break;\n    }\n  }\n\nprivate:\n  void printSourceInfo() const {\n    Colour colourGuard(Colour::FileName);\n    stream << result.getSourceInfo() << ':';\n  }\n\n  void printResultType(Colour::Code colour,\n                       std::string const& passOrFail) const {\n    if (!passOrFail.empty()) {\n      {\n        Colour colourGuard(colour);\n        stream << ' ' << passOrFail;\n      }\n      stream << ':';\n    }\n  }\n\n  void printIssue(std::string const& issue) const { stream << ' ' << issue; }\n\n  void printExpressionWas() {\n    if (result.hasExpression()) {\n      stream << ';';\n      {\n        Colour colour(dimColour());\n        stream << \" expression was:\";\n      }\n      printOriginalExpression();\n    }\n  }\n\n  void printOriginalExpression() const {\n    if (result.hasExpression()) {\n      stream << ' ' << result.getExpression();\n    }\n  }\n\n  void printReconstructedExpression() const {\n    if (result.hasExpandedExpression()) {\n      {\n        Colour colour(dimColour());\n        stream << \" for: \";\n      }\n      stream << result.getExpandedExpression();\n    }\n  }\n\n  void printMessage() {\n    if (itMessage != messages.end()) {\n      stream << \" '\" << 
itMessage->message << '\\'';\n      ++itMessage;\n    }\n  }\n\n  void printRemainingMessages(Colour::Code colour = dimColour()) {\n    if (itMessage == messages.end())\n      return;\n\n    const auto itEnd = messages.cend();\n    const auto N = static_cast<std::size_t>(std::distance(itMessage, itEnd));\n\n    {\n      Colour colourGuard(colour);\n      stream << \" with \" << pluralise(N, \"message\") << ':';\n    }\n\n    while (itMessage != itEnd) {\n      // If this assertion is a warning ignore any INFO messages\n      if (printInfoMessages || itMessage->type != ResultWas::Info) {\n        printMessage();\n        if (itMessage != itEnd) {\n          Colour colourGuard(dimColour());\n          stream << \" and\";\n        }\n        continue;\n      }\n      ++itMessage;\n    }\n  }\n\nprivate:\n  std::ostream& stream;\n  AssertionResult const& result;\n  std::vector<MessageInfo> messages;\n  std::vector<MessageInfo>::const_iterator itMessage;\n  bool printInfoMessages;\n};\n\n} // namespace\n\nstd::string CompactReporter::getDescription() {\n  return \"Reports test results on a single line, suitable for IDEs\";\n}\n\nvoid CompactReporter::noMatchingTestCases(std::string const& spec) {\n  stream << \"No test cases matched '\" << spec << '\\'' << std::endl;\n}\n\nvoid CompactReporter::assertionStarting(AssertionInfo const&) {}\n\nbool CompactReporter::assertionEnded(AssertionStats const& _assertionStats) {\n  AssertionResult const& result = _assertionStats.assertionResult;\n\n  bool printInfoMessages = true;\n\n  // Drop out if result was successful and we're not printing those\n  if (!m_config->includeSuccessfulResults() && result.isOk()) {\n    if (result.getResultType() != ResultWas::Warning)\n      return false;\n    printInfoMessages = false;\n  }\n\n  AssertionPrinter printer(stream, _assertionStats, printInfoMessages);\n  printer.print();\n\n  stream << std::endl;\n  return true;\n}\n\nvoid CompactReporter::sectionEnded(SectionStats const& 
_sectionStats) {\n  double dur = _sectionStats.durationInSeconds;\n  if (shouldShowDuration(*m_config, dur)) {\n    stream << getFormattedDuration(dur)\n           << \" s: \" << _sectionStats.sectionInfo.name << std::endl;\n  }\n}\n\nvoid CompactReporter::testRunEnded(TestRunStats const& _testRunStats) {\n  printTotals(stream, _testRunStats.totals);\n  stream << '\\n' << std::endl;\n  StreamingReporterBase::testRunEnded(_testRunStats);\n}\n\nCompactReporter::~CompactReporter() {}\n\nCATCH_REGISTER_REPORTER(\"compact\", CompactReporter)\n\n} // end namespace Catch\n// end catch_reporter_compact.cpp\n// start catch_reporter_console.cpp\n\n#include <cfloat>\n#include <cstdio>\n\n#if defined(_MSC_VER)\n#pragma warning(push)\n#pragma warning(                                                               \\\n    disable : 4061) // Not all labels are EXPLICITLY handled in switch\n  // Note that 4062 (not all labels are handled and default is missing) is\n  // enabled\n#endif\n\n#if defined(__clang__)\n#pragma clang diagnostic push\n// For simplicity, benchmarking-only helpers are always enabled\n#pragma clang diagnostic ignored \"-Wunused-function\"\n#endif\n\nnamespace Catch {\n\nnamespace {\n\n// Formatter impl for ConsoleReporter\nclass ConsoleAssertionPrinter {\npublic:\n  ConsoleAssertionPrinter& operator=(ConsoleAssertionPrinter const&) = delete;\n  ConsoleAssertionPrinter(ConsoleAssertionPrinter const&)            = delete;\n  ConsoleAssertionPrinter(std::ostream& _stream, AssertionStats const& _stats,\n                          bool _printInfoMessages)\n      : stream(_stream), stats(_stats), result(_stats.assertionResult),\n        colour(Colour::None), message(result.getMessage()),\n        messages(_stats.infoMessages), printInfoMessages(_printInfoMessages) {\n    switch (result.getResultType()) {\n    case ResultWas::Ok:\n      colour     = Colour::Success;\n      passOrFail = \"PASSED\";\n      // if( result.hasMessage() )\n      if 
(_stats.infoMessages.size() == 1)\n        messageLabel = \"with message\";\n      if (_stats.infoMessages.size() > 1)\n        messageLabel = \"with messages\";\n      break;\n    case ResultWas::ExpressionFailed:\n      if (result.isOk()) {\n        colour     = Colour::Success;\n        passOrFail = \"FAILED - but was ok\";\n      } else {\n        colour     = Colour::Error;\n        passOrFail = \"FAILED\";\n      }\n      if (_stats.infoMessages.size() == 1)\n        messageLabel = \"with message\";\n      if (_stats.infoMessages.size() > 1)\n        messageLabel = \"with messages\";\n      break;\n    case ResultWas::ThrewException:\n      colour       = Colour::Error;\n      passOrFail   = \"FAILED\";\n      messageLabel = \"due to unexpected exception with \";\n      if (_stats.infoMessages.size() == 1)\n        messageLabel += \"message\";\n      if (_stats.infoMessages.size() > 1)\n        messageLabel += \"messages\";\n      break;\n    case ResultWas::FatalErrorCondition:\n      colour       = Colour::Error;\n      passOrFail   = \"FAILED\";\n      messageLabel = \"due to a fatal error condition\";\n      break;\n    case ResultWas::DidntThrowException:\n      colour       = Colour::Error;\n      passOrFail   = \"FAILED\";\n      messageLabel = \"because no exception was thrown where one was expected\";\n      break;\n    case ResultWas::Info:\n      messageLabel = \"info\";\n      break;\n    case ResultWas::Warning:\n      messageLabel = \"warning\";\n      break;\n    case ResultWas::ExplicitFailure:\n      passOrFail = \"FAILED\";\n      colour     = Colour::Error;\n      if (_stats.infoMessages.size() == 1)\n        messageLabel = \"explicitly with message\";\n      if (_stats.infoMessages.size() > 1)\n        messageLabel = \"explicitly with messages\";\n      break;\n      // These cases are here to prevent compiler warnings\n    case ResultWas::Unknown:\n    case ResultWas::FailureBit:\n    case ResultWas::Exception:\n      passOrFail = \"** 
internal error **\";\n      colour     = Colour::Error;\n      break;\n    }\n  }\n\n  void print() const {\n    printSourceInfo();\n    if (stats.totals.assertions.total() > 0) {\n      printResultType();\n      printOriginalExpression();\n      printReconstructedExpression();\n    } else {\n      stream << '\\n';\n    }\n    printMessage();\n  }\n\nprivate:\n  void printResultType() const {\n    if (!passOrFail.empty()) {\n      Colour colourGuard(colour);\n      stream << passOrFail << \":\\n\";\n    }\n  }\n  void printOriginalExpression() const {\n    if (result.hasExpression()) {\n      Colour colourGuard(Colour::OriginalExpression);\n      stream << \"  \";\n      stream << result.getExpressionInMacro();\n      stream << '\\n';\n    }\n  }\n  void printReconstructedExpression() const {\n    if (result.hasExpandedExpression()) {\n      stream << \"with expansion:\\n\";\n      Colour colourGuard(Colour::ReconstructedExpression);\n      stream << Column(result.getExpandedExpression()).indent(2) << '\\n';\n    }\n  }\n  void printMessage() const {\n    if (!messageLabel.empty())\n      stream << messageLabel << ':' << '\\n';\n    for (auto const& msg : messages) {\n      // If this assertion is a warning ignore any INFO messages\n      if (printInfoMessages || msg.type != ResultWas::Info)\n        stream << Column(msg.message).indent(2) << '\\n';\n    }\n  }\n  void printSourceInfo() const {\n    Colour colourGuard(Colour::FileName);\n    stream << result.getSourceInfo() << \": \";\n  }\n\n  std::ostream& stream;\n  AssertionStats const& stats;\n  AssertionResult const& result;\n  Colour::Code colour;\n  std::string passOrFail;\n  std::string messageLabel;\n  std::string message;\n  std::vector<MessageInfo> messages;\n  bool printInfoMessages;\n};\n\nstd::size_t makeRatio(std::size_t number, std::size_t total) {\n  std::size_t ratio =\n      total > 0 ? CATCH_CONFIG_CONSOLE_WIDTH * number / total : 0;\n  return (ratio == 0 && number > 0) ? 
1 : ratio;\n}\n\nstd::size_t& findMax(std::size_t& i, std::size_t& j, std::size_t& k) {\n  if (i > j && i > k)\n    return i;\n  else if (j > k)\n    return j;\n  else\n    return k;\n}\n\nstruct ColumnInfo {\n  enum Justification { Left, Right };\n  std::string name;\n  int width;\n  Justification justification;\n};\nstruct ColumnBreak {};\nstruct RowBreak {};\n\nclass Duration {\n  enum class Unit {\n    Auto,\n    Nanoseconds,\n    Microseconds,\n    Milliseconds,\n    Seconds,\n    Minutes\n  };\n  static const uint64_t s_nanosecondsInAMicrosecond = 1000;\n  static const uint64_t s_nanosecondsInAMillisecond =\n      1000 * s_nanosecondsInAMicrosecond;\n  static const uint64_t s_nanosecondsInASecond =\n      1000 * s_nanosecondsInAMillisecond;\n  static const uint64_t s_nanosecondsInAMinute = 60 * s_nanosecondsInASecond;\n\n  double m_inNanoseconds;\n  Unit m_units;\n\npublic:\n  explicit Duration(double inNanoseconds, Unit units = Unit::Auto)\n      : m_inNanoseconds(inNanoseconds), m_units(units) {\n    if (m_units == Unit::Auto) {\n      if (m_inNanoseconds < s_nanosecondsInAMicrosecond)\n        m_units = Unit::Nanoseconds;\n      else if (m_inNanoseconds < s_nanosecondsInAMillisecond)\n        m_units = Unit::Microseconds;\n      else if (m_inNanoseconds < s_nanosecondsInASecond)\n        m_units = Unit::Milliseconds;\n      else if (m_inNanoseconds < s_nanosecondsInAMinute)\n        m_units = Unit::Seconds;\n      else\n        m_units = Unit::Minutes;\n    }\n  }\n\n  auto value() const -> double {\n    switch (m_units) {\n    case Unit::Microseconds:\n      return m_inNanoseconds / static_cast<double>(s_nanosecondsInAMicrosecond);\n    case Unit::Milliseconds:\n      return m_inNanoseconds / static_cast<double>(s_nanosecondsInAMillisecond);\n    case Unit::Seconds:\n      return m_inNanoseconds / static_cast<double>(s_nanosecondsInASecond);\n    case Unit::Minutes:\n      return m_inNanoseconds / static_cast<double>(s_nanosecondsInAMinute);\n    
default:\n      return m_inNanoseconds;\n    }\n  }\n  auto unitsAsString() const -> std::string {\n    switch (m_units) {\n    case Unit::Nanoseconds:\n      return \"ns\";\n    case Unit::Microseconds:\n      return \"us\";\n    case Unit::Milliseconds:\n      return \"ms\";\n    case Unit::Seconds:\n      return \"s\";\n    case Unit::Minutes:\n      return \"m\";\n    default:\n      return \"** internal error **\";\n    }\n  }\n  friend auto operator<<(std::ostream& os, Duration const& duration)\n      -> std::ostream& {\n    return os << duration.value() << ' ' << duration.unitsAsString();\n  }\n};\n} // namespace\n\nclass TablePrinter {\n  std::ostream& m_os;\n  std::vector<ColumnInfo> m_columnInfos;\n  std::ostringstream m_oss;\n  int m_currentColumn = -1;\n  bool m_isOpen       = false;\n\npublic:\n  TablePrinter(std::ostream& os, std::vector<ColumnInfo> columnInfos)\n      : m_os(os), m_columnInfos(std::move(columnInfos)) {}\n\n  auto columnInfos() const -> std::vector<ColumnInfo> const& {\n    return m_columnInfos;\n  }\n\n  void open() {\n    if (!m_isOpen) {\n      m_isOpen = true;\n      *this << RowBreak();\n\n      Columns headerCols;\n      Spacer spacer(2);\n      for (auto const& info : m_columnInfos) {\n        headerCols +=\n            Column(info.name).width(static_cast<std::size_t>(info.width - 2));\n        headerCols += spacer;\n      }\n      m_os << headerCols << '\\n';\n\n      m_os << Catch::getLineOfChars<'-'>() << '\\n';\n    }\n  }\n  void close() {\n    if (m_isOpen) {\n      *this << RowBreak();\n      m_os << std::endl;\n      m_isOpen = false;\n    }\n  }\n\n  template <typename T>\n  friend TablePrinter& operator<<(TablePrinter& tp, T const& value) {\n    tp.m_oss << value;\n    return tp;\n  }\n\n  friend TablePrinter& operator<<(TablePrinter& tp, ColumnBreak) {\n    auto colStr        = tp.m_oss.str();\n    const auto strSize = colStr.size();\n    tp.m_oss.str(\"\");\n    tp.open();\n    if (tp.m_currentColumn == 
static_cast<int>(tp.m_columnInfos.size() - 1)) {\n      tp.m_currentColumn = -1;\n      tp.m_os << '\\n';\n    }\n    tp.m_currentColumn++;\n\n    auto colInfo = tp.m_columnInfos[tp.m_currentColumn];\n    auto padding = (strSize + 1 < static_cast<std::size_t>(colInfo.width))\n                       ? std::string(colInfo.width - (strSize + 1), ' ')\n                       : std::string();\n    if (colInfo.justification == ColumnInfo::Left)\n      tp.m_os << colStr << padding << ' ';\n    else\n      tp.m_os << padding << colStr << ' ';\n    return tp;\n  }\n\n  friend TablePrinter& operator<<(TablePrinter& tp, RowBreak) {\n    if (tp.m_currentColumn > 0) {\n      tp.m_os << '\\n';\n      tp.m_currentColumn = -1;\n    }\n    return tp;\n  }\n};\n\nConsoleReporter::ConsoleReporter(ReporterConfig const& config)\n    : StreamingReporterBase(config),\n      m_tablePrinter(new TablePrinter(\n          config.stream(), [&config]() -> std::vector<ColumnInfo> {\n            if (config.fullConfig()->benchmarkNoAnalysis()) {\n              return {{\"benchmark name\", CATCH_CONFIG_CONSOLE_WIDTH - 43,\n                       ColumnInfo::Left},\n                      {\"     samples\", 14, ColumnInfo::Right},\n                      {\"  iterations\", 14, ColumnInfo::Right},\n                      {\"        mean\", 14, ColumnInfo::Right}};\n            } else {\n              return {\n                  {\"benchmark name\", CATCH_CONFIG_CONSOLE_WIDTH - 43,\n                   ColumnInfo::Left},\n                  {\"samples      mean       std dev\", 14, ColumnInfo::Right},\n                  {\"iterations   low mean   low std dev\", 14,\n                   ColumnInfo::Right},\n                  {\"estimated    high mean  high std dev\", 14,\n                   ColumnInfo::Right}};\n            }\n          }())) {}\nConsoleReporter::~ConsoleReporter() = default;\n\nstd::string ConsoleReporter::getDescription() {\n  return \"Reports test results as plain lines of 
text\";\n}\n\nvoid ConsoleReporter::noMatchingTestCases(std::string const& spec) {\n  stream << \"No test cases matched '\" << spec << '\\'' << std::endl;\n}\n\nvoid ConsoleReporter::reportInvalidArguments(std::string const& arg) {\n  stream << \"Invalid Filter: \" << arg << std::endl;\n}\n\nvoid ConsoleReporter::assertionStarting(AssertionInfo const&) {}\n\nbool ConsoleReporter::assertionEnded(AssertionStats const& _assertionStats) {\n  AssertionResult const& result = _assertionStats.assertionResult;\n\n  bool includeResults = m_config->includeSuccessfulResults() || !result.isOk();\n\n  // Drop out if result was successful but we're not printing them.\n  if (!includeResults && result.getResultType() != ResultWas::Warning)\n    return false;\n\n  lazyPrint();\n\n  ConsoleAssertionPrinter printer(stream, _assertionStats, includeResults);\n  printer.print();\n  stream << std::endl;\n  return true;\n}\n\nvoid ConsoleReporter::sectionStarting(SectionInfo const& _sectionInfo) {\n  m_tablePrinter->close();\n  m_headerPrinted = false;\n  StreamingReporterBase::sectionStarting(_sectionInfo);\n}\nvoid ConsoleReporter::sectionEnded(SectionStats const& _sectionStats) {\n  m_tablePrinter->close();\n  if (_sectionStats.missingAssertions) {\n    lazyPrint();\n    Colour colour(Colour::ResultError);\n    if (m_sectionStack.size() > 1)\n      stream << \"\\nNo assertions in section\";\n    else\n      stream << \"\\nNo assertions in test case\";\n    stream << \" '\" << _sectionStats.sectionInfo.name << \"'\\n\" << std::endl;\n  }\n  double dur = _sectionStats.durationInSeconds;\n  if (shouldShowDuration(*m_config, dur)) {\n    stream << getFormattedDuration(dur)\n           << \" s: \" << _sectionStats.sectionInfo.name << std::endl;\n  }\n  if (m_headerPrinted) {\n    m_headerPrinted = false;\n  }\n  StreamingReporterBase::sectionEnded(_sectionStats);\n}\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\nvoid ConsoleReporter::benchmarkPreparing(std::string const& name) {\n  
lazyPrintWithoutClosingBenchmarkTable();\n\n  auto nameCol = Column(name).width(\n      static_cast<std::size_t>(m_tablePrinter->columnInfos()[0].width - 2));\n\n  bool firstLine = true;\n  for (auto line : nameCol) {\n    if (!firstLine)\n      (*m_tablePrinter) << ColumnBreak() << ColumnBreak() << ColumnBreak();\n    else\n      firstLine = false;\n\n    (*m_tablePrinter) << line << ColumnBreak();\n  }\n}\n\nvoid ConsoleReporter::benchmarkStarting(BenchmarkInfo const& info) {\n  (*m_tablePrinter) << info.samples << ColumnBreak() << info.iterations\n                    << ColumnBreak();\n  if (!m_config->benchmarkNoAnalysis())\n    (*m_tablePrinter) << Duration(info.estimatedDuration) << ColumnBreak();\n}\nvoid ConsoleReporter::benchmarkEnded(BenchmarkStats<> const& stats) {\n  if (m_config->benchmarkNoAnalysis()) {\n    (*m_tablePrinter) << Duration(stats.mean.point.count()) << ColumnBreak();\n  } else {\n    (*m_tablePrinter) << ColumnBreak() << Duration(stats.mean.point.count())\n                      << ColumnBreak()\n                      << Duration(stats.mean.lower_bound.count())\n                      << ColumnBreak()\n                      << Duration(stats.mean.upper_bound.count())\n                      << ColumnBreak() << ColumnBreak()\n                      << Duration(stats.standardDeviation.point.count())\n                      << ColumnBreak()\n                      << Duration(stats.standardDeviation.lower_bound.count())\n                      << ColumnBreak()\n                      << Duration(stats.standardDeviation.upper_bound.count())\n                      << ColumnBreak() << ColumnBreak() << ColumnBreak()\n                      << ColumnBreak() << ColumnBreak();\n  }\n}\n\nvoid ConsoleReporter::benchmarkFailed(std::string const& error) {\n  Colour colour(Colour::Red);\n  (*m_tablePrinter) << \"Benchmark failed (\" << error << ')' << ColumnBreak()\n                    << RowBreak();\n}\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\nvoid 
ConsoleReporter::testCaseEnded(TestCaseStats const& _testCaseStats) {\n  m_tablePrinter->close();\n  StreamingReporterBase::testCaseEnded(_testCaseStats);\n  m_headerPrinted = false;\n}\nvoid ConsoleReporter::testGroupEnded(TestGroupStats const& _testGroupStats) {\n  if (currentGroupInfo.used) {\n    printSummaryDivider();\n    stream << \"Summary for group '\" << _testGroupStats.groupInfo.name << \"':\\n\";\n    printTotals(_testGroupStats.totals);\n    stream << '\\n' << std::endl;\n  }\n  StreamingReporterBase::testGroupEnded(_testGroupStats);\n}\nvoid ConsoleReporter::testRunEnded(TestRunStats const& _testRunStats) {\n  printTotalsDivider(_testRunStats.totals);\n  printTotals(_testRunStats.totals);\n  stream << std::endl;\n  StreamingReporterBase::testRunEnded(_testRunStats);\n}\nvoid ConsoleReporter::testRunStarting(TestRunInfo const& _testInfo) {\n  StreamingReporterBase::testRunStarting(_testInfo);\n  printTestFilters();\n}\n\nvoid ConsoleReporter::lazyPrint() {\n\n  m_tablePrinter->close();\n  lazyPrintWithoutClosingBenchmarkTable();\n}\n\nvoid ConsoleReporter::lazyPrintWithoutClosingBenchmarkTable() {\n\n  if (!currentTestRunInfo.used)\n    lazyPrintRunInfo();\n  if (!currentGroupInfo.used)\n    lazyPrintGroupInfo();\n\n  if (!m_headerPrinted) {\n    printTestCaseAndSectionHeader();\n    m_headerPrinted = true;\n  }\n}\nvoid ConsoleReporter::lazyPrintRunInfo() {\n  stream << '\\n' << getLineOfChars<'~'>() << '\\n';\n  Colour colour(Colour::SecondaryText);\n  stream << currentTestRunInfo->name << \" is a Catch v\" << libraryVersion()\n         << \" host application.\\n\"\n         << \"Run with -? 
for options\\n\\n\";\n\n  if (m_config->rngSeed() != 0)\n    stream << \"Randomness seeded to: \" << m_config->rngSeed() << \"\\n\\n\";\n\n  currentTestRunInfo.used = true;\n}\nvoid ConsoleReporter::lazyPrintGroupInfo() {\n  if (!currentGroupInfo->name.empty() && currentGroupInfo->groupsCounts > 1) {\n    printClosedHeader(\"Group: \" + currentGroupInfo->name);\n    currentGroupInfo.used = true;\n  }\n}\nvoid ConsoleReporter::printTestCaseAndSectionHeader() {\n  assert(!m_sectionStack.empty());\n  printOpenHeader(currentTestCaseInfo->name);\n\n  if (m_sectionStack.size() > 1) {\n    Colour colourGuard(Colour::Headers);\n\n    auto it   = m_sectionStack.begin() + 1, // Skip first section (test case)\n        itEnd = m_sectionStack.end();\n    for (; it != itEnd; ++it)\n      printHeaderString(it->name, 2);\n  }\n\n  SourceLineInfo lineInfo = m_sectionStack.back().lineInfo;\n\n  stream << getLineOfChars<'-'>() << '\\n';\n  Colour colourGuard(Colour::FileName);\n  stream << lineInfo << '\\n';\n  stream << getLineOfChars<'.'>() << '\\n' << std::endl;\n}\n\nvoid ConsoleReporter::printClosedHeader(std::string const& _name) {\n  printOpenHeader(_name);\n  stream << getLineOfChars<'.'>() << '\\n';\n}\nvoid ConsoleReporter::printOpenHeader(std::string const& _name) {\n  stream << getLineOfChars<'-'>() << '\\n';\n  {\n    Colour colourGuard(Colour::Headers);\n    printHeaderString(_name);\n  }\n}\n\n// if string has a : in first line will set indent to follow it on\n// subsequent lines\nvoid ConsoleReporter::printHeaderString(std::string const& _string,\n                                        std::size_t indent) {\n  std::size_t i = _string.find(\": \");\n  if (i != std::string::npos)\n    i += 2;\n  else\n    i = 0;\n  stream << Column(_string).indent(indent + i).initialIndent(indent) << '\\n';\n}\n\nstruct SummaryColumn {\n\n  SummaryColumn(std::string _label, Colour::Code _colour)\n      : label(std::move(_label)), colour(_colour) {}\n  SummaryColumn addRow(std::size_t 
count) {\n    ReusableStringStream rss;\n    rss << count;\n    std::string row = rss.str();\n    for (auto& oldRow : rows) {\n      while (oldRow.size() < row.size())\n        oldRow = ' ' + oldRow;\n      while (oldRow.size() > row.size())\n        row = ' ' + row;\n    }\n    rows.push_back(row);\n    return *this;\n  }\n\n  std::string label;\n  Colour::Code colour;\n  std::vector<std::string> rows;\n};\n\nvoid ConsoleReporter::printTotals(Totals const& totals) {\n  if (totals.testCases.total() == 0) {\n    stream << Colour(Colour::Warning) << \"No tests ran\\n\";\n  } else if (totals.assertions.total() > 0 && totals.testCases.allPassed()) {\n    stream << Colour(Colour::ResultSuccess) << \"All tests passed\";\n    stream << \" (\" << pluralise(totals.assertions.passed, \"assertion\") << \" in \"\n           << pluralise(totals.testCases.passed, \"test case\") << ')' << '\\n';\n  } else {\n\n    std::vector<SummaryColumn> columns;\n    columns.push_back(SummaryColumn(\"\", Colour::None)\n                          .addRow(totals.testCases.total())\n                          .addRow(totals.assertions.total()));\n    columns.push_back(SummaryColumn(\"passed\", Colour::Success)\n                          .addRow(totals.testCases.passed)\n                          .addRow(totals.assertions.passed));\n    columns.push_back(SummaryColumn(\"failed\", Colour::ResultError)\n                          .addRow(totals.testCases.failed)\n                          .addRow(totals.assertions.failed));\n    columns.push_back(\n        SummaryColumn(\"failed as expected\", Colour::ResultExpectedFailure)\n            .addRow(totals.testCases.failedButOk)\n            .addRow(totals.assertions.failedButOk));\n\n    printSummaryRow(\"test cases\", columns, 0);\n    printSummaryRow(\"assertions\", columns, 1);\n  }\n}\nvoid ConsoleReporter::printSummaryRow(std::string const& label,\n                                      std::vector<SummaryColumn> const& cols,\n                         
             std::size_t row) {\n  for (auto col : cols) {\n    std::string value = col.rows[row];\n    if (col.label.empty()) {\n      stream << label << \": \";\n      if (value != \"0\")\n        stream << value;\n      else\n        stream << Colour(Colour::Warning) << \"- none -\";\n    } else if (value != \"0\") {\n      stream << Colour(Colour::LightGrey) << \" | \";\n      stream << Colour(col.colour) << value << ' ' << col.label;\n    }\n  }\n  stream << '\\n';\n}\n\nvoid ConsoleReporter::printTotalsDivider(Totals const& totals) {\n  if (totals.testCases.total() > 0) {\n    std::size_t failedRatio =\n        makeRatio(totals.testCases.failed, totals.testCases.total());\n    std::size_t failedButOkRatio =\n        makeRatio(totals.testCases.failedButOk, totals.testCases.total());\n    std::size_t passedRatio =\n        makeRatio(totals.testCases.passed, totals.testCases.total());\n    while (failedRatio + failedButOkRatio + passedRatio <\n           CATCH_CONFIG_CONSOLE_WIDTH - 1)\n      findMax(failedRatio, failedButOkRatio, passedRatio)++;\n    while (failedRatio + failedButOkRatio + passedRatio >\n           CATCH_CONFIG_CONSOLE_WIDTH - 1)\n      findMax(failedRatio, failedButOkRatio, passedRatio)--;\n\n    stream << Colour(Colour::Error) << std::string(failedRatio, '=');\n    stream << Colour(Colour::ResultExpectedFailure)\n           << std::string(failedButOkRatio, '=');\n    if (totals.testCases.allPassed())\n      stream << Colour(Colour::ResultSuccess) << std::string(passedRatio, '=');\n    else\n      stream << Colour(Colour::Success) << std::string(passedRatio, '=');\n  } else {\n    stream << Colour(Colour::Warning)\n           << std::string(CATCH_CONFIG_CONSOLE_WIDTH - 1, '=');\n  }\n  stream << '\\n';\n}\nvoid ConsoleReporter::printSummaryDivider() {\n  stream << getLineOfChars<'-'>() << '\\n';\n}\n\nvoid ConsoleReporter::printTestFilters() {\n  if (m_config->testSpec().hasFilters()) {\n    Colour guard(Colour::BrightYellow);\n    stream << 
\"Filters: \" << serializeFilters(m_config->getTestsOrTags())\n           << '\\n';\n  }\n}\n\nCATCH_REGISTER_REPORTER(\"console\", ConsoleReporter)\n\n} // end namespace Catch\n\n#if defined(_MSC_VER)\n#pragma warning(pop)\n#endif\n\n#if defined(__clang__)\n#pragma clang diagnostic pop\n#endif\n// end catch_reporter_console.cpp\n// start catch_reporter_junit.cpp\n\n#include <cassert>\n#include <sstream>\n#include <ctime>\n#include <algorithm>\n\nnamespace Catch {\n\nnamespace {\nstd::string getCurrentTimestamp() {\n  // Beware, this is not reentrant because of backward compatibility issues\n  // Also, UTC only, again because of backward compatibility (%z is C++11)\n  time_t rawtime;\n  std::time(&rawtime);\n  auto const timeStampSize = sizeof(\"2017-01-16T17:06:45Z\");\n\n#ifdef _MSC_VER\n  std::tm timeInfo = {};\n  gmtime_s(&timeInfo, &rawtime);\n#else\n  std::tm* timeInfo;\n  timeInfo = std::gmtime(&rawtime);\n#endif\n\n  char timeStamp[timeStampSize];\n  const char* const fmt = \"%Y-%m-%dT%H:%M:%SZ\";\n\n#ifdef _MSC_VER\n  std::strftime(timeStamp, timeStampSize, fmt, &timeInfo);\n#else\n  std::strftime(timeStamp, timeStampSize, fmt, timeInfo);\n#endif\n  return std::string(timeStamp);\n}\n\nstd::string fileNameTag(const std::vector<std::string>& tags) {\n  auto it = std::find_if(begin(tags), end(tags), [](std::string const& tag) {\n    return tag.front() == '#';\n  });\n  if (it != tags.end())\n    return it->substr(1);\n  return std::string();\n}\n} // anonymous namespace\n\nJunitReporter::JunitReporter(ReporterConfig const& _config)\n    : CumulativeReporterBase(_config), xml(_config.stream()) {\n  m_reporterPrefs.shouldRedirectStdOut      = true;\n  m_reporterPrefs.shouldReportAllAssertions = true;\n}\n\nJunitReporter::~JunitReporter() {}\n\nstd::string JunitReporter::getDescription() {\n  return \"Reports test results in an XML format that looks like Ant's \"\n         \"junitreport target\";\n}\n\nvoid JunitReporter::noMatchingTestCases(std::string const& 
/*spec*/) {}\n\nvoid JunitReporter::testRunStarting(TestRunInfo const& runInfo) {\n  CumulativeReporterBase::testRunStarting(runInfo);\n  xml.startElement(\"testsuites\");\n}\n\nvoid JunitReporter::testGroupStarting(GroupInfo const& groupInfo) {\n  suiteTimer.start();\n  stdOutForSuite.clear();\n  stdErrForSuite.clear();\n  unexpectedExceptions = 0;\n  CumulativeReporterBase::testGroupStarting(groupInfo);\n}\n\nvoid JunitReporter::testCaseStarting(TestCaseInfo const& testCaseInfo) {\n  m_okToFail = testCaseInfo.okToFail();\n}\n\nbool JunitReporter::assertionEnded(AssertionStats const& assertionStats) {\n  if (assertionStats.assertionResult.getResultType() ==\n          ResultWas::ThrewException &&\n      !m_okToFail)\n    unexpectedExceptions++;\n  return CumulativeReporterBase::assertionEnded(assertionStats);\n}\n\nvoid JunitReporter::testCaseEnded(TestCaseStats const& testCaseStats) {\n  stdOutForSuite += testCaseStats.stdOut;\n  stdErrForSuite += testCaseStats.stdErr;\n  CumulativeReporterBase::testCaseEnded(testCaseStats);\n}\n\nvoid JunitReporter::testGroupEnded(TestGroupStats const& testGroupStats) {\n  double suiteTime = suiteTimer.getElapsedSeconds();\n  CumulativeReporterBase::testGroupEnded(testGroupStats);\n  writeGroup(*m_testGroups.back(), suiteTime);\n}\n\nvoid JunitReporter::testRunEndedCumulative() { xml.endElement(); }\n\nvoid JunitReporter::writeGroup(TestGroupNode const& groupNode,\n                               double suiteTime) {\n  XmlWriter::ScopedElement e = xml.scopedElement(\"testsuite\");\n\n  TestGroupStats const& stats = groupNode.value;\n  xml.writeAttribute(\"name\", stats.groupInfo.name);\n  xml.writeAttribute(\"errors\", unexpectedExceptions);\n  xml.writeAttribute(\"failures\",\n                     stats.totals.assertions.failed - unexpectedExceptions);\n  xml.writeAttribute(\"tests\", stats.totals.assertions.total());\n  xml.writeAttribute(\"hostname\", \"tbd\"); // !TBD\n  if (m_config->showDurations() == 
ShowDurations::Never)\n    xml.writeAttribute(\"time\", \"\");\n  else\n    xml.writeAttribute(\"time\", suiteTime);\n  xml.writeAttribute(\"timestamp\", getCurrentTimestamp());\n\n  // Write properties if there are any\n  if (m_config->hasTestFilters() || m_config->rngSeed() != 0) {\n    auto properties = xml.scopedElement(\"properties\");\n    if (m_config->hasTestFilters()) {\n      xml.scopedElement(\"property\")\n          .writeAttribute(\"name\", \"filters\")\n          .writeAttribute(\"value\",\n                          serializeFilters(m_config->getTestsOrTags()));\n    }\n    if (m_config->rngSeed() != 0) {\n      xml.scopedElement(\"property\")\n          .writeAttribute(\"name\", \"random-seed\")\n          .writeAttribute(\"value\", m_config->rngSeed());\n    }\n  }\n\n  // Write test cases\n  for (auto const& child : groupNode.children)\n    writeTestCase(*child);\n\n  xml.scopedElement(\"system-out\")\n      .writeText(trim(stdOutForSuite), XmlFormatting::Newline);\n  xml.scopedElement(\"system-err\")\n      .writeText(trim(stdErrForSuite), XmlFormatting::Newline);\n}\n\nvoid JunitReporter::writeTestCase(TestCaseNode const& testCaseNode) {\n  TestCaseStats const& stats = testCaseNode.value;\n\n  // All test cases have exactly one section - which represents the\n  // test case itself. 
That section may have 0-n nested sections\n  assert(testCaseNode.children.size() == 1);\n  SectionNode const& rootSection = *testCaseNode.children.front();\n\n  std::string className = stats.testInfo.className;\n\n  if (className.empty()) {\n    className = fileNameTag(stats.testInfo.tags);\n    if (className.empty())\n      className = \"global\";\n  }\n\n  if (!m_config->name().empty())\n    className = m_config->name() + \".\" + className;\n\n  writeSection(className, \"\", rootSection);\n}\n\nvoid JunitReporter::writeSection(std::string const& className,\n                                 std::string const& rootName,\n                                 SectionNode const& sectionNode) {\n  std::string name = trim(sectionNode.stats.sectionInfo.name);\n  if (!rootName.empty())\n    name = rootName + '/' + name;\n\n  if (!sectionNode.assertions.empty() || !sectionNode.stdOut.empty() ||\n      !sectionNode.stdErr.empty()) {\n    XmlWriter::ScopedElement e = xml.scopedElement(\"testcase\");\n    if (className.empty()) {\n      xml.writeAttribute(\"classname\", name);\n      xml.writeAttribute(\"name\", \"root\");\n    } else {\n      xml.writeAttribute(\"classname\", className);\n      xml.writeAttribute(\"name\", name);\n    }\n    xml.writeAttribute(\"time\", ::Catch::Detail::stringify(\n                                   sectionNode.stats.durationInSeconds));\n    // This is not ideal, but it should be enough to mimic gtest's\n    // junit output.\n    // Ideally the JUnit reporter would also handle `skipTest`\n    // events and write those out appropriately.\n    xml.writeAttribute(\"status\", \"run\");\n\n    writeAssertions(sectionNode);\n\n    if (!sectionNode.stdOut.empty())\n      xml.scopedElement(\"system-out\")\n          .writeText(trim(sectionNode.stdOut), XmlFormatting::Newline);\n    if (!sectionNode.stdErr.empty())\n      xml.scopedElement(\"system-err\")\n          .writeText(trim(sectionNode.stdErr), XmlFormatting::Newline);\n  }\n  for (auto const& 
childNode : sectionNode.childSections)\n    if (className.empty())\n      writeSection(name, \"\", *childNode);\n    else\n      writeSection(className, name, *childNode);\n}\n\nvoid JunitReporter::writeAssertions(SectionNode const& sectionNode) {\n  for (auto const& assertion : sectionNode.assertions)\n    writeAssertion(assertion);\n}\n\nvoid JunitReporter::writeAssertion(AssertionStats const& stats) {\n  AssertionResult const& result = stats.assertionResult;\n  if (!result.isOk()) {\n    std::string elementName;\n    switch (result.getResultType()) {\n    case ResultWas::ThrewException:\n    case ResultWas::FatalErrorCondition:\n      elementName = \"error\";\n      break;\n    case ResultWas::ExplicitFailure:\n    case ResultWas::ExpressionFailed:\n    case ResultWas::DidntThrowException:\n      elementName = \"failure\";\n      break;\n\n    // We should never see these here:\n    case ResultWas::Info:\n    case ResultWas::Warning:\n    case ResultWas::Ok:\n    case ResultWas::Unknown:\n    case ResultWas::FailureBit:\n    case ResultWas::Exception:\n      elementName = \"internalError\";\n      break;\n    }\n\n    XmlWriter::ScopedElement e = xml.scopedElement(elementName);\n\n    xml.writeAttribute(\"message\", result.getExpression());\n    xml.writeAttribute(\"type\", result.getTestMacroName());\n\n    ReusableStringStream rss;\n    if (stats.totals.assertions.total() > 0) {\n      rss << \"FAILED\"\n          << \":\\n\";\n      if (result.hasExpression()) {\n        rss << \"  \";\n        rss << result.getExpressionInMacro();\n        rss << '\\n';\n      }\n      if (result.hasExpandedExpression()) {\n        rss << \"with expansion:\\n\";\n        rss << Column(result.getExpandedExpression()).indent(2) << '\\n';\n      }\n    } else {\n      rss << '\\n';\n    }\n\n    if (!result.getMessage().empty())\n      rss << result.getMessage() << '\\n';\n    for (auto const& msg : stats.infoMessages)\n      if (msg.type == ResultWas::Info)\n        rss << 
msg.message << '\\n';\n\n    rss << \"at \" << result.getSourceInfo();\n    xml.writeText(rss.str(), XmlFormatting::Newline);\n  }\n}\n\nCATCH_REGISTER_REPORTER(\"junit\", JunitReporter)\n\n} // end namespace Catch\n// end catch_reporter_junit.cpp\n// start catch_reporter_listening.cpp\n\n#include <cassert>\n\nnamespace Catch {\n\nListeningReporter::ListeningReporter() {\n  // We will assume that listeners will always want all assertions\n  m_preferences.shouldReportAllAssertions = true;\n}\n\nvoid ListeningReporter::addListener(IStreamingReporterPtr&& listener) {\n  m_listeners.push_back(std::move(listener));\n}\n\nvoid ListeningReporter::addReporter(IStreamingReporterPtr&& reporter) {\n  assert(!m_reporter && \"Listening reporter can wrap only 1 real reporter\");\n  m_reporter = std::move(reporter);\n  m_preferences.shouldRedirectStdOut =\n      m_reporter->getPreferences().shouldRedirectStdOut;\n}\n\nReporterPreferences ListeningReporter::getPreferences() const {\n  return m_preferences;\n}\n\nstd::set<Verbosity> ListeningReporter::getSupportedVerbosities() {\n  return std::set<Verbosity>{};\n}\n\nvoid ListeningReporter::noMatchingTestCases(std::string const& spec) {\n  for (auto const& listener : m_listeners) {\n    listener->noMatchingTestCases(spec);\n  }\n  m_reporter->noMatchingTestCases(spec);\n}\n\nvoid ListeningReporter::reportInvalidArguments(std::string const& arg) {\n  for (auto const& listener : m_listeners) {\n    listener->reportInvalidArguments(arg);\n  }\n  m_reporter->reportInvalidArguments(arg);\n}\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\nvoid ListeningReporter::benchmarkPreparing(std::string const& name) {\n  for (auto const& listener : m_listeners) {\n    listener->benchmarkPreparing(name);\n  }\n  m_reporter->benchmarkPreparing(name);\n}\nvoid ListeningReporter::benchmarkStarting(BenchmarkInfo const& benchmarkInfo) {\n  for (auto const& listener : m_listeners) {\n    listener->benchmarkStarting(benchmarkInfo);\n  }\n  
m_reporter->benchmarkStarting(benchmarkInfo);\n}\nvoid ListeningReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) {\n  for (auto const& listener : m_listeners) {\n    listener->benchmarkEnded(benchmarkStats);\n  }\n  m_reporter->benchmarkEnded(benchmarkStats);\n}\n\nvoid ListeningReporter::benchmarkFailed(std::string const& error) {\n  for (auto const& listener : m_listeners) {\n    listener->benchmarkFailed(error);\n  }\n  m_reporter->benchmarkFailed(error);\n}\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\nvoid ListeningReporter::testRunStarting(TestRunInfo const& testRunInfo) {\n  for (auto const& listener : m_listeners) {\n    listener->testRunStarting(testRunInfo);\n  }\n  m_reporter->testRunStarting(testRunInfo);\n}\n\nvoid ListeningReporter::testGroupStarting(GroupInfo const& groupInfo) {\n  for (auto const& listener : m_listeners) {\n    listener->testGroupStarting(groupInfo);\n  }\n  m_reporter->testGroupStarting(groupInfo);\n}\n\nvoid ListeningReporter::testCaseStarting(TestCaseInfo const& testInfo) {\n  for (auto const& listener : m_listeners) {\n    listener->testCaseStarting(testInfo);\n  }\n  m_reporter->testCaseStarting(testInfo);\n}\n\nvoid ListeningReporter::sectionStarting(SectionInfo const& sectionInfo) {\n  for (auto const& listener : m_listeners) {\n    listener->sectionStarting(sectionInfo);\n  }\n  m_reporter->sectionStarting(sectionInfo);\n}\n\nvoid ListeningReporter::assertionStarting(AssertionInfo const& assertionInfo) {\n  for (auto const& listener : m_listeners) {\n    listener->assertionStarting(assertionInfo);\n  }\n  m_reporter->assertionStarting(assertionInfo);\n}\n\n// The return value indicates if the messages buffer should be cleared:\nbool ListeningReporter::assertionEnded(AssertionStats const& assertionStats) {\n  for (auto const& listener : m_listeners) {\n    static_cast<void>(listener->assertionEnded(assertionStats));\n  }\n  return m_reporter->assertionEnded(assertionStats);\n}\n\nvoid 
ListeningReporter::sectionEnded(SectionStats const& sectionStats) {\n  for (auto const& listener : m_listeners) {\n    listener->sectionEnded(sectionStats);\n  }\n  m_reporter->sectionEnded(sectionStats);\n}\n\nvoid ListeningReporter::testCaseEnded(TestCaseStats const& testCaseStats) {\n  for (auto const& listener : m_listeners) {\n    listener->testCaseEnded(testCaseStats);\n  }\n  m_reporter->testCaseEnded(testCaseStats);\n}\n\nvoid ListeningReporter::testGroupEnded(TestGroupStats const& testGroupStats) {\n  for (auto const& listener : m_listeners) {\n    listener->testGroupEnded(testGroupStats);\n  }\n  m_reporter->testGroupEnded(testGroupStats);\n}\n\nvoid ListeningReporter::testRunEnded(TestRunStats const& testRunStats) {\n  for (auto const& listener : m_listeners) {\n    listener->testRunEnded(testRunStats);\n  }\n  m_reporter->testRunEnded(testRunStats);\n}\n\nvoid ListeningReporter::skipTest(TestCaseInfo const& testInfo) {\n  for (auto const& listener : m_listeners) {\n    listener->skipTest(testInfo);\n  }\n  m_reporter->skipTest(testInfo);\n}\n\nbool ListeningReporter::isMulti() const { return true; }\n\n} // end namespace Catch\n// end catch_reporter_listening.cpp\n// start catch_reporter_xml.cpp\n\n#if defined(_MSC_VER)\n#pragma warning(push)\n#pragma warning(disable : 4061) // Not all labels are EXPLICITLY handled in\n                                // switch Note that 4062 (not all labels are\n                                // handled and default is missing) is enabled\n#endif\n\nnamespace Catch {\nXmlReporter::XmlReporter(ReporterConfig const& _config)\n    : StreamingReporterBase(_config), m_xml(_config.stream()) {\n  m_reporterPrefs.shouldRedirectStdOut      = true;\n  m_reporterPrefs.shouldReportAllAssertions = true;\n}\n\nXmlReporter::~XmlReporter() = default;\n\nstd::string XmlReporter::getDescription() {\n  return \"Reports test results as an XML document\";\n}\n\nstd::string XmlReporter::getStylesheetRef() const { return std::string(); 
}\n\nvoid XmlReporter::writeSourceInfo(SourceLineInfo const& sourceInfo) {\n  m_xml.writeAttribute(\"filename\", sourceInfo.file)\n      .writeAttribute(\"line\", sourceInfo.line);\n}\n\nvoid XmlReporter::noMatchingTestCases(std::string const& s) {\n  StreamingReporterBase::noMatchingTestCases(s);\n}\n\nvoid XmlReporter::testRunStarting(TestRunInfo const& testInfo) {\n  StreamingReporterBase::testRunStarting(testInfo);\n  std::string stylesheetRef = getStylesheetRef();\n  if (!stylesheetRef.empty())\n    m_xml.writeStylesheetRef(stylesheetRef);\n  m_xml.startElement(\"Catch\");\n  if (!m_config->name().empty())\n    m_xml.writeAttribute(\"name\", m_config->name());\n  if (m_config->testSpec().hasFilters())\n    m_xml.writeAttribute(\"filters\",\n                         serializeFilters(m_config->getTestsOrTags()));\n  if (m_config->rngSeed() != 0)\n    m_xml.scopedElement(\"Randomness\")\n        .writeAttribute(\"seed\", m_config->rngSeed());\n}\n\nvoid XmlReporter::testGroupStarting(GroupInfo const& groupInfo) {\n  StreamingReporterBase::testGroupStarting(groupInfo);\n  m_xml.startElement(\"Group\").writeAttribute(\"name\", groupInfo.name);\n}\n\nvoid XmlReporter::testCaseStarting(TestCaseInfo const& testInfo) {\n  StreamingReporterBase::testCaseStarting(testInfo);\n  m_xml.startElement(\"TestCase\")\n      .writeAttribute(\"name\", trim(testInfo.name))\n      .writeAttribute(\"description\", testInfo.description)\n      .writeAttribute(\"tags\", testInfo.tagsAsString());\n\n  writeSourceInfo(testInfo.lineInfo);\n\n  if (m_config->showDurations() == ShowDurations::Always)\n    m_testCaseTimer.start();\n  m_xml.ensureTagClosed();\n}\n\nvoid XmlReporter::sectionStarting(SectionInfo const& sectionInfo) {\n  StreamingReporterBase::sectionStarting(sectionInfo);\n  if (m_sectionDepth++ > 0) {\n    m_xml.startElement(\"Section\").writeAttribute(\"name\",\n                                                 trim(sectionInfo.name));\n    
writeSourceInfo(sectionInfo.lineInfo);\n    m_xml.ensureTagClosed();\n  }\n}\n\nvoid XmlReporter::assertionStarting(AssertionInfo const&) {}\n\nbool XmlReporter::assertionEnded(AssertionStats const& assertionStats) {\n\n  AssertionResult const& result = assertionStats.assertionResult;\n\n  bool includeResults = m_config->includeSuccessfulResults() || !result.isOk();\n\n  if (includeResults || result.getResultType() == ResultWas::Warning) {\n    // Print any info messages in <Info> tags.\n    for (auto const& msg : assertionStats.infoMessages) {\n      if (msg.type == ResultWas::Info && includeResults) {\n        m_xml.scopedElement(\"Info\").writeText(msg.message);\n      } else if (msg.type == ResultWas::Warning) {\n        m_xml.scopedElement(\"Warning\").writeText(msg.message);\n      }\n    }\n  }\n\n  // Drop out if result was successful but we're not printing them.\n  if (!includeResults && result.getResultType() != ResultWas::Warning)\n    return true;\n\n  // Print the expression if there is one.\n  if (result.hasExpression()) {\n    m_xml.startElement(\"Expression\")\n        .writeAttribute(\"success\", result.succeeded())\n        .writeAttribute(\"type\", result.getTestMacroName());\n\n    writeSourceInfo(result.getSourceInfo());\n\n    m_xml.scopedElement(\"Original\").writeText(result.getExpression());\n    m_xml.scopedElement(\"Expanded\").writeText(result.getExpandedExpression());\n  }\n\n  // And... 
Print a result applicable to each result type.\n  switch (result.getResultType()) {\n  case ResultWas::ThrewException:\n    m_xml.startElement(\"Exception\");\n    writeSourceInfo(result.getSourceInfo());\n    m_xml.writeText(result.getMessage());\n    m_xml.endElement();\n    break;\n  case ResultWas::FatalErrorCondition:\n    m_xml.startElement(\"FatalErrorCondition\");\n    writeSourceInfo(result.getSourceInfo());\n    m_xml.writeText(result.getMessage());\n    m_xml.endElement();\n    break;\n  case ResultWas::Info:\n    m_xml.scopedElement(\"Info\").writeText(result.getMessage());\n    break;\n  case ResultWas::Warning:\n    // Warning will already have been written\n    break;\n  case ResultWas::ExplicitFailure:\n    m_xml.startElement(\"Failure\");\n    writeSourceInfo(result.getSourceInfo());\n    m_xml.writeText(result.getMessage());\n    m_xml.endElement();\n    break;\n  default:\n    break;\n  }\n\n  if (result.hasExpression())\n    m_xml.endElement();\n\n  return true;\n}\n\nvoid XmlReporter::sectionEnded(SectionStats const& sectionStats) {\n  StreamingReporterBase::sectionEnded(sectionStats);\n  if (--m_sectionDepth > 0) {\n    XmlWriter::ScopedElement e = m_xml.scopedElement(\"OverallResults\");\n    e.writeAttribute(\"successes\", sectionStats.assertions.passed);\n    e.writeAttribute(\"failures\", sectionStats.assertions.failed);\n    e.writeAttribute(\"expectedFailures\", sectionStats.assertions.failedButOk);\n\n    if (m_config->showDurations() == ShowDurations::Always)\n      e.writeAttribute(\"durationInSeconds\", sectionStats.durationInSeconds);\n\n    m_xml.endElement();\n  }\n}\n\nvoid XmlReporter::testCaseEnded(TestCaseStats const& testCaseStats) {\n  StreamingReporterBase::testCaseEnded(testCaseStats);\n  XmlWriter::ScopedElement e = m_xml.scopedElement(\"OverallResult\");\n  e.writeAttribute(\"success\", testCaseStats.totals.assertions.allOk());\n\n  if (m_config->showDurations() == ShowDurations::Always)\n    
e.writeAttribute(\"durationInSeconds\", m_testCaseTimer.getElapsedSeconds());\n\n  if (!testCaseStats.stdOut.empty())\n    m_xml.scopedElement(\"StdOut\").writeText(trim(testCaseStats.stdOut),\n                                            XmlFormatting::Newline);\n  if (!testCaseStats.stdErr.empty())\n    m_xml.scopedElement(\"StdErr\").writeText(trim(testCaseStats.stdErr),\n                                            XmlFormatting::Newline);\n\n  m_xml.endElement();\n}\n\nvoid XmlReporter::testGroupEnded(TestGroupStats const& testGroupStats) {\n  StreamingReporterBase::testGroupEnded(testGroupStats);\n  // TODO: Check testGroupStats.aborting and act accordingly.\n  m_xml.scopedElement(\"OverallResults\")\n      .writeAttribute(\"successes\", testGroupStats.totals.assertions.passed)\n      .writeAttribute(\"failures\", testGroupStats.totals.assertions.failed)\n      .writeAttribute(\"expectedFailures\",\n                      testGroupStats.totals.assertions.failedButOk);\n  m_xml.scopedElement(\"OverallResultsCases\")\n      .writeAttribute(\"successes\", testGroupStats.totals.testCases.passed)\n      .writeAttribute(\"failures\", testGroupStats.totals.testCases.failed)\n      .writeAttribute(\"expectedFailures\",\n                      testGroupStats.totals.testCases.failedButOk);\n  m_xml.endElement();\n}\n\nvoid XmlReporter::testRunEnded(TestRunStats const& testRunStats) {\n  StreamingReporterBase::testRunEnded(testRunStats);\n  m_xml.scopedElement(\"OverallResults\")\n      .writeAttribute(\"successes\", testRunStats.totals.assertions.passed)\n      .writeAttribute(\"failures\", testRunStats.totals.assertions.failed)\n      .writeAttribute(\"expectedFailures\",\n                      testRunStats.totals.assertions.failedButOk);\n  m_xml.scopedElement(\"OverallResultsCases\")\n      .writeAttribute(\"successes\", testRunStats.totals.testCases.passed)\n      .writeAttribute(\"failures\", testRunStats.totals.testCases.failed)\n      
.writeAttribute(\"expectedFailures\",\n                      testRunStats.totals.testCases.failedButOk);\n  m_xml.endElement();\n}\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\nvoid XmlReporter::benchmarkPreparing(std::string const& name) {\n  m_xml.startElement(\"BenchmarkResults\").writeAttribute(\"name\", name);\n}\n\nvoid XmlReporter::benchmarkStarting(BenchmarkInfo const& info) {\n  m_xml.writeAttribute(\"samples\", info.samples)\n      .writeAttribute(\"resamples\", info.resamples)\n      .writeAttribute(\"iterations\", info.iterations)\n      .writeAttribute(\"clockResolution\", info.clockResolution)\n      .writeAttribute(\"estimatedDuration\", info.estimatedDuration)\n      .writeComment(\"All values in nano seconds\");\n}\n\nvoid XmlReporter::benchmarkEnded(BenchmarkStats<> const& benchmarkStats) {\n  m_xml.startElement(\"mean\")\n      .writeAttribute(\"value\", benchmarkStats.mean.point.count())\n      .writeAttribute(\"lowerBound\", benchmarkStats.mean.lower_bound.count())\n      .writeAttribute(\"upperBound\", benchmarkStats.mean.upper_bound.count())\n      .writeAttribute(\"ci\", benchmarkStats.mean.confidence_interval);\n  m_xml.endElement();\n  m_xml.startElement(\"standardDeviation\")\n      .writeAttribute(\"value\", benchmarkStats.standardDeviation.point.count())\n      .writeAttribute(\"lowerBound\",\n                      benchmarkStats.standardDeviation.lower_bound.count())\n      .writeAttribute(\"upperBound\",\n                      benchmarkStats.standardDeviation.upper_bound.count())\n      .writeAttribute(\"ci\",\n                      benchmarkStats.standardDeviation.confidence_interval);\n  m_xml.endElement();\n  m_xml.startElement(\"outliers\")\n      .writeAttribute(\"variance\", benchmarkStats.outlierVariance)\n      .writeAttribute(\"lowMild\", benchmarkStats.outliers.low_mild)\n      .writeAttribute(\"lowSevere\", benchmarkStats.outliers.low_severe)\n      .writeAttribute(\"highMild\", benchmarkStats.outliers.high_mild)\n      
.writeAttribute(\"highSevere\", benchmarkStats.outliers.high_severe);\n  m_xml.endElement();\n  m_xml.endElement();\n}\n\nvoid XmlReporter::benchmarkFailed(std::string const& error) {\n  m_xml.scopedElement(\"failed\").writeAttribute(\"message\", error);\n  m_xml.endElement();\n}\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\nCATCH_REGISTER_REPORTER(\"xml\", XmlReporter)\n\n} // end namespace Catch\n\n#if defined(_MSC_VER)\n#pragma warning(pop)\n#endif\n// end catch_reporter_xml.cpp\n\nnamespace Catch {\nLeakDetector leakDetector;\n}\n\n#ifdef __clang__\n#pragma clang diagnostic pop\n#endif\n\n// end catch_impl.hpp\n#endif\n\n#ifdef CATCH_CONFIG_MAIN\n// start catch_default_main.hpp\n\n#ifndef __OBJC__\n\n#if defined(CATCH_CONFIG_WCHAR) && defined(CATCH_PLATFORM_WINDOWS) &&          \\\n    defined(_UNICODE) && !defined(DO_NOT_USE_WMAIN)\n// Standard C/C++ Win32 Unicode wmain entry point\nextern \"C\" int wmain(int argc, wchar_t* argv[], wchar_t*[]) {\n#else\n// Standard C/C++ main entry point\nint main(int argc, char* argv[]) {\n#endif\n\n  return Catch::Session().run(argc, argv);\n}\n\n#else // __OBJC__\n\n// Objective-C entry point\nint main(int argc, char* const argv[]) {\n#if !CATCH_ARC_ENABLED\n  NSAutoreleasePool* pool = [[NSAutoreleasePool alloc] init];\n#endif\n\n  Catch::registerTestMethods();\n  int result = Catch::Session().run(argc, (char**)argv);\n\n#if !CATCH_ARC_ENABLED\n  [pool drain];\n#endif\n\n  return result;\n}\n\n#endif // __OBJC__\n\n// end catch_default_main.hpp\n#endif\n\n#if !defined(CATCH_CONFIG_IMPL_ONLY)\n\n#ifdef CLARA_CONFIG_MAIN_NOT_DEFINED\n#undef CLARA_CONFIG_MAIN\n#endif\n\n#if !defined(CATCH_CONFIG_DISABLE)\n//////\n// If this config identifier is defined then all CATCH macros are prefixed with\n// CATCH_\n#ifdef CATCH_CONFIG_PREFIX_ALL\n\n#define CATCH_REQUIRE(...)                                                     
\\\n  INTERNAL_CATCH_TEST(\"CATCH_REQUIRE\", Catch::ResultDisposition::Normal,       \\\n                      __VA_ARGS__)\n#define CATCH_REQUIRE_FALSE(...)                                               \\\n  INTERNAL_CATCH_TEST(\"CATCH_REQUIRE_FALSE\",                                   \\\n                      Catch::ResultDisposition::Normal |                       \\\n                          Catch::ResultDisposition::FalseTest,                 \\\n                      __VA_ARGS__)\n\n#define CATCH_REQUIRE_THROWS(...)                                              \\\n  INTERNAL_CATCH_THROWS(\"CATCH_REQUIRE_THROWS\",                                \\\n                        Catch::ResultDisposition::Normal, __VA_ARGS__)\n#define CATCH_REQUIRE_THROWS_AS(expr, exceptionType)                           \\\n  INTERNAL_CATCH_THROWS_AS(\"CATCH_REQUIRE_THROWS_AS\", exceptionType,           \\\n                           Catch::ResultDisposition::Normal, expr)\n#define CATCH_REQUIRE_THROWS_WITH(expr, matcher)                               \\\n  INTERNAL_CATCH_THROWS_STR_MATCHES(\"CATCH_REQUIRE_THROWS_WITH\",               \\\n                                    Catch::ResultDisposition::Normal, matcher, \\\n                                    expr)\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CATCH_REQUIRE_THROWS_MATCHES(expr, exceptionType, matcher)             \\\n  INTERNAL_CATCH_THROWS_MATCHES(\"CATCH_REQUIRE_THROWS_MATCHES\", exceptionType, \\\n                                Catch::ResultDisposition::Normal, matcher,     \\\n                                expr)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n#define CATCH_REQUIRE_NOTHROW(...)                                             \\\n  INTERNAL_CATCH_NO_THROW(\"CATCH_REQUIRE_NOTHROW\",                             \\\n                          Catch::ResultDisposition::Normal, __VA_ARGS__)\n\n#define CATCH_CHECK(...)                                                       
\\\n  INTERNAL_CATCH_TEST(                                                         \\\n      \"CATCH_CHECK\", Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__)\n#define CATCH_CHECK_FALSE(...)                                                 \\\n  INTERNAL_CATCH_TEST(\"CATCH_CHECK_FALSE\",                                     \\\n                      Catch::ResultDisposition::ContinueOnFailure |            \\\n                          Catch::ResultDisposition::FalseTest,                 \\\n                      __VA_ARGS__)\n#define CATCH_CHECKED_IF(...)                                                  \\\n  INTERNAL_CATCH_IF(\"CATCH_CHECKED_IF\",                                        \\\n                    Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__)\n#define CATCH_CHECKED_ELSE(...)                                                \\\n  INTERNAL_CATCH_ELSE(\"CATCH_CHECKED_ELSE\",                                    \\\n                      Catch::ResultDisposition::ContinueOnFailure,             \\\n                      __VA_ARGS__)\n#define CATCH_CHECK_NOFAIL(...)                                                \\\n  INTERNAL_CATCH_TEST(\"CATCH_CHECK_NOFAIL\",                                    \\\n                      Catch::ResultDisposition::ContinueOnFailure |            \\\n                          Catch::ResultDisposition::SuppressFail,              \\\n                      __VA_ARGS__)\n\n#define CATCH_CHECK_THROWS(...)                                                
\\\n  INTERNAL_CATCH_THROWS(\"CATCH_CHECK_THROWS\",                                  \\\n                        Catch::ResultDisposition::ContinueOnFailure,           \\\n                        __VA_ARGS__)\n#define CATCH_CHECK_THROWS_AS(expr, exceptionType)                             \\\n  INTERNAL_CATCH_THROWS_AS(\"CATCH_CHECK_THROWS_AS\", exceptionType,             \\\n                           Catch::ResultDisposition::ContinueOnFailure, expr)\n#define CATCH_CHECK_THROWS_WITH(expr, matcher)                                 \\\n  INTERNAL_CATCH_THROWS_STR_MATCHES(                                           \\\n      \"CATCH_CHECK_THROWS_WITH\", Catch::ResultDisposition::ContinueOnFailure,  \\\n      matcher, expr)\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CATCH_CHECK_THROWS_MATCHES(expr, exceptionType, matcher)               \\\n  INTERNAL_CATCH_THROWS_MATCHES(\"CATCH_CHECK_THROWS_MATCHES\", exceptionType,   \\\n                                Catch::ResultDisposition::ContinueOnFailure,   \\\n                                matcher, expr)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n#define CATCH_CHECK_NOTHROW(...)                                               
\\\n  INTERNAL_CATCH_NO_THROW(\"CATCH_CHECK_NOTHROW\",                               \\\n                          Catch::ResultDisposition::ContinueOnFailure,         \\\n                          __VA_ARGS__)\n\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CATCH_CHECK_THAT(arg, matcher)                                         \\\n  INTERNAL_CHECK_THAT(\"CATCH_CHECK_THAT\", matcher,                             \\\n                      Catch::ResultDisposition::ContinueOnFailure, arg)\n\n#define CATCH_REQUIRE_THAT(arg, matcher)                                       \\\n  INTERNAL_CHECK_THAT(\"CATCH_REQUIRE_THAT\", matcher,                           \\\n                      Catch::ResultDisposition::Normal, arg)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n\n#define CATCH_INFO(msg) INTERNAL_CATCH_INFO(\"CATCH_INFO\", msg)\n#define CATCH_UNSCOPED_INFO(msg)                                               \\\n  INTERNAL_CATCH_UNSCOPED_INFO(\"CATCH_UNSCOPED_INFO\", msg)\n#define CATCH_WARN(msg)                                                        \\\n  INTERNAL_CATCH_MSG(\"CATCH_WARN\", Catch::ResultWas::Warning,                  \\\n                     Catch::ResultDisposition::ContinueOnFailure, msg)\n#define CATCH_CAPTURE(...)                                                     \\\n  INTERNAL_CATCH_CAPTURE(INTERNAL_CATCH_UNIQUE_NAME(capturer),                 \\\n                         \"CATCH_CAPTURE\", __VA_ARGS__)\n\n#define CATCH_TEST_CASE(...) INTERNAL_CATCH_TESTCASE(__VA_ARGS__)\n#define CATCH_TEST_CASE_METHOD(className, ...)                                 \\\n  INTERNAL_CATCH_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define CATCH_METHOD_AS_TEST_CASE(method, ...)                                 \\\n  INTERNAL_CATCH_METHOD_AS_TEST_CASE(method, __VA_ARGS__)\n#define CATCH_REGISTER_TEST_CASE(Function, ...)                                \\\n  INTERNAL_CATCH_REGISTER_TESTCASE(Function, __VA_ARGS__)\n#define CATCH_SECTION(...) 
INTERNAL_CATCH_SECTION(__VA_ARGS__)\n#define CATCH_DYNAMIC_SECTION(...) INTERNAL_CATCH_DYNAMIC_SECTION(__VA_ARGS__)\n#define CATCH_FAIL(...)                                                        \\\n  INTERNAL_CATCH_MSG(\"CATCH_FAIL\", Catch::ResultWas::ExplicitFailure,          \\\n                     Catch::ResultDisposition::Normal, __VA_ARGS__)\n#define CATCH_FAIL_CHECK(...)                                                  \\\n  INTERNAL_CATCH_MSG(\"CATCH_FAIL_CHECK\", Catch::ResultWas::ExplicitFailure,    \\\n                     Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__)\n#define CATCH_SUCCEED(...)                                                     \\\n  INTERNAL_CATCH_MSG(\"CATCH_SUCCEED\", Catch::ResultWas::Ok,                    \\\n                     Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__)\n\n#define CATCH_ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE()\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define CATCH_TEMPLATE_TEST_CASE(...)                                          \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define CATCH_TEMPLATE_TEST_CASE_SIG(...)                                      \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(__VA_ARGS__)\n#define CATCH_TEMPLATE_TEST_CASE_METHOD(className, ...)                        \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(className, ...)                    \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(className, __VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE(...)                                  \\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(__VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(...)                              \\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(__VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, ...)                
\\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, ...)            \\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, __VA_ARGS__)\n#else\n#define CATCH_TEMPLATE_TEST_CASE(...)                                          \\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_TEMPLATE_TEST_CASE(__VA_ARGS__))\n#define CATCH_TEMPLATE_TEST_CASE_SIG(...)                                      \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(__VA_ARGS__))\n#define CATCH_TEMPLATE_TEST_CASE_METHOD(className, ...)                        \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__))\n#define CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(className, ...)                    \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(className, __VA_ARGS__))\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE(...)                                  \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(__VA_ARGS__))\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(...)                              \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(__VA_ARGS__))\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, ...)                \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(className,              \\\n                                                       __VA_ARGS__))\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, ...)            
\\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className,          \\\n                                                           __VA_ARGS__))\n#endif\n\n#if !defined(CATCH_CONFIG_RUNTIME_STATIC_REQUIRE)\n#define CATCH_STATIC_REQUIRE(...)                                              \\\n  static_assert(__VA_ARGS__, #__VA_ARGS__);                                    \\\n  CATCH_SUCCEED(#__VA_ARGS__)\n#define CATCH_STATIC_REQUIRE_FALSE(...)                                        \\\n  static_assert(!(__VA_ARGS__), \"!(\" #__VA_ARGS__ \")\");                        \\\n  CATCH_SUCCEED(#__VA_ARGS__)\n#else\n#define CATCH_STATIC_REQUIRE(...) CATCH_REQUIRE(__VA_ARGS__)\n#define CATCH_STATIC_REQUIRE_FALSE(...) CATCH_REQUIRE_FALSE(__VA_ARGS__)\n#endif\n\n// \"BDD-style\" convenience wrappers\n#define CATCH_SCENARIO(...) CATCH_TEST_CASE(\"Scenario: \" __VA_ARGS__)\n#define CATCH_SCENARIO_METHOD(className, ...)                                  \\\n  INTERNAL_CATCH_TEST_CASE_METHOD(className, \"Scenario: \" __VA_ARGS__)\n#define CATCH_GIVEN(desc) INTERNAL_CATCH_DYNAMIC_SECTION(\"    Given: \" << desc)\n#define CATCH_AND_GIVEN(desc)                                                  \\\n  INTERNAL_CATCH_DYNAMIC_SECTION(\"And given: \" << desc)\n#define CATCH_WHEN(desc) INTERNAL_CATCH_DYNAMIC_SECTION(\"     When: \" << desc)\n#define CATCH_AND_WHEN(desc)                                                   \\\n  INTERNAL_CATCH_DYNAMIC_SECTION(\" And when: \" << desc)\n#define CATCH_THEN(desc) INTERNAL_CATCH_DYNAMIC_SECTION(\"     Then: \" << desc)\n#define CATCH_AND_THEN(desc)                                                   \\\n  INTERNAL_CATCH_DYNAMIC_SECTION(\"      And: \" << desc)\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n#define CATCH_BENCHMARK(...)                                                   
\\\n  INTERNAL_CATCH_BENCHMARK(                                                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____),              \\\n      INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__, , ),                               \\\n      INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__, , ))\n#define CATCH_BENCHMARK_ADVANCED(name)                                         \\\n  INTERNAL_CATCH_BENCHMARK_ADVANCED(                                           \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), name)\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\n// If CATCH_CONFIG_PREFIX_ALL is not defined then the CATCH_ prefix is not\n// required\n#else\n\n#define REQUIRE(...)                                                           \\\n  INTERNAL_CATCH_TEST(\"REQUIRE\", Catch::ResultDisposition::Normal, __VA_ARGS__)\n#define REQUIRE_FALSE(...)                                                     \\\n  INTERNAL_CATCH_TEST(\"REQUIRE_FALSE\",                                         \\\n                      Catch::ResultDisposition::Normal |                       \\\n                          Catch::ResultDisposition::FalseTest,                 \\\n                      __VA_ARGS__)\n\n#define REQUIRE_THROWS(...)                                                    
\\\n  INTERNAL_CATCH_THROWS(\"REQUIRE_THROWS\", Catch::ResultDisposition::Normal,    \\\n                        __VA_ARGS__)\n#define REQUIRE_THROWS_AS(expr, exceptionType)                                 \\\n  INTERNAL_CATCH_THROWS_AS(\"REQUIRE_THROWS_AS\", exceptionType,                 \\\n                           Catch::ResultDisposition::Normal, expr)\n#define REQUIRE_THROWS_WITH(expr, matcher)                                     \\\n  INTERNAL_CATCH_THROWS_STR_MATCHES(                                           \\\n      \"REQUIRE_THROWS_WITH\", Catch::ResultDisposition::Normal, matcher, expr)\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define REQUIRE_THROWS_MATCHES(expr, exceptionType, matcher)                   \\\n  INTERNAL_CATCH_THROWS_MATCHES(\"REQUIRE_THROWS_MATCHES\", exceptionType,       \\\n                                Catch::ResultDisposition::Normal, matcher,     \\\n                                expr)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n#define REQUIRE_NOTHROW(...)                                                   \\\n  INTERNAL_CATCH_NO_THROW(\"REQUIRE_NOTHROW\", Catch::ResultDisposition::Normal, \\\n                          __VA_ARGS__)\n\n#define CHECK(...)                                                             \\\n  INTERNAL_CATCH_TEST(\"CHECK\", Catch::ResultDisposition::ContinueOnFailure,    \\\n                      __VA_ARGS__)\n#define CHECK_FALSE(...)                                                       \\\n  INTERNAL_CATCH_TEST(\"CHECK_FALSE\",                                           \\\n                      Catch::ResultDisposition::ContinueOnFailure |            \\\n                          Catch::ResultDisposition::FalseTest,                 \\\n                      __VA_ARGS__)\n#define CHECKED_IF(...)                                                        \\\n  INTERNAL_CATCH_IF(\"CHECKED_IF\", Catch::ResultDisposition::ContinueOnFailure, \\\n                    __VA_ARGS__)\n#define CHECKED_ELSE(...)   
                                                   \\\n  INTERNAL_CATCH_ELSE(\"CHECKED_ELSE\",                                          \\\n                      Catch::ResultDisposition::ContinueOnFailure,             \\\n                      __VA_ARGS__)\n#define CHECK_NOFAIL(...)                                                      \\\n  INTERNAL_CATCH_TEST(\"CHECK_NOFAIL\",                                          \\\n                      Catch::ResultDisposition::ContinueOnFailure |            \\\n                          Catch::ResultDisposition::SuppressFail,              \\\n                      __VA_ARGS__)\n\n#define CHECK_THROWS(...)                                                      \\\n  INTERNAL_CATCH_THROWS(\"CHECK_THROWS\",                                        \\\n                        Catch::ResultDisposition::ContinueOnFailure,           \\\n                        __VA_ARGS__)\n#define CHECK_THROWS_AS(expr, exceptionType)                                   \\\n  INTERNAL_CATCH_THROWS_AS(\"CHECK_THROWS_AS\", exceptionType,                   \\\n                           Catch::ResultDisposition::ContinueOnFailure, expr)\n#define CHECK_THROWS_WITH(expr, matcher)                                       \\\n  INTERNAL_CATCH_THROWS_STR_MATCHES(                                           \\\n      \"CHECK_THROWS_WITH\", Catch::ResultDisposition::ContinueOnFailure,        \\\n      matcher, expr)\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CHECK_THROWS_MATCHES(expr, exceptionType, matcher)                     \\\n  INTERNAL_CATCH_THROWS_MATCHES(\"CHECK_THROWS_MATCHES\", exceptionType,         \\\n                                Catch::ResultDisposition::ContinueOnFailure,   \\\n                                matcher, expr)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n#define CHECK_NOTHROW(...)                                                     
\\\n  INTERNAL_CATCH_NO_THROW(\"CHECK_NOTHROW\",                                     \\\n                          Catch::ResultDisposition::ContinueOnFailure,         \\\n                          __VA_ARGS__)\n\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CHECK_THAT(arg, matcher)                                               \\\n  INTERNAL_CHECK_THAT(\"CHECK_THAT\", matcher,                                   \\\n                      Catch::ResultDisposition::ContinueOnFailure, arg)\n\n#define REQUIRE_THAT(arg, matcher)                                             \\\n  INTERNAL_CHECK_THAT(\"REQUIRE_THAT\", matcher,                                 \\\n                      Catch::ResultDisposition::Normal, arg)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n\n#define INFO(msg) INTERNAL_CATCH_INFO(\"INFO\", msg)\n#define UNSCOPED_INFO(msg) INTERNAL_CATCH_UNSCOPED_INFO(\"UNSCOPED_INFO\", msg)\n#define WARN(msg)                                                              \\\n  INTERNAL_CATCH_MSG(\"WARN\", Catch::ResultWas::Warning,                        \\\n                     Catch::ResultDisposition::ContinueOnFailure, msg)\n#define CAPTURE(...)                                                           \\\n  INTERNAL_CATCH_CAPTURE(INTERNAL_CATCH_UNIQUE_NAME(capturer), \"CAPTURE\",      \\\n                         __VA_ARGS__)\n\n#define TEST_CASE(...) INTERNAL_CATCH_TESTCASE(__VA_ARGS__)\n#define TEST_CASE_METHOD(className, ...)                                       \\\n  INTERNAL_CATCH_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define METHOD_AS_TEST_CASE(method, ...)                                       \\\n  INTERNAL_CATCH_METHOD_AS_TEST_CASE(method, __VA_ARGS__)\n#define REGISTER_TEST_CASE(Function, ...)                                      \\\n  INTERNAL_CATCH_REGISTER_TESTCASE(Function, __VA_ARGS__)\n#define SECTION(...) INTERNAL_CATCH_SECTION(__VA_ARGS__)\n#define DYNAMIC_SECTION(...) INTERNAL_CATCH_DYNAMIC_SECTION(__VA_ARGS__)\n#define FAIL(...)     
                                                         \\\n  INTERNAL_CATCH_MSG(\"FAIL\", Catch::ResultWas::ExplicitFailure,                \\\n                     Catch::ResultDisposition::Normal, __VA_ARGS__)\n#define FAIL_CHECK(...)                                                        \\\n  INTERNAL_CATCH_MSG(\"FAIL_CHECK\", Catch::ResultWas::ExplicitFailure,          \\\n                     Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__)\n#define SUCCEED(...)                                                           \\\n  INTERNAL_CATCH_MSG(\"SUCCEED\", Catch::ResultWas::Ok,                          \\\n                     Catch::ResultDisposition::ContinueOnFailure, __VA_ARGS__)\n#define ANON_TEST_CASE() INTERNAL_CATCH_TESTCASE()\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define TEMPLATE_TEST_CASE(...) INTERNAL_CATCH_TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define TEMPLATE_TEST_CASE_SIG(...)                                            \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(__VA_ARGS__)\n#define TEMPLATE_TEST_CASE_METHOD(className, ...)                              \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define TEMPLATE_TEST_CASE_METHOD_SIG(className, ...)                          \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(className, __VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE(...)                                        \\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(__VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE_SIG(...)                                    \\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(__VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, ...)                      \\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, ...)                  \\\n  INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, __VA_ARGS__)\n#define TEMPLATE_LIST_TEST_CASE(...)                
                           \\\n  INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE(__VA_ARGS__)\n#define TEMPLATE_LIST_TEST_CASE_METHOD(className, ...)                         \\\n  INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD(className, __VA_ARGS__)\n#else\n#define TEMPLATE_TEST_CASE(...)                                                \\\n  INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_TEMPLATE_TEST_CASE(__VA_ARGS__))\n#define TEMPLATE_TEST_CASE_SIG(...)                                            \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG(__VA_ARGS__))\n#define TEMPLATE_TEST_CASE_METHOD(className, ...)                              \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__))\n#define TEMPLATE_TEST_CASE_METHOD_SIG(className, ...)                          \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(className, __VA_ARGS__))\n#define TEMPLATE_PRODUCT_TEST_CASE(...)                                        \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE(__VA_ARGS__))\n#define TEMPLATE_PRODUCT_TEST_CASE_SIG(...)                                    \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(__VA_ARGS__))\n#define TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, ...)                      \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(className,              \\\n                                                       __VA_ARGS__))\n#define TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, ...)                  
\\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className,          \\\n                                                           __VA_ARGS__))\n#define TEMPLATE_LIST_TEST_CASE(...)                                           \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE(__VA_ARGS__))\n#define TEMPLATE_LIST_TEST_CASE_METHOD(className, ...)                         \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_LIST_TEST_CASE_METHOD(className, __VA_ARGS__))\n#endif\n\n#if !defined(CATCH_CONFIG_RUNTIME_STATIC_REQUIRE)\n#define STATIC_REQUIRE(...)                                                    \\\n  static_assert(__VA_ARGS__, #__VA_ARGS__);                                    \\\n  SUCCEED(#__VA_ARGS__)\n#define STATIC_REQUIRE_FALSE(...)                                              \\\n  static_assert(!(__VA_ARGS__), \"!(\" #__VA_ARGS__ \")\");                        \\\n  SUCCEED(\"!(\" #__VA_ARGS__ \")\")\n#else\n#define STATIC_REQUIRE(...) REQUIRE(__VA_ARGS__)\n#define STATIC_REQUIRE_FALSE(...) REQUIRE_FALSE(__VA_ARGS__)\n#endif\n\n#endif\n\n#define CATCH_TRANSLATE_EXCEPTION(signature)                                   \\\n  INTERNAL_CATCH_TRANSLATE_EXCEPTION(signature)\n\n// \"BDD-style\" convenience wrappers\n#define SCENARIO(...) TEST_CASE(\"Scenario: \" __VA_ARGS__)\n#define SCENARIO_METHOD(className, ...)                                        
\\\n  INTERNAL_CATCH_TEST_CASE_METHOD(className, \"Scenario: \" __VA_ARGS__)\n\n#define GIVEN(desc) INTERNAL_CATCH_DYNAMIC_SECTION(\"    Given: \" << desc)\n#define AND_GIVEN(desc) INTERNAL_CATCH_DYNAMIC_SECTION(\"And given: \" << desc)\n#define WHEN(desc) INTERNAL_CATCH_DYNAMIC_SECTION(\"     When: \" << desc)\n#define AND_WHEN(desc) INTERNAL_CATCH_DYNAMIC_SECTION(\" And when: \" << desc)\n#define THEN(desc) INTERNAL_CATCH_DYNAMIC_SECTION(\"     Then: \" << desc)\n#define AND_THEN(desc) INTERNAL_CATCH_DYNAMIC_SECTION(\"      And: \" << desc)\n\n#if defined(CATCH_CONFIG_ENABLE_BENCHMARKING)\n#define BENCHMARK(...)                                                         \\\n  INTERNAL_CATCH_BENCHMARK(                                                    \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____),              \\\n      INTERNAL_CATCH_GET_1_ARG(__VA_ARGS__, , ),                               \\\n      INTERNAL_CATCH_GET_2_ARG(__VA_ARGS__, , ))\n#define BENCHMARK_ADVANCED(name)                                               \\\n  INTERNAL_CATCH_BENCHMARK_ADVANCED(                                           \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____B_E_N_C_H____), name)\n#endif // CATCH_CONFIG_ENABLE_BENCHMARKING\n\nusing Catch::Detail::Approx;\n\n#else // CATCH_CONFIG_DISABLE\n\n//////\n// If this config identifier is defined then all CATCH macros are prefixed with\n// CATCH_\n#ifdef CATCH_CONFIG_PREFIX_ALL\n\n#define CATCH_REQUIRE(...) (void)(0)\n#define CATCH_REQUIRE_FALSE(...) (void)(0)\n\n#define CATCH_REQUIRE_THROWS(...) (void)(0)\n#define CATCH_REQUIRE_THROWS_AS(expr, exceptionType) (void)(0)\n#define CATCH_REQUIRE_THROWS_WITH(expr, matcher) (void)(0)\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CATCH_REQUIRE_THROWS_MATCHES(expr, exceptionType, matcher) (void)(0)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n#define CATCH_REQUIRE_NOTHROW(...) (void)(0)\n\n#define CATCH_CHECK(...) (void)(0)\n#define CATCH_CHECK_FALSE(...) 
(void)(0)\n#define CATCH_CHECKED_IF(...) if (__VA_ARGS__)\n#define CATCH_CHECKED_ELSE(...) if (!(__VA_ARGS__))\n#define CATCH_CHECK_NOFAIL(...) (void)(0)\n\n#define CATCH_CHECK_THROWS(...) (void)(0)\n#define CATCH_CHECK_THROWS_AS(expr, exceptionType) (void)(0)\n#define CATCH_CHECK_THROWS_WITH(expr, matcher) (void)(0)\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CATCH_CHECK_THROWS_MATCHES(expr, exceptionType, matcher) (void)(0)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n#define CATCH_CHECK_NOTHROW(...) (void)(0)\n\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CATCH_CHECK_THAT(arg, matcher) (void)(0)\n\n#define CATCH_REQUIRE_THAT(arg, matcher) (void)(0)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n\n#define CATCH_INFO(msg) (void)(0)\n#define CATCH_UNSCOPED_INFO(msg) (void)(0)\n#define CATCH_WARN(msg) (void)(0)\n#define CATCH_CAPTURE(msg) (void)(0)\n\n#define CATCH_TEST_CASE(...)                                                   \\\n  INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(                                     \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____))\n#define CATCH_TEST_CASE_METHOD(className, ...)                                 \\\n  INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(                                     \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____))\n#define CATCH_METHOD_AS_TEST_CASE(method, ...)\n#define CATCH_REGISTER_TEST_CASE(Function, ...) (void)(0)\n#define CATCH_SECTION(...)\n#define CATCH_DYNAMIC_SECTION(...)\n#define CATCH_FAIL(...) (void)(0)\n#define CATCH_FAIL_CHECK(...) (void)(0)\n#define CATCH_SUCCEED(...) (void)(0)\n\n#define CATCH_ANON_TEST_CASE()                                                 \\\n  INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(                                     \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____))\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define CATCH_TEMPLATE_TEST_CASE(...)                                          
\\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__)\n#define CATCH_TEMPLATE_TEST_CASE_SIG(...)                                      \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(__VA_ARGS__)\n#define CATCH_TEMPLATE_TEST_CASE_METHOD(className, ...)                        \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(className,          \\\n                                                           __VA_ARGS__)\n#define CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(className, ...)                    \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(className,      \\\n                                                               __VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE(...)                                  \\\n  CATCH_TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(...)                              \\\n  CATCH_TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, ...)                \\\n  CATCH_TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, ...)            \\\n  CATCH_TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#else\n#define CATCH_TEMPLATE_TEST_CASE(...)                                          \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__))\n#define CATCH_TEMPLATE_TEST_CASE_SIG(...)                                      \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(__VA_ARGS__))\n#define CATCH_TEMPLATE_TEST_CASE_METHOD(className, ...)                        
\\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(className,      \\\n                                                               __VA_ARGS__))\n#define CATCH_TEMPLATE_TEST_CASE_METHOD_SIG(className, ...)                    \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(            \\\n          className, __VA_ARGS__))\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE(...)                                  \\\n  CATCH_TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_SIG(...)                              \\\n  CATCH_TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, ...)                \\\n  CATCH_TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define CATCH_TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, ...)            \\\n  CATCH_TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#endif\n\n// \"BDD-style\" convenience wrappers\n#define CATCH_SCENARIO(...)                                                    \\\n  INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(                                     \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____))\n#define CATCH_SCENARIO_METHOD(className, ...)                                  \\\n  INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(                              \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____), className)\n#define CATCH_GIVEN(desc)\n#define CATCH_AND_GIVEN(desc)\n#define CATCH_WHEN(desc)\n#define CATCH_AND_WHEN(desc)\n#define CATCH_THEN(desc)\n#define CATCH_AND_THEN(desc)\n\n#define CATCH_STATIC_REQUIRE(...) (void)(0)\n#define CATCH_STATIC_REQUIRE_FALSE(...) (void)(0)\n\n// If CATCH_CONFIG_PREFIX_ALL is not defined then the CATCH_ prefix is not\n// required\n#else\n\n#define REQUIRE(...) 
(void)(0)\n#define REQUIRE_FALSE(...) (void)(0)\n\n#define REQUIRE_THROWS(...) (void)(0)\n#define REQUIRE_THROWS_AS(expr, exceptionType) (void)(0)\n#define REQUIRE_THROWS_WITH(expr, matcher) (void)(0)\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define REQUIRE_THROWS_MATCHES(expr, exceptionType, matcher) (void)(0)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n#define REQUIRE_NOTHROW(...) (void)(0)\n\n#define CHECK(...) (void)(0)\n#define CHECK_FALSE(...) (void)(0)\n#define CHECKED_IF(...) if (__VA_ARGS__)\n#define CHECKED_ELSE(...) if (!(__VA_ARGS__))\n#define CHECK_NOFAIL(...) (void)(0)\n\n#define CHECK_THROWS(...) (void)(0)\n#define CHECK_THROWS_AS(expr, exceptionType) (void)(0)\n#define CHECK_THROWS_WITH(expr, matcher) (void)(0)\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CHECK_THROWS_MATCHES(expr, exceptionType, matcher) (void)(0)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n#define CHECK_NOTHROW(...) (void)(0)\n\n#if !defined(CATCH_CONFIG_DISABLE_MATCHERS)\n#define CHECK_THAT(arg, matcher) (void)(0)\n\n#define REQUIRE_THAT(arg, matcher) (void)(0)\n#endif // CATCH_CONFIG_DISABLE_MATCHERS\n\n#define INFO(msg) (void)(0)\n#define UNSCOPED_INFO(msg) (void)(0)\n#define WARN(msg) (void)(0)\n#define CAPTURE(msg) (void)(0)\n\n#define TEST_CASE(...)                                                         \\\n  INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(                                     \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____))\n#define TEST_CASE_METHOD(className, ...)                                       \\\n  INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(                                     \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____))\n#define METHOD_AS_TEST_CASE(method, ...)\n#define REGISTER_TEST_CASE(Function, ...) (void)(0)\n#define SECTION(...)\n#define DYNAMIC_SECTION(...)\n#define FAIL(...) (void)(0)\n#define FAIL_CHECK(...) (void)(0)\n#define SUCCEED(...) 
(void)(0)\n#define ANON_TEST_CASE()                                                       \\\n  INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(                                     \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____))\n\n#ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR\n#define TEMPLATE_TEST_CASE(...)                                                \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__)\n#define TEMPLATE_TEST_CASE_SIG(...)                                            \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(__VA_ARGS__)\n#define TEMPLATE_TEST_CASE_METHOD(className, ...)                              \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(className,          \\\n                                                           __VA_ARGS__)\n#define TEMPLATE_TEST_CASE_METHOD_SIG(className, ...)                          \\\n  INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(className,      \\\n                                                               __VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE(...) TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE_SIG(...) TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, ...)                      \\\n  TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, ...)                  \\\n  TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#else\n#define TEMPLATE_TEST_CASE(...)                                                \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_NO_REGISTRATION(__VA_ARGS__))\n#define TEMPLATE_TEST_CASE_SIG(...)                                            
\\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_SIG_NO_REGISTRATION(__VA_ARGS__))\n#define TEMPLATE_TEST_CASE_METHOD(className, ...)                              \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_NO_REGISTRATION(className,      \\\n                                                               __VA_ARGS__))\n#define TEMPLATE_TEST_CASE_METHOD_SIG(className, ...)                          \\\n  INTERNAL_CATCH_EXPAND_VARGS(                                                 \\\n      INTERNAL_CATCH_TEMPLATE_TEST_CASE_METHOD_SIG_NO_REGISTRATION(            \\\n          className, __VA_ARGS__))\n#define TEMPLATE_PRODUCT_TEST_CASE(...) TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE_SIG(...) TEMPLATE_TEST_CASE(__VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE_METHOD(className, ...)                      \\\n  TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#define TEMPLATE_PRODUCT_TEST_CASE_METHOD_SIG(className, ...)                  \\\n  TEMPLATE_TEST_CASE_METHOD(className, __VA_ARGS__)\n#endif\n\n#define STATIC_REQUIRE(...) (void)(0)\n#define STATIC_REQUIRE_FALSE(...) (void)(0)\n\n#endif\n\n#define CATCH_TRANSLATE_EXCEPTION(signature)                                   \\\n  INTERNAL_CATCH_TRANSLATE_EXCEPTION_NO_REG(                                   \\\n      INTERNAL_CATCH_UNIQUE_NAME(catch_internal_ExceptionTranslator),          \\\n      signature)\n\n// \"BDD-style\" convenience wrappers\n#define SCENARIO(...)                                                          \\\n  INTERNAL_CATCH_TESTCASE_NO_REGISTRATION(                                     \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____))\n#define SCENARIO_METHOD(className, ...)                                        
\\\n  INTERNAL_CATCH_TESTCASE_METHOD_NO_REGISTRATION(                              \\\n      INTERNAL_CATCH_UNIQUE_NAME(____C_A_T_C_H____T_E_S_T____), className)\n\n#define GIVEN(desc)\n#define AND_GIVEN(desc)\n#define WHEN(desc)\n#define AND_WHEN(desc)\n#define THEN(desc)\n#define AND_THEN(desc)\n\nusing Catch::Detail::Approx;\n\n#endif\n\n#endif // ! CATCH_CONFIG_IMPL_ONLY\n\n// start catch_reenable_warnings.h\n\n#ifdef __clang__\n#ifdef __ICC // icpc defines the __clang__ macro\n#pragma warning(pop)\n#else\n#pragma clang diagnostic pop\n#endif\n#elif defined __GNUC__\n#pragma GCC diagnostic pop\n#endif\n\n// end catch_reenable_warnings.h\n// end catch.hpp\n#endif // TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/test/model/MapTest.cpp",
    "content": "#include \"../../src/utils/Utils.h\"\n#include \"../../src/model/Map.h\"\n\nTEST_CASE(\"Map::get_height Test\") {\n  double value         = 45.;\n  double** placeholder = (double**)malloc(2 * sizeof(double*));\n  placeholder[0]       = (double*)malloc(3 * sizeof(double));\n  placeholder[1]       = (double*)malloc(3 * sizeof(double));\n  placeholder[0][0]    = value + 8;\n  placeholder[0][1]    = value;\n  placeholder[0][2]    = 0;\n  placeholder[1][0]    = 0;\n  placeholder[1][1]    = 0;\n  placeholder[1][2]    = 0;\n  Map map{placeholder, 3, 2, 1., 1.};\n\n  REQUIRE(fabs(map.get_height(1, 0, false) - value) < 1e-1);\n}\n\nTEST_CASE(\"Map::get_height Test2\") {\n  double value         = 45.;\n  double north_border  = 49;\n  double west_border   = 20;\n  double** placeholder = (double**)malloc(2 * sizeof(double*));\n  placeholder[0]       = (double*)malloc(3 * sizeof(double));\n  placeholder[1]       = (double*)malloc(3 * sizeof(double));\n  placeholder[0][0]    = value + 8;\n  placeholder[0][1]    = value;\n  placeholder[0][2]    = 0;\n  placeholder[1][0]    = 0;\n  placeholder[1][1]    = 0;\n  placeholder[1][2]    = 0;\n\n  Map map{placeholder, 3, 2, 0.5, 0.5};\n  map.setNorthBorder(north_border);\n  map.setWestBorder(west_border);\n\n  REQUIRE(fabs(map.get_height(west_border + 0.5, north_border, false) - value) <\n          1e-1);\n}"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/test/model/ProductionStateTest.cpp",
    "content": "#include \"../catch.hpp\"\n#include \"../../src/model/Graph.h\"\n#include \"../testUtils.cpp\"\n#include \"../../src/model/ProductionState.h\"\n\nTEST_CASE(\"ProductionState construction lengths test\") {\n  galois::SharedMemSys G;\n  Graph graph{};\n  std::vector<GNode> nodes = generateSampleGraph(graph);\n  ConnectivityManager connManager{graph};\n  ProductionState pState{connManager, nodes[4], false,\n                         [](double, double) { return 0.; }};\n\n  REQUIRE(fabs(pState.getLengths()[0] - 1) <= 1e-6);\n  REQUIRE(fabs(pState.getLengths()[1] - 1) <= 1e-6);\n  REQUIRE(fabs(pState.getLengths()[2] - sqrt(2)) <= 1e-6);\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/test/productions/Production1Test.cpp",
    "content": "#include \"../catch.hpp\"\n#include \"../../src/productions/Production1.h\"\n#include \"../../src/model/Graph.h\"\n#include \"../../src/model/Coordinates.h\"\n#include \"../../src/model/NodeData.h\"\n#include \"../testUtils.cpp\"\n\nstd::vector<GNode> generateTest1Graph(Graph& graph) {\n  std::vector<GNode> nodes;\n  ConnectivityManager connManager{graph};\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{0, 0, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{0, 1, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{1, 0, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{1, 1, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{2, 0, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{2, 1, 0}, false}));\n\n  connManager.createEdge(nodes[0], nodes[1], true, Coordinates{0, 0.5, 0}, 1);\n  connManager.createEdge(nodes[1], nodes[3], true, Coordinates{0.5, 1, 0}, 1);\n  connManager.createEdge(nodes[2], nodes[3], false, Coordinates{1, 0.5, 0}, 1);\n  connManager.createEdge(nodes[0], nodes[2], true, Coordinates{0.5, 0, 0}, 1);\n  connManager.createEdge(nodes[2], nodes[4], true, Coordinates{1.5, 0, 0}, 1);\n  connManager.createEdge(nodes[3], nodes[5], true, Coordinates{1.5, 1, 0}, 1);\n  connManager.createEdge(nodes[4], nodes[5], true, Coordinates{2, 0.5, 0}, 1);\n  connManager.createEdge(nodes[1], nodes[2], false, Coordinates{0.5, 0.5, 0},\n                         sqrt(2));\n  connManager.createEdge(nodes[2], nodes[5], false, Coordinates{1.5, 0.5, 0},\n                         sqrt(2));\n\n  nodes.push_back(connManager.createInterior(nodes[0], nodes[1], nodes[2]));\n  nodes.push_back(connManager.createInterior(nodes[1], nodes[3], nodes[2]));\n  nodes.push_back(connManager.createInterior(nodes[2], nodes[3], nodes[5]));\n  
nodes.push_back(connManager.createInterior(nodes[2], nodes[4], nodes[5]));\n  return nodes;\n}\n\nTEST_CASE(\"Production1 simple Test\") {\n  galois::SharedMemSys G;\n  Graph graph{};\n  vector<GNode> nodes = generateSampleGraph(graph);\n  nodes[5]->getData().setToRefine(true);\n  galois::UserContext<GNode> ctx;\n  ConnectivityManager connManager{graph};\n  Production1 production{connManager};\n  ProductionState pState(connManager, nodes[5], false,\n                         [](double, double) { return 0.; });\n  production.execute(pState, ctx);\n\n  REQUIRE(countHEdges(graph) == 3);\n  REQUIRE(countVertices(graph) == 5);\n}\n\nTEST_CASE(\"Production1 complex Test\") {\n  galois::SharedMemSys G;\n  Graph graph{};\n  vector<GNode> nodes = generateTest1Graph(graph);\n  nodes[6]->getData().setToRefine(true);\n  nodes[7]->getData().setToRefine(true);\n  nodes[8]->getData().setToRefine(true);\n  nodes[9]->getData().setToRefine(true);\n  galois::UserContext<GNode> ctx;\n  ConnectivityManager connManager{graph};\n  Production1 production{connManager};\n  ProductionState pState1(connManager, nodes[6], false,\n                          [](double, double) { return 0.; });\n  production.execute(pState1, ctx);\n  ProductionState pState2(connManager, nodes[7], false,\n                          [](double, double) { return 0.; });\n  production.execute(pState2, ctx);\n  ProductionState pState3(connManager, nodes[8], false,\n                          [](double, double) { return 0.; });\n  production.execute(pState3, ctx);\n  ProductionState pState4(connManager, nodes[9], false,\n                          [](double, double) { return 0.; });\n  production.execute(pState4, ctx);\n\n  REQUIRE(countHEdges(graph) == 6);\n  REQUIRE(countVertices(graph) == 8);\n}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/test/testUtils.cpp",
    "content": "#ifndef GALOIS_TEST_UTILS\n#define GALOIS_TEST_UTILS\n\n#include \"../src/utils/ConnectivityManager.h\"\n\nstd::vector<GNode> generateSampleGraph(Graph& graph) {\n  std::vector<GNode> nodes;\n  ConnectivityManager connManager{graph};\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{0, 0, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{0, 1, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{1, 0, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{1, 1, 0}, false}));\n\n  connManager.createEdge(nodes[0], nodes[1], true, Coordinates{0, 0.5, 0}, 1);\n  connManager.createEdge(nodes[1], nodes[3], true, Coordinates{0.5, 1, 0}, 1);\n  connManager.createEdge(nodes[2], nodes[3], true, Coordinates{1, 0.5, 0}, 1);\n  connManager.createEdge(nodes[0], nodes[2], true, Coordinates{0.5, 0, 0}, 1);\n  connManager.createEdge(nodes[3], nodes[0], false, Coordinates{0.5, 0.5, 0},\n                         sqrt(2));\n\n  nodes.push_back(connManager.createInterior(nodes[0], nodes[1], nodes[3]));\n  nodes.push_back(connManager.createInterior(nodes[0], nodes[3], nodes[2]));\n  return nodes;\n}\n\nint countHEdges(Graph& graph) {\n  int counter = 0;\n  for (auto n : graph) {\n    if (graph.containsNode(n) && n->getData().isHyperEdge()) {\n      ++counter;\n    }\n  }\n  return counter;\n}\n\nint countVertices(Graph& graph) {\n  int counter = 0;\n  for (auto n : graph) {\n    if (!(n->getData().isHyperEdge())) {\n      ++counter;\n    }\n  }\n  return counter;\n}\n\n#endif"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/test/utils/ConnectivityManagerTest.cpp",
    "content": "#include \"../catch.hpp\"\n#include \"../../src/productions/Production1.h\"\n#include \"../../src/model/Graph.h\"\n#include \"../../src/model/Coordinates.h\"\n#include \"../../src/model/NodeData.h\"\n#include \"../testUtils.cpp\"\n\nvector<GNode> generateSampleGraph2(Graph& graph) {\n  vector<GNode> nodes;\n  ConnectivityManager connManager{graph};\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{0, 0, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{0, 1, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{1, 0, 0}, false}));\n  nodes.push_back(\n      connManager.createNode(NodeData{false, Coordinates{1, 1, 0}, false}));\n\n  connManager.createEdge(nodes[0], nodes[1], true, Coordinates{0, 0.5, 0}, 1);\n  connManager.createEdge(nodes[1], nodes[3], true, Coordinates{0.5, 1, 0}, 1);\n  connManager.createEdge(nodes[2], nodes[3], true, Coordinates{1, 0.5, 0}, 1);\n  connManager.createEdge(nodes[0], nodes[2], true, Coordinates{0.5, 0, 0}, 1);\n  connManager.createEdge(nodes[3], nodes[0], false, Coordinates{0.5, 0.5, 0},\n                         sqrt(2));\n\n  nodes.push_back(connManager.createInterior(nodes[0], nodes[1], nodes[3]));\n  nodes.push_back(connManager.createInterior(nodes[0], nodes[3], nodes[2]));\n  return nodes;\n}\n\n// TEST_CASE( \"getSrc positive Test\" ) {\n//    galois::SharedMemSys G;\n//    Graph graph{};\n//    vector<GNode> nodes = generateSampleGraph(graph);\n//    ConnectivityManager connManager{graph};\n//\n//    auto edge = graph.findEdge(nodes[0], nodes[1]);\n//    EdgeData edgeData = graph.getEdgeData(edge);\n//\n//    REQUIRE((graph.getEdgeData(edge).getSrc()) != &(nodes[1]));\n//}\n\n// TEST_CASE( \"findSrc negative Test\" ) {\n//    galois::SharedMemSys G;\n//    Graph graph{};\n//    vector<GNode> nodes = generateSampleGraph(graph);\n//    ConnectivityManager connManager{graph};\n//\n//    auto edge = 
graph.findEdge(nodes[1], nodes[2]);\n//\n//    REQUIRE(edge.base() == edge.end());\n//}\n//\n// TEST_CASE( \"findSrc positive Test\" ) {\n//    galois::SharedMemSys G;\n//    Graph graph{};\n//    vector<GNode> nodes = generateSampleGraph(graph);\n//    ConnectivityManager connManager{graph};\n//\n//    auto edge = graph.findEdge(nodes[0], nodes[1]);\n//\n//    REQUIRE(edge.base() != edge.end());\n//}\n"
  },
  {
    "path": "lonestar/scientific/cpu/longestedge/test/utils/UtilsTest.cpp",
    "content": "#include \"../../src/utils/Utils.h\"\n#include \"../../src/model/Map.h\"\n\nTEST_CASE(\"convertToUtm Test\") {\n  double longitude     = 20.;\n  double latitude      = 50.;\n  long zone            = 34;\n  char hemisphere      = 'N';\n  double northing      = 5539109.82;\n  double easting       = 428333.55;\n  double** placeholder = (double**)malloc(sizeof(double*));\n  placeholder[0]       = (double*)malloc(sizeof(double));\n  placeholder[0][0]    = 8;\n  Map map{placeholder, 1, 1, 1., 1.};\n\n  const std::pair<double, double>& pair =\n      Utils::convertToUtm(latitude, longitude, map);\n\n  REQUIRE(fabs(pair.first - easting) < 1e-1);\n  REQUIRE(fabs(pair.second - northing) < 1e-1);\n  REQUIRE(map.getZone() == zone);\n  REQUIRE(map.getHemisphere() == hemisphere);\n}"
  },
  {
    "path": "lonestar/scientific/gpu/CMakeLists.txt",
    "content": "function(app_scientific_gpu name target_name)\n  set(options NO_GPU)\n  set(one_value_args)\n  set(multi_value_args)\n  cmake_parse_arguments(X \"${options}\" \"${one_value_args}\" \"${multi_value_args}\" ${ARGN})\n  string(CONCAT target_name ${target_name} \"-gpu\")\n  add_executable(${target_name} ${name}.cu)\n  install(TARGETS ${target_name} DESTINATION \"${CMAKE_INSTALL_BINDIR}\" EXCLUDE_FROM_ALL)\n  if(GALOIS_ENABLE_GPU AND NOT ${X_NO_GPU})\n    target_compile_options(${target_name} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-w>)\n    target_link_libraries(${target_name} Galois::gpu)\n    set_property(TARGET ${target_name} PROPERTY CUDA_STANDARD 14)\n    set_property(TARGET ${target_name} PROPERTY CUDA_SEPARABLE_COMPILATION ON)\n  endif()\nendfunction()\n\nadd_subdirectory(barneshut)\nadd_subdirectory(delaunayrefinement)\n\n"
  },
  {
    "path": "lonestar/scientific/gpu/README.md",
    "content": "Overview of LonestarGPU Scientific Benchmark Suite\n================================================================================\n\nThe LonestarGPU suite contains CUDA implementations of several\nirregular algorithms that exhibit amorphous data parallelism. Currently,\nthe LonestarGPU suite contains the following scientific applications,\nwhich can be executed on a single GPU.\n\n### Scientific Applications\n  * Barnes-Hut N-body Simulation\n  * Delaunay Mesh Refinement\n\nCompiling LonestarGPU Through CMake \n================================================================================\n\nThe dependencies for LonestarGPU suite are the same as shared-memory.\nNote that LonestarGPU requires CUDA 8.0 and above.\n\nNote that heterogeneous Galois requires the cub and moderngpu git submodules,\nwhich can be cloned using the following commands.\n\n```Shell\ncd $GALOIS_ROOT\ngit submodule init\ngit submodule update \n```\nThese modules will be available in the ${GALOIS\\_ROOT}/external directory\n\nTo build the LonestarGPU suite, first, create a build directory and\nrun CMake with -DGALOIS\\_CUDA\\_CAPABILITY=\\<insert CUDA capability here\\>\nflag in the build directory. The CUDA capability should be one that your\nGPU supports. For example, if you wanted to build for a GTX 1080 and a K80,\nthe commands would look like this:\n\n```Shell\ncd ${GALOIS_ROOT}\nmkdir build\ncd build\ncmake ${GALOIS_ROOT} -DGALOIS_CUDA_CAPABILITY=\"3.7;6.1\"\n```\n\nAfter compiling through CMake, the system will create the 'lonestar/analytics/gpu'\nand 'lonestar/scientific/gpu' directories in ${GALOIS\\_ROOT}/build directory. 
\n\nCompiling Scientific Applications\n================================================================================\n\nOnce CMake is completed, compile the provided scientific apps by executing the \nfollowing command in the ${GALOIS\\_ROOT}/build/lonestar/scientific/gpu directory.\n\n`make -j`\n\nYou can compile a specific app by executing the following commands\n(shown for barneshut).\n\n```Shell\ncd barneshut\nmake -j\n```\n\nRunning Scientific Applications\n================================================================================\n\nTo run a specific app, follow the instructions given in the README.md\nin the particular app directory. \n\nDocumentation\n================================================================================\n\nFurther documentation is available at\n[http://iss.ices.utexas.edu/?p=projects/galois/lonestargpu](http://iss.ices.utexas.edu/?p=projects/galois/lonestargpu)\n\n\n\n\n"
  },
  {
    "path": "lonestar/scientific/gpu/barneshut/CMakeLists.txt",
    "content": "app_scientific_gpu(bh barneshut)\nadd_test_gpu(barneshut rmat15 rmat15.out bh 50000 2 0)\n"
  },
  {
    "path": "lonestar/scientific/gpu/barneshut/LICENSE.md",
    "content": "CUDA BarnesHut v3.1: Simulation of the gravitational forces\nin a galactic cluster using the Barnes-Hut n-body algorithm\n\nCopyright (c) 2013, Texas State University-San Marcos. All rights reserved.\n\nRedistribution and use in source and binary forms, with or without modification,\nare permitted for academic, research, experimental, or personal use provided that\nthe following conditions are met:\n\n   * Redistributions of source code must retain the above copyright notice,\n     this list of conditions and the following disclaimer.\n   * Redistributions in binary form must reproduce the above copyright notice,\n     this list of conditions and the following disclaimer in the documentation\n     and/or other materials provided with the distribution.\n   * Neither the name of Texas State University-San Marcos nor the names of its\n     contributors may be used to endorse or promote products derived from this\n     software without specific prior written permission.\n\nFor all other uses, please contact the Office for Commercialization and Industry\nRelations at Texas State University-San Marcos <http://www.txstate.edu/ocir/>.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\nANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\nWARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED\nIN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,\nINDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\nBUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\nDATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\nLIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE\nOR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED\nOF THE POSSIBILITY OF SUCH DAMAGE.\n\nAuthor: Martin Burtscher <burtscher@txstate.edu>\n"
  },
  {
    "path": "lonestar/scientific/gpu/barneshut/README.md",
    "content": "Barnes-Hut N-body Simulation\n================================================================================\n\nDESCRIPTION\n--------------------------------------------------------------------------------\n\nThis benchmark simulates the gravitational forces acting on a galactic cluster\nusing the Barnes-Hut n-body algorithm. The positions and velocities of the n\ngalaxies are initialized according to the empirical Plummer model. The program\ncalculates the motion of each galaxy through space for a number of time steps.\nThe data parallelism in this algorithm arises primarily from the independent\nforce calculations.\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake in the BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/scientific/gpu/barneshut; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nTo run the default algorithm, use the following:\n\n-`$ ./barneshut-gpu  <bodies> <timesteps> <deviceid>`\n\n-`$ ./barneshut-gpu 50000 2 0`\n"
  },
  {
    "path": "lonestar/scientific/gpu/barneshut/bh.cu",
    "content": "/*\n * CUDA BarnesHut v3.1: Simulation of the gravitational forces\n * in a galactic cluster using the Barnes-Hut n-body algorithm\n *\n * Copyright (c) 2013, Texas State University-San Marcos. All rights reserved.\n *\n * Redistribution and use in source and binary forms, with or without modification,\n * are permitted for academic, research, experimental, or personal use provided that\n * the following conditions are met:\n *\n *    * Redistributions of source code must retain the above copyright notice,\n *      this list of conditions and the following disclaimer.\n *    * Redistributions in binary form must reproduce the above copyright notice,\n *      this list of conditions and the following disclaimer in the documentation\n *      and/or other materials provided with the distribution.\n *    * Neither the name of Texas State University-San Marcos nor the names of its\n *      contributors may be used to endorse or promote products derived from this\n *      software without specific prior written permission.\n *\n * For all other uses, please contact the Office for Commercialization and Industry\n * Relations at Texas State University-San Marcos <http://www.txstate.edu/ocir/>.\n *\n * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND\n * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED\n * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED\n * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,\n * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\n * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE\n * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED\n * OF THE 
POSSIBILITY OF SUCH DAMAGE.\n *\n * Author: Martin Burtscher <burtscher@txstate.edu>\n *\n */\n\n#include <stdlib.h>\n#include <stdio.h>\n#include <math.h>\n#include <sys/time.h>\n#include <cuda.h>\n#include <assert.h>\n#include \"cuda_launch_config.hpp\"\n#include \"bh_tuning.h\"\n\n#define WARPSIZE 32\n#define MAXDEPTH 32\n\n__device__ volatile int stepd, bottomd, maxdepthd;\n__device__ unsigned int blkcntd;\n__device__ volatile float radiusd;\n\n\n/******************************************************************************/\n/*** initialize memory ********************************************************/\n/******************************************************************************/\n\n__global__ void InitializationKernel(int * __restrict errd)\n{\n  *errd = 0;\n  stepd = -1;\n  maxdepthd = 1;\n  blkcntd = 0;\n}\n\n\n/******************************************************************************/\n/*** compute center and radius ************************************************/\n/******************************************************************************/\n\n__global__\n__launch_bounds__(THREADS1, FACTOR1)\nvoid BoundingBoxKernel(int nnodesd, int nbodiesd, volatile int * __restrict startd, volatile int * __restrict childd, volatile float * __restrict massd, volatile float * __restrict posxd, volatile float * __restrict posyd, volatile float * __restrict poszd, volatile float * __restrict maxxd, volatile float * __restrict maxyd, volatile float * __restrict maxzd, volatile float * __restrict minxd, volatile float * __restrict minyd, volatile float * __restrict minzd)\n{\n  register int i, j, k, inc;\n  register float val, minx, maxx, miny, maxy, minz, maxz;\n  __shared__ volatile float sminx[THREADS1], smaxx[THREADS1], sminy[THREADS1], smaxy[THREADS1], sminz[THREADS1], smaxz[THREADS1];\n\n  // initialize with valid data (in case #bodies < #threads)\n  minx = maxx = posxd[0];\n  miny = maxy = posyd[0];\n  minz = maxz = poszd[0];\n\n  // scan all bodies\n  i 
= threadIdx.x;\n  inc = THREADS1 * gridDim.x;\n  for (j = i + blockIdx.x * THREADS1; j < nbodiesd; j += inc) {\n    val = posxd[j];\n    minx = fminf(minx, val);\n    maxx = fmaxf(maxx, val);\n    val = posyd[j];\n    miny = fminf(miny, val);\n    maxy = fmaxf(maxy, val);\n    val = poszd[j];\n    minz = fminf(minz, val);\n    maxz = fmaxf(maxz, val);\n  }\n\n  // reduction in shared memory\n  sminx[i] = minx;\n  smaxx[i] = maxx;\n  sminy[i] = miny;\n  smaxy[i] = maxy;\n  sminz[i] = minz;\n  smaxz[i] = maxz;\n\n  for (j = THREADS1 / 2; j > 0; j /= 2) {\n    __syncthreads();\n    if (i < j) {\n      k = i + j;\n      sminx[i] = minx = fminf(minx, sminx[k]);\n      smaxx[i] = maxx = fmaxf(maxx, smaxx[k]);\n      sminy[i] = miny = fminf(miny, sminy[k]);\n      smaxy[i] = maxy = fmaxf(maxy, smaxy[k]);\n      sminz[i] = minz = fminf(minz, sminz[k]);\n      smaxz[i] = maxz = fmaxf(maxz, smaxz[k]);\n    }\n  }\n\n  // write block result to global memory\n  if (i == 0) {\n    k = blockIdx.x;\n    minxd[k] = minx;\n    maxxd[k] = maxx;\n    minyd[k] = miny;\n    maxyd[k] = maxy;\n    minzd[k] = minz;\n    maxzd[k] = maxz;\n    __threadfence();\n\n    inc = gridDim.x - 1;\n    if (inc == atomicInc(&blkcntd, inc)) {\n      // I'm the last block, so combine all block results\n      for (j = 0; j <= inc; j++) {\n        minx = fminf(minx, minxd[j]);\n        maxx = fmaxf(maxx, maxxd[j]);\n        miny = fminf(miny, minyd[j]);\n        maxy = fmaxf(maxy, maxyd[j]);\n        minz = fminf(minz, minzd[j]);\n        maxz = fmaxf(maxz, maxzd[j]);\n      }\n\n      // compute 'radius'\n      val = fmaxf(maxx - minx, maxy - miny);\n      radiusd = fmaxf(val, maxz - minz) * 0.5f;\n\n      // create root node\n      k = nnodesd;\n      bottomd = k;\n\n      massd[k] = -1.0f;\n      startd[k] = 0;\n      posxd[k] = (minx + maxx) * 0.5f;\n      posyd[k] = (miny + maxy) * 0.5f;\n      poszd[k] = (minz + maxz) * 0.5f;\n      k *= 8;\n      for (i = 0; i < 8; i++) childd[k + i] = -1;\n\n      
stepd++;\n    }\n  }\n}\n\n\n/******************************************************************************/\n/*** build tree ***************************************************************/\n/******************************************************************************/\n\n__global__\n__launch_bounds__(1024, 1)\nvoid ClearKernel1(int nnodesd, int nbodiesd, volatile int * __restrict childd)\n{\n  register int k, inc, top, bottom;\n\n  top = 8 * nnodesd;\n  bottom = 8 * nbodiesd;\n  inc = blockDim.x * gridDim.x;\n  k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x;  // align to warp size\n  if (k < bottom) k += inc;\n\n  // iterate over all cells assigned to thread\n  while (k < top) {\n    childd[k] = -1;\n    k += inc;\n  }\n}\n\n\n__global__\n__launch_bounds__(THREADS2, FACTOR2)\nvoid TreeBuildingKernel(int nnodesd, int nbodiesd, volatile int * __restrict errd, volatile int * __restrict childd, volatile float * __restrict posxd, volatile float * __restrict posyd, volatile float * __restrict poszd)\n{\n  register int i, j, depth, localmaxdepth, skip, inc;\n  register float x, y, z, r;\n  register float px, py, pz;\n  register float dx, dy, dz;\n  register int ch, n, cell, locked, patch;\n  register float radius, rootx, rooty, rootz;\n\n  // cache root data\n  radius = radiusd;\n  rootx = posxd[nnodesd];\n  rooty = posyd[nnodesd];\n  rootz = poszd[nnodesd];\n\n  localmaxdepth = 1;\n  skip = 1;\n  inc = blockDim.x * gridDim.x;\n  i = threadIdx.x + blockIdx.x * blockDim.x;\n\n  // iterate over all bodies assigned to thread\n  while (i < nbodiesd) {\n    if (skip != 0) {\n      // new body, so start traversing at root\n      skip = 0;\n      px = posxd[i];\n      py = posyd[i];\n      pz = poszd[i];\n      n = nnodesd;\n      depth = 1;\n      r = radius * 0.5f;\n      dx = dy = dz = -r;\n      j = 0;\n      // determine which child to follow\n      if (rootx < px) {j = 1; dx = r;}\n      if (rooty < py) {j |= 2; dy = r;}\n      if (rootz < pz) {j 
|= 4; dz = r;}\n      x = rootx + dx;\n      y = rooty + dy;\n      z = rootz + dz;\n    }\n\n    // follow path to leaf cell\n    ch = childd[n*8+j];\n    while (ch >= nbodiesd) {\n      n = ch;\n      depth++;\n      r *= 0.5f;\n      dx = dy = dz = -r;\n      j = 0;\n      // determine which child to follow\n      if (x < px) {j = 1; dx = r;}\n      if (y < py) {j |= 2; dy = r;}\n      if (z < pz) {j |= 4; dz = r;}\n      x += dx;\n      y += dy;\n      z += dz;\n      ch = childd[n*8+j];\n    }\n\n    if (ch != -2) {  // skip if child pointer is locked and try again later\n      locked = n*8+j;\n      if (ch == -1) {\n        if (-1 == atomicCAS((int *)&childd[locked], -1, i)) {  // if null, just insert the new body\n          localmaxdepth = max(depth, localmaxdepth);\n          i += inc;  // move on to next body\n          skip = 1;\n        }\n      } else {  // there already is a body in this position\n        if (ch == atomicCAS((int *)&childd[locked], ch, -2)) {  // try to lock\n          patch = -1;\n          // create new cell(s) and insert the old and new body\n          do {\n            depth++;\n\n            cell = atomicSub((int *)&bottomd, 1) - 1;\n            if (cell <= nbodiesd) {\n              *errd = 1;\n              bottomd = nnodesd;\n            }\n\n            if (patch != -1) {\n              childd[n*8+j] = cell;\n            }\n            patch = max(patch, cell);\n\n            j = 0;\n            if (x < posxd[ch]) j = 1;\n            if (y < posyd[ch]) j |= 2;\n            if (z < poszd[ch]) j |= 4;\n            childd[cell*8+j] = ch;\n\n            n = cell;\n            r *= 0.5f;\n            dx = dy = dz = -r;\n            j = 0;\n            if (x < px) {j = 1; dx = r;}\n            if (y < py) {j |= 2; dy = r;}\n            if (z < pz) {j |= 4; dz = r;}\n            x += dx;\n            y += dy;\n            z += dz;\n\n            ch = childd[n*8+j];\n            // repeat until the two bodies are different children\n  
        } while (ch >= 0);\n          childd[n*8+j] = i;\n\n          localmaxdepth = max(depth, localmaxdepth);\n          i += inc;  // move on to next body\n          skip = 2;\n        }\n      }\n    }\n    __syncthreads();  // __threadfence();\n\n    if (skip == 2) {\n      childd[locked] = patch;\n    }\n  }\n  // record maximum tree depth\n  atomicMax((int *)&maxdepthd, localmaxdepth);\n}\n\n\n__global__\n__launch_bounds__(1024, 1)\nvoid ClearKernel2(int nnodesd, volatile int * __restrict startd, volatile float * __restrict massd)\n{\n  register int k, inc, bottom;\n\n  bottom = bottomd;\n  inc = blockDim.x * gridDim.x;\n  k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x;  // align to warp size\n  if (k < bottom) k += inc;\n\n  // iterate over all cells assigned to thread\n  while (k < nnodesd) {\n    massd[k] = -1.0f;\n    startd[k] = -1;\n    k += inc;\n  }\n}\n\n\n/******************************************************************************/\n/*** compute center of mass ***************************************************/\n/******************************************************************************/\n\n__global__\n__launch_bounds__(THREADS3, FACTOR3)\nvoid SummarizationKernel(const int nnodesd, const int nbodiesd, volatile int * __restrict countd, const int * __restrict childd, volatile float * __restrict massd, volatile float * __restrict posxd, volatile float * __restrict posyd, volatile float * __restrict poszd)\n{\n  register int i, j, k, ch, inc, cnt, bottom, flag;\n  register float m, cm, px, py, pz;\n  __shared__ int child[THREADS3 * 8];\n  __shared__ float mass[THREADS3 * 8];\n\n  bottom = bottomd;\n  inc = blockDim.x * gridDim.x;\n  k = (bottom & (-WARPSIZE)) + threadIdx.x + blockIdx.x * blockDim.x;  // align to warp size\n  if (k < bottom) k += inc;\n\n  register int restart = k;\n  for (j = 0; j < 5; j++) {  // wait-free pre-passes\n    // iterate over all cells assigned to thread\n    while (k <= nnodesd) {\n      if 
(massd[k] < 0.0f) {\n        for (i = 0; i < 8; i++) {\n          ch = childd[k*8+i];\n          child[i*THREADS3+threadIdx.x] = ch;  // cache children\n          if ((ch >= nbodiesd) && ((mass[i*THREADS3+threadIdx.x] = massd[ch]) < 0.0f)) {\n            break;\n          }\n        }\n        if (i == 8) {\n          // all children are ready\n          cm = 0.0f;\n          px = 0.0f;\n          py = 0.0f;\n          pz = 0.0f;\n          cnt = 0;\n          for (i = 0; i < 8; i++) {\n            ch = child[i*THREADS3+threadIdx.x];\n            if (ch >= 0) {\n              if (ch >= nbodiesd) {  // count bodies (needed later)\n                m = mass[i*THREADS3+threadIdx.x];\n                cnt += countd[ch];\n              } else {\n                m = massd[ch];\n                cnt++;\n              }\n              // add child's contribution\n              cm += m;\n              px += posxd[ch] * m;\n              py += posyd[ch] * m;\n              pz += poszd[ch] * m;\n            }\n          }\n          countd[k] = cnt;\n          m = 1.0f / cm;\n          posxd[k] = px * m;\n          posyd[k] = py * m;\n          poszd[k] = pz * m;\n          __threadfence();  // make sure data are visible before setting mass\n          massd[k] = cm;\n        }\n      }\n      k += inc;  // move on to next cell\n    }\n    k = restart;\n  }\n\n  flag = 0;\n  j = 0;\n  // iterate over all cells assigned to thread\n  while (k <= nnodesd) {\n    if (massd[k] >= 0.0f) {\n      k += inc;\n    } else {\n      if (j == 0) {\n        j = 8;\n        for (i = 0; i < 8; i++) {\n          ch = childd[k*8+i];\n          child[i*THREADS3+threadIdx.x] = ch;  // cache children\n          if ((ch < nbodiesd) || ((mass[i*THREADS3+threadIdx.x] = massd[ch]) >= 0.0f)) {\n            j--;\n          }\n        }\n      } else {\n        j = 8;\n        for (i = 0; i < 8; i++) {\n          ch = child[i*THREADS3+threadIdx.x];\n          if ((ch < nbodiesd) || 
(mass[i*THREADS3+threadIdx.x] >= 0.0f) || ((mass[i*THREADS3+threadIdx.x] = massd[ch]) >= 0.0f)) {\n            j--;\n          }\n        }\n      }\n\n      if (j == 0) {\n        // all children are ready\n        cm = 0.0f;\n        px = 0.0f;\n        py = 0.0f;\n        pz = 0.0f;\n        cnt = 0;\n        for (i = 0; i < 8; i++) {\n          ch = child[i*THREADS3+threadIdx.x];\n          if (ch >= 0) {\n            if (ch >= nbodiesd) {  // count bodies (needed later)\n              m = mass[i*THREADS3+threadIdx.x];\n              cnt += countd[ch];\n            } else {\n              m = massd[ch];\n              cnt++;\n            }\n            // add child's contribution\n            cm += m;\n            px += posxd[ch] * m;\n            py += posyd[ch] * m;\n            pz += poszd[ch] * m;\n          }\n        }\n        countd[k] = cnt;\n        m = 1.0f / cm;\n        posxd[k] = px * m;\n        posyd[k] = py * m;\n        poszd[k] = pz * m;\n        flag = 1;\n      }\n    }\n    __syncthreads();  // __threadfence();\n    if (flag != 0) {\n      massd[k] = cm;\n      k += inc;\n      flag = 0;\n    }\n  }\n}\n\n\n/******************************************************************************/\n/*** sort bodies **************************************************************/\n/******************************************************************************/\n\n__global__\n__launch_bounds__(THREADS4, FACTOR4)\nvoid SortKernel(int nnodesd, int nbodiesd, int * __restrict sortd, int * __restrict countd, volatile int * __restrict startd, int * __restrict childd)\n{\n  register int i, j, k, ch, dec, start, bottom;\n\n  bottom = bottomd;\n  dec = blockDim.x * gridDim.x;\n  k = nnodesd + 1 - dec + threadIdx.x + blockIdx.x * blockDim.x;\n\n  // iterate over all cells assigned to thread\n  while (k >= bottom) {\n    start = startd[k];\n    if (start >= 0) {\n      j = 0;\n      for (i = 0; i < 8; i++) {\n        ch = childd[k*8+i];\n        if (ch >= 0) {\n   
       if (i != j) {\n            // move children to front (needed later for speed)\n            childd[k*8+i] = -1;\n            childd[k*8+j] = ch;\n          }\n          j++;\n          if (ch >= nbodiesd) {\n            // child is a cell\n            startd[ch] = start;  // set start ID of child\n            start += countd[ch];  // add #bodies in subtree\n          } else {\n            // child is a body\n            sortd[start] = ch;  // record body in 'sorted' array\n            start++;\n          }\n        }\n      }\n      k -= dec;  // move on to next cell\n    }\n  }\n}\n\n\n/******************************************************************************/\n/*** compute force ************************************************************/\n/******************************************************************************/\n\n__global__\n__launch_bounds__(THREADS5, FACTOR5)\nvoid ForceCalculationKernel(int nnodesd, int nbodiesd, volatile int * __restrict errd, float dthfd, float itolsqd, float epssqd, volatile int * __restrict sortd, volatile int * __restrict childd, volatile float * __restrict massd, volatile float * __restrict posxd, volatile float * __restrict posyd, volatile float * __restrict poszd, volatile float * __restrict velxd, volatile float * __restrict velyd, volatile float * __restrict velzd, volatile float * __restrict accxd, volatile float * __restrict accyd, volatile float * __restrict acczd)\n{\n  register int i, j, k, n, depth, base, sbase, diff, pd, nd;\n  register float px, py, pz, ax, ay, az, dx, dy, dz, tmp;\n  __shared__ volatile int pos[MAXDEPTH * THREADS5/WARPSIZE], node[MAXDEPTH * THREADS5/WARPSIZE];\n  __shared__ float dq[MAXDEPTH * THREADS5/WARPSIZE];\n\n  if (0 == threadIdx.x) {\n    tmp = radiusd * 2;\n    // precompute values that depend only on tree level\n    dq[0] = tmp * tmp * itolsqd;\n    for (i = 1; i < maxdepthd; i++) {\n      dq[i] = dq[i - 1] * 0.25f;\n      dq[i - 1] += epssqd;\n    }\n    dq[i - 1] += 
epssqd;\n\n    if (maxdepthd > MAXDEPTH) {\n      *errd = maxdepthd;\n    }\n  }\n  __syncthreads();\n\n  if (maxdepthd <= MAXDEPTH) {\n    // figure out first thread in each warp (lane 0)\n    base = threadIdx.x / WARPSIZE;\n    sbase = base * WARPSIZE;\n    j = base * MAXDEPTH;\n\n    diff = threadIdx.x - sbase;\n    // make multiple copies to avoid index calculations later\n    if (diff < MAXDEPTH) {\n      dq[diff+j] = dq[diff];\n    }\n    __syncthreads();\n    __threadfence_block();\n\n    // iterate over all bodies assigned to thread\n    for (k = threadIdx.x + blockIdx.x * blockDim.x; k < nbodiesd; k += blockDim.x * gridDim.x) {\n      i = sortd[k];  // get permuted/sorted index\n      // cache position info\n      px = posxd[i];\n      py = posyd[i];\n      pz = poszd[i];\n\n      ax = 0.0f;\n      ay = 0.0f;\n      az = 0.0f;\n\n      // initialize iteration stack, i.e., push root node onto stack\n      depth = j;\n      if (sbase == threadIdx.x) {\n        pos[j] = 0;\n        node[j] = nnodesd * 8;\n      }\n\n      do {\n        // stack is not empty\n        pd = pos[depth];\n        nd = node[depth];\n        while (pd < 8) {\n          // node on top of stack has more children to process\n          n = childd[nd + pd];  // load child pointer\n          pd++;\n\n          if (n >= 0) {\n            dx = posxd[n] - px;\n            dy = posyd[n] - py;\n            dz = poszd[n] - pz;\n            tmp = dx*dx + (dy*dy + (dz*dz + epssqd));  // compute distance squared (plus softening)\n            if ((n < nbodiesd) || __all_sync(0xffffffff, tmp >= dq[depth])) {  // check if all threads agree that cell is far enough away (or is a body)\n              tmp = rsqrtf(tmp);  // compute distance\n              tmp = massd[n] * tmp * tmp * tmp;\n              ax += dx * tmp;\n              ay += dy * tmp;\n              az += dz * tmp;\n            } else {\n              // push cell onto stack\n              if (sbase == threadIdx.x) {  // maybe don't push 
and inc if last child\n                pos[depth] = pd;\n                node[depth] = nd;\n              }\n              depth++;\n              pd = 0;\n              nd = n * 8;\n            }\n          } else {\n            pd = 8;  // early out because all remaining children are also zero\n          }\n        }\n        depth--;  // done with this level\n      } while (depth >= j);\n\n      if (stepd > 0) {\n        // update velocity\n        velxd[i] += (ax - accxd[i]) * dthfd;\n        velyd[i] += (ay - accyd[i]) * dthfd;\n        velzd[i] += (az - acczd[i]) * dthfd;\n      }\n\n      // save computed acceleration\n      accxd[i] = ax;\n      accyd[i] = ay;\n      acczd[i] = az;\n    }\n  }\n}\n\n\n/******************************************************************************/\n/*** advance bodies ***********************************************************/\n/******************************************************************************/\n\n__global__\n__launch_bounds__(THREADS6, FACTOR6)\nvoid IntegrationKernel(int nbodiesd, float dtimed, float dthfd, volatile float * __restrict posxd, volatile float * __restrict posyd, volatile float * __restrict poszd, volatile float * __restrict velxd, volatile float * __restrict velyd, volatile float * __restrict velzd, volatile float * __restrict accxd, volatile float * __restrict accyd, volatile float * __restrict acczd)\n{\n  register int i, inc;\n  register float dvelx, dvely, dvelz;\n  register float velhx, velhy, velhz;\n\n  // iterate over all bodies assigned to thread\n  inc = blockDim.x * gridDim.x;\n  for (i = threadIdx.x + blockIdx.x * blockDim.x; i < nbodiesd; i += inc) {\n    // integrate\n    dvelx = accxd[i] * dthfd;\n    dvely = accyd[i] * dthfd;\n    dvelz = acczd[i] * dthfd;\n\n    velhx = velxd[i] + dvelx;\n    velhy = velyd[i] + dvely;\n    velhz = velzd[i] + dvelz;\n\n    posxd[i] += velhx * dtimed;\n    posyd[i] += velhy * dtimed;\n    poszd[i] += velhz * dtimed;\n\n    velxd[i] = velhx + 
dvelx;\n    velyd[i] = velhy + dvely;\n    velzd[i] = velhz + dvelz;\n  }\n}\n\n\n/******************************************************************************/\n\nstatic void CudaTest(char *msg)\n{\n  cudaError_t e;\n\n  cudaThreadSynchronize();\n  if (cudaSuccess != (e = cudaGetLastError())) {\n    fprintf(stderr, \"%s: %d\\n\", msg, e);\n    fprintf(stderr, \"%s\\n\", cudaGetErrorString(e));\n    exit(-1);\n  }\n}\n\n\n/******************************************************************************/\n\n// random number generator\n\n#define MULT 1103515245\n#define ADD 12345\n#define MASK 0x7FFFFFFF\n#define TWOTO31 2147483648.0\n\nstatic int A = 1;\nstatic int B = 0;\nstatic int randx = 1;\nstatic int lastrand;\n\n\nstatic void drndset(int seed)\n{\n   A = 1;\n   B = 0;\n   randx = (A * seed + B) & MASK;\n   A = (MULT * A) & MASK;\n   B = (MULT * B + ADD) & MASK;\n}\n\n\nstatic double drnd()\n{\n   lastrand = randx;\n   randx = (A * randx + B) & MASK;\n   return (double)lastrand / TWOTO31;\n}\n\n\n/******************************************************************************/\n\nint main(int argc, char *argv[])\n{\n  register int i, run, blocks;\n  int nnodes, nbodies, step, timesteps;\n  register double runtime;\n  int error;\n  register float dtime, dthf, epssq, itolsq;\n  float time, timing[7];\n  cudaEvent_t start, stop;\n  float *mass, *posx, *posy, *posz, *velx, *vely, *velz;\n\n  int *errl, *sortl, *childl, *countl, *startl;\n  float *massl;\n  float *posxl, *posyl, *poszl;\n  float *velxl, *velyl, *velzl;\n  float *accxl, *accyl, *acczl;\n  float *maxxl, *maxyl, *maxzl;\n  float *minxl, *minyl, *minzl;\n  register double rsc, vsc, r, v, x, y, z, sq, scale;\n\n  // perform some checks\n\n  printf(\"CUDA BarnesHut v3.1 \");\n#ifdef __KEPLER__\n  printf(\"[Kepler]\\n\");\n#else\n  printf(\"[Fermi]\\n\");\n#endif\n  printf(\"Copyright (c) 2013, Texas State University-San Marcos. 
All rights reserved.\\n\");\n  fflush(stdout);\n  if (argc != 4) {\n    fprintf(stderr, \"\\n\");\n    fprintf(stderr, \"arguments: number_of_bodies number_of_timesteps device\\n\");\n    exit(-1);\n  }\n\n  int deviceCount;\n  cudaGetDeviceCount(&deviceCount);\n  if (deviceCount == 0) {\n    fprintf(stderr, \"There is no device supporting CUDA\\n\");\n    exit(-1);\n  }\n\n  const int dev = atoi(argv[3]);\n  if ((dev < 0) || (deviceCount <= dev)) {\n    fprintf(stderr, \"There is no device %d\\n\", dev);\n    exit(-1);\n  }\n  cudaSetDevice(dev);\n\n  cudaDeviceProp deviceProp;\n  cudaGetDeviceProperties(&deviceProp, dev);\n  if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) {\n    fprintf(stderr, \"There is no CUDA capable device\\n\");\n    exit(-1);\n  }\n  if (deviceProp.major < 2) {\n    fprintf(stderr, \"Need at least compute capability 2.0\\n\");\n    exit(-1);\n  }\n  if (deviceProp.warpSize != WARPSIZE) {\n    fprintf(stderr, \"Warp size must be %d\\n\", deviceProp.warpSize);\n    exit(-1);\n  }\n\n  blocks = deviceProp.multiProcessorCount;\n//  fprintf(stderr, \"blocks = %d\\n\", blocks);\n\n  if ((WARPSIZE <= 0) || (WARPSIZE & (WARPSIZE-1) != 0)) {\n    fprintf(stderr, \"Warp size must be greater than zero and a power of two\\n\");\n    exit(-1);\n  }\n  if (MAXDEPTH > WARPSIZE) {\n    fprintf(stderr, \"MAXDEPTH must be less than or equal to WARPSIZE\\n\");\n    exit(-1);\n  }\n  if ((THREADS1 <= 0) || (THREADS1 & (THREADS1-1) != 0)) {\n    fprintf(stderr, \"THREADS1 must be greater than zero and a power of two\\n\");\n    exit(-1);\n  }\n\n  // set L1/shared memory configuration\n  cudaFuncSetCacheConfig(BoundingBoxKernel, cudaFuncCachePreferShared);\n  cudaFuncSetCacheConfig(TreeBuildingKernel, cudaFuncCachePreferL1);\n  cudaFuncSetCacheConfig(ClearKernel1, cudaFuncCachePreferL1);\n  cudaFuncSetCacheConfig(ClearKernel2, cudaFuncCachePreferL1);\n  cudaFuncSetCacheConfig(SummarizationKernel, cudaFuncCachePreferShared);\n  
cudaFuncSetCacheConfig(SortKernel, cudaFuncCachePreferL1);\n#ifdef __KEPLER__\n  cudaFuncSetCacheConfig(ForceCalculationKernel, cudaFuncCachePreferEqual);\n#else\n  cudaFuncSetCacheConfig(ForceCalculationKernel, cudaFuncCachePreferL1);\n#endif\n  cudaFuncSetCacheConfig(IntegrationKernel, cudaFuncCachePreferL1);\n\n  cudaGetLastError();  // reset error value\n  for (run = 0; run < 3; run++) {\n    for (i = 0; i < 7; i++) timing[i] = 0.0f;\n\n    nbodies = atoi(argv[1]);\n    if (nbodies < 1) {\n      fprintf(stderr, \"nbodies is too small: %d\\n\", nbodies);\n      exit(-1);\n    }\n    if (nbodies > (1 << 30)) {\n      fprintf(stderr, \"nbodies is too large: %d\\n\", nbodies);\n      exit(-1);\n    }\n    nnodes = nbodies * 2;\n    if (nnodes < 1024*blocks) nnodes = 1024*blocks;\n    while ((nnodes & (WARPSIZE-1)) != 0) nnodes++;\n    nnodes--;\n\n    timesteps = atoi(argv[2]);\n    dtime = 0.025;  dthf = dtime * 0.5f;\n    epssq = 0.05 * 0.05;\n    itolsq = 1.0f / (0.5 * 0.5);\n\n    // allocate memory\n\n    if (run == 0) {\n      printf(\"configuration: %d bodies, %d time steps\\n\", nbodies, timesteps);\n\n      mass = (float *)malloc(sizeof(float) * nbodies);\n      if (mass == NULL) {fprintf(stderr, \"cannot allocate mass\\n\");  exit(-1);}\n      posx = (float *)malloc(sizeof(float) * nbodies);\n      if (posx == NULL) {fprintf(stderr, \"cannot allocate posx\\n\");  exit(-1);}\n      posy = (float *)malloc(sizeof(float) * nbodies);\n      if (posy == NULL) {fprintf(stderr, \"cannot allocate posy\\n\");  exit(-1);}\n      posz = (float *)malloc(sizeof(float) * nbodies);\n      if (posz == NULL) {fprintf(stderr, \"cannot allocate posz\\n\");  exit(-1);}\n      velx = (float *)malloc(sizeof(float) * nbodies);\n      if (velx == NULL) {fprintf(stderr, \"cannot allocate velx\\n\");  exit(-1);}\n      vely = (float *)malloc(sizeof(float) * nbodies);\n      if (vely == NULL) {fprintf(stderr, \"cannot allocate vely\\n\");  exit(-1);}\n      velz = (float 
*)malloc(sizeof(float) * nbodies);\n      if (velz == NULL) {fprintf(stderr, \"cannot allocate velz\\n\");  exit(-1);}\n\n      if (cudaSuccess != cudaMalloc((void **)&errl, sizeof(int))) fprintf(stderr, \"could not allocate errd\\n\");  CudaTest(\"couldn't allocate errd\");\n      if (cudaSuccess != cudaMalloc((void **)&childl, sizeof(int) * (nnodes+1) * 8)) fprintf(stderr, \"could not allocate childd\\n\");  CudaTest(\"couldn't allocate childd\");\n      if (cudaSuccess != cudaMalloc((void **)&massl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate massd\\n\");  CudaTest(\"couldn't allocate massd\");\n      if (cudaSuccess != cudaMalloc((void **)&posxl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate posxd\\n\");  CudaTest(\"couldn't allocate posxd\");\n      if (cudaSuccess != cudaMalloc((void **)&posyl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate posyd\\n\");  CudaTest(\"couldn't allocate posyd\");\n      if (cudaSuccess != cudaMalloc((void **)&poszl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate poszd\\n\");  CudaTest(\"couldn't allocate poszd\");\n      if (cudaSuccess != cudaMalloc((void **)&velxl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate velxd\\n\");  CudaTest(\"couldn't allocate velxd\");\n      if (cudaSuccess != cudaMalloc((void **)&velyl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate velyd\\n\");  CudaTest(\"couldn't allocate velyd\");\n      if (cudaSuccess != cudaMalloc((void **)&velzl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate velzd\\n\");  CudaTest(\"couldn't allocate velzd\");\n      if (cudaSuccess != cudaMalloc((void **)&accxl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate accxd\\n\");  CudaTest(\"couldn't allocate accxd\");\n      if (cudaSuccess != cudaMalloc((void **)&accyl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate accyd\\n\");  CudaTest(\"couldn't 
allocate accyd\");\n      if (cudaSuccess != cudaMalloc((void **)&acczl, sizeof(float) * (nnodes+1))) fprintf(stderr, \"could not allocate acczd\\n\");  CudaTest(\"couldn't allocate acczd\");\n      if (cudaSuccess != cudaMalloc((void **)&countl, sizeof(int) * (nnodes+1))) fprintf(stderr, \"could not allocate countd\\n\");  CudaTest(\"couldn't allocate countd\");\n      if (cudaSuccess != cudaMalloc((void **)&startl, sizeof(int) * (nnodes+1))) fprintf(stderr, \"could not allocate startd\\n\");  CudaTest(\"couldn't allocate startd\");\n      if (cudaSuccess != cudaMalloc((void **)&sortl, sizeof(int) * (nnodes+1))) fprintf(stderr, \"could not allocate sortd\\n\");  CudaTest(\"couldn't allocate sortd\");\n\n      if (cudaSuccess != cudaMalloc((void **)&maxxl, sizeof(float) * blocks * FACTOR1)) fprintf(stderr, \"could not allocate maxxd\\n\");  CudaTest(\"couldn't allocate maxxd\");\n      if (cudaSuccess != cudaMalloc((void **)&maxyl, sizeof(float) * blocks * FACTOR1)) fprintf(stderr, \"could not allocate maxyd\\n\");  CudaTest(\"couldn't allocate maxyd\");\n      if (cudaSuccess != cudaMalloc((void **)&maxzl, sizeof(float) * blocks * FACTOR1)) fprintf(stderr, \"could not allocate maxzd\\n\");  CudaTest(\"couldn't allocate maxzd\");\n      if (cudaSuccess != cudaMalloc((void **)&minxl, sizeof(float) * blocks * FACTOR1)) fprintf(stderr, \"could not allocate minxd\\n\");  CudaTest(\"couldn't allocate minxd\");\n      if (cudaSuccess != cudaMalloc((void **)&minyl, sizeof(float) * blocks * FACTOR1)) fprintf(stderr, \"could not allocate minyd\\n\");  CudaTest(\"couldn't allocate minyd\");\n      if (cudaSuccess != cudaMalloc((void **)&minzl, sizeof(float) * blocks * FACTOR1)) fprintf(stderr, \"could not allocate minzd\\n\");  CudaTest(\"couldn't allocate minzd\");\n    }\n\n    // generate input\n\n    drndset(7);\n    printf(\"Generating Input: \\n\");\n    rsc = (3 * 3.1415926535897932384626433832795) / 16;\n    vsc = sqrt(1.0 / rsc);\n    for (i = 0; i < nbodies; i++) 
{\n      mass[i] = 1.0 / nbodies;\n      r = 1.0 / sqrt(pow(drnd()*0.999, -2.0/3.0) - 1);\n      do {\n        x = drnd()*2.0 - 1.0;\n        y = drnd()*2.0 - 1.0;\n        z = drnd()*2.0 - 1.0;\n        sq = x*x + y*y + z*z;\n      } while (sq > 1.0);\n      scale = rsc * r / sqrt(sq);\n      posx[i] = x * scale;\n      posy[i] = y * scale;\n      posz[i] = z * scale;\n\n      do {\n        x = drnd();\n        y = drnd() * 0.1;\n      } while (y > x*x * pow(1 - x*x, 3.5));\n      v = x * sqrt(2.0 / sqrt(1 + r*r));\n      do {\n        x = drnd()*2.0 - 1.0;\n        y = drnd()*2.0 - 1.0;\n        z = drnd()*2.0 - 1.0;\n        sq = x*x + y*y + z*z;\n      } while (sq > 1.0);\n      scale = vsc * v / sqrt(sq);\n      velx[i] = x * scale;\n      vely[i] = y * scale;\n      velz[i] = z * scale;\n    }\n\n    if (cudaSuccess != cudaMemcpy(massl, mass, sizeof(float) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, \"copying of mass to device failed\\n\");  CudaTest(\"mass copy to device failed\");\n    if (cudaSuccess != cudaMemcpy(posxl, posx, sizeof(float) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, \"copying of posx to device failed\\n\");  CudaTest(\"posx copy to device failed\");\n    if (cudaSuccess != cudaMemcpy(posyl, posy, sizeof(float) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, \"copying of posy to device failed\\n\");  CudaTest(\"posy copy to device failed\");\n    if (cudaSuccess != cudaMemcpy(poszl, posz, sizeof(float) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, \"copying of posz to device failed\\n\");  CudaTest(\"posz copy to device failed\");\n    if (cudaSuccess != cudaMemcpy(velxl, velx, sizeof(float) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, \"copying of velx to device failed\\n\");  CudaTest(\"velx copy to device failed\");\n    if (cudaSuccess != cudaMemcpy(velyl, vely, sizeof(float) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, \"copying of vely to device failed\\n\");  CudaTest(\"vely copy to device 
failed\");\n    if (cudaSuccess != cudaMemcpy(velzl, velz, sizeof(float) * nbodies, cudaMemcpyHostToDevice)) fprintf(stderr, \"copying of velz to device failed\\n\");  CudaTest(\"velz copy to device failed\");\n\n    // run timesteps (launch GPU kernels)\n\n    cudaEventCreate(&start);  cudaEventCreate(&stop);\n    struct timeval starttime, endtime;\n    gettimeofday(&starttime, NULL);\n\n    cudaEventRecord(start, 0);\n    InitializationKernel<<<1, 1>>>(errl);\n    cudaEventRecord(stop, 0);  cudaEventSynchronize(stop);  cudaEventElapsedTime(&time, start, stop);\n    timing[0] += time;\n    CudaTest(\"kernel 0 launch failed\");\n\n    for (step = 0; step < timesteps; step++) {\n      //fprintf(stderr, \"BBKernel\\n\");\n      cudaEventRecord(start, 0);\n      BoundingBoxKernel<<<blocks * FACTOR1, THREADS1>>>(nnodes, nbodies, startl, childl, massl, posxl, posyl, poszl, maxxl, maxyl, maxzl, minxl, minyl, minzl);\n      cudaEventRecord(stop, 0);  cudaEventSynchronize(stop);  cudaEventElapsedTime(&time, start, stop);\n      timing[1] += time;\n      CudaTest(\"kernel 1 launch failed\");\n\n      //fprintf(stderr, \"TBKernel\\n\");\n      cudaEventRecord(start, 0);\n      ClearKernel1<<<blocks * 1, 1024>>>(nnodes, nbodies, childl);\n      TreeBuildingKernel<<<blocks * FACTOR2, THREADS2>>>(nnodes, nbodies, errl, childl, posxl, posyl, poszl);\n      ClearKernel2<<<blocks * 1, 1024>>>(nnodes, startl, massl);\n      cudaEventRecord(stop, 0);  cudaEventSynchronize(stop);  cudaEventElapsedTime(&time, start, stop);\n      timing[2] += time;\n      CudaTest(\"kernel 2 launch failed\");\n\n      //fprintf(stderr, \"SKKernel\\n\");\n      cudaEventRecord(start, 0);\n      assert(all_resident(SummarizationKernel, dim3(blocks * FACTOR3), dim3(THREADS3), 0));\n      SummarizationKernel<<<blocks * FACTOR3, THREADS3>>>(nnodes, nbodies, countl, childl, massl, posxl, posyl, poszl);\n      cudaEventRecord(stop, 0);  cudaEventSynchronize(stop);  cudaEventElapsedTime(&time, start, stop);\n 
     timing[3] += time;\n      CudaTest(\"kernel 3 launch failed\");\n\n      cudaEventRecord(start, 0);\n      assert(all_resident(SortKernel, dim3(blocks * FACTOR4), dim3(THREADS4), 0));\n      SortKernel<<<blocks * FACTOR4, THREADS4>>>(nnodes, nbodies, sortl, countl, startl, childl);\n      cudaEventRecord(stop, 0);  cudaEventSynchronize(stop);  cudaEventElapsedTime(&time, start, stop);\n      timing[4] += time;\n      CudaTest(\"kernel 4 launch failed\");\n\n      //fprintf(stderr, \"FCKernel\\n\");\n      cudaEventRecord(start, 0);\n      ForceCalculationKernel<<<blocks * FACTOR5, THREADS5>>>(nnodes, nbodies, errl, dthf, itolsq, epssq, sortl, childl, massl, posxl, posyl, poszl, velxl, velyl, velzl, accxl, accyl, acczl);\n      cudaEventRecord(stop, 0);  cudaEventSynchronize(stop);  cudaEventElapsedTime(&time, start, stop);\n      timing[5] += time;\n      CudaTest(\"kernel 5 launch failed\");\n\n      //fprintf(stderr, \"IKKernel\\n\");\n      cudaEventRecord(start, 0);\n      IntegrationKernel<<<blocks * FACTOR6, THREADS6>>>(nbodies, dtime, dthf, posxl, posyl, poszl, velxl, velyl, velzl, accxl, accyl, acczl);\n      cudaEventRecord(stop, 0);  cudaEventSynchronize(stop);  cudaEventElapsedTime(&time, start, stop);\n      timing[6] += time;\n      CudaTest(\"kernel 6 launch failed\");\n    }\n    CudaTest(\"kernel launch failed\");\n    cudaEventDestroy(start);  cudaEventDestroy(stop);\n\n    // transfer result back to CPU\n    if (cudaSuccess != cudaMemcpy(&error, errl, sizeof(int), cudaMemcpyDeviceToHost)) fprintf(stderr, \"copying of err from device failed\\n\");  CudaTest(\"err copy from device failed\");\n    if (cudaSuccess != cudaMemcpy(posx, posxl, sizeof(float) * nbodies, cudaMemcpyDeviceToHost)) fprintf(stderr, \"copying of posx from device failed\\n\");  CudaTest(\"posx copy from device failed\");\n    if (cudaSuccess != cudaMemcpy(posy, posyl, sizeof(float) * nbodies, cudaMemcpyDeviceToHost)) fprintf(stderr, \"copying of posy from device 
failed\\n\");  CudaTest(\"posy copy from device failed\");\n    if (cudaSuccess != cudaMemcpy(posz, poszl, sizeof(float) * nbodies, cudaMemcpyDeviceToHost)) fprintf(stderr, \"copying of posz from device failed\\n\");  CudaTest(\"posz copy from device failed\");\n    if (cudaSuccess != cudaMemcpy(velx, velxl, sizeof(float) * nbodies, cudaMemcpyDeviceToHost)) fprintf(stderr, \"copying of velx from device failed\\n\");  CudaTest(\"velx copy from device failed\");\n    if (cudaSuccess != cudaMemcpy(vely, velyl, sizeof(float) * nbodies, cudaMemcpyDeviceToHost)) fprintf(stderr, \"copying of vely from device failed\\n\");  CudaTest(\"vely copy from device failed\");\n    if (cudaSuccess != cudaMemcpy(velz, velzl, sizeof(float) * nbodies, cudaMemcpyDeviceToHost)) fprintf(stderr, \"copying of velz from device failed\\n\");  CudaTest(\"velz copy from device failed\");\n\n    gettimeofday(&endtime, NULL);\n    runtime = endtime.tv_sec + endtime.tv_usec/1000000.0 - starttime.tv_sec - starttime.tv_usec/1000000.0;\n\n    printf(\"runtime: %.4lf s  (\", runtime);\n    time = 0;\n    for (i = 1; i < 7; i++) {\n      printf(\" %.1f \", timing[i]);\n      time += timing[i];\n    }\n    if (error == 0) {\n      printf(\") = %.1f ms\\n\", time);\n    } else {\n      printf(\") = %.1f ms FAILED %d\\n\", time, error);\n    }\n  }\n\n  // print output\n  i = 0;\n//  for (i = 0; i < nbodies; i++) {\n    printf(\"%.2e %.2e %.2e\\n\", posx[i], posy[i], posz[i]);\n//  }\n\n  free(mass);\n  free(posx);\n  free(posy);\n  free(posz);\n  free(velx);\n  free(vely);\n  free(velz);\n\n  cudaFree(errl);\n  cudaFree(childl);\n  cudaFree(massl);\n  cudaFree(posxl);\n  cudaFree(posyl);\n  cudaFree(poszl);\n  cudaFree(countl);\n  cudaFree(startl);\n\n  cudaFree(maxxl);\n  cudaFree(maxyl);\n  cudaFree(maxzl);\n  cudaFree(minxl);\n  cudaFree(minyl);\n  cudaFree(minzl);\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/scientific/gpu/barneshut/bh_tuning.h",
    "content": "/*\n * CUDA BarnesHut v3.1: Simulation of the gravitational forces\n * in a galactic cluster using the Barnes-Hut n-body algorithm\n *\n * Copyright (c) 2013, Texas State University-San Marcos. All rights reserved.\n *\n * Redistribution and use in source and binary forms, with or without\n * modification, are permitted for academic, research, experimental, or personal\n * use provided that the following conditions are met:\n *\n *    * Redistributions of source code must retain the above copyright notice,\n *      this list of conditions and the following disclaimer.\n *    * Redistributions in binary form must reproduce the above copyright\n * notice, this list of conditions and the following disclaimer in the\n * documentation and/or other materials provided with the distribution.\n *    * Neither the name of Texas State University-San Marcos nor the names of\n * its contributors may be used to endorse or promote products derived from this\n *      software without specific prior written permission.\n *\n * For all other uses, please contact the Office for Commercialization and\n * Industry Relations at Texas State University-San Marcos\n * <http://www.txstate.edu/ocir/>.\n *\n * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\n * ARE DISCLAIMED IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE\n * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR\n * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\n * POSSIBILITY 
OF SUCH DAMAGE.\n *\n * Author: Martin Burtscher <burtscher@txstate.edu>\n *\n */\n\n#pragma once\n#define THREADS1 32\n#define THREADS2 192\n#define THREADS3 64\n#define THREADS4 352\n#define THREADS5 192\n#define THREADS6 96\n#define FACTOR1 6\n#define FACTOR2 7\n#define FACTOR3 8\n#define FACTOR4 4\n#define FACTOR5 7\n#define FACTOR6 6\nstatic const char* TUNING_PARAMETERS =\n    \"THREADS1 32\\nTHREADS2 192\\nTHREADS3 64\\nTHREADS4 352\\nTHREADS5 \"\n    \"192\\nTHREADS6 96\\nFACTOR1 6\\nFACTOR2 7\\nFACTOR3 8\\nFACTOR4 4\\nFACTOR5 \"\n    \"7\\nFACTOR6 6\\n\";\n"
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/CMakeLists.txt",
    "content": "app_scientific_gpu(dmr delaunayrefinement)\n#add_test_gpu(delaunayrefinement rmat15 rmat15.out dmr ${BASEINPUT}/reference/meshes/r10k.1 20)\nadd_test_gpu(delaunayrefinement rmat15 rmat15.out dmr ${BASEINPUT}/meshes/250k.2 20)\n"
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/README.md",
    "content": "Delaunayrefinement\n================================================================================\n\nDESCRIPTION \n--------------------------------------------------------------------------------\n\nThe LSG Delaunay Mesh Refinement uses a variant of Chew's algorithm as\nimplemented in the Lonestar CPU benchmark.\n\nA great resource on Delaunary Mesh Refinement is the website\nmaintained by Shewchuk:\n\nhttps://www.cs.cmu.edu/~quake/triangle.research.html\n\n\nINPUT\n--------------------------------------------------------------------------------\n\nTest inputs (files with extensions .ele, .node, .poly) can be downloaded\nfrom [https://www.cs.cmu.edu/~quake/triangle.html](this url)\n\nBUILD\n--------------------------------------------------------------------------------\n\n1. Run cmake at BUILD directory (refer to top-level README for cmake instructions).\n\n2. Run `cd <BUILD>/lonestar/scientific/gpu/delaunayrefinement; make -j`\n\nRUN\n--------------------------------------------------------------------------------\n\nThe following are a few example command lines.\n\n- `$ ./delaunayrefinement-gpu <input-basename> <maxfactor>`\n- `$ ./delaunayrefinement-gpu r1M 20`\n\nPERFORMANCE  \n--------------------------------------------------------------------------------\n\n* In our experience, nondet schedule in  delaunayrefinement outperforms deterministic schedules, because determinism incurs a performance cost\n* Performance is sensitive to CHUNK_SIZE for the worklist, whose optimal value is input and\n  machine dependent\n"
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/devel.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#pragma once\n\nvoid dump_element(FORD* nodex, FORD* nodey, uint3& element, int ele) {\n  printf(\"%f %f %f %f\", nodex[element.x], nodey[element.x], nodex[element.y],\n         nodey[element.y]);\n\n  if (!IS_SEGMENT(element))\n    printf(\" %f %f\", nodex[element.z], nodey[element.z]);\n\n  printf(\" %d\", ele);\n}\n\nvoid dump_neighbours(ShMesh& mesh) {\n  FORD* nodex       = mesh.nodex.cpu_rd_ptr();\n  FORD* nodey       = mesh.nodey.cpu_rd_ptr();\n  uint3* elements   = mesh.elements.cpu_rd_ptr();\n  uint3* neighbours = mesh.neighbours.cpu_rd_ptr();\n\n  for (int i = 0; i < mesh.nelements; i++) {\n    printf(\"center element [\");\n    dump_element(nodex, nodey, elements[i], i);\n    printf(\"]\\npre-graph\\n\");\n\n    if (neighbours[i].x != INVALIDID) {\n      printf(\"[\");\n      dump_element(nodex, nodey, 
elements[neighbours[i].x], i);\n      printf(\"]\\n\");\n    }\n\n    if (neighbours[i].y != INVALIDID) {\n      printf(\"[\");\n      dump_element(nodex, nodey, elements[neighbours[i].y], i);\n      printf(\"]\\n\");\n    }\n\n    if (neighbours[i].z != INVALIDID) {\n      printf(\"[\");\n      dump_element(nodex, nodey, elements[neighbours[i].z], i);\n      printf(\"]\\n\");\n    }\n    printf(\"post-graph\\n\");\n\n    printf(\"update over\\n\");\n  }\n}\n\nvoid debug_isbad(Worklist2& wl, ShMesh& mesh) {\n  bool* isbad = mesh.isbad.cpu_rd_ptr();\n  bool* isdel = mesh.isdel.cpu_rd_ptr();\n\n  int i;\n  int badcount = 0;\n\n  wl.update_cpu();\n  int wlitems = wl.nitems();\n\n  printf(\"checking %d elements\\n\", mesh.nelements);\n  for (i = 0; i < mesh.nelements; i++) {\n    if (isdel[i])\n      continue;\n\n    if (isbad[i]) {\n      badcount++;\n\n      int j = 0;\n      for (j = 0; j < wlitems; j++) {\n        if (wl.wl[j] == i)\n          break;\n      }\n\n      if (j == wlitems)\n        printf(\"\\tnot found: %d\\n\", i);\n    }\n  }\n\n  printf(\"bad count: %d\\n\", badcount);\n}\n"
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/dmr-nontex.cu",
    "content": "/*  -*- mode: c++ -*-  */\n#include \"gg.h\"\n#include \"ggcuda.h\"\n\nvoid kernel_sizing(CSRGraph &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=False $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ hacks=set([]) $ np_factor=1 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=False $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=basic $ cuda.use_worklist_slots=True $ cuda.worklist_type=basic\";\n#define CAVLEN 256\n#define BCLEN 1024\n#include \"dmrggc.inc\"\nstatic const int __tb_refine = TB_SIZE;\n__global__ void check_triangles(Mesh mesh, unsigned int * bad_triangles, int start, Worklist2 in_wl, Worklist2 out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type ele_end;\n  // FP: \"1 -> 2;\n  uint3* el ;\n  int count = 0;\n  // FP: \"2 -> 3;\n  ele_end = ((mesh).nelements);\n  for (index_type ele = start + tid; ele < ele_end; ele += nthreads)\n  {\n    if (ele < mesh.nelements)\n    {\n      if (!(mesh.isdel[ele] || IS_SEGMENT(mesh.elements[ele])))\n      {\n        if (!mesh.isbad[ele])\n        {\n          el = &mesh.elements[ele];\n          mesh.isbad[ele] = (angleLT(mesh, el->x, el->y, el->z) || angleLT(mesh, el->z, el->x, el->y) || angleLT(mesh, el->y, el->z, el->x));\n        }\n        if (mesh.isbad[ele])\n        {\n          count++;\n          (out_wl).push(ele);\n        }\n      }\n    }\n  }\n  // FP: \"15 -> 16;\n  atomicAdd(bad_triangles, count);\n  // FP: \"16 -> 17;\n}\n__global__ void __launch_bounds__(TB_SIZE) refine(Mesh mesh, int debg, uint * nnodes, uint * nelements, Worklist2 in_wl, Worklist2 out_wl, Worklist2 re_wl, ExclusiveLocks _ex, GlobalBarrier gb)\n{\n  
unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type wlele_end;\n  index_type wlele_rup;\n  index_type wlele_block_size;\n  index_type wlele_block_start;\n  // FP: \"1 -> 2;\n  uint cavity[CAVLEN] ;\n  uint nc = 0;\n  uint boundary[BCLEN] ;\n  uint bc = 0;\n  uint blnodes[BCLEN/4] ;\n  bool repush = false;\n  int stage = 0;\n  int x = 0;\n  // FP: \"2 -> 3;\n  wlele_end = *((volatile index_type *) (in_wl).dindex);\n  wlele_rup = ((0) + roundup(((*((volatile index_type *) (in_wl).dindex)) - (0)), (nthreads)));\n  wlele_block_size = wlele_rup / nthreads;\n  wlele_block_start = (0 + tid) * wlele_block_size;\n  for (index_type wlele = wlele_block_start; wlele < (wlele_block_start + wlele_block_size) && (wlele < wlele_rup); wlele++)\n  {\n    FORD cx;\n    FORD cy;\n    bool pop;\n    int ele;\n    nc = 0;\n    bc = 0;\n    repush = false;\n    stage = 0;\n    pop = (in_wl).pop_id(wlele, ele);\n    if (pop && ele < mesh.nelements && mesh.isbad[ele] && !mesh.isdel[ele])\n    {\n      uint oldcav;\n      cavity[nc++] = ele;\n      do\n      {\n        oldcav = cavity[0];\n        cavity[0] = opposite(mesh, ele);\n      }\n      while (cavity[0] != oldcav);\n      if (!build_cavity(mesh, cavity, nc, CAVLEN, boundary, bc, cx, cy))\n      {\n        build_cavity(mesh, cavity, nc, CAVLEN, boundary, bc, cx, cy);\n      }\n    }\n    int nodes_added = 0;\n    int elems_added = 0;\n    {\n      _ex.mark_p1(nc, (int *) cavity, tid);\n      _ex.mark_p1_iterator(2, bc, 4, (int *) boundary, tid);\n      gb.Sync();\n      _ex.mark_p2(nc, (int *) cavity, tid);\n      _ex.mark_p2_iterator(2, bc, 4, (int *) boundary, tid);\n      gb.Sync();\n      int _x = 1;\n      _x &= _ex.owns(nc, (int *) cavity, tid);\n      _x &= _ex.owns_iterator(2, bc, 4, (int *) boundary, tid);\n      if (_x)\n      {\n        if (nc > 0)\n        {\n          nodes_added = 1;\n          
elems_added = (bc >> 2) + (IS_SEGMENT(mesh.elements[cavity[0]]) ? 2 : 0);\n          uint cnode ;\n          uint cseg1 = 0;\n          uint cseg2 = 0;\n          uint nelements_added ;\n          uint oldelements ;\n          uint newelemndx ;\n          cnode = add_node(mesh, cx, cy, atomicAdd(nnodes, 1));\n          nelements_added = elems_added;\n          oldelements = atomicAdd(nelements, nelements_added);\n          newelemndx = oldelements;\n          if (IS_SEGMENT(mesh.elements[cavity[0]]))\n          {\n            cseg1 = add_segment(mesh, mesh.elements[cavity[0]].x, cnode, newelemndx++);\n            cseg2 = add_segment(mesh, cnode, mesh.elements[cavity[0]].y, newelemndx++);\n          }\n          for (int i = 0; i < bc; i+=4)\n          {\n            uint ntri = add_triangle(mesh, boundary[i], boundary[i+1], cnode, boundary[i+2], boundary[i+3], newelemndx++);\n          }\n          assert(oldelements + nelements_added == newelemndx);\n          setup_neighbours(mesh, oldelements, newelemndx);\n          repush = true;\n          for (int i = 0; i < nc; i++)\n          {\n            mesh.isdel[cavity[i]] = true;\n            if (cavity[i] == ele)\n            {\n              repush = false;\n            }\n          }\n        }\n      }\n      else\n      {\n        repush = true;\n      }\n    }\n    gb.Sync();\n    if (repush)\n    {\n      (out_wl).push(ele);\n      continue;\n    }\n  }\n}\nvoid refine_mesh(ShMesh& mesh, dim3 blocks, dim3 threads)\n{\n  ExclusiveLocks refine_ex_locks(mesh.maxnelements);\n  static GlobalBarrierLifetime refine_barrier;\n  static bool refine_barrier_inited;\n  PipeContextT<Worklist2> pipe;\n  // FP: \"1 -> 2;\n  Shared<uint> nbad (1);\n  Mesh gmesh (mesh);\n  Shared<uint> nelements (1);\n  Shared<uint> nnodes (1);\n  int cnbad ;\n  bool orig = false;\n  ggc::Timer t (\"total\");\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  static const size_t refine_residency = maximum_residency(refine, __tb_refine, 0);\n  static 
const size_t refine_blocks = GG_MIN(blocks.x, ggc_get_nSM() * refine_residency);\n  if(!refine_barrier_inited) { refine_barrier.Setup(refine_blocks); refine_barrier_inited = true;};\n  // FP: \"4 -> 5;\n  find_neighbours_cpu(mesh);\n  gmesh.refresh(mesh);\n  *(nelements.cpu_wr_ptr(true)) = mesh.nelements;\n  *(nnodes.cpu_wr_ptr(true)) = mesh.nnodes;\n  // FP: \"5 -> 6;\n  pipe = PipeContextT<Worklist2>(mesh.nelements);\n  {\n    {\n      int lastnelements = 0;\n      // FP: \"7 -> 8;\n      *(nbad.cpu_wr_ptr(true)) = 0;\n      t.start();\n      // FP: \"8 -> 9;\n      pipe.out_wl().will_write();\n      check_triangles <<<blocks, threads>>>(gmesh, nbad.gpu_wr_ptr(), 0, pipe.in_wl(), pipe.out_wl());\n      pipe.in_wl().swap_slots();\n      pipe.advance2();\n      // FP: \"9 -> 10;\n      printf(\"%d initial bad triangles\\n\", *(nbad.cpu_rd_ptr()) );;\n      // FP: \"10 -> 11;\n      while (pipe.in_wl().nitems())\n      {\n        lastnelements = gmesh.nelements;\n        {\n          pipe.out_wl().will_write();\n          pipe.re_wl().will_write();\n          refine <<<refine_blocks, __tb_refine>>>(gmesh, 32, nnodes.gpu_wr_ptr(), nelements.gpu_wr_ptr(), pipe.in_wl(), pipe.out_wl(), pipe.re_wl(), refine_ex_locks, refine_barrier);\n          pipe.in_wl().swap_slots();\n          pipe.retry2();\n        }\n        gmesh.nnodes = mesh.nnodes = *(nnodes.cpu_rd_ptr());\n        gmesh.nelements = mesh.nelements = *(nelements.cpu_rd_ptr());\n        *(nbad.cpu_wr_ptr(true)) = 0;\n        printf(\"checking triangles ...\\n\");\n        pipe.out_wl().will_write();\n        if (orig)\n          check_triangles_orig <<<blocks, threads>>>(gmesh, nbad.gpu_wr_ptr(), lastnelements, pipe.in_wl(), pipe.out_wl());\n        else\n          check_triangles <<<blocks, threads>>>(gmesh, nbad.gpu_wr_ptr(), lastnelements, pipe.in_wl(), pipe.out_wl());\n        pipe.in_wl().swap_slots();\n        pipe.advance2();\n        printf(\"%d bad triangles\\n\", *(nbad.cpu_rd_ptr()) );\n      }\n     
 // FP: \"18 -> 19;\n      t.stop();\n      printf(\"time: %llu ns\\n\", t.duration());\n      // FP: \"19 -> 20;\n      {\n        *(nbad.cpu_wr_ptr(true)) = 0;\n        // FP: \"21 -> 22;\n        pipe.out_wl().will_write();\n        check_triangles <<<blocks, threads>>>(gmesh, nbad.gpu_wr_ptr(), 0, pipe.in_wl(), pipe.out_wl());\n        pipe.in_wl().swap_slots();\n        pipe.advance2();\n        // FP: \"22 -> 23;\n        printf(\"%d (%d) final bad triangles\\n\", *(nbad.cpu_rd_ptr()), pipe.in_wl().nitems() );\n        // FP: \"23 -> 24;\n      }\n      // FP: \"20 -> 21;\n    }\n  }\n  pipe.free();\n  // FP: \"6 -> 7;\n}\n#include \"main.inc\""
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/dmr.cu",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"gg.h\"\n#include \"ggcuda.h\"\n\nvoid kernel_sizing(CSRGraphTex &, dim3 &, dim3 &);\n#define TB_SIZE 256\nconst char *GGC_OPTIONS = \"coop_conv=False $ outline_iterate_gb=False $ backoff_blocking_factor=4 $ parcomb=False $ np_schedulers=set(['fg', 'tb', 'wp']) $ cc_disable=set([]) $ hacks=set([]) $ np_factor=1 $ instrument=set([]) $ unroll=[] $ instrument_mode=None $ read_props=None $ outline_iterate=True $ ignore_nested_errors=False $ np=False $ write_props=None $ quiet_cgen=True $ retry_backoff=True $ cuda.graph_type=texture $ cuda.use_worklist_slots=True $ cuda.worklist_type=texture\";\n#define CAVLEN 256\n#define BCLEN 1024\n#include \"dmrggc.inc\"\nstatic const int __tb_refine = TB_SIZE;\n__global__ void check_triangles(Mesh mesh, unsigned int * bad_triangles, int start, WorklistT in_wl, WorklistT 
out_wl)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  //const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  index_type ele_end;\n  // FP: \"1 -> 2;\n  uint3* el ;\n  int count = 0;\n  // FP: \"2 -> 3;\n  ele_end = ((mesh).nelements);\n  for (index_type ele = start + tid; ele < ele_end; ele += nthreads)\n  {\n    if (ele < mesh.nelements)\n    {\n      if (!(mesh.isdel[ele] || IS_SEGMENT(mesh.elements[ele])))\n      {\n        if (!mesh.isbad[ele])\n        {\n          el = &mesh.elements[ele];\n          mesh.isbad[ele] = (angleLT(mesh, el->x, el->y, el->z) || angleLT(mesh, el->z, el->x, el->y) || angleLT(mesh, el->y, el->z, el->x));\n        }\n        if (mesh.isbad[ele])\n        {\n          count++;\n          (out_wl).push(ele);\n        }\n      }\n    }\n  }\n  // FP: \"15 -> 16;\n  atomicAdd(bad_triangles, count);\n  // FP: \"16 -> 17;\n}\n__global__ void __launch_bounds__(TB_SIZE) refine(Mesh mesh, int debg, uint * nnodes, uint * nelements, WorklistT in_wl, WorklistT out_wl, WorklistT re_wl, ExclusiveLocks _ex, GlobalBarrier gb)\n{\n  unsigned tid = TID_1D;\n  unsigned nthreads = TOTAL_THREADS_1D;\n\n  //const unsigned __kernel_tb_size = TB_SIZE;\n  if (tid == 0)\n    in_wl.reset_next_slot();\n\n  uint cavity[CAVLEN] ;\n  uint nc = 0;\n  uint boundary[BCLEN] ;\n  uint bc = 0;\n  bool repush = false;\n  index_type wlele_end = *((volatile index_type *) (in_wl).dindex);\n  index_type wlele_rup = ((0) + roundup(((*((volatile index_type *) (in_wl).dindex)) - (0)), (nthreads)));\n  index_type wlele_block_size = wlele_rup / nthreads;\n  index_type wlele_block_start = (0 + tid) * wlele_block_size;\n  int stage = 0;\n  for (index_type wlele = wlele_block_start; wlele < (wlele_block_start + wlele_block_size) && (wlele < wlele_rup); wlele++)\n  {\n    FORD cx;\n    FORD cy;\n    bool pop;\n    int ele;\n    nc = 0;\n    bc = 0;\n    repush = false;\n    stage = 0;\n    pop = 
(in_wl).pop_id(wlele, ele);\n    if (pop && ele < mesh.nelements && mesh.isbad[ele] && !mesh.isdel[ele])\n    {\n      uint oldcav;\n      cavity[nc++] = ele;\n      do\n      {\n        oldcav = cavity[0];\n        cavity[0] = opposite(mesh, ele);\n      }\n      while (cavity[0] != oldcav);\n      if (!build_cavity(mesh, cavity, nc, CAVLEN, boundary, bc, cx, cy))\n      {\n        build_cavity(mesh, cavity, nc, CAVLEN, boundary, bc, cx, cy);\n      }\n    }\n    int nodes_added = 0;\n    int elems_added = 0;\n    {\n      _ex.mark_p1(nc, (int *) cavity, tid);\n      _ex.mark_p1_iterator(2, bc, 4, (int *) boundary, tid);\n      gb.Sync();\n      _ex.mark_p2(nc, (int *) cavity, tid);\n      _ex.mark_p2_iterator(2, bc, 4, (int *) boundary, tid);\n      gb.Sync();\n      int _x = 1;\n      _x &= _ex.owns(nc, (int *) cavity, tid);\n      _x &= _ex.owns_iterator(2, bc, 4, (int *) boundary, tid);\n      if (_x)\n      {\n        if (nc > 0)\n        {\n          nodes_added = 1;\n          elems_added = (bc >> 2) + (IS_SEGMENT(mesh.elements[cavity[0]]) ? 
2 : 0);\n          uint cnode ;\n          uint cseg1 = 0;\n          uint cseg2 = 0;\n          uint nelements_added ;\n          uint oldelements ;\n          uint newelemndx ;\n          cnode = add_node(mesh, cx, cy, atomicAdd(nnodes, 1));\n          nelements_added = elems_added;\n          oldelements = atomicAdd(nelements, nelements_added);\n          newelemndx = oldelements;\n          if (IS_SEGMENT(mesh.elements[cavity[0]]))\n          {\n            cseg1 = add_segment(mesh, mesh.elements[cavity[0]].x, cnode, newelemndx++);\n            cseg2 = add_segment(mesh, cnode, mesh.elements[cavity[0]].y, newelemndx++);\n          }\n          for (int i = 0; i < bc; i+=4)\n          {\n            uint ntri = add_triangle(mesh, boundary[i], boundary[i+1], cnode, boundary[i+2], boundary[i+3], newelemndx++);\n          }\n          assert(oldelements + nelements_added == newelemndx);\n          setup_neighbours(mesh, oldelements, newelemndx);\n          repush = true;\n          for (int i = 0; i < nc; i++)\n          {\n            mesh.isdel[cavity[i]] = true;\n            if (cavity[i] == ele)\n            {\n              repush = false;\n            }\n          }\n        }\n      }\n      else\n      {\n        repush = true;\n      }\n    }\n    gb.Sync();\n    if (repush)\n    {\n      (out_wl).push(ele);\n      continue;\n    }\n  }\n}\nvoid refine_mesh(ShMesh& mesh, dim3 blocks, dim3 threads)\n{\n  ExclusiveLocks refine_ex_locks(mesh.maxnelements);\n  static GlobalBarrierLifetime refine_barrier;\n  static bool refine_barrier_inited;\n  PipeContextT<WorklistT> pipe;\n  // FP: \"1 -> 2;\n  Shared<uint> nbad (1);\n  Mesh gmesh (mesh);\n  Shared<uint> nelements (1);\n  Shared<uint> nnodes (1);\n  //int cnbad ;\n  bool orig = false;\n  ggc::Timer t (\"total\");\n  // FP: \"2 -> 3;\n  // FP: \"3 -> 4;\n  static const size_t refine_residency = maximum_residency(refine, __tb_refine, 0);\n  static const size_t refine_blocks = GG_MIN(blocks.x, ggc_get_nSM() * 
refine_residency);\n  if(!refine_barrier_inited) { refine_barrier.Setup(refine_blocks); refine_barrier_inited = true;};\n  // FP: \"4 -> 5;\n  find_neighbours_cpu(mesh);\n  gmesh.refresh(mesh);\n  *(nelements.cpu_wr_ptr(true)) = mesh.nelements;\n  *(nnodes.cpu_wr_ptr(true)) = mesh.nnodes;\n  // FP: \"5 -> 6;\n  pipe = PipeContextT<WorklistT>(mesh.nelements);\n  {\n    {\n      int lastnelements = 0;\n      // FP: \"7 -> 8;\n      *(nbad.cpu_wr_ptr(true)) = 0;\n      t.start();\n      // FP: \"8 -> 9;\n      pipe.out_wl().will_write();\n      check_triangles <<<blocks, threads>>>(gmesh, nbad.gpu_wr_ptr(), 0, pipe.in_wl(), pipe.out_wl());\n      pipe.in_wl().swap_slots();\n      pipe.advance2();\n      // FP: \"9 -> 10;\n      printf(\"%d initial bad triangles\\n\", *(nbad.cpu_rd_ptr()) );;\n      // FP: \"10 -> 11;\n      while (pipe.in_wl().nitems())\n      {\n        lastnelements = gmesh.nelements;\n        {\n          pipe.out_wl().will_write();\n          pipe.re_wl().will_write();\n          refine <<<refine_blocks, __tb_refine>>>(gmesh, 32, nnodes.gpu_wr_ptr(), nelements.gpu_wr_ptr(), pipe.in_wl(), pipe.out_wl(), pipe.re_wl(), refine_ex_locks, refine_barrier);\n          pipe.in_wl().swap_slots();\n          pipe.retry2();\n        }\n        gmesh.nnodes = mesh.nnodes = *(nnodes.cpu_rd_ptr());\n        gmesh.nelements = mesh.nelements = *(nelements.cpu_rd_ptr());\n        *(nbad.cpu_wr_ptr(true)) = 0;\n        printf(\"checking triangles ...\\n\");\n        pipe.out_wl().will_write();\n        if (orig)\n          check_triangles_orig <<<blocks, threads>>>(gmesh, nbad.gpu_wr_ptr(), lastnelements, pipe.in_wl(), pipe.out_wl());\n        else\n          check_triangles <<<blocks, threads>>>(gmesh, nbad.gpu_wr_ptr(), lastnelements, pipe.in_wl(), pipe.out_wl());\n        pipe.in_wl().swap_slots();\n        pipe.advance2();\n        printf(\"%d bad triangles\\n\", *(nbad.cpu_rd_ptr()) );\n      }\n      // FP: \"18 -> 19;\n      t.stop();\n      printf(\"time: 
%llu ns\\n\", t.duration());\n      // FP: \"19 -> 20;\n      {\n        *(nbad.cpu_wr_ptr(true)) = 0;\n        // FP: \"21 -> 22;\n        pipe.out_wl().will_write();\n        check_triangles <<<blocks, threads>>>(gmesh, nbad.gpu_wr_ptr(), 0, pipe.in_wl(), pipe.out_wl());\n        pipe.in_wl().swap_slots();\n        pipe.advance2();\n        // FP: \"22 -> 23;\n        printf(\"%d (%d) final bad triangles\\n\", *(nbad.cpu_rd_ptr()), pipe.in_wl().nitems() );\n        // FP: \"23 -> 24;\n      }\n      // FP: \"20 -> 21;\n    }\n  }\n  pipe.free();\n  // FP: \"6 -> 7;\n}\n#include \"main.inc\"\n"
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/dmr.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#pragma once\n#include \"sharedptr.h\"\n\n#define MINANGLE 30\n#define PI 3.14159265358979323846 // from C99 standard.\n#define FORD double\n#define DIMSTYPE unsigned\n\n#define INVALIDID 1234567890\n#define MAXID INVALIDID\n\n// \"usual\" ratio of final nodes to final elements, determined empirically\n// used to adjust maxfactor for nodes\n// use 1 to be conservative\n#define MAX_NNODES_TO_NELEMENTS 2\n\nstruct ShMesh {\n  uint maxnelements;\n  uint maxnnodes;\n  uint ntriangles;\n  uint nnodes;\n  uint nsegments;\n  uint nelements;\n\n  Shared<FORD> nodex;\n  Shared<FORD> nodey;\n  Shared<uint3> elements;\n  Shared<uint3> neighbours;\n  Shared<bool> isdel;\n  Shared<bool> isbad;\n  Shared<int> owners;\n};\n\nstruct Mesh {\n  uint maxnelements;\n  uint maxnnodes;\n  uint ntriangles;\n  uint nnodes;\n  uint 
nsegments;\n  uint nelements;\n\n  FORD* nodex; // could be combined\n  FORD* nodey;\n  uint3* elements;\n  volatile bool* isdel;\n  bool* isbad;\n  uint3* neighbours;\n  int* owners;\n\n  Mesh() {}\n\n  Mesh(ShMesh& mesh) {\n    maxnelements = mesh.maxnelements;\n    maxnnodes    = mesh.maxnnodes;\n    ntriangles   = mesh.ntriangles;\n    nnodes       = mesh.nnodes;\n    nsegments    = mesh.nsegments;\n    nelements    = mesh.nelements;\n\n    nodex      = mesh.nodex.gpu_wr_ptr();\n    nodey      = mesh.nodey.gpu_wr_ptr();\n    elements   = mesh.elements.gpu_wr_ptr();\n    neighbours = mesh.neighbours.gpu_wr_ptr();\n    isdel      = mesh.isdel.gpu_wr_ptr();\n    isbad      = mesh.isbad.gpu_wr_ptr();\n    owners     = mesh.owners.gpu_wr_ptr(true);\n  }\n\n  void refresh(ShMesh& mesh) {\n    maxnelements = mesh.maxnelements;\n    maxnnodes    = mesh.maxnnodes;\n    ntriangles   = mesh.ntriangles;\n    nnodes       = mesh.nnodes;\n    nsegments    = mesh.nsegments;\n    nelements    = mesh.nelements;\n\n    nodex      = mesh.nodex.gpu_wr_ptr();\n    nodey      = mesh.nodey.gpu_wr_ptr();\n    elements   = mesh.elements.gpu_wr_ptr();\n    neighbours = mesh.neighbours.gpu_wr_ptr();\n    isdel      = mesh.isdel.gpu_wr_ptr();\n    isbad      = mesh.isbad.gpu_wr_ptr();\n    owners     = mesh.owners.gpu_wr_ptr(true);\n  }\n};\n\n#define IS_SEGMENT(element) (((element).z == INVALIDID))\n"
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/dmrggc.inc",
    "content": "/* -*- mode: c++ -*- */\n#include \"meshfiles.h\"\n#include \"dmr.h\"\n#include \"geomprim.h\"\n#include \"gbar.cuh\"\n#include <cub/cub.cuh>\n#include \"devel.h\"\n#include <map>\n\n#undef TB_SIZE\n#define TB_SIZE 512\n\n#define LOADCV(x) cub::ThreadLoad<cub::LOAD_CV>((x))\n#define LOADCG(x) cub::ThreadLoad<cub::LOAD_CG>((x))\n#define STORECG(x, y) cub::ThreadStore<cub::STORE_CG>((x), (y))\n\nvoid addneighbour_cpu(uint3 &neigh, uint elem)\n{\n  // TODO\n  if(neigh.x == elem || neigh.y == elem || neigh.z == elem) return;\n\n  assert(neigh.x == INVALIDID || neigh.y == INVALIDID || neigh.z == INVALIDID);\n\n  if(neigh.x == INVALIDID) { neigh.x = elem; return; }\n  if(neigh.y == INVALIDID) { neigh.y = elem; return; }\n  if(neigh.z == INVALIDID) { neigh.z = elem; return; }\n}\n\nvoid find_neighbours_cpu(ShMesh &mesh)\n{\n  std::map<std::pair<int, int>, int> edge_map;\n\n  uint nodes1[3];\n\n  uint3 *elements = mesh.elements.cpu_rd_ptr();\n  uint3 *neighbours = mesh.neighbours.cpu_wr_ptr(true);\n  int ele;\n\n  for(ele = 0; ele < mesh.nelements; ele++)\n    {\n      uint3 *neigh = &neighbours[ele];\n\n      neigh->x = INVALIDID;\n      neigh->y = INVALIDID;\n      neigh->z = INVALIDID;\n\n      nodes1[0] = elements[ele].x;\n      nodes1[1] = elements[ele].y;\n      nodes1[2] = elements[ele].z;\n\n      if(nodes1[0] > nodes1[1]) std::swap(nodes1[0], nodes1[1]);\n      if(nodes1[1] > nodes1[2]) std::swap(nodes1[1], nodes1[2]);\n      if(nodes1[0] > nodes1[1]) std::swap(nodes1[0], nodes1[1]);\n\n      assert(nodes1[0] <= nodes1[1] && nodes1[1] <= nodes1[2]);\n\n      std::pair<int, int> edges[3];\n      edges[0] = std::make_pair<int, int>(nodes1[0], nodes1[1]);\n      edges[1] = std::make_pair<int, int>(nodes1[1], nodes1[2]);\n      edges[2] = std::make_pair<int, int>(nodes1[0], nodes1[2]);\n\n      int maxn = IS_SEGMENT(elements[ele]) ? 
1 : 3;\n      \n      for(int i = 0; i < maxn; i++) {\n\tif(edge_map.find(edges[i]) == edge_map.end())\n\t  edge_map[edges[i]] = ele;\n\telse {\t  \n\t  int node = edge_map[edges[i]];\n\t  addneighbour_cpu(neighbours[node], ele);\n\t  addneighbour_cpu(neighbours[ele], node);\n\t  edge_map.erase(edges[i]);\n\t}\n      }\n    }\n}\n\nvoid verify_mesh(ShMesh &mesh)\n{\n  // code moved to refine_mesh (final invocation of check_triangles)\n  // TODO: check for delaunay property\n}\n\nvoid read_mesh(const char *basefile, ShMesh &mesh, int maxfactor)\n{\n  readNodes(basefile, mesh, maxfactor);\n  readTriangles(basefile, mesh, maxfactor);\n\n  assert(mesh.maxnelements > 0);\n  printf(\"memory for owners: %d MB\\n\", mesh.maxnelements * sizeof(int) / 1048576);\n  mesh.owners.alloc(mesh.maxnelements);\n  // see refine() for actual allocation\n  printf(\"memory for worklists: %d MB\\n\", 2 * mesh.nelements * sizeof(int) / 1048576);\n\n  printf(\"%s: %d nodes, %d triangles, %d segments read\\n\", basefile, mesh.nnodes, mesh.ntriangles, mesh.nsegments);\n  assert(mesh.nnodes > 0);\n  assert(mesh.ntriangles > 0);\n  assert(mesh.nsegments > 0);\n  assert(mesh.nelements > 0);\n}\n\n__device__ void check_is_bad(Mesh &mesh, int ele)\n{\n  uint3 *el = &mesh.elements[ele];\n\n  mesh.isbad[ele] = (angleLT(mesh, el->x, el->y, el->z) \n\t\t     || angleLT(mesh, el->z, el->x, el->y) \n\t\t     || angleLT(mesh, el->y, el->z, el->x));\n}\n\n__device__ bool shares_edge(uint nodes1[3], uint nodes2[3])\n{\n  int i;\n  int match = 0;\n  uint help;\n\n  for (i = 0; i < 3; i++) {\n    if ((help = nodes1[i]) != INVALIDID) {\n      if (help == nodes2[0]) match++;\n      else if (help == nodes2[1]) match++;\n      else if (help == nodes2[2]) match++;\n    }\n  } \n  // for(i = 0; i < 3; i++)\n  //   for(int j = 0; j < 3; j++)\n  //     {\n  // \tif(nodes1[i] == nodes2[j] && nodes1[i] != INVALIDID)\n  // \t  {\n  // \t    match++;\n  // \t    break;\n  // \t  }\n  //     }\n\n\n\n  return match == 
2;\n}\n\n__global__ void find_neighbours(Mesh mesh, int start, int end)\n{\n  int id = threadIdx.x + blockDim.x * blockIdx.x;\n  int threads = blockDim.x * gridDim.x;\n  int ele;\n  int oele;\n  int nc = 0;\n  uint nodes1[3], nodes2[3], neigh[3] = {INVALIDID, INVALIDID, INVALIDID};\n\n  for(int x = 0; x < mesh.nelements; x += 4096) {\n    // currently a n^2 algorithm -- launch times out for 250k.ele!\n    for(ele = id + start; ele < end; ele += threads)\n      {\n\tif(x == 0)\n\t  {\n\t    neigh[0] = INVALIDID;\n\t    neigh[1] = INVALIDID;\n\t    neigh[2] = INVALIDID;\n\t  }\n\telse\n\t  {\n\t    neigh[0] = mesh.neighbours[ele].x;\n\t    neigh[1] = mesh.neighbours[ele].y;\n\t    neigh[2] = mesh.neighbours[ele].z;\n\t  }\n\n\tif(neigh[2] != INVALIDID) continue;\n\n\t//TODO: possibly remove uint3 from Mesh/ShMesh\n\tnodes1[0] = mesh.elements[ele].x;\n\tnodes1[1] = mesh.elements[ele].y;\n\tnodes1[2] = mesh.elements[ele].z;\n\tnc = (neigh[0] == INVALIDID) ? 0 : ((neigh[1] == INVALIDID) ? 1 : 2);\n\n\t//TODO: block this\n\tfor(oele = 0; oele < mesh.nelements; oele++)\n\t  {\n\t    nodes2[0] = mesh.elements[oele].x; \n\t    nodes2[1] = mesh.elements[oele].y; \n\t    nodes2[2] = mesh.elements[oele].z;\n\n\t    if(shares_edge(nodes1, nodes2))\n\t      {\n\t\tassert(nc < 3);\n\t\tneigh[nc++] = oele;\n\t      }\n\n\t    if((IS_SEGMENT(mesh.elements[ele]) && nc == 2) || nc == 3)\n\t      break;\n\t  }\n\n\tmesh.neighbours[ele].x = neigh[0];\n\tmesh.neighbours[ele].y = neigh[1];\n\tmesh.neighbours[ele].z = neigh[2];\n      }\n  }\n}\n\n__device__ void dump_mesh_element(Mesh &mesh, uint3 &ele, int element)\n{\n  if(IS_SEGMENT(ele))\n    printf(\"[ %.17f %.17f %.17f %.17f %d]\\n\", \n\t   mesh.nodex[ele.x], mesh.nodey[ele.x],\n\t   mesh.nodex[ele.y], mesh.nodey[ele.y], element);\n  else\n    printf(\"[ %.17f %.17f %.17f %.17f %.17f %.17f %d]\\n\", \n\t   mesh.nodex[ele.x], mesh.nodey[ele.x],\n\t   mesh.nodex[ele.y], mesh.nodey[ele.y],\n\t   mesh.nodex[ele.z], mesh.nodey[ele.z], 
element);\n}\n__device__ void dump_mesh_element(Mesh &mesh, int element)\n{\n  dump_mesh_element(mesh, mesh.elements[element], element);\n}\n\n__device__ bool encroached(Mesh &mesh, int element, uint3 &celement, FORD centerx, FORD centery, bool &is_seg)\n{\n  if(element == INVALIDID)\n    return false;\n\n  assert(!mesh.isdel[element]);\n\n  uint3 ele = LOADCG(&mesh.elements[element]);\n\n  if(IS_SEGMENT(ele)) {\n    //if(IS_SEGMENT(celement)) //TODO: regd second segment encroaching?\n    //  return false;\n\n    FORD cx, cy, radsqr;\n    uint nsp;\n\n    is_seg = true;\n\n    nsp = (celement.x == ele.x) ? ((celement.y == ele.y) ? celement.z : celement.y) : celement.x;\n\n    // check if center and triangle are on opposite sides of segment\n    // one of the ccws does not return zero\n    if(counterclockwise(mesh.nodex[ele.x], mesh.nodey[ele.x], \n    \t\t\tmesh.nodex[ele.y], mesh.nodey[ele.y],\n    \t\t\tmesh.nodex[nsp], mesh.nodey[nsp]) > 0 != \n       counterclockwise(mesh.nodex[ele.x], mesh.nodey[ele.x], \n    \t\t\tmesh.nodex[ele.y], mesh.nodey[ele.y],\n    \t\t\tcenterx, centery) > 0)\n      return true; \n\n    // nope, do a distance check\n    cx = (mesh.nodex[ele.x] + mesh.nodex[ele.y]) / 2;\n    cy = (mesh.nodey[ele.x] + mesh.nodey[ele.y]) / 2;\n    radsqr = distanceSquare(cx, cy, mesh.nodex[ele.x], mesh.nodey[ele.x]);\n\n    return distanceSquare(centerx, centery, cx, cy) < radsqr;\n  } else\n    return gincircle(mesh.nodex[ele.x], mesh.nodey[ele.x],\n\t\t     mesh.nodex[ele.y], mesh.nodey[ele.y],\n\t\t     mesh.nodex[ele.z], mesh.nodey[ele.z],\n\t\t     centerx, centery) > 0.0;\n}\n\n__device__ void add_to_cavity(uint cavity[], uint &cavlen, int element)\n{\n  int i;\n  for(i = 0; i < cavlen; i++)\n    if(cavity[i] == element)\n      return;\n\n  cavity[cavlen++] = element;\n}\n\n__device__ void add_to_boundary(uint boundary[], uint &boundarylen, uint sn1, uint sn2, uint src, uint dst)\n{\n  int i;\n  for(i = 0; i < boundarylen; i+=4)\n    if((sn1 == 
boundary[i] && sn2 == boundary[i+1]) ||\n       (sn1 == boundary[i+1] && sn2 == boundary[i]))\n      return;\n\n  boundary[boundarylen++] = sn1;  \n  boundary[boundarylen++] = sn2;\n  boundary[boundarylen++] = src;\n  boundary[boundarylen++] = dst;\n}\n\n__device__ unsigned add_node(Mesh &mesh, FORD x, FORD y, uint ndx)\n{\n  //uint ndx = atomicAdd(&mesh.nnodes, 1);\n  assert(ndx < mesh.maxnnodes);\n\n  mesh.nodex[ndx] = x;\n  mesh.nodey[ndx] = y;  \n\n  return ndx;\n}\n\n__device__ uint add_segment(Mesh &mesh, uint n1, uint n2, uint ndx)\n{\n  //TODO: parallelize\n  uint3 ele;\n  ele.x = n1; ele.y = n2; ele.z = INVALIDID;\n\n  //uint ndx = atomicAdd(&mesh.nelements, 1);\n  assert(ndx < mesh.maxnelements);\n\n  mesh.isbad[ndx] = false;\n  mesh.isdel[ndx] = false;\n  mesh.elements[ndx] = ele;\n  mesh.neighbours[ndx].x = mesh.neighbours[ndx].y = mesh.neighbours[ndx].z = INVALIDID;\n\n  return ndx;\n}\n\n__device__ uint add_triangle(Mesh &mesh, uint n1, uint n2, uint n3, uint nb1, uint oldt, uint ndx)\n{\n  uint3 ele;\n  if(counterclockwise(mesh.nodex[n1], mesh.nodey[n1], \n\t\t      mesh.nodex[n2], mesh.nodey[n2],\n\t\t      mesh.nodex[n3], mesh.nodey[n3]) > 0)\n    {\n      ele.x = n1; ele.y = n2; ele.z = n3;\n    }\n  else\n    {\n      ele.x = n3; ele.y = n2; ele.z = n1;\n    }\n\n  //uint ndx = atomicAdd(&mesh.nelements, 1);\n  assert(ndx < mesh.maxnelements);\n\n  mesh.isbad[ndx] = false;\n  mesh.isdel[ndx] = false;\n  mesh.elements[ndx] = ele;\n  mesh.neighbours[ndx].x = nb1;\n  mesh.neighbours[ndx].y = mesh.neighbours[ndx].z = INVALIDID;\n  //check_is_bad(mesh, ndx);\n\n  uint3 *nb = &mesh.neighbours[nb1];\n\n  if(mesh.neighbours[nb1].x == oldt)\n    nb->x = ndx;\n  else {\n    if(mesh.neighbours[nb1].y == oldt)\n      nb->y = ndx;\n    else\n      {\n\tif(mesh.neighbours[nb1].z != oldt)\n\t  printf(\"%u %u %u %u %u %u\\n\", ndx, oldt, nb1, mesh.neighbours[nb1].x, \n\t\t mesh.neighbours[nb1].y, mesh.neighbours[nb1].z);\n\n\tassert(mesh.neighbours[nb1].z == 
oldt);\n\tnb->z = ndx;\n      }\n  }\n\n  // if(mesh.neighbours[nb1].x == oldt)\n  //   cub::ThreadStore<cub::STORE_CG>(&mesh.neighbours[nb1].x, ndx);\n  // else {\n  //   if(mesh.neighbours[nb1].y == oldt)\n  //     cub::ThreadStore<cub::STORE_CG>(&mesh.neighbours[nb1].y, ndx);\n  //   else\n  //     {\n  // \tassert(mesh.neighbours[nb1].z == oldt);\n  // \tcub::ThreadStore<cub::STORE_CG>(&mesh.neighbours[nb1].z, ndx);\n  //     }\n  // }\n\n  return ndx;\n}\n\n__device__ bool adjacent(uint3 &elem1, uint3 &elem2)\n{\n  int sc = 0;\n  if(elem1.x == elem2.x || elem1.x == elem2.y || elem1.x == elem2.z)\n    sc++;\n\n  if(elem1.y == elem2.x || elem1.y == elem2.y || elem1.y == elem2.z)\n    sc++;\n\n  if(!IS_SEGMENT(elem1) && (elem1.z == elem2.x || elem1.z == elem2.y || elem1.z == elem2.z))\n    sc++;\n\n  return sc == 2;\n}\n__device__ void find_shared_edge(uint3 &elem1, uint3 &elem2, uint se[2])\n{\n  int sc = 0;\n  if(elem1.x == elem2.x || elem1.x == elem2.y || elem1.x == elem2.z)\n    se[sc++] = elem1.x;\n\n  if(elem1.y == elem2.x || elem1.y == elem2.y || elem1.y == elem2.z)\n    se[sc++] = elem1.y;\n\n  if(!IS_SEGMENT(elem1) && (elem1.z == elem2.x || elem1.z == elem2.y || elem1.z == elem2.z))\n    se[sc++] = elem1.z;\n\n  assert(sc == 2);\n  assert(se[0] != INVALIDID);\n  assert(se[1] != INVALIDID);\n}\n\n__device__ bool build_cavity(Mesh &mesh, uint cavity[], uint &cavlen, int max_cavity, uint boundary[], uint &boundarylen, FORD &cx, FORD &cy)\n{\n  int ce = 0;\n  //FORD cx, cy;\n  uint3 ele = LOADCG(&mesh.elements[cavity[0]]);\n  bool is_seg = false;\n\n  if(IS_SEGMENT(ele))\n    {\n      cx = (mesh.nodex[ele.x] + mesh.nodex[ele.y]) / 2;\n      cy = (mesh.nodey[ele.x] + mesh.nodey[ele.y]) / 2;\n    }\n  else\n    {\n      circumcenter(mesh.nodex[ele.x], mesh.nodey[ele.x],\n\t\t   mesh.nodex[ele.y], mesh.nodey[ele.y],\n\t\t   mesh.nodex[ele.z], mesh.nodey[ele.z],\n\t\t   cx, cy);\n    }\n\n  if(debug) printf(\"highlight %d %d [%f %f]\\n\", cavity[0], 
IS_SEGMENT(ele), cx, cy);\n  while (ce < cavlen) {\n    if(mesh.isdel[cavity[ce]])\n      printf(\"deleted: %d\\n\", cavity[ce]);\n\n    assert(cavlen < max_cavity);\n    assert(!mesh.isdel[cavity[ce]]);\n\n\n    uint3 neighbours = LOADCG(&mesh.neighbours[cavity[ce]]);\n    uint neighb[3] = {neighbours.x, neighbours.y, neighbours.z};\n\n    for(int i = 0; i < 3; i++) {\n      if(neighb[i] == cavity[0])\n\tcontinue;\n\n      if(neighb[i] == INVALIDID)\n\tcontinue;\n\n      //printf(\"neigbour %d\\n\", neighb[i]);\n\n      is_seg  = false;\n      if(!(IS_SEGMENT(ele) && IS_SEGMENT(LOADCG(&mesh.elements[neighb[i]]))) && \n\t encroached(mesh, neighb[i], ele, cx, cy, is_seg)) {\n\tif(!is_seg)\n\t  add_to_cavity(cavity, cavlen, neighb[i]);\n\telse {\n\t  assert(!IS_SEGMENT(ele));\n\t  cavity[0] = neighb[i];\n\t  cavlen = 1;\n\t  boundarylen = 0;\n\t  return false;\n\t}\n      } else {\n\tuint se[2];\n\tif(!adjacent(mesh.elements[cavity[ce]], mesh.elements[neighb[i]]))\n\t  {\n\t    dump_mesh_element(mesh, cavity[ce]);\n\t    dump_mesh_element(mesh, neighb[i]);\n\t    printf(\"%d %d\\n\", cavity[ce], neighb[i]);\n\t  }\n\n\tassert(boundarylen < BCLEN);\n\tfind_shared_edge(mesh.elements[cavity[ce]], mesh.elements[neighb[i]], se);\n\tadd_to_boundary(boundary, boundarylen, se[0], se[1], neighb[i], cavity[ce]);\n      }\n    }\n    ce++;\n  }\n\n  return true;\n}\n\n__device__ void addneighbour(Mesh &mesh, uint3 &neigh, uint elem)\n{\n  // TODO\n  if(neigh.x == elem || neigh.y == elem || neigh.z == elem) return;\n\n  assert(neigh.x == INVALIDID || neigh.y == INVALIDID || neigh.z == INVALIDID);\n\n  if(neigh.x == INVALIDID) { neigh.x = elem; return; }\n  if(neigh.y == INVALIDID) { neigh.y = elem; return; }\n  if(neigh.z == INVALIDID) { neigh.z = elem; return; }\n}\n\n__device__ void setup_neighbours(Mesh &mesh, uint start, uint end)\n{\n  // relies on all neighbours being in start--end\n  for(uint i = start; i < end; i++) {\n    uint3 &neigh = mesh.neighbours[i];\n\n    
for(uint j = i+1; j < end; j++) {\n      if(adjacent(mesh.elements[i], mesh.elements[j]))\n\t{\n\t  addneighbour(mesh, neigh, j);\n\t  addneighbour(mesh, mesh.neighbours[j], i);\n\t}\n    }    \n  }\n}\n\n__device__ uint opposite(Mesh &mesh, uint element)\n{\n  bool obtuse = false;\n  int obNode = INVALIDID;\n  uint3 el = mesh.elements[element];\n\n  if(IS_SEGMENT(el))\n    return element;\n\n  // figure out obtuse node\n  if(angleOB(mesh, el.x, el.y, el.z)) {\n    obtuse = true;\n    obNode = el.z;\n  } else {\n    if(angleOB(mesh, el.z, el.x, el.y)) {\n      obtuse = true;\n      obNode = el.y;\n    } else {\n      if(angleOB(mesh, el.y, el.z, el.x)) {\n\tobtuse = true;\n\tobNode = el.x;\n      }\n    }\n  }\n\n  if(obtuse) {\n    // find the neighbour that shares an edge whose points do not include obNode\n    uint se_nodes[2];\n    uint nobneigh;\n\n    uint3 neigh = mesh.neighbours[element];\n\n    if(debug) printf(\"obtuse node [%f %f]\\n\", mesh.nodex[obNode], mesh.nodey[obNode]);\n    assert(neigh.x != INVALIDID && neigh.y != INVALIDID && neigh.z != INVALIDID);\n\n    nobneigh = neigh.x;\n    find_shared_edge(el, mesh.elements[neigh.x], se_nodes);\n    if(se_nodes[0] == obNode || se_nodes[1] == obNode) {\n      nobneigh = neigh.y;\n      find_shared_edge(el, mesh.elements[neigh.y], se_nodes);\n      if(se_nodes[0] == obNode || se_nodes[1] == obNode) {\n\tnobneigh = neigh.z;\n      }\n    }\n\n    return nobneigh;\n  }\n\n  return element;\n}\n\n\n__global__\nvoid refine_orig(Mesh mesh, int debg, uint *nnodes, uint *nelements, Worklist2 wl, Worklist2 owl, ExclusiveLocks _ex, GlobalBarrier gb)\n{\n  int id = threadIdx.x + blockDim.x * blockIdx.x;\n  int threads = blockDim.x * gridDim.x;\n  int ele, eleit, haselem;\n  //int debg = 32;\n  uint cavity[CAVLEN], nc = 0; // for now\n  uint boundary[BCLEN], bc = 0;\n  uint ulimit = ((*wl.dindex + threads - 1) / threads) * threads;\n  bool repush = false;\n  //typedef cub::BlockScan<int, REFINE_BLKSIZE> BlockScan;\n  
const int perthread = ulimit / threads;\n  int stage = 0;\n  int x = 0;\n\n  for(eleit = id * perthread; eleit < (id * perthread + perthread) && eleit < ulimit; eleit++, x++)\n    {\n      haselem = wl.pop_id(eleit, ele);\n\n      //printf(\"%d:%d:%d:%d\\n\", x, id, eleit, ele);\n      FORD cx, cy;\n      nc = 0;\n      bc = 0;\n      repush = false;\n      stage = 0;\n\n      if(haselem && ele < mesh.nelements && mesh.isbad[ele] && !mesh.isdel[ele])\n\t{\n\t  cavity[nc++] = ele;\n\n\t  if(debug) {\n\t    printf(\"original center element \");\n\t    dump_mesh_element(mesh, cavity[0]);\n\t  }\n\n\t  uint oldcav;\n\t  do {\n\t    oldcav = cavity[0];\n\t    cavity[0] = opposite(mesh, ele);\n\t  } while(cavity[0] != oldcav);\n\n\t  if(!build_cavity(mesh, cavity, nc, CAVLEN, boundary, bc, cx, cy))\n\t    build_cavity(mesh, cavity, nc, CAVLEN, boundary, bc, cx, cy);\n\n\t  if(debug) {\n\t    printf(\"center element [%f %f] \", cx, cy);\n\t    dump_mesh_element(mesh, cavity[0]);\n\t    printf(\"pre-graph %d\\n\", nc);\n\t    for(int i = 1; i < nc; i++)\n\t      {\n\t\tdump_mesh_element(mesh, cavity[i]);\n\t      }\n\t    printf(\"boundary-edges %d\\n\", bc);\n\t    for(int i = 0; i < bc; i+=4) {\t\n\t      printf(\"[%f %f %f %f]\\n\", mesh.nodex[boundary[i]], mesh.nodey[boundary[i]],\n\t\t     mesh.nodex[boundary[i+1]], mesh.nodey[boundary[i+1]]);\n\t      dump_mesh_element(mesh, boundary[i+2]);\n\t    }\n\t  }\n\n\t  // try to claim ownership\n\t  for(int i = 0; i < nc; i++)\n\t    STORECG(&mesh.owners[cavity[i]], id);\n\n\t  for(int i = 0; i < bc; i+=4)\n\t    STORECG(&mesh.owners[boundary[i + 2]], id);\n\n\t  stage = 1;\n\t}\n\n      gb.Sync();\n\n      if(stage == 1)\n\t{\n\t  // check for conflicts\n\t  for(int i = 0; i < nc; i++) {\n\t    if(LOADCG(&mesh.owners[cavity[i]]) != id)\n\t      atomicMin((int *) &mesh.owners[cavity[i]], id);\n\t  }\n\n\t  for(int i = 0; i < bc; i+=4) {\n\t    if(LOADCG(&mesh.owners[boundary[i + 2]]) != id)\n\t      atomicMin((int *) 
&mesh.owners[boundary[i + 2]], id);\n\t  }\n\n\t  stage = 2;\n\n\t}\n\n      gb.Sync();\n\n      int nodes_added = 0;\n      int elems_added = 0;\n      if(stage == 2)\n\t{\n\t  int i;\n\t  for(i = 0; i < nc; i++)\n\t    if(LOADCG(&mesh.owners[cavity[i]]) != id) {\n\t      repush = true;\n\t      if(debug) printf(\"%d conflict\\n\", ele);\n\t      //printf(\"%d: %d owned by %d\\n\", id, cavity[i], mesh.owners[cavity[i]]);\n\t      break;\n\t    }\n\n\t  if(!repush)\n\t    for(i = 0; i < bc; i+=4)\n\t      if(LOADCG(&mesh.owners[boundary[i + 2]]) != id) {\n\t\trepush = true;\n\t\tif(debug) printf(\"%d conflict\\n\", ele);\n\t\t//printf(\"%d: %d owned by %d\\n\", id, boundary[i + 2], mesh.owners[boundary[i + 2]]);\n\t\tbreak;\n\t      }\n\n\t  // if(!repush)\n\t  //   {\n\t  //     for(int i = 0; i < nc; i++)\n\t  // \tprintf(\"%d:%d:%d\\n\", x, id, cavity[i]);\n\n\t  //     for(int i = 0; i < bc; i+=4)\n\t  // \tprintf(\"%d:%d:%d\\n\", x, id, boundary[i + 2]);\n\t  //   }\n\n\t  if(!repush)\n\t    {\n\t      stage = 3;\n\n\t      nodes_added = 1;\n\t      elems_added = (bc >> 2) + (IS_SEGMENT(mesh.elements[cavity[0]]) ? 
2 : 0);\n\t    }\n\t}\n\n      // __syncthreads();\n      // typedef cub::WarpScan<int, 1, 32> WarpScan;\n      // __shared__ typename WarpScan::TempStorage temp_storage;\n      // int total = 0, offset = elems_added;\n      // __shared__ int start;\n      // WarpScan(temp_storage).ExclusiveSum(elems_added, offset, total);\n      // __syncthreads();\n      // if((threadIdx.x & 31) == 0 && total > 0) {\n      // \t start = atomicAdd(nelements, total);\n      // }\n      // __syncthreads();\n      // if(total > 0)\n      // \t printf(\"total: %d %d %d\\n\", threadIdx.x, id, total);\n\n      // if(elems_added > 0)\n      // \t printf(\"ea: %d %d\\n\", id, elems_added);\n\n      if(stage == 3)\n\t{\n\t  uint cnode = add_node(mesh, cx, cy, atomicAdd(nnodes, 1));\n\t  uint cseg1 = 0, cseg2 = 0;\n\n\t  uint nelements_added = elems_added;\n\t  //printf(\"start: %d %d %d %d %d\\n\", id, elems_added, start, offset, start+offset);\n\t  uint oldelements = atomicAdd(nelements, nelements_added);\n\n\t  uint newelemndx = oldelements;\n\t  if(debug) printf(\"post-graph\\n\");\n\t  if(IS_SEGMENT(mesh.elements[cavity[0]]))\n\t    {\n\t      cseg1 = add_segment(mesh, mesh.elements[cavity[0]].x, cnode, newelemndx++);\n\t      cseg2 = add_segment(mesh, cnode, mesh.elements[cavity[0]].y, newelemndx++);\n\t      if(debug) {\n\t\tdump_mesh_element(mesh, cseg1);\n\t\tdump_mesh_element(mesh, cseg2);\n\t      }\n\t    }\n\n\t  for(int i = 0; i < bc; i+=4) {\n\t    uint ntri  = add_triangle(mesh, boundary[i], boundary[i+1], cnode, boundary[i+2], boundary[i+3], \n\t\t\t\t      newelemndx++);\n\t    //if(mesh.isbad[ntri])\n\t    //{\n\t    //printf(\"puhsing %d\\n\", ntri);\n\t    //owl.push(ntri);\n\t    //}\n\n\t    //printf(\"%d wrote %d\\n\", id, ntri);\n\t    if(debug) dump_mesh_element(mesh, ntri);\n\t  }\n\n\t  assert(oldelements + nelements_added == newelemndx);\n\n\t  setup_neighbours(mesh, oldelements, newelemndx);\n\n\t  repush = true;\n\t  for(int i = 0; i < nc; i++)\n\t    {\n\t    
  mesh.isdel[cavity[i]] = true;\n\t      // if the resulting cavity does not contain the original triangle\n\t      // (because of the opposite() routine, add it back.\n\t      if(cavity[i] == ele) repush = false;  \n\t      //printf(\"%d: deleting %d\\n\", id, cavity[i]);\n\t    }\n\n\t  if(debug) printf(\"update over\\n\");\n\t  //if(debg-- == 0) break;      \n\t}\n\n      //owl.push_1item<BlockScan>((repush ? 1 : 0), ele, 512);\n      if(repush) owl.push(ele);\n      gb.Sync();\n    }\n}\n\n__global__\nvoid check_triangles_orig(Mesh mesh, unsigned int * bad_triangles, int start, Worklist2 in_wl, Worklist2 wl)\n{\n  int id = threadIdx.x + blockDim.x * blockIdx.x;\n  int threads = blockDim.x * gridDim.x;\n  int ele;\n  uint3 *el;\n  int count = 0;\n  int ulimit = mesh.nelements; //start + ((mesh.nelements - start + blockDim.x - 1) / blockDim.x) * blockDim.x;\n  bool push;\n\n  if(debug && id == 0)\n    printf(\"start %d nelements %d\\n\", start, mesh.nelements);\n\n  for(ele = id + start; ele < ulimit; ele += threads)\n    {\n      push = false;\n\n      if(ele < mesh.nelements) {\n\tif(mesh.isdel[ele])\n\t  goto next;\n\n\tif(IS_SEGMENT(mesh.elements[ele]))\n\t  goto next;\n\n\tif(!mesh.isbad[ele])\n\t  {\n\t    el = &mesh.elements[ele];\n\t    \n\t    mesh.isbad[ele] = (angleLT(mesh, el->x, el->y, el->z) \n\t\t\t       || angleLT(mesh, el->z, el->x, el->y) \n\t\t\t       || angleLT(mesh, el->y, el->z, el->x));\n\t  }\n\n\tif(mesh.isbad[ele])\n\t  {\n\t    push = true;\n\t    count++;\n\t  }\n      }\n\n    next:\n      //wl.push_1item<BlockScan>(push ? 
1: 0, ele, 384); // slower than push?\n      if(push) wl.push(ele); // not really as bad as it looks\n    }\n\n  //TODO: replace with warp wide and then block wide\n  atomicAdd(bad_triangles, count);\n}\n\nvoid refine_mesh_orig(ShMesh &mesh, dim3 &blocks, dim3 &threads)\n{\n  ExclusiveLocks refine_ex_locks(mesh.maxnelements);\n  Shared<uint> nbad(1);\n  Mesh gmesh(mesh);\n  Shared<uint> nelements(1), nnodes(1);\n  int cnbad;\n  GlobalBarrierLifetime gb;\n  Worklist2 wl1(mesh.nelements), wl2(mesh.nelements);\n  const int REFINE_BLKSIZE = threads.x;\n  const size_t RES_REFINE = maximum_residency(refine_orig, REFINE_BLKSIZE, 0); \n  const int nSM = 14;\n\n  if(debug) printf(\"Running at residency %d.\\n\", RES_REFINE);\n\n  find_neighbours_cpu(mesh);\n  //dump_neighbours(mesh);\n  gmesh.refresh(mesh);\n\n  gb.Setup(nSM * RES_REFINE);\n\n  *(nelements.cpu_wr_ptr(true)) = mesh.nelements;\n  *(nnodes.cpu_wr_ptr(true)) = mesh.nnodes;\n\n  //double starttime, endtime;\n  int lastnelements = 0;\n  Worklist2 *inwl = &wl1, *outwl = &wl2;\n\n  *(nbad.cpu_wr_ptr(true)) = 0;\n  printf(\"checking triangles...\\n\");\n  check_triangles_orig<<<nSM, threads.x>>>(gmesh, nbad.gpu_wr_ptr(), 0, *outwl, *inwl); \n  cnbad = *(nbad.cpu_rd_ptr());\n  printf(\"%d initial bad triangles\\n\", cnbad);\n\n  while(cnbad) {   \n    if(debug) inwl->display_items();\n    lastnelements = gmesh.nelements;\n\n    refine_orig<<<nSM * RES_REFINE, REFINE_BLKSIZE>>>(gmesh, 32, nnodes.gpu_wr_ptr(), nelements.gpu_wr_ptr(), *inwl, *outwl, refine_ex_locks, gb);\n    CUDA_SAFE_CALL(cudaDeviceSynchronize()); // not needed\n    printf(\"refine over\\n\");\n    gmesh.nnodes = mesh.nnodes = *(nnodes.cpu_rd_ptr());\n    gmesh.nelements = mesh.nelements = *(nelements.cpu_rd_ptr());\n    if(debug) printf(\"out elements: %d\\n\", outwl->nitems());\n\n    std::swap(inwl, outwl);\n    outwl->reset();\n\n    *(nbad.cpu_wr_ptr(true)) = 0;\n    printf(\"checking triangles...\\n\");\n    // need to check only new triangles\n 
   //inwl->reset();\n    check_triangles_orig<<<nSM, threads.x>>>(gmesh, nbad.gpu_wr_ptr(), lastnelements, *outwl, *inwl); \n    //cnbad = *(nbad.cpu_rd_ptr());\n \n    cnbad = inwl->nitems();\n    printf(\"%u bad triangles.\\n\", cnbad);\n\n    if(debug) {\n      debug_isbad(*inwl, mesh);\n      gmesh.refresh(mesh);\n    }\n    if(cnbad == 0) \n      break;\n  }\n  CUDA_SAFE_CALL(cudaDeviceSynchronize());\n\n  *(nbad.cpu_wr_ptr(true)) = 0;\n  check_triangles_orig<<<nSM, threads.x>>>(gmesh, nbad.gpu_wr_ptr(), 0, *outwl, *inwl); \n  cnbad = *(nbad.cpu_rd_ptr());\n  printf(\"%d final bad triangles\\n\", cnbad);\n  //printf(\"time: %f ms\\n\", (endtime - starttime) * 1000);\n  //printf(\"\\truntime [dmr] = %f ms\\n\", (endtime - starttime) * 1000);\n}\n"
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/geomprim.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n#pragma once\n\n__device__ FORD distanceSquare(FORD onex, FORD oney, FORD twox, FORD twoy) {\n  FORD dx  = onex - twox;\n  FORD dy  = oney - twoy;\n  FORD dsq = dx * dx + dy * dy;\n  return dsq;\n}\n\n__device__ FORD distanceSquare(unsigned one, unsigned two, FORD* nodex,\n                               FORD* nodey) {\n  return distanceSquare(nodex[one], nodey[one], nodex[two], nodey[two]);\n}\n\n__device__ bool angleOB(Mesh& mesh, unsigned a, unsigned b, unsigned c) {\n  FORD vax = mesh.nodex[a] - mesh.nodex[c];\n  FORD vay = mesh.nodey[a] - mesh.nodey[c];\n  FORD vbx = mesh.nodex[b] - mesh.nodex[c];\n  FORD vby = mesh.nodey[b] - mesh.nodey[c];\n  FORD dp  = vax * vbx + vay * vby; // dot-product?\n\n  if (dp < 0)\n    return true;\n\n  return false;\n}\n__device__ bool angleLT(Mesh& mesh, unsigned a, unsigned b, 
unsigned c) {\n  FORD vax = mesh.nodex[a] - mesh.nodex[c];\n  FORD vay = mesh.nodey[a] - mesh.nodey[c];\n  FORD vbx = mesh.nodex[b] - mesh.nodex[c];\n  FORD vby = mesh.nodey[b] - mesh.nodey[c];\n  FORD dp  = vax * vbx + vay * vby; // dot-product?\n\n  if (dp < 0) {\n    // id is obtuse at point ii.\n    return false;\n  } else {\n    FORD dsqaacurr = distanceSquare(a, c, mesh.nodex, mesh.nodey);\n    FORD dsqbbcurr = distanceSquare(b, c, mesh.nodex, mesh.nodey);\n    FORD c         = dp * rsqrtf(dsqaacurr * dsqbbcurr);\n    if (c > cos(MINANGLE * (PI / 180))) {\n      return true;\n    }\n  }\n\n  return false;\n}\n\n// > 0, in circle, == 0, on circle, < 0, outside circle\n// assumes a, b, c are in counter-clockwise order\n// code from triangle\n__device__ FORD gincircle(FORD ax, FORD ay, FORD bx, FORD by, FORD cx, FORD cy,\n                          FORD px, FORD py) {\n  FORD apx, bpx, cpx, apy, bpy, cpy;\n  FORD bpxcpy, cpxbpy, cpxapy, apxcpy, apxbpy, bpxapy;\n  FORD alift, blift, clift, det;\n\n  apx = ax - px;\n  bpx = bx - px;\n  cpx = cx - px;\n\n  apy = ay - py;\n  bpy = by - py;\n  cpy = cy - py;\n\n  bpxcpy = bpx * cpy;\n  cpxbpy = cpx * bpy;\n  alift  = apx * apx + apy * apy;\n\n  cpxapy = cpx * apy;\n  apxcpy = apx * cpy;\n  blift  = bpx * bpx + bpy * bpy;\n\n  apxbpy = apx * bpy;\n  bpxapy = bpx * apy;\n  clift  = cpx * cpx + cpy * cpy;\n\n  det = alift * (bpxcpy - cpxbpy) + blift * (cpxapy - apxcpy) +\n        clift * (apxbpy - bpxapy);\n\n  return det;\n}\n\n__device__ FORD counterclockwise(FORD pax, FORD pay, FORD pbx, FORD pby,\n                                 FORD pcx, FORD pcy) {\n  FORD detleft, detright, det;\n\n  detleft  = (pax - pcx) * (pby - pcy);\n  detright = (pay - pcy) * (pbx - pcx);\n  det      = detleft - detright;\n\n  return det;\n}\n\n__device__ void circumcenter(FORD Ax, FORD Ay, FORD Bx, FORD By, FORD Cx,\n                             FORD Cy, FORD& CCx, FORD& CCy) {\n  FORD D;\n\n  D = 2 * (Ax * (By - Cy) + Bx * (Cy - Ay) + Cx 
* (Ay - By));\n\n  CCx = ((Ax * Ax + Ay * Ay) * (By - Cy) + (Bx * Bx + By * By) * (Cy - Ay) +\n         (Cx * Cx + Cy * Cy) * (Ay - By)) /\n        D;\n  CCy = ((Ax * Ax + Ay * Ay) * (Cx - Bx) + (Bx * Bx + By * By) * (Ax - Cx) +\n         (Cx * Cx + Cy * Cy) * (Bx - Ax)) /\n        D;\n}\n"
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/main.inc",
    "content": "/* -*- mode: c++ -*- */\nint main(int argc, char *argv[]) {\n  ShMesh mesh;\n  int maxfactor = 20;\n  int mesh_nodes, mesh_elements;\n  dim3 blocks(14*8), threads(TB_SIZE);\n  int no_output = 0;\n\n  if(argc == 1)\n    {\n      printf(\"Usage: %s basefile [maxfactor] [--no]\\n\", argv[0]);\n      exit(0);\n    }\n\n  if(argc >= 3)\n    {\n      if(strcmp(argv[2], \"--no\") == 0)\n\tno_output = 1;\n      else\n\tmaxfactor = atoi(argv[2]);\n    }\n\n  if(argc == 4)\n    {\n      if(strcmp(argv[3], \"--no\") == 0)\n\tno_output = 1;\n    }\n\n  read_mesh(argv[1], mesh, maxfactor);\n  mesh_nodes = mesh.nnodes; mesh_elements = mesh.ntriangles + mesh.nsegments;\n\n  refine_mesh(mesh, blocks, threads);\n  printf(\"%f increase in number of elements (maxfactor hint)\\n\", 1.0 * mesh.nelements / mesh_elements);\n  printf(\"%f increase in number of nodes (maxfactor hint)\\n\", 1.0 * mesh.nnodes / mesh_nodes);\n\n  verify_mesh(mesh);\n  if(!no_output)\n    write_mesh(argv[1], mesh);\n  else\n    printf(\"Not writing output.\\n\");\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/scientific/gpu/delaunayrefinement/meshfiles.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n#pragma once\n#include \"dmr.h\"\n\nvoid write_mesh(std::string infile, ShMesh& mesh) {\n  FORD *nodex, *nodey;\n\n  nodex = mesh.nodex.cpu_rd_ptr();\n  nodey = mesh.nodey.cpu_rd_ptr();\n\n  unsigned slash = infile.rfind(\"/\");\n  std::cout << \"  -- \" << infile.substr(slash + 1) + \".out.node (\"\n            << mesh.nnodes << \" nodes)\" << std::endl;\n  std::ofstream outfilenode((infile.substr(slash + 1) + \".out.node\").c_str());\n  outfilenode.precision(17);\n  outfilenode << mesh.nnodes << \" 2 0 0\\n\";\n  for (size_t ii = 0; ii < mesh.nnodes; ++ii) {\n    outfilenode << ii << \" \" << nodex[ii] << \" \" << nodey[ii] << \"\\n\";\n  }\n  outfilenode.close();\n\n  uint3* elements = mesh.elements.cpu_rd_ptr();\n  bool* isdel     = mesh.isdel.cpu_rd_ptr();\n\n  unsigned ntriangles2 = mesh.nelements;\n  unsigned 
segmentcnt  = 0;\n  for (size_t ii = 0; ii < mesh.nelements; ++ii) {\n    if (IS_SEGMENT(elements[ii]) || isdel[ii])\n      ntriangles2--;\n    if (IS_SEGMENT(elements[ii]) && !isdel[ii])\n      segmentcnt++;\n  }\n\n  std::cout << \"  -- \" << infile.substr(slash + 1) + \".out.ele (\" << ntriangles2\n            << \" triangles)\" << std::endl;\n  std::ofstream outfileele((infile.substr(slash + 1) + \".out.ele\").c_str());\n\n  outfileele << ntriangles2 << \" 3 0\\n\";\n  size_t kk = 0;\n  for (size_t ii = 0; ii < mesh.nelements; ++ii) {\n    if (!IS_SEGMENT(elements[ii]) && !isdel[ii])\n      outfileele << kk++ << \" \" << elements[ii].x << \" \" << elements[ii].y\n                 << \" \" << elements[ii].z << \"\\n\";\n  }\n  outfileele.close();\n\n  std::cout << \"  -- \" << infile.substr(slash + 1) + \".out.poly (\" << segmentcnt\n            << \" segments)\" << std::endl;\n  std::ofstream outfilepoly((infile.substr(slash + 1) + \".out.poly\").c_str());\n  outfilepoly << \"0 2 0 1\\n\";\n  outfilepoly << segmentcnt << \" 0\\n\";\n  kk = 0;\n  for (size_t ii = 0; ii < mesh.nelements; ++ii) {\n    if (IS_SEGMENT(elements[ii]) && !isdel[ii])\n      outfilepoly << kk++ << \" \" << elements[ii].x << \" \" << elements[ii].y\n                  << \"\\n\";\n  }\n  outfilepoly << \"0\\n\";\n  outfilepoly.close();\n\n  std::cout << (ntriangles2 + segmentcnt) << \" active elements of \"\n            << mesh.nelements << \" total elements (\"\n            << mesh.nelements / (ntriangles2 + segmentcnt) << \"x) \"\n            << std::endl;\n  std::cout << 1.0 * mesh.maxnelements / mesh.nelements\n            << \" ratio of used to free elements.\" << std::endl;\n}\n\nvoid next_line(std::ifstream& scanner) {\n  scanner.ignore(std::numeric_limits<std::streamsize>::max(), '\\n');\n}\n\nvoid readNodes(std::string filename, ShMesh& mesh, int maxfactor = 2) {\n  size_t index;\n  FORD x, y;\n  bool firstindex = true;\n\n  std::ifstream 
scanner(filename.append(\".node\").c_str());\n  if (!scanner) {\n    fprintf(stderr, \"Unable to open file '%s'\\n\", filename.c_str());\n    exit(1);\n  }\n  scanner >> mesh.nnodes;\n\n  mesh.maxnnodes = (maxfactor / MAX_NNODES_TO_NELEMENTS) * mesh.nnodes;\n  printf(\"memory for nodes: %d MB\\n\",\n         mesh.maxnnodes * sizeof(FORD) * 2 / 1048576);\n  mesh.nodex = Shared<FORD>(mesh.maxnnodes);\n  mesh.nodey = Shared<FORD>(mesh.maxnnodes);\n\n  FORD* nodex = mesh.nodex.cpu_wr_ptr(true);\n  FORD* nodey = mesh.nodey.cpu_wr_ptr(true);\n\n  for (size_t i = 0; i < mesh.nnodes; i++) {\n    next_line(scanner);\n    scanner >> index >> x >> y;\n    if (firstindex) {\n      assert(index == 0);\n      firstindex = false;\n    }\n\n    nodex[index] = x;\n    nodey[index] = y;\n  }\n}\n\nvoid readTriangles(std::string basename, ShMesh& mesh, int maxfactor = 2) {\n  unsigned ntriangles, nsegments;\n  unsigned i, index, n1, n2, n3;\n  bool firstindex = true;\n  std::string filename;\n\n  filename = basename;\n  std::ifstream scanner(filename.append(\".ele\").c_str());\n  if (!scanner) {\n    fprintf(stderr, \"Unable to open file '%s'\\n\", filename.c_str());\n    exit(1);\n  }\n\n  scanner >> ntriangles;\n\n  filename = basename;\n  std::ifstream scannerperimeter(filename.append(\".poly\").c_str());\n  scannerperimeter >> nsegments; // first line is number of nodes\n  assert(nsegments == 0);        // standard triangle format, nodes == 0\n  next_line(scannerperimeter);\n  scannerperimeter >> nsegments; // number of segments\n\n  mesh.ntriangles   = ntriangles;\n  mesh.nsegments    = nsegments;\n  mesh.nelements    = ntriangles + nsegments;\n  mesh.maxnelements = maxfactor * mesh.nelements;\n\n  printf(\"memory for elements: %d MB\\n\",\n         mesh.maxnelements * (sizeof(uint3) * 2 + sizeof(bool) * 2) / 1048576);\n  mesh.elements.alloc(mesh.maxnelements);\n  mesh.isdel.alloc(mesh.maxnelements);\n  mesh.isbad.alloc(mesh.maxnelements);\n  
mesh.neighbours.alloc(mesh.maxnelements);\n\n  uint3* elements = mesh.elements.cpu_wr_ptr(true);\n  bool *isdel     = mesh.isdel.cpu_wr_ptr(true),\n       *isbad     = mesh.isbad.cpu_wr_ptr(true);\n  for (i = 0; i < ntriangles; i++) {\n    next_line(scanner);\n    scanner >> index >> n1 >> n2 >> n3;\n    if (firstindex) {\n      assert(index == 0);\n      firstindex = false;\n    }\n\n    elements[index].x = n1;\n    elements[index].y = n2;\n    elements[index].z = n3;\n    isdel[index] = isbad[index] = false;\n  }\n\n  firstindex = true;\n  for (i = 0; i < nsegments; i++) {\n    next_line(scannerperimeter);\n    scannerperimeter >> index >> n1 >> n2;\n    if (firstindex) {\n      assert(index == 0);\n      firstindex = false;\n    }\n\n    elements[index + ntriangles].x = n1;\n    elements[index + ntriangles].y = n2;\n    elements[index + ntriangles].z = INVALIDID;\n    isdel[index] = isbad[index] = false;\n  }\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/CMakeLists.txt",
    "content": "function(app name)\n  add_executable(${name} ${ARGN})\n  add_dependencies(apps ${name})\n  target_link_libraries(${name} PRIVATE Galois::shmem lonestar)\n  install(TARGETS ${name} DESTINATION \"${CMAKE_INSTALL_BINDIR}\" COMPONENT apps EXCLUDE_FROM_ALL)\nendfunction()\n\napp(example-sssp-push-simple SSSPPushSimple.cpp)\napp(example-sssp-simple SSSPsimple.cpp)\napp(example-sssp-pull-simple SSSPPullSimple.cpp)\napp(example-hello-world HelloWorld.cpp)\napp(example-torus Torus.cpp)\napp(example-torus-improved TorusImproved.cpp)\napp(example-torus-construction TorusConstruction.cpp)\napp(example-graph-traversal-serial GraphTraversalSerial.cpp)\napp(example-graph-traversal-pull GraphTraversalPullOperator.cpp)\napp(example-graph-traversal-push GraphTraversalPushOperator.cpp)\napp(example-conflict-aware-torus ConflictAwareTorus.cpp)\napp(example-spanningtree SpanningTree.cpp)\napp(example-countlevels CountLevels.cpp)\napp(example-malloc ThirdPartyMalloc.cpp)\napp(example-wrapped-worklist ExampleWrappedWorklist.cpp)\n\nadd_test_scale(small example-hello-world 2 10)\nadd_test_scale(small example-torus 2 100)\nadd_test_scale(small example-torus-improved 2 100)\nadd_test_scale(small1 example-spanningtree \"${BASEINPUT}/reference/structured/rome99.gr\")\nadd_test_scale(small2 example-spanningtree \"${BASEINPUT}/scalefree/rmat10.gr\")\n"
  },
  {
    "path": "lonestar/tutorial_examples/ConflictAwareTorus.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// This example shows\n// 1. how to bulid a conflict-aware data structure w/ Locakable\n// 2. how to implement conflict detection in your data structure's APIs\n// 3. how to define iterators for STL compliance\n// 4. how to leverage LargeArray to do NUMA-aware allocation\n// 5. how to turn-off conflict-detection when you do not want it\n#include \"galois/Galois.h\"\n#include \"galois/LargeArray.h\"\n\n#include <boost/iterator/counting_iterator.hpp>\n\n#include <iostream>\n\ntemplate <typename T>\nclass Torus2D {\n  //! 
[Internal type with Lockable]\n  //************************************************************************\n  // internal type to combine user data with Lockable object\n  //************************************************************************\n  struct NodeData : public galois::runtime::Lockable {\n  public:\n    using reference = T&;\n\n  public:\n    T v;\n\n  public:\n    reference getData() { return v; }\n  };\n  //! [Internal type with Lockable]\n\n  //! [Array of internal type]\n  size_t numRows, numCols;\n\n  // use galois::LargeArray for NUMA-aware allocation\n  // will allocate numRows*numCols elements in constructors\n  galois::LargeArray<NodeData> data;\n  //! [Array of internal type]\n\n  //! [Types for STL]\n  //************************************************************************\n  // subtypes visible to user\n  //************************************************************************\npublic:\n  // opaque type for node\n  using TorusNode = size_t;\n\n  // iterator for an STL container\n  using iterator = boost::counting_iterator<TorusNode>;\n  //! [Types for STL]\n\npublic:\n  //************************************************************************\n  // constructor for the torus\n  //************************************************************************\n  Torus2D(size_t r, size_t c) : numRows(r), numCols(c) {\n    // allocate torus nodes in an interleaved way among NUMA domains\n    data.allocateInterleaved(r * c);\n\n    // call constructor for each torus node\n    for (size_t n = 0; n < r * c; ++n) {\n      data.constructAt(n);\n    }\n  }\n\n  //! [APIs for sizes]\n  //************************************************************************\n  // functions for size of the torus\n  //************************************************************************\n  size_t height() { return numRows; }\n  size_t width() { return numCols; }\n  size_t size() { return width() * height(); }\n  //! [APIs for sizes]\n\n  //! 
[Iterators]\n  //************************************************************************\n  // functions to traverse nodes\n  //************************************************************************\n  iterator begin() { return iterator(0); }\n  iterator end() { return iterator(size()); }\n  //! [Iterators]\n\n  //! [Acquire node ownership]\n  //************************************************************************\n  // functions to acquire node ownership\n  //************************************************************************\n  void acquireNode(TorusNode n,\n                   galois::MethodFlag mflag = galois::MethodFlag::WRITE) {\n    // sanity check\n    assert(n < size());\n\n    // use this call to detect conflicts and handling aborts\n    galois::runtime::acquire(&data[n], mflag);\n  }\n  //! [Acquire node ownership]\n\n  //! [Get data]\n  //************************************************************************\n  // function to access node data\n  //************************************************************************\n  typename NodeData::reference\n  getData(TorusNode n, galois::MethodFlag mflag = galois::MethodFlag::WRITE) {\n    acquireNode(n, mflag);\n\n    // use the internal wrapper type to encapsulate users from Lockable objects\n    return data[n].getData();\n  }\n  //! [Get data]\n\n  //! [Easy operator cautiousness]\n  //************************************************************************\n  // functions to access neighboring nodes, i.e. 
edges in a general graph\n  //************************************************************************\n  iterator upNeighbor(TorusNode n) {\n    auto r = n / numCols, c = n % numCols;\n    auto newR = (r + numRows - 1) % numRows;\n    return iterator(newR * numCols + c);\n  }\n\n  iterator downNeighbor(TorusNode n) {\n    auto r = n / numCols, c = n % numCols;\n    auto newR = (r + 1) % numRows;\n    return iterator(newR * numCols + c);\n  }\n\n  iterator leftNeighbor(TorusNode n) {\n    auto r = n / numCols, c = n % numCols;\n    auto newC = (c + numCols - 1) % numCols;\n    return iterator(r * numCols + newC);\n  }\n\n  iterator rightNeighbor(TorusNode n) {\n    auto r = n / numCols, c = n % numCols;\n    auto newC = (c + 1) % numCols;\n    return iterator(r * numCols + newC);\n  }\n\n  //************************************************************************\n  // function to lock all neighbors of node n\n  // similar to edge_begin(), edge_end() or edges() in a general graph\n  //************************************************************************\n  void\n  acquireAllNeighbors(TorusNode n,\n                      galois::MethodFlag mflag = galois::MethodFlag::WRITE) {\n    acquireNode(*upNeighbor(n), mflag);\n    acquireNode(*downNeighbor(n), mflag);\n    acquireNode(*leftNeighbor(n), mflag);\n    acquireNode(*rightNeighbor(n), mflag);\n  }\n  //! [Easy operator cautiousness]\n}; // end of class Torus2D\n\nint main(int argc, char* argv[]) {\n  galois::SharedMemSys G;\n\n  if (argc < 4) {\n    std::cerr << \"Usage: \" << argv[0]\n              << \" <num_rows> <num_columns> <num_threads>\" << std::endl;\n    return 1;\n  }\n\n  galois::setActiveThreads(std::atoi(argv[3]));\n\n  //! 
[Use torus]\n  using Torus     = Torus2D<unsigned int>;\n  using TorusNode = Torus::TorusNode;\n\n  Torus torus(std::atoi(argv[1]), std::atoi(argv[2]));\n\n  galois::do_all(\n      galois::iterate(size_t{0},\n                      torus.size()), // range as a pair of unsigned integers\n      [&](TorusNode n) { torus.getData(n) = 0; } // operator\n      ,\n      galois::loopname(\"do_all_torus_reset_self\") // options\n  );\n\n  galois::for_each(\n      galois::iterate(\n          torus), // range as a container. assuming begin() and end()\n      [&](TorusNode n, auto&) { // operator\n        // cautious point\n        torus.acquireAllNeighbors(n);\n\n        torus.getData(*torus.upNeighbor(n)) += 1;\n        torus.getData(*torus.downNeighbor(n)) += 1;\n        torus.getData(*torus.leftNeighbor(n)) += 1;\n        torus.getData(*torus.rightNeighbor(n)) += 1;\n      },\n      galois::loopname(\"for_each_torus_add_neighbors\") // options\n      ,\n      galois::no_pushes());\n  //! [Use torus]\n\n  //! [Turn off conflict detection]\n  // serial verification, no conflict is possible\n  size_t numWrongAnswer = 0;\n  for (auto n : torus) {\n    // use galois::MethodFlag::UNPROTECTED to tell the Galois runtime\n    // not to acquire a lock for this call\n    if (torus.getData(n, galois::MethodFlag::UNPROTECTED) != 4) {\n      numWrongAnswer++;\n    }\n  }\n  std::cout << \"# nodes of wrong answer: \" << numWrongAnswer << std::endl;\n  //! [Turn off conflict detection]\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/CountLevels.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"Lonestar/BoilerPlate.h\"\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/LCGraph.h\"\n\nstatic const char* name = \"Count levels\";\nstatic const char* desc = \"Computes the number of degree levels\";\n\n#define DEBUG false\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input graph>\"), cll::Required);\nstatic cll::opt<unsigned int> startNode(\"startNode\",\n                                        cll::desc(\"Node to start search from\"),\n                                        cll::init(0));\n\nenum COLOR { WHITE, GRAY, BLACK };\n\nstruct LNode {\n  uint32_t dist;\n  COLOR color;\n};\n\nusing Graph = galois::graphs::LC_CSR_Graph<LNode, void>::with_numa_alloc<\n    true>::type 
::with_no_lockable<true>::type;\nusing GNode = Graph::GraphNode;\n\nstatic const unsigned int DIST_INFINITY =\n    std::numeric_limits<unsigned int>::max();\n\nconst galois::gstl::Vector<size_t>& countLevels(Graph& graph) {\n\n  using Vec = galois::gstl::Vector<size_t>;\n\n  //! [Define GReducible]\n  auto merge = [](Vec& lhs, Vec&& rhs) -> Vec& {\n    Vec v(std::move(rhs));\n    if (lhs.size() < v.size()) {\n      lhs.resize(v.size());\n    }\n    auto ll = lhs.begin();\n    for (auto ii = v.begin(), ei = v.end(); ii != ei; ++ii, ++ll) {\n      *ll += *ii;\n    }\n    return lhs;\n  };\n\n  auto identity = []() -> Vec { return Vec(); };\n\n  auto r = galois::make_reducible(merge, identity);\n\n  galois::do_all(galois::iterate(graph), [&](GNode n) {\n    LNode srcdata = graph.getData(n);\n    if (srcdata.dist == DIST_INFINITY) {\n      return;\n    }\n\n    auto& vec = r.getLocal();\n    if (vec.size() <= srcdata.dist) {\n      vec.resize(srcdata.dist + 1);\n    }\n    vec[srcdata.dist] += 1;\n  });\n\n  return r.reduce();\n  //! 
[Define GReducible]\n}\n\nvoid bfsSerial(Graph& graph, GNode source) {\n  constexpr galois::MethodFlag flag = galois::MethodFlag::UNPROTECTED;\n\n  LNode& sdata = graph.getData(source, flag);\n  sdata.dist   = 0u;\n  sdata.color  = GRAY;\n\n  std::queue<GNode> queue;\n  queue.push(source);\n\n  while (!queue.empty()) {\n    GNode curr = queue.front();\n    sdata      = graph.getData(curr, flag);\n    queue.pop();\n\n    // iterate over edges from node n\n    for (auto e : graph.edges(curr)) {\n      GNode dst    = graph.getEdgeDst(e);\n      LNode& ddata = graph.getData(dst);\n\n      if (ddata.color == WHITE) {\n        ddata.color = GRAY;\n        ddata.dist  = sdata.dist + 1;\n        queue.push(dst);\n      }\n    }\n    sdata.color = BLACK;\n  } // end while\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, nullptr, &inputFile);\n\n  galois::StatTimer OT(\"OverheadTime\");\n  OT.start();\n\n  Graph graph;\n  galois::graphs::readGraph(graph, inputFile);\n  std::cout << \"Read \" << graph.size() << \" nodes, \" << graph.sizeEdges()\n            << \" edges\\n\";\n\n  galois::preAlloc(5 * numThreads +\n                   (2 * graph.size() * sizeof(typename Graph::node_data_type)) /\n                       galois::runtime::pagePoolSize());\n  galois::reportPageAlloc(\"MeminfoPre\");\n\n  galois::do_all(\n      galois::iterate(graph),\n      [&](const GNode& src) {\n        LNode& sdata = graph.getData(src);\n        sdata.color  = WHITE;\n        sdata.dist   = DIST_INFINITY;\n      },\n      galois::no_stats());\n\n  if (startNode >= graph.size()) {\n    std::cerr << \"Source node index \" << startNode\n              << \" is greater than the graph size\" << graph.size()\n              << \", failed to set source: \" << startNode << \"\\n\";\n    assert(0);\n    abort();\n  }\n  GNode source;\n  auto it = graph.begin();\n  std::advance(it, startNode.getValue());\n  source = *it;\n\n  galois::StatTimer T;\n 
 T.start();\n  bfsSerial(graph, source);\n  const auto& counts = countLevels(graph);\n  T.stop();\n\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n#if DEBUG\n  for (auto n : graph) {\n    LNode& data = graph.getData(n);\n    std::cout << \"Node: \" << n << \" BFS dist:\" << data.dist << std::endl;\n  }\n#endif\n\n  std::cout << \"Number of BFS levels: \" << counts.size() << \"\\n\";\n\n  OT.stop();\n\n  return EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/ExampleWrappedWorklist.cpp",
    "content": "#include \"galois/Galois.h\"\n#include \"galois/Bag.h\"\n#include \"galois/UserContext.h\"\n#include \"galois/substrate/PerThreadStorage.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <iostream>\n#include <fstream>\n\nclass ExampleWrappedWorklist {\nprivate:\n  galois::InsertBag<int> bag;\n  galois::substrate::PerThreadStorage<galois::UserContext<int>*> ctxPtr;\n  bool inParallelPhase;\n\nprivate:\n  void reset() {\n    bag.clear();\n    for (unsigned i = 0; i < ctxPtr.size(); i++) {\n      *(ctxPtr.getRemote(i)) = nullptr;\n    }\n  }\n\npublic:\n  ExampleWrappedWorklist() : inParallelPhase(false) { reset(); }\n\n  void enqueue(int item) {\n    if (inParallelPhase) {\n      (*(ctxPtr.getLocal()))->push(item);\n    } else {\n      bag.push(item);\n    }\n  }\n\n  void execute() {\n    inParallelPhase = true;\n\n    galois::for_each(\n        galois::iterate(bag),\n        [&](int item, auto& ctx) {\n          if (nullptr == *(ctxPtr.getLocal())) {\n            *(ctxPtr.getLocal()) = &ctx;\n          }\n\n          std::cout << item << std::endl;\n\n          if (item < 2000) {\n            this->enqueue(item + item);\n          }\n        },\n        galois::loopname(\"execute\"), galois::disable_conflict_detection());\n\n    inParallelPhase = false;\n    reset();\n  }\n};\n\nint main(int argc, char* argv[]) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv);\n\n  ExampleWrappedWorklist q;\n  for (unsigned i = 0; i < galois::getActiveThreads(); i++) {\n    q.enqueue(i + 1);\n  }\n  q.execute();\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/GraphTraversalPullOperator.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// This example shows\n// 0. reading in a graph from a file\n// 1. serial iteration over nodes\n// 2. do_all iteration over nodes\n// 3. access to node and edge data\n// 4. usage of galois::StatTimer\n// 5. 
how to change # of threads\n#include \"galois/Galois.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/Timer.h\"\n#include <iostream>\n\nusing Graph = galois::graphs::LC_CSR_Graph<int, int>;\nusing GNode = Graph::GraphNode;\n\nint main(int argc, char* argv[]) {\n  galois::SharedMemSys G;\n\n  if (argc < 3) {\n    std::cerr << \"Usage: \" << argv[0] << \" filename num_threads\" << std::endl;\n    return 1;\n  }\n\n  Graph g;\n  galois::graphs::readGraph(g, argv[1]); // argv[1] is the file name for graph\n  galois::setActiveThreads(std::atoi(argv[2])); // argv[2] is # of threads\n\n  //******************************************************************************************\n  // serial traversal over a graph\n  // sum over nodes and edges in C++11 syntax\n  galois::StatTimer T(\"sum_serial\");\n  T.start();\n  for (auto n : g) {\n    auto& sum = g.getData(n);\n    sum       = 0;\n    for (auto e : g.edges(n)) {\n      sum += g.getEdgeData(e);\n    }\n  }\n  T.stop();\n\n  //*****************************************************************************************\n  // parallel traversal over a graph using galois::do_all w/o work stealing\n  // 1. operator is specified using lambda expression\n  // 2. do_all is named \"sum_in_do_all_with_lambda\" to show stat after this\n  // program finishes\n  //! [Graph traversal in pull using do_all]\n  galois::do_all(\n      galois::iterate(g.begin(), g.end()), // range\n      [&](GNode n) {                       // operator\n        auto& sum = g.getData(n);\n        sum       = 0;\n        for (auto e : g.edges(n)) {\n          sum += g.getEdgeData(e);\n        }\n      },\n      galois::loopname(\"sum_in_do_all_with_lambda\") // options\n  );\n  //! [Graph traversal in pull using do_all]\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/GraphTraversalPushOperator.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// This example shows\n// 0. reading in a graph from a file\n// 1. serial iteration over nodes\n// 2. for_each iteration over nodes\n// 3. access to node and edge data\n// 4. usage of galois::StatTimer\n// 5. how to change # of threads\n// 6. push-style operator using atomic intrinsics in do_all\n// 7. push-style operator using atomic intrinsics in for_each w/o conflict\n// detection\n#include \"galois/Galois.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/Timer.h\"\n#include <iostream>\n\nusing Graph = galois::graphs::LC_CSR_Graph<int, int>;\nusing GNode = Graph::GraphNode;\n\n//! [Initialization]\nvoid initialize(Graph& g) {\n  galois::do_all(galois::iterate(g.begin(), g.end()), // range\n                 [&](GNode n) { g.getData(n) = 0; }   // operator\n  );\n};\n//! 
[Initialization]\n\nint main(int argc, char* argv[]) {\n  galois::SharedMemSys G;\n\n  if (argc < 3) {\n    std::cerr << \"Usage: \" << argv[0] << \" filename num_threads\" << std::endl;\n    return 1;\n  }\n\n  Graph g;\n  galois::graphs::readGraph(g, argv[1]); // argv[1] is the file name for graph\n  galois::setActiveThreads(std::atoi(argv[2])); // argv[2] is # of threads\n\n  //******************************************************\n  // serial traversal over a graph\n  // sum over nodes and edges in C++11 syntax\n  galois::StatTimer T(\"sum_serial\");\n  T.start();\n  for (auto n : g) {\n    auto& sum = g.getData(n);\n    sum       = 0;\n    for (auto e : g.edges(n)) {\n      sum += g.getEdgeData(e);\n    }\n  }\n  T.stop();\n\n  //! [For each with conflict detection]\n  //******************************************************\n  // parallel traversal over a graph using galois::for_each\n  // 1. push operator is specified using lambda expression\n  // 2. for_each is named \"sum_in_for_each_with_push_operator\" to show stat\n  // after this program finishes\n  initialize(g);\n  galois::for_each(\n      galois::iterate(g.begin(), g.end()), // range\n      [&](GNode n, auto&) {                // operator\n        for (auto e : g.edges(n)) {        // cautious point\n          auto dst = g.getEdgeDst(e);\n          g.getData(dst) += g.getEdgeData(e);\n        }\n      },\n      galois::loopname(\"sum_in_for_each_with_push_operator\") // options\n  );\n  //! [For each with conflict detection]\n\n  //! 
[For each and do all without conflict detection]\n  // define lambda expression as a variable for reuse\n  auto sumEdgeWeightsAtomically = [&](GNode n) {\n    for (auto e : g.edges(n)) {\n      auto dst        = g.getEdgeDst(e);\n      auto& dstData   = g.getData(dst);\n      auto edgeWeight = g.getEdgeData(e);\n      __sync_fetch_and_add(&dstData, edgeWeight);\n    }\n  };\n\n  //******************************************************\n  // parallel traversal over a graph using galois::do_all w/o work stealing\n  // 1. push operator uses atomic intrinsic\n  // 2. do_all is named \"sum_in_do_all_with_push_atomic\" to show stat after this\n  // program finishes\n  initialize(g);\n  galois::do_all(galois::iterate(g.begin(), g.end()), // range\n                 sumEdgeWeightsAtomically             // operator\n                 ,\n                 galois::loopname(\"sum_in_do_all_with_push_atomic\") // options\n  );\n\n  //******************************************************\n  // parallel traversal over a graph using galois::for_each\n  // 1. push operator uses atomic intrinsic\n  // 2. for_each is named \"sum_in_for_each_with_push_atomic\" to show stat\n  // after this program finishes\n  initialize(g);\n  galois::for_each(\n      galois::iterate(g.begin(), g.end()),                 // range\n      [&](GNode n, auto&) { sumEdgeWeightsAtomically(n); } // operator\n      ,\n      galois::loopname(\"sum_in_for_each_with_push_atomic\") // options\n      ,\n      galois::no_pushes(), galois::disable_conflict_detection());\n  //! [For each and do all without conflict detection]\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/GraphTraversalSerial.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// This example shows\n// 0. reading in a graph from a file\n// 1. serial iteration over nodes\n// 2. access to node and edge data\n// 3. usage of galois::StatTimer\n#include \"galois/Galois.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/Timer.h\"\n#include <iostream>\n\nint main(int argc, char* argv[]) {\n  galois::SharedMemSys G;\n\n  if (argc < 2) {\n    std::cerr << \"Usage: \" << argv[0] << \" filename\" << std::endl;\n    return 1;\n  }\n\n  //! [Define LC_CSR_Graph]\n  // An LC_CSR_Graph whose node data type is int and edge data type is int\n  using Graph = galois::graphs::LC_CSR_Graph<int, int>;\n  //! [Define LC_CSR_Graph]\n\n  //! [Read a graph]\n  Graph g;\n  galois::graphs::readGraph(g, argv[1]); // argv[1] is the file name for graph\n  //! [Read a graph]\n\n  //! 
[use of a StatTimer]\n  //******************************************************\n  // serial traversal over a graph\n  // sum over nodes and edges in C++11 syntax\n  galois::StatTimer T(\"sum_serial\");\n  T.start();\n  //! [Graph traversal]\n  // iterate over nodes\n  for (auto n : g) {\n    auto& sum = g.getData(n); // get node data of n\n    sum       = 0;\n    // iterate over edges from node n\n    for (auto e : g.edges(n)) {\n      sum += g.getEdgeData(e); // get edge data of e\n    }\n  }\n  //! [Graph traversal]\n  T.stop();\n  //! [use of a StatTimer]\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/HelloWorld.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include <boost/iterator/counting_iterator.hpp>\n#include <iostream>\n\n//! 
[do_all example]\nstruct HelloWorld {\n  void operator()(int i) const { std::cout << \"Hello \" << i << \"\\n\"; }\n};\n\nvoid helloWorld(int i) { std::cout << \"Hello \" << i << \"\\n\"; }\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n\n  if (argc < 3) {\n    std::cerr << \"<num threads> <num of iterations>\\n\";\n    return 1;\n  }\n  unsigned int numThreads = atoi(argv[1]);\n  int n                   = atoi(argv[2]);\n\n  numThreads = galois::setActiveThreads(numThreads);\n  std::cout << \"Using \" << numThreads << \" threads and \" << n\n            << \" iterations\\n\";\n\n  std::cout << \"Using a lambda\\n\";\n  galois::do_all(galois::iterate(0, n),\n                 [](int i) { std::cout << \"Hello \" << i << \"\\n\"; });\n\n  std::cout << \"Using a function object\\n\";\n  galois::do_all(galois::iterate(0, n), HelloWorld());\n\n  std::cout << \"Using a function pointer (discouraged)\\n\";\n  galois::do_all(galois::iterate(0, n), &helloWorld);\n\n  //! [do_all example]\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/SSSPPullSimple.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Timer.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"llvm/Support/CommandLine.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n//! [Define LC Graph]\ntypedef galois::graphs::LC_Linear_Graph<unsigned int, unsigned int> Graph;\n//! [Define LC Graph]\ntypedef Graph::GraphNode GNode;\ntypedef std::pair<unsigned, GNode> UpdateRequest;\n\nstatic const unsigned int DIST_INFINITY =\n    std::numeric_limits<unsigned int>::max();\n\nconstexpr unsigned stepShift = 14;\nGraph graph;\n\nnamespace cll = llvm::cl;\nstatic cll::opt<std::string> filename(cll::Positional,\n                                      cll::desc(\"<input file>\"), cll::Required);\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv);\n\n  //! 
[ReadGraph]\n  galois::graphs::readGraph(graph, filename);\n  //! [ReadGraph]\n\n  galois::do_all(galois::iterate(graph),\n                 [&](GNode n) { graph.getData(n) = DIST_INFINITY; });\n\n  //! [OrderedByIntegerMetic in SSSPsimple]\n  auto reqIndexer = [](const UpdateRequest& req) {\n    return (req.first >> stepShift);\n  };\n\n  using namespace galois::worklists;\n  typedef PerSocketChunkLIFO<16> PSchunk;\n  typedef OrderedByIntegerMetric<decltype(reqIndexer), PSchunk> OBIM;\n  //! [OrderedByIntegerMetic in SSSPPullsimple]\n\n  galois::StatTimer T;\n  T.start();\n  graph.getData(*graph.begin()) = 0;\n  //! [for_each in SSSPPullsimple]\n  std::vector<UpdateRequest> init;\n  init.reserve(std::distance(graph.edge_begin(*graph.begin()),\n                             graph.edge_end(*graph.begin())));\n  for (auto ii : graph.edges(*graph.begin()))\n    init.push_back(std::make_pair(0, graph.getEdgeDst(ii)));\n\n  galois::for_each(\n      galois::iterate(init.begin(), init.end()),\n      [&](const UpdateRequest& req, auto& ctx) {\n        GNode active_node = req.second;\n        unsigned& data    = graph.getData(active_node);\n        unsigned newValue = data;\n\n        //![loop over neighbors to compute new value]\n        for (auto ii : graph.edges(active_node)) {\n          GNode dst = graph.getEdgeDst(ii);\n          newValue =\n              std::min(newValue, graph.getData(dst) + graph.getEdgeData(ii));\n        }\n        //![set new value and add neighbors to worklist]\n        if (newValue < data) {\n          data = newValue;\n          for (auto ii : graph.edges(active_node)) {\n            GNode dst = graph.getEdgeDst(ii);\n            if (graph.getData(dst) > newValue)\n              ctx.push(std::make_pair(newValue, dst));\n          }\n        }\n      },\n      galois::wl<OBIM>(reqIndexer), galois::loopname(\"sssp_run_loop\"));\n  //! [for_each in SSSPPullsimple]\n  T.stop();\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/SSSPPushSimple.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// This examples shows\n// 1. how to pass a range for data-driven algorithms\n// 2. how to add new work items using context\n// 3. how to specify schedulers\n// 4. 
how to write an indexer for OBIM\n#include \"galois/Timer.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/LCGraph.h\"\n\n#include <iostream>\n#include <string>\n\nusing Graph = galois::graphs::LC_Linear_Graph<unsigned int, unsigned int>;\nusing GNode = Graph::GraphNode;\nusing UpdateRequest = std::pair<unsigned, GNode>;\n\nstatic const unsigned int DIST_INFINITY =\n    std::numeric_limits<unsigned int>::max();\n\nconstexpr unsigned int stepShift = 14;\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  galois::setActiveThreads(256); // Galois will cap at hw max\n\n  if (argc != 3) {\n    std::cout << \"Usage: \" << argv[0]\n              << \" filename <dchunk16|obim|ParaMeter|det>\\n\";\n    return 1;\n  } else {\n    std::cout << \"Note: This is just a very simple example and provides no \"\n                 \"useful information for performance\\n\";\n  }\n\n  Graph graph;\n  galois::graphs::readGraph(graph,\n                            argv[1]); // argv[1] is the file name for graph\n\n  // initialization\n  galois::do_all(galois::iterate(graph),\n                 [&graph](GNode N) {\n                   graph.getData(N) = DIST_INFINITY;\n                 } // operator as lambda expression\n  );\n\n  galois::StatTimer T;\n  T.start();\n\n  //! [SSSP push operator]\n  // SSSP operator\n  // auto& ctx expands to galois::UserContext<GNode>& ctx\n  auto SSSP = [&](GNode active_node, auto& ctx) {\n    // Get the value on the node\n    auto srcData = graph.getData(active_node);\n\n    // loop over neighbors to compute new value\n    for (auto ii : graph.edges(active_node)) { // cautious point\n      auto dst      = graph.getEdgeDst(ii);\n      auto weight   = graph.getEdgeData(ii);\n      auto& dstData = graph.getData(dst);\n      if (dstData > weight + srcData) {\n        dstData = weight + srcData;\n        ctx.push(dst); // add new work items\n      }\n    }\n  };\n  //! [SSSP push operator]\n\n  //! 
[Scheduler examples]\n  // Priority Function in SSSPPushSimple\n  // Map user-defined priority to a bucket number in OBIM\n  auto reqIndexer = [&](const GNode& N) {\n    return (graph.getData(N, galois::MethodFlag::UNPROTECTED) >> stepShift);\n  };\n\n  using namespace galois::worklists;\n  using PSchunk = PerSocketChunkLIFO<16>; // chunk size 16\n  using OBIM    = OrderedByIntegerMetric<decltype(reqIndexer), PSchunk>;\n  //! [Scheduler examples]\n\n  //! [Data-driven loops]\n  std::string schedule = argv[2]; // argv[2] is the scheduler to be used\n\n  // clear source\n  graph.getData(*graph.begin()) = 0;\n\n  if (\"dchunk16\" == schedule) {\n    //! [chunk worklist]\n    galois::for_each(\n        galois::iterate(\n            {*graph.begin()}), // initial range using initializer list\n        SSSP                   // operator\n        ,\n        galois::wl<PSchunk>() // options. PSchunk expands to\n                              // galois::worklists::PerSocketChunkLIFO<16>,\n                              // where 16 is chunk size\n        ,\n        galois::loopname(\"sssp_dchunk16\"));\n    //! [chunk worklist]\n  } else if (\"obim\" == schedule) {\n    //! [OBIM]\n    galois::for_each(\n        galois::iterate(\n            {*graph.begin()}), // initial range using initializer list\n        SSSP                   // operator\n        ,\n        galois::wl<OBIM>(reqIndexer) // options. Pass an indexer instance for\n                                     // OBIM construction.\n        ,\n        galois::loopname(\"sssp_obim\"));\n    //! [OBIM]\n  }\n  //! [Data-driven loops]\n\n  else if (\"ParaMeter\" == schedule) {\n    //! [ParaMeter loop iterator]\n    galois::for_each(\n        galois::iterate(\n            {*graph.begin()}), // initial range using initializer list\n        SSSP                   // operator\n        ,\n        galois::wl<galois::worklists::ParaMeter<>>() // options\n        ,\n        galois::loopname(\"sssp_ParaMeter\"));\n    //! 
[ParaMeter loop iterator]\n  } else if (\"det\" == schedule) {\n    //! [Deterministic loop iterator]\n    galois::for_each(\n        galois::iterate(\n            {*graph.begin()}), // initial range using initializer list\n        SSSP                   // operator\n        ,\n        galois::wl<galois::worklists::Deterministic<>>() // options\n        ,\n        galois::loopname(\"sssp_deterministic\"));\n    //! [Deterministic loop iterator]\n  } else {\n    std::cerr << \"Unknown schedule \" << schedule << std::endl;\n    return 1;\n  }\n\n  T.stop();\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/SSSPsimple.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Timer.h\"\n#include \"galois/Galois.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"llvm/Support/CommandLine.h\"\n#include \"Lonestar/BoilerPlate.h\"\n\n//! [Define LC Graph]\ntypedef galois::graphs::LC_Linear_Graph<unsigned int, unsigned int> Graph;\n//! 
[Define LC Graph]\ntypedef Graph::GraphNode GNode;\ntypedef std::pair<unsigned, GNode> UpdateRequest;\n\nstatic const unsigned int DIST_INFINITY =\n    std::numeric_limits<unsigned int>::max();\n\nunsigned stepShift = 14;\nGraph graph;\n\nnamespace cll = llvm::cl;\nstatic cll::opt<std::string>\n    inputFile(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\n\ntemplate <typename C>\nvoid relax_edge(unsigned src_data, Graph::edge_iterator ii, C& ctx) {\n  GNode dst = graph.getEdgeDst(ii);\n  //![get edge and node data]\n  unsigned int edge_data = graph.getEdgeData(ii);\n  unsigned& dst_data     = graph.getData(dst);\n  //![get edge and node data]\n  unsigned int newDist = src_data + edge_data;\n  if (newDist < dst_data) {\n    dst_data = newDist;\n    ctx.push(std::make_pair(newDist, dst));\n  }\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv);\n\n  //! [ReadGraph]\n  galois::graphs::readGraph(graph, inputFile);\n  //! [ReadGraph]\n\n  galois::for_each(galois::iterate(graph),\n                   [&](GNode n, auto&) { graph.getData(n) = DIST_INFINITY; });\n\n  //! [OrderedByIntegerMetic in SSSPsimple]\n  auto reqIndexer = [](const UpdateRequest& req) {\n    return (req.first >> stepShift);\n  };\n\n  using namespace galois::worklists;\n  typedef PerSocketChunkLIFO<16> PSchunk;\n  typedef OrderedByIntegerMetric<decltype(reqIndexer), PSchunk> OBIM;\n  //! [OrderedByIntegerMetic in SSSPsimple]\n\n  galois::StatTimer T;\n  T.start();\n  graph.getData(*graph.begin()) = 0;\n  //! [for_each in SSSPsimple]\n  galois::for_each(\n      galois::iterate({std::make_pair(0U, *graph.begin())}),\n      //! 
[Operator in SSSPsimple]\n      [&](UpdateRequest& req, auto& ctx) {\n        GNode active_node = req.second;\n        unsigned& data    = graph.getData(active_node);\n        if (req.first > data)\n          return;\n        //![loop over neighbors]\n        for (auto ii : graph.edges(active_node))\n          relax_edge(data, ii, ctx);\n        //![loop over neighbors]\n      }\n      //! [Operator in SSSPsimple]\n      ,\n      galois::wl<OBIM>(reqIndexer), galois::loopname(\"sssp_run_loop\"));\n  //! [for_each in SSSPsimple]\n  T.stop();\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/SpanningTree.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Reduction.h\"\n#include \"galois/Bag.h\"\n#include \"galois/Timer.h\"\n#include \"galois/UnionFind.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/ParallelSTL.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n#include \"Lonestar/BoilerPlate.h\"\n\n#include <utility>\n#include <algorithm>\n#include <iostream>\n\nnamespace cll = llvm::cl;\n\nconst char* name = \"Spanning Tree Algorithm\";\nconst char* desc = \"Computes the spanning forest of a graph\";\n\nenum Algo { demo, asynchronous, blockedasync };\n\nstatic cll::opt<std::string>\n    inputFilename(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<Algo>\n    algo(\"algo\", cll::desc(\"Choose an algorithm:\"),\n         cll::values(clEnumVal(demo, \"Demonstration algorithm\"),\n   
                  clEnumVal(asynchronous, \"Asynchronous\"),\n                     clEnumVal(blockedasync, \"Blocked Asynchronous\")),\n         cll::init(blockedasync));\n\nstruct Node : public galois::UnionFindNode<Node> {\n  Node() : galois::UnionFindNode<Node>(const_cast<Node*>(this)) {}\n  Node* component() { return find(); }\n  void setComponent(Node* n) { m_component = n; }\n};\n\nstd::ostream& operator<<(std::ostream& os, const Node& n) {\n  os << \"[id: \" << &n << \"]\";\n  return os;\n}\n\ntypedef galois::graphs::LC_Linear_Graph<Node, void>::with_numa_alloc<true>::type\n    Graph;\ntypedef Graph::GraphNode GNode;\ntypedef std::pair<GNode, GNode> Edge;\n\nstruct BlockedWorkItem {\n  GNode src;\n  Graph::edge_iterator start;\n};\n\ntemplate <bool MakeContinuation, int Limit>\nauto specialized_process(Graph& graph, galois::InsertBag<Edge>& mst)\n    -> decltype(auto) {\n  return\n      [&](const GNode& src, const Graph::edge_iterator& start, auto& pusher) {\n        Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n        int count   = 1;\n        for (Graph::edge_iterator\n                 ii = start,\n                 ei = graph.edge_end(src, galois::MethodFlag::UNPROTECTED);\n             ii != ei; ++ii, ++count) {\n          GNode dst   = graph.getEdgeDst(ii);\n          Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n          if (sdata.merge(&ddata)) {\n            mst.push(std::make_pair(src, dst));\n            if (Limit == 0 || count != Limit) {\n              continue;\n            }\n          }\n\n          if (MakeContinuation || (Limit != 0 && count == Limit)) {\n            BlockedWorkItem item = {src, ii + 1};\n            pusher.push(item);\n            break;\n          }\n        }\n      };\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  LonestarStart(argc, argv, name, desc, nullptr, nullptr);\n\n  Graph graph;\n\n  galois::InsertBag<Edge> mst;\n\n  galois::StatTimer 
Tinitial(\"InitializeTime\");\n  Tinitial.start();\n  galois::graphs::readGraph(graph, inputFilename);\n  std::cout << \"Num nodes: \" << graph.size() << \"\\n\";\n  Tinitial.stop();\n\n  //! Normalize component by doing find with path compression\n  auto Normalize = [&](const GNode& src) {\n    Node& sdata = graph.getData(src, galois::MethodFlag::UNPROTECTED);\n    sdata.setComponent(sdata.findAndCompress());\n  };\n\n  // galois::preAlloc(numThreads + graph.size() /\n  // galois::runtime::MM::hugePageSize * 60);\n  galois::reportPageAlloc(\"MeminfoPre\");\n  galois::StatTimer T;\n  T.start();\n  switch (algo) {\n  /**\n   * Construct a spanning forest via a modified BFS algorithm. Intended as a\n   * simple introduction to the Galois system and not intended to particularly\n   * fast. Restrictions: graph must be strongly connected. In this case, the\n   * spanning tree is over the undirected graph created by making the directed\n   * graph symmetric.\n   */\n  case demo: {\n    Graph::iterator ii = graph.begin(), ei = graph.end();\n    if (ii != ei) {\n      Node* root = &graph.getData(*ii);\n      galois::for_each(\n          galois::iterate({*ii}),\n          [&](GNode src, auto& ctx) {\n            for (auto ii : graph.edges(src, galois::MethodFlag::WRITE)) {\n              GNode dst   = graph.getEdgeDst(ii);\n              Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n              if (ddata.component() == root)\n                continue;\n              ddata.setComponent(root);\n              mst.push(std::make_pair(src, dst));\n              ctx.push(dst);\n            }\n          },\n          galois::loopname(\"DemoAlgo\"),\n          galois::wl<galois::worklists::PerSocketChunkFIFO<32>>());\n    }\n  } break;\n\n  case asynchronous:\n    /**\n     * Like asynchronous connected components algorithm.\n     */\n    {\n      galois::do_all(\n          galois::iterate(graph),\n          [&](const GNode& src) {\n            Node& sdata = 
graph.getData(src, galois::MethodFlag::UNPROTECTED);\n            for (auto ii : graph.edges(src, galois::MethodFlag::UNPROTECTED)) {\n              GNode dst   = graph.getEdgeDst(ii);\n              Node& ddata = graph.getData(dst, galois::MethodFlag::UNPROTECTED);\n              if (sdata.merge(&ddata)) {\n                mst.push(std::make_pair(src, dst));\n              }\n            }\n          },\n          galois::loopname(\"Merge\"), galois::steal());\n      galois::do_all(galois::iterate(graph), Normalize,\n                     galois::loopname(\"Normalize\"));\n    }\n    break;\n\n  case blockedasync:\n    /**\n     * Improve performance of async algorithm by following machine topology.\n     */\n    {\n      galois::InsertBag<BlockedWorkItem> items;\n      galois::do_all(\n          galois::iterate(graph),\n          [&](const GNode& src) {\n            Graph::edge_iterator start =\n                graph.edge_begin(src, galois::MethodFlag::UNPROTECTED);\n            if (galois::substrate::ThreadPool::getSocket() == 0) {\n              specialized_process<true, 0>(graph, mst)(src, start, items);\n            } else {\n              specialized_process<true, 1>(graph, mst)(src, start, items);\n            }\n          },\n          galois::loopname(\"Initialize\"));\n      galois::for_each(\n          galois::iterate(items),\n          [&](const BlockedWorkItem& i, auto& ctx) {\n            specialized_process<true, 0>(graph, mst)(i.src, i.start, ctx);\n          },\n          galois::loopname(\"Merge\"), galois::disable_conflict_detection(),\n          galois::wl<galois::worklists::PerSocketChunkFIFO<128>>());\n      //! 
Normalize component by doing find with path compression\n      galois::do_all(galois::iterate(graph), Normalize,\n                     galois::loopname(\"Normalize\"));\n    }\n    break;\n\n  default:\n    std::cerr << \"Unknown algo: \" << algo << \"\\n\";\n  }\n  T.stop();\n  galois::reportPageAlloc(\"MeminfoPost\");\n\n  /* Verification Routines */\n  auto is_bad_graph = [&](const GNode& n) {\n    Node& me = graph.getData(n);\n    for (auto ii : graph.edges(n)) {\n      GNode dst  = graph.getEdgeDst(ii);\n      Node& data = graph.getData(dst);\n      if (me.component() != data.component()) {\n        std::cerr << \"not in same component: \" << me << \" and \" << data << \"\\n\";\n        return true;\n      }\n    }\n    return false;\n  };\n\n  auto is_bad_mst = [&](const Edge& e) {\n    return graph.getData(e.first).component() !=\n           graph.getData(e.second).component();\n  };\n\n  auto checkAcyclic = [&]() {\n    galois::GAccumulator<unsigned> roots;\n    galois::do_all(galois::iterate(graph), [&](const GNode& n) {\n      Node& data = graph.getData(n);\n      if (data.component() == &data)\n        roots += 1;\n    });\n    unsigned numRoots = roots.reduce();\n    unsigned numEdges = std::distance(mst.begin(), mst.end());\n    if (graph.size() - numRoots != numEdges) {\n      std::cerr << \"Generated graph is not a forest. 
\"\n                << \"Expected \" << graph.size() - numRoots << \" edges but \"\n                << \"found \" << numEdges << \"\\n\";\n      return false;\n    }\n    std::cout << \"Num trees: \" << numRoots << \"\\n\";\n    std::cout << \"Tree edges: \" << numEdges << \"\\n\";\n    return true;\n  };\n\n  auto verify = [&]() {\n    if (galois::ParallelSTL::find_if(graph.begin(), graph.end(),\n                                     is_bad_graph) == graph.end()) {\n      if (galois::ParallelSTL::find_if(mst.begin(), mst.end(), is_bad_mst) ==\n          mst.end()) {\n        return checkAcyclic();\n      }\n    }\n    return false;\n  };\n\n  if (!skipVerify && !verify()) {\n    std::cerr << \"verification failed\\n\";\n    assert(0 && \"verification failed\");\n    abort();\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/ThirdPartyMalloc.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// This example shows how to use galois::runtime::ExternalHeapAllocator\n// to wrap up 3rd-party allocators and use the wrapped heap for STL containers.\n#include \"galois/Galois.h\"\n#include \"galois/runtime/Mem.h\"\n\n#include <iostream>\n\nint main() {\n  galois::SharedMemSys G;\n\n  //! 
[heap wrapping example]\n  // Our 3rd-party heap\n  using RealHeap = galois::runtime::MallocHeap;\n\n  // Wrap RealHeap to conform to STL allocators\n  using WrappedHeap = galois::runtime::ExternalHeapAllocator<int, RealHeap>;\n\n  // Instantiate heaps\n  RealHeap externalHeap;\n  WrappedHeap heap(&externalHeap);\n\n  // Use the wrapped heap\n  std::vector<int, WrappedHeap> v(heap);\n  for (int i = 0; i < 5; i++) {\n    v.push_back(i);\n  }\n\n  std::cout << \"Use of a std::vector with a third-party allocator wrapped by \"\n               \"galois::runtime::ExternalHeapAllocator.\\n\";\n  for (auto& j : v) {\n    std::cout << j << std::endl;\n  }\n  //! [heap wrapping example]\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/Torus.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include <iostream>\n\n//! Graph has int node data, void edge data and is directed\n//! [define a graph]\ntypedef galois::graphs::MorphGraph<int, void, true> Graph;\n//! [define a graph]\n//! Opaque pointer to graph node\ntypedef Graph::GraphNode GNode;\n\n//! Construct a simple torus graph\nvoid constructTorus(Graph& g, int height, int width) {\n  // Construct set of nodes\n  int numNodes = height * width;\n  std::vector<GNode> nodes(numNodes);\n  for (int i = 0; i < numNodes; ++i) {\n    //! [create and add node]\n    GNode n = g.createNode(0);\n    g.addNode(n);\n    nodes[i] = n;\n    //! [create and add node]\n  }\n\n  //! 
[add edges]\n  for (int x = 0; x < width; ++x) {\n    for (int y = 0; y < height; ++y) {\n      GNode c = nodes[x * height + y];\n      GNode n = nodes[x * height + ((y + 1) % height)];\n      GNode s = nodes[x * height + ((y - 1 + height) % height)];\n      GNode e = nodes[((x + 1) % width) * height + y];\n      GNode w = nodes[((x - 1 + width) % width) * height + y];\n      g.addEdge(c, n);\n      g.addEdge(c, s);\n      g.addEdge(c, e);\n      g.addEdge(c, w);\n    }\n  }\n  //! [add edges]\n}\n\nvoid verify(Graph& graph, int n) {\n  // Verify\n  int count = std::count_if(graph.begin(), graph.end(), [&](GNode n) -> bool {\n    return graph.getData(n) == 4;\n  });\n  if (count != n * n) {\n    std::cerr << \"Expected \" << n * n << \" nodes with value = 4 but found \"\n              << count << \" instead.\\n\";\n  } else {\n    std::cout << \"Correct!\\n\";\n  }\n}\n\nvoid initialize(Graph& graph) {\n  galois::do_all(galois::iterate(graph),\n                 [&](GNode n) { graph.getData(n) = 0; });\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n\n  if (argc < 3) {\n    std::cerr << \"<num threads> <sqrt grid size>\\n\";\n    return 1;\n  }\n  unsigned int numThreads = atoi(argv[1]);\n  int N                   = atoi(argv[2]);\n\n  numThreads = galois::setActiveThreads(numThreads);\n  std::cout << \"Using \" << numThreads << \" thread(s) and \" << N << \" x \" << N\n            << \" torus\\n\";\n\n  Graph graph;\n  constructTorus(graph, N, N);\n\n  // read/write only a node itself\n  galois::do_all(\n      galois::iterate(graph),\n      [&](GNode n) {\n        graph.getData(n) =\n            std::distance(graph.edge_begin(n), graph.edge_end(n));\n      },\n      galois::loopname(\"do_all\"));\n  verify(graph, N);\n\n  // push operator with Galois synchronization\n  initialize(graph);\n  galois::for_each(\n      galois::iterate(graph),\n      [&](GNode n, auto&) {\n        for (auto ii : graph.edges(n)) {\n          GNode dst  = 
graph.getEdgeDst(ii);\n          auto& data = graph.getData(dst);\n          data += 1;\n        }\n      },\n      galois::loopname(\"for_each\"), galois::no_pushes());\n  verify(graph, N);\n\n  auto incrementNeighborsAtomically = [&](GNode n) {\n    for (auto e : graph.edges(n)) {\n      auto dst      = graph.getEdgeDst(e);\n      auto& dstData = graph.getData(dst);\n      __sync_fetch_and_add(&dstData, 1);\n    }\n  };\n\n  // push operator with self synchronization in do_all\n  initialize(graph);\n  //! [work stealing]\n  galois::do_all(galois::iterate(graph), incrementNeighborsAtomically,\n                 galois::loopname(\"do_all_self_sync\"), galois::steal(),\n                 galois::chunk_size<32>());\n  //! [work stealing]\n  verify(graph, N);\n\n  // push operator with self synchronization in optimized for_each\n  initialize(graph);\n  galois::for_each(\n      galois::iterate(graph),\n      [&](GNode n, auto&) { incrementNeighborsAtomically(n); },\n      galois::loopname(\"for_each_self_sync\"),\n      galois::disable_conflict_detection(), galois::no_pushes());\n  verify(graph, N);\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/TorusConstruction.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// This example shows how to manipulate MorphGraph to change graph topology\n// 1. createNode\n// 2. addNode\n// 3. addEdge\n#include \"galois/Galois.h\"\n#include \"galois/graphs/Graph.h\"\n#include <iostream>\n\n//! [Define a MorphGraph]\n// Graph has int node data, void edge data and is directed\nusing Graph = galois::graphs::MorphGraph<int, void, true>;\n// Opaque pointer to graph node\nusing GNode = Graph::GraphNode;\n//! [Define a MorphGraph]\n\n//! [Construct torus]\nvoid constructTorus(Graph& g, int height, int width) {\n  // Construct set of nodes\n  int numNodes = height * width;\n  std::vector<GNode> nodes(numNodes);\n  for (int i = 0; i < numNodes; ++i) {\n    GNode n = g.createNode(\n        0);       // allocate node data and initialize the node data with 0\n    g.addNode(n); // add n to g. 
from now on n can be located from g\n    nodes[i] = n;\n  }\n\n  // Add edges\n  for (int x = 0; x < width; ++x) {\n    for (int y = 0; y < height; ++y) {\n      GNode c = nodes[x * height + y];\n      GNode n = nodes[x * height + ((y + 1) % height)];\n      GNode s = nodes[x * height + ((y - 1 + height) % height)];\n      GNode e = nodes[((x + 1) % width) * height + y];\n      GNode w = nodes[((x - 1 + width) % width) * height + y];\n      g.addEdge(c, n); // addEdge checks if the edge exists or not. nop if so.\n      g.addEdge(c, s);\n      g.addEdge(c, e);\n      g.addEdge(c, w);\n    }\n  }\n}\n//! [Construct torus]\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n\n  if (argc < 2) {\n    std::cerr << \"<sqrt grid size>\\n\";\n    return 1;\n  }\n  int N = atoi(argv[1]);\n\n  Graph graph;\n  constructTorus(graph, N, N);\n\n  std::cout << \"Constructed a \" << N << \" x \" << N << \" torus.\" << std::endl;\n\n  return 0;\n}\n"
  },
  {
    "path": "lonestar/tutorial_examples/TorusImproved.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/Timer.h\"\n#include \"galois/graphs/Graph.h\"\n#include <iostream>\n\n//! Graph has int node data, void edge data and is directed\ntypedef galois::graphs::MorphGraph<int, void, true> Graph;\n//! Opaque pointer to graph node\ntypedef Graph::GraphNode GNode;\n\nclass Point2D {\n  int v[2];\n\npublic:\n  Point2D() : v{0, 0} {}\n  Point2D(int x, int y) : v{x, y} {}\n\n  const int& at(int i) const { return v[i]; }\n  const int& x() const { return v[0]; }\n  const int& y() const { return v[1]; }\n  int dim() const { return 2; }\n};\n\n//! 
Construct a simple torus graph\nvoid constructTorus(Graph& g, int height, int width) {\n  // Construct set of nodes\n  int numNodes = height * width;\n  std::vector<Point2D> points(numNodes);\n  for (int x = 0; x < width; ++x) {\n    for (int y = 0; y < height; ++y) {\n      points[x * height + y] = Point2D(x, y);\n    }\n  }\n\n  // Sort in a space-filling way\n  std::sort(points.begin(), points.end(),\n            /**\n             * Sort pairs according to Morton Z-Order.\n             *\n             * From http://en.wikipedia.org/wiki/Z-order_%28curve%29\n             */\n            [&](const Point2D& p1, const Point2D& p2) -> bool {\n              int index = 0;\n              int x     = 0;\n              for (int k = 0; k < p1.dim(); ++k) {\n                int y        = p1.at(k) ^ p2.at(k);\n                bool lessMsb = x < y && x < (x ^ y);\n                if (lessMsb) {\n                  index = k;\n                  x     = y;\n                }\n              }\n              return p1.at(index) - p2.at(index) < 0;\n            });\n\n  // Using space-filling order, assign nodes and create (and allocate) them in\n  // parallel\n  std::vector<GNode> nodes(numNodes);\n  galois::do_all(galois::iterate(points), [&](const Point2D& p) {\n    auto n = g.createNode(0);\n    g.addNode(n);\n    nodes[p.x() * height + p.y()] = n;\n  });\n\n  // Add edges\n  for (int x = 0; x < width; ++x) {\n    for (int y = 0; y < height; ++y) {\n      GNode c = nodes[x * height + y];\n      GNode n = nodes[x * height + ((y + 1) % height)];\n      GNode s = nodes[x * height + ((y - 1 + height) % height)];\n      GNode e = nodes[((x + 1) % width) * height + y];\n      GNode w = nodes[((x - 1 + width) % width) * height + y];\n      g.addEdge(c, n);\n      g.addEdge(c, s);\n      g.addEdge(c, e);\n      g.addEdge(c, w);\n    }\n  }\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n\n  if (argc < 3) {\n    std::cerr << \"<num threads> <sqrt grid size>\\n\";\n 
   return 1;\n  }\n  unsigned int numThreads = atoi(argv[1]);\n  int n                   = atoi(argv[2]);\n\n  GALOIS_ASSERT(n > 2);\n\n  numThreads = galois::setActiveThreads(numThreads);\n  std::cout << \"Using \" << numThreads << \" threads and \" << n << \" x \" << n\n            << \" torus\\n\";\n\n  Graph graph;\n  constructTorus(graph, n, n);\n\n  galois::Timer T;\n  T.start();\n\n  galois::for_each(galois::iterate(graph), [&](GNode n, auto&) {\n    // For each outgoing edge (n, dst)\n    for (auto ii : graph.edges(n)) {\n      GNode dst = graph.getEdgeDst(ii);\n      int& data = graph.getData(dst);\n      // Increment node data by 1\n      data += 1;\n    }\n  });\n  T.stop();\n\n  std::cout << \"Elapsed time: \" << T.get() << \" milliseconds\\n\";\n\n  // Verify\n  int count = std::count_if(graph.begin(), graph.end(), [&](GNode n) -> bool {\n    return graph.getData(n) == 4;\n  });\n  if (count != n * n) {\n    std::cerr << \"Expected \" << n * n << \" nodes with value = 4 but found \"\n              << count << \" instead.\\n\";\n    return 1;\n  } else {\n    std::cout << \"Correct!\\n\";\n  }\n\n  return 0;\n}\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = [\"setuptools\", \"wheel\", \"scikit-build\", \"cmake>=3.13\", \"cython\"]\n"
  },
  {
    "path": "python/CMakeLists.txt",
    "content": "cmake_minimum_required(VERSION 3.13)\n\nproject(pygalois)\n\nset(CMAKE_CXX_STANDARD 17)\nset(CMAKE_CXX_STANDARD_REQUIRED ON)\nset(CMAKE_CXX_EXTENSIONS OFF)\nset(CMAKE_POSITION_INDEPENDENT_CODE ON)\n\nfind_package(PythonExtensions REQUIRED)\nfind_package(Cython REQUIRED)\n\nadd_subdirectory(.. ${CMAKE_BINARY_DIR}/../cmake-galois-build EXCLUDE_FROM_ALL)\n\nadd_subdirectory(galois)\n"
  },
  {
    "path": "python/galois/CMakeLists.txt",
    "content": "include_directories(cpp)\n\nadd_cython_target(shmem shmem.pyx CXX OUTPUT_VAR GALOIS_SOURCES)\nadd_library(shmem MODULE ${GALOIS_SOURCES})\npython_extension_module(shmem)\ntarget_link_libraries(shmem Galois::shmem)\n\n# Avoid collisions with existing application targets with the same name.\nadd_cython_target(_bfs _bfs.pyx shmem.pyx CXX OUTPUT_VAR BFS_SOURCES)\nadd_library(_bfs MODULE ${BFS_SOURCES})\npython_extension_module(_bfs)\ntarget_link_libraries(_bfs Galois::shmem)\n\nadd_cython_target(_sssp _sssp.pyx shmem.pyx CXX OUTPUT_VAR SSSP_SOURCES)\nadd_library(_sssp MODULE ${SSSP_SOURCES})\npython_extension_module(_sssp)\ntarget_link_libraries(_sssp Galois::shmem)\n\nadd_cython_target(_pagerank _pagerank.pyx shmem.pyx CXX OUTPUT_VAR PAGERANK_SOURCES)\nadd_library(_pagerank MODULE ${PAGERANK_SOURCES})\npython_extension_module(_pagerank)\ntarget_link_libraries(_pagerank Galois::shmem)\n\nadd_cython_target(_connected_components _connected_components.pyx shmem.pyx CXX OUTPUT_VAR CONNECTEDCOMPONENTS_SOURCES)\nadd_library(_connected_components MODULE ${CONNECTEDCOMPONENTS_SOURCES})\npython_extension_module(_connected_components)\ntarget_link_libraries(_connected_components Galois::shmem)\n\ninstall(\n  TARGETS shmem _bfs _sssp _pagerank _connected_components\n  LIBRARY DESTINATION python/galois\n)\n\ninstall(\n  TARGETS galois_shmem \n  PUBLIC_HEADER DESTINATION include\n  ARCHIVE DESTINATION lib\n  INCLUDES DESTINATION include\n)\n\n# In order to preserve the directory structure, galois_shmem uses\n# install(DIRECTORY) rather than setting the PUBLIC_HEADER property on the\n# galois_shmem itself. Mirror the install(DIRECTORY) logic here.\nget_target_property(GALOIS_SOURCE_DIR galois_shmem SOURCE_DIR)\nget_target_property(GALOIS_BINARY_DIR galois_shmem BINARY_DIR)\ninstall(\n  DIRECTORY \"${GALOIS_SOURCE_DIR}/include/\" \"${GALOIS_BINARY_DIR}/include/\"\n  DESTINATION include\n  FILES_MATCHING PATTERN \"*.h\"\n)\n"
  },
  {
    "path": "python/galois/__init__.py",
    "content": "\n"
  },
  {
    "path": "python/galois/_bfs.pyx",
    "content": "# cython: cdivision = True\n\nfrom galois.shmem cimport *\nfrom cython.operator cimport preincrement, dereference as deref\n\nctypedef atomic[uint32_t] atomuint32_t\n\nctypedef LC_CSR_Graph[uint32_t, void, dummy_true] Graph_CSR\n\n# Cython bug: using a nested class from a previous typedef doesn't\n# work for the time being. Instead, the full template specialization\n# must be used to get the member type.\nctypedef LC_CSR_Graph[uint32_t, void, dummy_true].GraphNode GNodeCSR\n\ncdef void printValue(Graph_CSR *g):\n    cdef unsigned long numNodes = g[0].size()\n    cdef uint32_t *data\n    gPrint(b\"Number of nodes : \", numNodes, b\"\\n\")\n    for n in range(numNodes):\n        data = &g[0].getData(n)\n        gPrint(b\"\\t\", data[0], b\"\\n\")\n         \n##############################################################################\n## Bfs implementation\n###########################################################################\n#\n# Initialization for BFS\n#\ncdef void Initialize(Graph_CSR *g, unsigned long source):\n    cdef unsigned long numNodes = g[0].size()\n    cdef: \n        LC_CSR_Graph[uint32_t, void, dummy_true].edge_iterator ii\n        LC_CSR_Graph[uint32_t, void, dummy_true].edge_iterator ei\n        uint32_t *data\n    gPrint(b\"Number of nodes : \", numNodes, b\"\\n\")\n    for n in range(numNodes):\n        #gPrint(n,\"\\n\")\n        data = &g[0].getData(n)\n        if(n == source):\n            data[0] = 0\n        else:\n            data[0] = numNodes\n        \n\n#\n# BFS Operator to be executed on each Graph node\n#\ncdef void bfs_operator(Graph_CSR *g, bool *work_done, GNodeCSR n, UserContext[GNodeCSR] &ctx) nogil:\n    cdef: \n        LC_CSR_Graph[uint32_t, void, dummy_true].edge_iterator ii\n        LC_CSR_Graph[uint32_t, void, dummy_true].edge_iterator ei\n        uint32_t *src_data\n        uint32_t *dst_data\n    src_data = &g[0].getData(n)    \n    ii = g[0].edge_begin(n)\n    ei = g[0].edge_end(n)\n    while ii != 
ei:\n            dst_data = &g[0].getData(g[0].getEdgeDst(ii))\n            if(src_data[0] > dst_data[0] + 1):\n                src_data[0] = dst_data[0] + 1\n                work_done[0] = 1\n            preincrement(ii)\n            \ncdef void bfs_pull_topo(Graph_CSR *graph):\n    cdef bool work_done = 1\n    cdef Timer T\n    rounds = 0; \n    while(work_done):\n        rounds += 1;\n        print(\"starting for_each\")\n        gPrint(b\"Work done Before : \", work_done, b\"\\n\")\n        with nogil:\n            T.start()\n            work_done = 0\n            for_each(iterate(graph[0].begin(), graph[0].end()),\n                     bind_leading(&bfs_operator, graph, &work_done), no_pushes())#,\n                     #loopname(\"name1\"))\n            T.stop()\n            gPrint(b\"Work done : \", work_done, b\"\\n\")\n            gPrint(b\"Elapsed time:\", T.get(), b\" milliseconds.\\n\")\n    print(\"Number of rounds : \", rounds, \"\\n\")\n\n\n#\n# BFS sync operator to be executed on each Graph node\n#\ncdef void bfs_sync_operator(Graph_CSR *g, InsertBag[GNodeCSR] *next, int nextLevel, GNodeCSR n) nogil:\n    cdef: \n        LC_CSR_Graph[uint32_t, void, dummy_true].edge_iterator ii\n        LC_CSR_Graph[uint32_t, void, dummy_true].edge_iterator ei\n        uint32_t *src_data\n        uint32_t *dst_data\n        uint32_t numNodes = g[0].size()\n        GNodeCSR dst\n    src_data = &g[0].getData(n)    \n    ii = g[0].edge_begin(n)\n    ei = g[0].edge_end(n)\n    while ii != ei:\n            dst = g[0].getEdgeDst(ii)\n            dst_data = &g[0].getData(dst)\n            if(dst_data[0] == numNodes):\n                dst_data[0] = nextLevel\n                next[0].push(dst)\n            preincrement(ii)\n            \ncdef void bfs_sync(Graph_CSR *graph, GNodeCSR source):\n    cdef:\n        Timer T\n        InsertBag[GNodeCSR] curr, next\n        uint32_t nextLevel = 0;\n    \n    next.push(source)\n    T.start()\n    while(not next.empty()):\n        
curr.swap(next)\n        next.clear()\n        nextLevel += 1;\n        with nogil:\n            do_all(iterate(curr),\n                     bind_leading(&bfs_sync_operator, graph, &next, nextLevel), no_pushes(), steal(),\n                     loopname(\"bfs_sync\"))\n    T.stop()\n    gPrint(b\"Elapsed time:\", T.get(), b\" milliseconds.\\n\")        \n    print(\"Number of rounds : \", nextLevel, \"\\n\")\n\ncdef void not_visited_operator(Graph_CSR *graph, atomuint32_t *notVisited, GNodeCSR n):\n    cdef: \n        uint32_t *data\n        uint32_t numNodes = graph[0].size()\n    data = &graph[0].getData(n)\n    if (data[0] >= numNodes):\n        notVisited[0].fetch_add(1)\n\ncdef void max_dist_operator(Graph_CSR *graph, GReduceMax[uint32_t] *maxDist , GNodeCSR n):\n    cdef: \n        uint32_t *data\n        uint32_t numNodes = graph[0].size()\n    data = &graph[0].getData(n)\n    if(data[0] < numNodes):\n        maxDist[0].update(data[0])\n\n\ncdef bool verify_bfs(Graph_CSR *graph, GNodeCSR source):\n    cdef: \n        atomuint32_t notVisited\n        uint32_t *data\n        GReduceMax[uint32_t] maxDist;\n\n    data = &graph[0].getData(source)\n    if(data[0] is not 0):\n        gPrint(b\"ERROR: source has non-zero dist value == \", data[0], b\"\\n\")\n    \n    notVisited.store(0)\n    with nogil:\n        do_all(iterate(graph[0]),\n                bind_leading(&not_visited_operator, graph, &notVisited), no_pushes(), steal(),\n                loopname(\"not_visited_op\"))\n\n    if(notVisited.load() > 0):\n        gPrint(notVisited.load(), b\" unvisited nodes; this is an error if graph is strongly connected\\n\")\n\n    with nogil:\n        do_all(iterate(graph[0]),\n                bind_leading(&max_dist_operator, graph, &maxDist), no_pushes(), steal(),\n                loopname(\"not_visited_op\"))\n\n    gPrint(b\"Max distance : \", maxDist.reduce(), b\"\\n\")\n#\n# Main callsite for Bfs\n#        \ndef bfs(int numThreads, unsigned long source, string 
filename):\n    ## Hack: Need a better way to initialize shared\n    ## memory runtime.\n    sys = new SharedMemSys()\n    cdef int new_numThreads = setActiveThreads(numThreads)\n    if new_numThreads != numThreads:\n        print(\"Warning, using fewer threads than requested\")\n    \n    print(\"Using {0} thread(s).\".format(new_numThreads))\n    cdef Graph_CSR graph\n    \n    ## Read the CSR format of graph\n    ## directly from disk.\n    graph.readGraphFromGRFile(filename)\n    gPrint(b\"Using Source Node: \", source, b\"\\n\");\n    Initialize(&graph, source)\n    #printValue(&graph)\n    #bfs_pull_topo(&graph)\n    bfs_sync(&graph, <GNodeCSR>source)\n    verify_bfs(&graph, <GNodeCSR>source)\n    gPrint(b\"Node 1 has dist : \", graph.getData(1), b\"\\n\")\n    \n\n\n"
  },
  {
    "path": "python/galois/_connected_components.pyx",
    "content": "# cython: cdivision= True\nfrom galois.shmem cimport *\nfrom cython.operator cimport preincrement, dereference as deref\nfrom libstd.atomic cimport atomic\n\nctypedef uint32_t ComponentTy\nctypedef atomic[ComponentTy] AtomicComponentTy\nctypedef atomic[uint32_t] atomuint32_t \n\n#\n# Struct for CC\n#\ncdef struct NodeTy:\n    AtomicComponentTy comp_current\n    ComponentTy comp_old\n\nctypedef LC_CSR_Graph[NodeTy, void, dummy_true] Graph\n\n# Cython bug: using a nested class from a previous typedef doesn't\n# work for the time being. Instead, the full template specialization\n# must be used to get the member type.\nctypedef LC_CSR_Graph[NodeTy, void, dummy_true].GraphNode GNode\n\n\n#\n# Initialization for Components\n#\ncdef void initializeCompnents(Graph *g):\n    cdef:\n        unsigned long numNodes = g[0].size()\n        LC_CSR_Graph[NodeTy, void, dummy_true].edge_iterator ii\n        LC_CSR_Graph[NodeTy, void, dummy_true].edge_iterator ei\n        NodeTy *data\n    for n in range(numNodes):\n        data = &g[0].getData(n)\n        data[0].comp_current.store(n)\n        data[0].comp_old = numNodes\n\n##\n# LabelProp algorithm operator\n##\ncdef void labelPropOperator(Graph *g, bool *work_done, GNode n) nogil:\n    cdef: \n        LC_CSR_Graph[NodeTy, void, dummy_true].edge_iterator ii\n        LC_CSR_Graph[NodeTy, void, dummy_true].edge_iterator ei\n        NodeTy *src_data\n        NodeTy *dst_data\n    src_data = &g[0].getData(n, FLAG_UNPROTECTED)\n    if(src_data[0].comp_old > src_data[0].comp_current.load()):\n        src_data[0].comp_old = src_data[0].comp_current.load()\n        work_done[0] = 1        \n        ii = g[0].edge_begin(n, FLAG_UNPROTECTED)\n        ei = g[0].edge_end(n, FLAG_UNPROTECTED)\n        while ii != ei:\n                dst_data = &g[0].getData(g[0].getEdgeDst(ii), FLAG_UNPROTECTED)\n                atomicMin[ComponentTy](dst_data.comp_current, src_data.comp_current.load())\n                preincrement(ii)\n##\n# 
Label Propagation algorithm for \n# finding connected components\n##\ncdef void labelProp(Graph* graph):\n    cdef:\n        bool work_done = 1\n        Timer T\n    rounds = 0\n    T.start()\n    while(work_done):\n        rounds += 1;\n        with nogil:\n            work_done = 0\n            do_all(iterate(graph[0].begin(), graph[0].end()),\n                     bind_leading(&labelPropOperator, graph, &work_done), \n                     no_pushes(),\n                     steal(),\n                     disable_conflict_detection(),\n                     loopname(\"labelPropAlgo\"))\n    T.stop()\n    gPrint(b\"Elapsed time:\", T.get(), b\" milliseconds.\\n\")\n\n\n\n\n#\n# Main callsite for CC\n#   \ndef connectedComponents(int numThreads, string filename):\n    ## Hack: Need a better way to initialize shared\n    ## memory runtime.\n    sys = new SharedMemSys()\n    cdef int new_numThreads = setActiveThreads(numThreads)\n    gPrint(b\"Running Pagerank on : \", filename, b\"\\n\")\n    if new_numThreads != numThreads:\n        print(\"Warning, using fewer threads than requested\")\n    \n    print(\"Using {0} thread(s).\".format(new_numThreads))\n    cdef Graph graph\n    \n    ## Read the CSR format of graph\n    ## directly from disk.\n    graph.readGraphFromGRFile(filename)\n    \n    initializeCompnents(&graph)\n    labelProp(&graph)\n    #printValuePR(&graph)\n"
  },
  {
    "path": "python/galois/_pagerank.pyx",
    "content": "# cython: cdivision = True\n\nfrom galois.shmem cimport *\nfrom cython.operator cimport preincrement, dereference as deref\n\nctypedef atomic[uint32_t] atomuint32_t\nctypedef atomic[uint64_t] atomuint64_t\n##############################################################################\n## Pagerank implementation\n###############################################################################\n#\n# Struct for Pagerank\n#\ncdef struct NodeTy:\n    float rank\n    uint32_t nout\n\nctypedef LC_CSR_Graph[NodeTy, void, dummy_true] Graph\n\n# Cython bug: using a nested class from a previous typedef doesn't\n# work for the time being. Instead, the full template specialization\n# must be used to get the member type.\nctypedef LC_CSR_Graph[NodeTy, void, dummy_true].GraphNode GNode\n\n#\n# Constants for Pagerank\n#\ncdef float ALPHA = 0.85\ncdef float INIT_RESIDUAL = 1 - ALPHA;\ncdef float TOLERANCE   = 1.0e-3;\ncdef uint32_t MAX_ITER = 1000;\n\n#\n# Initialization for Pagerank\n#\ncdef void InitializePR(Graph *g):\n    cdef unsigned long numNodes = g[0].size()\n    cdef NodeTy *data\n    gPrint(b\"Number of nodes : \", numNodes, b\"\\n\")\n    for n in range(numNodes):\n        #gPrint(n,\"\\n\")\n        data = &g[0].getData(n)\n        data[0].rank = INIT_RESIDUAL\n        data[0].nout = 0\n\ncdef void printValuePR(Graph *g):\n    cdef unsigned long numNodes = g[0].size()\n    cdef NodeTy *data\n    gPrint(b\"Number of nodes : \", numNodes, b\"\\n\")\n    for n in range(numNodes):\n        #gPrint(n,\"\\n\")\n        data = &g[0].getData(n)\n        #if(data[0].nout.load() > 0):\n        gPrint(data[0].rank, b\"\\n\")\n\n#\n# Operator for computing outdegree of nodes in the Graph\n#\ncdef void computeOutDeg_operator(Graph *g, LargeArray[atomuint64_t] *largeArray, GNode n) nogil:\n    cdef: \n        LC_CSR_Graph[NodeTy, void, dummy_true].edge_iterator ii\n        LC_CSR_Graph[NodeTy, void, dummy_true].edge_iterator ei\n        GNode dst\n        #NodeTy 
*dst_data\n        \n    ii = g[0].edge_begin(n)\n    ei = g[0].edge_end(n)\n    while ii != ei:\n            dst = g[0].getEdgeDst(ii)\n            largeArray[0][<size_t>dst].fetch_add(1)\n            preincrement(ii)\n    \n#\n# Operator for assigning outdegree of nodes in the Graph\n#\ncdef void assignOutDeg_operator(Graph *g, LargeArray[atomuint64_t] *largeArray, GNode n) nogil:\n    cdef NodeTy *src_data\n        \n    src_data = &g[0].getData(n)\n    src_data.nout = largeArray[0][<size_t>n].load()\n#\n#\n# Main callsite for computing outdegree of nodes in the Graph\n#\ncdef void computeOutDeg(Graph *graph):\n    cdef: \n        uint64_t numNodes = graph[0].size()\n        LargeArray[atomuint64_t] largeArray\n\n    largeArray.allocateInterleaved(numNodes)\n    with nogil:\n        do_all(iterate(graph[0].begin(), graph[0].end()),\n                        bind_leading(&computeOutDeg_operator, graph, &largeArray), steal(),\n                        loopname(\"ComputeDegree\"))\n\n        do_all(iterate(graph[0].begin(), graph[0].end()),\n                        bind_leading(&assignOutDeg_operator, graph, &largeArray))\n\n\n#\n# Operator for PageRank\n#\ncdef void pagerankPullTopo_operator(Graph *g, GReduceMax[float] *max_delta, GNode n) nogil:\n    cdef: \n        LC_CSR_Graph[NodeTy, void, dummy_true].edge_iterator ii\n        LC_CSR_Graph[NodeTy, void, dummy_true].edge_iterator ei\n        GNode dst\n        NodeTy *dst_data\n        NodeTy *src_data\n        float sum = 0\n        float value = 0\n        float diff = 0;\n    ii = g[0].edge_begin(n, FLAG_UNPROTECTED)\n    ei = g[0].edge_end(n, FLAG_UNPROTECTED)\n    src_data = &g[0].getData(n)\n    while ii != ei:\n            dst_data = &g[0].getData(g[0].getEdgeDst(ii), FLAG_UNPROTECTED)\n            sum += dst_data[0].rank / dst_data[0].nout\n            preincrement(ii)\n    value = sum * ALPHA + (1.0 - ALPHA)\n    diff = fabs(value - src_data[0].rank);\n    src_data[0].rank = value\n    
max_delta[0].update(diff)\n\n#\n# Pagerank routine: Loop till convergence\n#\ncdef void pagerankPullTopo(Graph *graph, uint32_t max_iterations) nogil:\n    cdef: \n        GReduceMax[float] max_delta\n        float delta = 0\n        uint32_t iteration = 0\n        Timer T\n\n    T.start()\n    while(1):\n        with nogil:\n            do_all(iterate(graph[0].begin(), graph[0].end()),\n                        bind_leading(&pagerankPullTopo_operator, graph, &max_delta), steal(),\n                        loopname(\"PageRank\"))\n\n        delta = max_delta.reduce()\n        iteration += 1\n        if(delta <= TOLERANCE or iteration >= max_iterations):\n            break\n        max_delta.reset();\n    \n    T.stop()\n    gPrint(b\"Elapsed time:\", T.get(), b\" milliseconds.\\n\")\n    if(iteration >= max_iterations):\n        gPrint(b\"WARNING : failed to converge in \", iteration, b\" iterations\\n\")\n    \n\n#\n# Main callsite for Pagerank\n#   \ndef pagerank(int numThreads, uint32_t max_iterations, string filename):\n    ## Hack: Need a better way to initialize shared\n    ## memory runtime.\n    sys = new SharedMemSys()\n    cdef int new_numThreads = setActiveThreads(numThreads)\n    gPrint(b\"Running Pagerank on : \", filename, b\"\\n\")\n    if new_numThreads != numThreads:\n        print(\"Warning, using fewer threads than requested\")\n    \n    print(\"Using {0} thread(s).\".format(new_numThreads))\n    cdef Graph graph\n    \n    ## Read the CSR format of graph\n    ## directly from disk.\n    graph.readGraphFromGRFile(filename)\n    \n    InitializePR(&graph)\n    computeOutDeg(&graph)\n    pagerankPullTopo(&graph, max_iterations)\n    #printValuePR(&graph)\n    \n   \n"
  },
  {
    "path": "python/galois/_sssp.pyx",
    "content": "# cython: cdivision= True\nfrom galois.shmem cimport *\nfrom cython.operator cimport preincrement, dereference as deref\nfrom libstd.atomic cimport atomic\n\nctypedef uint32_t Dist\nctypedef atomic[Dist] AtomicDist\nctypedef atomic[uint32_t] atomuint32_t \n\nctypedef uint32_t EdgeTy\nctypedef LC_CSR_Graph[AtomicDist, EdgeTy, dummy_true] Graph_CSR\n\n# Cython bug: using a nested class from a previous typedef doesn't\n# work for the time being. Instead, the full template specialization\n# must be used to get the member type.\nctypedef LC_CSR_Graph[AtomicDist, EdgeTy, dummy_true].GraphNode GNodeCSR\n\ncdef void printValue(Graph_CSR *g):\n    cdef unsigned long numNodes = g[0].size()\n    cdef AtomicDist *data\n    gPrint(b\"Number of nodes : \", numNodes, b\"\\n\")\n    for n in range(numNodes):\n        data = &g[0].getData(n)\n        gPrint(b\"\\t\", data[0].load(), b\"\\n\")         \n##############################################################################\n## SSSP implementation\n###########################################################################\n#\n# Initialization for SSSP\n# Source distance is set to 0; Other nodes distance is set\n# to number of nodes \n#\ncdef void Initialize(Graph_CSR *g, unsigned long source):\n    cdef:\n        unsigned long numNodes = g[0].size()\n        AtomicDist *data\n    gPrint(b\"Number of nodes : \", numNodes, b\"\\n\")\n    for n in range(numNodes):\n        #gPrint(n,\"\\n\")\n        data = &g[0].getData(n)\n        if(n == source):\n            data[0].store(0)\n        else:\n            data[0].store(numNodes)\n        \n\nctypedef UpdateRequest[GNodeCSR, Dist] UpdateRequestObj\n#\n# SSSP Delta step Operator to be executed on each Graph node\n#\ncdef void ssspOperator(Graph_CSR *g, UpdateRequestObj item, UserContext[UpdateRequestObj] &ctx) nogil:\n\n    cdef: \n        LC_CSR_Graph[uint32_t, void, dummy_true].edge_iterator ii\n        LC_CSR_Graph[uint32_t, void, dummy_true].edge_iterator 
ei\n        AtomicDist *src_data\n        AtomicDist *dst_data\n        Dist oldDist, newDist\n        EdgeTy edge_data\n        GNodeCSR dst\n        unsigned long numNodes = g[0].size()\n    \n    src_data = &g[0].getData(item.src, FLAG_UNPROTECTED)    \n    ii = g[0].edge_begin(item.src, FLAG_UNPROTECTED)\n    ei = g[0].edge_end(item.src, FLAG_UNPROTECTED)\n    if(src_data.load() < item.dist):\n        return\n    while ii != ei:\n            dst = g[0].getEdgeDst(ii)\n            dst_data = &g[0].getData(dst, FLAG_UNPROTECTED)\n            edge_data = g[0].getEdgeData(ii, FLAG_UNPROTECTED)\n            newDist = src_data[0].load() + edge_data\n\n            oldDist = atomicMin[Dist](dst_data[0], newDist)\n            if(newDist < oldDist):\n                ctx.push(UpdateRequestObj(dst, newDist))\n\n            preincrement(ii)\n\n######\n# SSSP Delta step algo using OBIM \n#####\nctypedef ChunkFIFO[Uint_64u] ChunkFIFO_64\nctypedef PerSocketChunkFIFO[Uint_64u] PerSocketChunkFIFO_64\nctypedef OrderedByIntegerMetric[UpdateRequestIndexer, PerSocketChunkFIFO_64] OBIM\ncdef void ssspDeltaStep(Graph_CSR *graph, GNodeCSR source, uint32_t shift):\n    cdef:\n        Timer T\n        InsertBag[UpdateRequestObj] initBag\n        \n    initBag.push(UpdateRequestObj(source, 0))\n    T.start()\n    with nogil:\n        for_each(iterate(initBag),\n                    bind_leading(&ssspOperator, graph),\n                                wl[OBIM](UpdateRequestIndexer(shift)), \n                                #steal(), \n                                disable_conflict_detection(),\n                                loopname(\"SSSP\"))\n    T.stop()\n    gPrint(b\"Elapsed time:\", T.get(), b\" milliseconds.\\n\")        \n\n\n\n#######################\n# Verification routines\n#######################\ncdef void not_visited_operator(Graph_CSR *graph, atomuint32_t *notVisited, GNodeCSR n):\n    cdef: \n        AtomicDist *data\n        uint32_t numNodes = graph[0].size()\n    data 
= &graph[0].getData(n)\n    if (data[0].load() >= numNodes):\n        notVisited[0].fetch_add(1)\n\ncdef void max_dist_operator(Graph_CSR *graph, GReduceMax[uint32_t] *maxDist , GNodeCSR n):\n    cdef: \n        AtomicDist *data\n        uint32_t numNodes = graph[0].size()\n    data = &graph[0].getData(n)\n    if(data[0].load() < numNodes):\n        maxDist[0].update(data[0].load())\n\ncdef bool verify_sssp(Graph_CSR *graph, GNodeCSR source):\n    cdef: \n        atomuint32_t notVisited\n        AtomicDist *data\n        GReduceMax[uint32_t] maxDist;\n\n    data = &graph[0].getData(source)\n    if(data[0].load() is not 0):\n        gPrint(b\"ERROR: source has non-zero dist value == \", data[0].load(), b\"\\n\")\n    \n    notVisited.store(0)\n    with nogil:\n        do_all(iterate(graph[0]),\n                bind_leading(&not_visited_operator, graph, &notVisited), no_pushes(), steal(),\n                loopname(\"not_visited_op\"))\n\n    if(notVisited.load() > 0):\n        gPrint(notVisited.load(), b\" unvisited nodes; this is an error if graph is strongly connected\\n\")\n\n    with nogil:\n        do_all(iterate(graph[0]),\n                bind_leading(&max_dist_operator, graph, &maxDist), no_pushes(), steal(),\n                loopname(\"not_visited_op\"))\n\n    gPrint(b\"Max distance : \", maxDist.reduce(), b\"\\n\")\n\n\n#\n# Main callsite for SSSP\n#        \ndef sssp(int numThreads, uint32_t shift, unsigned long source, string filename):\n    ## Hack: Need a better way to initialize shared\n    ## memory runtime.\n    sys = new SharedMemSys()\n    cdef int new_numThreads = setActiveThreads(numThreads)\n    if new_numThreads != numThreads:\n        print(\"Warning, using fewer threads than requested\")\n    \n    print(\"Using {0} thread(s).\".format(new_numThreads))\n    cdef Graph_CSR graph\n    \n    ## Read the CSR format of graph\n    ## directly from disk.\n    graph.readGraphFromGRFile(filename)\n    gPrint(b\"Using Source Node: \", source, 
b\"\\n\");\n    Initialize(&graph, source)\n    #printValue(&graph)\n    #ssspWorklist(&graph, <GNodeCSR>source)\n    ssspDeltaStep(&graph, <GNodeCSR>source, shift)\n    #verify_sssp(&graph, <GNodeCSR>source)\n    gPrint(b\"Node 1 has dist : \", graph.getData(1), b\"\\n\")\n    \n\n\n"
  },
  {
    "path": "python/galois/bfs.py",
    "content": "from ._bfs import *\n"
  },
  {
    "path": "python/galois/connected_components.py",
    "content": "from ._connected_components import *\n"
  },
  {
    "path": "python/galois/cpp/__init__.pxd",
    "content": ""
  },
  {
    "path": "python/galois/cpp/libgalois/Galois.pxd",
    "content": "# distutils: language=c++\nfrom libcpp cimport bool\nfrom libc.stdint cimport *\nfrom libstd.atomic cimport atomic\n\n# Declaration from \"Galois/Threads.h\"\n\n#ctypedef uint64_t size_t \n\n# Hack to make auto return type for galois::iterate work.\n# It may be necessary to write a wrapper header around for_each,\n# but this should be good enough for the forseeable future either way.\ncdef extern from * nogil:\n    cppclass InternalRange \"auto\":\n        pass\n\ncdef extern from \"galois/Galois.h\" namespace \"galois\" nogil:\n    unsigned int setActiveThreads(unsigned int)\n    void gPrint(...)\n    cppclass UserContext[T]:\n        pass\n        void push(...)\n\n    void for_each(...)\n    void do_all(...)\n\n    InternalRange iterate[T](T &, T &)\n    InternalRange iterate[T](T &)\n\n    cppclass SharedMemSys:\n        SharedMemSys()\n\n    cppclass loopname:\n        loopname(char *name)\n\n    cppclass no_pushes:\n        no_pushes()\n\n    cppclass steal:\n        steal()\n\n    cppclass disable_conflict_detection:\n        disable_conflict_detection()\n\n    cppclass GReduceMax[T]:\n        pass\n        void update(T)\n        T reduce()\n        void reset()\n\n    cppclass InsertBag[T]:\n        pass\n        void push(T)\n        bool empty()\n        void swap(InsertBag&)\n        void clear()\n\n    cppclass LargeArray[T]:\n        pass\n        void allocateInterleaved(size_t)\n        void allocateBlocked(size_t)\n        T &operator[](size_t)\n\n\n    #### Atomic Helpers ####\ncdef extern from \"galois/AtomicHelpers.h\" namespace \"galois\" nogil:\n    const T atomicMin[T](atomic[T]&, const T)\n    const uint32_t atomicMin[uint32_t](atomic[uint32_t]&, const uint32_t)\n\ncdef extern from \"galois/MethodFlags.h\" namespace \"galois\" nogil:\n    cdef cppclass MethodFlag:\n        pass\n    \n    cdef MethodFlag FLAG_UNPROTECTED \"galois::MethodFlag::UNPROTECTED\"\n    cdef MethodFlag FLAG_WRITE \"galois::MethodFlag::WRITE\"\n    
cdef MethodFlag FLAG_READ \"galois::MethodFlag::READ\"\n    cdef MethodFlag FLAG_INTERNAL_MASK \"galois::MethodFlag::INTERNAL_MASK\"\n    cdef MethodFlag PREVIOUS \"galois::MethodFlag::PREVIOUS\"\n\n    \n\n"
  },
  {
    "path": "python/galois/cpp/libgalois/Timer.pxd",
    "content": "cdef extern from \"galois/Timer.h\" namespace \"galois\" nogil:\n    cppclass Timer:\n        Timer()\n        void start()\n        void stop()\n        unsigned int get()\n"
  },
  {
    "path": "python/galois/cpp/libgalois/Worklist.pxd",
    "content": "from libc.stdint cimport *\n\ncdef extern from *:\n    cppclass dummy_true \"true\"\n    cppclass dummy_false \"false\"\n    cppclass Uint_64u \"64u\"\n\n##\n# TODO: Need a better way to provide user defined\n# functions as template parameters to DS such as\n# OBIM\n##\ncdef extern from \"galois/Constants.h\" namespace \"galois\" nogil:\n    cppclass UpdateRequestIndexer:\n        UpdateRequestIndexer(uint32_t)\n        pass\n    cppclass UpdateRequest[G, D]:\n        G src\n        D dist\n        UpdateRequest(G&, D)\n        pass\n    cppclass ReqPushWrap:\n        pass\n        \ncdef extern from \"galois/Traits.h\" namespace \"galois\" nogil:\n    cppclass s_wl:\n        pass\n    s_wl wl[T](...)\n\ncdef extern from \"galois/worklists/Chunk.h\" namespace \"galois::worklists\" nogil:\n    cppclass ChunkFIFO[T]:\n        pass\n    cppclass PerSocketChunkFIFO[T]:\n        pass\n\ncdef extern from \"galois/worklists/Obim.h\" namespace \"galois::worklists\" nogil:\n    cppclass OrderedByIntegerMetric[UpdateFuncTy, WorkListTy]:\n        pass\n\n\n        "
  },
  {
    "path": "python/galois/cpp/libgalois/__init__.pxd",
    "content": ""
  },
  {
    "path": "python/galois/cpp/libgalois/graphs/Graph.pxd",
    "content": "\nfrom libcpp.string cimport string\nfrom ..Galois cimport MethodFlag\n\n# Fake types to work around Cython's lack of support\n# for non-type template parameters.\ncdef extern from *:\n    cppclass dummy_true \"true\"\n    cppclass dummy_false \"false\"\n\n# Omit the exception specifications here to\n# allow returning lvalues.\n# Since the exception specifications are omitted here,\n# these classes/functions ABSOLUTELY MUST be used only\n# within functions with C++ exception handling specifications.\n# This is intentional and is required to ensure that C++ exceptions\n# thrown in the code written using these forward declarations\n# are forwarded properly into the Galois library rather than\n# being converted into Python exceptions.\ncdef extern from \"galois/graphs/Graph.h\" namespace \"galois::graphs\" nogil:\n    cppclass MorphGraph[node_data, edge_data, is_directed]:\n\n        morph_graph()\n        cppclass GraphNode:\n            pass\n\n        cppclass edge_iterator:\n            bint operator==(edge_iterator)\n            bint operator!=(edge_iterator)\n            edge_iterator operator++()\n            edge_iterator operator--()\n\n        cppclass iterator:\n            bint operator==(iterator)\n            bint operator!=(iterator)\n            iterator operator++()\n            iterator operator--()\n\n        edge_iterator edge_begin(GraphNode)\n        edge_iterator edge_end(GraphNode)\n\n        iterator begin()\n        iterator end()\n\n        GraphNode getEdgeDst(edge_iterator)\n        node_data& getData(GraphNode)\n\n        GraphNode createNode(node_data)\n        void addNode(GraphNode)\n        void addEdge(GraphNode, GraphNode)\n\n    cppclass LC_CSR_Graph[node_data, edge_data, is_directed]:\n\n        LC_CSR_Graph()\n        cppclass GraphNode:\n            pass\n            bint operator==(unsigned long)\n\n        cppclass edge_iterator:\n            bint operator==(edge_iterator)\n            bint 
operator!=(edge_iterator)\n            edge_iterator operator++()\n            edge_iterator operator--()\n\n        cppclass iterator:\n            bint operator==(iterator)\n            bint operator!=(iterator)\n            iterator operator++()\n            iterator operator--()\n\n        edge_iterator edge_begin(GraphNode)\n        edge_iterator edge_end(GraphNode)\n        edge_iterator edge_begin(unsigned long)\n        edge_iterator edge_end(unsigned long)\n\n        edge_iterator edge_begin(GraphNode, MethodFlag)\n        edge_iterator edge_end(GraphNode, MethodFlag)\n        edge_iterator edge_begin(unsigned long, MethodFlag)\n        edge_iterator edge_end(unsigned long, MethodFlag)\n\n\n        iterator begin()\n        iterator end()\n\n        GraphNode getEdgeDst(edge_iterator)\n        node_data& getData(GraphNode)\n        node_data& getData(GraphNode, MethodFlag)\n        node_data& getData(unsigned long)\n        node_data& getData(unsigned long, MethodFlag)\n        void readGraphFromGRFile(string filename)\n        unsigned long size()\n        edge_data getEdgeData(edge_iterator)\n        edge_data getEdgeData(edge_iterator, MethodFlag)\n\n"
  },
  {
    "path": "python/galois/cpp/libgalois/graphs/Util.pxd",
    "content": "# Omit the exception specifications here to\n# allow returning lvalues.\n# Since the exception specifications are omitted here,\n# these classes/functions ABSOLUTELY MUST be used only\n# within functions with C++ exception handling specifications.\n# This is intentional and is required to ensure that C++ exceptions\n# thrown in the code written using these forward declarations\n# are forwarded properly into the Galois library rather than\n# being converted into Python exceptions.\ncdef extern from \"galois/graphs/Util.h\" namespace \"galois::graphs\" nogil:\n    #void readGraph[G, A](G &, A&&...)\n    void readGraph(...)\n"
  },
  {
    "path": "python/galois/cpp/libgalois/graphs/__init__.pxd",
    "content": ""
  },
  {
    "path": "python/galois/cpp/libstd/__init__.pxd",
    "content": ""
  },
  {
    "path": "python/galois/cpp/libstd/atomic.pxd",
    "content": "cdef extern from \"<atomic>\" namespace \"std\" nogil:\n    \n    cdef enum memory_order:\n        memory_order_relaxed\n        memory_order_consume\n        memory_order_acquire\n        memory_order_release\n        memory_order_acq_rel\n        memory_order_seq_cst\n    \n    cdef cppclass atomic[T]:\n        atomic()\n        atomic(T)\n        \n        bint is_lock_free()\n        void store(T)\n        void store(T, memory_order)\n        T load()\n        T load(memory_order)\n        T exchange(T)\n        T exchange(T, memory_order)\n        \n        bint compare_exchange_weak(T&, T, memory_order, memory_order)\n        bint compare_exchange_weak(T&, T, memory_order)\n        bint compare_exchange_weak(T&, T)\n        bint compare_exchange_strong(T&, T, memory_order, memory_order)\n        bint compare_exchange_strong(T&, T, memory_order)\n        bint compare_exchange_strong(T&, T)\n        \n        T fetch_add(T, memory_order)\n        T fetch_add(T)\n        T fetch_sub(T, memory_order)\n        T fetch_sub(T)\n        T fetch_and(T, memory_order)\n        T fetch_and(T)\n        T fetch_or(T, memory_order)\n        T fetch_or(T)\n        T fetch_xor(T, memory_order)\n        T fetch_xor(T)\n        \n        T operator++()\n        T operator++(int)\n        T operator--()\n        T operator--(int)\n        \n        # modify-in-place operators not yet supported by Cython:\n        # T operator+=(T)\n        # T operator-=(T)\n        # T operator&=(T)\n        # T operator|=(T)\n        # T operator^=(T)\n        \n        bint operator==(atomic[T]&, atomic[T]&)\n        bint operator==(atomic[T]&, T&)\n        bint operator==(T&, atomic[T]&)\n        bint operator!=(atomic[T]&, atomic[T]&)\n        bint operator!=(atomic[T]&, T&)\n        bint operator!=(T&, atomic[T]&)\n"
  },
  {
    "path": "python/galois/pagerank.py",
    "content": "from ._pagerank import *\n"
  },
  {
    "path": "python/galois/shmem.pxd",
    "content": "from cython.operator cimport preincrement, dereference as deref\nfrom libgalois.Galois cimport UserContext, iterate, for_each, setActiveThreads, SharedMemSys, loopname, disable_conflict_detection, no_pushes, gPrint, do_all, GReduceMax, InsertBag, steal\nfrom libgalois.Galois cimport LargeArray, MethodFlag, FLAG_UNPROTECTED, atomicMin\nfrom libgalois.graphs.Graph cimport dummy_true, dummy_false, MorphGraph, LC_CSR_Graph\nfrom libgalois.Worklist cimport ChunkFIFO, OrderedByIntegerMetric, wl, Uint_64u, UpdateRequestIndexer, PerSocketChunkFIFO, ReqPushWrap, UpdateRequest\nfrom libgalois.Timer cimport Timer\nfrom libstd.atomic cimport atomic\nfrom libcpp.vector cimport vector\nfrom libcpp.string cimport string\nfrom libcpp cimport bool\nimport sys\nfrom libc.stdint cimport *\nfrom libc.math cimport fabs\n\n# Initialize the Galois runtime when the Python module is loaded.\ncdef class _galois_runtime_wrapper:\n    cdef SharedMemSys _galois_runtime\n\ncdef extern from * nogil:\n    # hack to bind leading arguments by value to something that can be passed\n    # to for_each. The returned lambda needs to be usable after the scope\n    # where it is created closes, so captured values are captured by value.\n    # The by-value capture in turn requires that graphs be passed as\n    # pointers. 
This function is used without exception specification under\n    # the assumption that it will always be used as a subexpression of\n    # a whole expression that requires exception handling or that it will\n    # be used in a context where C++ exceptions are appropriate.\n    # There are more robust ways to do this, but this didn't require\n    # users to find and include additional C++ headers specific to\n    # this interface.\n    # Syntactically, this is using the cname of an \"external\" function\n    # to create a one-line macro that can be used like a function.\n    # The expected use is bind_leading(function, args).\n    cdef void *bind_leading \"[](auto f, auto&&... bound_args){return [=](auto&&... pars){return f(bound_args..., pars...);};}\"(...)\n    # Similar thing to invoke a function and return an integer.\n    # Useful for verifying that this approach works.\n    cdef int invoke \"[](auto f, auto&&... args){return f(args...);}\"(...)\n\n#cdef int myfunc(int a, int b, int c):\n#    return a + b + c\n\ncdef extern from \"algorithm\" namespace \"std\" nogil:\n    # This function from <algorithm> isn't currently\n    # provided by Cython's known interfaces for the C++ standard library,\n    # so this is needed to get it working here.\n    # The variadic signature could probably be removed and this could\n    # be made to match the original templates more closely, but since\n    # this form matches the syntax we need to use, it is good enough.\n    int count_if(...) except +\n\n# This function is expected to forward C++ exceptions thrown to\n# its caller. 
This is unusual for Cython, but it's the simplest\n# way to guarantee no loose Python exceptions end up floating around.\n#cdef void IncrementNeighbors(Graph *g, GNode n, UserContext[GNode] &ctx) nogil:\n #   cdef:\n#        MorphGraph[int, void, dummy_true].edge_iterator ii = g[0].edge_begin(n)\n#        MorphGraph[int, void, dummy_true].edge_iterator ei = g[0].edge_end(n)\n#        int *data\n#    while ii != ei:\n#        data = &g[0].getData(g[0].getEdgeDst(ii))\n#        preincrement(data[0])\n#        preincrement(ii)\n\n# C++ exceptions thrown inside this function are forwarded to its caller.\n#cdef bint ValueEqual(Graph *g, int v, GNode n) nogil:\n#    return g[0].getData(n) == v\n\n#cdef bint SameNodes(GNodeCSR n, GNodeCSR s) nogil:\n #   return n == s\n\n#cdef void setGNode(Graph_CSR *g, GNodeCSR n, int val) nogil:\n#    gPrint(\"inside setGNode\\n\")\n#    cdef uint32_t *data = &g[0].getData(n);\n#    data[0] = val;\n#    gPrint(\"n : \", deref(data), \"\\n\");\n#    preincrement(data[0])\n#    gPrint(\"n : \", deref(data), \"\\n\");\n\n\n\n"
  },
  {
    "path": "python/galois/shmem.pyx",
    "content": "# cython: cdivision = True\n_galois_runtime = _galois_runtime_wrapper()\n\n\n"
  },
  {
    "path": "scripts/CMakeLists.txt",
    "content": "configure_file(\"make_dist.sh.in\" \"make_dist.sh\")\nfile(COPY .  DESTINATION ${CMAKE_CURRENT_BINARY_DIR} PATTERN .svn EXCLUDE)\n"
  },
  {
    "path": "scripts/abelian_log_parser.py",
    "content": "##########################################\n# To parse log files generated by abelian.\n# Author: Gurbinder Gill\n# Email: gurbinder533@gmail.com\n#########################################\n\nimport re\nimport os\nimport sys, getopt\nimport csv\nimport numpy\nimport subprocess\n\n######## NOTES:\n# All time values are in sec by default.\n\n\ndef match_timers(fileName, benchmark, forHost, numRuns, numThreads, time_unit, total_hosts, partition):\n\n  mean_time = 0.0;\n  recvNum_total = 0\n  recvBytes_total = 0\n  sendNum_total = 0\n  sendBytes_total = 0\n  sync_pull_avg_time_total = 0.0;\n  extract_avg_time_total = 0.0;\n  set_avg_time_total = 0.0;\n  sync_push_avg_time_total = 0.0;\n  graph_init_time = 0\n  hg_init_time = 0\n  total_time = 0\n\n  if(benchmark == \"cc\"):\n    benchmark = \"ConnectedComp\"\n\n  if(benchmark == \"pagerank\"):\n    benchmark = \"PageRank\"\n\n  if (time_unit == 'seconds'):\n    divisor = 1000\n  else:\n    divisor = 1\n\n  log_data = open(fileName).read()\n\n  timer_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sTIMER_(\\d*),(\\d*),0,(\\d*)')\n  timers = re.findall(timer_regex, log_data)\n  #print timers\n\n  time = []\n\n  for i in range(int(total_hosts)):\n    time.append(0)\n\n  #overall slowest host for all 3 runs\n  slowest_host = int(0);\n  for timer in timers:\n    host = int(timer[1])\n    #print host ,\" : \", timer[2]\n    host_time = float(timer[2])\n    time[host] += host_time\n\n  print time\n  for i, val in enumerate(time):\n    if(time[slowest_host] < val):\n      slowest_host = i\n\n  print slowest_host\n  print time[slowest_host]\n\n  if(len(time) > 0):\n    mean_time = time[slowest_host] / int(numRuns)\n  mean_time /= divisor\n  mean_time = round(mean_time, 3)\n  print \"Mean time: \", mean_time\n\n  forHost = slowest_host \n\n\n  rep_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sREPLICATION_FACTOR_0_0,(\\d*),\\d*,(.*)')\n\n  rep_search = rep_regex.search(log_data)\n  rep_factor = 0;\n  if rep_search is not 
None:\n    rep_factor = rep_search.group(2)\n    rep_factor = round(float(rep_factor), 3)\n    if (rep_factor == 1.0):\n      rep_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sREPLICATION_FACTOR_0_0,(\\d*),\\d*,(.*)')\n      rep_search = rep_regex.search(log_data)\n      rep_factor = rep_search.group(2)\n      rep_factor = round(float(rep_factor), 3)\n\n    print (\"Replication factor  : \", rep_factor)\n\n\n  #Finding mean, variance and sd for first iteration in compute time\n  do_all_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sDO_ALL_IMPL_(?i)' + re.escape(benchmark) + r'_0_0'  +r',.*' + r',\\d*,(\\d*)')\n  do_all_all_hosts = re.findall(do_all_regex, log_data)\n  num_arr = numpy.array(map(int,do_all_all_hosts))\n\n  if(num_arr.size < total_hosts):\n    #for i in range(1, int(total_hosts)):\n    for i in range(1, 3):\n      do_all_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sDO_ALL_IMPL_(?i)' + re.escape(benchmark) + r'_0_' + re.escape(str(i)) +r',.*' + r',\\d*,(\\d*)')\n      do_all_all_hosts = re.findall(do_all_regex, log_data)\n      num_arr = numpy.array(map(int,do_all_all_hosts))\n      if(num_arr.size == total_hosts):\n        break;\n\n  print num_arr\n  sd_do_all = round(numpy.std(num_arr, axis=0),3)\n  mean_do_all = round(numpy.mean(num_arr, axis=0),3)\n  var_do_all = round(numpy.var(num_arr, axis=0),3)\n  min_do_all = num_arr.min(axis=0)\n  max_do_all = num_arr.max(axis=0)\n\n\n  print \"sd_do_all\", sd_do_all\n  print \"mean_do_all\", mean_do_all\n  print \"min_do_all\", min_do_all\n  print \"max_do_all\", max_do_all\n\n\n  #Finding mean, variance and sd for first iteration in sync time\n  sync_pull_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sSYNC_PULL_(?i)' + re.escape(benchmark) + r'_0_0'  +r',.*' + r',\\d*,(\\d*)')\n  sync_pull_all_hosts = re.findall(sync_pull_regex, log_data)\n  num_arr = numpy.array(map(int,sync_pull_all_hosts))\n  if(num_arr.size == 0):\n    sync_pull_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sSYNC_PULL_FirstItr_(?i)' + 
re.escape(benchmark) + r'_0_0'  +r',.*' + r',\\d*,(\\d*)')\n    sync_pull_all_hosts = re.findall(sync_pull_regex, log_data)\n    num_arr = numpy.array(map(int,sync_pull_all_hosts))\n\n  if(num_arr.size < total_hosts):\n    #for i in range(1, int(total_hosts)):\n    for i in range(1, 3):\n      sync_pull_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sSYNC_PULL_(?i)' + re.escape(benchmark) + r'_0_' + re.escape(str(i)) +r',.*' + r',\\d*,(\\d*)')\n      sync_pull_all_hosts = re.findall(sync_pull_regex, log_data)\n      try:\n        num_arr = numpy.array(map(int,sync_pull_all_hosts))\n      except ValueError:\n        pass\n      if(num_arr.size == total_hosts):\n        break;\n\n  print \"SYNC_PULL\" , num_arr\n  sd_sync_pull = 0.0\n  mean_sync_pull = 0.0\n  var_sync_pull = 0.0\n  min_sync_pull = 0.0\n  max_sync_pull = 0.0\n  if(num_arr.size > 0):\n    try:\n      sd_sync_pull = round(numpy.std(num_arr, axis=0),3)\n      mean_sync_pull = round(numpy.mean(num_arr, axis=0),3)\n      var_sync_pull = round(numpy.var(num_arr, axis=0),3)\n      min_sync_pull = num_arr.min(axis=0)\n      max_sync_pull = num_arr.max(axis=0)\n    except ValueError:  #raised if `num_arr` is empty.\n      pass\n\n  print \"sd_sync_pull\", sd_sync_pull\n  print \"mean_sync_pull\", mean_sync_pull\n  print \"min_sync_pull\", min_sync_pull\n  print \"max_sync_pull\", max_sync_pull\n\n  sync_push_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sSYNC_PUSH_(?i)' + re.escape(benchmark) + r'_0_0'  +r',.*' + r',\\d*,(\\d*)')\n  sync_push_all_hosts = re.findall(sync_push_regex, log_data)\n  num_arr = numpy.array(map(int,sync_push_all_hosts))\n  if(num_arr.size == 0):\n    sync_push_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sSYNC_PUSH_FirstItr_(?i)' + re.escape(benchmark) + r'_0_0'  +r',.*' + r',\\d*,(\\d*)')\n    sync_push_all_hosts = re.findall(sync_push_regex, log_data)\n    num_arr = numpy.array(map(int,sync_push_all_hosts))\n\n\n  if(num_arr.size < total_hosts):\n    #for i in range(1, int(total_hosts)):\n    
for i in range(1, 3):\n      sync_push_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sSYNC_PUSH_(?i)' + re.escape(benchmark) + r'_0_' + re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n      sync_push_all_hosts = re.findall(sync_push_regex, log_data)\n      num_arr = numpy.array(map(int,sync_push_all_hosts))\n      if(num_arr.size == total_hosts):\n        break;\n\n  print \"SYNC_PUSH\" , num_arr\n\n  sd_sync_push = 0.0\n  mean_sync_push = 0.0\n  var_sync_push = 0.0\n  min_sync_push = 0.0\n  max_sync_push = 0.0\n  if(num_arr.size > 0):\n    try:\n      sd_sync_push = round(numpy.std(num_arr, axis=0),3)\n      mean_sync_push = round(numpy.mean(num_arr, axis=0),3)\n      var_sync_push = round(numpy.var(num_arr, axis=0),3)\n      min_sync_push = num_arr.min(axis=0)\n      max_sync_push = num_arr.max(axis=0)\n    except ValueError:  #raised if `num_arr` is empty.\n      pass\n  print \"sd_sync_push\", sd_sync_push\n  print \"mean_sync_push\", mean_sync_push\n  print \"min_sync_push\", min_sync_push\n  print \"max_sync_push\", max_sync_push\n\n\n\n  ## Get Graph_init, HG_init, total\n  #81a5b117-8054-46af-9a23-1f28e5ed1bba,(NULL),0 , TIMER_GRAPH_INIT,0,0,306\n  timer_graph_init_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sTIMER_GRAPH_INIT,\\d*,\\d*,(\\d*)')\n  timer_hg_init_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sTIMER_HG_INIT' + r',\\d*,\\d*,(\\d*)')\n  timer_total_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sTIMER_TOTAL' + r',\\d*,\\d*,(\\d*)')\n\n\n  timer_graph_init = timer_graph_init_regex.search(log_data)\n  timer_hg_init = timer_hg_init_regex.search(log_data)\n  timer_total = timer_total_regex.search(log_data)\n\n  if timer_graph_init is not None:\n    graph_init_time = float(timer_graph_init.group(1))\n    graph_init_time /= divisor\n    graph_init_time = round(graph_init_time, 3)\n\n  if timer_hg_init is not None:\n    hg_init_time = float(timer_hg_init.group(1))\n    hg_init_time /= divisor\n    hg_init_time = round(hg_init_time, 3)\n\n  if timer_total is not 
None:\n    total_time = float(timer_total.group(1))\n    total_time /= divisor\n    total_time = round(total_time, 3)\n\n  print graph_init_time\n  print hg_init_time\n  print total_time\n\n  return mean_time,rep_factor,mean_do_all,sd_do_all,var_do_all,min_do_all,max_do_all,mean_sync_pull,sd_sync_pull,var_sync_pull,min_sync_pull,max_sync_pull,mean_sync_push,sd_sync_push,var_sync_push,min_sync_push,max_sync_push,graph_init_time,hg_init_time,total_time\n\ndef get_basicInfo(fileName):\n\n  hostNum_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sHosts,0,0,(\\d*)')\n  cmdLine_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sCommandLine,0,0,(.*)')\n  threads_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sThreads,0,0,(\\d*)')\n  runs_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sRuns,0,0,(\\d*)')\n\n  log_data = open(fileName).read()\n\n  hostNum    = ''\n  cmdLine    = ''\n  threads    = ''\n  runs       = ''\n  benchmark  = ''\n  algo_type  = ''\n  cut_type   = ''\n  input_graph = ''\n\n  hostNum_search = hostNum_regex.search(log_data)\n  if hostNum_search is not None:\n    hostNum = hostNum_search.group(1)\n\n  cmdLine_search = cmdLine_regex.search(log_data)\n  if cmdLine_search is not None:\n    cmdLine = cmdLine_search.group(1)\n\n  threads_search = threads_regex.search(log_data)\n  if threads_search is not None:\n    threads = threads_search.group(1)\n\n  runs_search    = runs_regex.search(log_data)\n  if runs_search is not None:\n    runs = runs_search.group(1)\n  if runs == \"\":\n    runs = \"3\"\n\n  print cmdLine\n  split_cmdLine_algo = cmdLine.split()[0].split(\"/\")[-1].split(\"_\")\n  benchmark, algo_type = split_cmdLine_algo\n\n  split_cmdLine_input = cmdLine.split()[1].split(\"/\")\n  input_graph_name = split_cmdLine_input[-1]\n  input_graph = input_graph_name.split(\".\")[0]\n\n  print cmdLine\n  split_cmdLine = cmdLine.split()\n  print split_cmdLine\n  cut_type = \"edge-cut\"\n  for index in range(0, len(split_cmdLine)):\n    if split_cmdLine[index] == 
\"-enableVertexCut=1\":\n      cut_type = \"vertex-cut\"\n      break\n    elif split_cmdLine[index] == \"-enableVertexCut\":\n         cut_type = \"vertex-cut\"\n         break\n    elif split_cmdLine[index] == \"-enableVertexCut=0\":\n         cut_type = \"edge-cut\"\n         break\n\n\n  devices = str(hostNum) + \" CPU\"\n  deviceKind = \"CPU\"\n  for index in range(2, len(cmdLine.split())):\n    split_cmdLine_devices = cmdLine.split()[index].split(\"=\")\n    if split_cmdLine_devices[0] == '-pset':\n      devices_str = split_cmdLine_devices[-1]\n      cpus = devices_str.count('c')\n      gpus = devices_str.count('g')\n      if str(cpus + gpus) == hostNum and gpus > 0:\n        if cpus == 0:\n          devices = str(gpus) + \" GPU\"\n          deviceKind = \"GPU\"\n        else:\n          devices = str(cpus) + \" CPU + \" + str(gpus) + \" GPU\"\n          deviceKind = \"CPU+GPU\"\n          hostNum = str(int(hostNum) - cpus)\n      break\n\n  return hostNum, cmdLine, threads, runs, benchmark, algo_type, cut_type, input_graph, devices, deviceKind\n\ndef format_str(col):\n  max_len = 0\n  for c in col:\n    if max_len < len(str(c)):\n      max_len = len(str(c))\n  return max_len\n\ndef main(argv):\n  inputFile = ''\n  forHost = ''\n  outputFile = 'LOG_output.csv'\n  time_unit = 'seconds'\n  try:\n    opts, args = getopt.getopt(argv,\"hi:n:o:md\",[\"ifile=\",\"node=\",\"ofile=\",\"milliseconds\"])\n  except getopt.GetoptError:\n    print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n  for opt, arg in opts:\n    if opt == '-h':\n      print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n      sys.exit()\n    elif opt in (\"-i\", \"--ifile\"):\n      inputFile = arg\n    elif opt in (\"-n\", \"--node\"):\n      forHost = arg\n    elif opt in (\"-o\", \"--ofile\"):\n      outputFile = arg\n    elif opt in (\"-m\", \"--milliseconds\"):\n     
 time_unit = 'milliseconds'\n\n  if inputFile == '':\n    print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n\n  print 'Input file is : ', inputFile\n  print 'Output file is : ', outputFile\n  print 'Data for host : ', forHost\n\n  if forHost == '':\n    print 'Find the slowest host and calculating everything for that host'\n\n  hostNum, cmdLine, threads, runs, benchmark, algo_type, cut_type, input_graph, devices, deviceKind = get_basicInfo(inputFile)\n\n  #shorten the graph names:\n  if input_graph == \"twitter-ICWSM10-component_withRandomWeights\" or input_graph == \"twitter-ICWSM10-component-transpose\" or input_graph == \"twitter-ICWSM10-component\":\n    input_graph = \"twitter-50\"\n  elif input_graph == \"twitter-WWW10-component_withRandomWeights\" or input_graph == \"twitter-WWW10-component-transpose\" or input_graph == \"twitter-WWW10-component\":\n    input_graph = \"twitter-40\"\n\n  print 'Hosts : ', hostNum , ' CmdLine : ', cmdLine, ' Threads : ', threads , ' Runs : ', runs, ' benchmark :' , benchmark , ' algo_type :', algo_type, ' cut_type : ', cut_type, ' input_graph : ', input_graph\n  print 'Devices : ', devices\n  data = match_timers(inputFile, benchmark, forHost, runs, threads, time_unit, hostNum, cut_type)\n\n  print data\n\n  output_str = benchmark + ',' + 'abelian' + ',' + hostNum  + ',' + threads  + ','\n  output_str += deviceKind  + ',' + devices  + ','\n  output_str += input_graph  + ',' + algo_type  + ',' + cut_type\n  print output_str\n\n\n  header_csv_str = \"benchmark,platform,host,threads,\"\n  header_csv_str += \"deviceKind,devices,\"\n  header_csv_str += \"input,variant,partition,mean_time,rep_factor,mean_do_all,sd_do_all,var_do_all,min_do_all,max_do_all,mean_sync_pull,sd_sync_pull,var_sync_pull,min_sync_pull,max_sync_pull,mean_sync_push,sd_sync_push,var_sync_push,min_sync_push,max_sync_push,graph_init_time,hg_init_time,total_time\"\n  \n\n  header_csv_list = 
header_csv_str.split(',')\n  try:\n    if os.path.isfile(outputFile) is False:\n      fd_outputFile = open(outputFile, 'wb')\n      wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n      wr.writerow(header_csv_list)\n      fd_outputFile.close()\n      print \"Adding header to the empty file.\"\n    else:\n      print \"outputFile : \", outputFile, \" exists, results will be appended to it.\"\n  except OSError:\n    print \"Error in outfile opening\\n\"\n\n  data_list = list(data) #[data] #list(data)\n  complete_data = output_str.split(\",\") + data_list\n  fd_outputFile = open(outputFile, 'a')\n  wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n  wr.writerow(complete_data)\n  fd_outputFile.close()\n\n'''\n  ## Write ghost and slave nodes to a file.\n  ghost_array = build_master_ghost_matrix(inputFile, benchmark, cut_type, hostNum, runs, threads)\n  ghostNodes_file = outputFile + \"_\" + cut_type\n  fd_ghostNodes_file = open(ghostNodes_file, 'ab')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.write(\"\\nHosts : \" + hostNum + \"\\nInputFile : \"+ inputFile + \"\\nBenchmark: \" + benchmark + \"\\nPartition: \" + cut_type + \"\\n\\n\")\n  numpy.savetxt(fd_ghostNodes_file, ghost_array, delimiter=',', fmt='%d')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.close()\n'''\n\nif __name__ == \"__main__\":\n  main(sys.argv[1:])\n\n"
  },
  {
    "path": "scripts/check_format.sh",
    "content": "#!/bin/bash\n\nCLANG_FORMAT=${CLANG_FORMAT:-clang-format}\nset -e\n\nif [ $# -eq 0 ]; then\n  echo \"$(basename $0) [-fix] <paths>\" >&2\n  exit 1\nfi\n\nFIX=\nif [ \"$1\" == \"-fix\" ]; then\n  FIX=1\n  shift 1\nfi\n\nROOTS=\"$@\"\nFAILED=\n\nwhile read -d '' filename; do\n  if [ -n \"${FIX}\" ]; then\n    echo \"fixing ${filename}\"\n    ${CLANG_FORMAT} -style=file -i \"${filename}\"\n  else\n    if ${CLANG_FORMAT} -style=file -output-replacements-xml \"${filename}\" | grep '<replacement ' > /dev/null; then\n        echo \"${filename} NOT OK\"\n        FAILED=1\n    fi\n  fi\ndone < <(find ${ROOTS} -name experimental -prune -o -name external -prune -o -name '*.cpp' -print0 -o -name '*.h' -print0)\n\nif [ -n \"${FAILED}\" ]; then\n  exit 1\nfi\n"
  },
  {
    "path": "scripts/constraints_checking/parse_dump.rb",
    "content": "# ##############################\n# Parsing dump files\n#\n# Author : Gurbinder Gill\n# email  : gill@cs.utexas.edu\n# Date   : 17 Dec, 2014\n#\n# ##############################\n\n\nrequire 'optparse'\nrequire 'pp'\nrequire 'ostruct'\n\n#Gloabal Variables\n$arr_local_store = []\n$arr_remote_store = []\n\nclass OptParserClass\n  def self.parse(args)\n    options = OpenStruct.new\n    options.n = 0;\n    options.t = 0;\n\n\n    opt_parser = OptionParser.new do |opts|\n      opts.banner = \"Usage: script.rb [options]\"\n\n      opts.on(\"-n\", \"--n_hosts hosts\", Integer, \"Give number of hosts\") do |hosts|\n        options.n = hosts\n      end\n\n      opts.on(\"-t\", \"--TimeStamp timestamp\", Integer, \"Which time stamp u want to process.\") do |timestamp|\n        options.t = timestamp\n      end\n\n      opts.on_tail('-h', '--help', String, 'Display Help.') do\n        puts opts\n        exit\n      end\n    end\n  opt_parser.parse!(args)\n  options\n  end #end pasrse\n\nend #OptParserClass\n\ndef check_opts(opts)\n  if opts.n <= 1\n    print \"Number of hosts must be > 1 : specify : \"\n    opts.n = gets.chomp\n    print \"\\n\"\n  end\n\n  if opts.t == 0\n    print \"Using Default timestamp 0\\n\"\n  end\n\nend #check_opts\n\ndef construct_fileNames(opts)\n  t = opts.t\n\n  #open and store local files\n  opts.n.times do |i|\n    #arr_name = \"local_#{i}\"\n    $arr_local_store[i] = open_files(\"dump_local_#{i}_#{t}.txt\")\n    #puts arr_local_store[i][0]\n    #puts \"------------------------------\\n\"\n  end\n\n  #checking\n  #opts.n.times do |i|\n    #puts $arr_local_store[i][0]\n    #puts \"------------------------------------\\n\"\n  #end\n\n  #checking\n  #arr_local_store[0].each do |line|\n    #puts line\n  #end\n\n  #open and store remote files\n  opts.n.times do |i|\n    $arr_remote_store[i] = open_files(\"dump_remote_#{i}_#{t}.txt\")\n    #puts arr_remote_store[i][0]\n    #puts \"------------------------------\\n\"\n  end\nend 
#construct_fileNames\n\ndef open_files(filename)\n  file = File.new(filename, \"r\")\n  array = []\n  while (line = file.gets)\n    array << line.chomp\n  end\n  file.close\n  return array\nend #open_files\n\n# Functions to check all the constraints\n# CHECK 1 : If local dir has given some object, remote dir must know about that object.\ndef local_to_remote_check\n  #checking locals are consistent with remotes\n  obj_ptr_re = /\\[\\d{1},(.*)\\]/\n  locRW_re = /\\locRW\\:(.?)\\,/\n  recalled_re = /recalled\\:(.?)\\,/\n  count = 0\n  $arr_local_store.each do |local_file|\n    local_file.each do |line|\n      unless line.chomp.empty?\n        obj_ptr = line.match(obj_ptr_re)\n        remote_host = line.match(locRW_re)\n        recalled_for = line.match(recalled_re)\n\n        #check if remote host knows about this obj_ptr\n        if !remote_host[1].eql?\"\" and recalled_for[1].eql?\"\"\n          found = false\n          $arr_remote_store[remote_host[1].to_i].each do |r_line|\n            if r_line.include? obj_ptr[1]\n              found = true\n            end\n          end\n          if !found\n            p \"OMG! #{count} gave its object #{obj_ptr[1]} to #{remote_host[1]}, but it doesn't seem to know about it\"\n          end\n        end\n      end\n    end\n    count = count + 1\n  end\nend\n\n#CHECK 2: If obj in local dir and has a reqsRW, then remote must have contented for that obj.\ndef local_reqsRW_remote_contended\n  obj_ptr_re = /\\[\\d{1},(.*)\\]/\n  reqsRW_re = /reqsRW:\\<(.*)\\>/\n  contended_re = /contended\\:(.?)\\,/\n  count = 0\n  $arr_local_store.each do |local_file|\n    local_file.each do |line|\n      unless line.chomp.empty?\n        obj_ptr = line.match(obj_ptr_re)\n        reqsRW = line.match(reqsRW_re)\n        reqs_arr = reqsRW[1].split(/,/)\n        reqs_arr.size.times do |i|\n          found = false\n          $arr_remote_store[reqs_arr[i].to_i].each do |r_line|\n            if r_line.include? 
obj_ptr[1]\n              found = true\n              contended = r_line.match(contended_re)\n              if contended[1].to_i == 0\n                p \"OMG! #{count} has received a request, for object #{obj_ptr[1]} from #{reqs_arr[i].to_i}, but its not conteneded there\"\n              end\n            end\n          end\n          if !found and !count.eql?reqs_arr[i].to_i\n              p \"OMG! #{count} has received a request, for object #{obj_ptr[1]} from #{reqs_arr[i].to_i}, but remote dir at this host doesn't know\"\n            end\n        end\n      end\n    end\n    count = count + 1\n  end\nend\n\n#CHECK 3: Obj is present locally, its not contented, and there is request for it, but its not given.\ndef not_contended_with_reqs\n  obj_ptr_re = /\\[\\d{1},(.*)\\]/\n  reqsRW_re = /reqsRW:\\<(.*)\\>/\n  locRW_re = /\\locRW\\:(.?)\\,/\n  contended_re = /contended\\:(.?)\\,/\n  count = 0\n  $arr_local_store.each do |local_file|\n    local_file.each do |line|\n      unless line.chomp.empty?\n        obj_ptr = line.match(obj_ptr_re)\n        lockRW = line.match(locRW_re)\n        reqsRW = line.match(reqsRW_re)\n        reqs_arr = reqsRW[1].split(/,/)\n        contended = line.match(contended_re)\n        #if lockRW is empty, it should be contended locally\n        if lockRW[1].eql?\"\" and reqs_arr.size > 0\n          if contended[1].to_i == 0\n            p \"OMG! #{count} has an object #{obj_ptr} , which is needed by remote hosts #{reqs_arr} and is not locally contended\"\n          end\n        end\n      end\n    end\n    count = count + 1\n  end\nend\n\noptions = OptParserClass.parse(ARGV)\ncheck_opts(options)\np options.t.class\nconstruct_fileNames(options)\n#\n######constraint_checking\nlocal_to_remote_check\nlocal_reqsRW_remote_contended\nnot_contended_with_reqs\n#host_0_local =\n##\n#\n#\n#my_re = /\\[\\d{1},(.*)\\]/\n#m = my_re.match(line)\n#p m[1]\n#locRW_re = /\\locRW\\:(.?)\\,/\n#\n#string.include? \"pattrn\"\n"
  },
  {
    "path": "scripts/docker/Dockerfile",
    "content": "FROM ubuntu:20.04\n\nRUN apt-get update \\\n      && apt-get install -qy \\\n      apt-transport-https \\\n      ca-certificates \\\n      curl \\\n      gnupg \\\n      software-properties-common \\\n      && curl -fL https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - \\\n      && apt-add-repository -y 'deb http://apt.llvm.org/focal/ llvm-toolchain-focal main' \\\n      && apt-get update\nRUN apt-get install -qy \\\n      ccache \\\n      clang++-10 \\\n      clang-10 \\\n      clang-format-10 \\\n      clang-tidy-10 \\\n      cmake \\\n      g++-9 \\\n      gcc-9 \\\n      git \\\n      gosu \\\n      libfmt-dev \\\n      libopenmpi-dev \\\n      llvm-10-dev \\\n      python3-pip \\\n      python-is-python3 \\\n      && update-alternatives --verbose --install /usr/bin/gcc gcc /usr/bin/gcc-9 90 \\\n      && update-alternatives --verbose --install /usr/bin/g++ g++ /usr/bin/g++-9 90 \\\n      && update-alternatives --verbose --install /usr/bin/clang clang /usr/bin/clang-10 90 \\\n      && update-alternatives --verbose --install /usr/bin/clang++ clang++ /usr/bin/clang++-10 90 \\\n      && rm -rf /var/lib/apt/lists/*\nRUN pip3 install --upgrade --no-cache-dir pip setuptools \\\n      && pip3 install --no-cache-dir conan==1.24\nRUN conan profile new --detect --force default \\\n    && conan profile update settings.compiler.libcxx=libstdc++11 default \\\n    && conan remote add kmaragon https://api.bintray.com/conan/kmaragon/conan\n\n# Make it a little more convenient when USER is not root\nRUN find /root -type f -print0 | xargs -0 chmod 666 \\\n      && find /root -type d -print0 | xargs -0 chmod 777\nRUN echo \"export PS1='\\\\W$ '\" >> /root/.bashrc\nENV HOME=/root\n# Yes, allow anyone to run as root with gosu\nRUN chmod gu+s `which gosu`\n\nVOLUME /root/.conan/data\nVOLUME /root/.ccache\nVOLUME /source\nWORKDIR /source\n"
  },
  {
    "path": "scripts/docker/Dockerfile.msan",
    "content": "FROM quick-dev\nCOPY msan /tmp/msan\nRUN bash -x /tmp/msan/build-llvm.sh \\\n      && bash -x /tmp/msan/build-boost.sh \\\n      && rm -r /tmp/msan\n"
  },
  {
    "path": "scripts/docker/README.md",
    "content": "# Reproducible development environments\n\nFor long term development it is better to set up a development environment on\nyour host development machine, but if you'd like to get started quickly, this\ndirectory contains a Docker configuration called quick-dev to simplify the\nconfiguration of a new development environment.\n\nThis directory also contains a configuration, msan, that provides instrumented\nlibraries for use with `-fsanitize=memory` (GALOIS_USE_SANITIZER=Memory).\n\n# Building\n\n```bash\ndocker build -t quick-dev .\n```\n\n```bash\ndocker build -t quick-dev .\ndocker build -t msan -f Dockerfile.msan .\n```\n\n# Using\n\n```bash\nrun-image.sh\n# Or...\nIMAGE=msan run-image.sh\n/source/scripts/docker/msan/config-galois.sh\n```\n"
  },
  {
    "path": "scripts/docker/msan/build-boost.sh",
    "content": "#!/bin/bash\n\nBUILD_DIR=${BUILD_DIR:-/tmp/msan/boost-build}\nSOURCE_DIR=${SOURCE_DIR:-/tmp/msan/boost}\nBOOST_LIBRARIES=${BOOST_LIBRARIES:-headers,iostreams,serialization}\nAS_ROOT=${AS_ROOT:-gosu root}\nLLVM_INSTALL_PREFIX=${LLVM_INSTALL_PREFIX:-/usr/lib/llvm-10-msan}\n\nset -e\n\nMSAN_LINKER_FLAGS=\"-lc++abi \\\n  -Wl,--rpath=${LLVM_INSTALL_PREFIX}/lib \\\n  -L${LLVM_INSTALL_PREFIX}/lib\"\nMSAN_FLAGS=\"-nostdinc++ -stdlib=libc++ \\\n  -isystem ${LLVM_INSTALL_PREFIX}/include \\\n  -isystem ${LLVM_INSTALL_PREFIX}/include/c++/v1 \\\n  ${MSAN_LINKER_FLAGS} \\\n  -fsanitize=memory\\\n  -w\"\n\nmkdir -p \"${SOURCE_DIR}\"\ncurl -fL https://dl.bintray.com/boostorg/release/1.73.0/source/boost_1_73_0.tar.bz2 | tar -xjv -f - -C \"${SOURCE_DIR}\"\ncd \"${SOURCE_DIR}/boost_1_73_0\"\n./bootstrap.sh --with-toolset=clang --with-libraries=\"${BOOST_LIBRARIES}\"\n./b2 threading=multi cxxflags=\"${MSAN_FLAGS}\" linkflags=\"${MSAN_LINKER_FLAGS}\" \n${AS_ROOT} ./b2 install\n"
  },
  {
    "path": "scripts/docker/msan/build-llvm.sh",
    "content": "#!/bin/bash\n#\n# Build and install libc++abi, libc++ and ${LLVM_COMPONENTS} with memory\n# sanitization.\nset -e\n\nNUM_PARALLEL=${NUM_PARALLEL:-2}\nBUILD_DIR=${BUILD_DIR:-/tmp/msan/llvm-build}\nSOURCE_DIR=${SOURCE_DIR:-/tmp/msan/llvm}\nLLVM_INSTALL_PREFIX=${LLVM_INSTALL_PREFIX:-/usr/lib/llvm-10-msan}\nLLVM_COMMIT=${LLVM_COMMIT:-release/10.x}\nLLVM_COMPONENTS=${LLVM_COMPONENTS-LLVMSupport}\nAS_ROOT=${AS_ROOT:-gosu root}\n\ngit clone -b ${LLVM_COMMIT} --depth 1 https://github.com/llvm/llvm-project.git \"${SOURCE_DIR}\"\n\nmkdir -p \"${BUILD_DIR}/libcxxabi\"\ncmake \\\n  -DCMAKE_BUILD_TYPE=Release \\\n  -DCMAKE_C_COMPILER=clang \\\n  -DCMAKE_CXX_COMPILER=clang++ \\\n  -DCMAKE_INSTALL_PREFIX=\"${LLVM_INSTALL_PREFIX}\" \\\n  -DLIBCXXABI_LIBCXX_INCLUDES=\"${SOURCE_DIR}/libcxx/include\" \\\n  -DLLVM_PATH=\"${SOURCE_DIR}\" \\\n  -S \"${SOURCE_DIR}/libcxxabi\" \\\n  -B \"${BUILD_DIR}/libcxxabi\"\ncmake --build \"${BUILD_DIR}/libcxxabi\" --parallel \"${NUM_PARALLEL}\"\n${AS_ROOT} cmake --build \"${BUILD_DIR}/libcxxabi\" --target install\n\n# Bootstrap llvm build with memory sanitized libcxx.\n\nMSAN_LINKER_FLAGS=\"-lc++abi \\\n  -Wl,--rpath=${LLVM_INSTALL_PREFIX}/lib \\\n  -L${LLVM_INSTALL_PREFIX}/lib\"\n\nmkdir -p \"${BUILD_DIR}/libcxx\"\ncmake \\\n  -DCMAKE_BUILD_TYPE=Release \\\n  -DCMAKE_C_COMPILER=clang \\\n  -DCMAKE_CXX_COMPILER=clang++ \\\n  -DCMAKE_SHARED_LINKER_FLAGS=\"${MSAN_LINKER_FLAGS}\" \\\n  -DCMAKE_INSTALL_PREFIX=\"${LLVM_INSTALL_PREFIX}\" \\\n  -DLIBCXX_CXX_ABI_INCLUDE_PATHS=${SOURCE_DIR}/libcxxabi/include \\\n  -DLIBCXX_CXX_ABI=libcxxabi \\\n  -DLLVM_PATH=\"${SOURCE_DIR}\" \\\n  -DLLVM_USE_SANITIZER=MemoryWithOrigins \\\n  -S \"${SOURCE_DIR}/libcxx\" \\\n  -B \"${BUILD_DIR}/libcxx\"\ncmake --build \"${BUILD_DIR}/libcxx\" --parallel \"${NUM_PARALLEL}\"\n${AS_ROOT} cmake --build \"${BUILD_DIR}/libcxx\" --target install\n\n# Build llvm libraries\n#\n# -fsanitize and -stdlib=c++ are required here in addition to CMake below\n# because even 
linking test programs with libc++-msan requires\n# -fsanitize=memory.\nMSAN_FLAGS=\"-nostdinc++ -stdlib=libc++ \\\n  -isystem ${LLVM_INSTALL_PREFIX}/include \\\n  -isystem ${LLVM_INSTALL_PREFIX}/include/c++/v1 \\\n  ${MSAN_LINKER_FLAGS} \\\n  -fsanitize=memory \\\n  -w\"\nmkdir -p \"${BUILD_DIR}/llvm\"\ncmake \\\n  -DCMAKE_BUILD_TYPE=Release \\\n  -DCMAKE_C_COMPILER=clang \\\n  -DCMAKE_C_FLAGS=\"${MSAN_FLAGS}\" \\\n  -DCMAKE_CXX_COMPILER=clang++ \\\n  -DCMAKE_CXX_FLAGS=\"${MSAN_FLAGS}\" \\\n  -DCMAKE_EXE_LINKER_FLAGS=\"${MSAN_LINKER_FLAGS}\" \\\n  -DCMAKE_INSTALL_PREFIX=\"${LLVM_INSTALL_PREFIX}\" \\\n  -DLLVM_ENABLE_LIBCXX=ON \\\n  -DLLVM_ENABLE_RTTI=ON \\\n  -DLLVM_USE_SANITIZER=MemoryWithOrigins \\\n  -S \"${SOURCE_DIR}/llvm\" \\\n  -B \"${BUILD_DIR}/llvm\"\ncmake --build \"${BUILD_DIR}/llvm\" --parallel \"${NUM_PARALLEL}\" --target ${LLVM_COMPONENTS}\nfor c in ${LLVM_COMPONENTS}; do\n  ${AS_ROOT} cmake -DCOMPONENT=${c} -P \"${BUILD_DIR}/llvm/cmake_install.cmake\"\ndone\n"
  },
  {
    "path": "scripts/docker/msan/config-galois.sh",
    "content": "#!/bin/bash\n\nBUILD_DIR=${BUILD_DIR:-/source/build}\nSOURCE_DIR=${SOURCE_DIR:-/source}\nLLVM_INSTALL_PREFIX=${LLVM_INSTALL_PREFIX:-/usr/lib/llvm-10-msan}\n\nMSAN_LINKER_FLAGS=\"-lc++abi \\\n  -Wl,--rpath=${LLVM_INSTALL_PREFIX}/lib \\\n  -L${LLVM_INSTALL_PREFIX}/lib\"\n\nMSAN_FLAGS=\"-nostdinc++ -stdlib=libc++ \\\n  -isystem ${LLVM_INSTALL_PREFIX}/include \\\n  -isystem ${LLVM_INSTALL_PREFIX}/include/c++/v1 \\\n  ${MSAN_LINKER_FLAGS} \\\n  -fsanitize=memory \\\n  -w\"\n\ncmake \\\n  -DCMAKE_PREFIX_PATH=\"${LLVM_INSTALL_PREFIX}\" \\\n  -DGALOIS_USE_SANITIZER=MemoryWithOrigins \\\n  -DCMAKE_CXX_COMPILER=clang++ \\\n  -DCMAKE_C_COMPILER=clang \\\n  -DCMAKE_CXX_FLAGS=\"${MSAN_FLAGS}\" \\\n  -DCMAKE_C_FLAGS=\"${MSAN_FLAGS}\" \\\n  -DCMAKE_EXE_LINKER_FLAGS=\"${MSAN_LINKER_FLAGS}\" \\\n  -S \"${SOURCE_DIR}\" \\\n  -B \"${BUILD_DIR}\"\n"
  },
  {
    "path": "scripts/docker/run-image.sh",
    "content": "#!/bin/bash\n#\n# This script runs a development environment inside a docker container. This\n# can be useful if you don't want to or have difficulty installing dependencies\n# on your host machine.\n#\n# In order to use this script, you must first build the quick-dev image:\n#\n#   docker build -t quick-dev .\n#\nROOT_DIR=$(cd $(dirname $0)/../..; pwd)\n\nIMAGE=${IMAGE:-quick-dev}\nCACHE_DIR=${CACHE_DIR:-$HOME/.cache/quick-dev}\n\nif [[ -z \"${DOCKER_USER}\" ]]; then\n  DOCKER_USER=\"$(id -u):$(id -g)\"\nfi\n\ncat<<EOF\n###############################################\nThe following commands will create a working build:\n\n  mkdir build\n  conan install -if build --build=missing config\n  cmake -S . -B build \\\\\n    -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \\\\\n    -DCMAKE_TOOLCHAIN_FILE=build/conan_paths.cmake\n  make -C build -j 4\n\nIf you need to become root in the container:\n\n  gosu root whoami\n\nBecause your user ID does not exist in this container, you may see errors\nrelated to missing group or user IDs. You can ignore them.\n###############################################\n\nEOF\n\nmkdir -p \"$CACHE_DIR/conan\" \"$CACHE_DIR/ccache\"\n\nexec docker run --rm -it \\\n  --user \"$DOCKER_USER\" \\\n  -v \"$ROOT_DIR\":/source \\\n  -v \"$CACHE_DIR/conan\":/root/.conan/data \\\n  -v \"$CACHE_DIR/ccache\":/root/.ccache \\\n  $IMAGE \"$@\"\n"
  },
  {
    "path": "scripts/experimental/abelian_log_parser_analysis.py",
    "content": "##########################################\n# To parse log files generated by abelian.\n# Author: Gurbinder Gill\n# Email: gurbinder533@gmail.com\n#########################################\n\nimport re\nimport os\nimport sys, getopt\nimport csv\nimport numpy\n\n######## NOTES:\n# All time values are in sec by default.\n\n\ndef sd_iterations(inputFile, outputFile, outputFile_mainfile, benchmark, runs, time_unit, hostNum, iterationNum, variant, input_graph, deviceKind, devices, partition):\n\n  mean_time = 0.0;\n  recvNum_total = 0\n  recvBytes_total = 0\n  sendNum_total = 0\n  sendBytes_total = 0\n  sync_pull_avg_time_total = 0.0;\n  extract_avg_time_total = 0.0;\n  set_avg_time_total = 0.0;\n  sync_push_avg_time_total = 0.0;\n  graph_init_time = 0\n  hg_init_time = 0\n  total_time = 0\n\n  if(benchmark == \"cc\"):\n    benchmark = \"ConnectedComp\"\n\n  if (time_unit == 'seconds'):\n    divisor = 1000\n  else:\n    divisor = 1\n\n  log_data = open(inputFile).read()\n\n  data = [variant, input_graph, hostNum, benchmark, partition, deviceKind, devices]\n  fd_outputFile = open(outputFile, 'a')\n  fd_outputFile_main = open(outputFile_mainfile, 'a')\n\n  rep_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sREPLICATION_FACTOR_0_0,(\\d*),\\d*,(.*)')\n\n  Total_mean_compute = 0.0\n  Total_rsd_compute = 0.0\n  rep_search = rep_regex.search(log_data)\n  if rep_search is not None:\n    rep_factor = rep_search.group(2)\n    rep_factor = round(float(rep_factor), 3)\n    print (\"FOUND  : \", rep_factor)\n\n  iterNum_start = 0\n  #do_all_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sDO_ALL_IMPL_FirstItr_(?i)' + re.escape(benchmark) + r'_0_' + r',.*' + r',\\d*,(\\d*)')\n  #do_all_all_hosts = re.findall(do_all_regex, log_data)\n  #num_arr = numpy.array(map(int,do_all_all_hosts))\n\n  #if(num_arr.size > 0):\n    #sd = numpy.std(num_arr, axis=0)\n    #mean = numpy.mean(num_arr, axis=0)\n    #var = numpy.var(num_arr, axis=0)\n\n    #complete_data = data + [rep_factor,iterNum, 
mean, var, sd, sd/mean]\n    #wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n    #wr.writerow(complete_data)\n    #iterNum_start += 1\n\n    #Total_mean_compute += mean\n    #Total_rsd_compute += sd/mean\n\n\n\n  for iterNum in range(iterNum_start, int(iterationNum)):\n    do_all_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sDO_ALL_IMPL_(?i)' + re.escape(benchmark) + r'_0_' + re.escape(str(iterNum))  +r',.*' + r',\\d*,(\\d*)')\n    do_all_all_hosts = re.findall(do_all_regex, log_data)\n    num_arr_tmp = numpy.array(map(int,do_all_all_hosts))\n    if(num_arr_tmp.size < int(hostNum) and iterNum == 0):\n      num_arr = numpy.zeros(int(hostNum));\n      for i in range(0, num_arr_tmp.size):\n          num_arr[i] = num_arr_tmp[i]\n    else:\n      num_arr = num_arr_tmp\n    print num_arr\n\n    if(num_arr.size < int(hostNum)):\n      print \"SOME DATA IS MISSING\\n\"\n      #sys.exit(\"aa! errors! SOME DATA MISSING IN THE LOG FILES!!\")\n\n    sd=0.0\n    mean=0.0\n    var=0.0\n    try:\n      if(num_arr.size > 0):\n        sd = numpy.std(num_arr, axis=0)\n        mean = numpy.mean(num_arr, axis=0)\n        var = numpy.var(num_arr, axis=0)\n    except ValueError:\n      pass\n\n    rsd = 0.0;\n    if(mean > 0):\n      rsd = sd/mean\n    complete_data = data + [rep_factor,iterNum, mean, var, sd, rsd]\n    wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n    wr.writerow(complete_data)\n\n    Total_mean_compute += mean\n    Total_rsd_compute += rsd\n    print (\"MEAN : \", Total_mean_compute)\n    print (\"RSD : \", Total_rsd_compute)\n\n\n  Total_mean_compute = round(Total_mean_compute,3)\n  Total_rsd_compute = round(Total_rsd_compute/int(iterationNum),3)\n\n  print (\"Total_mean_compute : \", Total_mean_compute)\n  print (\"Total_rsd_compute : \", Total_rsd_compute)\n\n  complete_data = data + [rep_factor,iterNum, Total_mean_compute, Total_rsd_compute]\n  wr = csv.writer(fd_outputFile_main, quoting=csv.QUOTE_NONE, 
lineterminator='\\n')\n  wr.writerow(complete_data)\n  fd_outputFile_main.close();\n  fd_outputFile.close()\n\ndef get_basicInfo(fileName):\n\n  hostNum_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sHosts,0,0,(\\d*)')\n  cmdLine_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sCommandLine,0,0,(.*)')\n  threads_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sThreads,0,0,(\\d*)')\n  runs_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sRuns,0,0,(\\d*)')\n  num_itr_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sNUM_ITERATIONS_\\d*,0,0,(\\d*)')\n\n  log_data = open(fileName).read()\n\n  hostNum    = ''\n  cmdLine    = ''\n  threads    = ''\n  runs       = ''\n  benchmark  = ''\n  variant  = ''\n  cut_type   = ''\n  input_graph = ''\n  iterationNum = '' \n\n  hostNum_search = hostNum_regex.search(log_data)\n  if hostNum_search is not None:\n    hostNum = hostNum_search.group(1)\n\n  cmdLine_search = cmdLine_regex.search(log_data)\n  if cmdLine_search is not None:\n    cmdLine = cmdLine_search.group(1)\n\n  threads_search = threads_regex.search(log_data)\n  if threads_search is not None:\n    threads = threads_search.group(1)\n\n  runs_search    = runs_regex.search(log_data)\n  if runs_search is not None:\n    runs = runs_search.group(1)\n \n  num_itr_search = num_itr_regex.search(log_data)\n  if num_itr_search is not None:\n    iterationNum = num_itr_search.group(1)\n\n  split_cmdLine_algo = cmdLine.split()[0].split(\"/\")[-1].split(\"_\")\n  benchmark, variant =  split_cmdLine_algo\n\n  split_cmdLine_input = cmdLine.split()[1].split(\"/\")\n  input_graph_name = split_cmdLine_input[-1]\n  input_graph = input_graph_name.split(\".\")[0]\n\n  split_cmdLine = cmdLine.split()\n  cut_type = \"edge-cut\"\n  for index in range(0, len(split_cmdLine)):\n    if split_cmdLine[index] == \"-enableVertexCut=1\":\n      cut_type = \"vertex-cut\"\n      break\n    elif split_cmdLine[index] == \"-enableVertexCut\":\n         cut_type = \"vertex-cut\"\n         break\n    elif split_cmdLine[index] == 
\"-enableVertexCut=0\":\n         cut_type = \"edge-cut\"\n         break\n\n  #cut_type = \"edge-cut\"\n  #for index in range(0, len(split_cmdLine_input)):\n    #if split_cmdLine_input[index] == \"-enableVertexCut\":\n      #cut_type = \"vertex-cut\"\n      #break\n\n  devices = str(hostNum) + \" CPU\"\n  deviceKind = \"CPU\"\n  for index in range(2, len(cmdLine.split())):\n    split_cmdLine_devices = cmdLine.split()[index].split(\"=\")\n    if split_cmdLine_devices[0] == '-pset':\n      devices_str = split_cmdLine_devices[-1]\n      cpus = devices_str.count('c')\n      gpus = devices_str.count('g')\n      if str(cpus + gpus) == hostNum and gpus > 0:\n        if cpus == 0:\n          devices = str(gpus) + \" GPU\"\n          deviceKind = \"GPU\"\n        else:\n          devices = str(cpus) + \" CPU + \" + str(gpus) + \" GPU\"\n          deviceKind = \"CPU+GPU\"\n          hostNum = str(int(hostNum) - cpus)\n      break\n\n  return hostNum, cmdLine, threads, runs, benchmark, variant, cut_type, input_graph, devices, deviceKind, iterationNum\n\n\n\n\ndef main(argv):\n  inputFile = ''\n  forHost = '0'\n  outputFile = 'LOG_output.csv'\n  time_unit = 'seconds'\n  try:\n    opts, args = getopt.getopt(argv,\"hi:n:o:md\",[\"ifile=\",\"node=\",\"ofile=\",\"milliseconds\"])\n  except getopt.GetoptError:\n    print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n  for opt, arg in opts:\n    if opt == '-h':\n      print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n      sys.exit()\n    elif opt in (\"-i\", \"--ifile\"):\n      inputFile = arg\n    elif opt in (\"-n\", \"--node\"):\n      forHost = arg\n    elif opt in (\"-o\", \"--ofile\"):\n      outputFile = arg\n    elif opt in (\"-m\", \"--milliseconds\"):\n      time_unit = 'milliseconds'\n\n  if inputFile == '':\n    print 'abelian_log_parser_analysis.py -i <inputFile> [-o <outputFile> -n 
<hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n\n  print 'Input file is : ', inputFile\n  print 'Output file is : ', outputFile\n  print 'Data for host : ', forHost\n\n  hostNum, cmdLine, threads, runs, benchmark, variant, cut_type, input_graph, devices, deviceKind, iterationNum = get_basicInfo(inputFile)\n\n  #shorten the graph names:\n  if input_graph == \"twitter-ICWSM10-component_withRandomWeights\" or input_graph == \"twitter-ICWSM10-component-transpose\" or input_graph == \"twitter-ICWSM10-component\":\n    input_graph = \"twitter-50\"\n  elif input_graph == \"twitter-WWW10-component_withRandomWeights\" or input_graph == \"twitter-WWW10-component-transpose\" or input_graph == \"twitter-WWW10-component\":\n    input_graph = \"twitter-40\"\n\n  print 'Hosts : ', hostNum , ' CmdLine : ', cmdLine, ' Threads : ', threads , ' Runs : ', runs, ' benchmark :' , benchmark , ' variant :', variant, ' cut_type : ', cut_type, ' input_graph : ', input_graph, 'iterationNum :', iterationNum\n  print 'Devices : ', devices\n\n\n  header_csv_str = \"variant,input,hosts,benchmark,partition,\"\n  header_csv_str += \"deviceKind,devices,replication,iteration,mean,variance,sd,sdByMean\"\n\n  header_csv_str_mainfile = \"variant,input,hosts,benchmark,partition,\"\n  header_csv_str_mainfile += \"deviceKind,devices,replication,total_mean_compute,rsd_total\"\n\n  output_str = variant + ',' + input_graph + ',' + hostNum + ',' + benchmark + ','\n  output_str += deviceKind  + ',' + devices  + ','\n\n\n  header_csv_list = header_csv_str.split(',')\n  header_csv_list_mainfile = header_csv_str_mainfile.split(',')\n\n  outputFile_mainfile = outputFile\n  outputFile = outputFile + \".csv\"\n  #if outputFile is empty add the header to the file\n  try:\n    if os.path.isfile(outputFile) is False:\n      fd_outputFile = open(outputFile, 'wb')\n      wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n      wr.writerow(header_csv_list)\n      
fd_outputFile.close()\n      print \"Adding header to the empty file.\"\n    else:\n      print \"outputFile : \", outputFile, \" exists, results will be appended to it.\"\n  except OSError:\n    print \"Error in outfile opening\\n\"\n\n  outputFile_mainfile = outputFile_mainfile + \"_main.csv\"\n  try:\n    if os.path.isfile(outputFile_mainfile) is False:\n      fd_outputFile = open(outputFile_mainfile, 'wb')\n      wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n      wr.writerow(header_csv_list_mainfile)\n      fd_outputFile.close()\n      print \"Adding header to the empty file.\"\n    else:\n      print \"outputFile_mainfile : \", outputFile_mainfile, \" exists, results will be appended to it.\"\n  except OSError:\n    print \"Error in outfile opening\\n\"\n\n\n\n\n  sd_iterations(inputFile, outputFile, outputFile_mainfile, benchmark, runs, time_unit, hostNum, iterationNum, variant, input_graph, deviceKind, devices, cut_type)\n'''\n  data_list = list(data) #[data] #list(data)\n  #data_list.extend((total_SendBytes, total_SendBytes_pull_sync, total_SendBytes_pull_reply, total_SendBytes_push_sync))\n  complete_data = output_str.split(\",\") + data_list + [rep_factor]#+ list(sendBytes_list)\n  fd_outputFile = open(outputFile, 'a')\n  wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n  wr.writerow(complete_data)\n  fd_outputFile.close()\n'''\n'''\n  ## Write ghost and slave nodes to a file.\n  ghost_array = build_master_ghost_matrix(inputFile, benchmark, cut_type, hostNum, runs, threads)\n  ghostNodes_file = outputFile + \"_\" + cut_type\n  fd_ghostNodes_file = open(ghostNodes_file, 'ab')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.write(\"\\nHosts : \" + hostNum + \"\\nInputFile : \"+ inputFile + \"\\nBenchmark: \" + benchmark + \"\\nPartition: \" + cut_type + \"\\n\\n\")\n  numpy.savetxt(fd_ghostNodes_file, ghost_array, 
delimiter=',', fmt='%d')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.close()\n'''\n\nif __name__ == \"__main__\":\n  main(sys.argv[1:])\n\n"
  },
  {
    "path": "scripts/experimental/abelian_log_parser_deprecated.py",
    "content": "##########################################\n# To parse log files generated by abelian.\n# Author: Gurbinder Gill\n# Email: gurbinder533@gmail.com\n#########################################\n\nimport re\nimport os\nimport sys, getopt\nimport csv\nimport numpy\n\n######## NOTES:\n# All time values are in sec by default.\n\n\ndef match_timers(fileName, benchmark, forHost, numRuns, numThreads, time_unit, total_hosts, partition):\n\n  mean_time = 0.0;\n  recvNum_total = 0\n  recvBytes_total = 0\n  sendNum_total = 0\n  sendBytes_total = 0\n  sync_pull_avg_time_total = 0.0;\n  extract_avg_time_total = 0.0;\n  set_avg_time_total = 0.0;\n  sync_push_avg_time_total = 0.0;\n  graph_init_time = 0\n  hg_init_time = 0\n  total_time = 0\n\n  if(benchmark == \"cc\"):\n    benchmark = \"ConnectedComp\"\n\n  if (time_unit == 'seconds'):\n    divisor = 1000\n  else:\n    divisor = 1\n  #e2901bc2-f648-4ff4-9976-ac3b4c794a6a,(NULL),0 , TIMER_2,7,0,79907\n  timer_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sTIMER_(\\d*),(\\d*),0,(\\d*)')\n  #timer_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sTIMER_(\\d*),7,\\d*,(\\d*)')\n\n  log_data = open(fileName).read()\n\n  timers = re.findall(timer_regex, log_data)\n  print timers\n\n  time = []\n  for i in range(int(numRuns)):\n    time.append(0)\n  for timer in timers:\n    run_num = int(timer[0])\n    host = int(timer[1])\n    host_time = float(timer[2])\n    if time[run_num] < host_time:\n      time[run_num] = host_time\n  for i in range(int(numRuns)):\n    mean_time = mean_time + time[i]\n  if(len(time) > 0):\n    mean_time /= int(numRuns) \n  mean_time /= divisor\n  mean_time = round(mean_time, 3)\n  print \"Mean time: \", mean_time\n\n  #total_cpu_do_all_impl = 0.0\n  #max_cpu_do_all_impl = 0.0;\n  #min_cpu_do_all_impl = sys.maxint;\n  #for host in range(int(total_hosts)):\n  #  cpu_do_all_impl_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sDO_ALL_IMPL_(?i)' + re.escape(benchmark) + r'_..*,'+ re.escape(str(host)) + 
r',\\d*,(\\d*)')\n  #  cpu_do_all_impl_per_host = re.findall(cpu_do_all_impl_regex, log_data)\n  #  time_per_host = 0.0\n  #  for cpu_do_all_time in cpu_do_all_impl_per_host:\n  #    time_per_host += float(cpu_do_all_time)\n  #    #print time_per_host\n  #  time_per_host /= int(numRuns)\n  #  total_cpu_do_all_impl += time_per_host\n  #  if(max_cpu_do_all_impl < time_per_host):\n  #    max_cpu_do_all_impl = time_per_host\n  #  if(min_cpu_do_all_impl > time_per_host):\n  #    min_cpu_do_all_impl = time_per_host\n  #total_cpu_do_all_impl /= divisor\n  #total_cpu_do_all_impl = round(total_cpu_do_all_impl, 3)\n  #mean_cpu_do_all_impl = total_cpu_do_all_impl/int(total_hosts)\n  #mean_cpu_do_all_impl = round(mean_cpu_do_all_impl, 3)\n  #max_cpu_do_all_impl /= divisor\n  #max_cpu_do_all_impl = round(max_cpu_do_all_impl, 3)\n  #min_cpu_do_all_impl /= divisor\n  #min_cpu_do_all_impl = round(min_cpu_do_all_impl, 3)\n  #print \"total_cpu_do_all : \", total_cpu_do_all_impl\n  #print \"mean_cpu_do_all : \", mean_cpu_do_all_impl\n\n  #total_cuda_do_all_impl = 0.0\n  #max_cuda_do_all_impl = 0.0;\n  #min_cuda_do_all_impl = sys.maxint;\n  #for host in range(int(total_hosts)):\n  #  cuda_do_all_impl_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sCUDA_DO_ALL_IMPL_(?i)' + re.escape(benchmark) + r'_..*,'+ re.escape(str(host)) + r',\\d*,(\\d*)')\n  #  cuda_do_all_impl_per_host = re.findall(cuda_do_all_impl_regex, log_data)\n  #  time_per_host = 0.0\n  #  for cuda_do_all_time in cuda_do_all_impl_per_host:\n  #    time_per_host += float(cuda_do_all_time)\n  #    #print time_per_host\n  #  time_per_host /= int(numRuns)\n  #  total_cuda_do_all_impl += time_per_host\n  #  if(max_cuda_do_all_impl < time_per_host):\n  #    max_cuda_do_all_impl = time_per_host\n  #  if(min_cuda_do_all_impl > time_per_host):\n  #    min_cuda_do_all_impl = time_per_host\n  #total_cuda_do_all_impl /= divisor\n  #total_cuda_do_all_impl = round(total_cuda_do_all_impl, 3)\n  #mean_cuda_do_all_impl = 
total_cuda_do_all_impl/int(total_hosts)\n  #mean_cuda_do_all_impl = round(mean_cuda_do_all_impl, 3)\n  #max_cuda_do_all_impl /= divisor\n  #max_cuda_do_all_impl = round(max_cuda_do_all_impl, 3)\n  #min_cuda_do_all_impl /= divisor\n  #min_cuda_do_all_impl = round(min_cuda_do_all_impl, 3)\n  #print \"total_cuda_do_all : \", total_cuda_do_all_impl\n  #print \"mean_cuda_do_all : \", mean_cuda_do_all_impl\n\n  #TOTAL_DO_ALL_IMPL all hosts\n  #414c1fb5-0df1-4741-a0ee-cee82f2fc83b,(NULL),0 , DO_ALL_IMPL_bfs,0,0,389\n  total_do_all_impl = 0.0\n  max_do_all_impl = 0.0;\n  min_do_all_impl = sys.maxint;\n  #6d3d9407-ed61-4fc9-a4ee-dd9af891b47a,(NULL),0 , DO_ALL_IMPL_BFS_0_1,0,0,5145\n  #35537c51-6ba3-47aa-afa0-72edec803b75,(NULL),0 , DO_ALL_IMPL_bfs,3,0,41104\n  #0d0cd9b5-f61d-4bd7-963d-f5d129cd711e,(NULL),0 , DO_ALL_IMPL_BFS_0_1,0,0,8007\n  for host in range(int(total_hosts)):\n    do_all_impl_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\s.*DO_ALL_IMPL_(?i)' + re.escape(benchmark) + r'_..*,'+ re.escape(str(host)) + r',\\d*,(\\d*)')\n    do_all_impl_per_host = re.findall(do_all_impl_regex, log_data)\n    #print do_all_impl_all\n    time_per_host = 0.0\n    #print \"----> \", do_all_impl_per_host\n    for do_all_time in do_all_impl_per_host:\n      if (do_all_time != \"\"):\n        time_per_host += float(do_all_time)\n    if time_per_host == 0.0:\n      continue\n    time_per_host /= len(do_all_impl_per_host) \n    total_do_all_impl += time_per_host\n    if(max_do_all_impl < time_per_host):\n      max_do_all_impl = time_per_host\n    if(min_do_all_impl > time_per_host):\n      min_do_all_impl = time_per_host\n  total_do_all_impl /= divisor\n  total_do_all_impl = round(total_do_all_impl, 3)\n  mean_do_all_impl = total_do_all_impl/int(total_hosts)\n  mean_do_all_impl = round(mean_do_all_impl, 3)\n  max_do_all_impl /= divisor\n  max_do_all_impl = round(max_do_all_impl, 3)\n  min_do_all_impl /= divisor\n  min_do_all_impl = round(min_do_all_impl, 3)\n  #print \"total_do_all : \", 
total_do_all_impl\n  #print \"mean_do_all : \", mean_do_all_impl\n  #print \"max_do_all : \", max_do_all_impl\n  #print \"min_do_all : \", min_do_all_impl\n\n  total_comm_time = 0.0\n  max_comm_time = 0.0;\n  min_comm_time = sys.maxint;\n  for host in range(int(total_hosts)):\n    comm_time_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sSYNC_PU.._(?i)' + re.escape(benchmark) + r'_..*,'+ re.escape(str(host)) + r',\\d*,(\\d*)')\n    #comm_time_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\s.*DO_ALL_IMPL_(?i)' + re.escape(benchmark) + r'_..*,'+ re.escape(str(host)) + r',\\d*,(\\d*)')\n    comm_time_per_host = re.findall(comm_time_regex, log_data)\n    #print comm_time_per_host\n    time_per_host = 0.0\n    #print \"----> \", comm_time_per_host\n    for comm_time in comm_time_per_host:\n      if (comm_time != \"\"):\n        time_per_host += float(comm_time)\n    if time_per_host == 0.0:\n      continue\n    time_per_host /= len(comm_time_per_host) \n    total_comm_time += time_per_host\n    if(max_comm_time < time_per_host):\n      max_comm_time = time_per_host\n    if(min_comm_time > time_per_host):\n      min_comm_time = time_per_host\n  total_comm_time /= divisor\n  total_comm_time = round(total_comm_time, 3)\n  mean_comm_time = total_comm_time/int(total_hosts)\n  mean_comm_time = round(mean_comm_time, 3)\n  max_comm_time /= divisor\n  max_comm_time = round(max_comm_time, 3)\n  min_comm_time /= divisor\n  min_comm_time = round(min_comm_time, 3)\n  #print \"total_comm_time : \", total_comm_time\n  #print \"mean_comm_time : \", mean_comm_time\n  #print \"max_comm_time : \", max_comm_time\n  #print \"min_comm_time : \", min_comm_time\n\n  total_send_bytes = 0;\n  #6d3d9407-ed61-4fc9-a4ee-dd9af891b47a,BFS,0 , SEND_BYTES_SYNC_PULL,0,0,4209914580\n  if(partition == \"edge-cut\"):\n    send_bytes_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sSEND_BYTES_SYNC_(PUSH|PULL)_(?i)' + re.escape(benchmark) + r'.*,\\d*,\\d*,(\\d*)')\n    send_bytes_firstItr_regex = 
re.compile(r'.*,\\(NULL\\),0\\s,\\sSEND_BYTES_SYNC_(PUSH|PULL)_FirstItr_(?i)' + re.escape(benchmark) + r'.*,\\d*,\\d*,(\\d*)')\n\n    send_bytes_host = re.findall(send_bytes_regex, log_data)\n    send_bytes_firstItr_host = re.findall(send_bytes_firstItr_regex, log_data)\n\n    for byte in send_bytes_host:\n      if (byte[1] != \"\"):\n        total_send_bytes += (int(byte[1]))\n      #print(\"->\", byte[0], \" , \" , byte[1])\n      #print(\"->\", byte_firstItr[0], \" , \" , byte_firstItr[1])\n\n    for byte_firstItr in send_bytes_firstItr_host:\n      if (byte_firstItr[1] != \"\"):\n        total_send_bytes += int(byte_firstItr[1])\n\n    total_send_bytes /= int(numRuns)\n\n  elif(partition == \"vertex-cut\"):\n    #35537c51-6ba3-47aa-afa0-72edec803b75,BFS,0 , SEND_BYTES_SYNC_PULL,0,0,4210027620\n    send_bytes_regex = re.compile(r'.*,(?i)' + re.escape(benchmark) + r',0\\s,\\sSEND_BYTES_SYNC_(PUSH|PULL),\\d*,\\d*,(\\d*)')\n    send_bytes_firstItr_regex = re.compile(r'.*,FirstItr_(?i)' + re.escape(benchmark) + r',0\\s,\\sSEND_BYTES_SYNC_(PUSH|PULL),\\d*,\\d*,(\\d*)')\n\n    send_bytes_host = re.findall(send_bytes_regex, log_data)\n    send_bytes_firstItr_host = re.findall(send_bytes_firstItr_regex, log_data)\n\n    for byte in send_bytes_host:\n      if (byte[1] != \"\"):\n        total_send_bytes += (int(byte[1]))\n      #print(\"->\", byte[0], \" , \" , byte[1])\n      #print(\"->\", byte_firstItr[0], \" , \" , byte_firstItr[1])\n\n    for byte_firstItr in send_bytes_firstItr_host:\n      if (byte_firstItr[1] != \"\"):\n        total_send_bytes += int(byte_firstItr[1])\n\n    total_send_bytes /= int(numRuns)\n\n  ## SYNC_PULL and SYNC_PUSH total average over runs.\n  #num_iterations = 0\n  #for i in range(0, int(numRuns)):\n    # find extract\n    #extract_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),SYNC_EXTRACT_(?i)' + re.escape(benchmark) + r'\\w*_' + re.escape(str(i)) + r'_(\\d*),\\d*,(\\d*),(\\d*).*')\n    #extract_lines = 
re.findall(extract_regex, log_data)\n    #for j in range (0, len(extract_lines)):\n      #extract_avg_time_total += float(extract_lines[j][2])\n\n    # find set\n    #set_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),SYNC_SET_(?i)' + re.escape(benchmark) + r'\\w*_' + re.escape(str(i)) + r'_(\\d*),\\d*,(\\d*),(\\d*).*')\n    #set_lines = re.findall(set_regex, log_data)\n    #for j in range (0, len(set_lines)):\n      #set_avg_time_total += float(set_lines[j][2])\n\n    # find sync_pull\n    #sync_pull_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),SYNC_PULL_(?i)' + re.escape(benchmark) + r'\\w*_' + re.escape(str(i)) + r'_(\\d*),\\d*,(\\d*),(\\d*).*')\n    #sync_pull_lines = re.findall(sync_pull_regex, log_data)\n    #num_iterations = len(sync_pull_lines);\n    #for j in range (0, len(sync_pull_lines)):\n      #sync_pull_avg_time_total += float(sync_pull_lines[j][2])\n\n    # find sync_push\n    #sync_push_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),SYNC_PUSH_(?i)' + re.escape(benchmark) + r'\\w*_'+ re.escape(str(i)) + r'_(\\d*),\\d*,(\\d*),(\\d*).*')\n    #sync_push_lines = re.findall(sync_push_regex, log_data)\n\n    #if(num_iterations == 0):\n    #  num_iterations = len(sync_push_lines)\n\n    #for j in range (0, len(sync_push_lines)):\n    #  sync_push_avg_time_total += float(sync_push_lines[j][2])\n\n  #extract_avg_time_total /= int(numRuns)\n  #extract_avg_time_total /= divisor\n  #extract_avg_time_total = round(extract_avg_time_total, 0)\n\n  #set_avg_time_total /= int(numRuns)\n  #set_avg_time_total /= divisor\n  #set_avg_time_total = round(set_avg_time_total, 0)\n\n  #sync_pull_avg_time_total /= int(numRuns)\n  #sync_pull_avg_time_total /= divisor\n  #sync_pull_avg_time_total = round(sync_pull_avg_time_total, 0)\n\n  #sync_push_avg_time_total /= int(numRuns)\n  #sync_push_avg_time_total /= divisor\n  #sync_push_avg_time_total = round(sync_push_avg_time_total, 0)\n\n  ## sendBytes and 
recvBytes.\n  #recvBytes_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),RecvBytes,\\d*,(\\d*),(\\d*),.*')\n  #recvBytes_search = recvBytes_regex.search(log_data)\n  #if recvBytes_search is not None:\n     #recvBytes_total = float(recvBytes_search.group(1))/int(numRuns)\n\n  #sendBytes_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),SendBytes,\\d*,(\\d*),(\\d*),.*')\n  #sendBytes_search = sendBytes_regex.search(log_data)\n  #if sendBytes_search is not None:\n    #sendBytes_total = float(sendBytes_search.group(1))/int(numRuns)\n\n  ## sendNum and recvNum.\n  #recvNum_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),RecvNum,\\d*,(\\d*),(\\d*),.*')\n  #recvNum_search = recvNum_regex.search(log_data)\n  #if recvNum_search is not None:\n    #recvNum_total = float(recvNum_search.group(1))/int(numRuns)\n\n  #sendNum_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),SendNum,\\d*,(\\d*),(\\d*),.*')\n  #sendNum_search = sendNum_regex.search(log_data)\n  #if sendNum_search is not None:\n    #sendNum_total = float(sendNum_search.group(1))/int(numRuns)\n\n  ## Get Graph_init, HG_init, total\n  timer_graph_init_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),TIMER_GRAPH_INIT,' + re.escape(numThreads) + r',(\\d*),(\\d*).*')\n  timer_hg_init_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),TIMER_HG_INIT,' + re.escape(numThreads) + r',(\\d*),(\\d*).*')\n  timer_total_regex = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,\\(NULL\\),TIMER_TOTAL,' + re.escape(numThreads) + r',(\\d*),(\\d*).*')\n\n\n  timer_graph_init = timer_graph_init_regex.search(log_data)\n  timer_hg_init = timer_hg_init_regex.search(log_data)\n  timer_total = timer_total_regex.search(log_data)\n\n  if timer_graph_init is not None:\n    graph_init_time = float(timer_graph_init.group(1))\n    graph_init_time /= divisor\n    graph_init_time = round(graph_init_time, 0)\n\n  if 
timer_hg_init is not None:\n    hg_init_time = float(timer_hg_init.group(1))\n    hg_init_time /= divisor\n    hg_init_time = round(hg_init_time, 0)\n\n  if timer_total is not None:\n    total_time = float(timer_total.group(1))\n    total_time /= divisor\n    total_time = round(total_time, 0)\n\n  ## Get Commits, Conflicts, Iterations, Pushes for worklist versions:\n  commits_search = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,(?i)' + re.escape(benchmark) + '\\w*,Commits,' + re.escape(numThreads) + r',(\\d*),(\\d*).*').search(log_data)\n  conflicts_search = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,(?i)' + re.escape(benchmark) + r'\\w*,Conflicts,' + re.escape(numThreads) + r',(\\d*),(\\d*).*').search(log_data)\n  iterations_search = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,(?i)' + re.escape(benchmark) + r'\\w*,Iterations,' + re.escape(numThreads) + r',(\\d*),(\\d*).*').search(log_data)\n  pushes_search = re.compile(r'\\[' + re.escape(forHost) + r'\\]STAT,(?i)' + re.escape(benchmark) + r'\\w*,Pushes,' + re.escape(numThreads) + r',(\\d*),(\\d*).*').search(log_data)\n\n  commits    = 0\n  conflicts  = 0\n  iterations = 0\n  pushes     = 0\n  if commits_search is not None:\n    commits = int(commits_search.group(1))\n    commits /= int(numRuns)\n  if conflicts_search is not None:\n    conflicts = int(conflicts_search.group(1))\n    conflicts /= int(numRuns)\n  if iterations_search is not None:\n    iterations = int(iterations_search.group(1))\n    iterations /= int(numRuns)\n  if pushes_search is not None:\n    pushes = int(pushes_search.group(1))\n    pushes /= int(numRuns)\n\n  #return mean_time,graph_init_time,hg_init_time,total_time,sync_pull_avg_time_total,sync_push_avg_time_total,recvNum_total,recvBytes_total,sendNum_total,sendBytes_total,commits,conflicts,iterations, pushes\n  #return 
mean_time,graph_init_time,hg_init_time,total_time,extract_avg_time_total,set_avg_time_total,sync_pull_avg_time_total,sync_push_avg_time_total,num_iterations,commits,conflicts,iterations, pushes\n  return mean_time,total_do_all_impl,mean_do_all_impl,max_do_all_impl,min_do_all_impl,total_comm_time,mean_comm_time,max_comm_time,min_comm_time,total_send_bytes\n\n\ndef sendRecv_bytes_all(fileName, benchmark, total_hosts, numRuns, numThreads):\n  sendBytes_list = [0]*256 #Max host number is 256\n  recvBytes_list = [0]*256 #Max host number is 256\n\n  log_data = open(fileName).read()\n\n  if(benchmark == \"cc\"):\n    benchmark = \"ConnectedComp\"\n\n  ## sendBytes and recvBytes.\n  total_SendBytes = 0;\n  for host in range(0,int(total_hosts)):\n    sendBytes_regex = re.compile(r'\\[' + re.escape(str(host)) + r'\\]STAT,\\(NULL\\),SendBytes,\\d*,(\\d*),(\\d*),.*')\n    sendBytes_search = sendBytes_regex.search(log_data)\n    if sendBytes_search is not None:\n      sendBytes_list[host] = float(sendBytes_search.group(1))/int(numRuns)\n\n  total_SendBytes = sum(sendBytes_list)\n\n  total_RecvBytes = 0;\n  for host in range(0,int(total_hosts)):\n    recvBytes_regex = re.compile(r'\\[' + re.escape(str(host)) + r'\\]STAT,\\(NULL\\),RecvBytes,\\d*,(\\d*),(\\d*),.*')\n    recvBytes_search = recvBytes_regex.search(log_data)\n    if recvBytes_search is not None:\n       recvBytes_list[host] = float(recvBytes_search.group(1))/int(numRuns)\n\n  total_RecvBytes = sum(recvBytes_list)\n  return total_SendBytes, sendBytes_list\n\n\n\ndef sendBytes_syncOnly(fileName, benchmark, total_hosts, numRuns, numThreads):\n  sendBytes_total_list = [0]*256 #Max host number is 256\n  sendBytes_pull_sync_list = [0]*256 #Max host number is 256\n  sendBytes_push_sync_list = [0]*256 #Max host number is 256\n  sendBytes_pull_sync_reply_list = [0]*256 #Max host number is 256\n\n  log_data = open(fileName).read()\n\n  if(benchmark == \"cc\"):\n    benchmark = \"ConnectedComp\"\n\n  ## sendBytes from 
sync_pull.\n  total_SendBytes_pull_sync = 0;\n  for host in range(0,int(total_hosts)):\n    sendBytes_sync_pull_regex = re.compile(r'\\[' + re.escape(str(host)) + r'\\]STAT,\\(NULL\\),SEND_BYTES_SYNC_PULL_(?i)'+ re.escape(benchmark) + r'_0_\\d*,\\d*,(\\d*),(\\d*),.*')\n    sendBytes_sync_pull_lines = re.findall(sendBytes_sync_pull_regex, log_data)\n    print sendBytes_sync_pull_lines\n\n    if len(sendBytes_sync_pull_lines) > 0:\n      sendBytes_pull_sync_list[host] = float(sendBytes_sync_pull_lines[0][0]) * len(sendBytes_sync_pull_lines)\n      sendBytes_total_list[host] += sendBytes_pull_sync_list[host]\n      print \"-------> : \", host , \" val : \" , sendBytes_pull_sync_list[host]\n\n  total_SendBytes_pull_sync = sum(sendBytes_pull_sync_list)\n\n  ## sendBytes from sync_pull_reply.\n  total_SendBytes_pull_reply = 0;\n  for host in range(0,int(total_hosts)):\n    sendBytes_sync_pull_reply_regex = re.compile(r'\\[' + re.escape(str(host)) + r'\\]STAT,\\(NULL\\),SEND_BYTES_SYNC_PULL_REPLY_(?i)'+ re.escape(benchmark) + r'_0_\\d*,\\d*,(\\d*),(\\d*),.*')\n    sendBytes_sync_pull_reply_lines = re.findall(sendBytes_sync_pull_reply_regex, log_data)\n    print sendBytes_sync_pull_reply_lines\n\n    if len(sendBytes_sync_pull_reply_lines) > 0:\n      sendBytes_pull_sync_reply_list[host] = float(sendBytes_sync_pull_reply_lines[0][0]) * len(sendBytes_sync_pull_reply_lines)\n      sendBytes_total_list[host] += sendBytes_pull_sync_reply_list[host]\n      #print \"-------> : \", host , \" val : \" , sendBytes_pull_sync_reply_list[host]\n\n  total_SendBytes_pull_reply = sum(sendBytes_pull_sync_reply_list)\n\n  #[2]STAT,(NULL),SEND_BYTES_SYNC_PUSH_BFS_0_0,15,33738828,33738828,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n   ## sendBytes from sync_push.\n  total_SendBytes_push_sync = 0;\n  for host in range(0,int(total_hosts)):\n    sendBytes_sync_push_regex = re.compile(r'\\[' + re.escape(str(host)) + r'\\]STAT,\\(NULL\\),SEND_BYTES_SYNC_PUSH_(?i)'+ re.escape(benchmark) + 
r'_0_\\d*,\\d*,(\\d*),(\\d*),.*')\n    sendBytes_sync_push_lines = re.findall(sendBytes_sync_push_regex, log_data)\n    print sendBytes_sync_push_lines\n\n    if len(sendBytes_sync_push_lines) > 0:\n      sendBytes_push_sync_list[host] = float(sendBytes_sync_push_lines[0][0]) * len(sendBytes_sync_push_lines)\n      sendBytes_total_list[host] += sendBytes_push_sync_list[host]\n      #print \"-------> : \", host , \" val : \" , sendBytes_push_sync_list[host]\n\n  total_SendBytes_push_sync = sum(sendBytes_push_sync_list)\n\n  total_SendBytes = total_SendBytes_pull_sync + total_SendBytes_pull_reply + total_SendBytes_push_sync\n\n  return total_SendBytes, total_SendBytes_pull_sync, total_SendBytes_pull_reply, total_SendBytes_push_sync, sendBytes_total_list\n\n\n\ndef replication_factor(fileName, benchmark, partition, total_hosts, numRuns, numThreads, input_graph):\n  log_data = open(fileName).read()\n  total_nodes = 0\n  if(input_graph == \"rmat28\"):\n    total_nodes = 268435456\n  elif(input_graph == \"twitter-50\"):\n    total_nodes = 51161011\n  elif(input_graph == \"rmat25\"):\n    total_nodes = 33554432\n  elif(input_graph == \"twitter-40\"):\n    total_nodes = 41652230\n  else:\n    return 0\n\n  print \"total_nodes : \", total_nodes\n  rep_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sREPLICATION_FACTOR_0_0,(\\d*),\\d*,(.*)')\n\n  rep_search = rep_regex.search(log_data)\n  if rep_search is not None:\n    rep_factor = rep_search.group(2)\n    rep_factor = round(float(rep_factor), 3)\n    print (\"FOUND  : \", rep_factor)\n    return rep_factor\n\n  if partition == \"edge-cut\":\n    total_ghost = 0\n    #7fee06cb-4c74-458f-a761-ddf6997a1edd,(NULL),0 , TotalGhostNodes,3,0,28215509\n    ghost_from_re = re.compile(r'.*,\\(NULL\\),0\\s,\\sTotalGhostNodes,(\\d*),\\d*,(\\d*)')\n    ghost_from_lines = re.findall(ghost_from_re, log_data)\n    for line in ghost_from_lines:\n      #print int(line[1])\n      total_ghost += int(line[1])\n\n    rep_factor = float(total_nodes + 
total_ghost)/float(total_nodes)\n    rep_factor = round(rep_factor, 3)\n    return rep_factor\n  elif partition == \"vertex-cut\" or partition == \"vertex-cut-balanced\":\n    total_slave = 0\n    #8190584f-391e-45ca-9d3b-bf1d0d682fad,(NULL),0 , SLAVE_NODES_FROM_0,0,0,83207225\n    slave_from_re = re.compile(r'.*,\\(NULL\\),0\\s,\\sSLAVE_NODES_FROM_(\\d*),(\\d*),\\d*,(\\d*)')\n    slave_from_lines = re.findall(slave_from_re, log_data)\n    for line in slave_from_lines:\n      #print \"v\", int(line[2])\n      total_slave += int(line[2])\n\n    rep_factor = float(float(total_slave)/float(total_nodes))\n    rep_factor = round(rep_factor, 3)\n    return rep_factor\n\n\n\ndef build_master_ghost_matrix(fileName, benchmark, partition, total_hosts, numRuns, numThreads):\n  #[1]STAT,(NULL),GhostNodes_from_1,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n  log_data = open(fileName).read()\n  if partition == \"edge-cut\":\n    GhostNodes_array = numpy.zeros((int(total_hosts), int(total_hosts)))\n    for host in range(0, int(total_hosts)):\n      #(NULL),0 , GhostNodes_from_1,3,0,45865\n      #ghost_from_re = re.compile(r'\\[' + re.escape(str(host)) + r'\\]STAT,\\(NULL\\),GhostNodes_from_(\\d*),\\d*,(\\d*),.*')\n      ghost_from_re = re.compile(r'\\(NULL\\),\\d* , GhostNodes_from_(\\d*),' + re.escape(str(host)) + r',\\d*,(\\d*)')\n      ghost_from_lines = re.findall(ghost_from_re, log_data)\n      if(len(ghost_from_lines) > 0):\n        for line in ghost_from_lines:\n          GhostNodes_array[host][int(line[0])] = int(line[1])\n    return GhostNodes_array\n  #[1]STAT,(NULL),SLAVE_NODES_FROM_0,15,21693895,21693895,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n  elif partition == \"vertex-cut\" or partition == \"vertex-cut-balanced\":\n    SlaveNodes_array = numpy.zeros((int(total_hosts), int(total_hosts)))\n    for host in range(0, int(total_hosts)):\n      slave_from_re = re.compile(r'\\[' + re.escape(str(host)) + r'\\]STAT,\\(NULL\\),SLAVE_NODES_FROM_(\\d*),\\d*,(\\d*),.*')\n      slave_from_lines = 
re.findall(slave_from_re, log_data)\n      if(len(slave_from_lines) > 0):\n        for line in slave_from_lines:\n          SlaveNodes_array[host][int(line[0])] = int(line[1])\n    return SlaveNodes_array\n\n\n#[0]STAT,(NULL),SYNC_PULL_BARRIER_BFS_0_1,15,992,992,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n#[0]STAT,(NULL),SYNC_PULL_BARRIER_BFS_0_2,15,538,538,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n#[0]STAT,(NULL),SYNC_PULL_BARRIER_BFS_0_3,15,1408,1408,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n#[0]STAT,(NULL),SYNC_PULL_BARRIER_BFS_1_1,15,1458,1458,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n#[0]STAT,(NULL),SYNC_PULL_BARRIER_BFS_1_2,15,1568,1568,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n#[0]STAT,(NULL),SYNC_PULL_BARRIER_BFS_1_3,15,2766,2766,0,0,0,0,0,0,0,0,0,0,0,0,0,0\ndef time_at_barrier(fileName, benchmark, total_hosts, numRuns, numThreads):\n  log_data = open(fileName).read()\n  thousand = 1000.0\n  sync_pull_barrier_avg_time_total = [0.0]*256\n  sync_pull_avg_time_total = [0.0]*256\n\n  if(benchmark == \"cc\"):\n    benchmark = \"ConnectedComp\"\n\n  for host in range(0, int(total_hosts)):\n      for i in range(0, int(numRuns)):\n        # find sync_pull\n        sync_pull_barrier_regex = re.compile(r'\\[' + re.escape(str(host)) + r'\\]STAT,\\(NULL\\),SYNC_PULL_BARRIER_(?i)' + re.escape(benchmark) + r'\\w*_' + re.escape(str(i)) + r'_(\\d*),\\d*,(\\d*),(\\d*).*')\n        sync_pull_barrier_lines = re.findall(sync_pull_barrier_regex, log_data)\n        num_iterations = len(sync_pull_barrier_lines);\n        for j in range (0, len(sync_pull_barrier_lines)):\n          sync_pull_barrier_avg_time_total[host] += float(sync_pull_barrier_lines[j][2])\n\n      sync_pull_barrier_avg_time_total[host] /= int(numRuns)\n      sync_pull_barrier_avg_time_total[host] /= thousand\n\n  for host in range(0, int(total_hosts)):\n      for i in range(0, int(numRuns)):\n        # find sync_pull\n        sync_pull_regex = re.compile(r'\\[' + re.escape(str(host)) + r'\\]STAT,\\(NULL\\),SYNC_PULL_(?i)' + re.escape(benchmark) + r'\\w*_' + re.escape(str(i)) 
+ r'_(\\d*),\\d*,(\\d*),(\\d*).*')\n        sync_pull_lines = re.findall(sync_pull_regex, log_data)\n        num_iterations = len(sync_pull_lines);\n        for j in range (0, len(sync_pull_lines)):\n          sync_pull_avg_time_total[host] += float(sync_pull_lines[j][2])\n\n      sync_pull_avg_time_total[host] /= int(numRuns)\n      sync_pull_avg_time_total[host] /= thousand\n\n  print sync_pull_barrier_avg_time_total\n  print sync_pull_avg_time_total\n\n\n#63719d90-126e-4bdb-87d2-b7d878a23abc,(NULL),0 , CommandLine,0,0,/work/02982/ggill0/Distributed_latest/build_dist_hetero/release_new_gcc/exp/apps/compiler_outputs/bfs_push-topological_edge-cut /scratch/01131/rashid/inputs/rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.rgr -startNode=155526494 -maxIterations=10000 -verify=0 -t=15\n#63719d90-126e-4bdb-87d2-b7d878a23abc,(NULL),0 , Threads,0,0,15\n#63719d90-126e-4bdb-87d2-b7d878a23abc,(NULL),0 , Hosts,0,0,4\n#63719d90-126e-4bdb-87d2-b7d878a23abc,(NULL),0 , Runs,0,0,3\n\ndef get_basicInfo(fileName):\n\n  hostNum_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sHosts,0,0,(\\d*)')\n  cmdLine_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sCommandLine,0,0,(.*)')\n  threads_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sThreads,0,0,(\\d*)')\n  runs_regex = re.compile(r'.*,\\(NULL\\),0\\s,\\sRuns,0,0,(\\d*)')\n\n  log_data = open(fileName).read()\n\n  hostNum    = ''\n  cmdLine    = ''\n  threads    = ''\n  runs       = ''\n  benchmark  = ''\n  algo_type  = ''\n  cut_type   = ''\n  input_graph = ''\n\n  hostNum_search = hostNum_regex.search(log_data)\n  if hostNum_search is not None:\n    hostNum = hostNum_search.group(1)\n\n  cmdLine_search = cmdLine_regex.search(log_data)\n  if cmdLine_search is not None:\n    cmdLine = cmdLine_search.group(1)\n\n  threads_search = threads_regex.search(log_data)\n  if threads_search is not None:\n    threads = threads_search.group(1)\n\n  runs_search    = runs_regex.search(log_data)\n  if runs_search is not None:\n    runs = runs_search.group(1)\n  if runs 
== \"\":\n    runs = \"3\"\n \n  split_cmdLine_algo = cmdLine.split()[0].split(\"/\")[-1].split(\"_\")\n  benchmark, algo_type = split_cmdLine_algo\n\n  split_cmdLine_input = cmdLine.split()[1].split(\"/\")\n  input_graph_name = split_cmdLine_input[-1]\n  input_graph = input_graph_name.split(\".\")[0]\n  cut_type = \"edge-cut\"\n  for index in range(0, len(split_cmdLine_input)):\n    if split_cmdLine_input[index] == \"-enableVertexCut\":\n      cut_type = \"vertex-cut\"\n      break\n\n  devices = str(hostNum) + \" CPU\"\n  deviceKind = \"CPU\"\n  for index in range(2, len(cmdLine.split())):\n    split_cmdLine_devices = cmdLine.split()[index].split(\"=\")\n    if split_cmdLine_devices[0] == '-pset':\n      devices_str = split_cmdLine_devices[-1]\n      cpus = devices_str.count('c')\n      gpus = devices_str.count('g')\n      if str(cpus + gpus) == hostNum and gpus > 0:\n        if cpus == 0:\n          devices = str(gpus) + \" GPU\"\n          deviceKind = \"GPU\"\n        else:\n          devices = str(cpus) + \" CPU + \" + str(gpus) + \" GPU\"\n          deviceKind = \"CPU+GPU\"\n          hostNum = str(int(hostNum) - cpus)\n      break\n\n  return hostNum, cmdLine, threads, runs, benchmark, algo_type, cut_type, input_graph, devices, deviceKind\n\ndef format_str(col):\n  max_len = 0\n  for c in col:\n    if max_len < len(str(c)):\n      max_len = len(str(c))\n  return max_len\n\ndef main(argv):\n  inputFile = ''\n  forHost = '0'\n  outputFile = 'LOG_output.csv'\n  time_unit = 'seconds'\n  try:\n    opts, args = getopt.getopt(argv,\"hi:n:o:md\",[\"ifile=\",\"node=\",\"ofile=\",\"milliseconds\"])\n  except getopt.GetoptError:\n    print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n  for opt, arg in opts:\n    if opt == '-h':\n      print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n      sys.exit()\n    elif opt in (\"-i\", 
\"--ifile\"):\n      inputFile = arg\n    elif opt in (\"-n\", \"--node\"):\n      forHost = arg\n    elif opt in (\"-o\", \"--ofile\"):\n      outputFile = arg\n    elif opt in (\"-m\", \"--milliseconds\"):\n      time_unit = 'milliseconds'\n\n  if inputFile == '':\n    print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n\n  print 'Input file is : ', inputFile\n  print 'Output file is : ', outputFile\n  print 'Data for host : ', forHost\n\n  hostNum, cmdLine, threads, runs, benchmark, algo_type, cut_type, input_graph, devices, deviceKind = get_basicInfo(inputFile)\n\n  #shorten the graph names:\n  if input_graph == \"twitter-ICWSM10-component_withRandomWeights\" or input_graph == \"twitter-ICWSM10-component-transpose\" or input_graph == \"twitter-ICWSM10-component\":\n    input_graph = \"twitter-50\"\n  elif input_graph == \"twitter-WWW10-component_withRandomWeights\" or input_graph == \"twitter-WWW10-component-transpose\" or input_graph == \"twitter-WWW10-component\":\n    input_graph = \"twitter-40\"\n\n  print 'Hosts : ', hostNum , ' CmdLine : ', cmdLine, ' Threads : ', threads , ' Runs : ', runs, ' benchmark :' , benchmark , ' algo_type :', algo_type, ' cut_type : ', cut_type, ' input_graph : ', input_graph\n  print 'Devices : ', devices\n  data = match_timers(inputFile, benchmark, forHost, runs, threads, time_unit, hostNum, cut_type)\n  rep_factor = replication_factor(inputFile, benchmark, cut_type, hostNum, runs, threads, input_graph)\n  print \"rep factor : \" , rep_factor\n  #total_SendBytes, sendBytes_list = sendRecv_bytes_all(inputFile, benchmark, hostNum, runs, threads)\n  #total_SendBytes, total_SendBytes_pull_sync, total_SendBytes_pull_reply, total_SendBytes_push_sync, sendBytes_list = sendBytes_syncOnly(inputFile, benchmark, hostNum, runs, threads)\n  print data\n\n  output_str = benchmark + ',' + 'abelian' + ',' + hostNum  + ',' + threads  + ','\n  output_str += deviceKind  + 
',' + devices  + ','\n  output_str += input_graph  + ',' + algo_type  + ',' + cut_type\n  #time_at_barrier(inputFile, benchmark, forHost, runs, threads)\n\n  #output_str = benchmark + ',' + 'abelian'  + ',' + hostNum  + ',' + threads  + ',' + input_graph  + ',' + algo_type  + ',' + cut_type\n\n  #for d in data:\n    #output_str += ','\n    #output_str += str(d)\n  print output_str\n\n\n  header_csv_str = \"benchmark,platform,host,threads,\"\n  header_csv_str += \"deviceKind,devices,\"\n  header_csv_str += \"input,variant,partition,mean_time,total_comp_time,mean_comp_time,max_comp_time,min_comp_time,total_comm_time,mean_comm_time,max_comm_time,min_comm_time,total_bytes_sent,rep_factor\" #,graph_init_time,hg_init_time,total_time,extract_avg_time,set_avg_time,sync_pull_avg_time,sync_push_avg_time,converge_iterations,commits,conflicts,iterations,pushes,total_sendBytes, total_sendBytes_pull_sync, total_sendBytes_pull_reply, total_sendBytes_push_sync\"\n\n  #for i in range(0,256):\n    #header_csv_str += \",\"\n    #header_csv_str += (\"SB_\" + str(i))\n\n  header_csv_list = header_csv_str.split(',')\n  #if outputFile is empty add the header to the file\n  try:\n    if os.path.isfile(outputFile) is False:\n      fd_outputFile = open(outputFile, 'wb')\n      wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n      wr.writerow(header_csv_list)\n      fd_outputFile.close()\n      print \"Adding header to the empty file.\"\n    else:\n      print \"outputFile : \", outputFile, \" exists, results will be appended to it.\"\n  except OSError:\n    print \"Error in outfile opening\\n\"\n\n  data_list = list(data) #[data] #list(data)\n  #data_list.extend((total_SendBytes, total_SendBytes_pull_sync, total_SendBytes_pull_reply, total_SendBytes_push_sync))\n  complete_data = output_str.split(\",\") + data_list + [rep_factor]#+ list(sendBytes_list)\n  fd_outputFile = open(outputFile, 'a')\n  wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, 
lineterminator='\\n')\n  wr.writerow(complete_data)\n  fd_outputFile.close()\n\n'''\n  ## Write ghost and slave nodes to a file.\n  ghost_array = build_master_ghost_matrix(inputFile, benchmark, cut_type, hostNum, runs, threads)\n  ghostNodes_file = outputFile + \"_\" + cut_type\n  fd_ghostNodes_file = open(ghostNodes_file, 'ab')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.write(\"\\nHosts : \" + hostNum + \"\\nInputFile : \"+ inputFile + \"\\nBenchmark: \" + benchmark + \"\\nPartition: \" + cut_type + \"\\n\\n\")\n  numpy.savetxt(fd_ghostNodes_file, ghost_array, delimiter=',', fmt='%d')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.close()\n'''\n\nif __name__ == \"__main__\":\n  main(sys.argv[1:])\n\n"
  },
  {
    "path": "scripts/experimental/abelian_log_parser_multipleRuns.py",
    "content": "##########################################\n# To parse log files generated by abelian.\n# Author: Gurbinder Gill\n# Email: gurbinder533@gmail.com\n#########################################\n\nimport re\nimport os\nimport sys, getopt\nimport csv\nimport numpy\nimport subprocess\n\n######## NOTES:\n# All time values are in sec by default.\n\n\ndef match_timers(fileName, benchmark, forHost, numRuns, numThreads, time_unit, total_hosts, partition, run_identifier):\n\n  mean_time = 0.0;\n  recvNum_total = 0\n  recvBytes_total = 0\n  sendNum_total = 0\n  sendBytes_total = 0\n  sync_pull_avg_time_total = 0.0;\n  extract_avg_time_total = 0.0;\n  set_avg_time_total = 0.0;\n  sync_push_avg_time_total = 0.0;\n  graph_init_time = 0\n  hg_init_time = 0\n  total_time = 0\n\n  if(benchmark == \"cc\"):\n    benchmark = \"ConnectedComp\"\n\n  if(benchmark == \"pagerank\"):\n    benchmark = \"PageRank\"\n\n  if (time_unit == 'seconds'):\n    divisor = 1000\n  else:\n    divisor = 1\n\n  log_data = open(fileName).read()\n\n\n  timer_regex = re.compile(re.escape(run_identifier) + r',\\(NULL\\),0\\s,\\sTIMER_0,\\d*,0,(\\d*)')\n  timers = re.findall(timer_regex, log_data)\n  #print timers\n\n  time = []\n  total_mean_time=0.0\n\n  print timers\n  for i in range(int(total_hosts)):\n    time.append(0)\n\n  for timer in timers:\n    total_mean_time += float(timer)\n    print \"TIMER : \", timer\n\n  print \"TOTAL MEAN TIME \" , total_mean_time\n  total_mean_time = total_mean_time/int(total_hosts)\n  total_mean_time /= divisor\n  mean_time = total_mean_time = round(total_mean_time, 3)\n  print \"Total Mean time: \", total_mean_time\n\n  rep_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sREPLICATION_FACTOR_0_0,(\\d*),\\d*,(.*)')\n\n  rep_search = rep_regex.search(log_data)\n  rep_factor = 0;\n  if rep_search is not None:\n    rep_factor = rep_search.group(2)\n    rep_factor = round(float(rep_factor), 3)\n  print (\"Replication factor  : \", rep_factor)\n\n  
num_iter_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sNUM_ITERATIONS_0' + r',\\d*,\\d*,(\\d*)')\n  num_iter_search = num_iter_regex.search(log_data)\n  if num_iter_regex is not None:\n    if num_iter_search is None:\n      num_iter = -1\n    else:\n      num_iter = num_iter_search.group(1)\n    print \"NUM_ITER : \", num_iter\n\n\n  #Finding mean,max,sd compute time over all hosts\n  max_do_all = 0\n  sum_do_all = 0\n  for i in range(0,int(num_iter)):\n    do_all_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\s.*DO_ALL_IMPL_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    do_all_all_hosts = re.findall(do_all_regex, log_data)\n    num_arr = numpy.array(map(int,do_all_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" COMPUTE NUM_ARR\", num_arr)\n      max_compute = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_do_all += max_compute\n      sum_do_all += numpy.sum(num_arr, axis=0)\n  print \"max_do_all \" , max_do_all\n  print \"sum_do_all \" , sum_do_all\n  mean_do_all = float(sum_do_all)/float(total_hosts)\n\n\n  print \"mean_do_all\", mean_do_all\n\n\n  #Finding mean serialization time\n  sum_extract = 0\n  max_extract = 0\n  sync_extract_firstItr_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sSYNC_PU.._EXTRACT_FirstItr_(?i)' + re.escape(benchmark) + r'_0_' + re.escape(str(i))  + r',.*' + r',\\d*,(\\d*)')\n  sync_extract_firstItr_all_hosts = re.findall(sync_extract_firstItr_regex, log_data)\n  num_arr_firstItr = numpy.array(map(int,sync_extract_firstItr_all_hosts))\n  if(num_arr_firstItr.size > 0):\n    sum_extract += numpy.sum(num_arr_firstItr, axis=0)\n    max_extract += numpy.max(num_arr_firstItr, axis=0)\n\n\n  for i in range(0,int(num_iter)):\n    sync_extract_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sSYNC_PU.._EXTRACT_(?i)' + re.escape(benchmark) + r'_0_' + re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    
sync_extract_all_hosts = re.findall(sync_extract_regex, log_data)\n    num_arr = numpy.array(map(int,sync_extract_all_hosts))\n    #print \"extract\", num_arr\n\n    if(num_arr.size > 0):\n      sum_extract += numpy.sum(num_arr, axis=0)\n      max_extract += numpy.max(num_arr, axis=0)\n\n    # TOTAL EXTRACT\n  mean_exract_time = round(sum_extract/float(total_hosts),3)\n\n\n  #Finding mean deserialization time\n  sum_set = 0;\n  max_set = 0;\n  sync_set_firstItr_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sSYNC_PU.._SET_FirstItr_(?i)' + re.escape(benchmark) + r'_0_\\d*'  +r',.*' + r',\\d*,(\\d*)')\n  sync_set_firstItr_all_hosts = re.findall(sync_set_firstItr_regex, log_data)\n  num_arr_firstItr = numpy.array(map(int,sync_set_firstItr_all_hosts))\n  if(num_arr_firstItr.size > 0):\n    sum_set += numpy.sum(num_arr_firstItr, axis=0)\n    max_set += numpy.max(num_arr_firstItr, axis=0)\n\n  for i in range(0,int(num_iter)):\n    sync_set_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sSYNC_PU.._SET_(?i)' + re.escape(benchmark) + r'_0_' + re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    sync_set_all_hosts = re.findall(sync_set_regex, log_data)\n    num_arr = numpy.array(map(int,sync_set_all_hosts))\n    #print \"set\", num_arr\n\n    if(num_arr.size > 0):\n      sum_set += numpy.sum(num_arr, axis=0)\n      max_set += numpy.max(num_arr, axis=0)\n\n  # TOTAL EXTRACT\n  mean_set_time = round(sum_set/float(total_hosts),3)\n\n\n  #Finding total mean communication time\n  max_sync = 0\n  sum_sync = 0\n  for i in range(0,int(num_iter)):\n    sync_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sSYNC_.*WARD_(?i)' + re.escape(benchmark) + r'_0_' + re.escape(str(i))  + r',.*' + r',\\d*,(\\d*)')\n    sync_all_hosts = re.findall(sync_regex, log_data)\n    if sync_all_hosts is None:\n      sync_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sSYNC_PU.._(?i)' + re.escape(benchmark) + r'_0_' + re.escape(str(i))  + r',.*' + 
r',\\d*,(\\d*)')\n      sync_all_hosts = re.findall(sync_regex, log_data)\n    num_arr = numpy.array(map(int,sync_all_hosts))\n    #print \"forward\", num_arr\n\n    if(num_arr.size > 0):\n      sum_sync += numpy.sum(num_arr, axis=0)\n      max_sync += numpy.max(num_arr, axis=0)\n\n  sync_firstItr_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sSYNC_.*WARD_FirstItr_(?i)' + re.escape(benchmark) + r'_0_\\d*'  +r',.*' + r',\\d*,(\\d*)')\n  sync_firstItr_all_hosts = re.findall(sync_firstItr_regex, log_data)\n  if sync_firstItr_all_hosts is None:\n    sync_firstItr_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sSYNC_PU.._FirstItr_(?i)' + re.escape(benchmark) + r'_0_\\d*'  +r',.*' + r',\\d*,(\\d*)')\n    sync_firstItr_all_hosts = re.findall(sync_firstItr_regex, log_data)\n  num_arr_firstItr = numpy.array(map(int,sync_firstItr_all_hosts))\n\n  # TOTAL SYNC TIME\n  if(num_arr_firstItr.size > 0):\n    sum_sync += numpy.sum(num_arr_firstItr, axis=0)\n    max_sync += numpy.max(num_arr_firstItr, axis=0)\n  mean_sync_time = sum_sync/float(total_hosts)\n  mean_sync_time = round(mean_sync_time/divisor,3)\n\n  #Finding total communication volume in bytes\n  sum_sync_bytes = 0\n  max_sync_bytes = 0\n  for i in range(0,int(num_iter)):\n    sync_bytes_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sSYNC_PU.._SEND_BYTES_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i))   +r',.*' + r',\\d*,(\\d*)')\n    sync_bytes_all_hosts = re.findall(sync_bytes_regex, log_data)\n    num_arr = numpy.array(map(int,sync_bytes_all_hosts))\n    #print \"send\", num_arr\n\n    if(num_arr.size > 0):\n      sum_sync_bytes += numpy.sum(num_arr, axis=0)\n      max_sync_bytes += numpy.max(num_arr, axis=0)\n\n  print \"BYTES : \", sum_sync_bytes\n  print \"MAX BYTES : \", max_sync_bytes\n\n  sync_bytes_firstItr_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sSYNC_PU.._SEND_BYTES_FirstItr_(?i)' + re.escape(benchmark) + r'_0_\\d*'  +r',.*' + r',\\d*,(\\d*)')\n  
sync_bytes_firstItr_all_hosts = re.findall(sync_bytes_firstItr_regex, log_data)\n  num_arr_firstItr = numpy.array(map(int,sync_bytes_firstItr_all_hosts))\n\n  # TOTAL BYTES EXCHANGED\n  if(num_arr_firstItr.size > 0):\n    sum_sync_bytes += numpy.sum(num_arr_firstItr, axis=0)\n    max_sync_bytes += numpy.max(num_arr_firstItr, axis=0)\n  print \"BYTES : \", sum_sync_bytes\n  total_sync_bytes = sum_sync_bytes\n\n\n  #75ae6860-be9f-4498-9315-1478c78551f6,(NULL),0 , NUM_WORK_ITEMS_0_0,0,0,262144\n  #Total work items, averaged across hosts\n  work_items_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sNUM_WORK_ITEMS_0_\\d*,\\d*,\\d*,(\\d*)')\n  work_items = re.findall(work_items_regex, log_data)\n  print work_items\n  num_arr = numpy.array(map(int,work_items))\n  total_work_item = numpy.sum(num_arr, axis=0)\n  print total_work_item\n\n  timer_graph_init_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sTIMER_GRAPH_INIT' + r',\\d*,\\d*,(\\d*)')\n  timer_graph_init_all_hosts = re.findall(timer_graph_init_regex, log_data)\n\n  num_arr = numpy.array(map(int,timer_graph_init_all_hosts))\n  #avg_graph_init_time = float(numpy.sum(num_arr, axis=0))/float(total_hosts)\n  max_graph_init_time = numpy.max(num_arr, axis=0)\n  #avg_graph_init_time = round((avg_graph_init_time / divisor),3)\n\n  print \"max_graph_init time : \", max_graph_init_time\n\n\n\n  ## Get Graph_init, HG_init, total\n  #81a5b117-8054-46af-9a23-1f28e5ed1bba,(NULL),0 , TIMER_GRAPH_INIT,0,0,306\n  #timer_graph_init_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sTIMER_GRAPH_INIT,\\d*,\\d*,(\\d*)')\n  timer_hg_init_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sTIMER_HG_INIT' + r',\\d*,\\d*,(\\d*)')\n  timer_hg_init_all_hosts = re.findall(timer_hg_init_regex, log_data)\n\n  num_arr = numpy.array(map(int,timer_hg_init_all_hosts))\n  #avg_hg_init_time = float(numpy.sum(num_arr, axis=0))/float(total_hosts)\n  max_hg_init_time = numpy.max(num_arr, axis=0)\n  #avg_hg_init_time 
= round((avg_hg_init_time / divisor),3)\n  hg_init_time = max_hg_init_time\n\n  timer_comm_setup_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sCOMMUNICATION_SETUP_TIME' + r',\\d*,\\d*,(\\d*)')\n  timer_comm_setup_all_hosts = re.findall(timer_comm_setup_regex, log_data)\n\n  num_arr = numpy.array(map(int,timer_comm_setup_all_hosts))\n  #avg_comm_setup_time = float(numpy.sum(num_arr, axis=0))/float(total_hosts)\n  max_comm_setup_time = numpy.max(num_arr, axis=0)\n  #max_comm_setup_time = round((avg_comm_setup_time / divisor),3)\n\n  print \"max_comm_setup time : \", max_comm_setup_time\n\n  timer_total_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sTIMER_TOTAL' + r',\\d*,\\d*,(\\d*)')\n  #timer_graph_init = timer_graph_init_regex.search(log_data)\n  #timer_hg_init = timer_hg_init_regex.search(log_data)\n  timer_total = timer_total_regex.search(log_data)\n  if timer_total is not None:\n    total_time = float(timer_total.group(1))\n    total_time /= divisor\n    total_time = round(total_time, 3)\n\n  return mean_time,rep_factor,mean_do_all,mean_exract_time,mean_set_time,mean_sync_time,total_sync_bytes,num_iter,total_work_item,hg_init_time,total_time,max_do_all,max_extract,max_set,max_sync,max_sync_bytes,max_comm_setup_time,max_graph_init_time\n\n'''\n  if timer_graph_init is not None:\n    graph_init_time = float(timer_graph_init.group(1))\n    graph_init_time /= divisor\n    graph_init_time = round(graph_init_time, 3)\n\n  if timer_hg_init is not None:\n    hg_init_time = float(timer_hg_init.group(1))\n    hg_init_time /= divisor\n    hg_init_time = round(hg_init_time, 3)\n\n  if timer_total is not None:\n    total_time = float(timer_total.group(1))\n    total_time /= divisor\n    total_time = round(total_time, 3)\n\n  print graph_init_time\n  print hg_init_time\n  print total_time\n'''\n\ndef get_basicInfo(fileName, run_identifier):\n\n  print (\"IDENTIFIER : \", str(run_identifier))\n  hostNum_regex = re.compile(re.escape(run_identifier) + 
r',\\(NULL\\),0\\s,\\sHosts,0,0,(\\d*)')\n  cmdLine_regex = re.compile(re.escape(run_identifier) + r',\\(NULL\\),0\\s,\\sCommandLine,0,0,(.*)')\n  threads_regex = re.compile(re.escape(run_identifier) + r',\\(NULL\\),0\\s,\\sThreads,0,0,(\\d*)')\n  runs_regex = re.compile(re.escape(run_identifier) + r',\\(NULL\\),0\\s,\\sRuns,0,0,(\\d*)')\n\n  log_data = open(fileName).read()\n\n  hostNum    = ''\n  cmdLine    = ''\n  threads    = ''\n  runs       = ''\n  benchmark  = ''\n  algo_type  = ''\n  cut_type   = ''\n  input_graph = ''\n\n  hostNum_search = hostNum_regex.search(log_data)\n  print hostNum_regex.pattern\n  print cmdLine_regex.pattern\n  if hostNum_search is not None:\n    hostNum = hostNum_search.group(1)\n\n  cmdLine_search = cmdLine_regex.search(log_data)\n  if cmdLine_search is not None:\n    cmdLine = cmdLine_search.group(1)\n\n  threads_search = threads_regex.search(log_data)\n  if threads_search is not None:\n    threads = threads_search.group(1)\n\n  runs_search    = runs_regex.search(log_data)\n  if runs_search is not None:\n    runs = runs_search.group(1)\n  if runs == \"\":\n    runs = \"3\"\n\n  print (\"CMDLINE : \", cmdLine)\n  split_cmdLine_algo = cmdLine.split()[0].split(\"/\")[-1].split(\"_\")\n  print split_cmdLine_algo\n  benchmark = split_cmdLine_algo[0]\n  algo_type = '-'.join(split_cmdLine_algo[1:])\n\n  split_cmdLine_input = cmdLine.split()[1].split(\"/\")\n  input_graph_name = split_cmdLine_input[-1]\n  input_graph = input_graph_name.split(\".\")[0]\n\n  print cmdLine\n  split_cmdLine = cmdLine.split()\n  print split_cmdLine\n  cut_type = \"edge-cut\"\n  for index in range(0, len(split_cmdLine)):\n    if split_cmdLine[index] == \"-enableVertexCut=1\":\n      cut_type = \"vertex-cut\"\n      break\n    elif split_cmdLine[index] == \"-enableVertexCut\":\n         cut_type = \"vertex-cut\"\n         break\n    elif split_cmdLine[index] == \"-enableVertexCut=0\":\n         cut_type = \"edge-cut\"\n         break\n\n  num_nodes = hostNum\n  
for index in range(2, len(cmdLine.split())):\n    split_cmdLine_devices = cmdLine.split()[index].split(\"=\")\n    if split_cmdLine_devices[0] == '-num_nodes':\n      num_nodes = split_cmdLine_devices[-1]\n  num_hosts_per_node = int(hostNum) / int(num_nodes)\n\n  devices = str(hostNum) + \" CPU\"\n  deviceKind = \"CPU\"\n  for index in range(2, len(cmdLine.split())):\n    split_cmdLine_devices = cmdLine.split()[index].split(\"=\")\n    if split_cmdLine_devices[0] == '-pset':\n      devices_str = split_cmdLine_devices[-1]\n      cpus = devices_str.count('c')\n      gpus = devices_str.count('g')\n      if cpus + gpus == num_hosts_per_node and gpus > 0:\n        if cpus == 0:\n          devices = str(gpus) + \" GPU\"\n          deviceKind = \"GPU\"\n        else:\n          devices = str(cpus) + \" CPU + \" + str(gpus) + \" GPU\"\n          deviceKind = \"CPU+GPU\"\n          hostNum = str(int(hostNum) - cpus)\n      break\n\n  return hostNum, cmdLine, threads, runs, benchmark, algo_type, cut_type, input_graph, devices, deviceKind\n\ndef format_str(col):\n  max_len = 0\n  for c in col:\n    if max_len < len(str(c)):\n      max_len = len(str(c))\n  return max_len\n\ndef main(argv):\n  inputFile = ''\n  forHost = ''\n  outputFile = 'LOG_output.csv'\n  time_unit = 'milliseconds'\n  try:\n    opts, args = getopt.getopt(argv,\"hi:n:o:md\",[\"ifile=\",\"node=\",\"ofile=\",\"milliseconds\"])\n  except getopt.GetoptError:\n    print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n  for opt, arg in opts:\n    if opt == '-h':\n      print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n      sys.exit()\n    elif opt in (\"-i\", \"--ifile\"):\n      inputFile = arg\n    elif opt in (\"-n\", \"--node\"):\n      forHost = arg\n    elif opt in (\"-o\", \"--ofile\"):\n      outputFile = arg\n    elif opt in (\"-m\", \"--milliseconds\"):\n      time_unit = 
'milliseconds'\n\n  if inputFile == '':\n    print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n\n  print 'Input file is : ', inputFile\n  print 'Output file is : ', outputFile\n  print 'Data for host : ', forHost\n\n  if forHost == '':\n    print 'Find the slowest host and calculating everything for that host'\n\n  #Find the unique identifiers for different runs\n  log_data = open(inputFile).read()\n  run_identifiers_regex = re.compile(r'(.*),\\(NULL\\),0\\s,\\sTIMER_0,0,0,\\d*')\n  run_identifiers = re.findall(run_identifiers_regex, log_data)\n  for run_identifier in run_identifiers:\n    print run_identifier\n\n    hostNum, cmdLine, threads, runs, benchmark, algo_type, cut_type, input_graph, devices, deviceKind = get_basicInfo(inputFile, run_identifier)\n\n    #shorten the graph names:\n    if input_graph == \"twitter-ICWSM10-component_withRandomWeights\" or input_graph == \"twitter-ICWSM10-component-transpose\" or input_graph == \"twitter-ICWSM10-component\":\n      input_graph = \"twitter-50\"\n    elif input_graph == \"twitter-WWW10-component_withRandomWeights\" or input_graph == \"twitter-WWW10-component-transpose\" or input_graph == \"twitter-WWW10-component\":\n      input_graph = \"twitter-40\"\n\n    print 'Hosts : ', hostNum , ' CmdLine : ', cmdLine, ' Threads : ', threads , ' Runs : ', runs, ' benchmark :' , benchmark , ' algo_type :', algo_type, ' cut_type : ', cut_type, ' input_graph : ', input_graph\n    print 'Devices : ', devices\n    data = match_timers(inputFile, benchmark, forHost, runs, threads, time_unit, hostNum, cut_type, run_identifier)\n\n    print data\n\n    output_str = run_identifier + ',' + benchmark + ',' + 'abelian' + ',' + hostNum  + ',' + threads  + ','\n    output_str += deviceKind  + ',' + devices  + ','\n    output_str += input_graph  + ',' + algo_type  + ',' + cut_type\n    print output_str\n\n\n    header_csv_str = 
\"run-id,benchmark,platform,host,threads,\"\n    header_csv_str += \"deviceKind,devices,\"\n    header_csv_str += \"input,variant,partition,mean_time,rep_factor,mean_do_all,mean_exract_time,mean_set_time,mean_sync_time,total_sync_bytes,num_iter,num_work_items,hg_init_time,total_time,max_do_all,max_extract,max_set,max_sync,max_sync_bytes,max_comm_setup_time,max_graph_init_time\"\n\n\n    header_csv_list = header_csv_str.split(',')\n    try:\n      if os.path.isfile(outputFile) is False:\n        fd_outputFile = open(outputFile, 'wb')\n        wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n        wr.writerow(header_csv_list)\n        fd_outputFile.close()\n        print \"Adding header to the empty file.\"\n      else:\n        print \"outputFile : \", outputFile, \" exists, results will be appended to it.\"\n    except OSError:\n      print \"Error in outfile opening\\n\"\n\n    data_list = list(data) #[data] #list(data)\n    complete_data = output_str.split(\",\") + data_list\n    fd_outputFile = open(outputFile, 'a')\n    wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n    wr.writerow(complete_data)\n    fd_outputFile.close()\n\n'''\n  ## Write ghost and slave nodes to a file.\n  ghost_array = build_master_ghost_matrix(inputFile, benchmark, cut_type, hostNum, runs, threads)\n  ghostNodes_file = outputFile + \"_\" + cut_type\n  fd_ghostNodes_file = open(ghostNodes_file, 'ab')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.write(\"\\nHosts : \" + hostNum + \"\\nInputFile : \"+ inputFile + \"\\nBenchmark: \" + benchmark + \"\\nPartition: \" + cut_type + \"\\n\\n\")\n  numpy.savetxt(fd_ghostNodes_file, ghost_array, delimiter=',', fmt='%d')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.close()\n'''\n\nif __name__ == \"__main__\":\n  main(sys.argv[1:])\n\n"
  },
  {
    "path": "scripts/experimental/abelian_log_parser_multipleRuns2.py",
    "content": "##########################################\n# To parse log files generated by abelian.\n# Author: Gurbinder Gill\n# Email: gurbinder533@gmail.com\n#########################################\n\nimport re\nimport os\nimport sys, getopt\nimport csv\nimport numpy\nimport subprocess\n\n######## NOTES:\n# All time values are in sec by default.\n\n\ndef match_timers(fileName, benchmark, forHost, numRuns, numThreads, time_unit, total_hosts, partition, run_identifier):\n\n  mean_time = 0.0;\n  recvNum_total = 0\n  recvBytes_total = 0\n  sendNum_total = 0\n  sendBytes_total = 0\n  sync_pull_avg_time_total = 0.0;\n  extract_avg_time_total = 0.0;\n  set_avg_time_total = 0.0;\n  sync_push_avg_time_total = 0.0;\n  graph_init_time = 0\n  hg_init_time = 0\n  total_time = 0\n\n  if(benchmark == \"cc\"):\n    benchmark = \"ConnectedComp\"\n\n  if(benchmark == \"pagerank\"):\n    benchmark = \"PageRank\"\n\n  if (time_unit == 'seconds'):\n    divisor = 1000\n  else:\n    divisor = 1\n\n  log_data = open(fileName).read()\n\n\n  timer_regex = re.compile(re.escape(run_identifier) + r',\\(NULL\\),0\\s,\\sTIMER_0,\\d*,0,(\\d*)')\n  timers = re.findall(timer_regex, log_data)\n  #print timers\n\n  time = []\n  total_mean_time=0.0\n\n  print timers\n  for i in range(int(total_hosts)):\n    time.append(0)\n\n  for timer in timers:\n    total_mean_time += float(timer)\n    #print \"TIMER : \", timer\n\n  print \"TOTAL MEAN TIME \" , total_mean_time\n  total_mean_time = total_mean_time/int(total_hosts)\n  total_mean_time /= divisor\n  mean_time = total_mean_time = round(total_mean_time, 3)\n  print \"Total Mean time: \", total_mean_time\n\n  rep_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sREPLICATION_FACTOR_0_0,(\\d*),\\d*,(.*)')\n\n  rep_search = rep_regex.search(log_data)\n  rep_factor = 0;\n  if rep_search is not None:\n    rep_factor = rep_search.group(2)\n    rep_factor = round(float(rep_factor), 3)\n  print (\"Replication factor  : \", rep_factor)\n\n  
num_iter_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sNUM_ITERATIONS_0' + r',\\d*,\\d*,(\\d*)')\n  num_iter_search = num_iter_regex.search(log_data)\n  if num_iter_regex is not None:\n    if num_iter_search is None:\n      num_iter = -1\n    else:\n      num_iter = num_iter_search.group(1)\n    print \"NUM_ITER : \", num_iter\n\n\n  #Finding mean,max,sd compute time over all hosts\n  max_do_all = 0\n  sum_do_all = 0\n  sum_std_do_all = 0;\n  for i in range(0,int(num_iter)):\n    do_all_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\s.*DO_ALL_IMPL_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    do_all_all_hosts = re.findall(do_all_regex, log_data)\n    num_arr = numpy.array(map(int,do_all_all_hosts))\n\n    if len(num_arr) != 0:\n      sum_std_do_all += numpy.std(num_arr, axis=0)\n      #print (\" COMPUTE NUM_ARR\", num_arr)\n      max_compute = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_do_all += max_compute\n      sum_do_all += numpy.sum(num_arr, axis=0)\n  print \"max_do_all \" , max_do_all\n  print \"sum_do_all \" , sum_do_all\n  mean_do_all = float(sum_do_all)/float(total_hosts)\n  mean_std_do_all = float(sum_std_do_all)/float(num_iter)\n  print \"XXXXXXXXXXXXXXXXx STD DO ALL : \" , mean_std_do_all\n\n\n  print \"mean_do_all\", mean_do_all\n\n\n  ##################### SYNC ##############################\n  ############## SYNC = BROADCAST + REDUCE ################\n  #Finding mean,max,sd sync time over all hosts\n  max_sync = 0\n  sum_sync = 0\n  for i in range(0,int(num_iter)):\n    sync_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sSYNC_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    sync_all_hosts = re.findall(sync_regex, log_data)\n    num_arr = numpy.array(map(int,sync_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_sync_itr = numpy.max(num_arr, axis=0)\n 
     #print (\"MAX : \", max_compute)\n      max_sync += max_sync_itr\n      sum_sync += numpy.sum(num_arr, axis=0)\n  mean_sync_time = float(sum_sync)/float(total_hosts)\n\n\n  print \"NEW SYNC_TIME \", mean_sync_time\n\n\n\n  ##################### BROADCAST ##############################\n  #### BROADCAST = BROADCAST_SEND + BROADCAST_EXTRACT + BROADCAST_RECV + BROADCAST_SET\n  #Finding mean,max,sd BROADCAST time over all hosts\n  max_broadcast_time = 0\n  sum_broadcast = 0\n  for i in range(0,int(num_iter)):\n    broadcast_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sBROADCAST_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    broadcast_all_hosts = re.findall(broadcast_regex, log_data)\n    num_arr = numpy.array(map(int,broadcast_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_broadcast_itr = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_broadcast_time += max_broadcast_itr\n      sum_broadcast += numpy.sum(num_arr, axis=0)\n  mean_broadcast_time = float(sum_broadcast)/float(total_hosts)\n\n\n  print \"NEW BROADCAST_TIME \", mean_broadcast_time\n\n  max_broadcast_set = 0\n  max_broadcast_recv = 0\n  max_broadcast_extract = 0\n  max_broadcast_send = 0\n\n  mean_broadcast_set_time = 0\n  mean_broadcast_recv_time = 0\n  mean_broadcast_extract_time = 0\n  mean_broadcast_send_time = 0\n\n  '''\n  ##################### BROADCAST SEND ##############################\n  #Finding mean,max,sd BROADCAST time over all hosts\n  max_broadcast_send = 0\n  sum_broadcast_send = 0\n  for i in range(0,int(num_iter)):\n    broadcast_send_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sBROADCAST_SEND_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    broadcast_send_all_hosts = re.findall(broadcast_send_regex, log_data)\n    num_arr = numpy.array(map(int,broadcast_send_all_hosts))\n\n    if len(num_arr) != 
0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_broadcast_send_itr = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_broadcast_send += max_broadcast_send_itr\n      sum_broadcast_send += numpy.sum(num_arr, axis=0)\n  mean_broadcast_send_time = float(sum_broadcast_send)/float(total_hosts)\n\n\n  print \"NEW broadcast_send_TIME \", mean_broadcast_send_time\n\n\n  ##################### BROADCAST EXTRACT ##############################\n  #Finding mean,max,sd BROADCAST time over all hosts\n  max_broadcast_extract = 0\n  sum_broadcast_extract = 0\n  for i in range(0,int(num_iter)):\n    broadcast_extract_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sBROADCAST_EXTRACT_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    broadcast_extract_all_hosts = re.findall(broadcast_extract_regex, log_data)\n    num_arr = numpy.array(map(int,broadcast_extract_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_broadcast_extract_itr = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_broadcast_extract += max_broadcast_extract_itr\n      sum_broadcast_extract += numpy.sum(num_arr, axis=0)\n  mean_broadcast_extract_time = float(sum_broadcast_extract)/float(total_hosts)\n\n\n  print \"NEW broadcast_extract_TIME \", mean_broadcast_extract_time\n\n\n##################### BROADCAST recv ##############################\n  #Finding mean,max,sd BROADCAST time over all hosts\n  max_broadcast_recv = 0\n  sum_broadcast_recv = 0\n  for i in range(0,int(num_iter)):\n    broadcast_recv_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sBROADCAST_RECV_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    broadcast_recv_all_hosts = re.findall(broadcast_recv_regex, log_data)\n    num_arr = numpy.array(map(int,broadcast_recv_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", 
num_arr)\n      max_broadcast_recv_itr = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_broadcast_recv += max_broadcast_recv_itr\n      sum_broadcast_recv += numpy.sum(num_arr, axis=0)\n  mean_broadcast_recv_time = float(sum_broadcast_recv)/float(total_hosts)\n\n\n  print \"NEW broadcast_recv_TIME \", mean_broadcast_recv_time\n\n\n  ##################### BROADCAST SET ##############################\n  #Finding mean,max,sd BROADCAST time over all hosts\n  max_broadcast_set = 0\n  sum_broadcast_set = 0\n  for i in range(0,int(num_iter)):\n    broadcast_set_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sBROADCAST_SET_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    broadcast_set_all_hosts = re.findall(broadcast_set_regex, log_data)\n    num_arr = numpy.array(map(int,broadcast_set_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_broadcast_set_itr = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_broadcast_set += max_broadcast_set_itr\n      sum_broadcast_set += numpy.sum(num_arr, axis=0)\n  print \"max_do_all \" , max_broadcast_set\n  print \"sum_do_all \" , sum_broadcast_set\n  mean_broadcast_set_time = float(sum_broadcast_set)/float(total_hosts)\n\n\n  print \"NEW broadcast_set_TIME \", mean_broadcast_set_time\n  '''\n\n\n\n\n  ##################### REDUCE ##############################\n  #Finding mean,max,sd REDUCE time over all hosts\n  max_reduce_time = 0\n  sum_reduce = 0\n  for i in range(0,int(num_iter)):\n    reduce_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sREDUCE_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    reduce_all_hosts = re.findall(reduce_regex, log_data)\n    num_arr = numpy.array(map(int,reduce_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_reduce_itr = numpy.max(num_arr, axis=0)\n    
  #print (\"MAX : \", max_compute)\n      max_reduce_time += max_reduce_itr\n      sum_reduce += numpy.sum(num_arr, axis=0)\n  mean_reduce_time = float(sum_reduce)/float(total_hosts)\n\n\n  print \"NEW REDUCE_TIME \", mean_reduce_time\n\n  max_reduce_set = 0\n  max_reduce_recv = 0\n  max_reduce_extract = 0\n  max_reduce_send = 0\n\n  mean_reduce_set_time = 0\n  mean_reduce_recv_time = 0\n  mean_reduce_extract_time = 0\n  mean_reduce_send_time = 0\n\n  '''\n  ##################### REDUCE SEND ##############################\n  #Finding mean,max,sd reduce time over all hosts\n  max_reduce_send = 0\n  sum_reduce_send = 0\n  for i in range(0,int(num_iter)):\n    reduce_send_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sREDUCE_SEND_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    reduce_send_all_hosts = re.findall(reduce_send_regex, log_data)\n    num_arr = numpy.array(map(int,reduce_send_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_reduce_send_itr = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_reduce_send += max_reduce_send_itr\n      sum_reduce_send += numpy.sum(num_arr, axis=0)\n  mean_reduce_send_time = float(sum_reduce_send)/float(total_hosts)\n\n\n  print \"NEW reduce_send_TIME \", mean_reduce_send_time\n\n\n\n  ##################### REDUCE EXTRACT ##############################\n  #Finding mean,max,sd reduce time over all hosts\n  max_reduce_extract = 0\n  sum_reduce_extract = 0\n  for i in range(0,int(num_iter)):\n    reduce_extract_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sREDUCE_EXTRACT_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    reduce_extract_all_hosts = re.findall(reduce_extract_regex, log_data)\n    num_arr = numpy.array(map(int,reduce_extract_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_reduce_extract_itr = 
numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_reduce_extract += max_reduce_extract_itr\n      sum_reduce_extract += numpy.sum(num_arr, axis=0)\n  mean_reduce_extract_time = float(sum_reduce_extract)/float(total_hosts)\n\n\n  print \"NEW reduce_extract_TIME \", mean_reduce_extract_time\n\n\n##################### REDUCE recv ##############################\n  #Finding mean,max,sd reduce time over all hosts\n  max_reduce_recv = 0\n  sum_reduce_recv = 0\n  for i in range(0,int(num_iter)):\n    reduce_recv_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sREDUCE_RECV_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    reduce_recv_all_hosts = re.findall(reduce_recv_regex, log_data)\n    num_arr = numpy.array(map(int,reduce_recv_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_reduce_recv_itr = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_reduce_recv += max_reduce_recv_itr\n      sum_reduce_recv += numpy.sum(num_arr, axis=0)\n  mean_reduce_recv_time = float(sum_reduce_recv)/float(total_hosts)\n\n\n  print \"NEW reduce_recv_TIME \", mean_reduce_recv_time\n\n\n  ##################### REDUCE SET ##############################\n  #Finding mean,max,sd reduce time over all hosts\n  max_reduce_set = 0\n  sum_reduce_set = 0\n  for i in range(0,int(num_iter)):\n    reduce_set_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sREDUCE_SET_(?i)' + re.escape(benchmark) + r'_0_'+ re.escape(str(i)) + r',.*' + r',\\d*,(\\d*)')\n    reduce_set_all_hosts = re.findall(reduce_set_regex, log_data)\n    num_arr = numpy.array(map(int,reduce_set_all_hosts))\n\n    if len(num_arr) != 0:\n      #print (\" SYNC NUM_ARR\", num_arr)\n      max_reduce_set_itr = numpy.max(num_arr, axis=0)\n      #print (\"MAX : \", max_compute)\n      max_reduce_set += max_reduce_set_itr\n      sum_reduce_set += numpy.sum(num_arr, axis=0)\n  
mean_reduce_set_time = float(sum_reduce_set)/float(total_hosts)\n\n\n  print \"NEW reduce_set_TIME \", mean_reduce_set_time\n  '''\n\n\n\n  # ######################## BROADCAST SENT BYTES ################################\n  #Finding total communication volume in bytes\n  #2cc54509-cb49-43f9-b1a5-be8f4a4eaf1f,(NULL),0 , BROADCAST_SEND_BYTES_BFS_0_1,0,0,41851160\n  sum_broadcast_bytes = 0\n  max_broadcast_bytes = 0\n  min_broadcast_bytes = 0\n  broadcast_bytes_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sBROADCAST_SEND_BYTES_(?i)' + re.escape(benchmark) + r'_0_' +r'\\d*' +r',.*' + r',\\d*,(\\d*)')\n  broadcast_bytes_all_hosts = re.findall(broadcast_bytes_regex, log_data)\n  num_arr = numpy.array(map(int,broadcast_bytes_all_hosts))\n  if(num_arr.size > 0):\n    sum_broadcast_bytes += numpy.sum(num_arr, axis=0)\n    max_broadcast_bytes += numpy.max(num_arr, axis=0)\n    min_broadcast_bytes += numpy.min(num_arr, axis=0)\n\n  print \"BROADCAST SEND BYTES : \", sum_broadcast_bytes\n\n\n  # ######################## REDUCE SENT BYTES ################################\n  #Finding total communication volume in bytes\n  #2cc54509-cb49-43f9-b1a5-be8f4a4eaf1f,(NULL),0 , BROADCAST_SEND_BYTES_BFS_0_1,0,0,41851160\n  sum_reduce_bytes = 0\n  max_reduce_bytes = 0\n  min_reduce_bytes = 0\n  reduce_bytes_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sREDUCE_SEND_BYTES_(?i)' + re.escape(benchmark) + r'_0_' +r'\\d*' +r',.*' + r',\\d*,(\\d*)')\n  reduce_bytes_all_hosts = re.findall(reduce_bytes_regex, log_data)\n  num_arr = numpy.array(map(int,reduce_bytes_all_hosts))\n  if(num_arr.size > 0):\n    sum_reduce_bytes += numpy.sum(num_arr, axis=0)\n    max_reduce_bytes += numpy.max(num_arr, axis=0)\n    min_reduce_bytes += numpy.min(num_arr, axis=0)\n\n  print \"REDUCE SEND BYTES : \", sum_reduce_bytes\n\n\n  total_sync_bytes = sum_reduce_bytes + sum_broadcast_bytes\n\n\n  #75ae6860-be9f-4498-9315-1478c78551f6,(NULL),0 , NUM_WORK_ITEMS_0_0,0,0,262144\n  #Total work 
items, averaged across hosts\n  work_items_regex = re.compile((run_identifier) + r',\\(NULL\\),0\\s,\\sNUM_WORK_ITEMS_0_\\d*,\\d*,\\d*,(\\d*)')\n  work_items = re.findall(work_items_regex, log_data)\n  print work_items\n  num_arr = numpy.array(map(int,work_items))\n  total_work_item = numpy.sum(num_arr, axis=0)\n  print total_work_item\n\n  timer_graph_init_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sTIMER_GRAPH_INIT' + r',\\d*,\\d*,(\\d*)')\n  timer_graph_init_all_hosts = re.findall(timer_graph_init_regex, log_data)\n\n  num_arr = numpy.array(map(int,timer_graph_init_all_hosts))\n  if(num_arr.size > 0):\n    #avg_graph_init_time = float(numpy.sum(num_arr, axis=0))/float(total_hosts)\n    max_graph_init_time = numpy.max(num_arr, axis=0)\n    #avg_graph_init_time = round((avg_graph_init_time / divisor),3)\n\n  print \"max_graph_init time : \", max_graph_init_time\n\n\n\n  ## Get Graph_init, HG_init, total\n  #81a5b117-8054-46af-9a23-1f28e5ed1bba,(NULL),0 , TIMER_GRAPH_INIT,0,0,306\n  #timer_graph_init_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sTIMER_GRAPH_INIT,\\d*,\\d*,(\\d*)')\n  timer_hg_init_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sTIMER_HG_INIT' + r',\\d*,\\d*,(\\d*)')\n  timer_hg_init_all_hosts = re.findall(timer_hg_init_regex, log_data)\n\n  num_arr = numpy.array(map(int,timer_hg_init_all_hosts))\n  #avg_hg_init_time = float(numpy.sum(num_arr, axis=0))/float(total_hosts)\n  if(num_arr.size > 0):\n    max_hg_init_time = numpy.max(num_arr, axis=0)\n    #avg_hg_init_time = round((avg_hg_init_time / divisor),3)\n    hg_init_time = max_hg_init_time\n\n  timer_comm_setup_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sCOMMUNICATION_SETUP_TIME' + r',\\d*,\\d*,(\\d*)')\n  timer_comm_setup_all_hosts = re.findall(timer_comm_setup_regex, log_data)\n\n  max_comm_setup_time = 0\n  num_arr = numpy.array(map(int,timer_comm_setup_all_hosts))\n  if(num_arr.size > 0):\n    #avg_comm_setup_time = 
float(numpy.sum(num_arr, axis=0))/float(total_hosts)\n    max_comm_setup_time = numpy.max(num_arr, axis=0)\n    #max_comm_setup_time = round((avg_comm_setup_time / divisor),3)\n\n  print \"max_comm_setup time : \", max_comm_setup_time\n\n  timer_total_regex = re.compile((run_identifier) +r',\\(NULL\\),0\\s,\\sTIMER_TOTAL' + r',\\d*,\\d*,(\\d*)')\n  #timer_graph_init = timer_graph_init_regex.search(log_data)\n  #timer_hg_init = timer_hg_init_regex.search(log_data)\n  timer_total = timer_total_regex.search(log_data)\n  if timer_total is not None:\n    total_time = float(timer_total.group(1))\n    total_time /= divisor\n    total_time = round(total_time, 3)\n\n  #return mean_time,rep_factor,mean_do_all,total_sync_bytes,sum_broadcast_bytes,sum_reduce_bytes,num_iter,total_work_item,hg_init_time,total_time,max_do_all,mean_sync_time,mean_broadcast_time,mean_broadcast_send_time,mean_broadcast_extract_time,mean_broadcast_recv_time,mean_broadcast_set_time,mean_reduce_time,mean_reduce_send_time,mean_reduce_extract_time,mean_reduce_recv_time,mean_reduce_set_time,max_comm_setup_time,max_graph_init_time\n  return mean_time,rep_factor,mean_do_all,total_sync_bytes,sum_broadcast_bytes,sum_reduce_bytes,num_iter,total_work_item,hg_init_time,total_time,max_do_all,mean_sync_time,max_sync,mean_broadcast_time,max_broadcast_time,mean_reduce_time,max_reduce_time,max_comm_setup_time,max_graph_init_time\n\n\n'''\n  if timer_graph_init is not None:\n    graph_init_time = float(timer_graph_init.group(1))\n    graph_init_time /= divisor\n    graph_init_time = round(graph_init_time, 3)\n\n  if timer_hg_init is not None:\n    hg_init_time = float(timer_hg_init.group(1))\n    hg_init_time /= divisor\n    hg_init_time = round(hg_init_time, 3)\n\n  if timer_total is not None:\n    total_time = float(timer_total.group(1))\n    total_time /= divisor\n    total_time = round(total_time, 3)\n\n  print graph_init_time\n  print hg_init_time\n  print total_time\n'''\n\ndef get_basicInfo(fileName, 
run_identifier):\n\n  print (\"IDENTIFIER : \", str(run_identifier))\n  hostNum_regex = re.compile(re.escape(run_identifier) + r',\\(NULL\\),0\\s,\\sHosts,0,0,(\\d*)')\n  cmdLine_regex = re.compile(re.escape(run_identifier) + r',\\(NULL\\),0\\s,\\sCommandLine,0,0,(.*)')\n  threads_regex = re.compile(re.escape(run_identifier) + r',\\(NULL\\),0\\s,\\sThreads,0,0,(\\d*)')\n  runs_regex = re.compile(re.escape(run_identifier) + r',\\(NULL\\),0\\s,\\sRuns,0,0,(\\d*)')\n\n  log_data = open(fileName).read()\n\n  hostNum    = ''\n  cmdLine    = ''\n  threads    = ''\n  runs       = ''\n  benchmark  = ''\n  algo_type  = ''\n  cut_type   = ''\n  input_graph = ''\n\n  hostNum_search = hostNum_regex.search(log_data)\n  print hostNum_regex.pattern\n  print cmdLine_regex.pattern\n  if hostNum_search is not None:\n    hostNum = hostNum_search.group(1)\n\n  cmdLine_search = cmdLine_regex.search(log_data)\n  if cmdLine_search is not None:\n    cmdLine = cmdLine_search.group(1)\n\n  threads_search = threads_regex.search(log_data)\n  if threads_search is not None:\n    threads = threads_search.group(1)\n\n  runs_search    = runs_regex.search(log_data)\n  if runs_search is not None:\n    runs = runs_search.group(1)\n  if runs == \"\":\n    runs = \"3\"\n\n  print (\"CMDLINE : \", cmdLine)\n  split_cmdLine_algo = cmdLine.split()[0].split(\"/\")[-1].split(\"_\")\n  print split_cmdLine_algo\n  benchmark = split_cmdLine_algo[0]\n  algo_type = '-'.join(split_cmdLine_algo[1:])\n\n  split_cmdLine_input = cmdLine.split()[1].split(\"/\")\n  input_graph_name = split_cmdLine_input[-1]\n  input_graph = input_graph_name.split(\".\")[0]\n\n  print cmdLine\n  split_cmdLine = cmdLine.split()\n  print split_cmdLine\n  cut_type = \"edge-cut\"\n  for index in range(0, len(split_cmdLine)):\n    if split_cmdLine[index] == \"-enableVertexCut=1\":\n      cut_type = \"vertex-cut\"\n      break\n    elif split_cmdLine[index] == \"-enableVertexCut\":\n         cut_type = \"vertex-cut\"\n         break\n    elif 
split_cmdLine[index] == \"-enableVertexCut=0\":\n         cut_type = \"edge-cut\"\n         break\n\n  num_nodes = hostNum\n  for index in range(2, len(cmdLine.split())):\n    split_cmdLine_devices = cmdLine.split()[index].split(\"=\")\n    if split_cmdLine_devices[0] == '-num_nodes':\n      num_nodes = split_cmdLine_devices[-1]\n  num_hosts_per_node = int(hostNum) / int(num_nodes)\n\n  devices = str(hostNum) + \" CPU\"\n  deviceKind = \"CPU\"\n  for index in range(2, len(cmdLine.split())):\n    split_cmdLine_devices = cmdLine.split()[index].split(\"=\")\n    if split_cmdLine_devices[0] == '-pset':\n      devices_str = split_cmdLine_devices[-1]\n      cpus = devices_str.count('c')\n      gpus = devices_str.count('g')\n      if cpus + gpus == num_hosts_per_node and gpus > 0:\n        if cpus == 0:\n          devices = str(gpus) + \" GPU\"\n          deviceKind = \"GPU\"\n        else:\n          devices = str(cpus) + \" CPU + \" + str(gpus) + \" GPU\"\n          deviceKind = \"CPU+GPU\"\n          hostNum = str(int(hostNum) - cpus)\n      break\n\n  return hostNum, cmdLine, threads, runs, benchmark, algo_type, cut_type, input_graph, devices, deviceKind\n\ndef format_str(col):\n  max_len = 0\n  for c in col:\n    if max_len < len(str(c)):\n      max_len = len(str(c))\n  return max_len\n\ndef main(argv):\n  inputFile = ''\n  forHost = ''\n  outputFile = 'LOG_output.csv'\n  time_unit = 'milliseconds'\n  try:\n    opts, args = getopt.getopt(argv,\"hi:n:o:md\",[\"ifile=\",\"node=\",\"ofile=\",\"milliseconds\"])\n  except getopt.GetoptError:\n    print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n  for opt, arg in opts:\n    if opt == '-h':\n      print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n      sys.exit()\n    elif opt in (\"-i\", \"--ifile\"):\n      inputFile = arg\n    elif opt in (\"-n\", \"--node\"):\n      forHost = arg\n    
elif opt in (\"-o\", \"--ofile\"):\n      outputFile = arg\n    elif opt in (\"-m\", \"--milliseconds\"):\n      time_unit = 'milliseconds'\n\n  if inputFile == '':\n    print 'abelian_log_parser.py -i <inputFile> [-o <outputFile> -n <hostNumber 0 to hosts-1> --milliseconds]'\n    sys.exit(2)\n\n  print 'Input file is : ', inputFile\n  print 'Output file is : ', outputFile\n  print 'Data for host : ', forHost\n\n  if forHost == '':\n    print 'Find the slowest host and calculating everything for that host'\n\n  #Find the unique identifiers for different runs\n  log_data = open(inputFile).read()\n  run_identifiers_regex = re.compile(r'(.*),\\(NULL\\),0\\s,\\sTIMER_0,0,0,\\d*')\n  run_identifiers = re.findall(run_identifiers_regex, log_data)\n  for run_identifier in run_identifiers:\n    print run_identifier\n\n    hostNum, cmdLine, threads, runs, benchmark, algo_type, cut_type, input_graph, devices, deviceKind = get_basicInfo(inputFile, run_identifier)\n\n    #shorten the graph names:\n    if input_graph == \"twitter-ICWSM10-component_withRandomWeights\" or input_graph == \"twitter-ICWSM10-component-transpose\" or input_graph == \"twitter-ICWSM10-component\":\n      input_graph = \"twitter-50\"\n    elif input_graph == \"twitter-WWW10-component_withRandomWeights\" or input_graph == \"twitter-WWW10-component-transpose\" or input_graph == \"twitter-WWW10-component\":\n      input_graph = \"twitter-40\"\n\n    print 'Hosts : ', hostNum , ' CmdLine : ', cmdLine, ' Threads : ', threads , ' Runs : ', runs, ' benchmark :' , benchmark , ' algo_type :', algo_type, ' cut_type : ', cut_type, ' input_graph : ', input_graph\n    print 'Devices : ', devices\n    data = match_timers(inputFile, benchmark, forHost, runs, threads, time_unit, hostNum, cut_type, run_identifier)\n\n    print data\n\n    output_str = run_identifier + ',' + benchmark + ',' + 'abelian' + ',' + hostNum  + ',' + threads  + ','\n    output_str += deviceKind  + ',' + devices  + ','\n    output_str += 
input_graph  + ',' + algo_type  + ',' + cut_type\n    print output_str\n\n\n    header_csv_str = \"run-id,benchmark,platform,host,threads,\"\n    header_csv_str += \"deviceKind,devices,\"\n    #header_csv_str += \"input,variant,partition,mean_time,rep_factor,mean_do_all,mean_exract_time,mean_set_time,mean_sync_time,total_sync_bytes,num_iter,num_work_items,hg_init_time,total_time,max_do_all,max_extract,max_set,max_sync,max_sync_bytes,max_comm_setup_time,max_graph_init_time\"\n\n    #header_csv_str += \"input,variant,partition,mean_time,rep_factor,mean_do_all,total_sync_bytes,sum_broadcast_bytes,sum_reduce_bytes,num_iter,total_work_item,hg_init_time,total_time,max_do_all,mean_sync_time,mean_broadcast_time,mean_broadcast_send_time,mean_broadcast_extract_time,mean_broadcast_recv_time,mean_broadcast_set_time,mean_reduce_time,mean_reduce_send_time,mean_reduce_extract_time,mean_reduce_recv_time,mean_reduce_set_time,max_comm_setup_time,max_graph_init_time\"\n    header_csv_str += \"input,variant,partition,mean_time,rep_factor,mean_do_all,total_sync_bytes,sum_broadcast_bytes,sum_reduce_bytes,num_iter,total_work_item,hg_init_time,total_time,max_do_all,mean_sync_time,max_sync,mean_broadcast_time,max_broadcast_time,mean_reduce_time,max_reduce_time,max_comm_setup_time,max_graph_init_time\"\n\n    header_csv_list = header_csv_str.split(',')\n    try:\n      if os.path.isfile(outputFile) is False:\n        fd_outputFile = open(outputFile, 'wb')\n        wr = csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n        wr.writerow(header_csv_list)\n        fd_outputFile.close()\n        print \"Adding header to the empty file.\"\n      else:\n        print \"outputFile : \", outputFile, \" exists, results will be appended to it.\"\n    except OSError:\n      print \"Error in outfile opening\\n\"\n\n    data_list = list(data) #[data] #list(data)\n    complete_data = output_str.split(\",\") + data_list\n    fd_outputFile = open(outputFile, 'a')\n    wr = 
csv.writer(fd_outputFile, quoting=csv.QUOTE_NONE, lineterminator='\\n')\n    wr.writerow(complete_data)\n    fd_outputFile.close()\n\n'''\n  ## Write ghost and slave nodes to a file.\n  ghost_array = build_master_ghost_matrix(inputFile, benchmark, cut_type, hostNum, runs, threads)\n  ghostNodes_file = outputFile + \"_\" + cut_type\n  fd_ghostNodes_file = open(ghostNodes_file, 'ab')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.write(\"\\nHosts : \" + hostNum + \"\\nInputFile : \"+ inputFile + \"\\nBenchmark: \" + benchmark + \"\\nPartition: \" + cut_type + \"\\n\\n\")\n  numpy.savetxt(fd_ghostNodes_file, ghost_array, delimiter=',', fmt='%d')\n  fd_ghostNodes_file.write(\"\\n--------------------------------------------------------------\\n\")\n  fd_ghostNodes_file.close()\n'''\n\nif __name__ == \"__main__\":\n  main(sys.argv[1:])\n\n"
  },
  {
    "path": "scripts/experimental/bmk2/__init__.py",
    "content": "#\n# __init__.py\n#\n# Initialization file for bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n"
  },
  {
    "path": "scripts/experimental/bmk2/bispec.py",
    "content": "#!/usr/bin/env python\n#\n# bispec.py\n#\n# Reader for Binary/Input specification files (*.bispec) for bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport sys\nimport re\nimport logging\n\nlog = logging.getLogger(__name__)\n\nclass BinInputSpecV1(object):\n    def __init__(self):\n        self.rules = []\n        self.inputs = {}\n\n    def set_input_db(self, inputs):\n        \"\"\"Given an input database, save the input names with their files to \n        this object.\n        \"\"\"\n        for i in inputs:\n            nm = i.get_id()\n            if nm not in self.inputs:\n                self.inputs[nm] = {}\n\n            self.inputs[nm][i.props.file] = i\n\n    def get_inputs(self, binary, sel_inputs = None):\n        \"\"\"Get the inputs that need to be run for a particular binary by\n        matching it to the key specified in the binspec file.\n        e.g. 
\"bfs\" should matche any binary with \"bfs\" in its id\n        \"\"\"\n        inpnames = set()\n        binid = binary.get_id()\n        \n        for (re, inp) in self.rules:\n            if re.match(binid): # always anchored?\n                inpnames = inpnames.union(inp)\n\n        out = []\n        for n in inpnames:\n            if sel_inputs and n not in sel_inputs:\n                log.debug(\"Ignoring input '%s' for '%s', not in sel_inputs\" % (n, binid))\n                continue\n            \n            if n in self.inputs:\n                out += self.inputs[n].values()\n            else:\n                log.info(\"Input named '%s' not found in inputdb\" % (n,))\n\n\n        return out\n\n    def read(self, ff):\n        \"\"\"Read the bispec file: save a binary matcher to a set of inputs\n        that should be run should that matcher match a binary name.\n        \"\"\"\n        out = []\n        for l in ff:\n            l = l.strip()\n\n            if not l: continue\n\n            if l[0] == \"#\":\n                continue\n\n            ls = l.split(\" \", 1)\n            binmatch = ls[0]\n            inpnames = [x.strip() for x in ls[1].split(\",\")]\n            out.append((re.compile(binmatch), set(inpnames)))\n            \n        self.rules = out\n\ndef read_bin_input_spec(f):\n    \"\"\"Read a bispec file which specifies which inputs to run with particular\n    binaries.\n    \"\"\"\n    with open(f, \"rb\") as ff:\n        l = ff.readline().strip()\n        if l == \"#v1\":\n            x = BinInputSpecV1()\n        else:\n            print >>sys.stderr, \"Unknown file version for input/binary spec\", l\n    \n        x.read(ff)\n\n    return x\n\nif __name__ == \"__main__\":\n    read_bin_input_spec(sys.argv[1])\n"
  },
  {
    "path": "scripts/experimental/bmk2/bmk2.py",
    "content": "#\n# bmk2.py\n#\n# Loader for bmk2 tests.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport sys\nfrom common import *\nfrom config import *\nimport glob\nimport os\nimport inputdb\nimport bispec\nfrom core import *\nfrom checkers import *\nfrom perf import *\n\nimport logging\nlog = logging.getLogger(__name__)\n\ndef load_binary_specs(f, binary_group = 'BINARIES'):\n    \"\"\"Load a python file which should have a global variable\n    which contains a list of test specifications to run.\n    \"\"\"\n    g = load_py_module(f)\n\n    if binary_group in g:\n        return g[binary_group]\n    else:\n        log.error(\"No %s in %s\" % (binary_group, f))\n        return None\n       \nclass Loader(object):\n    def __init__(self, metadir, inpproc):\n        self.config = Config(metadir, inpproc)        \n        self.binaries = {}\n        self.bin_inputs = {}\n        self.inp_filtered = False\n\n    def initialize(self, ftf = {}):\n        \"\"\"Load the input database, processor, and properties as well as\n        the binary spec which specifies which inputs should be run with\n        certain binaries.\n        \"\"\"\n        # load the configuration files\n        if not self.config.load_config():\n            return False\n        \n        if not self.config.auto_set_files():\n            return False\n\n        for ty, f in ftf.iteritems():\n            if isinstance(f, list):\n                for ff in f:\n                    self.config.set_file(ff, ty)\n            else:\n                self.config.set_file(f, ty)\n\n        # load the input database from specified config files\n        self.inputdb = inputdb.InputDB(self.config.get_file(FT_INPUTDB), \n                                       self.config.get_file(FT_INPUTPROC),\n                                       self.config.get_file(FT_INPUTPROPS))\n        if not 
self.inputdb.load():\n            return False\n\n        # load the binary -> input mapping\n        self.bs = bispec.read_bin_input_spec(self.config.get_file(FT_BISPEC))\n        self.bs.set_input_db(self.inputdb)\n\n        return True\n\n    def split_binputs(self, binputs):\n        bins = set()\n        inputs = set()\n\n        if binputs:\n            inpnames = self.inputdb.inpnames\n\n            for i in binputs:\n                if i in inpnames:\n                    inputs.add(i)\n                else:\n                    bins.add(i)\n\n        self.inp_filtered = len(inputs) > 0\n\n        return inputs, bins            \n\n    def load_multiple_binaries(self, binspecs, sel_binaries = None, bingroup = \"BINARIES\"):\n        for b in binspecs:\n            if not self.load_binaries(b, sel_binaries, bingroup):\n                return False\n\n        return True\n\n    def load_binaries(self, binspec, sel_binaries = None, bingroup = \"BINARIES\"):\n        \"\"\"Load the list of binaries that need to be run from a python file\n        with a global object containing the names of Binary classes.\n        \"\"\"\n        d = os.path.dirname(binspec)\n        binaries = load_binary_specs(binspec, bingroup)\n        if binaries:\n            for b in binaries:\n                if b.get_id() in self.binaries:\n                    log.error(\"Duplicate binary id %s in %s\" % (b.get_id(), binspec))\n                    return False\n\n                if sel_binaries and b.get_id() not in sel_binaries:\n                    log.debug(\"Ignoring binary id %s in %s, not in sel_binaries\" % (b.get_id(), binspec))\n                    continue\n\n                self.binaries[b.get_id()] = b\n\n                if d == '':\n                    log.warning('binspec path from \"%s\" is empty' % (binspec,))\n\n                b.props._cwd = d\n\n            return True\n        \n        if not binaries or len(binaries) == 0:\n            log.error(\"%s is empty in 
%s\" % (bingroup, binspec))\n\n        return False\n\n    def apply_config(self):\n        \"\"\"TODO figure out what this does\"\"\"\n        if len(self.binaries) == 0:\n            log.error(\"No binaries to apply configuration to.\")\n            return False\n\n        if self.config.bin_config is not None and (self.config.bin_config):\n            log.info('Applying configuration \"%s\"' % (self.config.bin_config,))\n\n            for b in self.binaries.itervalues():\n                b.apply_config(self.config.bin_config)\n        else:\n            log.info('No binary-specific configurations specified')\n        \n        return True\n\n    def associate_inputs(self, binputs = None):\n        \"\"\"Given loaded binary inputs + binaries, associate inputs with\n        binaries.\n        \"\"\"\n        if len(self.binaries) == 0:\n            log.error(\"No binaries\")\n            return False\n\n        for bid, b in self.binaries.iteritems():\n            i = self.bs.get_inputs(b, binputs)\n            if len(i) == 0:\n                if not self.inp_filtered:\n                    log.error(\"No inputs matched for binary \" + bid)\n                    return False\n                else:\n                    log.warning(\"No inputs matched for binary \" + bid)\n                    continue\n\n            i = b.filter_inputs(i)\n            if len(i) == 0:\n                if not self.inp_filtered:\n                    log.error(\"Filtering discarded all inputs for binary \" + bid)\n                    return False\n                else:\n                    log.warning(\"Filtering discarded all inputs for binary \" + bid)\n                    continue\n            \n            self.bin_inputs[bid] = i\n\n        return True\n\n    # NOTE: I (Loc) added config in so I could pass it in\n    def get_run_specs(self, config=None):\n        \"\"\"Returns a list of all of the run specifications for binaries in\n        this loader (one run spec for each input it 
is associated with).\n        \"\"\"\n        out = []\n        for bid, b in self.binaries.iteritems():\n            if bid in self.bin_inputs:\n                for inp in self.bin_inputs[bid]:\n                    testList = b.get_run_spec(inp, config)\n                    for k in testList:\n                      out.append(k)\n                    #out.append((i) for i in b.get_run_spec(inp))\n            else:\n                assert self.inp_filtered, bid\n                    \n        return out\n\nif __name__ == \"__main__\":\n    import sys\n    x = load_binary_specs(sys.argv[1])\n    for bmk in x:\n        print bmk.get_id()\n        bmk.props.dump()\n"
  },
  {
    "path": "scripts/experimental/bmk2/checkers.py",
    "content": "#\n# checkers.py\n#\n# Checkers available for tests in bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nfrom core import Run, AT_OPAQUE\nimport re\nimport logging\nimport os\nlog = logging.getLogger(__name__)\n\nclass Checker(object):\n    \"\"\"Base checker class\"\"\"\n    check_ok = False\n\n    def check(self, run):\n        pass\n\n    def get_input_files(self):\n        return []\n\nclass PassChecker(Checker):\n    \"\"\"Check that auto-passes regardless of output.\"\"\"\n    def check(self, run):\n        run.check_ok = run.run_ok\n        return run.run_ok\n\nclass DiffChecker(Checker):\n    \"\"\"Check with diff.\"\"\"\n    def __init__(self, file1, gold):\n        self.file1 = file1\n        self.gold = gold\n        \n    def get_input_files(self):\n        return [self.gold]\n\n    def check(self, run):        \n        if not run.run_ok:\n            log.error(\"Cannot check failed run %s\" % (run))\n            return False\n\n        args = run.get_tmp_files([self.file1, self.gold])\n\n        if os.name != \"nt\":\n        \n            x = Run({}, \"diff\", [(x, AT_OPAQUE) for x in [\"-q\"] + args])\n            if not x.run():\n                log.info(\"diff -u '%s' '%s'\" % tuple(args))\n                return False\n\n            run.check_ok = True   \n        else:\n            x = Run({}, \"fc.exe\", [(x, AT_OPAQUE) for x in args])\n            if not x.run():\n                log.info(\"fc.exe '%s' '%s'\" % tuple(args))\n                return False\n\n            run.check_ok = True\n        return True\n\nclass NumDiffChecker(Checker):\n    \"\"\"TODO find out what this is\"\"\"\n    def __init__(self, file1, gold, options=None):\n        self.file1 = file1\n        self.gold = gold\n        self.options = [] if options is None else options\n        \n    def get_input_files(self):\n        return 
[self.gold]\n\n    def check(self, run):\n        if not run.run_ok:\n            log.error(\"Cannot check failed run %s\" % (run))\n            return False\n\n        args = run.get_tmp_files([self.file1, self.gold])\n\n        if os.name != \"nt\":\n        \n            x = Run({}, \"numdiff\", [(x, AT_OPAQUE) for x in ([\"-q\"]  + self.options + args)])\n            if not x.run():\n                log.info(\"numdiff %s '%s' '%s'\" % tuple([\" \".join(self.options)] + args))\n                return False\n\n            run.check_ok = True   \n        else:\n            x = Run({}, \"numdiff.exe\", [(x, AT_OPAQUE) for x in (self.options + self.args)])\n            if not x.run():\n                log.info(\"numdiff.exe  %s '%s' '%s'\" % tuple([\" \".join(self.options)] + args))\n                return False\n\n            run.check_ok = True\n        return True\n\nclass REChecker(Checker):\n    \"\"\"Check with regex; look for a particular pattern in output.\"\"\"\n    def __init__(self, rexp):\n        self.re = re.compile(rexp, re.MULTILINE)\n        \n    def check(self, run):        \n        if not run.run_ok:\n            log.error(\"Cannot check failed run %s\" % (run))\n            return False\n\n        for o in [run.stdout, run.stderr]:\n            #Tyler: have to remove the annoying windows \\r character\n            m = self.re.search(o.replace(\"\\r\",\"\")) #TODO: stderr?\n            if m:\n                run.check_ok = True\n                break\n        else:\n            log.info(\"REChecker could not match '%s'\" % (self.re.pattern))\n\n        return run.check_ok\n\nclass ExternalChecker(Checker):\n    \"\"\"Check with an external program specified in a Run object.\"\"\"\n    def __init__(self, brs):\n        self.rs = brs\n\n    def get_input_files(self):\n        out = []\n        if not self.rs.in_path:\n            out.append(self.rs.binary)\n\n        return out + self.rs.get_input_files()\n\n    def check(self, run):\n        if 
not run.run_ok:\n            log.error(\"Cannot check failed run %s\" % (run))\n            return False\n        \n        x = self.rs.run(run.runid + \".external-checker\", inherit_tmpfiles = run.tmpfiles)\n        if not x.run_ok:\n            return False\n\n        run.check_ok = True\n        return run.check_ok\n"
  },
  {
    "path": "scripts/experimental/bmk2/collect.py",
    "content": "#!/usr/bin/env python\n#\n# collect.py\n#\n# Scans log files for \"COLLECT\" and outputs a list of files to be\n# collected. Part of bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport sys\nimport datetime\nimport logproc\nimport argparse\nimport os\nimport mapfile\n\ndef build_collect_list(logfile, skip_failed = True, strip_path = 0, suffix = None):\n    out = {}\n    last_runid = {}\n    failed_runids = {}\n\n    basepath = \"\"\n    for r in logproc.parse_log_file(logfile):\n        if r.type == \"COLLECT\":\n            if r.filetype == \"basepath\":\n                basepath = r.file\n            else:                \n                if r.rsid not in out:\n                    out[r.rsid] = {}\n\n                if r.runid not in out[r.rsid]:\n                    out[r.rsid][r.runid] = {}\n\n                last_runid[r.rsid] = r.runid\n\n                if r.filetype not in out[r.rsid][r.runid]:\n                    out[r.rsid][r.runid][r.filetype] = []\n\n                s = strip_path\n                x = -1\n                n = r.file\n                while s > 0:\n                    x = r.file.find('/', x + 1)\n                    if x == -1: break\n                    s -= 1\n                else:\n                    n = r.file[x+1:]\n\n                if suffix:\n                    n = n + suffix\n\n                out[r.rsid][r.runid][r.filetype].append(n)\n        elif r.type == \"FAIL\":\n            if r.runid is not None:\n                if r.binid not in failed_runids:\n                    failed_runids[r.binid] = set()\n\n                failed_runids[r.binid].add(r.runid)\n            else:\n                # older log files\n                if skip_failed:\n                    if \"run failed\" in r.message or \"check failed\" in r.message: # not robust!\n                        if r.binid in 
last_runid:\n                            del out[r.binid][last_runid[r.binid]]\n\n    if skip_failed:\n        for binid in failed_runids:\n            if binid in out:\n                for runid in failed_runids[binid]:\n                    if runid in out[binid]:\n                        del out[binid][runid]\n                \n    return basepath, out\n\ndef add_names(fnames, basepath, files, out):\n    added_fnames = []\n    added_out = []\n\n    for f in files:\n        if os.path.basename(f) in fnames:\n            print >>sys.stderr, \"ERROR: duplicate\", os.path.basename(f)\n            sys.exit(1)\n\n        bn = os.path.basename(f)\n        fp = os.path.join(basepath, f)\n\n        fnames.add(bn)\n        out.append(fp)\n\n        added_fnames.append(bn)\n        added_out.append(fp)\n\n    return added_fnames, added_out\n\ndef mapentries(fnames, revmap):\n    for fn in fnames:\n        x = revmap[fn]\n        yield mapfile.mapfile_entry(binid = x[0],\n                                    input = \"\",\n                                    runid = x[1],\n                                    filetype = x[2],\n                                    filename = fn,\n                                    abspath = x[3])\n                                    \n        #print >>mapfile, \"%s %s %s %s %s\" % (x[0], x[1], x[2], fn, x[3])\n    \ndef collect_logfile(logfile, skip_failed = True, strip_path = 0, suffix = None, filetypes = []):\n    basepath, colfiles = build_collect_list(logfile, skip_failed, strip_path, suffix)\n    out = []\n    fnames = set()\n    revmap = {}\n\n    for rsid in colfiles:\n        for runid in colfiles[rsid]:\n            for ft in colfiles[rsid][runid]:\n                if len(filetypes) and ft in filetypes:\n                    af, ao = add_names(fnames, basepath, colfiles[rsid][runid][ft], out)\n                elif len(filetypes) == 0:\n                    af, ao = add_names(fnames, basepath, colfiles[rsid][runid][ft], out)\n             
   else:\n                    af = None\n                    ao = None\n\n                if af is not None:\n                    for f, ff in zip(af, ao):\n                        revmap[f] = (rsid, runid, ft, ff)\n                \n    assert len(fnames) == len(out)\n\n    return out, fnames, revmap\n    \n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(\"Collect extra files generated during test2.py in a single directory\")\n    parser.add_argument('logfile', help='Logfile')\n    parser.add_argument('filetype', nargs='?', help='Type of files to collect (default: all)', default=[])\n    parser.add_argument('-p', dest=\"strip_path\", type=int, metavar='NUM', help='Strip NUM components from filename before combining with basepath', default=0)\n    parser.add_argument('-m', dest=\"map\", metavar='FILE', help='Store map of RSID, file and filetype in FILE', default=None)\n    parser.add_argument(\"-a\", dest='append', action='store_true', default=False, help=\"Append to map file\")\n    parser.add_argument('-s', dest=\"suffix\", metavar='SUFFIX', help='Add suffix to filename', default=0)\n    parser.add_argument('--collect-failed', dest=\"skip_failed\", action=\"store_false\", default=True, help='Collect files from failed runs')\n\n    args = parser.parse_args()\n\n    ft = set() if len(args.filetype) == 0  else set([args.filetype])\n\n    out, fnames, revmap = collect_logfile(args.logfile, args.skip_failed, args.strip_path, args.suffix, ft)\n    print \"\\n\".join(out)\n\n    if args.map:\n        mapfile.write_mapfile_raw(args.map, mapentries(fnames, revmap), \"w\" if not args.append else \"a\")\n"
  },
  {
    "path": "scripts/experimental/bmk2/collect_multi.py",
    "content": "#!/usr/bin/env python\n#\n# collect_multi.py\n#\n# Scans multiple log files for \"COLLECT\" and outputs a list of files to be\n# collected. Part of bmk2.\n#\n# Copyright (c) 2015, 2016, 2017 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nfrom collect import *\nimport argparse\nimport mapfile\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(\"Collect extra files generated during test2.py in a single directory\")\n    parser.add_argument('logfiles', nargs='+', help='Logfiles')\n    parser.add_argument('-t', dest=\"filetype\", action=\"append\", help='Type of files to collect (default: all)', default=[])\n    parser.add_argument('-p', dest=\"strip_path\", type=int, metavar='NUM', help='Strip NUM components from filename before combining with basepath', default=0)\n    parser.add_argument('-m', dest=\"map\", metavar='FILE', help='Store map of RSID, file and filetype in FILE', default=None)\n    parser.add_argument(\"-a\", dest='append', action='store_true', default=False, help=\"Append to map file\")\n    parser.add_argument('-s', dest=\"suffix\", metavar='SUFFIX', help='Add suffix to filename', default=0)\n    parser.add_argument('--collect-failed', dest=\"skip_failed\", action=\"store_false\", default=True, help='Collect files from failed runs')\n\n    args = parser.parse_args()\n\n    ft = set(args.filetype)\n\n    for i, l in enumerate(args.logfiles):\n        out, fnames, revmap = collect_logfile(l, args.skip_failed, args.strip_path, args.suffix, ft)\n        print \"\\n\".join(out)\n\n        if args.map:\n            mapfile.write_mapfile_raw(args.map, mapentries(fnames, revmap), \"a\" if (args.append or i > 0) else \"w\")\n"
  },
  {
    "path": "scripts/experimental/bmk2/common.py",
    "content": "#\n# common.py\n#\n# Python utilities for bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\n# runs a python file and returns the globals\ndef load_py_module(f):\n    \"\"\"Executes a python file and returns the globals in it at the end\n    of execution.\n    \"\"\"\n    g = {}\n    x = execfile(f, g)\n    return g\n"
  },
  {
    "path": "scripts/experimental/bmk2/config.py",
    "content": "#\n# config.py\n#\n# bmk2.cfg reader for bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport glob\nimport os\nimport logging\nimport ConfigParser\n\nlog = logging.getLogger(__name__)\n\n# essentially enums for globs\nFT_BISPEC = 1\nFT_INPUTDB = 2\nFT_INPUTPROPS = 3\nFT_INPUTPROC = 4\n\nFT_FIRST = FT_BISPEC\nFT_LAST = FT_INPUTPROC\n\n# file types you can have multiple of\nFT_MULTIPLE_OKAY = set()\n# file types that aren't necessary\nFT_ZERO_OKAY = set([FT_INPUTPROPS, FT_INPUTPROC])\n# file types that are possible to load + their expected file extensions\nFT_GLOBS = {FT_BISPEC: '*.bispec', \n            FT_INPUTDB: '*.inputdb',\n            FT_INPUTPROPS: '*.inputprops', \n            FT_INPUTPROC: None}\n\nclass Config(object):\n    \"\"\"Class that provides an interface to a BMK2 configuration file\"\"\"\n    def __init__(self, metadir, inpproc = None):\n        \"\"\"Setup data structures and load the config file into the object\n\n        Keyword Arguments:\n        metadir -- the directory with the configuration information\n        inpproc -- TODO figure out what this is\n        \"\"\"\n        self.metadir = metadir\n        self.okay = False\n        self.files = {}\n        self.disable_binaries = set()\n        self.site = None\n\n        if inpproc is not None:\n            self.files = {FT_INPUTPROC: inpproc}\n\n        self._load_config()\n\n    def set_file(self, f, ty, multiple = False):\n        \"\"\"Save a file name into a particular class of files.\n\n        Keyword Arguments:\n        f -- the file name\n        ty -- the type of the passed in file (see FT_GLOBS for file types)\n        multiple -- specifies if you can have multiple files of some type\n        \"\"\"\n        assert not (ty < FT_FIRST or ty > FT_LAST), \"Invalid file type: %s\" % (ty,)\n        \n        if multiple:\n            assert ty 
in FT_MULTIPLE_OKAY, \"File type %d not in multiple\" % (ty,)\n\n        if not (os.path.exists(f) and os.path.isfile(f)):\n            log.error(\"File '%s' (file type: %d) does not exist or is not a file\" % \n                      (f, ty))\n            return False\n\n        if ty not in self.files:\n            if multiple:\n                self.files[ty] = [f]\n            else:\n                self.files[ty] = f\n        else:\n            if multiple:\n                self.files[ty].append(f)\n            else:\n                log.warning(\"Overwriting file type %d (currently: %s) with %s\" % \n                            (ty, self.files[ty], f))\n                self.files[ty] = f\n\n        return True\n\n    def get_file(self, ty):\n        \"\"\"Get the file of a particular type from this object.\"\"\"\n        if ty not in self.files:\n            return None\n\n        return self.files[ty]\n        \n    def _site_specific_cfg(self, x):\n        \"\"\"TODO\"\"\"\n        sitefiles = glob.glob(os.path.join(self.metadir, \"SITE-IS.*\"))\n\n        if len(sitefiles) > 1:\n            log.error(\"Only one sitefile should exist. 
Currently, multiple sitefiles exist: '%s'\" % (sitefiles,))\n        elif len(sitefiles) == 0:\n            log.info(\"No sitefile found.\")\n        else:\n            p = sitefiles[0].rindex(\".\")\n            self.site = sitefiles[0][p+1:]\n            log.info(\"Site set to '%s'.\" % (self.site,))\n            sscfg = os.path.join(self.metadir, \"bmk2.cfg.\" + self.site)\n\n            if not os.path.exists(sscfg):\n                log.warning(\"No site-specific configuration '%s' found.\" % (sscfg,))\n            else:\n                log.info(\"Loading site-specific configuration from '%s'.\" % (sscfg,))\n                y = self._read_config(sscfg,)\n                \n                for s in y.sections():\n                    for n, v in y.items(s):\n                        if not self.cfg.has_section(s):\n                            self.cfg.add_section(s)\n\n                        log.info(\"Setting site-specific [%s]:%s to '%s'\" % (s, n, v))\n                        self.cfg.set(s, n, v)                \n\n                return True\n\n        return False\n\n    def _read_config(self, f):\n        \"\"\"TODO\"\"\"\n        x = ConfigParser.SafeConfigParser()\n\n        with open(f, \"rb\") as fp:\n            x.readfp(fp)\n\n            try:\n                version = x.get(\"bmk2\", \"version\")\n                if version != \"2\":\n                    log.error(\"%s: Unknown config version %s\" % (self.config_file, version,))\n                    return False\n            except ConfigParser.NoOptionError:\n                log.error(\"%s: Unable to read version\" % (self.config_file,))\n                return False\n\n            return x\n\n    def _load_config(self):\n        \"\"\"Open a configuration file: it must have a version specification for this\n        function to exit cleanly. 
Saves the opened configuration file to cfg.\n        \"\"\"\n        self.cfg = None\n\n        if not (os.path.exists(self.metadir) and os.path.isdir(self.metadir)):\n            log.error(\"Metadir '%s' does not exist or is not a directory\" % (self.metadir,))\n            return False\n\n        self.config_file = os.path.join(self.metadir, \"bmk2.cfg\")\n        if not (os.path.exists(self.config_file) and os.path.isfile(self.config_file)):\n            log.error(\"Configuration file '%s' does not exist\" % (self.config_file,))\n            return False\n\n        x = self._read_config(self.config_file)\n        if x == False:\n            return x\n\n        self.cfg = x\n        if self._site_specific_cfg(x) == False:\n            return False\n        #x = ConfigParser.SafeConfigParser()\n        #with open(self.config_file, \"rb\") as f:\n        #    x.readfp(f)\n\n        #    # only reads version 2 bmk2 config files\n        #    try:\n        #        version = x.get(\"bmk2\", \"version\")\n        #        if version != \"2\":\n        #            log.error(\"%s: Unknown config version %s\" % \n        #                      (self.config_file, version,))\n        #            return False\n        #    except ConfigParser.NoOptionError:\n        #        log.error(\"%s: Unable to read version\" % (self.config_file,))\n        #        return False\n\n        #    self.cfg = x\n\n    def load_config(self):\n        \"\"\"Load the file types and disabled binaries as specified by the \n        loaded configuration file.\n        \"\"\"\n        x = self.cfg\n        if not x:\n            return False\n\n        # load each file type, save names\n        for prop, ty in [(\"inpproc\", FT_INPUTPROC),\n                         (\"inputdb\", FT_INPUTDB),\n                         (\"inputprops\", FT_INPUTPROPS),\n                         (\"bispec\", FT_BISPEC)]:\n            try:\n                # section bmk2 with a particular property\n                
val = x.get(\"bmk2\", prop)\n                val = os.path.join(self.metadir, val)\n                if self.set_file(val, ty):\n                    log.info(\"%s: Loaded file type %d ('%s')\" % \n                             (self.config_file, ty, val))\n                else:\n                    return False\n            except ConfigParser.NoOptionError:\n                log.debug(\"%s: File type %d (property: '%s') not specified\" % \n                          (self.config_file, ty, prop))\n\n        # save disabled binaries into object\n        try:\n            val = x.get(\"bmk2\", \"disable_binaries\")\n            self.disable_binaries = set([xx.strip() for xx in val.split(\",\")])\n        except ConfigParser.NoOptionError:\n            pass\n\n        # TODO find out what this is\n        self.bin_config = None\n        if x.has_section(\"default-config\"):\n            self.bin_config = self.section_to_dict(x, \"default-config\")\n\n        self.cfg = x\n        return True\n\n    def section_to_dict(self, cfgobj, section):        \n        \"\"\"TODO\"\"\"\n        kv = cfgobj.items(section)\n\n        o = set()\n        for kk in kv:\n            if kk[0] in o:\n                log.warning(\"Duplicated key '%s' in section '%s'\", kk[0], section)\n\n            o.add(kk[0])\n\n        return dict(kv)\n\n    def load_bin_config(self, config_sections):\n        \"\"\"TODO\"\"\"\n        x = self.cfg\n        if not x:\n            return False\n        \n        ok = True\n        out = []\n        for s in config_sections:\n            if not x.has_section(s):\n                log.error(\"Configuration section '%s' not found\" % (s,))\n                ok = False\n            else:\n                out.append(self.section_to_dict(x, s))\n\n        if ok:\n            nout = {}\n            for o in out:\n                if 'type' in o and o['type'] == 'bmk2config':\n                    if 'disable_binaries' in o:\n                        v = 
set([xx.strip() for xx in o['disable_binaries'].split(\",\")])\n                        self.disable_binaries = self.disable_binaries.union(v)\n                    else:\n                        # TODO: handle other configuration specific things?\n                        pass\n                else:\n                    nout.update(o)\n\n            if len(nout):\n                if self.bin_config is None:\n                    self.bin_config = {}\n\n                self.bin_config.update(nout)\n\n            return True\n\n        return ok\n        \n\n    def load_var_config(self, varconfigs):\n        \"\"\"TODO\"\"\"\n        o = {}\n        for vv in varconfigs:\n            va, vl = vv.split(\"=\")\n            o[va] = vl\n\n        if self.bin_config is None:\n            self.bin_config = {}\n\n        # TODO: warn of command line config over-riding?\n        self.bin_config.update(o)\n        return True\n\n\n                    \n    def get_var(self, key, default = None, sec = \"bmk2\"):\n        \"\"\"Loads a variable from a particular secion of the loaded\n        config file\n\n        Keyword Arguments:\n        key -- variable to load\n        default -- default value to return\n        sec -- section in the config file to search\n        \"\"\"\n        try:\n            return self.cfg.get(sec, key)\n        except ConfigParser.NoOptionError:\n            return default\n\n    def auto_set_files(self):\n        \"\"\"Load any required remaining files in the metadata directory that have\n        not been loaded into the files structure of the object yet.\n        \"\"\"\n        for ty in range(FT_FIRST, FT_LAST):\n            if ty not in self.files and FT_GLOBS[ty] is not None:\n                matches = glob.glob(os.path.join(self.metadir, FT_GLOBS[ty]))\n\n                if len(matches) == 0:\n                    if ty not in FT_ZERO_OKAY:\n                        log.error(\"File type %d (%s) required, but not found in %s\" % \n              
                    (ty, FT_GLOBS[ty], self.metadir))\n                        return False\n                elif len(matches) == 1:\n                    log.info(\"File type %d auto set to %s\" % (ty, matches[0]))\n                    if not self.set_file(matches[0], ty, False):\n                        return False\n                elif len(matches) > 1:\n                    if ty not in FT_MULTIPLE_OKAY:\n                        log.error(\"Multiple matches found for file type %d (%s) in %s, must specify only one.\" % \n                                  (ty, FT_GLOBS[ty], self.metadir))\n                        return False\n                    else:\n                        for f in matches:\n                            if not self.set_file(f, ty, True):\n                                return False\n\n        return True\n\n__all__  = ['FT_BISPEC', 'FT_INPUTDB', 'FT_INPUTPROC', 'FT_INPUTPROPS',\n            'Config']\n\nif __name__ == \"__main__\":\n    \"\"\"Load configuration files specified by the configuration data.\n    The directory to the configuration data should be the second argument,\n    and there should be a bmk2.cfg file in the directory.\n    \"\"\"\n    import sys\n    logging.basicConfig(level=logging.DEBUG)\n    inpproc = None\n\n    if len(sys.argv) > 2:\n        inpproc = sys.argv[2]\n\n    x = Config(sys.argv[1], inpproc)\n    if x.load_config():\n        if x.auto_set_files():\n            print \"LOADED CONFIG\"\n"
  },
  {
    "path": "scripts/experimental/bmk2/convert.py",
    "content": "#!/usr/bin/env python\n#\n# convert.py\n#\n# Bulk converter for graph files in bmk2. \n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport sys\nimport ConfigParser\nimport argparse\nfrom extras import *\nimport logging\nimport opdb\nimport os\nimport re\nimport sconvert\n\nlog = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO, format='%(levelname)-8s %(name)-10s %(message)s')\n\np = argparse.ArgumentParser(\"Generate conversion makefile\")\np.add_argument(\"output\", nargs=\"?\", default=\"/dev/stdout\")\np.add_argument(\"-d\", dest=\"metadir\", metavar=\"PATH\", help=\"Path to load configuration from\", default=\".\")\np.add_argument(\"--iproc\", dest=\"inpproc\", metavar=\"FILE\", help=\"Input processor\")\np.add_argument(\"--bs\", dest=\"binspec\", metavar=\"FILE\", help=\"Binary specification\", default=\"./bmktest2.py\")\np.add_argument(\"--bispec\", dest=\"bispec\", metavar=\"FILE_OR_MNEMONIC\", help=\"Binary+Input specification\")\np.add_argument(\"--scan\", dest=\"scan\", metavar=\"PATH\", help=\"Recursively search PATH for bmktest2.py\")\np.add_argument(\"-v\", dest=\"verbose\", type=int, help=\"Verbosity\", default=0)\n\nargs = p.parse_args()\n\nloaded = standard_loader(args.metadir, args.inpproc, args.binspec, args.scan, args.bispec, bingroup='CONVERTERS')\nif not loaded:\n    sys.exit(1)\nelse:\n    basepath, binspecs, l = loaded\n\nconvspec = l.config.get_var('convspec', None)\nif not convspec:\n    log.error(\"No 'convspec' in config file\")\n    sys.exit(1)\n\ncs = sconvert.load_convspec(os.path.join(l.config.metadir, convspec))\nif not cs:\n    sys.exit(1)\n\nall_types, conv = sconvert.init_convgraph(cs)\n\nout = []\nrspecs = l.get_run_specs()\nfor rs in rspecs:\n    src, srcty, dst, dstty = rs.args\n    src, srcty, dst, dstty = src[0], srcty[0], dst[0], dstty[0]\n\n    exists = {}\n    for alt in 
rs.bmk_input.get_all_alt():\n        if alt.props.format not in all_types:\n            log.error(\"Format '%s' not listed in convspec\"%  (alt.props.format,))\n            sys.exit(1)\n\n        if os.path.exists(alt.props.file):\n            # sometimes alt.props.file may only exist in the database\n            exists[alt.props.format] = alt.props.file\n            \n    cmds = sconvert.convert_one(cs, src, srcty, dst, dstty, all_types, conv, exists, args.verbose)\n    if cmds is None:\n        continue\n\n    out.append(cmds)\n\nif len(out):\n    f = open(args.output, \"w\")\n    sconvert.to_makefile(f, out)\n    f.close()\n"
  },
  {
    "path": "scripts/experimental/bmk2/convgraph.py",
    "content": "#!/usr/bin/env python\n#\n# convgraph.py\n#\n# Planner-based graph converter library.  \n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nfrom pyhop import *\n\nconversions = {}\n\ndef register_conversion(src, dst, fn_xform):\n    assert (src, dst) not in conversions, \"Duplicate conversion (%s, %s)\" % (src, dst)\n\n    conversions[(src, dst)] = fn_xform\n\ndef convert_direct(state, a, fmt_a, b, fmt_b):\n    # we must have a direct converter\n    if (fmt_a, fmt_b) not in conversions:\n        #print \"no direct conversion\"\n        return False\n\n    if state.files[fmt_a] != a and state.existing[fmt_a] != a:\n        #print \"src does not exist\"\n        return False\n\n    if state.files[fmt_b] == b:\n        #print \"dst exists\"\n        return False\n    \n    state.files[fmt_b] = b\n    return state\n\ndeclare_operators(convert_direct)\n\ndef convert_from_existing(state, a, fmt_a, b, fmt_b):\n    if (fmt_a, fmt_b) in conversions:        \n        if b is None:\n            b = conversions[(fmt_a, fmt_b)](a)\n            if b == a:\n                return False\n\n        return [('convert_direct', a, fmt_a, b, fmt_b)]\n\n    for f, e in state.existing.iteritems():\n        if state.files[f] is None and f not in state.tried_existing:\n            state.tried_existing.add(f)\n            # exists but does not feature as a step\n            return [('convert', e, f, b, fmt_b)]\n\n    return False\n\ndef convert_via(state, a, fmt_a, b, fmt_b):\n    if (fmt_a, fmt_b) in conversions:\n        return False\n    else:\n        for s, d in conversions:\n            if s == fmt_a and state.files[d] is None:\n                via = conversions[(s, d)](a)\n\n                if via == a: # did not match regex\n                    return False\n\n                return [('convert', a, fmt_a, via, d),\n                        ('convert', via, d, 
b, fmt_b)]\n        \n        return False\n\ndeclare_methods('convert', convert_from_existing, convert_via)\n\ndef get_conversion(start, start_ty, end, end_ty, existing, verbose=0):\n    s = State('initial')\n    \n    # do we need existing?\n    s.existing = {}\n    s.files = {}\n    s.tried_existing = set()\n\n    for f1, f2 in conversions.keys():\n        s.files[f1] = None\n        s.files[f2] = None\n\n    for k, v in existing.iteritems():\n        s.existing[k] = v\n\n    s.files[start_ty] = start\n\n    x = pyhop(s, [('convert', start, start_ty, end, end_ty)], verbose=verbose)\n    return x\n\nif __name__ == \"__main__\":\n    start_file = 'a'\n    start_file_fmt = 'binary/gr'\n\n    s = State('initial')\n    s.existing = {}\n    s.files = {}\n\n    for f1, f2 in conversions.keys():\n        s.files[f1] = None\n        s.files[f2] = None\n\n    s.files[start_file_fmt] = start_file\n\n    s.existing[start_file_fmt] = start_file\n    s.existing['other/format1'] = 'c'\n\n    x = pyhop(s, [('convert', 'a', 'binary/gr', 'b', 'other/format')], verbose=2)\n    if not x:\n        print \"conversion is unsupported\"\n\n        \n"
  },
  {
    "path": "scripts/experimental/bmk2/core.py",
    "content": "#\n# core.py\n#\n# Core object classes and functions for bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport os\nimport subprocess\nimport tempfile\nimport logging\n\nif os.name != \"nt\":\n    import resource\n\nimport re\n\nlog = logging.getLogger(__name__)\n\nif not hasattr(subprocess, \"check_output\"):\n    print >>sys.stderr, \"%s: Need python 2.7\" % (sys.argv[0],)\n    sys.exit(1)\n\n# Argument Type Enumeration\nAT_OPAQUE = 0\nAT_INPUT_FILE = 1\nAT_OUTPUT_FILE = 2\nAT_TEMPORARY_OUTPUT = 3\nAT_INPUT_FILE_IMPLICIT = 4\nAT_TEMPORARY_INPUT = 5\nAT_LOG = 6\n\nplaceholder_re = re.compile(r'(@[A-Za-z0-9_]+)') # may need delimiters?\n\ndef escape_for_filename(n):\n    \"\"\"Remove / and . from a path to some file (or any string).\"\"\"\n    return n.replace(\"/\", \"_\").replace(\".\", \"\").replace(\"\\\\\",\"_\")\n    \ndef create_log(ftemplate, run):\n    v = {'runid': run.runid}\n    if 'xtitle' in run.rspec.vars:\n        v['xtitle'] = run.rspec.vars['xtitle']\n    else:\n        v['xtitle'] = ''\n\n    if run.rspec: v['rsid'] = escape_for_filename(run.rspec.get_id())\n\n    complete = os.path.join(os.path.dirname(run.binary), ftemplate.format(**v))\n    return complete\n\ndef squash_output(buf, max_bytes = 1600):\n    if len(buf) <= max_bytes:\n        return buf\n    \n    header = buf[:max_bytes/2]\n    tail = buf[-max_bytes/2:]\n\n    pos = header.rfind(\"\\n\")\n    if pos != -1:\n        # can trim by a lot ...\n        header = header[:pos]\n\n    pos = tail.find(\"\\n\")\n    if pos != -1:\n        # can trim by a lot ...\n        tail = tail[pos+1:]\n\n    return header + \"\\n *** SQUASHED *** \\n \" + tail    \n\ndef strip_repeated_lines(buf, min_repeat = 2, msg = '<< previous line repeated {count} times >>\\n'):\n    import cStringIO\n    \n    x = cStringIO.StringIO(buf)\n    y = cStringIO.StringIO()\n\n    
prev = None\n    repeat_count = 0\n    hold_buf = \"\"\n\n    for l in x:\n        if l == prev:\n            repeat_count += 1\n            if repeat_count <= min_repeat:\n                hold_buf = hold_buf + l\n        else:\n            if hold_buf:\n                if repeat_count > min_repeat:\n                    y.write(msg.format(count = repeat_count))\n                else:\n                    y.write(hold_buf)\n\n            repeat_count = 0\n            hold_buf = \"\"\n            y.write(l)\n            \n        prev = l\n    \n    if hold_buf:\n        if repeat_count > min_repeat:\n            y.write(msg.format(count = repeat_count))\n        else:\n            y.write(hold_buf)\n\n        repeat_count = 0\n        hold_buf = \"\"\n\n    return y.getvalue()\n\ndef run_command(cmd, stdout = True, stderr = True, env = None, popen_args = {}): \n    \"\"\"Run on the command line the argument cmd.\"\"\"\n    output = None\n    error = None\n    \n    stdouth = subprocess.PIPE if stdout else None\n    stderrh = subprocess.PIPE if stderr else None\n\n    fname_stdout = None\n    fname_stderr = None\n\n    if os.name == \"nt\":\n        #stdouth, fname_stdout = tempfile.mkstemp(prefix=\"tmp-stdout\" + self.bin_id, dir=self.tmpdir)\n        #stderrh, fname_stderr = tempfile.mkstemp(prefix=\"tmp-stdout\" + self.bin_id, dir=self.tmpdir)\n        stdouth, fname_stdout = tempfile.mkstemp(prefix=\"tmp-stdout\")\n        stderrh, fname_stderr = tempfile.mkstemp(prefix=\"tmp-stderr\")\n    \n    try:\n        proc = subprocess.Popen(cmd, stdout=stdouth, stderr=stderrh, env = env, **popen_args)\n        output, error = proc.communicate()\n        \n        if fname_stdout != None:\n            os.close(stdouth)\n            tmp_f = open(fname_stdout)\n            output = tmp_f.read()\n            tmp_f.close()\n            os.remove(fname_stdout)\n\n        if fname_stderr != None:\n            os.close(stderrh)\n            tmp_f = open(fname_stderr)\n          
  error = tmp_f.read()\n            tmp_f.close()\n            os.remove(fname_stderr)\n\n        if proc.returncode != 0:\n            log.error(\"Execute failed (%d): \" % (proc.returncode,) + \" \".join(cmd))\n            rv = proc.returncode\n        else:\n            rv = 0\n    except OSError as e:\n        #print >>sys.stderr, \"Execute failed: (%d: %s) \"  % (e.errno, e.strerror) + \" \".join(cmd)\n        log.error(\"Execute failed (OSError %d '%s'): \"  % (e.errno, e.strerror) + \" \".join(cmd))\n        output = e.strerror\n        rv = e.errno\n\n    return (rv, output, error)\n\ndef run_command_old(cmd, stdout = True, stderr = True, env = None, popen_args = {}):\n    if stderr:\n        stdout = True\n        stderrh = subprocess.STDOUT\n    else:\n        stderrh = None \n\n    output = None\n    error = None\n\n    if stdout:\n        try:\n            output = subprocess.check_output(cmd, stderr=stderrh, env = env, **popen_args)\n            rv = 0\n        except subprocess.CalledProcessError as e:\n            #print >>sys.stderr, \"Execute failed (%d): \" % (e.returncode,) + \" \".join(cmd)\n            log.error(\"Execute failed (%d): \" % (e.returncode,) + \" \".join(cmd))\n            output = e.output\n            rv = e.returncode\n        except OSError as e:\n            #print >>sys.stderr, \"Execute failed: (%d: %s) \"  % (e.errno, e.strerror) + \" \".join(cmd)\n            log.error(\"Execute failed (OSError %d '%s'): \"  % (e.errno, e.strerror) + \" \".join(cmd))\n            output = e.strerror\n            rv = e.errno\n    else:\n        rv = subprocess.call(cmd, stderr=stderrh)\n\n    return (rv, output, error)\n\n\nclass Properties(object):\n    \"\"\"Classes that inherit from this will have the functionality of dumping\n    all of their attributes.\n    \"\"\"\n    def dump(self):\n        for y in vars(self):\n            print y, getattr(self, y)\n\n    def __str__(self):\n        return \", \".join([\"%s=%s\" % (x, 
getattr(self, x)) for x in vars(self)])\n\nclass RLimit(object):\n    \"\"\"Represents resource limits.\"\"\"\n    def __init__(self):\n        self.limits = {}\n\n    def setrlimit(self, lim, val):\n        self.limits[lim] = val\n\n    def set(self):\n        for lim, val in self.limits.iteritems():\n            resource.setrlimit(lim, val)\n            \nclass Binary(object):\n    \"\"\"Base Binary class.\"\"\"\n    def get_id(self):\n        raise NotImplementedError\n\n    def filter_inputs(self, inputs):\n        raise NotImplementedError\n\n    def apply_config(self, config):\n        raise NotImplementedError\n\n\nclass Converter(Binary):\n    def get_run_spec(self, bmkinput):\n        x = BasicRunSpec()\n        x.set_binary('', 'convert', in_path = True)\n        x.set_arg(bmkinput.props.file, AT_INPUT_FILE)\n        x.set_arg(bmkinput.props.format, AT_OPAQUE)\n\n        x.bmk_input = bmkinput\n\n        alt = bmkinput.get_alt_format(self.format)\n\n        if alt:\n            # we allow this since converter will remove this later ...\n            x.set_arg(alt.props.file, AT_OUTPUT_FILE)\n        else:\n            x.set_arg(\"@output\", AT_OUTPUT_FILE)\n\n        x.set_arg(self.format, AT_OPAQUE)\n\n        return x\n\nclass Input(object):\n    \"\"\"BMK input class created from information from an inputdb file.\n    Attributes are saved into the object proper.\"\"\"\n    def __init__(self, props, db = None):\n        self.props = Properties()\n        self.db = db\n\n        for k, v in props.iteritems():\n            setattr(self.props, k, v)\n\n        self.name = self.props.name\n\n    def get_alt_format(self, fmt):\n        return self.db.get_alt_format(self.name, fmt)\n\n    def get_all_alt(self):\n        return self.db.get_all_alt(self.name)\n\n    def hasprop(self, prop):\n        return hasattr(self.props, prop)\n\n    def get_id(self):\n        return self.name\n\n    def get_file(self):\n        raise NotImplementedError\n\n    def 
__str__(self):\n        return \"%s(%s)\" % (self.name, str(self.props))\n\n    __repr__ = __str__\n\nclass Run(object):\n    \"\"\"Class that specifies one run of some binary with a particular\n    set of arguments/environment setting.\n    \"\"\"\n\n    def __init__(self, env, binary, args, rspec = None):\n        \"\"\"Run initialization.\n\n        Keyword Arguments:\n\n        env -- environment variables\n        binary -- binary name/path\n        args -- arguments to binary\n        rpsec -- a run specification object\n        \"\"\"\n        self.env = env\n        self.binary = binary\n        self.args = args\n        self.cmd_line_c = \"not-run-yet\"\n        self.bin_id = escape_for_filename(self.binary)\n        self.rspec = rspec\n        self.runid = None\n\n        self.retval = -1\n        self.stdout = \"\"\n        self.stderr = \"\"\n\n        self.tmpdir = None\n        self.tmpfiles = {}\n        self.run_ok = False\n        self.check_ok = False\n        self.overlays = []\n        self.popen_args = {}\n\n    def set_popen_args(self, kwd, val):\n        \"\"\"Set a Process open argument.\"\"\"\n        self.popen_args[kwd] = val\n\n    def set_overlays(self, overlays):\n        self.overlays = overlays\n\n    def set_tmpdir(self, tmpdir):\n        self.tmpdir = tmpdir\n\n    def run(self, inherit_tmpfiles = None):\n        \"\"\"Run the commmand specified by this object.\"\"\"\n        assert self.retval == -1, \"Can't use the same Run object twice\"\n\n        cmdline = [self.binary]\n\n        # get arguments to pass into command line\n        for a, aty in self.args:\n            if aty == AT_INPUT_FILE_IMPLICIT:\n                continue\n\n            if aty == AT_TEMPORARY_OUTPUT:\n                km = placeholder_re.search(a)\n                assert km is not None\n                k = km.group(1) # this should really be folded into the argtype itself...\n\n                th, self.tmpfiles[k] = tempfile.mkstemp(prefix=\"test-\" + 
self.bin_id, dir=self.tmpdir)\n                os.close(th) # else files will continue to occupy space even after they are deleted\n                log.debug(\"Created temporary file '%s' for '%s'\" % (self.tmpfiles[k], k))\n                a = a.replace(k, self.tmpfiles[k])\n            elif aty == AT_TEMPORARY_INPUT:\n                km = placeholder_re.search(a)\n                assert km is not None\n                k = km.group(1)\n                a = a.replace(k, inherit_tmpfiles[k])\n\n            cmdline.append(a)\n            \n        env = self.env\n        for ov in self.overlays:\n            env, cmdline = ov.overlay(self, env, cmdline, inherit_tmpfiles)\n\n        self.env = env\n        self.cmd_line = cmdline\n        self.cmd_line_c = \" \".join(self.cmd_line) # command line string\n\n        log.info(\"Running %s\" % (str(self)))\n\n        run_env = os.environ.copy() # do this at init time instead of runtime?\n        run_env.update(self.env)\n\n        self.retval, self.stdout, self.stderr = run_command(self.cmd_line,\n                                                   env=run_env,\n                                                   popen_args = self.popen_args)\n        self.run_ok = self.retval == 0\n\n        return self.run_ok\n\n    def get_tmp_files(self, names):\n        out = []\n        for n in names:\n            if n[0] == \"@\":\n                out.append(self.tmpfiles[n])\n            else:\n                out.append(n)\n\n        return out\n\n    def cleanup(self):\n        \"\"\"Cleanup the temporary files created by the run object.\"\"\"\n        for a, f in self.tmpfiles.iteritems():\n            os.unlink(f)\n\n    def __str__(self):\n        ev = [\"%s=%s\" % (k, v) for k, v in self.env.iteritems()]\n        return \"%s %s\" % (\" \".join(ev), self.cmd_line_c)\n\n\nclass BasicRunSpec(object):\n    \"\"\"Class containing the specifications for running a binary.\"\"\"\n    def __init__(self):\n        self.binary = None\n   
     self.args = []\n        self.env = {}\n        self.runs = []\n        self.in_path = False\n        self.overlays = []\n        self._runids = set()\n        self.rlimit = None\n        self.tmpdir = None\n        self.vars = {}\n\n        self.errors = set()\n\n    def set_tmpdir(self, tmpdir):\n        \"\"\"Set the temporary directory.\"\"\"\n        self.tmpdir = tmpdir\n\n    def add_overlay(self, overlay):\n        \"\"\"Add an overlay to the list of overlays.\"\"\"\n        self.overlays.append(overlay)\n\n    def get_id(self):\n        \"\"\"Return an id to this run spec.\"\"\"\n        return \"%s/%s\" % (self.bid, self.input_name)\n    \n    def set_binary(self, cwd, binary, in_path = False):\n        \"\"\"Set the binary to run with this run spec.\n\n        Keyword Arguments:\n\n        cwd -- current working directory \n        binary -- binary name\n        in_path -- specifies if the binary can be found in the current path \n        env variable\n        \"\"\"\n        self.cwd = cwd # TODO: does this do anything?\n        self.binary = os.path.join(cwd, binary)\n        self.in_path = in_path\n\n    def has_env_var(self, var):\n        \"\"\"Check if a particular environment variable is current known\n        by this run spec.\n        \"\"\"\n        return var in self.env\n\n    def set_env_var(self, var, value, replace = True):\n        \"\"\"Sets an environment variable.\"\"\"\n        if var in self.env and not replace:\n            raise IndexError\n\n        self.env[var] = value\n\n    def set_arg(self, arg, arg_type = AT_OPAQUE):\n        \"\"\"Set an argument to use when running the spec.\"\"\"\n        self.args.append((arg, arg_type))\n\n    def get_input_files(self):\n        \"\"\"Search through set arguments in the specification looking\n        for input file arguments, and return found input files.\n        \"\"\"\n        out = []\n        for a, aty in self.args:\n            if aty in (AT_INPUT_FILE, 
AT_INPUT_FILE_IMPLICIT):\n                out.append(a)\n\n        return out\n\n    def check(self):\n        \"\"\"Make sure the binary specified by this object as well as its input\n        files exist.\"\"\"\n        if not self.binary:\n            log.error(\"No binary specified [bin %s]\" % (self.bid,))\n            return False\n\n        # make sure binary exists\n        if not self.in_path and not os.path.exists(self.binary):\n            log.error(\"Binary %s not found [bin %s]\" % (self.binary, self.bid))\n            self.errors.add('missing-binary')\n            return False\n            \n        if not self.in_path and not os.path.isfile(self.binary):\n            log.error(\"Binary %s is not a file [bin %s]\" % (self.binary, self.bid))\n            return False\n            \n        for a in self.get_input_files():\n            if not os.path.exists(a):\n                log.error(\"Input file '%s' does not exist [bin %s]\" % (a, self.bid))\n                self.errors.add('missing-input')\n                return False\n\n            # TODO: add AT_DIR ...\n            if not os.path.isfile(a):\n                log.error(\"Input file '%s' is not a file [bin %s]\" % (a, self.bid))\n                return False\n\n        return True\n\n    def run(self, runid, **kwargs):\n        \"\"\"Run the command specified by this spec.\"\"\"\n        assert runid not in self._runids, \"Duplicate runid %s\" % (runid,)\n\n        assert len(self.errors) == 0\n\n        x = Run(self.env, self.binary, self.args, self)\n        if self.rlimit and os.name != \"nt\":\n            x.set_popen_args('preexec_fn', self.rlimit.set)\n        if os.name == \"nt\":\n            log.info(\"Warning: rlimit not supported on Windows OS\")\n\n        x.set_overlays(self.overlays)\n        x.set_tmpdir(self.tmpdir)\n        x.runid = runid\n        self._runids.add(runid)\n        x.run(**kwargs)\n        self.runs.append(x)\n        return x\n\n    def set_rlimit(self, 
rlimit):\n        self.rlimit = rlimit\n\n    def __str__(self):\n        ev = [\"%s=%s\" % (k, v) for k, v in self.env.iteritems()]\n        args = [\"%s\" % (a) for a, b in self.args]\n        return \"%s %s %s\" % (\" \".join(ev), self.binary, \" \".join(args))\n        \nclass RunSpec(BasicRunSpec):\n    \"\"\"Extended runspec that holds extra things (an id, bmk binarys/inputs, checkers,\n    among these additional things.\n    \"\"\"\n    def __init__(self, bmk_binary, bmk_input):\n        super(RunSpec, self).__init__()\n\n        self.bmk_binary = bmk_binary\n        self.bmk_input = bmk_input\n        \n        self.bid = self.bmk_binary.get_id()\n        self.input_name = bmk_input.get_id()\n        self.checker = None\n        self.perf = None\n\n    def set_checker(self, checker):\n        self.checker = checker\n\n    def set_perf(self, perf):\n        self.perf = perf\n\n    def check(self):\n        if not super(RunSpec, self).check():\n            return False\n\n        if not self.checker:\n            log.error(\"No checker specified for input %s [bin %s] \" % (self.input_name, self.bid))\n            return False\n\n        if not self.perf:\n            log.error(\"No perf specified for input %s [bin %s] \" % (self.input_name, self.bid))\n            return False\n\n        for a in self.checker.get_input_files():\n            if not os.path.exists(a):\n                log.error(\"Checker input file '%s' does not exist [bin %s]\" % (a, self.bid))\n                return False\n\n            # TODO: add AT_DIR ...\n            if not os.path.isfile(a):\n                log.error(\"Checker input file '%s' is not a file [bin %s]\" % (a, self.bid))\n                return False\n\n        return True\n\n#class DistRunSpec(RunSpec):\n#    #TODO\n"
  },
  {
    "path": "scripts/experimental/bmk2/extras.py",
    "content": "#\n# extras.py\n#\n# Utility functions for bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport os\nimport fnmatch\n\ndef read_line_terminated_cfg(configs):\n    out = []\n    for f in configs:\n        fl = [s.strip() for s in open(f, \"r\")]\n        fl = [l for l in fl if (l and l[0] != \"#\")]\n        out += fl\n\n    return out\n\n# Tyler: Not sure if this is the best place for the blacklist\ndef scan(path, glob, black_list = []):\n    out = []\n    for root, dirnames, filenames in os.walk(path):\n        matches = fnmatch.filter(filenames, glob)\n        out += [os.path.join(root, m) for m in matches]\n\n    out = [o for o in out if len([x for x in black_list if x + os.sep in o]) == 0]\n    return out\n\ndef summarize(log, rspecs):\n    bins = set([rs.bmk_binary.get_id() for rs in rspecs])\n    inputs = set([rs.bmk_input.get_id() for rs in rspecs])\n\n    runs = 0\n    failed_runs = 0\n    failed_checks = 0\n\n    for rs in rspecs:\n        runs += len(rs.runs)\n        failed_runs += len(filter(lambda x: not x.run_ok, rs.runs))\n        failed_checks += len(filter(lambda x: not x.check_ok, rs.runs))\n\n    log.info('Summary: Runspecs: %s Binaries: %d Inputs: %d  Total runs: %d Failed: %d Failed Checks: %d' % (len(rspecs), len(bins), len(inputs), runs, failed_runs, failed_checks))\n\ndef standard_loader(metadir, inpproc, binspec, scandir, bispec, binputs = \"\", \n                    ignore_missing_binaries = False, bingroup = \"BINARIES\", \n                    bin_configs = None, extended_scan = False, black_list = [], \n                    varconfigs = None):\n    import bmk2\n    import config\n    import sys\n\n    if scandir:\n        basepath = os.path.abspath(scandir)\n        binspecs = scan(scandir, \"bmktest2.py\", black_list)\n        if extended_scan:\n            binspecs.extend(scan(scandir, 
\"bmktest2-*.py\", black_list))\n    else:\n        if not os.path.exists(binspec):\n            print >>sys.stderr, \"Unable to find %s\" % (binspec,)\n            return False\n\n        basepath = os.path.abspath(\".\")\n        binspecs = [binspec]\n\n    l = bmk2.Loader(metadir, inpproc)\n\n    ftf = {}\n    if bispec:\n        f = None\n        if os.path.exists(bispec) and os.path.isfile(bispec):\n            f = bispec\n        else:\n            f = l.config.get_var(\"bispec_\" + bispec, None)\n            f = os.path.join(metadir, f)\n\n        assert f is not None, \"Unable to find file or spec in config file for bispec '%s'\" % (bispec,)\n        ftf[config.FT_BISPEC] = f\n\n    if not l.initialize(ftf): return False\n    sel_inputs, sel_binaries = l.split_binputs(binputs)\n\n    print >>sys.stderr, \"sel_inputs set to '%s', sel_binaries set to '%s'\" % (sel_inputs, sel_binaries)\n\n    if bin_configs is not None and len(bin_configs) > 0:\n        if not l.config.load_bin_config(bin_configs):\n            print >>sys.stderr, \"Unable to load binary configurations '%s'\" % (bin_configs,)\n            return False\n\n    if varconfigs is not None:\n        if not l.config.load_var_config(varconfigs):\n            print >>sys.stderr, \"Unable to load variable configurations '%s'\" % (varconfigs,)\n            return False\n\n    sys.path.append(metadir)\n    if not l.load_multiple_binaries(binspecs, sel_binaries, bingroup) and not ignore_missing_binaries: return False\n    if not l.apply_config(): return False\n    if not l.associate_inputs(sel_inputs): return False\n\n    return (basepath, binspecs, l)\n\nif __name__ == '__main__':\n    import sys\n    print scan(sys.argv[1], \"bmktest2.py\")\n"
  },
  {
    "path": "scripts/experimental/bmk2/inputdb.py",
    "content": "#!/usr/bin/env python\n#\n# inputdb.py\n#\n# Manages input db files for bmk2 (*.inputdb)\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport sys\nimport os\nimport ConfigParser\nimport argparse\nimport common\nimport fnmatch\nimport inputprops\nfrom core import Input\nfrom opdb import ObjectPropsCFG\n\nclass InputDBcfg(ObjectPropsCFG):\n    \"\"\"Parser of inputdb files.\"\"\"\n    def __init__(self, filename, inpproc = None):\n        super(InputDBcfg, self).__init__(filename, \"bmktest2\", [\"2\"])\n        self.inpproc = inpproc\n\n        self.unserialize_input = None\n        self.serialize_input = None\n\n        # save input processor functions if they exist\n        if self.inpproc:\n            inpproc = common.load_py_module(self.inpproc)\n            if 'unserialize_input' in inpproc:\n                self.unserialize_input = inpproc['unserialize_input']\n\n            if 'serialize_input' in inpproc:\n                self.serialize_input = inpproc['serialize_input']                \n\n    def init(self, basepath):\n        self.meta = dict([('version', \"2\"), ('basepath', basepath)])\n\n    def post_load(self):\n        \"\"\"Append the basepath to file paths of all \"file\" variables\n        loaded. 
(basepath specified in meta section)\n        \"\"\"\n        basepath = os.path.expanduser(self.meta['basepath'])\n\n        if not (os.path.exists(basepath)):\n            print >>sys.stderr, \"Basepath '%s' ('%s') does not exist\" % (basepath, self.meta['basepath'])\n            return False\n\n        for s in self.objects:\n            e = self.objects[s]\n            if self.unserialize_input:\n                e = self.unserialize_input(e, basepath)\n\n            e['file'] = os.path.join(basepath, e['file'])\n\n        return True\n        #basepath = self.meta['basepath']\n\n        #for s in self.objects:\n        #    e = self.objects[s]\n        #    if self.unserialize_input:\n        #        e = self.unserialize_input(e, basepath)\n\n        #    e['file'] = os.path.join(basepath, e['file'])            \n\n        #return True\n\n    def unparse_section(self, section):\n        if self.serialize_input:\n            self.serialize_input(section)\n            \n        # replaces the \"file\" variable in a section with a RELATIVE path\n        #if 'file' in section:\n        #    section['file'] = os.path.relpath(section['file'], self.meta['basepath'])\n        if 'file' in section:\n            basepath = os.path.expanduser(self.meta['basepath'])\n            section['file'] = os.path.relpath(section['file'], basepath)\n\n        return section\n\nclass InputDB(object):\n    \"\"\"Database of inputs specified by an inputDB.\"\"\"\n    def __init__(self, cfgfile, inpproc = None, inputprops = None):\n        self.inpproc = inpproc\n        self.inputprops = inputprops\n        self.cfg = InputDBcfg(cfgfile, self.inpproc)\n\n    def get_alt_format(self, name, fmt):\n        \"\"\"Get the Input object associated with some input that has a \n        particular format.\n        \"\"\"\n        if name in self.n2i:\n            for x in self.n2i[name]:\n                if x.props.format == fmt:\n                    return x\n\n    def get_all_alt(self, 
name):\n        \"\"\"Get the Input object(s) associated with some input.\"\"\"\n        if name in self.n2i:\n            return self.n2i[name]\n\n    def load(self):\n        if not self.cfg.load():\n            print >>sys.stderr, \"Unable to load InputDB configuration!\"\n            return False\n\n        if self.inputprops is not None:\n            # not .props as Properties!\n            self.props = inputprops.InputPropsCfg(self.inputprops, self)\n            if not self.props.load():\n                print >>sys.stderr, \"Unable to load InputProps\"\n                return False\n\n            # add any new properties specified by inputprops into the\n            # correct inputs\n            inputprops.apply_props(self.cfg.objects.itervalues(), self.props)\n\n        # get all inputs specified by inputdb and wrap into an Input\n        # object\n        self.inputdb = [Input(i, self) for i in self.cfg]\n        self.inpnames = set([i.get_id() for i in self.inputdb])\n\n        # setup the name -> Input map; note an input can have multiple\n        # formats...\n        self.n2i = dict([(n, list()) for n in self.inpnames])\n        for i in self.inputdb:\n            self.n2i[i.get_id()].append(i)\n        \n        return True\n            \n    def __iter__(self):\n        return iter(self.inputdb)\n       \nif __name__ == \"__main__\":\n    p = argparse.ArgumentParser(description=\"Prepare an inputs database\")\n    p.add_argument(\"--glob\", help=\"Glob\")\n    p.add_argument(\"--update\", action=\"store_true\", help=\"Update dbfile\")\n    p.add_argument(\"inpproc\", help=\"Input processor (python module)\")\n    p.add_argument(\"dbfile\", help=\"Output database file\")\n    p.add_argument(\"basepath\", nargs=\"?\", help=\"Scan this path for inputs\", default=\".\")\n    \n    args = p.parse_args()\n    inpproc = common.load_py_module(args.inpproc)\n\n    if args.update:\n        idb = InputDB(args.dbfile, args.inpproc)\n        idb.load()\n        
basepath = os.path.expanduser(idb.cfg.meta['basepath'])\n        print >>sys.stderr, \"using basepath from file: %s\" % (basepath,)\n    else:\n        idb = InputDB(args.dbfile, args.inpproc)\n        basepath = args.basepath\n        idb.cfg.init(basepath)\n\n    describe_input = inpproc['describe_input']\n\n    out = []\n    for root, dirnames, filenames in os.walk(basepath, followlinks=True):\n        rp = os.path.relpath(root, basepath)\n        \n        if args.glob:\n            filenames = fnmatch.filter(filenames, args.glob)       \n\n        for f in filenames:\n            if f[0] == \".\":\n                continue\n\n            x = describe_input(root, f, rp)\n            if x:\n                x['file'] = os.path.join(rp, f)\n                if x['file'] not in idb.cfg.objects:\n                    print >>sys.stderr, x['file']\n                    idb.cfg.objects[x['file']] = x\n                    x['file'] = os.path.join(basepath, x['file'])\n                    \n    if args.update:\n        idb.cfg.save(args.dbfile)\n    else:\n        idb.cfg.save(args.dbfile)\n\n"
  },
  {
    "path": "scripts/experimental/bmk2/inputprops.py",
    "content": "#!/usr/bin/env python\n#\n# inputprops.py\n#\n# Manages an input properties file (*.inputprops)\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport sys\nimport inputdb\nimport argparse\nimport ConfigParser\nimport os\nfrom opdb import ObjectPropsCFG\n\nclass InputPropsCfg(ObjectPropsCFG):\n    \"\"\"Parser of inputprops files that specify additional properties\n    for certain inputs.\n    \"\"\"\n    def __init__(self, filename, inputdb):\n        super(InputPropsCfg, self).__init__(filename, \"bmktest2-props\", [\"2\"])\n        self.inputdb = inputdb\n        self.path_items = set()\n\n    def init(self):\n        self.meta = {}\n        self.meta['version'] = \"2\"\n\n    def post_load(self):\n        \"\"\"Look for \"path_items\" as a var name in the sections; if it exists,\n        then prepend the base path (specified in meta of inputdatabase) to the \n        path.\n        \"\"\"\n        path_items = self.meta.get(\"paths\", \"\")\n        self.path_items = set([xx.strip() for xx in path_items.split(\",\")])\n\n        basepath = os.path.expanduser(self.inputdb.cfg.meta['basepath'])\n\n        for e in self.objects.itervalues():\n            for pi in self.path_items:\n                if pi in e:\n                    e[pi] = os.path.join(basepath, e[pi])\n\n        return True\n\n    def unparse_section(self, section):\n        \"\"\"Replace anything in path_items with relative paths.\"\"\"\n        bp = os.path.expanduser(self.inputdb.cfg.meta['basepath'])\n\n        for pi in self.path_items:\n            if pi in section:\n                section[pi] = os.path.relpath(section[pi], bp)\n\n        return section\n\ndef apply_props(inputdb, props):\n    \"\"\"Save the additional properties specified by an inputprops file\n    into the input database.\n    \"\"\"\n    for e in inputdb:\n        if e['name'] in 
props.objects:\n            e.update(props.objects[e['name']])\n\n    return True\n\nif __name__ == \"__main__\":\n    p = argparse.ArgumentParser(\"Create/Update an input properties file\")\n    p.add_argument(\"inputdb\", help=\"Inputdb file\")\n    p.add_argument(\"inputprops\", help=\"Inputprops file\")\n\n    args = p.parse_args()\n\n    idb = inputdb.InputDB(args.inputdb)\n    ip = InputPropsCfg(args.inputprops, idb)\n\n    \n    if not idb.load():\n        print >>sys.stderr, \"Failed to load inputdb\"\n        sys.exit(1)\n\n\n    if os.path.exists(args.inputprops):\n        if not ip.load():\n            print >>sys.stderr, \"Failed to load props\"\n            sys.exit(1)\n    else:\n        ip.init()\n\n    for e in idb:\n        nm = e.name\n\n        if nm not in ip.objects:\n            ip.objects[nm] = {'name':  nm}\n\n    ip.save(args.inputprops)\n"
  },
  {
    "path": "scripts/experimental/bmk2/logproc.py",
    "content": "#!/usr/bin/env python\n#\n# logproc.py\n#\n# Log file reader library for bmk2. Reads log files generated by\n# test2.py, for example.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport re\nimport datetime\nfrom collections import namedtuple\n\nsdate = namedtuple(\"sdate\", ['type', 'start_date', 'raw'])\nedate = namedtuple(\"edate\", ['type', 'end_date', 'raw'])\n\n# note: rsid and binid are one and the same\nrun_begin = namedtuple(\"run_begin\", ['type', 'begin', 'raw'])\nrun_end = namedtuple(\"run_end\", ['type', 'end', 'raw'])\ncollect_entry = namedtuple(\"collect_entry\", ['type', 'rsid', 'runid', 'filetype', 'file', 'raw'])\nperf_info = namedtuple('perf_info', ['type', 'binid', 'xid', 'run', 'time_ns', 'cmdline', 'raw'])\ntc_info = namedtuple('tc_info', ['type', 'rsid', 'task', 'task_args', 'raw'])\nmissing_info = namedtuple('missing_info', ['type', 'binid', 'raw'])\ninstr = namedtuple('instr', ['type', 'name', 'args', 'raw'])\nfail_info = namedtuple('fail_info', ['type', 'binid', 'runid', 'message', 'raw'])\npass_info = namedtuple('pass_info', ['type', 'binid', 'args', 'raw'])\ngeneric_log = namedtuple('generic_log', ['type', 'loglevel', 'raw'])\nunmatched = namedtuple('unmatched', ['type', 'raw'])\n\nst = re.compile(\"^START\")\ndt = re.compile(\"^INFO DATE (START|END)\")\ncollect_bp = re.compile(\"^COLLECT basepath (.*)$\")\ncollect = re.compile(\"^COLLECT (.*) (.*) (.*) (.*)$\")\npd_begin = re.compile(\"^INFO PERFDATE BEGIN_RUN\")\npd_end = re.compile(\"^INFO PERFDATE END_RUN\")\np = re.compile(\"^PERF \")\nmissing = re.compile(\"^FAIL MISSING PERF\")\nfail_general = re.compile(\"^FAIL ([^: ]+) ?([^: ]+)?: ?(.+)$\")\ntc_re = re.compile(\"^TASK_COMPLETE ([^ ]+) ([^ ]+)( (.*))?$\")\ninstr_re = re.compile(\"^INSTR ([^ ]+) (.*)$\")\npass_re = re.compile('^PASS ([^ :]+)(.+)$')\ngen_log_re = 
re.compile('^(INFO|DEBUG|ERROR)')\n\ndef parse_log_file(logfile, extended = False):\n    with open(logfile, \"r\") as f:\n        for l in f:\n            m = st.match(l)\n            if m:\n                assert False, l\n                print m.group(0)\n                continue\n\n            m = dt.match(l)\n            if m:\n                # may yield multiple dates if log files are\n                # concatenate\n\n                if m.group(1) == \"START\":\n                    yield sdate(\"START_DATE\", l.strip().split(\" \", 3)[-1], raw=l)\n                else:\n                    yield edate(\"END_DATE\", l.strip().split(\" \", 3)[-1], raw=l)\n                continue\n\n            m = pd_begin.match(l)\n            if m:\n                yield run_begin(\"RUN_BEGIN\", l.strip().split(\" \", 3)[-1], raw=l)\n                continue\n\n            m = pd_end.match(l)\n            if m:\n                yield run_end(\"RUN_END\", l.strip().split(\" \", 3)[-1], raw=l)\n                continue\n\n            m = collect_bp.match(l)\n            if m:\n                fi = m.group(1)\n\n                yield collect_entry(\"COLLECT\", rsid=\"\", runid=\"\", filetype=\"basepath\", file=fi, raw=l)\n                continue\n\n            m = collect.match(l)\n            if m:\n                rsid = m.group(1)\n                runid = m.group(2)\n                ty = m.group(3)\n                fi = m.group(4)\n\n                yield collect_entry(\"COLLECT\", rsid=rsid, runid=runid, \n                                    filetype=ty, file=fi, raw=l)\n                continue\n\n            m = p.match(l)\n            if m:\n                ls = l.strip().split(\" \", 5)\n                out = perf_info(\"PERF\", \n                                binid = ls[1], \n                                xid = ls[2],\n                                run = ls[3],\n                                time_ns = ls[4],\n                                cmdline = ls[5], 
raw=l)\n\n                yield out\n                continue\n\n            m = missing.match(l)\n            if m:\n                yield missing_info(\"MISSING\", binid = l.strip().split()[-1], raw=l)\n                continue\n\n            m = fail_general.match(l)\n            if m:                \n                yield fail_info(\"FAIL\", binid = m.group(1), runid=m.group(2), message=m.group(3), raw=l)\n                continue\n\n            m = tc_re.match(l)\n            if m:\n                rsid = m.group(1)\n                task = m.group(2).strip()\n                task_args = m.group(4)\n\n                yield tc_info(\"TASK_COMPLETE\", rsid, task, task_args, raw=l)\n                continue\n            \n            m = instr_re.match(l)\n            if m:\n                name = m.group(1)\n                args = m.group(2)\n\n                yield instr(\"INSTR\", name, args, raw=l)\n                continue\n\n            if extended:\n                m = pass_re.match(l)\n                if m:\n                    binid = m.group(1)\n                    args = m.group(2)\n\n                    yield pass_info(\"PASS\", binid = binid, args = args, raw=l)\n                    continue\n\n                m = gen_log_re.match(l)\n                if m:\n                    loglevel = m.group(1)\n\n                    yield generic_log(\"GENERIC_LOG\", loglevel, raw=l)\n                    continue\n\n                \n\n                yield unmatched(\"UNMATCHED\", raw=l)\n\nif __name__ == \"__main__\":\n    import sys\n    for r in parse_log_file(sys.argv[1]):\n        print r\n"
  },
  {
    "path": "scripts/experimental/bmk2/mapfile.py",
    "content": "from collections import namedtuple\nimport re\n\nrun_re = re.compile(r\"^([0-9]+\\.[0-9]+\\.[0-9]+)\\.([0-9]+)$\")\n\nmapfile_entry = namedtuple('mapfile_entry', ['binid', 'input', 'runid', 'filetype', 'filename', 'abspath'])\n\ndef split_bininpid(bininpid):\n    p = bininpid.rfind(\"/\")\n    return bininpid[:p], bininpid[p+1:]\n\ndef split_runid(runid):\n    m = run_re.match(runid)\n\n    if m:\n        return m.group(1), m.group(2)\n    else:\n        return runid, None\n\ndef get_run(runid):\n    m = run_re.match(runid)\n    if m:\n        return int(m.group(2))\n    else:\n        return None\n\ndef read_mapfile(mapfile):\n    with open(mapfile, \"r\") as f:\n        for l in f:\n            # binid/input runid filetype filename abspath\n            ls = l.strip().split(\" \", 4)\n            binid, input = split_bininpid(ls[0])\n\n            if len(ls) != 5:\n                print \"ERROR: malformed mapfile entry\", ls\n\n            yield mapfile_entry(binid = binid, input = input, runid=ls[1], filetype=ls[2], filename=ls[3], abspath=ls[4])\n\ndef write_mapfile(mapfile, mapentries, mode=\"w\"):\n    f = open(mapfile, mode)\n    \n    for me in mapentries:\n        assert me.input != \"\"\n        \n        print >>f, \"%s/%s %s %s %s %s\" % (me.binid, me.input, me.runid, me.filetype, me.filename, me.abspath)\n\n    f.close()\n        \ndef write_mapfile_raw(mapfile, mapentries, mode=\"w\"):\n    \"\"\"For use by non-binary/input aware tools, input must be empty and binid contains the whole ID\"\"\"\n\n    f = open(mapfile, mode)\n    \n    for me in mapentries:\n        assert me.input == \"\"\n        \n        print >>f, \"%s %s %s %s %s\" % (me.binid, me.runid, me.filetype, me.filename, me.abspath)\n\n    f.close()\n        \ndef mapfile2dict(mapfile, fltr = None):\n    out = {}\n    for e in read_mapfile(mapfile):\n        if fltr is not None and not fltr(e): continue\n\n        k1 = (e.binid, e.input)\n        if k1 not in out:\n        
    out[k1] = {}\n\n        if e.runid not in out[k1]:\n            out[k1][e.runid] = {}\n\n        if e.filetype not in out[k1][e.runid]:\n            out[k1][e.runid][e.filetype] = []\n\n        out[k1][e.runid][e.filetype].append(e)\n\n    return out\n"
  },
  {
    "path": "scripts/experimental/bmk2/measure_energy.py",
    "content": "#!/usr/bin/env python\n# measure_energy.py\n#\n# Measure energy on Intel platforms that support RAPL access through\n# the powercap interface.\n#\n# Part of bmk2\n#\n# Copyright (c) 2017 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n\nimport sys\nimport os\nimport subprocess\nimport glob\nimport threading\n\nINTERVAL_S = 15\n\ndef get_rapl_files():\n    dom = glob.glob(\"/sys/class/powercap/intel-rapl:*\")\n    \n    out = {}\n    for d in dom:\n        dd = os.path.basename(d)\n        f = os.path.join(d, \"energy_uj\")\n        if os.path.exists(f):\n            out[dd] = f\n\n        f = os.path.join(d, \"max_energy_range_uj\")\n        if os.path.exists(f):\n            out[\"max_\" + dd] = f\n\n\n    return out\n\ndef read_rapl_power(rapl_files):\n    out = {}\n    for k, f in rapl_files.items():\n        of = open(f, \"r\")\n        out[k] = int(of.read())\n        of.close()\n\n    return out\n\ndef periodic_power():\n    global TIMER\n    POWER.append(read_rapl_power(rf))\n    TIMER = threading.Timer(INTERVAL_S, periodic_power)\n    TIMER.start()\n\ndef count_wraparound(nums, key = None):\n    prev = None\n    wrap = 0\n\n    for n in nums:\n        if key:\n            n = key(n)\n\n        if prev is not None and prev > n:\n            wrap += 1\n\n        prev = n\n\n    return wrap\n\ndef calc_power(POWER):\n    k = [kk for kk in POWER[0].keys() if kk[:4] != 'max_']\n\n    out = {}\n    for kk in k:\n        wraps = count_wraparound(POWER, key = lambda x: x[kk])\n        \n        bef = POWER[0][kk]\n        aft = POWER[-1][kk] + wraps * POWER[-1][\"max_\" + kk]\n\n        out[kk] = aft - bef\n        out[kk+\":wraps\"] = wraps\n\n    return out\n\nif len(sys.argv) == 1:\n    print >>sys.stderr, \"Usage: %s cmd-line\\n\" % (sys.argv[0],)\n    exit(1)\n\ncmdline = sys.argv[1:]\n\nrf = get_rapl_files()\nif len(rf):\n    POWER = []\n    TIMER = threading.Timer(INTERVAL_S, periodic_power)\n    
POWER.append(read_rapl_power(rf))\n    TIMER.start()\n\n    proc = subprocess.Popen(cmdline)\n    proc.wait()\n\n    TIMER.cancel()\n    POWER.append(read_rapl_power(rf))\n\n    p = calc_power(POWER)\n    for k in p:\n        print \"INSTR\", k, p[k] # micro joules\n\n    sys.exit(proc.returncode)\nelse:\n    print >>sys.stderr, \"Did not find RAPL power counters (/sys/class/powercap/intel-rapl*)\"\n    sys.exit(1)\n"
  },
  {
    "path": "scripts/experimental/bmk2/opdb.py",
    "content": "#\n# opdb.py\n#\n# Object properties database for bmk2.  Sections in CFG files indicate\n# objects, section keys indicate properties.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport ConfigParser\nfrom collections import OrderedDict\nimport os\nimport glob\nimport sys\n\ndef cfg_get(fn, section, key, default=None):\n    \"\"\"Wrapper for ConfigParser functions that allows you to return a default\n    value on failure.\n\n    Keyword Arguments:\n    fn -- config parser function to use\n    section -- section of config file to read\n    key -- key to attempt to access from section\n    default -- default value to return on error\n    \"\"\"\n    try:\n        v = fn(section, key)\n        return v\n    except ConfigParser.NoOptionError:\n        return default\n\nclass ObjectProps(object):\n    pass\n\nclass ObjectPropsCFG(ObjectProps):\n    \"\"\"Read/Write a .cfg file as a object property file.\n    \n       Sections names indicate objects, section keys indicate properties.\"\"\"\n    def __init__(self, filename, fmt, acceptable_versions):\n        \"\"\"Initializer.\n\n        Keyword Arguments:\n        filename -- config file to read\n        fmt -- format of the config file (should be section name in config\n        file that has meta information); \n        acceptable_versions -- versions of the config file that are allowed\n        \"\"\"\n        self.filename = filename\n        self.fmt = fmt\n        self.acceptable_versions = acceptable_versions\n        self.meta = None\n        self.objects = OrderedDict() # contains section -> vars in section\n        self.site = None\n\n    def _site_specific_cfg(self, x):\n        d = os.path.dirname(self.filename)\n        sitefiles = glob.glob(os.path.join(d, \"SITE-IS.*\"))\n\n        if len(sitefiles) > 1:\n            print >>sys.stderr, (\"Only one sitefile should exist. 
Currently, multiple sitefiles exist: '%s'\" % (sitefiles,))\n        elif len(sitefiles) == 0:\n            print >>sys.stderr, (\"No sitefile found.\")\n        else:\n            p = sitefiles[0].rindex(\".\")\n            self.site = sitefiles[0][p+1:]\n            print >>sys.stderr, (\"Site set to '%s'.\" % (self.site,))\n            sscfg = self.filename + \".\" + self.site\n\n            if not os.path.exists(sscfg):\n                print >>sys.stderr, (\"Site-specific input db '%s' not found.\" % (sscfg,))\n            else:\n                print >>sys.stderr, (\"Loading site-specific '%s'.\" % (sscfg,))\n\n                y = ConfigParser.SafeConfigParser()\n\n                with open(sscfg, \"rb\") as f:\n                    y.readfp(f)\n\n                    v = cfg_get(y.get, self.fmt, \"version\")\n\n                    self.version = v\n\n                    if not self.check_version(v):\n                        av = [str(v) for v in self.acceptable_versions]\n                        if v:\n                            print >>sys.stderr, \"Unknown version: %s (acceptable: %s)\" % (v, \", \".join(av))\n                        else:\n                            print >>sys.stderr, \"Unable to determine version (acceptable: %s)\" % (\", \".join(av))\n\n                    for s in (\"bmktest2\", ):\n                        for n, v in y.items(s):\n                            if not x.has_section(s):\n                                x.add_section(s)\n                                \n                            print >>sys.stderr, (\"Setting site-specific [%s]:%s to '%s'\" % (s, n, v))\n                            x.set(s, n, v)                \n\n                return True\n\n        return False\n\n    def check_version(self, version):\n        \"\"\"Check if a version is allowed by this reader.\"\"\"\n        return version in self.acceptable_versions\n\n    def update_props(self, props):\n        return props\n\n    def parse_section(self, cfg, 
section): \n        \"\"\"Given a dictionary that represents the parse of some section of\n        a config file, return them as an ordered dictionary.\n\n        Keyword Arguments:\n        cfg -- dictionary that contains parse results\n        section -- section to retrieve\n        \"\"\"\n        d = OrderedDict(cfg.items(section))\n        d = self.update_props(d)\n        return d\n\n    def unparse_section(self, section):\n        return section\n    \n    def post_load(self):\n        return True\n\n    def load(self):\n        \"\"\"Load the configuration file and parse its sections.\"\"\"\n        x = ConfigParser.SafeConfigParser()\n\n        out = OrderedDict()\n        with open(self.filename, \"rb\") as f:\n            x.readfp(f)\n\n            v = cfg_get(x.get, self.fmt, \"version\")\n\n            self.version = v\n\n            if not self.check_version(v):\n                av = [str(v) for v in self.acceptable_versions]\n                if v:\n                    print >>sys.stderr, \"Unknown version: %s (acceptable: %s)\" % (v, \", \".join(av))\n                else:\n                    print >>sys.stderr, \"Unable to determine version (acceptable: %s)\" % (\", \".join(av))\n                \n            self._site_specific_cfg(x)\n\n            # save vars in a section to dictionary\n            for s in x.sections():\n                if s == self.fmt: \n                    self.meta = self.parse_section(x, s)\n                else:\n                    if s in out:\n                        print >>sys.stderr, \"Warning: Duplicate section '%s', overwriting\" % (s,)\n\n                    out[s] = self.parse_section(x, s)\n\n            self.objects = out\n            return self.post_load()\n\n        return False\n\n    def save(self, fn = None):\n        \"\"\"Save the parsed configuration file back into another file.\"\"\"\n        def write_items(cfg, section, items):\n            for k, v in items.iteritems():\n                
cfg.set(section, k, v)\n\n        x = ConfigParser.SafeConfigParser()\n        \n        assert self.filename or fn, \"Both filename and fn cannot be empty.\"\n        if not fn: fn = self.filename\n\n        x.add_section(self.fmt)\n        write_items(x, self.fmt, self.unparse_section(self.meta))\n\n        for s in self.objects:\n            x.add_section(s)\n            write_items(x, s, self.unparse_section(self.objects[s]))\n\n        with open(fn, \"wb\") as f:\n            x.write(f)\n        \n    def __iter__(self):\n        return iter(self.objects.itervalues())\n"
  },
  {
    "path": "scripts/experimental/bmk2/overlays.py",
    "content": "import core\nimport logging\n\nlog = logging.getLogger(__name__)\n\nclass Overlay(object):\n    \"\"\"Class that holds params/env vars to \"overlay\" into the command line options\n    of some run.\"\"\"\n    def __init__(self, env = {}, binary = None, args = []):\n        self.env = env\n        self.binary = binary\n        self.args = args\n        self.tmpfiles = {}\n\n    def overlay(self, run, env, cmdline, inherit_tmpfiles = None, logfiles = None):\n        \"\"\"Overlay arguments/env vars into a command line (i.e. add to it).\"\"\"\n        if env is None:\n            new_env = None\n        else:\n            new_env = env.copy()\n            new_env.update(self.env)\n\n        new_cmdline = []\n        if self.binary:\n            new_cmdline.append(self.binary)\n\n        for a, aty in self.args:\n            if aty == core.AT_INPUT_FILE_IMPLICIT:\n                continue\n\n            if aty == core.AT_TEMPORARY_OUTPUT:\n                th, self.tmpfiles[a] = tempfile.mkstemp(prefix=\"test-ov-\")\n                os.close(th)\n                log.debug(\"Created temporary file '%s' for overlay parameter '%s'\" % \n                          (self.tmpfiles[a], a))\n                a = self.tmpfiles[a]\n            elif aty == core.AT_TEMPORARY_INPUT:\n                a = inherit_tmpfiles[a]\n            elif aty == core.AT_LOG:\n                a = logfiles[a]\n\n            new_cmdline.append(a)\n\n        new_cmdline += cmdline\n\n        return new_env, new_cmdline\n\n    def cleanup(self):\n        \"\"\"Cleanup temp files created.\"\"\"\n        for a, f in self.tmpfiles.iteritems():\n            os.unlink(f)\n\n    def __str__(self):\n        ev = [\"%s=%s\" % (k, v) for k, v in self.env.iteritems()]\n        return \"%s %s\" % (\" \".join(ev), self.cmd_line_c)\n\nclass CUDAProfilerOverlay(Overlay):\n    def __init__(self, profile_cfg = None, profile_log = None):\n        env = {'CUDA_PROFILE': '1'}\n        if profile_cfg: 
env['CUDA_PROFILE_CONFIG'] = profile_cfg\n        if profile_log: env['CUDA_PROFILE_LOG'] = profile_log\n\n        self.profile_log = profile_log\n        self.collect = logging.getLevelName('COLLECT')\n        super(CUDAProfilerOverlay, self).__init__(env)\n\n    def overlay(self, run, env, cmdline, inherit_tmpfiles = None):\n        if self.profile_log is not None:\n            self.env['CUDA_PROFILE_LOG'] = core.create_log(self.profile_log, run)\n\n        if self.profile_log:\n            log.log(self.collect, '{rsid} {runid} cuda/profiler {logfile}'.format(rsid=run.rspec.get_id(), runid=run.runid, logfile=self.env['CUDA_PROFILE_LOG']))\n        else:\n            log.log(self.collect, '{rsid} {runid} cuda/profiler cuda_profile_0.log'.format(rsid=run.rspec.get_id(), runid=run.runid))\n        \n        return super(CUDAProfilerOverlay, self).overlay(run, env, cmdline, inherit_tmpfiles)\n\nclass NVProfOverlay(Overlay):\n    def __init__(self, profile_cfg = None, profile_log = None, profile_db = False, profile_analysis = False, system_profiling = False):\n        args = [(x, core.AT_OPAQUE) for x in profile_cfg.strip().split()]\n\n        if profile_db or profile_analysis:\n            args += [('-o', core.AT_OPAQUE), ('@nvprofile', core.AT_LOG)]\n            if profile_analysis:\n                args += [('--analysis-metrics', core.AT_OPAQUE)]\n        else:\n            args += [(x, core.AT_OPAQUE) for x in \"--csv --print-gpu-trace\".split()]\n            args += [('--log-file', core.AT_OPAQUE), ('@nvprofile', core.AT_LOG)]\n\n        if system_profiling:\n            args += [('--system-profiling', core.AT_OPAQUE), ('on', core.AT_OPAQUE)]\n\n        self.profile_cfg = profile_cfg\n        self.profile_log = profile_log\n        self.profile_db = profile_db or profile_analysis\n\n        self.collect = logging.getLevelName('COLLECT')\n        super(NVProfOverlay, self).__init__(binary=\"nvprof\", args=args)\n\n    def overlay(self, run, env, cmdline, 
inherit_tmpfiles = None):\n        if self.profile_log is not None:\n            logfile = core.create_log(self.profile_log, run)\n        else:\n            if self.profile_db:\n                logfile = 'cuda_profile_0.nvprof'\n            else:\n                logfile = 'cuda_profile_0.log'\n\n        log.log(self.collect, '{rsid} {runid} cuda/nvprof {logfile}'.format(rsid=run.rspec.get_id(), runid=run.runid, logfile=logfile))\n        \n        return super(NVProfOverlay, self).overlay(run, env, cmdline, inherit_tmpfiles, {'@nvprofile': logfile})\n\nclass TmpDirOverlay(Overlay):\n    def __init__(self, tmpdir):\n        env = {'TMPDIR': tmpdir}\n        super(TmpDirOverlay, self).__init__(env)\n\n    def overlay(self, run, env, cmdline, inherit_tmpfiles = None):\n        return super(TmpDirOverlay, self).overlay(run, env, cmdline, inherit_tmpfiles)\n\nclass CLDeviceOverlay(Overlay):\n    def __init__(self, cmdline_template, cl_platform, cl_device):\n        super(CLDeviceOverlay, self).__init__({})\n        self.cmdline_template = cmdline_template\n        self.cl_platform = cl_platform\n        self.cl_device = cl_device\n        self.cmdline = cmdline_template.format(platform = cl_platform, device = cl_device).split(\" \")\n\n    def overlay(self, run, env, cmdline, inherit_tmpfiles = None):\n        return super(CLDeviceOverlay, self).overlay(run, env, cmdline + self.cmdline, inherit_tmpfiles, {})\n\nclass Bmk2RTEnvOverlay(Overlay):\n    def overlay(self, run, env, cmdline, inherit_tmpfiles = None):\n        self.env['BMK2'] = \"1\"\n        if isinstance(run.rspec, core.RunSpec):\n            self.env['BMK2_BINID'] = run.rspec.bid\n            self.env['BMK2_INPUTID'] = run.rspec.input_name\n\n        if run.runid is not None:\n            self.env['BMK2_RUNID'] = run.runid\n\n        return super(Bmk2RTEnvOverlay, self).overlay(run, env, cmdline, inherit_tmpfiles)\n\n\n_instr_overlay_file = {}\n\nclass GGCInstrOverlay(Overlay):\n    @staticmethod\n    def 
read_map_file(mapfile):\n        out = {}\n        f = open(mapfile, \"r\")\n        for l in f:\n            ls = l.strip().split(' ', 4)\n            bmkinput, uniqid, ty, fn, p = ls\n\n            if ty == \"ggc/kstate\" and bmkinput not in out:\n                out[bmkinput] = (uniqid, os.path.dirname(p))\n\n        f.close()\n\n        return out\n\n    def __init__(self, mapfile):        \n        if mapfile not in _instr_overlay_file:\n            _instr_overlay_file[mapfile] = GGCInstrOverlay.read_map_file(mapfile)\n            \n        self.mapfile = _instr_overlay_file[mapfile]\n        super(GGCInstrOverlay, self).__init__()\n    \n    def overlay(self, run, env, cmdline, inherit_tmpfiles = None):\n        if run.rspec.get_id() in self.mapfile:\n            uid, dirname = self.mapfile[run.rspec.get_id()]\n\n            self.env['INSTR_UNIQID'] = uid\n            self.env['INSTR_TRACE_DIR'] = dirname + \"/\"\n\n        return super(GGCInstrOverlay, self).overlay(run, env, cmdline, inherit_tmpfiles)\n\n\nclass MeasureEnergyOverlay(Overlay):\n    def __init__(self):\n        super(MeasureEnergyOverlay, self).__init__(binary=os.path.join(os.path.dirname(__file__), \"measure_energy.py\"))\n\n    def overlay(self, run, env, cmdline, inherit_tmpfiles = None):\n        return super(MeasureEnergyOverlay, self).overlay(run, env, cmdline, inherit_tmpfiles)\n\n\ndef add_overlay(rspecs, overlay, *args, **kwargs):\n    \"\"\"Add an overlay to a series of rspecifications.\"\"\"\n    for r in rspecs:\n        r.add_overlay(overlay(*args, **kwargs))\n"
  },
  {
    "path": "scripts/experimental/bmk2/perf.py",
    "content": "#\n# perf.py\n#\n# Performance number extractor for bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport re\nimport logging\nlog = logging.getLogger(__name__)\n\nMULTIPLIERS = {'s': int(1E9), 'ms': int(1E6), 'us': int(1E3), 'ns': 1}\n\ndef split_decimal_str(n):\n    p = n.find(\".\")\n    if p != -1:\n        whole = int(n[:p])\n        frac = int(n[p+1:])\n    else:\n        whole = int(n)\n\n    return (whole, frac)\n\nclass Perf(object):\n    def get_perf(self, run):\n        raise NotImplementedError\n\nclass ZeroPerf(object):\n    def get_perf(self, run):\n        return 0\n\nclass PerfFn(object):\n    def __init__(self, fn):\n        self.fn = fn\n\n    def get_perf(self, run):\n        if not (run.run_ok and run.check_ok):\n            return None\n\n        return self.fn(run.stdout, run.stderr)\n\nclass PerfRE(object):\n    def __init__(self, rexp, re_unit = None):\n        self.re = re.compile(rexp, re.MULTILINE)\n        self.units = re_unit\n\n        if re_unit:\n            assert self.units in MULTIPLIERS, \"Invalid unit %s\" % (re_unit)\n\n    def get_perf(self, run):\n        if not (run.run_ok and run.check_ok):\n            return None\n\n        if run.stdout != None:\n            run.stdout = run.stdout.replace(\"\\r\", \"\");\n        if run.stderr != None:\n            run.stderr = run.stderr.replace(\"\\r\", \"\");\n\n        m = self.re.search(run.stdout)\n        if not m and run.stderr:\n            m = self.re.search(run.stderr)\n\n        if not m:\n            log.debug(\"No match for perf re in stdout or stderr\")\n            return None\n\n        gd = m.groupdict()\n\n        time_ns = 0\n        if \"time_ns\" in gd:\n            # use time_ns only if present\n            time_ns = int(gd['time_ns'])\n        elif \"time_ms\" in gd:\n            time_ns = int(gd['time_ms']) * 
MULTIPLIERS['ms']\n        elif \"time_us\" in gd:\n            time_ns = int(gd['time_us']) * MULTIPLIERS['us']\n        elif \"time_s\" in gd:\n            time_ns = int(gd['time_s']) * MULTIPLIERS['s']\n        elif \"frac\" in gd:\n            w, f = int(gd['whole']), int(gd['frac'])\n\n            assert self.units is not None\n\n            m = MULTIPLIERS[self.units]\n\n            l = len(str(m)) - len(gd['frac'])\n            #print l\n            assert l > 0, l\n\n            time_ns = w * m + f * (10**(l-1))\n        elif \"float\" in gd:\n            assert self.units is not None\n\n            m = MULTIPLIERS[self.units]\n            \n            time_ns = int(float(gd['float']) * m)\n        else:\n            assert False, \"Unable to located named groups in perf regex (%s)\" % (gd,)\n\n        return {'time_ns': time_ns}\n\n\n__all__ = ['Perf', 'ZeroPerf', 'PerfFn', 'PerfRE']\n"
  },
  {
    "path": "scripts/experimental/bmk2/rsinfo.py",
    "content": "#!/usr/bin/env python\n#\n# bmk2info.py\n#\n# Dump information from bmktest2.py files (such as benchmark input\n# files, checker input files, etc.).\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport sys\nimport ConfigParser\nimport argparse\nfrom extras import *\nimport logging\nimport opdb\nimport os\nimport re\nimport sconvert\n\nlog = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO, format='%(levelname)-8s %(name)-10s %(message)s')\n\np = argparse.ArgumentParser(description=\"Dump information from bmktest2.py files\")\n#p.add_argument(\"output\", nargs=\"?\", default=\"/dev/stdout\")\np.add_argument('binputs', nargs='*', help=\"List of binaries to restrict\")\np.add_argument(\"-d\", dest=\"metadir\", metavar=\"PATH\", help=\"Path to load configuration from\", default=\".\")\np.add_argument(\"--iproc\", dest=\"inpproc\", metavar=\"FILE\", help=\"Input processor\")\np.add_argument(\"--bs\", dest=\"binspec\", metavar=\"FILE\", help=\"Binary specification\", default=\"./bmktest2.py\")\np.add_argument(\"--bispec\", dest=\"bispec\", metavar=\"FILE_OR_MNEMONIC\", help=\"Binary+Input specification\")\np.add_argument(\"--scan\", dest=\"scan\", metavar=\"PATH\", help=\"Recursively search PATH for bmktest2.py\")\np.add_argument(\"--xs\", dest=\"xtended_scan\", action=\"store_true\", help=\"Also recognize bmktest2-*.py in scans\")\np.add_argument(\"--ignore-missing-binaries\", action=\"store_true\", default = False)\np.add_argument(\"--cfg\", dest=\"configs\", action=\"append\", help=\"Configurations to apply. 
default is always applied if present\", default=[])\np.add_argument(\"--varcfg\", dest=\"varconfigs\", action=\"append\", help=\"Variable configs, specified as var=value\", default=[])\np.add_argument(\"-o\", dest=\"output\", help=\"Output file\")\np.add_argument(\"-v\", dest=\"verbose\", type=int, help=\"Verbosity\", default=0)\np.add_argument(\"-i\", dest=\"include\", action=\"append\", default=[], choices=set(['inputs', 'checker-inputs', 'all']))\n\nargs = p.parse_args()\n\nif len(args.include) == 0 or \"all\" in args.include:\n    args.include = ['inputs', 'checker-inputs']\n\nargs.include = set(args.include)\n\nloaded = standard_loader(args.metadir, args.inpproc, args.binspec, args.scan, args.bispec, args.binputs, ignore_missing_binaries = args.ignore_missing_binaries, bin_configs=args.configs, extended_scan = args.xtended_scan, varconfigs = args.varconfigs)\nif not loaded:\n    sys.exit(1)\nelse:\n    basepath, binspecs, l = loaded\n\nout = []\nrspecs = l.get_run_specs()\nfor rs in rspecs:\n    if \"inputs\" in args.include:\n        out += [(rs.input_name, f) for f in rs.get_input_files()]\n\n    if \"checker-inputs\" in args.include:\n        out += [(rs.input_name, f) for f in rs.checker.get_input_files()]\n\nout = list(set(out))\nif args.output:\n    of = open(args.output, \"w\")\nelse:\n    of = sys.stdout\n\nfor e in out:\n    of.write((\"%s %s\" % e) + \"\\n\")\n"
  },
  {
    "path": "scripts/experimental/bmk2/sconvert.py",
    "content": "#!/usr/bin/env python\n#\n# sconvert.py\n#\n# Simple converter for bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport convgraph\nimport re\nimport logging\nimport opdb\nimport argparse\nimport os\n\n# simple converter\n\nlog = logging.getLogger(__name__)\n\nclass ConvSpec(opdb.ObjectPropsCFG):\n    pass\n\ndef gen_xform_fn(srcname, dstname):\n    src_re = re.compile(srcname)\n\n    def f(s):\n        return src_re.sub(dstname, s)\n\n    return f\n\ndef load_convspec(convspec):\n    cs = ConvSpec(convspec, \"bmk2-convspec\", [\"2\"])\n    if not cs.load():\n        log.error(\"Unable to read config file\")\n        return None\n\n    return cs\n\ndef init_convgraph(cs):\n    all_types = set()\n    conv = {}\n\n    for n, s in cs.objects.iteritems():    \n        convgraph.register_conversion(s['src'], s['dst'], \n                                      gen_xform_fn(s['srcname'],\n                                                   s['dstname']))\n\n        all_types.add(s['src'])\n        all_types.add(s['dst'])\n\n        conv[(s['src'], s['dst'])] = n\n\n    return all_types, conv\n\n\ndef convert_one(cs, src, srcty, dst, dstty, all_types, conv, exists = None, verbose = 0):\n    if exists is None:\n        exists = {}\n\n    # might be useful for a copy?\n    if dstty in exists:\n        del exists[dstty]\n\n    #print exists\n\n    if srcty not in all_types:\n        log.error(\"Conversion from %s not supported\" % (srcty,))\n        return None\n\n    if dstty not in all_types:\n        log.error(\"Conversion to %s not supported\" % (dstty,))\n        return None\n\n    if dst == \"@output\":\n        dst = None\n\n    if not os.path.exists(src):\n        log.error(\"Input file '%s' does not exist\" % (src,))\n        return None\n\n    # silently skip destinations that already exist in database and on disk\n    if dst and 
os.path.exists(dst):\n        # we're also abandoning any intermediate files ...\n        # TODO: the planner should do this...\n        log.info(\"Destination `%s' already exists and is in database, not converting\" % (dst,))\n        return None\n    \n    c = convgraph.get_conversion(src, srcty, dst, dstty, exists, verbose)\n    if not c:\n        log.error(\"Unable to plan conversion from %s to %s\" % (srcty, dstty))\n        return None\n\n    if False:\n        print >>sys.stderr, c\n\n    if dst is None:\n        # we had to figure out the output name\n        dst = c[-1][3]\n\n    # skip destinations that only exist on disk but not in database\n    if os.path.exists(dst):\n        log.info(\"Destination `%s' already exists, not converting. But it is not in database, you need to update inputdb.\" % (dst,))\n        return None\n\n    out = []\n    for cmd, fs, fsty, ds, dsty in c:\n        assert cmd == \"convert_direct\", \"Unsupported: %s\" % (cmd,)\n        assert (fsty, dsty) in conv, \"Planner got it wrong: %s -> %s unsupported\" % (fst, dsty)\n\n        if os.path.exists(ds):\n            continue\n\n        cmd = cs.objects[conv[(fsty, dsty)]]['cmd']\n        cmd = cmd.format(src = fs, dst=ds, verbose=1)\n\n        out.append((ds, fs, cmd))\n\n    return (dst, out)\n\ndef to_makefile(f, dst_rule_array):\n    targets = []\n    nout = []\n\n    for dst, rules in dst_rule_array:\n        targets.append(dst)\n\n        for rule in rules:\n            nout.append(\"\"\"\n{dst}: {src}\n\\t{cmd}\"\"\".format(src=rule[1], dst=rule[0], cmd=rule[2]))\n\n\n\n    if len(targets):\n        print >>f, \"all: %s\" % (\" \".join(targets))\n        print >>f, \"\\n\".join(nout)\n\nif __name__ == '__main__':\n    import bmk2\n    import config\n    import os\n    import sys\n\n    logging.basicConfig(level=logging.INFO, format='%(levelname)-8s %(name)-10s %(message)s')\n\n    p = argparse.ArgumentParser(description=\"Convert commands to convert input file to 
destination\")\n    p.add_argument(\"input\", help=\"Input file\")\n    p.add_argument(\"input_type\", help=\"Input file type\")\n    p.add_argument(\"dst_type\", help=\"Destination file type (name will be autodetermined)\")\n    p.add_argument(\"dst\", nargs=\"?\", help=\"Destination file name (optional)\")\n\n    p.add_argument(\"-o\", dest=\"output\", metavar=\"FILE\", help=\"Output makefile\", default=\"/dev/stdout\")\n    p.add_argument(\"-d\", dest=\"metadir\", metavar=\"PATH\", help=\"Path to load configuration from\", default=\".\")\n    p.add_argument(\"-v\", dest=\"verbose\", type=int, help=\"Verbosity\", default=0)\n\n    args = p.parse_args()\n    \n    cfg = config.Config(args.metadir)\n    if not cfg:\n        sys.exit(1)\n\n    convspec = cfg.get_var('convspec', None)\n    if not convspec:\n        log.error(\"No 'convspec' in config file\")\n        sys.exit(1)\n\n    cs = load_convspec(os.path.join(cfg.metadir, convspec))\n    if not cs:\n        sys.exit(1)\n        \n    all_types, conv = init_convgraph(cs)\n    cmds = convert_one(cs, args.input, args.input_type, args.dst, args.dst_type, all_types, conv)\n    if cmds is None:\n        sys.exit(1)\n\n    f = open(args.output, \"w\")\n    to_makefile(f, [cmds])\n    f.close()\n"
  },
  {
    "path": "scripts/experimental/bmk2/summlog.py",
    "content": "#!/usr/bin/env python\n#\n# bmk2info.py\n#\n# Dump information from bmktest2.py files (such as benchmark input\n# files, checker input files, etc.).\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport sys\nimport ConfigParser\nimport argparse\nfrom extras import *\nimport logging\nimport opdb\nimport os\nimport re\nimport sconvert\n\nlog = logging.getLogger(__name__)\nlogging.basicConfig(level=logging.INFO, format='%(levelname)-8s %(name)-10s %(message)s')\n\np = argparse.ArgumentParser(description=\"Dump information from bmktest2.py files\")\n#p.add_argument(\"output\", nargs=\"?\", default=\"/dev/stdout\")\np.add_argument('binputs', nargs='*', help=\"List of binaries to restrict\")\np.add_argument(\"-d\", dest=\"metadir\", metavar=\"PATH\", help=\"Path to load configuration from\", default=\".\")\np.add_argument(\"--iproc\", dest=\"inpproc\", metavar=\"FILE\", help=\"Input processor\")\np.add_argument(\"--bs\", dest=\"binspec\", metavar=\"FILE\", help=\"Binary specification\", default=\"./bmktest2.py\")\np.add_argument(\"--bispec\", dest=\"bispec\", metavar=\"FILE_OR_MNEMONIC\", help=\"Binary+Input specification\")\np.add_argument(\"--scan\", dest=\"scan\", metavar=\"PATH\", help=\"Recursively search PATH for bmktest2.py\")\np.add_argument(\"--xs\", dest=\"xtended_scan\", action=\"store_true\", help=\"Also recognize bmktest2-*.py in scans\")\np.add_argument(\"--ignore-missing-binaries\", action=\"store_true\", default = False)\np.add_argument(\"--cfg\", dest=\"configs\", action=\"append\", help=\"Configurations to apply. 
default is always applied if present\", default=[])\np.add_argument(\"--varcfg\", dest=\"varconfigs\", action=\"append\", help=\"Variable configs, specified as var=value\", default=[])\np.add_argument(\"-o\", dest=\"output\", help=\"Output file\")\np.add_argument(\"-v\", dest=\"verbose\", type=int, help=\"Verbosity\", default=0)\np.add_argument(\"-i\", dest=\"include\", action=\"append\", default=[], choices=set(['inputs', 'checker-inputs', 'all']))\n\nargs = p.parse_args()\n\nif len(args.include) == 0 or \"all\" in args.include:\n    args.include = ['inputs', 'checker-inputs']\n\nargs.include = set(args.include)\n\nloaded = standard_loader(args.metadir, args.inpproc, args.binspec, args.scan, args.bispec, args.binputs, ignore_missing_binaries = args.ignore_missing_binaries, bin_configs=args.configs, extended_scan = args.xtended_scan, varconfigs = args.varconfigs)\nif not loaded:\n    sys.exit(1)\nelse:\n    basepath, binspecs, l = loaded\n\nout = []\nrspecs = l.get_run_specs()\nfor rs in rspecs:\n    if \"inputs\" in args.include:\n        out += [(rs.input_name, f) for f in rs.get_input_files()]\n\n    if \"checker-inputs\" in args.include:\n        out += [(rs.input_name, f) for f in rs.checker.get_input_files()]\n\nout = list(set(out))\nif args.output:\n    of = open(args.output, \"w\")\nelse:\n    of = sys.stdout\n\nfor e in out:\n    of.write((\"%s %s\" % e) + \"\\n\")\n"
  },
  {
    "path": "scripts/experimental/bmk2/test2.py",
    "content": "#!/usr/bin/env python\n#\n# test2.py\n#\n# Main test runner for bmk2.\n#\n# Copyright (c) 2015, 2016 The University of Texas at Austin\n#\n# Author: Sreepathi Pai <sreepai@ices.utexas.edu>\n#\n# Intended to be licensed under GPL3\n\nimport sys\nimport argparse\nimport os\nimport bmk2\nimport logging\nimport datetime\nimport time\nfrom extras import *\nimport logproc\nimport overlays\nimport config\nimport core\nimport signal\n\nif os.name != \"nt\":\n    import resource\n    \nimport platform\n\nTIME_FMT = \"%Y-%m-%d %H:%M:%S\"\n\nif hasattr(time, 'monotonic'):\n    time_fn = time.monotonic\nelse:\n    time_fn = time.time\n\ndef log_env():\n    interesting = ['CUDA_VISIBLE_DEVICES']\n\n    for v in interesting:\n        if v in os.environ:\n            log.info('Environment: %s=%s' % (v, os.environ[v]))\n\ndef load_rlimits(lo, mt = 1):\n    x = core.RLimit()\n    rlimit_cpu = lo.config.get_var(\"rlimit.cpu\", None)\n    if rlimit_cpu is not None:        \n        log.info('Setting RLIMIT_CPU to %s' % (int(rlimit_cpu)*mt,))\n        x.setrlimit(resource.RLIMIT_CPU, (int(rlimit_cpu)*mt, int(rlimit_cpu)*mt))\n\n    return x\n\ndef squash_output(output, buf_size = 1800):\n    if buf_size <= 0:\n        return output\n    else:\n        return core.squash_output(core.strip_repeated_lines(output), buf_size)\n    \ndef read_log(logfiles):\n    if not isinstance(logfiles, list):\n        logfiles = [logfiles]\n\n    binids = set()\n    for l in logfiles:\n        for r in logproc.parse_log_file(l):\n            if r.type == \"TASK_COMPLETE\":\n                binids.add(r.rsid)\n\n    return binids\n\ndef std_run(args, rs, runid):\n    rsid = rs.get_id()\n    x = rs.run(runid)\n\n    if x.run_ok:\n        if args.verbose:\n            if x.stdout: log.info(\"%s STDOUT\\n\" %(rsid) + squash_output(x.stdout, args.max_output))\n            if x.stderr: log.info(\"%s STDERR\\n\" %(rsid) + squash_output(x.stderr, args.max_output))\n\n        if 
rs.checker.check(x):\n            log.log(PASS_LEVEL, \"%s: %s\" % (rsid, x))\n            x.cleanup()\n            return True, x\n        else:\n            log.log(FAIL_LEVEL, \"%s %s: check failed: %s\" % (rsid, runid, x))\n            if args.always_cleanup:\n                x.cleanup()\n            return False, x\n    else:\n        log.log(FAIL_LEVEL, \"%s %s: run failed\" % (rsid, runid))\n        if x.stdout: log.info(\"%s STDOUT\\n\" %(rsid) + squash_output(x.stdout, args.max_output))\n        if x.stderr: log.info(\"%s STDERR\\n\" %(rsid) + squash_output(x.stderr, args.max_output) + \"%s END\\n\" % (rsid))\n        x.cleanup()\n        return False, x\n    \ndef do_run(args, rspecs):\n    log.info(\"TASK run\")\n\n    xid_base = str(time.time()) # this should really be a nonce\n    runid = 0\n    for rs in rspecs:\n        rsid = rs.get_id()\n        xid_c = xid_base + \".\" + str(runid)\n        runid += 1\n\n\n        # TODO: use time.monotic()\n        startat = time_fn()\n        run_ok, x = std_run(args, rs, xid_c) # in this case because we do not repeat, xid_c == runid\n        endat = time_fn()\n\n        total_time = endat - startat \n\n        if not run_ok and args.fail_fast:\n            sys.exit(1)\n        \n        if run_ok:\n            log.log(TASK_COMPLETE_LEVEL, \"%s RUN %f\" % (rsid, total_time))\n            \ndef do_perf(args, rspecs):\n    log.info(\"TASK perf\")\n    xid_base = str(time.time()) # this should really be a nonce\n    runid = 0\n\n    for rs in rspecs:\n        rsid = rs.get_id()\n        run = 0\n        repeat = 0\n        runid += 1\n\n        while run < args.repeat:\n            xid_c = xid_base + \".\" + str(runid)\n            runid2 = xid_c + \".\" + str(run + repeat)\n\n            ts = datetime.datetime.now()\n            log.info(\"PERFDATE BEGIN_RUN %s\" % (ts.strftime(TIME_FMT)))\n            run_ok, x = std_run(args, rs, runid2)\n            log.info(\"PERFDATE END_RUN %s\" % 
(datetime.datetime.now().strftime(TIME_FMT)))\n\n            if run_ok:\n                p = rs.perf.get_perf(x)\n                if p is None:\n                    log.log(FAIL_LEVEL, \"%s %s: perf extraction failed: %s\" % (rsid, runid2, x))\n                    if args.fail_fast:\n                        sys.exit(1)\n                    else:\n                        break\n\n                # TODO: delay this until we have all repeats?\n                log.log(PERF_LEVEL, \"%s %s %s %s %s\" % (rsid, xid_c, run, p['time_ns'], x))\n                run += 1\n            else:\n                if x.retval == -signal.SIGKILL or (x.retval == 256-signal.SIGKILL):\n                    # 255 - signal.SIGKILL will be when measure_energy is on for example\n                    if run == 0:\n                        log.log(FAIL_LEVEL, \"%s %s: killed\" % (rsid, runid2))\n                        # first run failed, don't continue when killed out of time.\n                        log.log(FAIL_LEVEL, \"MISSING PERF %s\" % (rsid,))\n\n                        if args.fail_fast:\n                            sys.exit(1)\n                            \n                        break\n                    \n                if repeat < 3:\n                    log.log(FAIL_LEVEL, \"%s %s: failed, re-running: %s\" % (rsid, runid2, x))\n                    repeat += 1\n                else:\n                    if run == 0:\n                        # we never managed to run this ...\n                        log.log(FAIL_LEVEL, \"MISSING PERF %s\" % (rsid,))\n\n                    if args.fail_fast:\n                        sys.exit(1)\n\n                    break\n\n        if run > 0:\n            log.log(TASK_COMPLETE_LEVEL, \"%s PERF %d/%d/%d\" % (rsid, run, repeat, args.repeat))\n        \n            \n\ndef check_rspecs(rspecs):\n    checks = []\n    out = []\n    all_ok = True\n\n    for rs in rspecs:\n        x = rs.check()\n        if not x:\n            if 
args.ignore_missing_binaries and len(rs.errors) == 1 and 'missing-binary' in rs.errors:\n                # do not add rs to out [and do not pass go.]\n                all_ok = False\n                continue\n\n        checks.append(x)\n        out.append(rs)\n\n    return all_ok, checks, out\n\ndef populate_black_list(black_list_file):\n    f = open(black_list_file)\n    lines = f.read().replace(\"\\r\",\"\").split(\"\\n\")\n    f.close()\n    filtered_lines = [l for l in lines if l != \"\"]\n    return filtered_lines\n\nlog = logging.getLogger(__name__)\n\nFAIL_LEVEL = logging.getLevelName(\"ERROR\") + 1\nPASS_LEVEL = logging.getLevelName(\"ERROR\") + 2\nPERF_LEVEL = logging.getLevelName(\"ERROR\") + 3\nCOLLECT_LEVEL = logging.getLevelName(\"ERROR\") + 4\nTASK_COMPLETE_LEVEL = logging.getLevelName(\"ERROR\") + 5\nBLACK_LIST = []\n\nlogging.addLevelName(FAIL_LEVEL, \"FAIL\")\nlogging.addLevelName(PASS_LEVEL, \"PASS\")\nlogging.addLevelName(PERF_LEVEL, \"PERF\")\nlogging.addLevelName(COLLECT_LEVEL, \"COLLECT\")\nlogging.addLevelName(TASK_COMPLETE_LEVEL, \"TASK_COMPLETE\")\n\np = argparse.ArgumentParser(\"Run tests\")\np.add_argument(\"-d\", dest=\"metadir\", metavar=\"PATH\", help=\"Path to load configuration from\", default=\".\")\np.add_argument(\"--iproc\", dest=\"inpproc\", metavar=\"FILE\", help=\"Input processor\")\np.add_argument(\"--bs\", dest=\"binspec\", metavar=\"FILE\", help=\"Binary specification\", default=\"./bmktest2.py\")\np.add_argument(\"--bispec\", dest=\"bispec\", metavar=\"FILE_OR_MNEMONIC\", help=\"Binary+Input specification\")\np.add_argument(\"--scan\", dest=\"scan\", metavar=\"PATH\", help=\"Recursively search PATH for bmktest2.py\")\np.add_argument(\"--xs\", dest=\"xtended_scan\", action=\"store_true\", help=\"Also recognize bmktest2-*.py in scans\")\n\np.add_argument(\"--log\", dest=\"log\", metavar=\"FILE\", help=\"Store logs in FILE\")\np.add_argument(\"--blacklist\", dest=\"blacklist_file\", metavar=\"FILE\", help=\"a list of 
applications to skip in FILE\")\np.add_argument(\"--ignore-missing-binaries\", action=\"store_true\", default = False)\np.add_argument(\"--cuda-profile\", dest=\"cuda_profile\", action=\"store_true\", help=\"Enable CUDA profiling\")\np.add_argument(\"--cp-cfg\", dest=\"cuda_profile_config\", metavar=\"FILE\", help=\"CUDA Profiler configuration\")\np.add_argument(\"--cp-log\", dest=\"cuda_profile_log\", action=\"store_true\", help=\"CUDA Profiler logfile\", default=\"{xtitle}cp_{rsid}_{runid}.log\")\np.add_argument(\"--only\", dest=\"only\", help=\"Only run binids in FILE\")\np.add_argument(\"--invert-only\", dest=\"invert_only\", action=\"store_true\", help=\"Invert --only, do NOT run binids in FILE\")\np.add_argument(\"--always-cleanup\", dest=\"always_cleanup\", action=\"store_true\", help=\"Always cleanup files even if checks fail\")\np.add_argument(\"--nvprof\", dest=\"nvprof\", action=\"store_true\", help=\"Enable CUDA profiling via NVPROF\")\np.add_argument(\"--nvp-metrics\", dest=\"nvp_metrics\", help=\"Comma-separated list of NVPROF metrics\")\np.add_argument(\"--nvp-events\", dest=\"nvp_events\", help=\"Comma-separated list of NVPROF events\")\np.add_argument(\"--nvp-metfiles\", dest=\"nvp_metric_files\", help=\"Comma-separated list of NVPROF metric files\")\np.add_argument(\"--npdb\", dest=\"npdb\", action=\"store_true\", help=\"Generate a profile database instead of a CSV\")\np.add_argument(\"--npanalysis\", dest=\"npanalysis\", action=\"store_true\", help=\"Supply --analysis-metrics to nvprof\")\np.add_argument(\"--npsystem\", dest=\"npsystem\", action=\"store_true\", help=\"Supply --system-profiling to nvprof\")\np.add_argument(\"--max-output-bytes\", dest=\"max_output\", type=int, metavar=\"BYTES\", help=\"Truncate output and error logs from runs if they exceed BYTES, zero to never truncate\", default=1600)\np.add_argument(\"--xtitle\", dest=\"xtitle\", help=\"Title of experiment\")\np.add_argument(\"--cfg\", dest=\"configs\", action=\"append\", 
help=\"Configurations to apply. default is always applied if present\", default=[])\np.add_argument(\"--varcfg\", dest=\"varconfigs\", action=\"append\", help=\"Variable configs, specified as var=value\", default=[])\np.add_argument(\"--measure-energy\", dest=\"measure_energy\", action=\"store_true\", help=\"Measure energy of run\")\np.add_argument(\"--read\", dest=\"readlog\", metavar=\"FILE\", help=\"Read previous log\")\np.add_argument('-v', \"--verbose\", dest=\"verbose\", action=\"store_true\", help=\"Show stdout and stderr of executing programs\", default=False)\np.add_argument('--missing', dest=\"missing\", action=\"store_true\", help=\"Select new/missing runspecs\")\n\np.add_argument(\"--retrace\", dest=\"retrace\", metavar=\"FILE\", help=\"Read map file FILE and rerun traces\")\np.add_argument(\"--cl-device\", dest=\"cl_device\", metavar=\"PLATFORM,DEVICE\", help=\"Run binary on PLATFORM,DEVICE\")\np.add_argument(\"--cl-cmdline\", dest=\"cl_cmdline\", metavar=\"TEMPLATE\", help=\"Command template for OpenCL device selection\")\np.add_argument(\"--mtcpulimit\", dest=\"mtcpulimit\", help=\"Multiply CPU limit by this number (usually max. 
number of threads)\", default=1,type=int)\n\nsp = p.add_subparsers(help=\"sub-command help\", dest=\"command\")\nplist = sp.add_parser('list', help=\"List runspecs\")\nplist.add_argument('binputs', nargs='*', help=\"Limit to binaries and/or inputs\")\nplist.add_argument('--show-files', action=\"store_true\", help=\"Limit to binaries and/or inputs\", default=False)\n\nprun = sp.add_parser('run', help=\"Run binaries\")\nprun.add_argument('binputs', nargs='*', help=\"List of binaries and/or inputs to execute\")\nprun.add_argument('--ff', dest=\"fail_fast\", action=\"store_true\", help=\"Fail fast\", default=False)\n\npperf = sp.add_parser('perf', help=\"Run performance tests\")\npperf.add_argument('binputs', nargs='*', help=\"List of binaries and/or inputs to execute\")\npperf.add_argument('--ff', dest=\"fail_fast\", action=\"store_true\", help=\"Fail fast\", default=False)\npperf.add_argument('-r', dest=\"repeat\", metavar=\"N\", type=int, help=\"Number of repetitions\", default=3)\n\ncmd_line = \" \".join(sys.argv)\n\nargs = p.parse_args()\n\nPREV_BINIDS = set()\nif args.readlog:\n    assert args.readlog != args.log\n    PREV_BINIDS = read_log(args.readlog)\n\nif args.log:\n    logging.basicConfig(level=logging.DEBUG, format='%(levelname)s %(message)s', filename=args.log, filemode='wb') # note the 'wb', instead of 'a'\n    console = logging.StreamHandler()\n    fmt = logging.Formatter('%(levelname)-8s %(name)-10s %(message)s')\n    console.setLevel(logging.INFO)\n    console.setFormatter(fmt)\n    logging.getLogger('').addHandler(console)\nelse:\n    logging.basicConfig(level=logging.INFO, format='%(levelname)-8s %(name)-10s %(message)s')\n\nif args.readlog:\n    log.info('%d completed task rsids read from log' % (len(PREV_BINIDS)))\n\nif args.blacklist_file:\n    BLACK_LIST = populate_black_list(args.blacklist_file)\n    log.info('black created for applications: %s' % \" \".join(BLACK_LIST))\n\n\nloaded = standard_loader(args.metadir, args.inpproc, args.binspec, 
args.scan, \n                         args.bispec, args.binputs, \n                         args.ignore_missing_binaries, bin_configs=args.configs, \n                         extended_scan = args.xtended_scan, \n                         black_list = BLACK_LIST, varconfigs = args.varconfigs)\nif not loaded:\n    sys.exit(1)\nelse:\n    basepath, binspecs, l = loaded\n\nrspecs = l.get_run_specs(l.config)\nrspecs.sort(key=lambda x: x.bid)\n\nall_ok, checks, rspecs = check_rspecs(rspecs)\n\nif not all(checks):\n    log.info(\"Some checks failed. See previous error messages for information.\")\n    sys.exit(1)\n\nif all_ok:\n    log.info(\"Configuration loaded successfully.\")\nelse:\n    log.info(\"Configuration loaded with some errors ignored. See previous error messages for information.\")\n\nstart = datetime.datetime.now()\nlog.info(\"SYSTEM: %s\" % (\",\".join(platform.uname())))\nlog.info(\"DATE START %s\" % (start.strftime(TIME_FMT)))\nlog.log(COLLECT_LEVEL, \"basepath %s\" % (basepath,))\nlog.info(\"CMD_LINE: %s\" % (cmd_line))\nlog_env()\n\nif args.missing:\n    rspecs = filter(lambda rs: rs.get_id() not in PREV_BINIDS, rspecs)\n\nif args.only:\n    onlybinids = set([s.strip() for s in open(args.only, \"r\").readlines() if s != '\\n'])\n    all_rsids = set([rs.get_id() for rs in rspecs])\n\n    if onlybinids.intersection(all_rsids) != onlybinids:\n        log.error('Subset IDs did not match (possibly misspelt?): %s' % (onlybinids.difference(all_rsids)))\n        if args.binputs is None or len(args.binputs) == 0: \n            sys.exit(1) \n\n    onlybinids = onlybinids.intersection(all_rsids)\n\n    if not args.invert_only:\n        log.info(\"SUBSET: %s\" % (onlybinids,))\n        rspecs = filter(lambda rs: rs.get_id() in onlybinids, rspecs)\n    else:\n        log.info(\"EXCLUDING: %s\" % (onlybinids,))\n        log.info(\"SUBSET: %s\" % (all_rsids - onlybinids,))\n        rspecs = filter(lambda rs: rs.get_id() not in onlybinids, rspecs)\n\n\nif 
args.xtitle:\n    for rs in rspecs:\n        rs.vars['xtitle'] = args.xtitle\n\nif args.cl_device:\n    cl_platform, cl_device = args.cl_device.split(\",\")\n    cl_cmdline = args.cl_cmdline or l.config.get_var(\"cl_cmdline\", None) or \"-p {platform} -d {device}\"\n        \n    overlays.add_overlay(rspecs, overlays.CLDeviceOverlay, cmdline_template=cl_cmdline, \n                         cl_platform = cl_platform, cl_device = cl_device)\n\nif args.cuda_profile:\n    cp_cfg_file = args.cuda_profile_config or l.config.get_var(\"cp_cfg\", None)\n    cp_log_file = args.cuda_profile_log or l.config.get_var(\"cp_log\", None)\n\n    if cp_cfg_file:\n        assert os.path.exists(cp_cfg_file) and os.path.isfile(cp_cfg_file), \"CUDA Profiler Config '%s' does not exist or is not a file\" % (cp_cfg_file,)\n\n    overlays.add_overlay(rspecs, overlays.CUDAProfilerOverlay, profile_cfg=cp_cfg_file, profile_log=cp_log_file)\nelif args.nvprof:\n    cp_log_file = args.cuda_profile_log or l.config.get_var(\"cp_log\", None)\n    cfg = []\n    metrics = []\n    events = []\n    if args.nvp_metrics:        \n        metrics.extend(args.nvp_metrics.split(\",\"))\n\n    if args.nvp_events:\n        events.extend(args.nvp_events.split(\",\"))\n\n    if args.nvp_metric_files:\n        nvpdir = l.config.get_var(\"nvprof_dir\", args.metadir)\n        files = [os.path.join(nvpdir, a) for a in args.nvp_metric_files.split(\",\")]\n        metrics.extend(read_line_terminated_cfg(files))\n                \n    if len(metrics):\n        cfg.append(\"--metrics %s\" % (\",\".join(metrics),))\n\n    if len(events):\n        cfg.append(\"--events %s\" % (\",\".join(events),))\n\n    cfg = \" \".join(cfg)\n    if args.npdb or args.npanalysis:\n        cp_log_file = cp_log_file.replace(\".log\", \".nvprof\")\n        \n    overlays.add_overlay(rspecs, overlays.NVProfOverlay, profile_cfg=cfg, profile_log=cp_log_file, profile_db = args.npdb, profile_analysis=args.npanalysis, 
system_profiling=args.npsystem)\n\ntmpdir = l.config.get_var(\"tmpdir\", None)\nif tmpdir: \n    assert (os.path.exists(tmpdir) and os.path.isdir(tmpdir)), \"Temporary directory '%s' does not exist or is not a directory\" % (tmpdir,)\n    overlays.add_overlay(rspecs, overlays.TmpDirOverlay, tmpdir)\n    for r in rspecs:\n        r.set_tmpdir(tmpdir)\n\noverlays.add_overlay(rspecs, overlays.Bmk2RTEnvOverlay)\n\nif args.retrace:\n    overlays.add_overlay(rspecs, overlays.GGCInstrOverlay, args.retrace)\n\nif args.measure_energy:\n    overlays.add_overlay(rspecs, overlays.MeasureEnergyOverlay)\n\nrl = load_rlimits(l, args.mtcpulimit)\n\nfor r in rspecs:\n    r.set_rlimit(rl)\n\nif args.command == \"list\":\n    prev_bid = None\n    for rs in rspecs:\n        if rs.bid != prev_bid:\n            print rs.bid,\n            prev_bid = rs.bid\n            if rs.bid in l.config.disable_binaries:\n                print \"\\t** DISABLED **\",\n            print\n\n        print \"\\t\", rs.input_name\n        if args.show_files:\n            files = rs.get_input_files() +rs.checker.get_input_files()\n            print \"\\t\\t\", \" \".join(files)\nelif args.command == \"run\":\n    for b in l.config.disable_binaries:\n        log.info(\"DISABLED BINARY %s\" % (b,))\n\n    rspecs = [rs for rs in rspecs if rs.bid not in l.config.disable_binaries]\n    do_run(args, rspecs)\nelif args.command == \"perf\":\n    for b in l.config.disable_binaries:\n        log.info(\"DISABLED BINARY %s\" % (b,))\n\n    rspecs = [rs for rs in rspecs if rs.bid not in l.config.disable_binaries]\n    do_perf(args, rspecs)\n\nsummarize(log, rspecs)    \nend = datetime.datetime.now()\nlog.info(\"DATE END %s\" % (end.strftime(TIME_FMT)))\nlog.info(\"APPROXIMATE DURATION %s\" % (end - start)) # modulo clock adjusting, etc.\nlogging.shutdown()\n\n#def load_rlimits(lo):\n#    x = core.RLimit()\n#    rlimit_cpu = lo.config.get_var(\"rlimit.cpu\", None)\n#    if rlimit_cpu is not None:        \n#        
log.info('Setting RLIMIT_CPU to %s' % (rlimit_cpu,))\n#        x.setrlimit(resource.RLIMIT_CPU, (int(rlimit_cpu), int(rlimit_cpu)))\n#\n#    return x\n#\n#def read_log(logfiles):\n#    if not isinstance(logfiles, list):\n#        logfiles = [logfiles]\n#\n#    binids = set()\n#    for l in logfiles:\n#        for r in logproc.parse_log_file(l):\n#            if r.type == \"TASK_COMPLETE\":\n#                binids.add(r.rsid)\n#\n#    return binids\n#\n#def std_run(args, rs, runid):\n#    rsid = rs.get_id()\n#    x = rs.run(runid)\n#\n#    if x.run_ok:\n#        if args.verbose:\n#            if x.stdout: log.info(x.stdout)\n#            if x.stderr: log.info(x.stderr)\n#        return True, x\n#\n#        if rs.checker.check(x):\n#            log.log(PASS_LEVEL, \"%s: %s\" % (rsid, x))\n#            x.cleanup()\n#            return True, x\n#        else:\n#            log.log(FAIL_LEVEL, \"%s: check failed: %s\" % (rsid, x))\n#            return False, x\n#    else:\n#        log.log(FAIL_LEVEL, \"%s: run failed\" % (rsid))\n#        if x.stdout: log.info(\"%s STDOUT\\n\" %(rsid) + x.stdout)\n#        if x.stderr: log.info(\"%s STDERR\\n\" %(rsid) + x.stderr + \"%s END\\n\" % (rsid))\n#        x.cleanup()\n#        return False, x\n#    \n#def do_run(args, rspecs):\n#    log.info(\"TASK run\")\n#\n#    xid_base = str(time.time()) # this should really be a nonce\n#    runid = 0\n#    for rs in rspecs:\n#        rsid = rs.get_id()\n#        xid_c = xid_base + \".\" + str(runid)\n#        runid += 1\n#\n#        run_ok, x = std_run(args, rs, xid_c) # in this case because we do not repeat, xid_c == runid\n#        if not run_ok and args.fail_fast:\n#            sys.exit(1)\n#        \n#        if run_ok:\n#            log.log(TASK_COMPLETE_LEVEL, \"%s RUN\" % (rsid,))\n#            \n#def do_perf(args, rspecs):\n#    log.info(\"TASK perf\")\n#    xid_base = str(time.time()) # this should really be a nonce\n#    runid = 0\n#\n#    for rs in rspecs:\n#        rsid = 
rs.get_id()\n#        run = 0\n#        repeat = 0\n#        runid += 1\n#\n#        while run < args.repeat:\n#            xid_c = xid_base + \".\" + str(runid)\n#\n#            ts = datetime.datetime.now()\n#            log.info(\"PERFDATE BEGIN_RUN %s\" % (ts.strftime(TIME_FMT)))\n#            run_ok, x = std_run(args, rs, xid_c + \".\" + str(run + repeat))\n#            log.info(\"PERFDATE END_RUN %s\" % (datetime.datetime.now().strftime(TIME_FMT)))\n#\n#            if run_ok:\n#                p = rs.perf.get_perf(x)\n#                if p is None:\n#                    log.log(FAIL_LEVEL, \"%s: perf extraction failed: %s\" % (rsid, x))\n#                    if args.fail_fast:\n#                        sys.exit(1)\n#                    else:\n#                        break\n#\n#                # TODO: delay this until we have all repeats?\n#                log.log(PERF_LEVEL, \"%s %s %s %s %s\" % (rsid, xid_c, run, p['time_ns'], x))\n#                run += 1\n#            else:\n#                if repeat < 3:\n#                    log.log(FAIL_LEVEL, \"%s %s: failed, re-running: %s\" % (rsid, xid_c, x))\n#                    repeat += 1\n#                else:\n#                    if run == 0:\n#                        # we never managed to run this ...\n#                        log.log(FAIL_LEVEL, \"MISSING PERF %s\" % (rsid,))\n#                    else:\n#                        log.log(TASK_COMPLETE_LEVEL, \"%s PERF %d/%d/%d\" % (rsid, run, repeat, args.repeat))\n#\n#                    if args.fail_fast:\n#                        sys.exit(1)\n#\n#                    break\n#\n#def check_rspecs(rspecs):\n#    checks = []\n#    out = []\n#    all_ok = True\n#\n#    for rs in rspecs:\n#        x = rs.check()\n#        if not x:\n#            if args.ignore_missing_binaries and len(rs.errors) == 1 and 'missing-binary' in rs.errors:\n#                # do not add rs to out [and do not pass go.]\n#                all_ok = False\n#                
continue\n#\n#        checks.append(x)\n#        out.append(rs)\n#\n#    return all_ok, checks, out\n#\n#log = logging.getLogger(__name__)\n#\n#FAIL_LEVEL = logging.getLevelName(\"ERROR\") + 1\n#PASS_LEVEL = logging.getLevelName(\"ERROR\") + 2\n#PERF_LEVEL = logging.getLevelName(\"ERROR\") + 3\n#COLLECT_LEVEL = logging.getLevelName(\"ERROR\") + 4\n#TASK_COMPLETE_LEVEL = logging.getLevelName(\"ERROR\") + 5\n#\n#logging.addLevelName(FAIL_LEVEL, \"FAIL\")\n#logging.addLevelName(PASS_LEVEL, \"PASS\")\n#logging.addLevelName(PERF_LEVEL, \"PERF\")\n#logging.addLevelName(COLLECT_LEVEL, \"COLLECT\")\n#logging.addLevelName(TASK_COMPLETE_LEVEL, \"TASK_COMPLETE\")\n#\n#p = argparse.ArgumentParser(\"Run tests\")\n#p.add_argument(\"-d\", dest=\"metadir\", metavar=\"PATH\", \n#               help=\"Path to load configuration from\", default=\".\")\n#p.add_argument(\"--iproc\", dest=\"inpproc\", metavar=\"FILE\", \n#               help=\"Input processor\")\n#p.add_argument(\"--bs\", dest=\"binspec\", metavar=\"FILE\", \n#               help=\"Binary specification\", default=\"./bmktest2.py\")\n#p.add_argument(\"--bispec\", dest=\"bispec\", metavar=\"FILE_OR_MNEMONIC\", \n#               help=\"Binary+Input specification\")\n#p.add_argument(\"--scan\", dest=\"scan\", metavar=\"PATH\", \n#               help=\"Recursively search PATH for bmktest2.py\")\n#p.add_argument(\"--log\", dest=\"log\", metavar=\"FILE\", help=\"Store logs in FILE\")\n#p.add_argument(\"--ignore-missing-binaries\", action=\"store_true\", \n#               default = False)\n#p.add_argument(\"--cuda-profile\", dest=\"cuda_profile\", action=\"store_true\", \n#               help=\"Enable CUDA profiling\")\n#p.add_argument(\"--cp-cfg\", dest=\"cuda_profile_config\", metavar=\"FILE\", \n#               help=\"CUDA Profiler configuration\")\n#p.add_argument(\"--cp-log\", dest=\"cuda_profile_log\", action=\"store_true\", \n#               help=\"CUDA Profiler logfile\", 
default=\"cp_{rsid}_{runid}.log\")\n#\n#p.add_argument(\"--nvprof\", dest=\"nvprof\", action=\"store_true\", \n#               help=\"Enable CUDA profiling via NVPROF\")\n#p.add_argument(\"--nvp-metrics\", dest=\"nvp_metrics\", \n#               help=\"Comma-separated list of NVPROF metrics\")\n#\n#p.add_argument(\"--read\", dest=\"readlog\", metavar=\"FILE\", help=\"Read previous log\")\n#p.add_argument('-v', \"--verbose\", dest=\"verbose\", action=\"store_true\", \n#               help=\"Show stdout and stderr of executing programs\", default=False)\n#p.add_argument('--missing', dest=\"missing\", action=\"store_true\", \n#               help=\"Select new/missing runspecs\")\n#\n#sp = p.add_subparsers(help=\"sub-command help\", dest=\"command\")\n#plist = sp.add_parser('list', help=\"List runspecs\")\n#plist.add_argument('binputs', nargs='*', help=\"Limit to binaries and/or inputs\")\n#plist.add_argument('--show-files', action=\"store_true\", \n#                   help=\"Limit to binaries and/or inputs\", default=False)\n#\n#prun = sp.add_parser('run', help=\"Run binaries\")\n#prun.add_argument('binputs', nargs='*', \n#                  help=\"List of binaries and/or inputs to execute\")\n#prun.add_argument('--ff', dest=\"fail_fast\", action=\"store_true\", \n#                  help=\"Fail fast\", default=False)\n#\n#pperf = sp.add_parser('perf', help=\"Run performance tests\")\n#pperf.add_argument('binputs', nargs='*', \n#                   help=\"List of binaries and/or inputs to execute\")\n#pperf.add_argument('--ff', dest=\"fail_fast\", action=\"store_true\", \n#                   help=\"Fail fast\", default=False)\n#pperf.add_argument('-r', dest=\"repeat\", metavar=\"N\", type=int, \n#                   help=\"Number of repetitions\", default=3)\n#\n#args = p.parse_args()\n#\n#PREV_BINIDS = set()\n#\n#if args.readlog:\n#    assert args.readlog != args.log\n#    PREV_BINIDS = read_log(args.readlog)\n#\n#if args.log:\n#    
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s %(message)s', \n#                        filename=args.log, filemode='wb') # note the 'wb', instead of 'a'\n#    console = logging.StreamHandler()\n#    fmt = logging.Formatter('%(levelname)-8s %(name)-10s %(message)s')\n#    console.setLevel(logging.INFO)\n#    console.setFormatter(fmt)\n#    logging.getLogger('').addHandler(console)\n#else:\n#    logging.basicConfig(level=logging.INFO, format='%(levelname)-8s %(name)-10s %(message)s')\n#\n#if args.readlog:\n#    log.info('%d completed task rsids read from log' % (len(PREV_BINIDS)))\n#\n#\n#loaded = standard_loader(args.metadir, args.inpproc, args.binspec, args.scan, \n#                         args.bispec, args.binputs, args.ignore_missing_binaries)\n#if not loaded:\n#    sys.exit(1)\n#else:\n#    basepath, binspecs, l = loaded\n#\n#rspecs = l.get_run_specs(l.config)\n#rspecs.sort(key=lambda x: x.bid)\n#\n#all_ok, checks, rspecs = check_rspecs(rspecs)\n#\n#if not all(checks):\n#    log.info(\"Some checks failed. See previous error messages for information.\")\n#    sys.exit(1)\n#\n#if all_ok:\n#    log.info(\"Configuration loaded successfully.\")\n#else:\n#    log.info(\"Configuration loaded with some errors ignored. 
See previous error messages for information.\")\n#\n#start = datetime.datetime.now()\n#log.info(\"SYSTEM: %s\" % (\",\".join(os.uname())))\n#log.info(\"DATE START %s\" % (start.strftime(TIME_FMT)))\n#log.log(COLLECT_LEVEL, \"basepath %s\" % (basepath,))\n#\n#if args.missing:\n#    rspecs = filter(lambda rs: rs.get_id() not in PREV_BINIDS, rspecs)\n#\n#if args.cuda_profile:\n#    cp_cfg_file = args.cuda_profile_config or l.config.get_var(\"cp_cfg\", None)\n#    cp_log_file = args.cuda_profile_log or l.config.get_var(\"cp_log\", None)\n#\n#    if cp_cfg_file:\n#        assert os.path.exists(cp_cfg_file) and os.path.isfile(cp_cfg_file), \"CUDA Profiler Config '%s' does not exist or is not a file\" % (cp_cfg_file,)\n#\n#    overlays.add_overlay(rspecs, overlays.CUDAProfilerOverlay, profile_cfg=cp_cfg_file, profile_log=cp_log_file)\n#elif args.nvprof:\n#    cp_log_file = args.cuda_profile_log or l.config.get_var(\"cp_log\", None)\n#    overlays.add_overlay(rspecs, overlays.NVProfOverlay, profile_cfg=\"--metrics %s\" % (args.nvp_metrics), profile_log=cp_log_file)\n#\n#tmpdir = l.config.get_var(\"tmpdir\", None)\n#if tmpdir: \n#    assert (os.path.exists(tmpdir) and os.path.isdir(tmpdir)), \"Temporary directory '%s' does not exist or is not a directory\" % (tmpdir,)\n#    overlays.add_overlay(rspecs, overlays.TmpDirOverlay, tmpdir)\n#    for r in rspecs:\n#        r.set_tmpdir(tmpdir)\n#\n#rl = load_rlimits(l)\n#\n#for r in rspecs:\n#    r.set_rlimit(rl)\n#\n#if args.command == \"list\":\n#    prev_bid = None\n#    for rs in rspecs:\n#        if rs.bid != prev_bid:\n#            print rs.bid,\n#            prev_bid = rs.bid\n#            if rs.bid in l.config.disable_binaries:\n#                print \"\\t** DISABLED **\",\n#            print\n#\n#        print \"\\t\", rs.input_name\n#        if args.show_files:\n#            files = rs.get_input_files() +rs.checker.get_input_files()\n#            print \"\\t\\t\", \" \".join(files)\n#elif args.command == \"run\":\n#    
for b in l.config.disable_binaries:\n#        log.info(\"DISABLED BINARY %s\" % (b,))\n#\n#    rspecs = [rs for rs in rspecs if rs.bid not in l.config.disable_binaries]\n#    do_run(args, rspecs)\n#elif args.command == \"perf\":\n#    for b in l.config.disable_binaries:\n#        log.info(\"DISABLED BINARY %s\" % (b,))\n#\n#    rspecs = [rs for rs in rspecs if rs.bid not in l.config.disable_binaries]\n#    do_perf(args, rspecs)\n#\n#summarize(log, rspecs)    \n#end = datetime.datetime.now()\n#log.info(\"DATE END %s\" % (end.strftime(TIME_FMT)))\n#log.info(\"APPROXIMATE DURATION %s\" % (end - start)) # modulo clock adjusting, etc.\n#logging.shutdown()\n"
  },
  {
    "path": "scripts/experimental/buildFunc.sh",
    "content": "#!/bin/bash\n\nSRC_ROOT=\"$HOME/projects/GaloisCpp\"\n\nBUILD_ROOT=\"/workspace/$USER/build\"\n\nmkdir -p \"${BUILD_ROOT}\"\n\n\ncc=${cc:=\"gcc\"}\ncxx=${cxx:=\"g++\"}\nbuild=${build:=\"Debug\"}\ncmakeOpts=${cmakeOpts:=\"-DUSE_PAPI=1 -DUSE_VTUNE=1 -DGALOIS_ENABLE_DIST=1\"}\ncleanup=${cleanup:=\"0\"}\n\ngaloisCheckStatus() {\n  local cmd=\"$1\"\n  if eval \"$cmd\" ; then\n    echo \"OK: success running ($cmd)\"\n  else\n    echo \"ERROR: ($cmd) failed\"\n    exit -1\n  fi\n}\n\ngaloisSetCompilers() {\n  if [[ \"xx$cc\" == \"xxgcc\" ]] ; then\n    cxx=\"g++\";\n  elif [[ \"xx$cc\" == \"xxicc\" ]] ; then\n    cxx=\"icpc\";\n  elif [[ \"xx$cc\" == \"xxclang\" ]] ; then\n    cxx=\"clang++\";\n  else\n    cxx=\"not found\";\n  fi\n\n  galoisCheckStatus \"which $cc\"\n  galoisCheckStatus \"which $cxx\"\n}\n\n\ngaloisRunBuild() {\n  galoisSetCompilers\n  local buildDir=$(mktemp -d -p ${BUILD_ROOT} \"$cc-$build.XXXXXX\")\n  galoisCheckStatus \"cd $buildDir\"\n  galoisCheckStatus \"CC=$cc CXX=$cxx cmake -DCMAKE_BUILD_TYPE=$build $cmakeOpts ${SRC_ROOT}\"\n  galoisCheckStatus \"make -j\"\n  if [[ \"xx$cleanup\" == \"xx1\" ]] && [[ \"xx$buildDir\" != \"xx\" ]] ;  then\n    galoisCheckStatus \"rm -rf $buildDir\"\n  fi\n}\n\ngaloisBuildMultiCompiler() {\n  for c in \"gcc\" \"clang\" \"icc\"; do\n    build=\"Debug\"\n    cc=\"$c\"\n    galoisRunBuild;\n\n    # build=\"Release\"\n    # galoisRunBuild;\n  done\n}\n\ngaloisBuildMultiVer() {\n  for i in 5 6 7 ; do \n    galoisCheckStatus \"module load atc/1.$i\"\n    galoisBuildMultiCompiler\n  done\n}\n\ngaloisBuildGccDebug() {\n  cc=\"gcc\"\n  build=\"Debug\"\n  galoisRunBuild\n}\n\ngaloisBuildGccRelease() {\n  cc=\"gcc\"\n  build=\"Release\"\n  galoisRunBuild\n}\n\ngaloisBuildIccDebug() {\n  cc=\"icc\"\n  build=\"Debug\"\n  galoisRunBuild\n}\n\ngaloisBuildIccRelease() {\n  cc=\"icc\"\n  build=\"Release\"\n  galoisRunBuild\n}\n\ngaloisBuildClangDebug() {\n  cc=\"clang\"\n  build=\"Debug\"\n  
galoisRunBuild\n}\n\ngaloisBuildClangRelease() {\n  cc=\"clang\"\n  build=\"Release\"\n  galoisRunBuild\n}\n"
  },
  {
    "path": "scripts/experimental/buildMultiCompiler.sh",
    "content": "#!/bin/bash\n\nscriptsDir=$(dirname $0)\n\nsource $scriptsDir/buildFunc.sh\n\ngaloisBuildMultiCompiler\n\n\n"
  },
  {
    "path": "scripts/experimental/buildMultiVersion.sh",
    "content": "#!/bin/bash\n\nscriptsDir=$(dirname $0)\n\nsource $scriptsDir/buildFunc.sh\n\ngaloisBuildMultiVer\n\n"
  },
  {
    "path": "scripts/experimental/buildOnce.sh",
    "content": "#!/bin/bash\n\nscriptsDir=$(dirname $0)\n\nsource $scriptsDir/buildFunc.sh\n\ngaloisRunBuild\n"
  },
  {
    "path": "scripts/experimental/distbmk2/README",
    "content": "How to run:\n\nSet the following environment variables:\n\nBMK_DIST_PATH to the dist_apps\nBMK_DISTLOGS to wherever you want the logs to go\nBMK_MPIRUN_PATH to the mpirun binary\n\nThen run the following in THIS directory (same as README):\n\npython <path to bmk2/test2.py> run\n\nTODO explain the bmktest2.py file\n"
  },
  {
    "path": "scripts/experimental/distbmk2/bmk2.cfg",
    "content": "[bmk2]\nversion=2\ninputdb=dist.inputdb\ninputprops=dist.inputprops\nbispec=dist.bispec\npathToApps=${BMK_DIST_PATH}\nlogOutputDirectory=${BMK_DISTLOGS}\npathToMPIRun=${BMK_MPIRUN_PATH}\n"
  },
  {
    "path": "scripts/experimental/distbmk2/bmkprops.py",
    "content": "import bmk2\nimport datetime\n\nTIME_FMT = \"%Y-%m-%d %H:%M:%S\"\n\nclass GraphBMKDistApp(bmk2.Binary):\n  \"\"\"Base class for dist apps to inherit from. Subclasses specify benchmark\n  name + number of threads + number of hosts.\"\"\"\n  def __init__(self):\n    \"\"\"Initialize dist app properties.\"\"\"\n    self.props = GraphBMKDistAppProps(self.benchmark)\n        \n  def get_id(self):\n    \"\"\"Return the id of this benchmark.\"\"\"\n    return \"%s\" % (self.benchmark)\n\n  def getUniqueStatFile(self, numThreads, numHosts, currentCut, graphName):\n    \"\"\"Get a statfile name given num threads + graph name being used.\"\"\"\n    timeNow = datetime.datetime.now().strftime(TIME_FMT).replace(\" \", \"_\")\n\n    return (\"%s_t=%d_n=%d_%s_%s_%s.log\" % (self.benchmark, numThreads, numHosts,\n                                           graphName, currentCut, timeNow))\n\nclass GraphBMKDistAppProps(bmk2.Properties):\n  \"\"\"Properties pertaining to a dist app.\"\"\"\n  def __init__(self, benchmark):\n    self.benchmark = benchmark\n"
  },
  {
    "path": "scripts/experimental/distbmk2/bmktest2.py",
    "content": "import bmk2\nfrom bmkprops import GraphBMKDistApp\nimport os\n\n################################################################################\n# DistApp base class\n################################################################################\n\nclass DistApp(GraphBMKDistApp):\n  \"\"\"Base class that has default run spec construction behavior for most\n  dist apps.\"\"\"\n  # thread to start from\n  startThread = 40 \n  # thread to end at (inclusive)\n  endThread = 40\n  # step to use for looping through threads\n  step = 10\n\n  # list of hosts to loop through\n  testHosts = [1]\n  #testHosts = [1, 2, 3]\n\n  # list of cuts to test\n  # TODO use hybrid cuts?\n  cutsToTest = [\"oec\", \"iec\", \"cvc\"]\n\n  def filter_inputs(self, inputs):\n    \"\"\"Ignore inputs that aren't currently supported; dist apps only \n    support the Galois binary graph format.\"\"\"\n    def finput(x):\n      if x.props.format == 'bin/galois': return True\n      return False\n\n    return filter(finput, inputs)\n\n  def get_default_run_specs(self, bmkinput, config):\n    \"\"\"Creates default run specifications with common arguments for all\n    dist apps and returns them. 
They can be modified\n    later according to the benchmark that you want to run.\n    \"\"\"\n    assert config != None # config should be passed through test2.py\n\n    listOfRunSpecs = []\n\n    # TODO add cuts in as well....\n    for numThreads in range(self.startThread, self.endThread + 1, self.step):\n      if numThreads == 0 and self.step != 1:\n        numThreads = 1\n      elif numThreads == 0:\n        continue\n\n      for numHosts in self.testHosts:\n        # TODO no cut if 1 host\n        for currentCut in self.cutsToTest:\n          # TODO figure out how to get mpirun hooked up to this\n\n          x = bmk2.RunSpec(self, bmkinput)\n\n          # mpirun setup\n          x.set_binary(\"\", os.path.expandvars(\n                             os.path.join(config.get_var(\"pathToMPIRun\"))))\n          x.set_arg(\"-n=%d\" % numHosts)\n          # TODO set this in config instead?\n          x.set_arg(\"-hosts=peltier,gilbert,oersted\")\n\n          # app setup\n          x.set_arg(os.path.expandvars(\n                         os.path.join(config.get_var(\"pathToApps\"),\n                                      self.relativeAppPath)))\n          x.set_arg(\"-t=%d\" % numThreads)\n\n          # set transpose or symm graph flag\n          if not (bmkinput.props.file).endswith(\".sgr\"):\n            x.set_arg(\"-graphTranspose=%s\" % bmkinput.props.transpose)\n          else:\n            x.set_arg(\"-symmetricGraph\")\n\n          nameToAppend = bmkinput.name\n\n          # partition setup\n          if numHosts != 1:\n            x.set_arg(\"-partition=%s\" % currentCut)\n          else:\n            currentCut = \"single\"\n\n          x.set_arg(bmkinput.props.file, bmk2.AT_INPUT_FILE)\n          x.set_arg(\"-statFile=\" +\n                    os.path.expandvars(\n                      os.path.join(config.get_var(\"logOutputDirectory\"),\n                                   self.getUniqueStatFile(numThreads, numHosts,\n                                            
              currentCut,\n                                                          nameToAppend))))\n\n          listOfRunSpecs.append(x)\n\n          # null checkers/perf checkers\n          x.set_checker(bmk2.PassChecker())\n          x.set_perf(bmk2.ZeroPerf())\n\n          # escape partition loop if only in a single host\n          if (currentCut == \"single\"):\n            break\n\n    return listOfRunSpecs\n\n  def get_run_spec(self, bmkinput, config):\n    return self.get_default_run_specs(bmkinput, config)\n\n\n################################################################################\n# List of apps to test\n################################################################################\n\nclass BFSPush(DistApp):\n  relativeAppPath = \"bfs_push\"\n  benchmark = \"bfs_push\"\n\n  def get_run_spec(self, bmkinput, config):\n    \"\"\"Adds source of bfs\"\"\"\n    specs = self.get_default_run_specs(bmkinput, config)\n\n    for s in specs:\n      s.set_arg(\"-startNode=%s\" % bmkinput.props.source)\n      \n    return specs\n\nclass BFSPull(DistApp):\n  relativeAppPath = \"bfs_pull\"\n  benchmark = \"bfs_pull\"\n\n  def get_run_spec(self, bmkinput, config):\n    \"\"\"Adds source of bfs\"\"\"\n    specs = self.get_default_run_specs(bmkinput, config)\n\n    for s in specs:\n      s.set_arg(\"-startNode=%s\" % bmkinput.props.source)\n      \n    return specs\n\nclass CCPush(DistApp):\n  relativeAppPath = \"cc_push\"\n  benchmark = \"cc_push\"\n  \nclass CCPull(DistApp):\n  relativeAppPath = \"cc_pull\"\n  benchmark = \"cc_pull\"\n\nclass KCorePush(DistApp):\n  relativeAppPath = \"kcore_push\"\n  benchmark = \"kcore_push\"\n\n  def get_run_spec(self, bmkinput, config):\n    \"\"\"Adds kcore num\"\"\"\n    specs = self.get_default_run_specs(bmkinput, config)\n\n    for s in specs:\n      s.set_arg(\"-kcore=100\")\n      \n    return specs\n\nclass KCorePull(DistApp):\n  relativeAppPath = \"kcore_pull\"\n  benchmark = \"kcore_pull\"\n\n  def 
get_run_spec(self, bmkinput, config):\n    \"\"\"Adds kcore num\"\"\"\n    specs = self.get_default_run_specs(bmkinput, config)\n\n    for s in specs:\n      s.set_arg(\"-kcore=100\")\n      \n    return specs\n\nclass PageRankPush(DistApp):\n  relativeAppPath = \"pagerank_push\"\n  benchmark = \"pagerank_push\"\n  # TODO max iterations?\n\nclass PageRankPull(DistApp):\n  relativeAppPath = \"pagerank_pull\"\n  benchmark = \"pagerank_pull\"\n  # TODO max iterations?\n\nclass SSSPPush(DistApp):\n  relativeAppPath = \"sssp_push\"\n  benchmark = \"sssp_push\"\n\n  def get_run_spec(self, bmkinput, config):\n    \"\"\"Adds source of sssp\"\"\"\n    specs = self.get_default_run_specs(bmkinput, config)\n\n    for s in specs:\n      s.set_arg(\"-startNode=%s\" % bmkinput.props.source)\n      \n    return specs\n\nclass SSSPPull(DistApp):\n  relativeAppPath = \"sssp_pull\"\n  benchmark = \"sssp_pull\"\n\n  def get_run_spec(self, bmkinput, config):\n    \"\"\"Adds source of sssp\"\"\"\n    specs = self.get_default_run_specs(bmkinput, config)\n\n    for s in specs:\n      s.set_arg(\"-startNode=%s\" % bmkinput.props.source)\n      \n    return specs\n\n\n################################################################################\n# Specification of binaries to run\n################################################################################\n\n#BINARIES = [BFSPush(), BFSPull()]\nBINARIES = [BFSPush(), BFSPull(), CCPush(), CCPull(), KCorePush(), KCorePull(),\n            PageRankPush(), PageRankPull(), SSSPPush(), SSSPPull()]\n"
  },
  {
    "path": "scripts/experimental/distbmk2/dist.bispec",
    "content": "#v1\nbfs_push rmat24\nbfs_pull rmat24\ncc_push rmat24s\ncc_pull rmat24s\nkcore_push rmat24s\nkcore_pull rmat24s\npagerank_push rmat24\npagerank_pull rmat24\nsssp_push rmat24\nsssp_pull rmat24\n"
  },
  {
    "path": "scripts/experimental/distbmk2/dist.inputdb",
    "content": "[bmktest2]\nversion = 2\nbasepath = /net/ohm/export/iss/dist-inputs\n\n[rmat20.gr]\nflags = \nname = rmat20\nfile = rmat20.gr\ntranspose = /net/ohm/export/iss/dist-inputs/transpose/rmat20.tgr\nformat = bin/galois\n\n[rmat22.gr]\nflags = \nname = rmat22\nfile = rmat22.gr\ntranspose = /net/ohm/export/iss/dist-inputs/transpose/rmat22.tgr\nformat = bin/galois\n\n[rmat24.gr]\nflags = \nname = rmat24\nfile = rmat24.gr\ntranspose = /net/ohm/export/iss/dist-inputs/transpose/rmat24.tgr\nformat = bin/galois\n\n[rmat25.gr]\nflags = \nname = rmat25\nfile = rmat25.gr\ntranspose = /net/ohm/export/iss/dist-inputs/transpose/rmat25.tgr\nformat = bin/galois\n\n[rmat20.sgr]\nflags = \nname = rmat20s\nfile = symmetric/rmat20.sgr\ntranspose = \nformat = bin/galois\n\n[rmat22.sgr]\nflags = \nname = rmat22s\nfile = symmetric/rmat22.sgr\ntranspose = \nformat = bin/galois\n\n[rmat24.sgr]\nflags = \nname = rmat24s\nfile = symmetric/rmat24.sgr\ntranspose = \nformat = bin/galois\n\n[rmat25.sgr]\nflags = \nname = rmat25s\nfile = symmetric/rmat25.sgr\ntranspose = \nformat = bin/galois\n"
  },
  {
    "path": "scripts/experimental/distbmk2/dist.inputprops",
    "content": "[bmktest2-props]\nversion = 2\npaths = \n\n[rmat20]\nsource = 0\n\n[rmat22]\nsource = 0\n\n[rmat24]\nsource = 7601598\n\n[rmat25]\nsource = 7601598\n"
  },
  {
    "path": "scripts/experimental/galois_license_fixer.py",
    "content": "#!/usr/bin/python\nimport re\nimport sys\nimport fileinput\nimport getopt\nimport textwrap\n\nnew_license_text = \"\"\"/*\n * This file belongs to the Galois project, a C++ library for exploiting parallelism.\n * The code is being released under the terms of the 3-Clause BSD License (a\n * copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\\n\\n\"\"\"\n\n\"\"\" remove license from the files.\n    returns: text with license removed\n\"\"\"\ndef commentRemover(text, filename):\n  def replacer(match):\n    s = match.group(0)\n    if s.startswith('/'):\n      return new_license_text # note: a space and not an empty string\n    else:\n      return s\n\n  pattern = re.compile(\n    #r'/\\*.*?\\*/',\n    #r'/\\*.*License.*?\\*/\\s+',\n    #r'/\\*.*License.*?.*The University of Texas at Austin.*?\\*/\\s+',\n    r'/\\*.*This file belongs to the Galois project.*?.*or loss or inaccuracy of data of any kind\\..*?\\*/\\s+',\n    re.DOTALL | re.MULTILINE\n  )\n\n  return re.sub(pattern, replacer, text, 1)\n\ndef licenseFind(text):\n  pattern = 
re.compile(\n    r'/\\*.*This file belongs to the Galois project.*?.*or loss or inaccuracy of data of any kind\\..*?\\*/\\s+',\n    re.DOTALL | re.MULTILINE\n  )\n\n  return re.search(pattern, text)\n\ndef main(argv):\n  inputfile = ''\n  outputfile = ''\n  try:\n    opts, args = getopt.getopt(argv,\"hi:\",[\"ifile=\"])\n  except getopt.GetoptError:\n    print 'remove_galois_license.py -i <inputfile>'\n    sys.exit(2)\n  for opt, arg in opts:\n    if opt == '-h':\n      print 'remove_galois_license.py -i <inputfile>'\n      sys.exit()\n    elif opt in (\"-i\", \"--ifile\"):\n      inputfile = arg\n    print 'Input file is \"', inputfile\n\n    filename = inputfile\n\n  with open(filename, 'r+') as f:\n     originalText = f.read()\n\n     found = licenseFind(originalText)\n     if found == None:\n       f.seek(0)\n       f.write(new_license_text)\n       f.write(originalText)\n     else:\n       uncmtFile = commentRemover(originalText, filename)\n       f.seek(0)\n       f.write(uncmtFile)\n\n     f.truncate()\n     f.close()\n\nif __name__ == \"__main__\":\n  main(sys.argv[1:])\n"
  },
  {
    "path": "scripts/experimental/githubbmk2_setup/README",
    "content": "-----\nbmk2.cfg\n-----\n\nSetup the path to the built apps (Lonestar build directory) and where to output\nlogs for each benchmark here under pathToApps and logOutputDirectory.\n\n-----\nbmktest2.py\n-----\n\nThis file defines all the benchmarks to be run. Each benchmark is defined as a\nclass that inherits from SharedMemApp.\n\nThe runtime looks for relativeAppPath, which points to the executable, and\nbenchmark, which is the name given to that benchmark when outputting things\nand, more importantly, the name used to refer to said benchmark run by bmk2 in\nother files.\n\nIf any arguments to that benchmark need to be specified, you define the\nget_run_spec function to add args. See examples there in the current bmktest2.py\nfile to get a feel of how to do it.\n\nThe SharedMemApp parent class is where default parameters such as threads\nare specified. startThread, endThread, and step are the variables to edit\nfor this purpose.\n\nThe benchmarks that will be run are specified in the BINARIES array. 
For\nexample, the below definition will run BarnesHut and BFS:\n\nBINARIES = [BarnesHut(), BFS()]\n\n-----\nlonestar.inputdb\n-----\n\nInputs to benchmarks are specified here.\n\nTo begin, change the \"basepath\" variable in the header to point to the root\ndirectory where all inputs are stored.\n\nThe format for specifying an input is as follows:\n\n[<input name>]\nflags =\nname = <name to refer to input as in bmk2>\nfile = <relative path to file from root input directory \"basepath\">\nformat = file format\n\nThere are a couple of file formats to be aware of:\n\nbin/galois = binary galois format\nmesh/nodes = mesh format\ntriangles = format for triangle counting\ntext = text format (for things like points-to-analysis)\nnothing = no file is passed in (for things like self generated input)\n\n-----\nlonestar.inputprops\n-----\n\nAdditional properties for certain inputs can be specified in the inputprops\nfile.\n\n[<same input name used in header for input in inputdb>]\n<additional property>=<what additional property is>\n\nFor example, I can specify the transpose graph as follows:\n\n[soc-livejournal]\nptranspose=/net/ohm/export/iss/inputs/unweighted/soc-LiveJournal1.ptgr\n\nIt can then be referred to in the bmktest2.py python script as the example\nbelow shows:\n\ns.set_arg(\"-graphTranspose=%s\" % bmkinput.props.ptranspose)\n\n-----\nlonestar.bispec\n-----\n\nThis file is where one specifies which input to run with a particular bmk2\nbenchmark. Use the name given in the benchmark variable in bmktest2 to\nrefer to a benchmark and the name given under the name variable in\nlonestar.inputdb to refer to an input. For example, the line below says to\nrun bfs with the twitter40 input.\n\nbfs twitter40\n\n-----\nHow to run:\n-----\n\ndefaultrunscript.sh has a default command to start the bmk2 runtime and use the\nconfig specified in the directory you run it from.\n"
  },
  {
    "path": "scripts/experimental/githubbmk2_setup/bmk2.cfg",
    "content": "[bmk2]\nversion=2\ninputdb=lonestar.inputdb\ninputprops=lonestar.inputprops\nbispec=lonestar.bispec\npathToApps=${BMK_LONESTAR_PATH}\nlogOutputDirectory=${BMK_LOGS}\n"
  },
  {
    "path": "scripts/experimental/githubbmk2_setup/bmkprops.py",
    "content": "import bmk2\nimport datetime\n\nTIME_FMT = \"%Y-%m-%d %H:%M:%S\"\n\nclass GraphBMKSharedMem(bmk2.Binary):\n    \"\"\"Base class for shared memory benchmarks to inherit from. Subclasses\n    need to specify benchmark name + number of threads.\n    \"\"\"\n    def __init__(self):\n        \"\"\"Initialize shared mem properties.\"\"\"\n        self.props = GraphBMKSharedMemProps(self.benchmark)\n        \n    def get_id(self):\n        \"\"\"Return the id of this benchmark.\"\"\"\n        return \"%s\" % (self.benchmark)\n\n    def getUniqueStatFile(self, numThreads, graphName):\n        \"\"\"Get a statfile name given num threads + graph name being used.\"\"\"\n        timeNow = datetime.datetime.now().strftime(TIME_FMT).replace(\" \", \"_\")\n\n        return (\"%s_%d_%s_%s.log\" % (self.benchmark, numThreads, graphName,\n                               timeNow))\n\nclass GraphBMKSharedMemProps(bmk2.Properties):\n    \"\"\"Properties pertaining to shared memory.\"\"\"\n    def __init__(self, benchmark):\n        self.benchmark = benchmark\n"
  },
  {
    "path": "scripts/experimental/githubbmk2_setup/bmktest2.py",
    "content": "import bmk2\nfrom bmkprops import GraphBMKSharedMem\nimport os\n\nclass SharedMemApp(GraphBMKSharedMem):\n    \"\"\"Base class that has default run spec construction behavior for\n    most if not all shared memory apps.\n    \"\"\"\n    # thread to start from\n    startThread = 40\n    # thread to end at (inclusive)\n    endThread = 40\n    # step to use for looping through threads\n    step = 1\n\n    def filter_inputs(self, inputs):\n        \"\"\"Ignore inputs that aren't currently supported.\"\"\"\n        def finput(x):\n            if x.props.format == 'bin/galois': return True\n            if x.props.format == 'mesh': return True\n            if x.props.format == 'mesh/nodes': return True\n            if x.props.format == 'triangles': return True\n            if x.props.format == 'text': return True\n            if x.props.format == 'nothing': return True\n\n            return False\n\n        return filter(finput, inputs)\n\n    def get_default_run_specs(self, bmkinput, config):\n        \"\"\"Creates default run specifications with common arguments for all\n        shared memory benchmarks and returns them. 
They can be modified\n        later according to the benchmark that you want to run.\n        \"\"\"\n        assert config != None # config should be passed through test2.py\n        listOfRunSpecs = []\n\n        for numThreads in range(self.startThread, self.endThread + 1, self.step):\n            if numThreads == 0 and self.step != 1:\n              numThreads = 1\n            elif numThreads == 0:\n              continue\n\n            x = bmk2.RunSpec(self, bmkinput)\n\n            x.set_binary(\"\", os.path.expandvars(\n                               os.path.join(config.get_var(\"pathToApps\"),\n                                          self.relativeAppPath)))\n            x.set_arg(\"-t=%d\" % numThreads)\n\n            nameToAppend = bmkinput.name\n\n            if bmkinput.props.format == \"nothing\":\n                nameToAppend = \"gen\"\n                pass\n            elif bmkinput.props.format != \"mesh\":\n                x.set_arg(bmkinput.props.file, bmk2.AT_INPUT_FILE)\n            else: # mesh\n                # don't specify with input file flag as it doesn't exist (mesh\n                # loads multiple files, so the file specified in the inputdb\n                # isn't an actual file\n                x.set_arg(bmkinput.props.file)\n\n            x.set_arg(\"-statFile=\" +\n                      os.path.expandvars(\n                        os.path.join(config.get_var(\"logOutputDirectory\"),\n                                     self.getUniqueStatFile(numThreads,\n                                     nameToAppend))\n                      ))\n\n            listOfRunSpecs.append(x)\n\n            x.set_checker(bmk2.PassChecker())\n            x.set_perf(bmk2.ZeroPerf())\n\n        return listOfRunSpecs\n\n    def get_run_spec(self, bmkinput, config):\n        return self.get_default_run_specs(bmkinput, config)\n\n################################################################################\n\nclass BarnesHut(SharedMemApp):\n    
relativeAppPath = \"barneshut/barneshut\"\n    benchmark = \"barneshut\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds barnes hut specific arguments\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-n=100000\")\n            s.set_arg(\"-steps=1\")\n            s.set_arg(\"-seed=0\")\n\n        return specs\n\nclass BCAsync(SharedMemApp):\n    relativeAppPath = \"betweennesscentrality/bc-async\"\n    benchmark = \"bc-async\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"BC async command line setup\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            # do 5 nodes with edges\n            s.set_arg(\"-numOfOutSources=5\")\n\n        return specs\n\nclass BCOuter(SharedMemApp):\n    relativeAppPath = \"betweennesscentrality/betweennesscentrality-outer\"\n    benchmark = \"bc-outer\"\n\nclass BFS(SharedMemApp):\n    relativeAppPath = \"bfs/bfs\"\n    benchmark = \"bfs\"\n\nclass Boruvka(SharedMemApp):\n    relativeAppPath = \"boruvka/boruvka\"\n    benchmark = \"boruvka\"\n\nclass BoruvkaMerge(SharedMemApp):\n    relativeAppPath = \"boruvka/boruvka-merge\"\n    benchmark = \"boruvka-merge\"\n\nclass Clustering(SharedMemApp):\n    relativeAppPath = \"clustering/clustering\"\n    benchmark = \"clustering\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Clustering command line setup\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-numPoints=100\") # num points to cluster\n\n        return specs\n\nclass ConnectedComponents(SharedMemApp):\n    relativeAppPath = \"connectedcomponents/connectedcomponents\"\n    benchmark = \"connectedcomponents\"\n\nclass DelaunayTriangulation(SharedMemApp):\n    relativeAppPath = \"delaunaytriangulation/delaunaytriangulation\"\n    benchmark = \"delaunaytriangulation\"\n\nclass 
DelaunayTriangulationDet(SharedMemApp):\n    relativeAppPath = \"delaunaytriangulation/delaunaytriangulation-det\"\n    benchmark = \"delaunaytriangulation-det\"\n\nclass DMR(SharedMemApp):\n    relativeAppPath = \"delaunayrefinement/delaunayrefinement\"\n    benchmark = \"dmr\"\n\nclass GMetis(SharedMemApp):\n    relativeAppPath = \"gmetis/gmetis\"\n    benchmark = \"gmetis\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds gmetis specific arguments (num partitions)\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"256\") # num of partitions\n\n        return specs\n\nclass IndependentSet(SharedMemApp):\n    relativeAppPath = \"independentset/independentset\"\n    benchmark = \"independentset\"\n\n# triggers caps for matrix completion\nMCCAP = False\n\nclass MatrixCompletionSync(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-sync\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=syncALS\") # algo type\n            s.set_arg(\"-lambda=0.001\")\n            s.set_arg(\"-learningRate=0.01\")\n            s.set_arg(\"-learningRateFunction=intel\")\n            s.set_arg(\"-tolerance=0.01\")\n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n\n        return specs\n\nclass MatrixCompletionSimple(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-simple\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, 
config)\n\n        for s in specs:\n            s.set_arg(\"-algo=simpleALS\") # algo type\n            s.set_arg(\"-lambda=0.001\")\n            s.set_arg(\"-learningRate=0.01\")\n            s.set_arg(\"-learningRateFunction=intel\")\n            s.set_arg(\"-tolerance=0.01\")\n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n\n        return specs\n\nclass MatrixCompletionEdge(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-edge\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=sgdBlockEdge\") # algo type\n            s.set_arg(\"-lambda=0.001\")\n            s.set_arg(\"-learningRate=0.01\")\n            s.set_arg(\"-learningRateFunction=intel\")\n            s.set_arg(\"-tolerance=0.01\")\n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n\n        return specs\n\nclass MatrixCompletionJump(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-jump\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=sgdBlockJump\") # algo type\n            s.set_arg(\"-lambda=0.001\")\n            s.set_arg(\"-learningRate=0.01\")\n            s.set_arg(\"-learningRateFunction=intel\")\n            s.set_arg(\"-tolerance=0.01\")\n            
s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n\n        return specs\n\nclass MatrixCompletionByItems(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-byitems\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=sgdByItems\") # algo type\n            s.set_arg(\"-lambda=0.001\")\n            s.set_arg(\"-learningRate=0.01\")\n            s.set_arg(\"-learningRateFunction=intel\")\n            s.set_arg(\"-tolerance=0.01\")\n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n\n        return specs\n\nclass MatrixCompletionByEdges(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-byedges\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=sgdByEdges\") # algo type\n            s.set_arg(\"-lambda=0.001\")\n            s.set_arg(\"-learningRate=0.01\")\n            s.set_arg(\"-learningRateFunction=intel\")\n            s.set_arg(\"-tolerance=0.01\")\n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n\n        return specs\n\nclass MCM(SharedMemApp):\n    
relativeAppPath = \"matching/bipartite-mcm\"\n    benchmark = \"mcm\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds bipartite matching specific arguments\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-abmpAlgo\")\n            s.set_arg(\"-inputType=generated\")\n            s.set_arg(\"-n=100\") # nodes in each bipartite set\n            s.set_arg(\"-numEdges=10000\")\n            s.set_arg(\"-numGroups=100\")\n            s.set_arg(\"-seed=0\") # seed for rng; keep it consistent\n\n        return specs\n\n\nclass PageRankPull(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank-pull\"\n    benchmark = \"pagerank-pull\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds tolerance argument\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-tolerance=0.01\") # pagerank tolerance\n\n        return specs\n\nclass PageRankPullTopo(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank-pull\"\n    benchmark = \"pagerank-pull-topo\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds tolerance argument and algorithm setting\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-tolerance=0.01\") # pagerank tolerance\n            s.set_arg(\"-algo=Topo\") # pagerank tolerance\n\n        return specs\n\nclass PageRankPush(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank-push\"\n    benchmark = \"pagerank-push\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds tolerance argument\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-tolerance=0.01\") # pagerank tolerance\n\n        return specs\n\nclass PageRankPushSync(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank-push\"\n    benchmark = 
\"pagerank-push-sync\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds tolerance argument and algo setting\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-tolerance=0.01\") # pagerank tolerance\n            s.set_arg(\"-algo=Sync\") # pagerank tolerance\n\n        return specs\n\n# for galois 2.2 version of pagerank\nclass PageRank2Point2(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank\"\n    benchmark = \"pagerank22\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds transpose graph.\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-graphTranspose=%s\" % bmkinput.props.ptranspose)\n            s.set_arg(\"-maxIterations=1000\")\n\n        return specs\n\nclass PreflowPush(SharedMemApp):\n    relativeAppPath = \"preflowpush/preflowpush\"\n    benchmark = \"preflowpush\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds preflow push specific arguments\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"0\") # source id\n            s.set_arg(\"100\") # sink id\n\n        return specs\n\nclass PointsToAnalysis(SharedMemApp):\n    relativeAppPath = \"pointstoanalysis/pta\"\n    benchmark = \"pta\"\n\nclass SpanningTree(SharedMemApp):\n    relativeAppPath = \"spanningtree/spanningtree\"\n    benchmark = \"spanningtree\"\n\nclass SSSP(SharedMemApp):\n    relativeAppPath = \"sssp/sssp\"\n    benchmark = \"sssp\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds delta argument to runs.\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        # 0 is best for twitter50\n        # 8 seems best for r4-2e26\n        for s in specs:\n            #s.set_arg(\"-delta=0\")\n            s.set_arg(\"-delta=8\")\n\n        return specs\n\nclass 
SurveyPropagation(SharedMemApp):\n    relativeAppPath = \"surveypropagation/surveypropagation\"\n    benchmark = \"surveypropagation\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds survey prop arguments to runs.\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"9\") # random generator seed\n            s.set_arg(\"100\") # number of vars\n            s.set_arg(\"100\") # number of clauses\n            s.set_arg(\"3\") # vars per clause\n\n            # below are args used by runs on galois website\n            #s.set_arg(\"1000000\") # number of vars\n            #s.set_arg(\"3000000\") # number of clauses\n            #s.set_arg(\"3\") # vars per clause\n\n        return specs\n\nclass TrianglesNode(SharedMemApp):\n    relativeAppPath = \"triangles/triangles\"\n    benchmark = \"triangles-node\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Specifies node version\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=nodeiterator\")\n\n        return specs\n\nclass TrianglesEdge(SharedMemApp):\n    relativeAppPath = \"triangles/triangles\"\n    benchmark = \"triangles-edge\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Specifies edge version\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=edgeiterator\")\n\n        return specs\n\n# specification of binaries to run\n# apps present in Galois 2.2\n#BINARIES = [BarnesHut(), BFS(), BCOuter(), Boruvka(), BoruvkaMerge(),\n#            Clustering(), ConnectedComponents(), DelaunayTriangulation(), DMR(),\n#            GMetis(), IndependentSet(), MCM(), PageRankPull(), PageRankPush(),\n#            PreflowPush(), SpanningTree(), SSSP(), SurveyPropagation()]\n\n# single benchmark run\n#BINARIES = [SurveyPropagation(), TrianglesNode(), 
TrianglesEdge()]\n\nBINARIES = [BarnesHut(), BFS(), BCOuter(), Boruvka(),\n            ConnectedComponents(), DelaunayTriangulation(), DMR(),\n            GMetis(), IndependentSet(), MatrixCompletionSync(),\n            MatrixCompletionSimple(), MatrixCompletionEdge(),\n            MatrixCompletionJump(), MatrixCompletionByItems(),\n            MatrixCompletionByEdges(), MCM(), PageRankPull(), PageRankPush(),\n            PreflowPush(), PointsToAnalysis(), SSSP(), SurveyPropagation(),\n            TrianglesNode(), TrianglesEdge()]\n"
  },
  {
    "path": "scripts/experimental/githubbmk2_setup/defaultrunscript.sh",
    "content": "#!/bin/bash\n\npython ../bmk2/test2.py --max-output-bytes 0 --log ${BMK_LOGS}/bmkrunlog.log --verbose run\n"
  },
  {
    "path": "scripts/experimental/githubbmk2_setup/lonestar.bispec",
    "content": "#v1\nbarneshut nothing\nbfs rmat10\nbc-outer rmat10\nboruvka rmat10\nboruvka-merge rmat10\nclustering nothing\nconnectedcomponents rmat10-sym\ndelaunaytriangulation 250kn\ndelaunaytriangulation-det 250kn\ndmr 250k\ngmetis rmat10\nindependentset rmat10-sym\nmatrixcompletion-sync bgg\nmatrixcompletion-simple bgg\nmatrixcompletion-edge bgg\nmatrixcompletion-jump bgg\nmatrixcompletion-byitems bgg\nmatrixcompletion-byedges bgg\nmcm nothing\npagerank-pull rmat10-transpose\npagerank-pull-topo rmat10-transpose\npagerank-push rmat10\npagerank-push-sync rmat10\npreflowpush rmat10\npta gdb_constraints\nspanningtree rmat10\nsssp rmat10\nsurveypropagation nothing\ntriangles-node dblp-triangles\ntriangles-edge dblp-triangles\n\n"
  },
  {
    "path": "scripts/experimental/githubbmk2_setup/lonestar.inputdb",
    "content": "[bmktest2]\nversion = 2\nbasepath = ../../../build/small_inputs\n\n[rmat10]\nflags =\nname = rmat10\nfile = scalefree/rmat16-2e10-a=0.57-b=0.19-c=0.19-d=.05.gr\nformat = bin/galois\n\n[rmat10-transpose]\nflags =\nname = rmat10-transpose\nfile = scalefree/transpose/rmat16-2e10-a=0.57-b=0.19-c=0.19-d=.05.tgr\nformat = bin/galois\n\n[rmat10-symmetric]\nflags =\nname = rmat10-sym\nfile = scalefree/symmetric/rmat16-2e10-a=0.57-b=0.19-c=0.19-d=.05.sgr\nformat = bin/galois\n\n[dblp-triangles]\nflags =\nname = dblp-triangles\nfile = stanford/communities/DBLP/com-dblp.wgt32.sym.gr.triangles\nformat = triangles\n\n[weighted/bipartite/bgg]\nflags =\nname = bgg\nfile = weighted/bipartite/floatEdgeWts/bgg.gr\nformat = bin/galois\n\n[meshes/250k.2]\nflags =\nname = 250k\nfile = meshes/250k.2\nformat = mesh\n\n[meshes/250k.2n]\nflags =\nname = 250kn\nfile = meshes/250k.2.node\nformat = mesh\n\n[scalefree/rmat8-2e14.gr]\nflags =\nname = rmat8-2e14\nfile = scalefree/deprecated/rmat8-2e14.gr\nformat = bin/galois\n\n[random/r4-2e26.gr]\nflags =\nname = r4-2e26\nfile = random/r4-2e26.gr\nformat = bin/galois\n\n[random/r4-2e26.sgr]\nflags =\nname = r4-2e26-symmetric\nfile = random/symmetric/r4-2e26.gr\nformat = bin/galois\n\n[random/rmat25.rsgr]\nflags =\nname = rmat25-rsymmetric\nfile = scalefree/randomized/symmetric/rmat16-2e25-a=0.57-b=0.19-c=0.19-d=.05.srgr\nformat = bin/galois\n\n[road/USA-road-d.USA.gr]\nflags =\nname = USA-road-d.USA\nfile = road/USA-road-d.USA.gr\nformat = bin/galois\n\n[unweighted/uk-2007-05.sgr]\nflags =\nname = uk-2007-05-symmetric\nfile = unweighted/uk-2007-05.sgr\nformat = bin/galois\n\n[scalefree/symmetric/rmat26.sgr]\nflags =\nname = rmat26-symmetric\nfile = scalefree/symmetric/rmat16-2e26-a=0.57-b=0.19-c=0.19-d=.05.sgr\nformat = bin/galois\n\n[scalefree/rmat28.gr]\nflags =\nname = rmat28\nfile = scalefree/rmat16-2e28-a=0.57-b=0.19-c=0.19-d=.05.gr\nformat = bin/galois\n\n[scalefree/random/rmat27.sgr]\nflags =\nname = 
rmat27-rsymmetric\nfile = scalefree/randomized/symmetric/rmat16-2e27-a=0.57-b=0.19-c=0.19-d=.05.srgr\nformat = bin/galois\n\n[scalefree/random/rmat28.sgr]\nflags =\nname = rmat28-symmetric\nfile = scalefree/randomized/symmetric/rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.srgr\nformat = bin/galois\n\n[meshes/r5M]\nflags =\nname = r5M\nfile = meshes/r5M\nformat = mesh\n\n[meshes/r5Mn]\nflags =\nname = r5Mn\nfile = meshes/r5M.node\nformat = mesh/nodes\n\n[twitter40]\nflags =\nname = twitter40\nfile = unweighted/twitter-WWW10-component.gr\nformat = bin/galois\n\n[twitter40-transpose]\nflags =\nname = twitter40-transpose\nfile = unweighted/twitter-WWW10-component-transpose.gr\nformat = bin/galois\n\n[twitter40-symmetric]\nflags =\nname = twitter40-symmetric\nfile = unweighted/twitter-WWW10-component-symmetric.gr\nformat = bin/galois\n\n[twitter50]\nflags =\nname = twitter50\nfile = unweighted/twitter-ICWSM10-component.gr\nformat = bin/galois\n\n[twitter50w]\nflags =\nname = twitter50w\nfile = unweighted/withRandomWeights/twitter-ICWSM10-component_withRandomWeights.gr\nformat = bin/galois\n\n[twitter50-transpose]\nflags =\nname = twitter50-transpose\nfile = unweighted/twitter-ICWSM10-component-transpose.gr\nformat = bin/galois\n\n[soc-livejournal]\nflags =\nname = soc-livejournal\nfile = unweighted/soc-LiveJournal1.gr\nformat = bin/galois\n\n[com-lj]\nflags =\nname = com-lj\nfile = stanford/communities/LiveJournal/com-lj.wgt32.sym.gr.triangles\nformat = triangles\n\n[weighted/bipartite/yahoo]\nflags =\nname = yahoo-music\nfile = weighted/bipartite/yahoo.gr\nformat = bin/galois\n\n[weighted/bipartite/netflix]\nflags =\nname = netflix\nfile = weighted/bipartite/floatEdgeWts/netflix.gr\nformat = bin/galois\n\n[java/pta/tshark_constraints]\nflags =\nname = tshark_constraints\nfile = java/pta/tshark_constraints.txt\nformat = text\n\n[java/pta/gdb_constraints]\nflags =\nname = gdb_constraints\nfile = java/pta/gdb_constraints.txt\nformat = text\n\n[nothing]\nflags =\nname = 
nothing\nfile = \nformat = nothing\n"
  },
  {
    "path": "scripts/experimental/githubbmk2_setup/lonestar.inputprops",
    "content": "[bmktest2-props]\nversion = 2\npaths = \n\n[soc-livejournal]\nptranspose=/net/ohm/export/iss/inputs/unweighted/soc-LiveJournal1.ptgr\n\n[twitter40]\nptranspose=/net/ohm/export/iss/inputs/unweighted/twitter-WWW10-component.ptgr\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/CPU_run_scripts_stampede/ruby_BFS_CC_SSSP_rmat_USA_twitter_Pull.sh",
    "content": "#!/bin/bash\nbenchmark=$2\nvariant=$3\npartition=$4\ncomm_mode=\"0\"\n\nalgo=$benchmark\"_pull-\"$variant\necho $algo\n\n./$algo \"-help\"\n\nif [ $partition != \"0\" ]; then\n  echo \"Parition should be 0 since it's edgeCut : Exiting\"\n  exit\nfi\n\nENV_options=\"MV2_USE_LAZY_MEM_UNREGISTER=0 MV2_ENABLE_AFFINITY=0 GALOIS_DO_NOT_BIND_THREADS=1\"\n\nGRAPH_rmat=\"/work/02982/ggill0/Distributed_latest/inputs/pagerank/galois/scalefree/NEW/transpose/rmat16-2e25-a=0.57-b=0.19-c=0.19-d=.05.transpose.gr\"\nGRAPH_usa=\"/work/02982/ggill0/Distributed_latest/inputs/pagerank/galois/road/USA-road-d.USA-trans.gr\"\n\nGRAPH_twitter=\"/scratch/03279/roshand/dist-inputs/transpose/twitter-ICWSM10-component.tgr\"   #\"/work/02982/ggill0/Distributed_latest/inputs/pagerank/galois/withRandomWeights/transpose/twitter-ICWSM10-component_withRandomWeights.transpose.gr\"\nGRAPH_rmat28=\"/scratch/03279/roshand/dist-inputs/transpose/rmat28.tgr\" #Randomized rmat28\nGRAPH_rmat15=\"/scratch/03279/roshand/dist-inputs/transpose/rmat15.tgr\" #Randomized rmat28\n\ncmd_options_reset=\"-maxIterations=10000 -verify=0 -t=$1 -enableVertexCut=${partition}\"\ncmd_options=\"-maxIterations=10000 -verify=0 -t=$1 -enableVertexCut=${partition}\"\n\nif [ $benchmark = \"pagerank\" ]; then\n\tcmd_options=$cmd_options\"  -tolerance=0.0000001\"\nfi\n\nif [ $benchmark = \"cc\" ]; then\n  GRAPH_rmat28=\"/scratch/03279/roshand/dist-inputs/symmetric/rmat28.sgr\"\n  GRAPH_twitter=\"/scratch/03279/roshand/dist-inputs/symmetric/twitter-ICWSM10-component.sgr\"\nfi\n\n\n#RMAT28\nif [ $5 = \"rmat28\" ]; then\n  if [ $benchmark = \"bfs\" ] || [ $benchmark = \"sssp\" ]; then\n    cmd_options=$cmd_options\" -startNode=155526494\"\n  fi\nruby /work/02982/ggill0/Distributed_latest/scripts/stampede_jobs.rb  -t \"01:42:30\" -q \"normal\" -n 4 -N 4 -i dist_run_script_generated -o ./LOG_RUNS/LOG_${algo}_TH$1\\_ECUT\\_${partition}\\_rmat28.tgr_  -A \"Galois\" -c \"$ENV_options ibrun ./$algo $GRAPH_rmat28 $cmd_options 
\" -s $6  -e $7 -k 2\nfi\n\n#  twitter ICWSM ##sssp\nif [ $8 = \"twitter\" ]; then\n  if [ $benchmark = \"bfs\" ] || [ $benchmark = \"sssp\" ]; then\n    cmd_options=$cmd_options_reset\n    cmd_options=$cmd_options\" -startNode=33643219\"\n  fi\nruby /work/02982/ggill0/Distributed_latest/scripts/stampede_jobs.rb  -t \"01:48:00\" -q \"normal\" -n 4 -N 4 -i dist_run_script_generated -o ./LOG_RUNS/LOG_${algo}_TH$1\\_ECUT\\_${partition}\\_Twitter-ICWSM10.tgr_  -A \"Galois\"  -c \"$ENV_options ibrun ./$algo $GRAPH_twitter $cmd_options\" -s $9  -e ${10}  -k 2\nfi\n\n#RMAT15\nif [[ ${11} = \"rmat15\" ]]; then\n  if [ $benchmark = \"bfs\" ] || [ $benchmark = \"sssp\" ]; then\n    cmd_options=$cmd_options\" -startNode=0\"\n  fi\nruby /work/02982/ggill0/Distributed_latest/scripts/stampede_jobs.rb  -t \"00:12:30\" -q \"development\" -n 4 -N 4 -i dist_run_script_generated -o ./DEV_RUNS/DEV_LOG_${algo}_TH$1\\_ECUT\\_${partition}\\_rmat15.tgr_  -A \"Galois\" -c \"$ENV_options ibrun ./$algo $GRAPH_rmat15 $cmd_options \" -s ${12}  -e ${13} -k 2\nfi\n\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/CPU_run_scripts_stampede/ruby_BFS_CC_SSSP_rmat_USA_twitter_Pull_Vcut.sh",
    "content": "#!/bin/bash\n\nbenchmark=${2}\nvariant=${3}\npartition=${4}\ncomm_mode=\"0\"\nqueue=$9\n\nalgo=$benchmark\"_pull-\"$variant\n#algo=$benchmark\"_pull-\"$variant\"_\"$partition\necho $algo\n\n./$algo \"-help\"\n\nENV_options=\"MV2_USE_LAZY_MEM_UNREGISTER=0 MV2_ENABLE_AFFINITY=0 GALOIS_DO_NOT_BIND_THREADS=1\"\n\n#GRAPH_rmat=\"/work/02982/ggill0/Distributed_latest/inputs/pagerank/galois/scalefree/NEW/transpose/rmat16-2e25-a=0.57-b=0.19-c=0.19-d=.05.transpose.gr\"\n#GRAPH_usa=\"/work/02982/ggill0/Distributed_latest/inputs/pagerank/galois/road/USA-road-d.USA-trans.gr\"\n\n\nGRAPH_twitter=\"/scratch/03279/roshand/dist-inputs/transpose/twitter-ICWSM10-component.tgr\"\n\nGRAPH_rmat28=\"/scratch/03279/roshand/dist-inputs/transpose/rmat28.tgr\" #Randomized rmat28\n\ncmd_options_reset=\"-maxIterations=10000 -verify=0 -t=$1 -enableVertexCut=${partition}  \"\ncmd_options=\"-maxIterations=10000 -verify=0 -t=$1 -enableVertexCut=${partition} \"\n\nif [ $benchmark = \"pagerank\" ]; then\n\tcmd_options=$cmd_options\"  -tolerance=0.0000001\"\nfi\n\nif [ $benchmark = \"cc\" ]; then\n  GRAPH_rmat28=\"/scratch/03279/roshand/dist-inputs/symmetric/rmat28.sgr\"\n  GRAPH_twitter=\"/scratch/03279/roshand/dist-inputs/symmetric/twitter-ICWSM10-component.sgr\"\nfi\n\n\nif [ $5 = \"rmat28\" ]; then\n  if [ $benchmark = \"bfs\" ] || [ $benchmark = \"sssp\" ]; then\n    cmd_options=$cmd_options\" -startNode=155526494\"\n  fi\n\n  #partFileBase=\"/scratch/01131/rashid/inputs/partitioned\"\n  partFileBase=\"/scratch/02982/ggill0/dist_inputs/partitioned\"\n  for i in $6\n  do\n    if [ $benchmark != \"cc\" ]; then\n      partFileType=\"rmat28-trans\"\n      if [ $i == 2 ]; then\n        partFileExt=\"rmat28.tgr\"\n      elif [ $i == 4 ]; then\n        partFileExt=\"rmat28.tgr\"\n      elif [ $i == 8 ]; then\n        partFileExt=\"rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.trgr\"\n      elif [ $i == 16 ]; then\n        partFileExt=\"rmat28.tgr\"\n      elif [ $i == 32 ]; then\n        
partFileExt=\"rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.trgr\"\n      elif [ $i == 64 ]; then\n        partFileExt=\"rmat28.tgr\"\n      elif [ $i == 128 ]; then\n        partFileExt=\"rmat28.tgr\"\n        #partFileExt=\"rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.trgr\"\n      elif [ $i == 256 ]; then\n        partFileExt=\"rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.trgr\"\n      fi\n    else\n      partFileExt=\"rmat28.sgr\"\n      partFileType=\"rmat28-sym\"\n    fi\n\n    ruby /work/02982/ggill0/Distributed_latest/scripts/stampede_jobs.rb  -t \"01:45:00\" -q $queue -n 4 -N 4 -i dist_run_script_generated -o  ./LOG_RUNS/LOG_${algo}_TH$1\\_VCUT\\_${partition}\\_rmat28.tgr_  -A \"Galois\" -c \"$ENV_options ibrun ./$algo $GRAPH_rmat28  -partFolder=$partFileBase/$i/$partFileType/$partFileExt $cmd_options\" -s $i  -e $i  -k 2\n  done\nfi\n\n\n\nif [ $7 = \"twitter\" ]; then\n  if [ $benchmark = \"bfs\" ] || [ $benchmark = \"sssp\" ]; then\n    cmd_options=$cmd_options_reset\n    cmd_options=$cmd_options\" -startNode=33643219\"\n  fi\n\n  partFileBase=\"/scratch/01131/rashid/inputs/partitioned\"\n  for i in $8\n  do\n    if [ $benchmark != \"cc\" ]; then\n       partFileExt=\"twitter-ICWSM10-component_withRandomWeights.transpose.gr\"\n       partFileType=\"twitter-trans\"\n    else\n      partFileExt=\"twitter-ICWSM10-component.sgr\"\n      partFileType=\"twitter-sym\"\n    fi\n\n    ruby /work/02982/ggill0/Distributed_latest/scripts/stampede_jobs.rb -t \"01:25:00\" -q $queue -n 4 -N 4 -i dist_run_script_generated -o  ./LOG_RUNS/LOG_${algo}_TH$1\\_VCUT\\_${partition}\\_Twitter-ICWSM10_  -A \"Galois\" -c \"$ENV_options ibrun ./$algo $GRAPH_twitter  -partFolder=$partFileBase/$i/$partFileType/$partFileExt $cmd_options\" -s $i  -e $i  -k 2\n  done\nfi\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/CPU_run_scripts_stampede/ruby_BFS_CC_SSSP_rmat_USA_twitter_Push.sh",
    "content": "#!/bin/bash\nbenchmark=$2\nvariant=$3\npartition=$4\ncomm_mode=${11}\n\nalgo=$benchmark\"_push-\"$variant\"_edge-cut\"\necho $algo\n\n./$algo \"-help\"\n\nENV_options=\"MV2_USE_LAZY_MEM_UNREGISTER=0 MV2_ENABLE_AFFINITY=0 GALOIS_DO_NOT_BIND_THREADS=1\"\n\n\nGRAPH_twitter=\"/scratch/03279/roshand/dist-inputs/twitter-ICWSM10-component.gr\"\nGRAPH_rmat28=\"/scratch/03279/roshand/dist-inputs/rmat28.gr\" #Randomized rmat28\n\ncmd_options_reset=\"-maxIterations=10000 -verify=0 -t=$1 -comm_mode=${comm_mode}\"\ncmd_options=\"-maxIterations=10000 -verify=0 -t=$1 -comm_mode=${comm_mode}\"\n\nif [ $benchmark = \"pagerank\" ]; then\n\tcmd_options=$cmd_options\"  -tolerance=0.0000001\"\nfi\n\nif [ $benchmark = \"cc\" ]; then\n  GRAPH_rmat28=\"/scratch/03279/roshand/dist-inputs/symmetric/rmat28.sgr\"\n  GRAPH_twitter=\"/scratch/03279/roshand/dist-inputs/symmetric/twitter-ICWSM10-component.sgr\"\nfi\n\n\n#RMAT25\nif [ $5 = \"rmat28\" ]; then\n  if [ $benchmark = \"bfs\" ] || [ $benchmark = \"sssp\" ]; then\n    cmd_options=$cmd_options\" -startNode=155526494\"\n  fi\nruby ../../../../../../Distributed_latest/scripts/stampede_jobs.rb  -t \"01:42:30\" -q \"normal\" -n 4 -N 4 -i dist_run_script_generated -o ./LOG_jul_31/LOG_${algo}_TH$1\\_CM${comm_mode}\\_rmat28.rgr_  -A \"Galois\" -c \"$ENV_options ibrun ./$algo $GRAPH_rmat28 $cmd_options \" -s $6  -e $7 -k 2\nfi\n\n#  twitter ICWSM ##sssp\nif [ $8 = \"twitter\" ]; then\n  if [ $benchmark = \"bfs\" ] || [ $benchmark = \"sssp\" ]; then\n    cmd_options=$cmd_options_reset\n    cmd_options=$cmd_options\" -startNode=33643219\"\n  fi\nruby ../../../../../../Distributed_latest/scripts/stampede_jobs.rb  -t \"01:48:00\" -q \"normal\" -n 4 -N 4 -i dist_run_script_generated -o ./LOG_jul_31/LOG_${algo}_TH$1\\_CM${comm_mode}\\_Twitter-ICWSM10_  -A \"Galois\"  -c \"$ENV_options ibrun ./$algo $GRAPH_twitter $cmd_options\" -s $9  -e ${10}  -k 2\nfi\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/CPU_run_scripts_stampede/ruby_BFS_CC_SSSP_rmat_USA_twitter_Push_Vcut.sh",
    "content": "#!/bin/bash\n\nbenchmark=${2}\nvariant=${3}\npartition=${4}\ncomm_mode=\"0\"\nqueue=$9\n\nalgo=$benchmark\"_push-\"$variant\"_vertex-cut\"\n#algo=$benchmark\"_pull-\"$variant\"_\"$partition\necho $algo\n\n./$algo \"-help\"\n\nENV_options=\"MV2_USE_LAZY_MEM_UNREGISTER=0 MV2_ENABLE_AFFINITY=0 GALOIS_DO_NOT_BIND_THREADS=1\"\n\n#GRAPH_rmat=\"/work/02982/ggill0/Distributed_latest/inputs/pagerank/galois/scalefree/NEW/transpose/rmat16-2e25-a=0.57-b=0.19-c=0.19-d=.05.transpose.gr\"\n#GRAPH_usa=\"/work/02982/ggill0/Distributed_latest/inputs/pagerank/galois/road/USA-road-d.USA-trans.gr\"\n\n\nGRAPH_twitter=\"/scratch/03279/roshand/dist-inputs/twitter-ICWSM10-component.gr\"\nGRAPH_rmat28=\"/scratch/03279/roshand/dist-inputs/rmat28.gr\" #Randomized rmat28\n\n\ncmd_options_reset=\"-maxIterations=10000 -verify=0 -t=$1\"\ncmd_options=\"-maxIterations=10000 -verify=0 -t=$1\"\n\nif [ $benchmark = \"pagerank\" ]; then\n\tcmd_options=$cmd_options\"  -tolerance=0.0000001\"\nfi\n\nif [ $benchmark = \"cc\" ]; then\n  GRAPH_rmat28=\"/scratch/03279/roshand/dist-inputs/symmetric/rmat28.sgr\"\n  GRAPH_twitter=\"/scratch/03279/roshand/dist-inputs/symmetric/twitter-ICWSM10-component.sgr\"\nfi\n\n\nif [ $5 = \"rmat28\" ]; then\n  if [ $benchmark = \"bfs\" ] || [ $benchmark = \"sssp\" ]; then\n    cmd_options=$cmd_options\" -startNode=155526494\"\n  fi\n\n  partFileBase=\"/scratch/01131/rashid/inputs/partitioned\"\n  for i in $6\n  do\n    if [ $benchmark != \"cc\" ]; then\n      partFileType=\"rmat28\"\n      if [ $i == 2 ]; then\n        partFileExt=\"rmat28.gr\"\n      elif [ $i == 4 ]; then\n        partFileExt=\"rmat28.gr\"\n      elif [ $i == 8 ]; then\n        partFileExt=\"rmat28.rgr\"\n      elif [ $i == 16 ]; then\n        partFileExt=\"rmat28.gr\"\n      elif [ $i == 32 ]; then\n        partFileExt=\"rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.rgr\"\n      elif [ $i == 64 ]; then\n        partFileExt=\"rmat28.gr\"\n      elif [ $i == 128 ]; then\n        
partFileExt=\"rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.rgr\"\n      elif [ $i == 256 ]; then\n        partFileExt=\"rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.rgr\"\n      fi\n    else\n      partFileExt=\"rmat28.sgr\"\n      partFileType=\"rmat28-sym\"\n    fi\n\n    ruby ../../../../../../Distributed_latest/scripts/stampede_jobs.rb  -t \"01:25:00\" -q $queue -n 4 -N 4 -i dist_run_script_generated -o  ./LOG_aug_9/LOG_${algo}_TH$1\\_CM${comm_mode}\\_rmat28.rgr_  -A \"Galois\" -c \"$ENV_options ibrun ./$algo $GRAPH_rmat28  -partFolder=$partFileBase/$i/$partFileType/$partFileExt $cmd_options\" -s $i  -e $i  -k 2\n  done\nfi\n\n\n\nif [ $7 = \"twitter\" ]; then\n  if [ $benchmark = \"bfs\" ] || [ $benchmark = \"sssp\" ]; then\n    cmd_options=$cmd_options_reset\n    cmd_options=$cmd_options\" -startNode=33643219\"\n  fi\n\n  partFileBase=\"/scratch/01131/rashid/inputs/partitioned\"\n  for i in $8\n  do\n    if [ $benchmark != \"cc\" ]; then\n       partFileExt=\"twitter-ICWSM10-component_withRandomWeights.gr\"\n       partFileType=\"twitter\"\n    else\n      partFileExt=\"twitter-ICWSM10-component.sgr\"\n      partFileType=\"twitter-sym\"\n    fi\n\n    ruby ../../../../../../Distributed_latest/scripts/stampede_jobs.rb  -t \"01:25:00\" -q $queue -n 4 -N 4 -i dist_run_script_generated -o  ./LOG_aug_9/LOG_${algo}_TH$1\\_CM${comm_mode}\\_Twitter-ICWSM10_  -A \"Galois\" -c \"$ENV_options ibrun ./$algo $GRAPH_twitter  -partFolder=$partFileBase/$i/$partFileType/$partFileExt $cmd_options\" -s $i  -e $i  -k 2\n  done\nfi\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/README_compiler",
    "content": "  USAGE: ./compile.sh <input-source-file> [<generated-output-dir>]\n\nIf you do not want to generate heterogeneous CUDA code (which requires GGC):\n  USAGE: ABELIAN_NON_HETEROGENEOUS=1 ./compile.sh <input-source-file> [<generated-output-dir>]\nNote: Please DO NOT commit this generated code (could lead to a mismatch between CPU and GPU code).\n\nThere are 3 other environment variables that the compiler uses (which you can change):\n* ABELIAN_LLVM_BUILD: build directory of LLVM containing ABELIAN plugins\n* ABELIAN_GALOIS_ROOT: source directory of Galois containing ABELIAN headers and runtime\n* ABELIAN_GGC_ROOT: source directory of GGC which compiles IrGL to CUDA\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/batch_bridges_all.sh",
    "content": "#!/bin/sh\n\n# all benchmarks\nEXECS=( \"bfs_push\" \"bfs_pull\" \"cc_push\" \"cc_pull\" \"kcore_push\" \"kcore_pull\" \"pagerank_push\" \"pagerank_pull\" \"sssp_push\" \"sssp_pull\" )\n# fastest variants\nEXECS=( \"bfs_push\" \"cc_push\" \"pagerank_pull\" \"sssp_push\" )\n\nSET=\"1,02:00:00 2,01:30:00 4,01:00:00\" #rmat28 gpu\nSET=\"4,03:30:00 8,03:30:00 16,03:00:00 32,02:45:00 64,02:30:00 128,02:00:00\" #clueweb12\nSET=\"1,2:00:00 2,01:30:00 4,01:00:00 8,01:00:00 16,01:00:00 32,00:45:00 64,00:30:00 128,00:30:00\" #rmat28\nSETt=\"1,02:00:00\" #twitter40 gpu\nSETc=\"8,02:00:00 16,02:00:00\" #clueweb12 gpu\nSETk=\"8,02:00:00 16,02:00:00\" #kron30 gpu\nSETr=\"1,02:00:00 2,01:30:00 4,01:00:00 8,01:00:00 16,01:00:00\" #rmat28 gpu\n\nINPUTS=(\"twitter40;\\\"${SETt}\\\"\")\nINPUTS=(\"kron30;\\\"${SETk}\\\"\")\nINPUTS=(\"clueweb12;\\\"${SETc}\\\"\")\nINPUTS=(\"rmat28;\\\"${SETr}\\\"\")\n\nQUEUE=GPU-shared\nQUEUE=RM\nQUEUE=GPU\n#HET=1\n\nPARTS=( \"cvc\" \"hovc\" \"2dvc\" \"iec\" ) #rmat28/kron30\nPARTS=( \"cvc\" \"hivc\" \"2dvc\" \"oec\" ) #clueweb12/twitter40\nPARTS=( \"cvc\" )\nPARTS=( \"oec\" ) #clueweb12/twitter40\nPARTS=( \"iec\" ) #kron30/rmat28\n\nfor j in \"${INPUTS[@]}\"\ndo\n  IFS=\";\";\n  set $j;\n  for i in \"${EXECS[@]}\"\n  do\n    for p in \"${PARTS[@]}\"\n    do\n      echo \"./run_bridges_all.sh ${i} ${1} ${2} $QUEUE $p $HET\"\n      ./run_bridges_all.sh ${i} ${1} ${2} $QUEUE $p $HET |& tee -a jobs\n    done\n  done\ndone\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/batch_single-host_multi-device_all.sh",
    "content": "#!/bin/sh\n\n# all benchmarks\nEXECS=( \"bfs_push\" \"bfs_pull\" \"cc_push\" \"cc_pull\" \"kcore_push\" \"kcore_pull\" \"pagerank_push\" \"pagerank_pull\" \"sssp_push\" \"sssp_pull\" )\n# fastest variants\nEXECS=( \"bfs_push\" \"cc_push\" \"pagerank_pull\" \"sssp_push\" )\n\nINPUTS=( \"twitter40\" \"rmat26\" \"twitter50\" \"rmat28\" \"uk2007\" )\nINPUTS=( \"twitter40\" \"rmat26\" \"rmat28\" \"uk2007\" )\nINPUTS=( \"rmat28\" \"twitter40\" )\n\nfor j in \"${INPUTS[@]}\"\ndo\n  for i in \"${EXECS[@]}\"\n  do\n    echo \"./run_single-host_multi-device_all.sh ${i} ${j}\"\n    ./run_single-host_multi-device_all.sh ${i} ${j}\n    #echo \"ABELIAN_VERIFY=1 ./run_single-host_multi-device_all.sh ${i} ${j}\"\n    #ABELIAN_VERIFY=1 ./run_single-host_multi-device_all.sh ${i} ${j}\n  done\ndone\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/batch_stampede_all.sh",
    "content": "#!/bin/sh\n\n# all benchmarks\nEXECS=( \"bfs_push\" \"bfs_pull\" \"cc_push\" \"cc_pull\" \"kcore_push\" \"kcore_pull\" \"pagerank_push\" \"pagerank_pull\" \"sssp_push\" \"sssp_pull\" )\n# fastest variants\nEXECS=( \"bfs_push\" \"cc_push\" \"pagerank_pull\" \"sssp_push\" )\n\nSET=\"1,2:00:00 2,01:30:00 4,01:00:00 8,00:45:00 16,00:30:00 32,00:20:00\"\nSET=\"128,00:30:00 64,00:45:00 32,01:00:00\"\nSET=\"64,01:00:00 32,01:30:00 16,02:00:00\" \nSETc=\"256,01:00:00 128,01:00:00 64,01:15:00 32,01:30:00 16,01:45:00\"\nSETk=\"256,01:00:00 128,01:00:00 64,01:15:00 32,01:30:00 16,01:45:00 8,02:00:00 4,02:30:00\"\nSETr=\"256,01:00:00 128,01:00:00 64,01:15:00 32,01:30:00 16,01:45:00 8,02:00:00 4,02:30:00 2,02:30:00 1,02:30:00\"\n\nINPUTS=(\"twitter40;\\\"${SET}\\\"\")\nINPUTS=(\"rmat28;\\\"${SET}\\\"\")\nINPUTS=(\"kron30;\\\"${SET}\\\"\")\nINPUTS=(\"clueweb12;\\\"${SET}\\\"\")\nINPUTS=(\"wdc12;\\\"${SET}\\\"\")\nINPUTS=(\"rmat28;\\\"${SETr}\\\"\" \"kron30;\\\"${SETk}\\\"\" \"clueweb12;\\\"${SETc}\\\"\")\n\nQUEUE=development\nQUEUE=normal\n\nPARTS=( \"cvc\" \"hivc\" \"2dvc\" \"oec\" ) #clueweb12\nPARTS=( \"cvc\" \"hovc\" \"2dvc\" \"iec\" ) #rmat28/kron30\nPARTS=( \"cvc\" )\n\nfor j in \"${INPUTS[@]}\"\ndo\n  IFS=\";\";\n  set $j;\n  for i in \"${EXECS[@]}\"\n  do\n    for p in \"${PARTS[@]}\"\n    do\n      echo \"./run_stampede_all.sh ${i} ${1} ${2} $QUEUE $p\"\n      ./run_stampede_all.sh ${i} ${1} ${2} $QUEUE $p |& tee -a jobs\n    done\n  done\ndone\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/batch_verify.sh",
    "content": "#!/bin/sh\n\nLOG=.verify_log\n\n# all benchmarks\nEXECS=( \"bfs_push\" \"bfs_pull\" \"kcore_push\" \"kcore_pull\" \"cc_push\" \"cc_pull\" \"sssp_push\" \"sssp_pull\" \"pagerank_push\" \"pagerank_pull\" )\nEXECS=( \"bfs_push\" \"kcore_push\" \"cc_push\" \"sssp_push\" \"pagerank_pull\" )\n\nINPUTS=( \"rmat25\" \"twitter-WWW10-component\" )\nINPUTS=( \"rmat15\" \"rmat20\" \"rmat24\" \"road-USA\" )\n#INPUTS=( \"rmat20\" \"road-USA\")\nINPUTS=( \"rmat15\" \"rmat20\" )\nINPUTS=( \"rmat20\" )\nINPUTS=( \"rmat15\" )\n\nrm -f $LOG\n\ncurrent_dir=$(dirname \"$0\")\nfor input in \"${INPUTS[@]}\"\ndo\n  for EXEC in \"${EXECS[@]}\"\n  do\n    $current_dir/verify.sh ${EXEC} ${input} \"--exec=Sync\"\n    $current_dir/verify.sh ${EXEC} ${input} \"--exec=Async\"\n    rm -f $LOG\n  done\ndone\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/compile.sh",
    "content": "#!/bin/sh\n# Usage: ./compile.sh <SOURCE_INPUT_FILE> <GENERATED_OUTPUTD_DIR>\n# environment variables: ABELIAN_NON_HETEROGENEOUS ABELIAN_LLVM_BUILD ABELIAN_GALOIS_ROOT ABELIAN_GALOIS_BUILD ABELIAN_GGC_ROOT\n\nINPUT=$(cd $(dirname \"$1\") && pwd -P)/$(basename \"$1\")\nif [ -n \"$2\" ]; then\n  if ! [ -d \"$2\" ]; then\n    mkdir $2\n  fi\n  OUTPUT_DIR=\"$( cd $2 && pwd )\"\nelse\n  OUTPUT_DIR=\"$( cd \"$(dirname \"$0\" )\" && pwd )\"\nfi\n\nif [ -z \"$ABELIAN_LLVM_BUILD\" ]; then\n  ABELIAN_LLVM_BUILD=/net/velocity/workspace/SourceCode/llvm/build\nfi\nif [ -z \"$ABELIAN_GALOIS_ROOT\" ]; then\n  ABELIAN_GALOIS_ROOT=/net/velocity/workspace/SourceCode/Galois\nfi\n#if [ -z \"$ABELIAN_GALOIS_BUILD\" ]; then\n#  ABELIAN_GALOIS_BUILD=/net/velocity/workspace/SourceCode/Galois/build/verify\n#fi\nif [ -z \"$ABELIAN_NON_HETEROGENEOUS\" ]; then\n  if [ -z \"$ABELIAN_GGC_ROOT\" ]; then\n    ABELIAN_GGC_ROOT=/net/velocity/workspace/SourceCode/ggc\n  fi\nfi\nMPI_INCLUDE=/opt/apps/ossw/libraries/mpich2/mpich2-3.1.4/c7/gcc-4.9/include\n\necho \"Using LLVM build:\" $ABELIAN_LLVM_BUILD\necho \"Using Galois:\" $ABELIAN_GALOIS_ROOT\nif [ -z \"$ABELIAN_NON_HETEROGENEOUS\" ]; then\n  echo \"Using GGC:\" $ABELIAN_GGC_ROOT\nfi\n\nCXX_DEFINES=\"-DBOOST_NO_AUTO_PTR -DGALOIS_COPYRIGHT_YEAR=2015 -DGALOIS_VERSION=2.3.0 -DGALOIS_VERSION_MAJOR=2 -DGALOIS_VERSION_MINOR=3 -DGALOIS_VERSION_PATCH=0 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS\"\nCXX_FLAGS=\"-g -Wall -gcc-toolchain $GCC_BIN/.. 
-fopenmp -fcolor-diagnostics -O3 -DNDEBUG -I$ABELIAN_GALOIS_ROOT/libdist/include -I$ABELIAN_GALOIS_ROOT/dist_apps/include -I$MPI_INCLUDE -I$BOOST_INC -I$ABELIAN_GALOIS_ROOT/lonestar/include -I$ABELIAN_GALOIS_ROOT/libgalois/include -I$ABELIAN_GALOIS_ROOT/libruntime/include -I$ABELIAN_GALOIS_ROOT/libdist/include -I$ABELIAN_GALOIS_ROOT/libllvm/include -std=gnu++14\"\n\nif [ -z \"$ABELIAN_NON_HETEROGENEOUS\" ]; then\n  GGC_FLAGS=\"--cuda-worklist basic --cuda-graph basic --opt parcomb --opt np --npf 8 \"\n  if [ -f \"$OUTPUT_DIR/GGCFLAGS\" ]; then\n    GGC_FLAGS+=$(head -n 1 \"$OUTPUT_DIR/GGCFLAGS\")\n  fi\n  echo \"Using GGC FLAGS:\" $GGC_FLAGS\nfi\n\nCXX=$ABELIAN_LLVM_BUILD/bin/clang++\nGPREPROCESS_CXX=\"$CXX -Xclang -load -Xclang $ABELIAN_LLVM_BUILD/lib/GaloisFunctionsPreProcess.so -Xclang -plugin -Xclang galois-preProcess\"\nGANALYSIS_CXX=\"$CXX -Xclang -load -Xclang $ABELIAN_LLVM_BUILD/lib/GaloisFunctionsAnalysis.so -Xclang -plugin -Xclang galois-analysis\"\nGFUNCS_CXX=\"$CXX -Xclang -load -Xclang $ABELIAN_LLVM_BUILD/lib/GaloisFunctions.so -Xclang -plugin -Xclang galois-fns\"\nif [ -z \"$ABELIAN_NON_HETEROGENEOUS\" ]; then\n  IRGL_CXX=\"$CXX -Xclang -load -Xclang $ABELIAN_LLVM_BUILD/lib/GaloisFunctions.so -Xclang -plugin -Xclang irgl\"\n  GGC=\"$ABELIAN_GGC_ROOT/src/ggc\"\nfi\n\nlog=.log\n\ncd $OUTPUT_DIR\n\necho \"Cleaning generated files\"\nrm -f $log gen.cpp gen_cuda.py gen_cuda.cu gen_cuda.cuh gen_cuda.h\ncp $INPUT gen.cpp\n\necho \"Preprocessing global variables\"\n$GPREPROCESS_CXX $CXX_DEFINES $CXX_FLAGS -o .temp.o -c gen.cpp &>$log\n\necho \"Generating analysis information\"\n$GANALYSIS_CXX $CXX_DEFINES $CXX_FLAGS -o .temp.o -c gen.cpp >>$log 2>&1\necho \"Generating communication code\"\n$GFUNCS_CXX $CXX_DEFINES $CXX_FLAGS -o .temp.o -c gen.cpp >>$log 2>&1\n\nif [ -z \"$ABELIAN_NON_HETEROGENEOUS\" ]; then\n  echo \"Generating IrGL code\"\n  $IRGL_CXX $CXX_DEFINES $CXX_FLAGS -o .temp.o -c gen.cpp >>$log 2>&1\n  echo \"Generating CUDA code from IrGL\"\n  $GGC 
$GGC_FLAGS -o gen_cuda.cu gen_cuda.py >>$log 2>&1\nfi\n\nif [ -z \"$ABELIAN_NON_HETEROGENEOUS\" ]; then\n  echo \"Generated files in $OUTPUT_DIR: gen.cpp gen_cuda.py gen_cuda.h gen_cuda.cuh gen_cuda.cu\" \nelse\n  echo \"Generated files in $OUTPUT_DIR: gen.cpp\" \nfi\n\nrm -f *Entry-*.dot\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/compile_all.sh",
    "content": "#!/bin/sh\n\nif ! [ -z \"$ABELIAN_GALOIS_ROOT\" ]; then\n  BASE_DIR=${ABELIAN_GALOIS_ROOT}/dist_apps/experimental\nelse\n  BASE_DIR=..\nfi\nINPUT_DIR=${BASE_DIR}/compiler_inputs\nOUTPUT_DIR=${BASE_DIR}/compiler_outputs\n\nif [ -n \"$1\" ]; then\n  threads=$1\nelse\n  threads=1\nfi\ncount=0\nfor input in $INPUT_DIR/*.cpp; do\n  name=$(basename \"$input\" \".cpp\")\n  ./compile.sh $input ${OUTPUT_DIR}/$name &\n  count=$((count+1))\n  if [[ $count == $threads ]]; then\n    wait\n    count=0\n  fi\ndone\nif [[ $count != 0 ]]; then\n  wait\nfi\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/cuda_compile.sh",
    "content": "#!/bin/sh\n\nif [ -z \"$ABELIAN_LLVM_BUILD\" ]; then\n  ABELIAN_LLVM_BUILD=/net/velocity/workspace/SourceCode/llvm/build\nfi\nif [ -z \"$ABELIAN_GALOIS_ROOT\" ]; then\n  ABELIAN_GALOIS_ROOT=/net/velocity/workspace/SourceCode/Galois\nfi\n#if [ -z \"$ABELIAN_GALOIS_BUILD\" ]; then\n#  ABELIAN_GALOIS_BUILD=/net/velocity/workspace/SourceCode/Galois/build/verify\n#fi\n  if [ -z \"$ABELIAN_GGC_ROOT\" ]; then\n    ABELIAN_GGC_ROOT=/net/velocity/workspace/SourceCode/ggc\n  fi\nMPI_INCLUDE=/opt/apps/ossw/libraries/mpich2/mpich2-3.1.4/c7/gcc-4.9/include\n\necho \"Using LLVM build:\" $ABELIAN_LLVM_BUILD\necho \"Using Galois:\" $ABELIAN_GALOIS_ROOT\necho \"Using GGC:\" $ABELIAN_GGC_ROOT\n\nCXX_DEFINES=\"-DBOOST_NO_AUTO_PTR -DGALOIS_COPYRIGHT_YEAR=2015 -DGALOIS_VERSION=2.3.0 -DGALOIS_VERSION_MAJOR=2 -DGALOIS_VERSION_MINOR=3 -DGALOIS_VERSION_PATCH=0 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS\"\nCXX_FLAGS=\"-g -Wall -gcc-toolchain $GCC_BIN/.. -fopenmp -fcolor-diagnostics -O3 -DNDEBUG -I$ABELIAN_GALOIS_ROOT/libdist/include -I$ABELIAN_GALOIS_ROOT/lonestardist/include -I$MPI_INCLUDE -I$BOOST_INC -I$ABELIAN_GALOIS_ROOT/lonestar/include -I$ABELIAN_GALOIS_ROOT/libgalois/include -I$ABELIAN_GALOIS_ROOT/libcusp/include -I$ABELIAN_GALOIS_ROOT/libgluon/include -I$ABELIAN_GALOIS_ROOT/libruntime/include -I$ABELIAN_GALOIS_ROOT/libnet/include -I$ABELIAN_GALOIS_ROOT/libllvm/include -std=gnu++14\"\n\nGGC_FLAGS=\" --cuda-worklist basic --cuda-graph basic --opt parcomb --dis-opt oiter --dis-opt oitergb\"\nif [[ $1 == *\"bfs\"* || $1 == *\"sssp\"* || $1 == *\"cc\"* || $1 == *\"pagerank_pull\"* || $1 == *\"kcore\"*  ]]; then\n\tGGC_FLAGS+=\" --opt np --npf 8 --opt dyn_lb --cuda_graph basic\"\nelse\n\tGGC_FLAGS+=\" --opt np --npf 8 --cuda_graph basic\"\nfi\n#GGC_FLAGS+=\" --loglevel DEBUG \"\nif [ -f \"GGCFLAGS\" ]; then\n  GGC_FLAGS+=$(head -n 1 \"GGCFLAGS\")\nfi\necho \"Using GGC FLAGS:\" $GGC_FLAGS\n\nCXX=$ABELIAN_LLVM_BUILD/bin/clang++\n  IRGL_CXX=\"$CXX -Xclang -load -Xclang 
$ABELIAN_LLVM_BUILD/lib/GaloisFunctions.so -Xclang -plugin -Xclang irgl\"\n  GGC=\"$ABELIAN_GGC_ROOT/src/ggc\"\n\nlog=.log\n\ngen=$1\n\necho \"Cleaning generated files\"\nif [ -n \"$2\" ]; then\n  rm -f $log ${gen}_cuda.cu\nelse\n  rm -f $log ${gen}_cuda.py ${gen}_cuda.cu ${gen}_cuda.cuh ${gen}_cuda.h\nfi\n\nif ! [ -n \"$2\" ]; then\n  echo \"Generating IrGL code\"\n  $IRGL_CXX $CXX_DEFINES $CXX_FLAGS -o .temp.o -c ${gen}.cpp >>$log 2>&1\nfi\n  echo \"Generating CUDA code from IrGL\"\n  $GGC $GGC_FLAGS -o ${gen}_cuda.cu ${gen}_cuda.py >>$log 2>&1\n\nif [ -n \"$2\" ]; then\n  echo \"Generated files: ${gen}_cuda.cu\" \nelse\n  echo \"Generated files: ${gen}_cuda.py ${gen}_cuda.h ${gen}_cuda.cuh ${gen}_cuda.cu\" \nfi\n\nrm -f Entry-*.dot cdep_Entry-*.dot\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/run_bridges.template.sbatch",
    "content": "#!/bin/bash\n#SBATCH --mail-user=roshan@cs.utexas.edu\n#SBATCH --mail-type=fail\n#SBATCH --mail-type=end\nPART=$3\nNUM_TASKS=$4\nPSET=$5\nTHREADS=$6\n\nexecname=$1\n#execdir=/pylon5/ci560jp/roshand/Galois-build/dist_apps\nexecdir=`pwd`\nEXEC=${execdir}/${execname}\n\ninputname=$2\ninputdirname=/pylon5/ci560jp/roshand/dist-inputs\nextension=gr\n\nstatname=${execname}_${inputname}_${PART}_${SLURM_NNODES}_${PSET}_${SLURM_JOB_ID}.stats\n\nFLAGS=\" -statFile=${execdir}/${statname}\"\n# kcore flag\nif [[ $execname == *\"kcore\"* ]]; then\n  # TODO: update this for non-100 kcore numbers\n  FLAGS+=\" -kcore=100\"\nfi\nif [[ ($execname == *\"bc\"*) || ($execname == *\"bfs\"*) || ($execname == *\"sssp\"*) ]]; then\n  if [[ -f \"${inputdirname}/${inputname}.source\" ]]; then\n    FLAGS+=\" -startNode=`cat ${inputdirname}/${inputname}.source`\"\n  fi\nfi\nif [[ ($execname == *\"bc\"*) ]]; then\n  FLAGS+=\" -singleSource\"\nfi\n\nsource_file=${inputdirname}/source\nif [[ $execname == *\"cc\"* || $execname == *\"kcore\"* ]]; then\n  inputdirname=${inputdirname}/symmetric\n  extension=sgr\n  FLAGS+=\" -symmetricGraph\"\nelse \n  # for verify purposes, always pass in graph transpose just in case it is \n  # needed for non-symmetric graphs\n  FLAGS+=\" -graphTranspose=${inputdirname}/transpose/${inputname}.tgr\"\nfi\ngrep \"${inputname}.${extension}\" ${source_file}\nINPUT=${inputdirname}/${inputname}.${extension}\n\nif [[ ($execname == *\"pagerank\"*) ]]; then\n  FLAGS+=\" -maxIterations=100\"\nfi\nFLAGS+=\" -partition=${PART}\"\n#if [[ ($PART == \"cvc\") ]]; then\n#  FLAGS+=\" -balanceMasters=both\"\n#fi\nif [[ ($PART == \"2dvc\") ]]; then\n  FLAGS+=\" -balanceMasters=nodes\"\nfi\n\n#if [[ ($PSET == *\"cg\"*) || ($PSET == *\"gc\"*) ]]; then\n#  FLAGS+=\" -scalegpu=2\"\n#fi\n\nRUN=mpirun\n\n#source $HOME/Galois/load_modules.sh\n\n# move to working directory\nWORK_DIR=/pylon5/ci560jp/roshand/Galois\ncd $WORK_DIR\n\nset -x #echo on\nPSM2_MULTI_EP=1 
LD_LIBRARY_PATH=$PSM2_LATEST_BUILD:$LD_LIBRARY_PATH MV2_USE_LAZY_MEM_UNREGISTER=0 MV2_ENABLE_AFFINITY=0 GALOIS_DO_NOT_BIND_THREADS=1 $RUN -np $NUM_TASKS $EXEC ${INPUT} -pset=$PSET -t=$THREADS -num_nodes=$SLURM_NNODES $FLAGS\n#MV2_USE_LAZY_MEM_UNREGISTER=0 MV2_ENABLE_AFFINITY=0 GALOIS_DO_NOT_BIND_THREADS=1 $RUN -np $NUM_TASKS $EXEC ${INPUT} -pset=$PSET -t=$THREADS -num_nodes=$SLURM_NNODES $FLAGS\n#I_MPI_FABRICS=shm:ofa GALOIS_DO_NOT_BIND_THREADS=1 $RUN -np $NUM_TASKS $EXEC ${INPUT} -pset=$PSET -t=$THREADS -num_nodes=$SLURM_NNODES $FLAGS\nset +x #echo off\n\necho \"Algorithm: \" $execname\necho \"Input: \" $INPUT\necho \"Number of nodes: \" $SLURM_NNODES\necho \"Number of tasks: \" $NUM_TASKS\necho \"Number of tasks per node: \" $SLURM_TASKS_PER_NODE\necho \"Devices: \" $PSET\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/run_bridges_all.sh",
    "content": "#!/bin/sh\n\nEXEC=$1\nINPUT=$2\nSET=$3\nQUEUE=$4\nPART=$5\nHET=$6 # not supported for now\n\ncurrent_dir=$(dirname \"$0\")\n\nSET=\"${SET%\\\"}\"\nSET=\"${SET#\\\"}\"\n\nfor task in $SET; do\n  IFS=\",\";\n  set $task;\n  cp $current_dir/run_bridges.template.sbatch $current_dir/run_bridges.sbatch \n  if [ $QUEUE == \"GPU\" ]; then\n    if [[ $HET == 1 ]]; then\n      ntasks=5\n      ntasks=$((ntasks*$1))\n      sed -i \"2i#SBATCH -t $2\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH --gres=gpu:k80:4\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH --ntasks $ntasks\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -N $1\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -p $QUEUE\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -o ${EXEC}_${INPUT}_${PART}_${1}_cgggg_%j.out\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -J ${EXEC}_${INPUT}_${PART}_${1}_cgggg\" $current_dir/run_bridges.sbatch\n      threads=20\n      echo -n \"multi-CPU+GPU\" $EXEC $INPUT $PART $1 $ntasks \"cgggg\" $threads $2 \" \"\n      sbatch $current_dir/run_bridges.sbatch $EXEC $INPUT $PART $ntasks cgggg $threads\n    else\n      ntasks=4\n      ntasks=$((ntasks*$1))\n      sed -i \"2i#SBATCH -t $2\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH --gres=gpu:k80:4\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH --ntasks $ntasks\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -N $1\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -p $QUEUE\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -o ${EXEC}_${INPUT}_${PART}_${1}_gggg_%j.out\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -J ${EXEC}_${INPUT}_${PART}_${1}_gggg\" $current_dir/run_bridges.sbatch\n      threads=7\n      echo -n \"multi-GPU-only \" $EXEC $INPUT $PART $1 $ntasks \"gggg\" $threads $2 \" \"\n      sbatch $current_dir/run_bridges.sbatch $EXEC $INPUT $PART $ntasks gggg $threads\n    
fi\n  elif [ $QUEUE == \"GPU-shared\" ]; then # should be fixed\n    #if [[ $HET == 1 ]]; then\n      threads=28\n      ngpus=$1\n      ngpus=$((ngpus-1))\n      sed -i \"2i#SBATCH -t $2\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH --gres=gpu:$ngpus\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH --ntasks-per-node $threads\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -N 1\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -p $QUEUE\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -o ${EXEC}_${INPUT}_${1}_cgggg_%j.out\" $current_dir/run_bridges.sbatch\n      sed -i \"2i#SBATCH -J ${EXEC}_${INPUT}_${1}_cgggg\" $current_dir/run_bridges.sbatch\n      threads=$((threads-$ngpus))\n      threads=$((threads-$ngpus))\n      echo -n \"CPU+GPU\" $EXEC $INPUT $1 \"cgggg\" $threads $2 \" \"\n      sbatch $current_dir/run_bridges.sbatch $EXEC $INPUT $1 cgggg $threads\n    #else\n    #  threads=7\n    #  threads=$((threads*$1))\n    #  sed -i \"2i#SBATCH -t $2\" $current_dir/run_bridges.sbatch\n    #  sed -i \"2i#SBATCH --gres=gpu:$1\" $current_dir/run_bridges.sbatch\n    #  sed -i \"2i#SBATCH --ntasks-per-node $threads\" $current_dir/run_bridges.sbatch\n    #  sed -i \"2i#SBATCH -N 1\" $current_dir/run_bridges.sbatch\n    #  sed -i \"2i#SBATCH -p $QUEUE\" $current_dir/run_bridges.sbatch\n    #  sed -i \"2i#SBATCH -o ${EXEC}_${INPUT}_${1}_g_%j.out\" $current_dir/run_bridges.sbatch\n    #  sed -i \"2i#SBATCH -J ${EXEC}_${INPUT}_${1}_g\" $current_dir/run_bridges.sbatch\n    #  echo -n \"GPU-only\" $EXEC $INPUT $1 \"gggg\" $threads $2 \" \"\n    #  sbatch $current_dir/run_bridges.sbatch $EXEC $INPUT $1 gggg $threads\n    #fi\n  elif [ $QUEUE == \"RM\" ]; then\n    sed -i \"2i#SBATCH -t $2\" $current_dir/run_bridges.sbatch\n    sed -i \"2i#SBATCH --ntasks-per-node 1\" $current_dir/run_bridges.sbatch\n    sed -i \"2i#SBATCH -N $1\" $current_dir/run_bridges.sbatch\n    sed -i \"2i#SBATCH -p $QUEUE\" 
$current_dir/run_bridges.sbatch\n    sed -i \"2i#SBATCH -o ${EXEC}_${INPUT}_${PART}_${1}_c_%j.out\" $current_dir/run_bridges.sbatch\n    sed -i \"2i#SBATCH -J ${EXEC}_${INPUT}_${PART}_${1}_c\" $current_dir/run_bridges.sbatch\n    threads=28\n    echo -n \"CPU-only\" $EXEC $INPUT $PART $1 \"c\" $threads $2 \" \"\n    sbatch $current_dir/run_bridges.sbatch $EXEC $INPUT $PART $1 c $threads\n  fi\n  rm $current_dir/run_bridges.sbatch\ndone\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/run_single-host_multi-device_all.sh",
    "content": "#!/bin/sh\n# Usage: ./run_single-host_multi-device_all.sh <ABELIAN_EXECUTABLE_NAME> <INPUT_GRAPH_NAME>\n# environment variables: ABELIAN_VERIFY ABELIAN_GALOIS_ROOT ABELIAN_VERTEX_CUT ABELIAN_VTUNE\n# assumes 4 GPU devices available\n\nexecdir=\".\"\nexecname=$1\nEXEC=${execdir}/${execname}\n\n#inputdirname=/workspace/dist-inputs\ninputdirname=/net/ohm/export/iss/dist-inputs\ninputname=$2\nextension=gr\n\nMPI=mpiexec\n\nFLAGS=\n# kcore flag\nif [[ $execname == *\"kcore\"* ]]; then\n  # TODO: update this for non-100 kcore numbers\n  FLAGS+=\" -kcore=100\"\nfi\nif [[ ($execname == *\"bc\"*) || ($execname == *\"bfs\"*) || ($execname == *\"sssp\"*) ]]; then\n  if [[ -f \"${inputdirname}/${inputname}.source\" ]]; then\n    FLAGS+=\" -startNode=`cat ${inputdirname}/${inputname}.source`\"\n  fi\nfi\nif [[ ($execname == *\"bc\"*) ]]; then\n  FLAGS+=\" -singleSource\"\nfi\nif [[ ($execname == *\"pagerank\"*) ]]; then\n  FLAGS+=\" -maxIterations=100\"\nfi\n\nsource_file=${inputdirname}/source\nif [[ $execname == *\"cc\"* || $execname == *\"kcore\"* ]]; then\n  inputdirname=${inputdirname}/symmetric\n  extension=sgr\n  FLAGS+=\" -symmetricGraph\"\nelse \n  # for verify purposes, always pass in graph transpose just in case it is \n  # needed for non-symmetric graphs\n  FLAGS+=\" -graphTranspose=${inputdirname}/transpose/${inputname}.tgr\"\nfi\nINPUT=${inputdirname}/${inputname}.${extension}\n\nif [ -n \"$ABELIAN_VERIFY\" ]; then\n  #outputdirname=/workspace/dist-outputs\n  outputdirname=/net/ohm/export/iss/dist-outputs\n  IFS='_' read -ra EXECP <<< \"$execname\"\n  problem=${EXECP[0]}\n  OUTPUT=${outputdirname}/${inputname}.${problem}\n\n  if [ -z \"$ABELIAN_GALOIS_ROOT\" ]; then\n    ABELIAN_GALOIS_ROOT=/net/velocity/workspace/SourceCode/Galois\n  fi\n  checker=${ABELIAN_GALOIS_ROOT}/scripts/result_checker.py\n\n  hostname=`hostname`\nfi\n\n# assumes 2 GPU devices available\nSET=\"g,1,2 gg,2,2 c,1,16 gc,2,14 cg,2,14 ggc,3,12 cgg,3,12 gcg,3,12\"\n# assumes 6 GPU 
devices available - tuxedo\nSET=\"c,1,48 g,1,2 gg,2,2 ggg,3,2 gggg,4,2 ggggg,5,2 gggggg,6,2\"\n# assumes 4 GPU devices available\nSET=\"c,1,28 g,1,28 gg,2,14 ggg,3,7 gggg,4,7\"\n\nfor task in $SET; do\n  IFS=\",\";\n  set $task;\n  PFLAGS=$FLAGS\n  statname=${execname}_${inputname}_${1}.stats\n  PFLAGS+=\" -statFile=${execdir}/${statname}\"\n  if [ -n \"$ABELIAN_VERTEX_CUT\" ]; then\n    PFLAGS+=\" -partition=cvc\"\n  elif [[ ($1 == *\"gc\"*) || ($1 == *\"cg\"*) ]]; then\n    PFLAGS+=\" -scalegpu=3\"\n  fi\n  if [ -n \"$ABELIAN_VTUNE\" ]; then\n    PFLAGS+=\" -runs=1\"\n    CUSTOM_VTUNE=\"amplxe-cl -collect general-exploration -search-dir /lib/modules/3.10.0-327.22.2.el7.x86_64/weak-updates/nvidia/ -call-stack-mode all -trace-mpi -analyze-system -start-paused -r ${execname}_${inputname}_${1}_exploration\"\n  fi\n  if [ -n \"$ABELIAN_VERIFY\" ]; then\n    PFLAGS+=\" -verify\"\n    rm -f output_*.log\n  fi\n  rm -f ${execname}_${inputname}_${1}.out\n  grep \"${inputname}.${extension}\" ${source_file} |& tee ${execname}_${inputname}_${1}.out\n  echo \"GALOIS_DO_NOT_BIND_THREADS=1 $CUSTOM_VTUNE $MPI -n=$2 ${EXEC} ${INPUT} -pset=$1 -t=$3 ${PFLAGS} -num_nodes=1\" |& tee ${execname}_${inputname}_${1}.out\n  eval \"GALOIS_DO_NOT_BIND_THREADS=1 $CUSTOM_VTUNE $MPI -n=$2 ${EXEC} ${INPUT} -pset=$1 -t=$3 ${PFLAGS} -num_nodes=1 |& tee -a ${execname}_${inputname}_${1}.out\"\n  if [ -n \"$ABELIAN_VERIFY\" ]; then\n    outputs=\"output_${hostname}_0.log\"\n    i=1\n    while [ $i -lt $2 ]; do\n      outputs+=\" output_${hostname}_${i}.log\"\n      let i=i+1\n    done\n    echo \"python $checker -t=1 $OUTPUT ${outputs}\" |& tee -a ${execname}_${inputname}_${1}.out\n    eval \"python $checker -t=1 $OUTPUT ${outputs} |& tee -a ${execname}_${inputname}_${1}.out\"\n  fi\ndone\n\nrm -f output_*.log\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/run_stampede.template.sbatch",
    "content": "#!/bin/bash\n#SBATCH --mail-user=CHANGE EMAIL HERE OR ERROR\n#SBATCH --mail-type=begin\n#SBATCH --mail-type=fail\n#SBATCH --mail-type=end\n#SBATCH -A Galois\nPART=$3\nPSET=$4\nTHREADS=$5\n\nexecname=$1\nexecdir=`pwd`\nEXEC=${execdir}/${execname}\n\ninputname=$2\n# change this if necessary\ninputdirname=/scratch/03279/roshand/dist-inputs\nextension=gr\n\nstatname=${execname}_${inputname}_${PART}_${SLURM_NNODES}_${PSET}_${SLURM_JOB_ID}.stats\n\nFLAGS=\" -statFile=${execdir}/${statname}\"\n# kcore flag\nif [[ $execname == *\"kcore\"* ]]; then\n  FLAGS+=\" -kcore=100\"\nfi\n\nif [[ ($execname == *\"bc\"*) ||  ($execname == *\"bfs\"*) || ($execname == *\"sssp\"*) ]]; then\n  if [[ -f \"${inputdirname}/${inputname}.source\" ]]; then\n    FLAGS+=\" -startNode=`cat ${inputdirname}/${inputname}.source`\"\n  fi\nfi\nif [[ ($execname == *\"bc\"*) ]]; then\n  FLAGS+=\" -singleSource\"\nfi\n\nsource_file=${inputdirname}/source\nif [[ $execname == *\"cc\"* || $execname == *\"kcore\"* ]]; then\n  inputdirname=${inputdirname}/symmetric\n  extension=sgr\n  FLAGS+=\" -symmetricGraph\"\nelse \n  # for verify purposes, always pass in graph transpose just in case it is \n  # needed for non-symmetric graphs\n  FLAGS+=\" -graphTranspose=${inputdirname}/transpose/${inputname}.tgr\"\nfi\n\ngrep \"${inputname}.${extension}\" ${source_file}\nINPUT=${inputdirname}/${inputname}.${extension}\n\nif [[ ($execname == *\"pagerank\"*) ]]; then\n  FLAGS+=\" -maxIterations=100\"\nfi\n\nFLAGS+=\" -partition=${PART}\"\n\n################################################################################\n\n# XTRAPULP\n\nif [[ ($PART == \"cec\") ]]; then\n  # idea is to balance out sends across hosts since you're sending everything\n  FLAGS+=\" -balanceMasters=nodes\"\n  if [[ $execname == *\"cc\"* || $execname == *\"kcore\"* ]]; then\n    FLAGS+=\" -vertexIDMapFileName=/scratch/03372/lhoang/pulp/${inputname}s_${SLURM_NNODES}.xtra\"\n  elif [[ $execname == *\"pull\"* ]]; then\n    FLAGS+=\" 
-vertexIDMapFileName=/scratch/03372/lhoang/pulp/${inputname}t_${SLURM_NNODES}.xtra\"\n  else # everything else, bfs, bc, sssp, push versions\n    FLAGS+=\" -vertexIDMapFileName=/scratch/03372/lhoang/pulp/${inputname}_${SLURM_NNODES}.xtra\"\n  fi\nfi\n\n################################################################################\n\n# LOCAL GRAPH SAVES\n\nPARTGRAPHROOT=\"/scratch/02681/gsg466/partitioned_graphs\"\n\nif [[ ($execname == *\"bfs\"*) ]]; then\n  localGraphFileName=\"${PARTGRAPHROOT}/${PART}/${inputname}/local_graph_${inputname}_${SLURM_NNODES}\"\n  if [ ! -d \"$localGraphFileName\" ]; then\n     # Control will enter here if $DIRECTORY doesn't exist.\n     mkdir -p $localGraphFileName;\n  fi\n  FLAGS+=\"  -localGraphFileName=$localGraphFileName \"\nfi\n\nif [[ ($execname == *\"sssp\"*) ]]; then\n  localGraphFileName=\"${PARTGRAPHROOT}/${PART}/${inputname}/weighted/local_graph_${inputname}_${SLURM_NNODES}\"\n  if [ ! -d \"$localGraphFileName\" ]; then\n     # Control will enter here if $DIRECTORY doesn't exist.\n     mkdir -p $localGraphFileName;\n  fi\n  FLAGS+=\"  -localGraphFileName=$localGraphFileName \"\n  #FLAGS+=\" -saveLocalGraph \"\nfi\n\nif [[ $execname == *\"cc\"* || $execname == *\"kcore\"* ]]; then\n  localGraphFileName=\"${PARTGRAPHROOT}/${PART}/${inputname}/symmetric/local_graph_${inputname}_${SLURM_NNODES}\"\n  if [ ! -d \"$localGraphFileName\" ]; then\n     # Control will enter here if $DIRECTORY doesn't exist.\n     mkdir -p $localGraphFileName;\n  fi\n  FLAGS+=\"  -localGraphFileName=$localGraphFileName \"\n  #FLAGS+=\" -saveLocalGraph \"\nfi\n\nif [[ ($execname == *\"pagerank\"*) ]]; then\n  localGraphFileName=\"${PARTGRAPHROOT}/${PART}/${inputname}/transpose/local_graph_${inputname}_${SLURM_NNODES}\"\n  if [ ! 
-d \"$localGraphFileName\" ]; then\n     # Control will enter here if $DIRECTORY doesn't exist.\n     mkdir -p $localGraphFileName;\n  fi\n  FLAGS+=\"  -localGraphFileName=$localGraphFileName \"\n  #FLAGS+=\" -saveLocalGraph \"\nfi\n\nif [[ ($execname == *\"partition\"*) ]]; then\n  localGraphFileName=\"${PARTGRAPHROOT}/${PART}/${inputname}/local_graph_${inputname}_${SLURM_NNODES}\"\n  if [ ! -d \"$localGraphFileName\" ]; then\n     # Control will enter here if $DIRECTORY doesn't exist.\n     mkdir -p $localGraphFileName;\n  fi\n  FLAGS+=\"  -localGraphFileName=$localGraphFileName \"\n  #FLAGS+=\" -saveLocalGraph \"\nfi\n\nif [[ -n \"$SAVE_GRAPH\" ]]; then\n  FLAGS+=\" -saveLocalGraph \"\nfi\n\nif [[ -n \"$LOAD_GRAPH\" ]]; then\n  FLAGS+=\" -readFromFile \"\nfi\n\nRUN=ibrun\n\nset -x #echo on\nPRINT_PER_HOST_STATS=1 GALOIS_DO_NOT_BIND_THREADS=1 $RUN $EXEC ${INPUT} -t=$THREADS $FLAGS -edgeBufferSize=8388608 -runs=3\nset +x #echo off\n\n# give permissions to output files\nchmod 660 ${execname}_${inputname}_${PART}_${SLURM_NNODES}_${PSET}_${SLURM_JOB_ID}.out\nchmod 660 $statname\n\necho \"Algorithm: \" $execname\necho \"Input: \" $INPUT\necho \"Number of nodes: \" $SLURM_NNODES\necho \"Number of tasks: \" $SLURM_NTASKS\necho \"Number of tasks per node: \" $SLURM_TASKS_PER_NODE\necho \"Devices: \" $PSET\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/run_stampede_all.sh",
    "content": "#!/bin/sh\n\nEXEC=$1\nINPUT=$2\nSET=$3\nQUEUE=$4\nPART=$5\nHET=$6 # not supported for now\n\nSET=\"${SET%\\\"}\"\nSET=\"${SET#\\\"}\"\n\nfor task in $SET; do\n  IFS=\",\";\n  set $task;\n  cp run_stampede.template.sbatch run_stampede.sbatch \n  if [ $QUEUE == \"gpu\" ]; then # should add HET option\n    sed -i \"2i#SBATCH -t $2\" run_stampede.sbatch\n    sed -i \"2i#SBATCH -p $QUEUE\" run_stampede.sbatch\n    sed -i \"2i#SBATCH -N $1 -n $1\" run_stampede.sbatch\n    sed -i \"2i#SBATCH -o ${EXEC}_${INPUT}_${1}_g_%j.out\" run_stampede.sbatch\n    sed -i \"2i#SBATCH -J ${EXEC}_${INPUT}_${1}_g\" run_stampede.sbatch\n    threads=16\n    echo \"multi-GPU-only \" $EXEC $INPUT $1 \"g\" $threads $2\n    sbatch run_stampede.sbatch $EXEC $INPUT g $threads\n  else\n    sed -i \"2i#SBATCH -t $2\" run_stampede.sbatch\n    sed -i \"2i#SBATCH -p $QUEUE\" run_stampede.sbatch\n    sed -i \"2i#SBATCH -N $1 -n $1\" run_stampede.sbatch\n    sed -i \"2i#SBATCH -o ${EXEC}_${INPUT}_${PART}_${1}_c_%j.out\" run_stampede.sbatch\n    sed -i \"2i#SBATCH -J ${EXEC}_${INPUT}_${PART}_${1}_c\" run_stampede.sbatch\n    threads=272\n    echo \"CPU-only \" $EXEC $INPUT $PART $1 \"c\" $threads $2\n    sbatch run_stampede.sbatch $EXEC $INPUT $PART c $threads \n  fi\n  rm run_stampede.sbatch\ndone\n\n"
  },
  {
    "path": "scripts/experimental/heterogeneousGalois/verify.sh",
    "content": "#!/bin/sh\n# Usage: ./verify.sh <ABELIAN_EXECUTABLE_NAME> <INPUT_GRAPH_NAME>\n# environment variables: ABELIAN_NON_HETEROGENEOUS ABELIAN_GALOIS_ROOT ABELIAN_EDGE_CUT_ONLY\n# executes only on single machine\n# assumes 2 GPU devices available (if heterogeneous)\n\nexecdirname=\".\"\nexecname=$1\nEXEC=${execdirname}/${execname}\n\ninputdirname=/net/ohm/export/iss/dist-inputs\n#inputdirname=/workspace/dist-inputs\ninputname=$2\nextension=gr\n\noption=$3\n\noutputdirname=/net/ohm/export/iss/dist-outputs\n#outputdirname=/workspace/dist-outputs\n\nIFS='_' read -ra EXECP <<< \"$execname\"\nproblem=${EXECP[0]}\nOUTPUT=${outputdirname}/${inputname}.${problem}\n\n# kcore output files have a number at the end specifying kcore number\nif [[ $execname == *\"kcore\"* ]]; then\n  # TODO: update this for non-100 kcore numbers\n  OUTPUT=${outputdirname}/${inputname}.${problem}100\nfi\n\n# for bc, do single source outputs\nif [[ ($execname == *\"bc\"*) ]]; then\n  OUTPUT=${outputdirname}/${inputname}.ssbc\nfi\n\n# for bc, if using rmat15, then use all sources output (without ss)\nif [[ ($execname == *\"bc\"*) && ($inputname == \"rmat15\") ]]; then\n  OUTPUT=${outputdirname}/rmat15.bcbfsall\nfi\n\nMPI=mpiexec\nLOG=.verify_log\n\nFLAGS=\n# kcore flag\nif [[ $execname == *\"kcore\"* ]]; then\n  FLAGS+=\" -kcore=100\"\nfi\nif [[ ($execname == *\"bfs\"*) || ($execname == *\"sssp\"*) ]]; then\n  if [[ -f \"${inputdirname}/${inputname}.source\" ]]; then\n    FLAGS+=\" -startNode=`cat ${inputdirname}/${inputname}.source`\"\n  fi\nfi\n\n# bc: if rmat15 is not used, specify single source flags else do\n# all sources for rmat15\nif [[ ($execname == *\"bc\"*) && ! 
($inputname == \"rmat15\") ]]; then\n  FLAGS+=\" -singleSource\"\n  FLAGS+=\" -startNode=`cat ${inputdirname}/${inputname}.source`\"\nfi\n\n# batch multiple sources if using mrbc\nif [[ ($execname == *\"bc_mr\"*) ]]; then\n  FLAGS+=\" -numRoundSources=4096\"\nfi\n\nsource_file=${inputdirname}/source\nif [[ $execname == *\"cc\"* || $execname == *\"kcore\"* ]]; then\n  inputdirname=${inputdirname}/symmetric\n  extension=sgr\n  FLAGS+=\" -symmetricGraph\"\nelse \n  # for verify purposes, always pass in graph transpose just in case it is \n  # needed for non-symmetric graphs\n  FLAGS+=\" -graphTranspose=${inputdirname}/transpose/${inputname}.tgr\"\nfi\n\nFLAGS+=\" -maxIterations=10000000\"\n\ngrep \"${inputname}.${extension}\" ${source_file} >>$LOG\nINPUT=${inputdirname}/${inputname}.${extension}\n\nif [ -z \"$ABELIAN_GALOIS_ROOT\" ]; then\n  ABELIAN_GALOIS_ROOT=/net/velocity/workspace/SourceCode/Galois\nfi\nchecker=${ABELIAN_GALOIS_ROOT}/scripts/result_checker.py\n#checker=./result_checker.py\n\nhostname=`hostname`\n\nif [ -z \"$ABELIAN_NON_HETEROGENEOUS\" ]; then\n  # assumes only 2 GPUs device available\n  #SET=\"g,1,48 gg,2,24 gggg,4,12 gggggg,6,8 c,1,48 cc,2,24 cccc,4,12 cccccccc,8,6 cccccccccccccccc,16,3\"\n  SET=\"g,1,16 gg,2,8 gc,2,8 cg,2,8, ggc,3,4 cgg,3,4 c,1,16 cc,2,8 ccc,3,4 cccc,4,4 ccccc,5,2 cccccc,6,2 ccccccc,7,2 cccccccc,8,2 ccccccccc,9,1 cccccccccc,10,1 ccccccccccc,11,1 cccccccccccc,12,1 ccccccccccccc,13,1 cccccccccccccc,14,1 cccccccccccccc,15,1 ccccccccccccccc,16,1\"\nelse\n  #SET=\"c,1,48 cc,2,24 cccc,4,12 cccccccc,8,6 cccccccccccccccc,16,3\"\n  #SET=\"c,1,80 cc,2,40 cccc,4,20 cccccccc,8,10 ccccccccccccccc,16,5\"\n  SET=\"c,1,16 cc,2,8 ccc,3,4 cccc,4,4 ccccc,5,2 cccccc,6,2 ccccccc,7,2 cccccccc,8,2 ccccccccc,9,1 cccccccccc,10,1 ccccccccccc,11,1 cccccccccccc,12,1 ccccccccccccc,13,1 cccccccccccccc,14,1 cccccccccccccc,15,1 ccccccccccccccc,16,1\"\nfi\n\npass=0\nfail=0\nfailed_cases=\"\"\n#for partition in 1 2 3 4 5 6 7 8 9 10 11 12; do\nfor partition in 1 
2 3 4 5 6; do\n#for partition in 1; do\n  CUTTYPE=\n\n  if [ $partition -eq 1 ]; then\n    CUTTYPE+=\" -partition=oec\"\n  elif [ $partition -eq 2 ]; then\n    CUTTYPE+=\" -partition=iec\"\n  elif [ $partition -eq 3 ]; then\n    CUTTYPE+=\" -partition=cvc\"\n  elif [ $partition -eq 4 ]; then\n    CUTTYPE+=\" -partition=cvc-iec\"\n  elif [ $partition -eq 5 ]; then\n    CUTTYPE+=\" -partition=hovc\"\n  elif [ $partition -eq 6 ]; then\n    CUTTYPE+=\" -partition=hivc\"\n  elif [ $partition -eq 7 ]; then\n    CUTTYPE+=\" -partition=fennel-o -stateRounds=100\"\n  elif [ $partition -eq 8 ]; then\n    CUTTYPE+=\" -partition=fennel-i -stateRounds=100\"\n  elif [ $partition -eq 9 ]; then\n    CUTTYPE+=\" -partition=ginger-o -stateRounds=100\"\n  elif [ $partition -eq 10 ]; then\n    CUTTYPE+=\" -partition=ginger-i -stateRounds=100\"\n  elif [ $partition -eq 11 ]; then\n    CUTTYPE+=\" -partition=sugar-o -stateRounds=100\"\n  elif [ $partition -eq 12 ]; then\n    CUTTYPE+=\" -partition=sugar-i -stateRounds=100\"\n  fi\n\n  for task in $SET; do\n    old_ifs=$IFS\n    IFS=\",\";\n    set $task;\n    if [ -z \"$ABELIAN_NON_HETEROGENEOUS\" ]; then\n      PFLAGS=\" -pset=$1 -num_nodes=1\"\n    else\n      PFLAGS=\"\"\n    fi\n    PFLAGS+=$FLAGS\n    if [[ ($1 == *\"gc\"*) || ($1 == *\"cg\"*) ]]; then\n      PFLAGS+=\" -scalegpu=3\"\n    fi\n    rm -f output_*.log\n\n    echo \"GALOIS_DO_NOT_BIND_THREADS=1 $MPI -n=$2 ${EXEC} ${INPUT} -t=$3 ${option} ${PFLAGS} ${CUTTYPE} -verify\" >>$LOG\n    eval \"GALOIS_DO_NOT_BIND_THREADS=1 $MPI -n=$2 ${EXEC} ${INPUT} -t=$3 ${option} ${PFLAGS} ${CUTTYPE} -verify\" >>$LOG 2>&1\n\n    eval \"sort -nu output_${hostname}_*.log -o output_${hostname}_0.log\"\n    eval \"python $checker -t=0.01 $OUTPUT output_${hostname}_0.log &> .output_diff\"\n\n    cat .output_diff >> $LOG\n    if ! 
grep -q \"SUCCESS\" .output_diff ; then\n      let fail=fail+1\n      if [ $partition -eq 1 ]; then\n        failed_cases+=\"outgoing edge-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 2 ]; then\n        failed_cases+=\"incoming edge-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 3 ]; then\n        failed_cases+=\"cartesian outgoing vertex-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 4 ]; then\n        failed_cases+=\"cartesian incoming vertex-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 5 ]; then\n        failed_cases+=\"hybrid outgoing vertex-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 6 ]; then\n        failed_cases+=\"hybrid incoming vertex-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 7 ]; then\n        failed_cases+=\"fennel outgoing edge-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 8 ]; then\n        failed_cases+=\"fennel incoming edge-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 9 ]; then\n        failed_cases+=\"ginger outgoing vertex-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 10 ]; then\n        failed_cases+=\"ginger incoming vertex-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 11 ]; then\n        failed_cases+=\"sugar outgoing vertex-cut $1 devices with $3 threads; \"\n      elif [ $partition -eq 12 ]; then\n        failed_cases+=\"sugar outgoing vertex-cut $1 devices with $3 threads; \"\n      fi\n    else\n      let pass=pass+1\n    fi\n    rm .output_diff\n    IFS=$old_ifs\n  done\ndone\n\nrm -f output_*.log\n\necho \"---------------------------------------------------------------------------------------\"\necho \"Algorithm: \" $execname\necho \"Input: \" $inputname\necho \"Runtime option: \" $option\necho $pass \"passed test cases\"\nif [[ $fail == 0 ]] ; then\n  echo \"Status: SUCCESS\"\nelse\n  echo $fail \"failed test cases:\" $failed_cases\n  echo \"Status: 
FAILED\"\nfi\necho \"---------------------------------------------------------------------------------------\"\n"
  },
  {
    "path": "scripts/experimental/lonestarbmk2/README",
    "content": "-----\nbmk2.cfg\n-----\n\nSetup the path to the built apps (Lonestar build directory) and where to output\nlogs for each benchmark here under pathToApps and logOutputDirectory.\n\n-----\nbmktest2.py\n-----\n\nThis file defines all the benchmarks to be run. Each benchmark is defined as a\nclass that inherits from SharedMemApp.\n\nThe runtime looks for relativeAppPath, which points to the executable, and\nbenchmark, which is the name given to that benchmark when outputting things\nand, more importantly, the name used to refer to said benchmark run by bmk2 in\nother files.\n\nIf any arguments to that benchmark need to be specified, you define the\nget_run_spec function to add args. See examples there in the current bmktest2.py\nfile to get a feel of how to do it.\n\nThe SharedMemApp parent class is where default parameters such as threads\nare specified. startThread, endThread, and step are the variables to edit\nfor this purpose.\n\nThe benchmarks that will be run are specified in the BINARIES array. 
For\nexample, the below definition will run BarnesHut and BFS:\n\nBINARIES = [BarnesHut(), BFS()]\n\n-----\nlonestar.inputdb\n-----\n\nInputs to benchmarks are specified here.\n\nTo begin, change the \"basepath\" variable in the header to point to the root\ndirectory where all inputs are stored.\n\nThe format for specifying an input is as follows:\n\n[<input name>]\nflags =\nname = <name to refer to input as in bmk2>\nfile = <relative path to file from root input directory \"basepath\">\nformat = <file format>\n\nThere are a couple of file formats to be aware of:\n\nbin/galois = binary galois format\nmesh/nodes = mesh format\ntriangles = format for triangle counting\ntext = text format (for things like points-to-analysis)\nnothing = no file is passed in (for things like self-generated input)\n\n-----\nlonestar.inputprops\n-----\n\nAdditional properties for certain inputs can be specified in the inputprops\nfile.\n\n[<same input name used in header for input in inputdb>]\n<additional property>=<what additional property is>\n\nFor example, I can specify the transpose graph as follows:\n\n[soc-livejournal]\nptranspose=/net/ohm/export/iss/inputs/unweighted/soc-LiveJournal1.ptgr\n\nIt can then be referred to in the bmktest2.py python script as the example\nbelow shows:\n\ns.set_arg(\"-graphTranspose=%s\" % bmkinput.props.ptranspose)\n\n-----\nlonestar.bispec\n-----\n\nThis file is where one specifies which input to run with a particular bmk2\nbenchmark. Use the name given in the benchmark variable in bmktest2 to\nrefer to a benchmark and the name given under the name variable in\nlonestar.inputdb to refer to an input. 
For example, the line below says to\nrun bfs with the twitter40 input.\n\nbfs twitter40\n\n-----\nHow to run out of the box\n-----\n\nSet the following environment variables:\n\nBMK_LONESTAR_PATH to the lonestar directory\nBMK_LOGS to wherever you want the logs to go\n\nThen run the following in THIS directory (same as README):\n\npython <path to bmk2/test2.py> run\n\nTODO explain the bmktest2.py file\n"
  },
  {
    "path": "scripts/experimental/lonestarbmk2/bmk2.cfg",
    "content": "[bmk2]\nversion=2\ninputdb=lonestar.inputdb\ninputprops=lonestar.inputprops\nbispec=lonestar.bispec\npathToApps=${BMK_LONESTAR_PATH}\nlogOutputDirectory=${BMK_LOGS}\n"
  },
  {
    "path": "scripts/experimental/lonestarbmk2/bmkprops.py",
    "content": "import bmk2\nimport datetime\n\nTIME_FMT = \"%Y-%m-%d %H:%M:%S\"\n\nclass GraphBMKSharedMem(bmk2.Binary):\n    \"\"\"Base class for shared memory benchmarks to inherit from. Subclasses\n    need to specify benchmark name + number of threads.\n    \"\"\"\n    def __init__(self):\n        \"\"\"Initialize shared mem properties.\"\"\"\n        self.props = GraphBMKSharedMemProps(self.benchmark)\n        \n    def get_id(self):\n        \"\"\"Return the id of this benchmark.\"\"\"\n        return \"%s\" % (self.benchmark)\n\n    def getUniqueStatFile(self, numThreads, graphName):\n        \"\"\"Get a statfile name given num threads + graph name being used.\"\"\"\n        timeNow = datetime.datetime.now().strftime(TIME_FMT).replace(\" \", \"_\")\n\n        return (\"%s_%d_%s_%s.log\" % (self.benchmark, numThreads, graphName,\n                               timeNow))\n\nclass GraphBMKSharedMemProps(bmk2.Properties):\n    \"\"\"Properties pertaining to shared memory.\"\"\"\n    def __init__(self, benchmark):\n        self.benchmark = benchmark\n"
  },
  {
    "path": "scripts/experimental/lonestarbmk2/bmktest2.py",
    "content": "import bmk2\nfrom bmkprops import GraphBMKSharedMem\nimport os\n\nclass SharedMemApp(GraphBMKSharedMem):\n    \"\"\"Base class that has default run spec construction behavior for\n    most if not all shared memory apps.\n    \"\"\"\n    # thread to start from\n    startThread = 0\n    # thread to end at (inclusive)\n    endThread = 56\n    # step to use for looping through threads\n    step = 7\n\n    def filter_inputs(self, inputs):\n        \"\"\"Ignore inputs that aren't currently supported.\"\"\"\n        def finput(x):\n            if x.props.format == 'bin/galois': return True\n            if x.props.format == 'mesh': return True\n            if x.props.format == 'mesh/nodes': return True\n            if x.props.format == 'triangles': return True\n            if x.props.format == 'text': return True\n            if x.props.format == 'nothing': return True\n\n            return False\n\n        return filter(finput, inputs)\n\n    def get_default_run_specs(self, bmkinput, config):\n        \"\"\"Creates default run specifications with common arguments for all\n        shared memory benchmarks and returns them. 
They can be modified\n        later according to the benchmark that you want to run.\n        \"\"\"\n        assert config != None # config should be passed through test2.py\n        listOfRunSpecs = []\n\n        for numThreads in range(self.startThread, self.endThread + 1, self.step):\n            if numThreads == 0 and self.step != 1:\n              numThreads = 1\n            elif numThreads == 0:\n              continue\n\n            x = bmk2.RunSpec(self, bmkinput)\n\n            x.set_binary(\"\", os.path.expandvars(\n                               os.path.join(config.get_var(\"pathToApps\"),\n                                          self.relativeAppPath)))\n            x.set_arg(\"-t=%d\" % numThreads)\n\n            nameToAppend = bmkinput.name\n\n            if bmkinput.props.format == \"nothing\":\n                nameToAppend = \"gen\"\n                pass\n            elif bmkinput.props.format != \"mesh\":\n                x.set_arg(bmkinput.props.file, bmk2.AT_INPUT_FILE)\n            else: # mesh\n                # don't specify with input file flag as it doesn't exist (mesh\n                # loads multiple files, so the file specified in the inputdb\n                # isn't an actual file\n                x.set_arg(bmkinput.props.file)\n\n            x.set_arg(\"-statFile=\" +\n                      os.path.expandvars(\n                        os.path.join(config.get_var(\"logOutputDirectory\"),\n                                     self.getUniqueStatFile(numThreads, \n                                     nameToAppend))\n                      ))\n\n            listOfRunSpecs.append(x)\n\n            x.set_checker(bmk2.PassChecker())\n            x.set_perf(bmk2.ZeroPerf())\n\n        return listOfRunSpecs\n\n    def get_run_spec(self, bmkinput, config):\n        return self.get_default_run_specs(bmkinput, config)\n\n################################################################################\n\nclass BarnesHut(SharedMemApp):\n    
relativeAppPath = \"barneshut/barneshut\"\n    benchmark = \"barneshut\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds barnes hut specific arguments\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-n=500000\")\n            s.set_arg(\"-steps=1\")\n            s.set_arg(\"-seed=0\")\n        \n        return specs\n\nclass BCAsync(SharedMemApp):\n    relativeAppPath = \"betweennesscentrality/bc-async\"\n    benchmark = \"bc-async\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"BC async command line setup\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            # do 5 nodes with edges\n            s.set_arg(\"-numOfOutSources=5\") \n        \n        return specs\n\nclass BCOuter(SharedMemApp):\n    relativeAppPath = \"betweennesscentrality/betweennesscentrality-outer\"\n    benchmark = \"bc-outer\"\n\nclass BFS(SharedMemApp):\n    relativeAppPath = \"bfs/bfs\"\n    benchmark = \"bfs\"\n\nclass Boruvka(SharedMemApp):\n    relativeAppPath = \"boruvka/boruvka\"\n    benchmark = \"boruvka\"\n\nclass BoruvkaMerge(SharedMemApp):\n    relativeAppPath = \"boruvka/boruvka-merge\"\n    benchmark = \"boruvka-merge\"\n\nclass Clustering(SharedMemApp):\n    relativeAppPath = \"clustering/clustering\"\n    benchmark = \"clustering\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Clustering command line setup\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-numPoints=10000\") # num points to cluster\n        \n        return specs\n\nclass ConnectedComponents(SharedMemApp):\n    relativeAppPath = \"connectedcomponents/connectedcomponents\"\n    benchmark = \"connectedcomponents\"\n\nclass DelaunayTriangulation(SharedMemApp):\n    relativeAppPath = \"delaunaytriangulation/delaunaytriangulation\"\n    benchmark = 
\"delaunaytriangulation\"\n\nclass DelaunayTriangulationDet(SharedMemApp):\n    relativeAppPath = \"delaunaytriangulation/delaunaytriangulation-det\"\n    benchmark = \"delaunaytriangulation-det\"\n\nclass DMR(SharedMemApp):\n    relativeAppPath = \"delaunayrefinement/delaunayrefinement\"\n    benchmark = \"dmr\"\n\nclass GMetis(SharedMemApp):\n    relativeAppPath = \"gmetis/gmetis\"\n    benchmark = \"gmetis\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds gmetis specific arguments (num partitions)\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"256\") # num of partitions\n        \n        return specs\n\nclass IndependentSet(SharedMemApp):\n    relativeAppPath = \"independentset/independentset\"\n    benchmark = \"independentset\"\n\n# triggers caps for matrix completion\nMCCAP = False\n\nclass MatrixCompletionSync(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-sync\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=syncALS\") # algo type\n            s.set_arg(\"-lambda=0.001\") \n            s.set_arg(\"-learningRate=0.01\") \n            s.set_arg(\"-learningRateFunction=intel\") \n            s.set_arg(\"-tolerance=0.0001\") \n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n        \n        return specs\n\nclass MatrixCompletionSimple(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-simple\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion 
type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=simpleALS\") # algo type\n            s.set_arg(\"-lambda=0.001\") \n            s.set_arg(\"-learningRate=0.01\") \n            s.set_arg(\"-learningRateFunction=intel\") \n            s.set_arg(\"-tolerance=0.0001\") \n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n        \n        return specs\n\nclass MatrixCompletionEdge(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-edge\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=sgdBlockEdge\") # algo type\n            s.set_arg(\"-lambda=0.001\") \n            s.set_arg(\"-learningRate=0.01\") \n            s.set_arg(\"-learningRateFunction=intel\") \n            s.set_arg(\"-tolerance=0.0001\") \n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n        \n        return specs\n\nclass MatrixCompletionJump(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-jump\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=sgdBlockJump\") # algo type\n            s.set_arg(\"-lambda=0.001\") \n            s.set_arg(\"-learningRate=0.01\") \n            
s.set_arg(\"-learningRateFunction=intel\") \n            s.set_arg(\"-tolerance=0.0001\") \n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n        \n        return specs\n\nclass MatrixCompletionByItems(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-byitems\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=sgdByItems\") # algo type\n            s.set_arg(\"-lambda=0.001\") \n            s.set_arg(\"-learningRate=0.01\") \n            s.set_arg(\"-learningRateFunction=intel\") \n            s.set_arg(\"-tolerance=0.0001\") \n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n        \n        return specs\n\nclass MatrixCompletionByEdges(SharedMemApp):\n    relativeAppPath = \"matrixcompletion/matrixCompletion\"\n    benchmark = \"matrixcompletion-byedges\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds matrix completion type\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=sgdByEdges\") # algo type\n            s.set_arg(\"-lambda=0.001\") \n            s.set_arg(\"-learningRate=0.01\") \n            s.set_arg(\"-learningRateFunction=intel\") \n            s.set_arg(\"-tolerance=0.0001\") \n            s.set_arg(\"-noverify\")\n            s.set_arg(\"-useSameLatentVector\")\n            s.set_arg(\"-useDetInit\")\n            if MCCAP:\n              
s.set_arg(\"-fixedRounds=8\")\n              s.set_arg(\"-maxUpdates=8\")\n        \n        return specs\n\nclass MCM(SharedMemApp):\n    relativeAppPath = \"matching/bipartite-mcm\"\n    benchmark = \"mcm\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds bipartite matching specific arguments\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-abmpAlgo\")\n            s.set_arg(\"-inputType=generated\")\n            s.set_arg(\"-n=1000000\") # nodes in each bipartite set\n            s.set_arg(\"-numEdges=100000000\") \n            s.set_arg(\"-numGroups=10000\") \n            s.set_arg(\"-seed=0\") # seed for rng; keep it consistent\n        \n        return specs\n\n\nclass PageRankPull(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank-pull\"\n    benchmark = \"pagerank-pull\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds tolerance argument\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-tolerance=0.001\") # pagerank tolerance\n        \n        return specs\n\nclass PageRankPullTopo(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank-pull\"\n    benchmark = \"pagerank-pull-topo\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds tolerance argument and algorithm setting\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-tolerance=0.001\") # pagerank tolerance\n            s.set_arg(\"-algo=Topo\") # pagerank tolerance\n        \n        return specs\n\nclass PageRankPush(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank-push\"\n    benchmark = \"pagerank-push\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds tolerance argument\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            
s.set_arg(\"-tolerance=0.001\") # pagerank tolerance\n        \n        return specs\n\nclass PageRankPushSync(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank-push\"\n    benchmark = \"pagerank-push-sync\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds tolerance argument and algo setting\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-tolerance=0.001\") # pagerank tolerance\n            s.set_arg(\"-algo=Sync\") # pagerank tolerance\n        \n        return specs\n\n# for galois 2.2 version of pagerank\nclass PageRank2Point2(SharedMemApp):\n    relativeAppPath = \"pagerank/pagerank\"\n    benchmark = \"pagerank22\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds transpose graph.\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-graphTranspose=%s\" % bmkinput.props.ptranspose) \n            s.set_arg(\"-maxIterations=1000\")\n        \n        return specs\n\nclass PreflowPush(SharedMemApp):\n    relativeAppPath = \"preflowpush/preflowpush\"\n    benchmark = \"preflowpush\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds preflow push specific arguments\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"0\") # source id\n            s.set_arg(\"100\") # sink id\n        \n        return specs\n\nclass PointsToAnalysis(SharedMemApp):\n    relativeAppPath = \"pointstoanalysis/pta\"\n    benchmark = \"pta\"\n\nclass SpanningTree(SharedMemApp):\n    relativeAppPath = \"spanningtree/spanningtree\"  \n    benchmark = \"spanningtree\"\n\nclass SSSP(SharedMemApp):\n    relativeAppPath = \"sssp/sssp\"\n    benchmark = \"sssp\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds delta argument to runs.\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n      
  # 0 is best for twitter50\n        # 8 seems best for r4-2e26\n        for s in specs:\n            #s.set_arg(\"-delta=0\")\n            s.set_arg(\"-delta=8\")\n        \n        return specs\n\nclass SurveyPropagation(SharedMemApp):\n    relativeAppPath = \"surveypropagation/surveypropagation\"\n    benchmark = \"surveypropagation\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Adds survey prop arguments to runs.\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"9\") # random generator seed\n            s.set_arg(\"2000000\") # number of vars\n            s.set_arg(\"6000000\") # number of clauses\n            s.set_arg(\"4\") # vars per clause\n\n            # below are args used by runs on galois website\n            #s.set_arg(\"1000000\") # number of vars\n            #s.set_arg(\"3000000\") # number of clauses\n            #s.set_arg(\"3\") # vars per clause\n       \n        return specs\n\nclass TrianglesNode(SharedMemApp):\n    relativeAppPath = \"triangles/triangles\"\n    benchmark = \"triangles-node\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Specifies node version\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=nodeiterator\")\n        \n        return specs\n\nclass TrianglesEdge(SharedMemApp):\n    relativeAppPath = \"triangles/triangles\"\n    benchmark = \"triangles-edge\"\n\n    def get_run_spec(self, bmkinput, config):\n        \"\"\"Specifies edge version\"\"\"\n        specs = self.get_default_run_specs(bmkinput, config)\n\n        for s in specs:\n            s.set_arg(\"-algo=edgeiterator\")\n        \n        return specs\n\n# specification of binaries to run\n# apps present in Galois 2.2\n#BINARIES = [BarnesHut(), BFS(), BCOuter(), Boruvka(), BoruvkaMerge(), \n#            Clustering(), ConnectedComponents(), DelaunayTriangulation(), DMR(), \n#         
   GMetis(), IndependentSet(), MCM(), PageRankPull(), PageRankPush(), \n#            PreflowPush(), SpanningTree(), SSSP(), SurveyPropagation()]\n\n# single benchmark run\n#BINARIES = [MatrixCompletionSimple()]\n\nBINARIES = [BarnesHut(), BFS(), BCAsync(), BCOuter(), Boruvka(), \n            ConnectedComponents(), DelaunayTriangulation(), DMR(), \n            GMetis(), IndependentSet(), MatrixCompletionSync(), \n            MatrixCompletionSimple(), MatrixCompletionEdge(), \n            MatrixCompletionJump(), MatrixCompletionByItems(), \n            MatrixCompletionByEdges(), MCM(), PageRankPull(), PageRankPush(), \n            PreflowPush(), PointsToAnalysis(), SSSP(), SurveyPropagation(), \n            TrianglesNode(), TrianglesEdge()] \n"
  },
  {
    "path": "scripts/experimental/lonestarbmk2/defaultrunscript.sh",
    "content": "#!/bin/bash\n\nfor i in {1..3}; do python ../bmk2/test2.py --max-output-bytes 0 --log ${BMK_LOGS}/bmkrunlog${i}.log --verbose run; done \n"
  },
  {
    "path": "scripts/experimental/lonestarbmk2/lonestar.bispec",
    "content": "#v1\nbarneshut nothing\nbfs twitter40\nbc-async USA-road-d.USA\nbc-outer rmat8-2e14\nboruvka USA-road-d.USA\nboruvka-merge USA-road-d.USA\nclustering nothing\nconnectedcomponents rmat25-rsymmetric\ndelaunaytriangulation r5Mn\ndelaunaytriangulation-det r5Mn\ndmr r5M\ngmetis USA-road-d.USA\nindependentset rmat27-rsymmetric\nmatrixcompletion-sync netflix\nmatrixcompletion-simple netflix\nmatrixcompletion-edge netflix\nmatrixcompletion-jump netflix\nmatrixcompletion-byitems netflix\nmatrixcompletion-byedges netflix\nmcm nothing\npagerank22 twitter40\npagerank-pull twitter40-transpose\npagerank-pull-topo twitter40-transpose\npagerank-push twitter40\npagerank-push-sync twitter40\npreflowpush r4-2e26\npta gdb_constraints\nspanningtree r4-2e26\nsssp r4-2e26\nsurveypropagation nothing\ntriangles-node com-lj\ntriangles-edge com-lj\n"
  },
  {
    "path": "scripts/experimental/lonestarbmk2/lonestar.inputdb",
    "content": "[bmktest2]\nversion = 2\nbasepath = /net/ohm/export/iss/inputs\n\n[scalefree/rmat8-2e14.gr]\nflags =\nname = rmat8-2e14\nfile = scalefree/deprecated/rmat8-2e14.gr\nformat = bin/galois\n\n[random/r4-2e26.gr]\nflags =\nname = r4-2e26\nfile = random/r4-2e26.gr\nformat = bin/galois\n\n[random/r4-2e26.sgr]\nflags =\nname = r4-2e26-symmetric\nfile = random/symmetric/r4-2e26.gr\nformat = bin/galois\n\n[random/rmat25.rsgr]\nflags =\nname = rmat25-rsymmetric\nfile = scalefree/randomized/symmetric/rmat16-2e25-a=0.57-b=0.19-c=0.19-d=.05.srgr\nformat = bin/galois\n\n[road/USA-road-d.USA.gr]\nflags =\nname = USA-road-d.USA\nfile = road/USA-road-d.USA.gr\nformat = bin/galois\n\n[unweighted/uk-2007-05.sgr]\nflags =\nname = uk-2007-05-symmetric\nfile = unweighted/uk-2007-05.sgr\nformat = bin/galois\n\n[scalefree/symmetric/rmat26.sgr]\nflags =\nname = rmat26-symmetric\nfile = scalefree/symmetric/rmat16-2e26-a=0.57-b=0.19-c=0.19-d=.05.sgr\nformat = bin/galois\n\n[scalefree/rmat28.gr]\nflags =\nname = rmat28\nfile = scalefree/rmat16-2e28-a=0.57-b=0.19-c=0.19-d=.05.gr\nformat = bin/galois\n\n[scalefree/random/rmat27.sgr]\nflags =\nname = rmat27-rsymmetric\nfile = scalefree/randomized/symmetric/rmat16-2e27-a=0.57-b=0.19-c=0.19-d=.05.srgr\nformat = bin/galois\n\n[scalefree/random/rmat28.sgr]\nflags =\nname = rmat28-symmetric\nfile = scalefree/randomized/symmetric/rmat16-2e28-a=0.57-b=0.19-c=0.19-d=0.05.srgr\nformat = bin/galois\n\n[meshes/r5M]\nflags =\nname = r5M\nfile = meshes/r5M\nformat = mesh\n\n[meshes/r5Mn]\nflags =\nname = r5Mn\nfile = meshes/r5M.node\nformat = mesh/nodes\n\n[twitter40]\nflags =\nname = twitter40\nfile = unweighted/twitter-WWW10-component.gr\nformat = bin/galois\n\n[twitter40-transpose]\nflags =\nname = twitter40-transpose\nfile = unweighted/twitter-WWW10-component-transpose.gr\nformat = bin/galois\n\n[twitter40-symmetric]\nflags =\nname = twitter40-symmetric\nfile = unweighted/twitter-WWW10-component-symmetric.gr\nformat = 
bin/galois\n\n[twitter50]\nflags =\nname = twitter50\nfile = unweighted/twitter-ICWSM10-component.gr\nformat = bin/galois\n\n[twitter50w]\nflags =\nname = twitter50w\nfile = unweighted/withRandomWeights/twitter-ICWSM10-component_withRandomWeights.gr\nformat = bin/galois\n\n[twitter50-transpose]\nflags =\nname = twitter50-transpose\nfile = unweighted/twitter-ICWSM10-component-transpose.gr\nformat = bin/galois\n\n[soc-livejournal]\nflags =\nname = soc-livejournal\nfile = unweighted/soc-LiveJournal1.gr\nformat = bin/galois\n\n[com-lj]\nflags =\nname = com-lj\nfile = stanford/communities/LiveJournal/com-lj.wgt32.sym.gr.triangles\nformat = triangles\n\n[weighted/bipartite/yahoo]\nflags =\nname = yahoo-music\nfile = weighted/bipartite/yahoo.gr\nformat = bin/galois\n\n[weighted/bipartite/netflix]\nflags =\nname = netflix\nfile = weighted/bipartite/floatEdgeWts/netflix.gr\nformat = bin/galois\n\n[java/pta/tshark_constraints]\nflags =\nname = tshark_constraints\nfile = java/pta/tshark_constraints.txt\nformat = text\n\n[java/pta/gdb_constraints]\nflags =\nname = gdb_constraints\nfile = java/pta/gdb_constraints.txt\nformat = text\n\n[nothing]\nflags =\nname = nothing\nfile = \nformat = nothing\n"
  },
  {
    "path": "scripts/experimental/lonestarbmk2/lonestar.inputprops",
    "content": "[bmktest2-props]\nversion = 2\npaths = \n\n[soc-livejournal]\nptranspose=/net/ohm/export/iss/inputs/unweighted/soc-LiveJournal1.ptgr\n\n[twitter40]\nptranspose=/net/ohm/export/iss/inputs/unweighted/twitter-WWW10-component.ptgr\n"
  },
  {
    "path": "scripts/experimental/older/backend.pl",
    "content": "#!/usr/bin/perl\n\nuse strict;\nuse Getopt::Std;\n\nsub run_prog {\n    my $appstr = shift;\n\n    my %options=();\n    getopts(\"hr:t:s:\", \\%options);\n    \n    my $threadcount = 24;\n    my $threadstart = 1;\n    my $numruns = 9;\n\n    if (defined $options{h}) {\n\tprint \"-h      :help\\n\";\n\tprint \"-r num  :run num times\\n\";\n\tprint \"-t tmax :end at tmax threads\\n\";\n\tprint \"-s tmin :start at tmin threads\\n\";\n\texit;\n    }\n    \n    if (defined $options{r}) {\n\t$numruns = $options{r};\n    }\n    \n    if (defined $options{t}) {\n\t$threadcount = $options{t};\n\tprint \"setting threads ending point to $threadcount\\n\";\n    }\n    if (defined $options{s}) {\n\t$threadstart = $options{s};\n\tprint \"setting threads starting point to $threadstart\\n\";\n    }\n    \n    for(my $i = $threadstart; $i <= $threadcount; $i++) {\n\tprint \"THREADS: $i\\n\";\n\tmy %stats;\n\tfor (my $j = 0; $j < $numruns; $j++) {\n\t    print \"*** Executing: \" . \"$appstr -t $i\" . \"\\n\";\n\t    system(\"$appstr -t $i\");\n\t    \n\t    if ($? == -1) {\n\t\tprint \"failed to execute: $!\\n\";\n\t    } elsif ($? & 127) {\n\t\tprintf \"child died with signal %d, %s coredump\\n\",\n\t\t($? & 127),  ($? & 128) ? 'with' : 'without';\n\t    } else {\n\t\tprintf \"child exited with value %d\\n\", $? 
>> 8;\n\t    } \n\t}\n    }\n}\n\nsub vtune_prog {\n    my $appstr = shift;\n\n    my %options=();\n    getopts(\"hr:t:s:\", \\%options);\n    \n    my $threadcount = 24;\n    my $threadstart = 1;\n\n    if (defined $options{h}) {\n\tprint \"-h      :help\\n\";\n\tprint \"-t tmax :end at tmax threads\\n\";\n\tprint \"-s tmin :start at tmin threads\\n\";\n\texit;\n    }\n    \n    if (defined $options{t}) {\n\t$threadcount = $options{t};\n\tprint \"setting threads ending point to $threadcount\\n\";\n    }\n    if (defined $options{s}) {\n\t$threadstart = $options{s};\n\tprint \"setting threads starting point to $threadstart\\n\";\n    }\n    \n    for(my $i = $threadstart; $i <= $threadcount; $i++) {\n\tprint \"THREADS: $i\\n\";\n\tprint \"*** Executing: \" . \"$appstr -t $i\" . \"\\n\";\n\tsystem(\"rm -r r$i\");\n\tsystem(\"mkdir r$i\");\n\tsystem(\"/opt/intel/vtune_amplifier_xe_2011/bin64/amplxe-cl -collect nehalem_general-exploration -result-dir=r$i -start-paused -- $appstr -t $i\");\n\tsystem(\"/opt/intel/vtune_amplifier_xe_2011/bin64/amplxe-cl -R hw-events -r r$i -group-by source-line -csv-delimiter tab |perl prune_headers_line.pl > results.line.$i.csv\");\n\tsystem(\"/opt/intel/vtune_amplifier_xe_2011/bin64/amplxe-cl -R hw-events -r r$i -group-by function -csv-delimiter tab |perl prune_headers_function.pl > results.function.$i.csv\");\n    }\n}\n\nreturn 1;\nexit;\n"
  },
  {
    "path": "scripts/experimental/older/prune_headers_function.pl",
    "content": "while (<>) {\n    @line = split '\\t';\n    chomp @line;\n    $function = shift @line;\n    $function =~ s/,/_/g;\n    $module = shift @line;\n    $proc = shift @line;\n    $pid = shift @line;\n    print \"\\\"$module:$function\\\",\" . join(',', @line) . \"\\n\";\n}\n\n\n\n"
  },
  {
    "path": "scripts/experimental/older/prune_headers_line.pl",
    "content": "while (<>) {\n    @line = split '\\t';\n    chomp @line;\n    $file = shift @line;\n    $path = shift @line;\n    $line = shift @line;\n    $module = shift @line;\n    $proc = shift @line;\n    $pid = shift @line;\n    print \"\\\"$file:$line\\\",\" . join(',', @line) . \"\\n\";\n}\n\n\n\n"
  },
  {
    "path": "scripts/experimental/older/report.pl",
    "content": "use List::Util qw(sum reduce);\n\n# Gather: collect each STAT SINGLE value, grouped by the thread count of the run it belongs to\nmy $curthread = 0;\n\nwhile(<>) {\n    if (/RUN: Variable Threads = (\\d+)/) {\n\t$curthread = $1;\n    }\n    if (/STAT SINGLE (\\w+)\\s+\\(null\\)\\s+(\\d+)/) {\n\tpush(@{$stats{$curthread}{\"$1\"} }, $2);\n\t$k{\"$1\"} = 1;\n    }\n}\n\n# Output: CSV with one column per thread count; each statistic gets a row of means and a row of population standard deviations\nforeach my $th (sort { $a <=> $b } keys %stats) {\n    print \",$th\";\n} \nprint \"\\n\";\nforeach my $st (sort keys %k) {\n    print \"$st\";\n    foreach my $th (sort { $a <=> $b } keys %stats) {\n\t@values = @{$stats{$th}{$st}};\n\tif (@values) {\n\t    my $avg = sum(@values)/@values;\n\t    print \",$avg\";\n\t} else {\n\t    print \",0\";\n\t}\n    }\n    print \"\\n\";\n    print \"$st Stdev\";\n    foreach my $th (sort { $a <=> $b } keys %stats) {\n\t@values = @{$stats{$th}{$st}};\n\tif (@values) {\n\t    my $avg = sum(@values)/@values;\n\t    my $stdev = reduce {$a + ($b - $avg) * ($b - $avg)} 0, @values;\n\t    $stdev = $stdev / @values;\n\t    $stdev = sqrt($stdev);\n\t    print \",$stdev\";\n\t} else {\n\t    print \",0\";\n\t}\n    }\n    print \"\\n\";\n}\n\n"
  },
  {
    "path": "scripts/experimental/older/report_vtune.pl",
    "content": "while (@ARGV) {\n    my $arg = shift @ARGV;\n    #work out thread id\n    $arg =~ /\\w+\\.(\\d+)\\.\\w+/;\n    $thread = $1;\n    $thread_keys{$thread} = 1;\n    #print \"$arg: $thread\\n\";\n    \n    #open file\n    open FILE, \"<$arg\";\n    $h = <FILE>;\n    @H = split ',', $h;\n    foreach $hh (@H) {\n\t$name_keys{$hh} = 1;\n    }\n    while ($l = <FILE>) {\n\t@L = split ',', $l;\n\tchomp @L;\n\t$line_keys{$L[0]} = 1;\n\tfor($i = 1; $i < @L; $i++) {\n\t    $stats{$H[$i]}{$L[0]}{$thread} = $L[$i];\n\t    #print \"$H[$i] $L[0] $L[$i]\\n\";\n\t}\n    }\n}\n\nforeach $nk (sort keys %name_keys) {\n    print \"$nk\";\n    foreach $tk (sort { $a <=> $b } keys %thread_keys) {\n\tprint \",$tk\";\n    }\n    print \"\\n\";\n    foreach $lk (sort keys %line_keys) {\n\tprint \"$lk\";\n\tforeach $tk (sort { $a <=> $b } keys %thread_keys) {\n\t    print \",\" . $stats{$nk}{$lk}{$tk};\n\t}\n\tprint \"\\n\";\n    }\n    print \"\\n\\n\\n\";\n}\n"
  },
  {
    "path": "scripts/experimental/older/run_boruvka.pl",
    "content": "#!/usr/bin/perl\n\nrequire \"backend.pl\";\n# ../inputs/sssp/USA-road-d.W.structure\nrun_prog(\"../debug/apps/boruvka/boruvka /net/faraday/workspace/inputs/weighted/random4-26.structure 1 2\");\n"
  },
  {
    "path": "scripts/experimental/older/run_clustering.pl",
    "content": "#!/usr/bin/perl\n\nrequire \"backend.pl\";\nrun_prog(\"../debug/apps/clustering/boruvka 10000\");\n"
  },
  {
    "path": "scripts/experimental/older/run_delaunayrefinement.pl",
    "content": "#!/usr/bin/perl\n\nrequire \"backend.pl\";\n\nrun_prog(\"../apps/delaunayrefinement/delaunayrefinement ../inputs/delaunayrefinement/test.1\");\n"
  },
  {
    "path": "scripts/experimental/older/run_sssp.pl",
    "content": "#!/usr/bin/perl\n\nrequire \"backend.pl\";\n# ../inputs/sssp/USA-road-d.W.structure\nrun_prog(\"../apps/sssp/sssp /net/faraday/workspace/inputs/weighted/random4-26.structure 1 2\");\n"
  },
  {
    "path": "scripts/experimental/older/vtune_sssp.pl",
    "content": "#!/usr/bin/perl\n\nrequire \"backend.pl\";\n\nvtune_prog(\"../apps/sssp/sssp /net/faraday/workspace/inputs/weighted/random4-24.structure 1 2\");\n"
  },
  {
    "path": "scripts/experimental/pangolin/batch_verify.sh",
    "content": "#!/bin/bash\n\n# all benchmarks\nEXECS=( \"tc\" \"kcl\" \"motif\" \"fsm\" )\n#EXECS=( \"tc\" \"kcl\" \"fsm\" )\n\n#INPUTS=( \"mico\" \"patent\" \"youtube\" )\nINPUTS=( \"citeseer\" )\n#INPUTS=( \"livej\" \"orkut\" )\n\ncurrent_dir=$(dirname \"$0\")\nfor input in \"${INPUTS[@]}\"; do\n  for EXEC in \"${EXECS[@]}\"; do\n    $current_dir/verify.sh ${EXEC} ${input}\n  done\ndone\n\n"
  },
  {
    "path": "scripts/experimental/pangolin/fsm.citeseer.2.300",
    "content": "7\n"
  },
  {
    "path": "scripts/experimental/pangolin/fsm.citeseer.2.500",
    "content": "3\n"
  },
  {
    "path": "scripts/experimental/pangolin/fsm.patent.2.1000",
    "content": "20275\n"
  },
  {
    "path": "scripts/experimental/pangolin/fsm.patent.2.300",
    "content": "21756\n"
  },
  {
    "path": "scripts/experimental/pangolin/fsm.patent.2.500",
    "content": "21480\n"
  },
  {
    "path": "scripts/experimental/pangolin/fsm.patent.2.5000",
    "content": "5933\n"
  },
  {
    "path": "scripts/experimental/pangolin/kcl.citeseer.4",
    "content": "255\n"
  },
  {
    "path": "scripts/experimental/pangolin/kcl.citeseer.5",
    "content": "46\n"
  },
  {
    "path": "scripts/experimental/pangolin/kcl.mico.4",
    "content": "514864225\n"
  },
  {
    "path": "scripts/experimental/pangolin/kcl.mico.5",
    "content": "19246558419\n"
  },
  {
    "path": "scripts/experimental/pangolin/kcl.patent.3",
    "content": "6913764\n"
  },
  {
    "path": "scripts/experimental/pangolin/kcl.patent.4",
    "content": "3310556\n"
  },
  {
    "path": "scripts/experimental/pangolin/kcl.patent.5",
    "content": "2976152\n"
  },
  {
    "path": "scripts/experimental/pangolin/motif.citeseer.3",
    "content": "1166\n23380\n"
  },
  {
    "path": "scripts/experimental/pangolin/motif.citeseer.4",
    "content": "111153\n222630\n3094\n22900\n2200\n255\n"
  },
  {
    "path": "scripts/experimental/pangolin/motif.mico.3",
    "content": "12534960\n53546459\n"
  },
  {
    "path": "scripts/experimental/pangolin/motif.mico.4",
    "content": "4070868075\n2307847995\n33929353\n3591944265\n437985111\n514864225\n"
  },
  {
    "path": "scripts/experimental/pangolin/motif.patent.3",
    "content": "6913764\n267600153\n"
  },
  {
    "path": "scripts/experimental/pangolin/motif.patent.4",
    "content": "5764763466\n5148841859\n227197040\n497680804\n55988120\n3310556\n"
  },
  {
    "path": "scripts/experimental/pangolin/result_checker.py",
    "content": "#!/usr/bin/python3\n\nimport sys\nimport glob\n\nif __name__ == \"__main__\":\n\n\tapplication = sys.argv[1]\n\tinput_graph = sys.argv[2]\n\tsize = sys.argv[3]\n\tminsup = sys.argv[4]\n\tlog_filename = sys.argv[5]\n\tout_filename = \"/net/ohm/export/iss/pangolin-outputs/\" + application + \".\" + input_graph \n\tif application != \"tc\":\n\t\tout_filename = out_filename + \".\" + size\n\tif application == \"fsm\":\n\t\tout_filename = out_filename + \".\" + minsup\n\t\n\tlog = open(log_filename, 'r')\n\tif application == \"motif\":\n\t\tres = open(\"result.txt\", \"w\")\n\t\tfor line in log:\n\t\t\tif line.find(\"triangles\")!=-1 or line.find(\"wedges\")!=-1 or line.find(\"4-paths\")!=-1 or line.find(\"3-stars\")!=-1 or line.find(\"4-cycles\")!=-1 or line.find(\"tailed-triangles\")!=-1 or line.find(\"diamonds\")!=-1 or line.find(\"4-cliques\")!=-1:\n\t\t\t\t#print(line.split(' ')[-1], file=res)\n\t\t\t\tres.write(line.split(' ')[-1])\n\t\tres.close()\n\t\tsame = True\n\t\twith open(out_filename) as out, open(\"result.txt\") as res:\n\t\t\tfor l1, l2 in zip(out, res):\n\t\t\t\tif l1 != l2:\n\t\t\t\t\tsame = False\n\t\t\t\t\tprint(\"truth is \" + l1 + \", but your answer is \" + l2)\n\t\t\t\t\tbreak\n\t\tif same:\n\t\t\tprint(\"SUCCESS\\n\")\n\telse:\n\t\tnum = 0\n\t\tfor line in log:\n\t\t\tif line.find(\"total_num\")!=-1:\n\t\t\t\tnum = int(line.split(' ')[-1])\n\t\t\t\t#print(num)\n\t\tlog.close()\n\n\t\tout = open(out_filename, \"r\")\n\t\tfor line in out:\n\t\t\ttruth = int(line)\n\t\t\tif num == truth:\n\t\t\t\tprint(\"SUCCESS\\n\")\n\t\t\telse:\n\t\t\t\tprint(\"truth is \" + str(truth) + \", but your answer is \" + str(num))\n\tout.close()\n\n"
  },
  {
    "path": "scripts/experimental/pangolin/verify.sh",
    "content": "#!/bin/bash\n# Usage: ./verify.sh <ABELIAN_EXECUTABLE_NAME> <INPUT_GRAPH_NAME>\n\nexecname=$1\ninputname=$2\noption=$3\nbin=${execname}\nif [[ $execname == *\"tc\"* ]]; then\n  bin=\"tc_mine\"\nfi\nexecdirname=\"./${execname}\"\nNTHREADS=\"56\"\nMINSUP=\"500\"\nEXEC=${execdirname}/${bin}\ninputdirname=/net/ohm/export/iss/inputs/Mining\noutputdirname=/net/ohm/export/iss/pangolin-outputs\n\nfiletype=gr\nextension=csgr\nif [[ $execname == *\"fsm\"* ]]; then\n  filetype=adj\n  extension=sadj\nfi\n\nIFS='_' read -ra EXECP <<< \"$execname\"\nproblem=${EXECP[0]}\n\nSIZES=\"3\"\nif [[ $execname == *\"fsm\"* ]]; then\n  SIZES=\"2\"\nfi\n\nif [[ $execname == *\"kcl\"* ]]; then\n  SIZES=\"4 5\"\nfi\n\nif [[ $execname == *\"motif\"* ]]; then\n  SIZES=\"3 4\"\nfi\n\nFLAGS=\nif [[ $execname == *\"fsm\"* ]]; then\n  FLAGS=\"-ms=$MINSUP\"\nfi\n\n#FLAGS+=\" -t=56\"\nOUTPUT=${outputdirname}/${inputname}.${problem}.$K\nINPUT=${inputdirname}/${inputname}.${extension}\nchecker=${outputdirname}/result_checker.py\npass=0\nfail=0\nfailed_cases=\"\"\ncheck_output=\"my-output.log\"\n\n\nfor K in $SIZES; do\n\tfor NT in $NTHREADS; do\n\t\tLOG=${execname}-${inputname}-$K-$NT.log\n\t\techo \"${EXEC} $filetype ${INPUT} -t=$NT -k=$K $FLAGS -v > $LOG\"\n\t\teval \"${EXEC} $filetype ${INPUT} -t=$NT -k=$K $FLAGS -v\" > $LOG 2>> error.log\n\t\techo \"python $checker ${execname} ${inputname} $K $MINSUP $LOG &> ${check_output}\"\n\t\teval \"python $checker ${execname} ${inputname} $K $MINSUP $LOG &> ${check_output}\"\n\t\t#cat ${check_output}\n\t\tif ! 
grep -q \"SUCCESS\" ${check_output} ; then\n\t\t\tlet fail=fail+1\n\t\t\tfailed_cases+=\"${execname} ${inputname} k=$K t=$NT\"\n\t\telse\n\t\t\tlet pass=pass+1\n\t\tfi\n\t\trm -f ${check_output}\n\tdone\ndone\n\n\necho \"---------------------------------------------------------------------------------------\"\necho \"Algorithm: \" $execname\necho \"Input: \" $inputname\necho \"Runtime option: \" $option\necho $pass \"passed test cases\"\nif [[ $fail == 0 ]] ; then\n  echo \"Status: SUCCESS\"\nelse\n  echo $fail \"failed test cases:\" $failed_cases\n  echo \"Status: FAILED\"\nfi\necho \"---------------------------------------------------------------------------------------\"\n\n"
  },
  {
    "path": "scripts/experimental/runBFS.sh",
    "content": "#!/bin/bash\n\ndeclare -A inputsMap\n\ninputsMap[\"r4\"]=\"/net/ohm/export/iss/inputs/random/r4-2e26.gr\"\ninputsMap[\"rmat\"]=\"/net/ohm/export/iss/inputs/scalefree/rmat16-2e26-a=0.57-b=0.19-c=0.19-d=.05.gr\"\ninputsMap[\"twitter\"]=\"/net/ohm/export/iss/inputs/unweighted/twitter-WWW10-component.gr\"\n\n\nserialAlgos=\"SerialSync Serial\"\nserialRep=\"`seq 1 3`\"\ntag=${tag=\"tag\"}\n\nfor algo in $serialAlgos; do \n  for input in \"${!inputsMap[@]}\"; do \n    for i in 1 2 3; do \n      ./lonestar/bfs/bfs -algo=${algo}  \"${inputsMap[$input]}\" -noverify  ; \n    done 2>&1 | tee bfs-${tag}-${algo}-${input}.log \n  done\ndone\n\nparallelAlgos=\"Async Sync Sync2p\"\nthreads=\"1 `seq 5 5 40`\"\n\nfor algo in $parallelAlgos; do\n  for input in \"${!inputsMap[@]}\"; do \n    for t in $threads; do\n      ./lonestar/bfs/bfs -algo=${algo}  \"${inputsMap[$input]}\" -noverify  -t $t; \n    done 2>&1 | tee bfs-${tag}-${algo}-${input}.log \n  done\ndone\n\n"
  },
  {
    "path": "scripts/experimental/runSSSP.sh",
    "content": "#!/bin/bash\n\ndeclare -A inputsMap\n\ninputsMap[\"r4\"]=\"/net/ohm/export/iss/inputs/random/r4-2e26.gr\"\ninputsMap[\"rmat\"]=\"/net/ohm/export/iss/inputs/scalefree/rmat16-2e26-a=0.57-b=0.19-c=0.19-d=.05.gr\"\ninputsMap[\"road\"]=\"/net/ohm/export/iss/inputs/road/osm-eur-karlsruhe.gr\"\n\n\napp=\"./lonestar/sssp/sssp\"\n# serialAlgos=\"dijkstra serDelta serDeltaTiled\"\nserialAlgos=\"serDelta serDeltaTiled\"\nserialRep=\"`seq 1 3`\"\ntag=${tag=\"tag\"}\n\nfor algo in $serialAlgos; do \n  for input in \"${!inputsMap[@]}\"; do \n    for i in 1 2 3; do \n      ${app} -algo=${algo}  \"${inputsMap[$input]}\" -noverify  ; \n    done 2>&1 | tee $(basename ${app})-${tag}-${algo}-${input}.log \n  done\ndone\n\nparallelAlgos=\"deltaStep deltaTiled\"\nthreads=\"1 `seq 5 5 40`\"\n# threads=\"40\"\n\nfor algo in $parallelAlgos; do\n  for input in \"${!inputsMap[@]}\"; do \n    for t in $threads; do\n      ${app} -algo=${algo}  \"${inputsMap[$input]}\" -noverify  -t $t; \n    done 2>&1 | tee $(basename ${app})-${tag}-${algo}-${input}.log \n  done\ndone\n\n"
  },
  {
    "path": "scripts/find_ifdefs.sh",
    "content": "find $* -name '*.h' -o -name '*.cpp'  \\\n  | xargs grep --no-filename '#if' \\\n  | awk '{print $2;}' | sort | uniq\n"
  },
  {
    "path": "scripts/galois_log_parser.R",
    "content": "#!/usr/bin/env Rscript\n\n#######################################################\n# Author: Gurbinder Gill\n# Email:  gill@cs.utexas.edu\n# Date:   Oct 8, 2017\n######################################################\nlibrary(\"optparse\")\nlibrary('data.table')\n\n####START: @function to parse commadline##################\n# Parses the command line to get the arguments used\nparseCmdLine <- function (logData, isSharedMemGaloisLog, graphPassedAsInput) {\n  #cmdLineRow <- subset(logData, CATEGORY == \"CommandLine\"& TOTAL_TYPE != \"HostValues\")\n  cmdLineRow <- subset(logData, CATEGORY == \"CommandLine\" & STAT_TYPE == \"PARAM\")\n\n  ## Distributed has extra column: HostID\n  if(isTRUE(isSharedMemGaloisLog)){\n    cmdLine <- substring(cmdLineRow[,5], 0)\n  }\n  else\n    cmdLine <- substring(cmdLineRow[,6], 0)\n\n  cmdLineSplit = strsplit(cmdLine, \"\\\\s+\")[[1]]\n\n  deviceKind = \"CPU\"\n  if(!isTRUE(isSharedMemGaloisLog)){\n    ## To check the device kind\n    pos = regexpr('-pset', cmdLineSplit)\n    deviceKind = \"\"\n    if(sum(pos>0) > 0){\n      deviceKind = \"GPU\"\n    } else {\n      deviceKind = \"CPU\"\n    }\n  }\n\n  ## First postitional argument is always name of the executable\n  ### WORKING: split the exePath name found at the position 1 of the argument list and split on \"/\".\n  exePathSplit <- strsplit(cmdLineSplit[1], \"/\")[[1]]\n  benchmark <- exePathSplit[length(exePathSplit)]\n\n  ## subset the threads row from the table\n  numThreads <- (subset(logData, CATEGORY == \"Threads\" & TOTAL_TYPE != \"HostValues\"))$TOTAL\n\n  input = \"noInput\"\n  if(isTRUE(graphPassedAsInput)){\n    ## subset the input row from the table\n    inputPath <- (subset(logData, CATEGORY == \"Input\" & STAT_TYPE == \"PARAM\"))$TOTAL\n    print(inputPath)\n    if(!identical(inputPath, character(0))){\n      #inputPath = cmdLineSplit[3]\n      #print(cmdLineSplit[3])\n      inputPathSplit <- strsplit(inputPath, \"/\")[[1]]\n      input <- 
inputPathSplit[length(inputPathSplit)]\n    }\n    else {\n      inputPathSplit <- strsplit(inputPath[[2]], \"/\")[[1]]\n      input <- inputPathSplit[length(inputPathSplit)]\n    }\n    ### This is to remore the extension for example .gr or .sgr\n    inputsplit <- strsplit(input, \"[.]\")[[1]]\n    if(length(inputsplit) > 1) {\n      input <- inputsplit[1]\n    }\n  }\n\n  if(isTRUE(isSharedMemGaloisLog)){\n    returnList <- list(\"benchmark\" = benchmark, \"input\" = input, \"numThreads\" = numThreads, \"deviceKind\" = deviceKind)\n    return(returnList)\n  }\n\n ## Need more params for distributed galois logs\n numHosts <- (subset(logData, CATEGORY == \"Hosts\"& TOTAL_TYPE != \"HostValues\"))$TOTAL\n\n partitionScheme <- (subset(logData, CATEGORY == \"PartitionScheme\"& TOTAL_TYPE != \"HostValues\"))$TOTAL\n\n runID <- (subset(logData, CATEGORY == \"Run_UUID\"& TOTAL_TYPE != \"HostValues\"))$TOTAL\n\n numIterations <- (subset(logData, CATEGORY == \"NumIterations_0\"& TOTAL_TYPE != \"HostValues\"))$TOTAL\n #If numIterations is not printed in the log files\n if(identical(numIterations, character(0))){\n   numIterations <- 0\n }\n\n ## returnList for distributed galois log\n returnList <- list(\"runID\" = runID, \"benchmark\" = benchmark, \"input\" = input, \"partitionScheme\" = partitionScheme, \"hosts\" = numHosts , \"numThreads\" = numThreads, \"iterations\" = numIterations, \"deviceKind\" = deviceKind)\n return(returnList)\n}\n#### END: @function to parse commadline ##################\n\n#### START: @function to values of timers for shared memory galois log ##################\n# Parses to get the timer values\ngetTimersShared <- function (logData, benchmark) {\n  totalTimeRow <- subset(logData, CATEGORY == \"Time\" & REGION == \"(NULL)\")\n  totalTime <- totalTimeRow$TOTAL\n  print(paste(\"totalTime:\", totalTime))\n returnList <- list(\"totalTime\" = totalTime)\n return(returnList)\n}\n#### END: @function to values of timers for shared memory galois log 
##################\n\n#### START: @function to values of timers for distributed memory galois log ##################\n# Parses to get the timer values\ngetTimersDistributed <- function (logData) {\n\n ## Total time including the graph construction and initialization\n totalTime <- (subset(logData, CATEGORY == \"TimerTotal\" & TOTAL_TYPE != \"HostValues\")$TOTAL)\n print(paste(\"totalTime:\", totalTime))\n\n ## Taking mean of all the runs\n totalTimeExecMean <- round(mean(as.numeric(subset(logData, grepl(\"Timer_[0-9]+\", CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)), digits = 2)\n print(paste(\"totalTimeExecMean:\", totalTimeExecMean))\n\n ## To get the name of benchmark to be used with other queries to get right timers.\n ### It assumes that there will always with Timer_0 with REGION name as benchmark\n ### name used with other queries.\n benchmarkRegionName <- subset(logData, CATEGORY == \"Timer_0\" & TOTAL_TYPE != \"HostValues\")$REGION\n print(paste(\"benchmark:\", benchmarkRegionName))\n\n ## Number of runs\n numRuns <- as.numeric((subset(logData, CATEGORY == \"Runs\" & TOTAL_TYPE != \"HostValues\"))$TOTAL)\n print(paste(\"numRuns:\", numRuns))\n\n ## Total compute time (galois::do_alls)\n computeTimeMean <- 0\n if(benchmarkRegionName == \"BC\"){\n   regions <- c(\"SSSP\", \"InitializeIteration\", \"PredAndSucc\", \"NumShortestPathsChanges\", \"NumShortestPaths\", \"PropagationFlagUpdate\", \"DependencyPropChanges\", \"DependencyPropagation\", \"BC\")\n   for( region in regions){\n     print(region)\n     computeTimeRows <- subset(logData, grepl(paste(\"^\", region, \"$\", sep=\"\"), REGION) & CATEGORY == \"Time\" & TOTAL_TYPE == \"HMAX\")$TOTAL\n     #computeTimeRows <- subset(logData, grepl(paste(\"CUDA_DO_ALL_IMPL_\", region, \"$\", sep=\"\"), CATEGORY) & TOTAL_TYPE == \"HMAX\")$TOTAL\n     if(!identical(computeTimeRows, character(0))){\n       print(paste(region, \" : time :  \", as.numeric(computeTimeRows)))\n       computeTimeMean = computeTimeMean + 
round(as.numeric(computeTimeRows)/numRuns, digits = 2)\n     }\n   }\n }\n else {\n   computeTimePerIter <- numeric(numRuns)\n   for(i in 1:(numRuns)) {\n     j = i - 1 #Vectors are 1 indexed in r\n     computeTimeRows <- subset(logData, grepl(paste(\"^\", benchmarkRegionName, \"_\", j, \"_[0-9]+\", sep=\"\"), REGION) & TOTAL_TYPE != \"HostValues\")$TOTAL\n     #computeTimeRows <- subset(logData, grepl(paste(\"CUDA_DO_ALL_IMPL_\", benchmarkRegionName, \"_\", j, \"_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL\n     if(!identical(computeTimeRows, character(0))){\n       computeTimePerIter[i] <- sum(as.numeric(computeTimeRows))\n     }\n   }\n   computeTimeMean <- (mean(computeTimePerIter))\n\n   if(computeTimeMean == 0){\n     computeTimeMean <- round(mean(as.numeric(subset(logData, grepl(paste(\"^\", benchmarkRegionName, \"_[0-9]+\", sep=\"\"), REGION) & TOTAL_TYPE != \"HostValues\")$TOTAL)), digits = 2)\n   }\n }\n print(paste(\"computeTimeMean:\", computeTimeMean))\n\n ##Total sync time.\n syncTimePerIter <- numeric(numRuns)\n syncTimeMean <- 0\n if(benchmarkRegionName == \"BC\"){\n   regions <- c(\"SSSP\", \"InitializeIteration\", \"PredAndSucc\", \"NumShortestPathsChanges\", \"NumShortestPaths\", \"PropagationFlagUpdate\", \"DependencyPropChanges\", \"DependencyPropagation\", \"BC\")\n   for(i in 1:(numRuns)) {\n     for(region in regions){\n       syncTimeRows <- subset(logData, grepl(paste(\"Sync_\", region, \"_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL\n       if(!identical(syncTimeRows, character(0))){\n         #print(region)\n         syncTimeMean <- syncTimeMean + round(mean(as.numeric(syncTimeRows)), digits = 2)\n       }\n     }\n   }\n }\n else{\n   for(i in 1:(numRuns)) {\n     j = i - 1 #Vectors are 1 indexed in r\n     syncTimeRows <- subset(logData, grepl(paste(\"Sync_\", benchmarkRegionName, \"_\", j, \"_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL\n     
if(!identical(syncTimeRows, character(0))){\n       syncTimePerIter[i] <- sum(as.numeric(syncTimeRows))\n     }\n   }\n   syncTimeMean <- (mean(syncTimePerIter))\n   if(syncTimeMean == 0) {\n     syncTimeMean <- round(mean(as.numeric(subset(logData, grepl(paste(\"Sync_\", benchmarkRegionName, \"_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)), digits = 2)\n   }\n }\n print(paste(\"syncTimeMean\", syncTimeMean))\n\n\n ## Mean time spent in the implicit barrier: DGReducible\n barrierTimePerIter <- numeric(numRuns)\n for(i in 1:(numRuns)) {\n  j = i - 1 #Vectors are 1 indexed in r\n  barrierTimeRows <- subset(logData, REGION ==\"DGReducible\" & grepl(paste( \"ReduceDGAccum_\", j, \"_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL\n  if(!identical(barrierTimeRows, character(0))){\n    barrierTimePerIter[i] <- sum(as.numeric(barrierTimeRows))\n  }\n }\n barrierTimeMean <- (mean(barrierTimePerIter))\n if(barrierTimeMean == 0) {\n  barrierTimeMean <- round(mean(as.numeric(subset(logData, REGION ==\"DGReducible\" & grepl(paste( \"ReduceDGAccum_[0-9]*\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)), digits = 2)\n }\n print(paste(\"barrierTimeMean:\", barrierTimeMean))\n\n ## Total bytes sent in reduce and broadcast phase in run 0.\n ### Same number of bytes are being sent in all the runs.\n syncBytes <- 0\n if(benchmarkRegionName == \"BC\"){\n   regions <- c(\"SSSP\", \"InitializeIteration\", \"PredAndSucc\", \"NumShortestPathsChanges\", \"NumShortestPaths\", \"PropagationFlagUpdate\", \"DependencyPropChanges\", \"DependencyPropagation\", \"BC\")\n   for(region in regions){\n     sendBytesRegion <- sum(as.numeric(subset(logData, grepl(paste(\"[Reduce|Broadcast]SendBytes_\", region, \"_0\", sep=\"\"), CATEGORY) & TOTAL_TYPE == \"HSUM\")$TOTAL))\n     print(paste(region, \" : \", sendBytesRegion))\n     syncBytes <- syncBytes + sendBytesRegion \n     print(syncBytes)\n   }\n }\n else {\n   syncBytes <- 
sum(as.numeric(subset(logData, grepl(paste(\"[Reduce|Broadcast]SendBytes_\", benchmarkRegionName, \"_0\", sep=\"\"), CATEGORY)& TOTAL_TYPE == \"HSUM\")$TOTAL))\n }\n print(paste(\"syncBytes:\", syncBytes))\n\n ##Graph construction time\n graphConstructTime <- subset(logData, CATEGORY == \"GraphConstructTime\" & TOTAL_TYPE != \"HostValues\")$TOTAL\n print(paste(\"graphConstructTime:\", graphConstructTime))\n\n ## Replication factor\n replicationFactor <- subset(logData, CATEGORY == \"ReplicationFactor\" & TOTAL_TYPE != \"HostValues\")$TOTAL\n print(paste(\"replicationFactor:\", replicationFactor))\n #if(is.null(replicationFactor)){\nif(identical(replicationFactor, character(0))){\n   replicationFactor <- 0\n }\n\n ## Communication memory usage: Max and Min.\n communicationMemUsageMax = as.numeric(subset(logData, CATEGORY == \"CommunicationMemUsageMax\" & TOTAL_TYPE == \"HMAX\")$TOTAL)\n communicationMemUsageMin = as.numeric(subset(logData, CATEGORY == \"CommunicationMemUsageMin\" & TOTAL_TYPE == \"HMIN\")$TOTAL)\n\n if(identical(communicationMemUsageMax, numeric(0)) || identical(communicationMemUsageMin, numeric(0))){\n   communicationMemUsageMax = 0\n   communicationMemUsageMin = 0\n   print(\"Printing Memory usage counter not present.\")\n }\n\n returnList <- list(\"replicationFac\" = replicationFactor, \"totalTime\" = totalTime, \"totalTimeExec\" = totalTimeExecMean, \"computeTime\" = computeTimeMean, \"syncTime\" = syncTimeMean, \"barrierTime\" = barrierTimeMean, \"syncBytes\" = syncBytes, \"graphConstructTime\"= graphConstructTime, \"communicationMemUsageMax\" = communicationMemUsageMax, \"communicationMemUsageMin\" = communicationMemUsageMin)\n print(length(returnList))\n return(returnList)\n}\n#### END: @function to values of timers for distributed memory galois log ##################\n\n#### START: @function to compute per iteration communication volume. 
##################\n# Parses to get the timer values\ncomputePerIterVolume <- function (logData, paramList, output) {\n  numIter = as.numeric(paramList[\"iterations\"])\n  print(numIter)\n\n  benchmarkRegionName <- subset(logData, CATEGORY == \"Timer_0\" & TOTAL_TYPE != \"HostValues\")$REGION\n  print(paste(\"benchmark:\", benchmarkRegionName))\n\n  ## Number of runs\n  numRuns <- as.numeric((subset(logData, CATEGORY == \"Runs\" & TOTAL_TYPE != \"HostValues\"))$TOTAL)\n  print(paste(\"numRuns:\", numRuns))\n\n  output_perIterVol_file <- paste(output, \"_perIterVolume\", sep=\"\")\n  output_perIterVolRangePercentage_file <- paste(output, \"_perIterVolumeRangePercentage\", sep=\"\")\n\n  ## Doing 1st iteration separately to see if new file is to be created or if file already exists.\n  #STAT, 0, dGraph, REDUCE_SEND_BYTES_BFS_0_0, HSUM, 23587108\n  ## To collect the data points in separate ranges of data volume\n  low = 0\n  medium = 0\n  high = 0\n\n  for(r in 0:(numRuns - 1)){\n    commVolumeRow <- subset(logData, grepl(paste(\"SEND_BYTES_\", benchmarkRegionName, \"_\", r, \"_\", 0, \"$\" , sep=\"\"), CATEGORY) & TOTAL_TYPE == \"HSUM\")$TOTAL\n    #print(commVolumeRow)\n    if(!identical(commVolumeRow, character(0))){\n      print(commVolumeRow)\n      totalCommVolSentPerIter <- sum(as.numeric(commVolumeRow))\n      vol = totalCommVolSentPerIter/(1024*1024)\n      if(vol <= 100 )\n        low = low + 1\n      else if(vol > 100 && vol <= 1000)\n        medium = medium + 1\n      else if(vol > 1000)\n        high = high + 1\n\n      commVolList <- list(\"run\" = r, \"iter\" = 0, \"sendBytesPerIter\" = totalCommVolSentPerIter)\n      outDataList <- append(paramList, commVolList)\n      if(!file.exists(output_perIterVol_file)){\n        print(paste(output_perIterVol_file, \"Does not exist. 
Creating new file to record per iteration volume\"))\n        write.csv(as.data.frame(outDataList), file=output_perIterVol_file, row.names=F, quote=F)\n      } else {\n        print(paste(\"Appending data to the existing file\", output_perIterVol_file))\n        write.table(as.data.frame(outDataList), file=output_perIterVol_file, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n      }\n      print(totalCommVolSentPerIter)\n    }\n  }\n\n  for(i in 1:(numIter - 1)) {\n    for(r in 0:(numRuns - 1)){\n      commVolumeRow <- subset(logData, grepl(paste(\"SEND_BYTES_\", benchmarkRegionName, \"_\", r, \"_\", i, \"$\" ,sep=\"\"), CATEGORY) & TOTAL_TYPE == \"HSUM\")$TOTAL\n      if(!identical(commVolumeRow, character(0))){\n        #print(commVolumeRow)\n        totalCommVolSentPerIter <- sum(as.numeric(commVolumeRow))\n        vol = totalCommVolSentPerIter/(1024*1024)\n        if(vol <= 100 )\n          low = low + 1\n        else if(vol > 100 && vol <= 1000)\n          medium = medium + 1\n        else if(vol > 1000)\n          high = high + 1\n\n        commVolList <- list(\"run\" = r, \"iter\" = i, \"sendBytesPerIter\" = totalCommVolSentPerIter)\n        outDataList <- append(paramList, commVolList)\n        write.table(as.data.frame(outDataList), file=output_perIterVol_file, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n        #print(totalCommVolSentPerIter)\n      }\n    }\n  }\n\n\n  totalNumber <- low + medium + high\n  if(!file.exists(output_perIterVolRangePercentage_file)){\n    print(paste(output_perIterVolRangePercentage_file, \"Does not exist. 
Creating new file to record per iteration volume in ranges\"))\n    rangeList_low <- list(\"rangeLabel\" = \"low\", \"value\" = low, \"total\" = totalNumber)\n    outDataList <- append(paramList, rangeList_low)\n    write.csv(as.data.frame(outDataList), file=output_perIterVolRangePercentage_file, row.names=F, quote=F)\n\n    rangeList_medium <- list(\"rangeLabel\" = \"medium\", \"value\" = medium, \"total\" = totalNumber)\n    outDataList <- append(paramList, rangeList_medium)\n    write.table(as.data.frame(outDataList), file=output_perIterVolRangePercentage_file, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n\n    rangeList_high <- list(\"rangeLabel\" = \"high\", \"value\" = high, \"total\" = totalNumber)\n    outDataList <- append(paramList, rangeList_high)\n    write.table(as.data.frame(outDataList), file=output_perIterVolRangePercentage_file, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n  } else {\n\n    print(paste(\"Appending data to the existing file\", output_perIterVolRangePercentage_file))\n\n    rangeList_low <- list(\"rangeLabel\" = \"low\", \"value\" = low, \"total\" = totalNumber)\n    outDataList <- append(paramList, rangeList_low)\n    write.table(as.data.frame(outDataList), file=output_perIterVolRangePercentage_file, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n\n    rangeList_medium <- list(\"rangeLabel\" = \"medium\", \"value\" = medium, \"total\" = totalNumber)\n    outDataList <- append(paramList, rangeList_medium)\n    write.table(as.data.frame(outDataList), file=output_perIterVolRangePercentage_file, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n\n    rangeList_high <- list(\"rangeLabel\" = \"high\", \"value\" = high, \"total\" = totalNumber)\n    outDataList <- append(paramList, rangeList_high)\n    write.table(as.data.frame(outDataList), file=output_perIterVolRangePercentage_file, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n  }\n\n}\n\n\n\n\n#### START: @function to compute per 
iteration RSD of compute time. ##################\n# Parses to get the timer values\ncomputeRSD <- function (logData, paramList, output) {\n  numIter = as.numeric(paramList[\"iterations\"])\n\n  benchmarkRegionName <- subset(logData, CATEGORY == \"TIMER_0\" & TOTAL_TYPE != \"HostValues\")$REGION\n  print(paste(\"benchmark:\", benchmarkRegionName))\n\n  ## Number of runs\n  numRuns <- as.numeric((subset(logData, CATEGORY == \"Runs\" & TOTAL_TYPE != \"HostValues\"))$TOTAL)\n  print(paste(\"numRuns:\", numRuns))\n\n  output_rsd_file <- paste(output, \"_computeRSD\", sep=\"\")\n\n\n\n  ## Doing 1st iteration separately to see if new file is to be created or if file already exists.\n  for(r in 0:(numRuns - 1)){\n    computeTimeRows <- subset(logData, grepl(paste(\"^\", benchmarkRegionName, \"_\", r, \"_\", 0, sep=\"\"), REGION) & TOTAL_TYPE == \"HostValues\")$TOTAL\n    if(!identical(computeTimeRows, character(0))){\n      print(computeTimeRows)\n      computeTimePerHostArr <- (as.numeric(strsplit(computeTimeRows, \";\")[[1]]))\n      sd <- sd(computeTimePerHostArr)\n      mean <- mean(computeTimePerHostArr)\n      rsd <- round((sd/mean)*100, digits = 2)\n      rsdList <- list(\"run\" = r, \"iter\" = 0, \"sd\" = sd, \"mean\" = mean , \"rsd\" = rsd)\n      outDataList <- append(paramList, rsdList)\n      if(!file.exists(output_rsd_file)){\n        print(paste(output_rsd_file, \"Does not exist. 
Creating new file\"))\n        write.csv(as.data.frame(outDataList), file=output_rsd_file, row.names=F, quote=F)\n      } else {\n        print(paste(\"Appending data to the existing file\", output_rsd_file))\n        write.table(as.data.frame(outDataList), file=output_rsd_file, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n      }\n      print(rsd)\n    }\n  }\n\n  for(i in 1:(numIter - 1)) {\n    for(r in 0:(numRuns - 1)){\n      print(i)\n      computeTimeRows <- subset(logData, grepl(paste(\"^\", benchmarkRegionName, \"_\", r, \"_\", i, sep=\"\"), REGION) & TOTAL_TYPE == \"HostValues\")$TOTAL\n      if(!identical(computeTimeRows, character(0))){\n        computeTimePerHostArr <- (as.numeric(strsplit(computeTimeRows, \";\")[[1]]))\n        sd <- sd(computeTimePerHostArr)\n        mean <- mean(computeTimePerHostArr)\n        rsd <- round((sd/mean)*100, digits = 2)\n        rsdList <- list(\"run\" = r, \"iter\" = i, \"sd\" = sd, \"mean\" = mean , \"rsd\" = rsd)\n        outDataList <- append(paramList, rsdList)\n        write.table(as.data.frame(outDataList), file=output_rsd_file, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n        print(rsd)\n      }\n    }\n  }\n}\n\n#### START: @function to compute max by mean of compute time. 
##################\n# Parses to get the timer values\ncomputeMaxByMean <- function (logData, paramList, output) {\n  numIter = as.numeric(paramList[\"iterations\"])\n\n  benchmarkRegionName <- subset(logData, CATEGORY == \"TIMER_0\" & TOTAL_TYPE != \"HostValues\")$REGION\n  print(paste(\"benchmark:\", benchmarkRegionName))\n\n  ## Number of runs\n  numRuns <- as.numeric((subset(logData, CATEGORY == \"Runs\" & TOTAL_TYPE != \"HostValues\"))$TOTAL)\n  print(paste(\"numRuns:\", numRuns))\n\n  maxsum <- numeric()\n  meansum <- numeric()\n  maxbymean <- numeric()\n\n  if(benchmarkRegionName == \"BC\"){\n    maxsum <- 0\n    meansum <- 0\n    regions <- c(\"SSSP\", \"InitializeIteration\", \"PredAndSucc\", \"NumShortestPathsChanges\", \"NumShortestPaths\", \"PropagationFlagUpdate\", \"DependencyPropChanges\", \"DependencyPropagation\", \"BC\")\n    for( region in regions){\n     print(region)\n     computeTimeRows <- subset(logData, grepl(paste(\"^\", region, \"$\", sep=\"\"), REGION) & CATEGORY == \"Time\" & TOTAL_TYPE == \"HostValues\")$TOTAL\n     #computeTimeRows <- subset(logData, grepl(paste(\"CUDA_DO_ALL_IMPL_\", region, \"$\", sep=\"\"), CATEGORY) & TOTAL_TYPE == \"HostValues\")$TOTAL\n     if(!is.null(computeTimeRows)){\n       print(computeTimeRows)\n       computeTimePerHost <- (as.numeric(strsplit(computeTimeRows, \";\")[[1]]))\n       maxsum[1] <- maxsum[1] +  round(max(as.numeric(computeTimePerHost))/numRuns, digits = 2)\n       meansum[1] <- meansum[1] + round(mean(as.numeric(computeTimePerHost))/numRuns, digits = 2)\n     }\n   }\n   maxbymean[1] <- round(maxsum[1]/meansum[1], digits = 2)\n   print(paste(region, \" : maxsum :  \", maxsum))\n   print(paste(region, \" : meansum :  \", meansum))\n   print(paste(region, \" : maxbymean :  \", maxbymean))\n\n  }\n  else {\n    for(r in 0:(numRuns - 1)){\n      max <- numeric()\n      mean <- numeric()\n      for(i in 0:(numIter - 1)) {\n        computeTimeRows <- subset(logData, grepl(paste(\"^\", 
benchmarkRegionName, \"_\", r, \"_\", i, sep=\"\"), REGION) & TOTAL_TYPE == \"HostValues\")$TOTAL\n        if(!identical(computeTimeRows, character(0))){\n          computeTimePerHostArr <- (as.numeric(strsplit(computeTimeRows, \";\")[[1]]))\n          mean[i+1] <- mean(computeTimePerHostArr)\n          max[i+1] <- max(computeTimePerHostArr)\n        }\n        else {\n          mean[i+1] <- 0\n          max[i+1] <- 0\n        }\n      }\n      maxsum[r+1] <- sum(max)\n      meansum[r+1] <- sum(mean)\n      maxbymean[r+1] <- round((maxsum[r+1]/meansum[r+1]), digits = 2)\n    }\n  }\n  maxsum_avg <- mean(maxsum)\n  meansum_avg <- mean(meansum)\n  maxbymean_avg <- mean(maxbymean)\n  maxbymeanList <- list(\"maxComputeTime\" = maxsum_avg, \"meanComputeTime\" = meansum_avg, \"maxByMeanComputeTime\" = maxbymean_avg)\n  outDataList <- append(paramList, maxbymeanList)\n  print(paste(\"MaxByMeanComputeTime:\", maxbymean_avg))\n\n  if(!file.exists(output)){\n    print(paste(output, \"Does not exist. 
Creating new file\"))\n    write.csv(as.data.frame(outDataList), file=output, row.names=F, quote=F)\n  } else {\n    print(paste(\"Appending data to the existing file\", output))\n    write.table(as.data.frame(outDataList), file=output, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n  }\n}\n\ngetTimersFT <- function(logData) {\n\n  enableFT <- 0\n  crashIteration <- 0\n  crashNumHosts <- 0\n  checkPointInterval <- 0\n  recoveryScheme <- \"NA\"\n  recoveryTimeTotal <- 0\n  recoveryTimeTotalCrashed <- 0\n  recoveryTimeTotalHealthy <- 0\n  recoveryTimeGraphConstruct <- 0\n  recoveryTime <- 0\n  recoveryTimeSync <- 0\n  checkpointSaveTime <- 0\n\n\n  enableFT <- as.numeric(subset(logData, CATEGORY == \"ENABLE_FT\"& TOTAL_TYPE != \"HostValues\")$TOTAL)\n  if(identical(enableFT, numeric(0))){\n    cmdLineRow <- subset(logData, CATEGORY == \"CommandLine\"& TOTAL_TYPE != \"HostValues\")\n    cmdLine <- substring(cmdLineRow[,6], 0)\n    cmdLineSplit = strsplit(cmdLine, \"\\\\s+\")[[1]]\n    for (c in cmdLineSplit) {\n      if(any(grep(\"-enableFT\", c))){\n        splitStr = strsplit(c, \"=\")[[1]]\n        enableFT <- splitStr[2]\n      }\n    }\n  }\n\n  print(enableFT)\n  if(enableFT == 1){\n  print(\"here\", enableFT)\n\n    cmdLineRow <- subset(logData, CATEGORY == \"CommandLine\"& TOTAL_TYPE != \"HostValues\")\n    cmdLine <- substring(cmdLineRow[,6], 0)\n    cmdLineSplit = strsplit(cmdLine, \"\\\\s+\")[[1]]\n    for (c in cmdLineSplit) {\n      if(any(grep(\"-crashIteration\", c))){\n        splitStr = strsplit(c, \"=\")[[1]]\n        crashIteration <- splitStr[2]\n      } else if(any(grep(\"-crashNumHosts\", c))){\n        splitStr = strsplit(c, \"=\")[[1]]\n        crashNumHosts <- splitStr[2]\n      } else if(any(grep(\"-recoveryScheme\", c))){\n        splitStr = strsplit(c, \"=\")[[1]]\n        recoveryScheme <- splitStr[2]\n      } else if(any(grep(\"-checkpointInterval\", c))){\n        splitStr = strsplit(c, \"=\")[[1]]\n        checkPointInterval 
<- splitStr[2]\n      }\n    }\n\n    print(paste(\"enableFT:\", enableFT, \" crashIteration:\", crashIteration, \" crashNumHosts:\", crashNumHosts, \" recoveryScheme:\", recoveryScheme, \" checkPointInterval:\", checkPointInterval))\n\n    #### Recovery counters\n    #recoveryTimeTotal <- (subset(logData, grepl(paste(\"TIMER_RECOVERY_TOTAL_[0-9]+_\", crashIteration, sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    recoveryTimeTotal <- (subset(logData, grepl(paste(\"TIMER_RECOVERY_TOTAL_[0-9]+_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    if(identical(recoveryTimeTotal, character(0))){\n      recoveryTimeTotal <- 0\n    }\n\n    #recoveryTimeTotalCrashed <- (subset(logData, grepl(paste(\"TIMER_RECOVERY_CRASHED_[0-9]+_\", crashIteration, sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    recoveryTimeTotalCrashed <- (subset(logData, grepl(paste(\"TIMER_RECOVERY_CRASHED_[0-9]+_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    if(identical(recoveryTimeTotalCrashed, character(0))){\n      recoveryTimeTotalCrashed <- 0\n    }\n\n    #recoveryTimeGraphConstruct <- (subset(logData, grepl(paste(\"TIMER_RECOVERY_GRAPH_CONSTRUCT_[0-9]+_\", crashIteration, sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    recoveryTimeGraphConstruct <- (subset(logData, grepl(paste(\"TIMER_RECOVERY_GRAPH_CONSTRUCT_[0-9]+_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    if(identical(recoveryTimeGraphConstruct, character(0))){\n      recoveryTimeGraphConstruct <- 0\n    }\n\n    ### Timers for the healthy host\n    #recoveryTimeTotalHealthy <- (subset(logData, grepl(paste(\"TIMER_RECOVERY_HEALTHY_[0-9]+_\", crashIteration, sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    recoveryTimeTotalHealthy <- (subset(logData, grepl(paste(\"TIMER_RECOVERY_HEALTHY_[0-9]+_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    
if(identical(recoveryTimeTotalHealthy, character(0))){\n      recoveryTimeTotalHealthy <- 0\n    }\n\n    ## Time spent on recovery\n    #recoveryTime <- (subset(logData, grepl(paste(\"^RECOVERY_[0-9]+_\", crashIteration, sep=\"\"), REGION) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    recoveryTime <- (subset(logData, grepl(paste(\"^RECOVERY_[0-9]+_[0-9]+\", sep=\"\"), REGION) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    if(identical(recoveryTime, character(0))){\n      recoveryTime <- 0\n    }\n    #recoveryTimeSync <- (subset(logData, grepl(paste(\"^SYNC_RECOVERY_[0-9]+_\", crashIteration, sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    recoveryTimeSync <- (subset(logData, grepl(paste(\"^SYNC_RECOVERY_[0-9]+_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)\n    if(identical(recoveryTimeSync, character(0))){\n      recoveryTimeSync <- 0\n    }\n\n    #### Total time to save checkpoint\n    checkpointSaveTime <- 0\n    if(recoveryScheme == \"cp\" || recoveryScheme == \"hr\"){\n    #checkpointSaveTime <- sum(as.numeric(subset(logData, grepl(paste(\"^TIMER_SAVE_CHECKPOINT_[0-9]+_[0-9]+\",sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL))\n    checkpointSaveTime <- (as.numeric(subset(logData, grepl(paste(\"^TOTAL_TIMER_SAVE_CHECKPOINT\",sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL))\n    print (checkpointSaveTime)\n    }\n\n    print(recoveryTimeTotal)\n    print(recoveryTimeTotalCrashed)\n    print(recoveryTimeTotalHealthy)\n    print(recoveryTime)\n    print(recoveryTimeSync)\n\n  }\n  ### Calculate the number of work items:\n  workItems <- sum(as.numeric(subset(logData, grepl(paste(\"NUM_WORK_ITEMS_0\", sep=\"\"), CATEGORY) & TOTAL_TYPE == \"HSUM\")$TOTAL))\n  print(paste(\"workItems : \", workItems))\n\n  ### calculate the sync bytes in recovery phase\n  recoverySyncBytes <- sum(as.numeric(subset(logData, grepl(paste(\"[REDUCE|BROADCAST]_SEND_BYTES_RECOVERY.*_0_[0-9]+\", sep=\"\"), CATEGORY)& TOTAL_TYPE == 
\"HSUM\")$TOTAL))\n  print(paste(\"recoverySyncBytes\", recoverySyncBytes))\n\n  ### To calculate any bytes spent in initialization\n  #syncBytesInitGraph <- sum(as.numeric(subset(logData, grepl(paste(\"[REDUCE|BROADCAST]_SEND_BYTES_InitializeGraph_[crashed|healthy]_0_[0-9]+\", sep=\"\"), CATEGORY)& TOTAL_TYPE == \"HSUM\")$TOTAL))\n  syncBytesInitGraph <- sum(as.numeric(subset(logData, grepl(paste(\"[REDUCE|BROADCAST]_SEND_BYTES_InitializeGraph_(?:crashed|healthy)_0_[0-9]+\", sep=\"\"), CATEGORY)& TOTAL_TYPE == \"HSUM\")$TOTAL))\n  print(paste(\"syncBytesInitGraph\", syncBytesInitGraph))\n\n\n  returnList <- list(\"FT\" = enableFT, \"crashIter\" = crashIteration, \"crashNumHosts\" = crashNumHosts, \"RScheme\" = recoveryScheme, \"CPInterval\" = checkPointInterval, \"RTimeTotal\" = recoveryTimeTotal, \"RTimeTotalCrashed\" =  recoveryTimeTotalCrashed, \"RTimeTotalHealthy\" = recoveryTimeTotalHealthy, \"RTimeGraphConstruct\" = recoveryTimeGraphConstruct, \"RTimeExec\" = recoveryTime, \"RTimeSync\" = recoveryTimeSync, \"CPSaveTime\" = checkpointSaveTime, \"workItems\" = workItems, \"RSyncBytes\" = recoverySyncBytes, \"RSyncBytesGraphInit\" = syncBytesInitGraph)\n  return(returnList)\n}\n\n#### START: @function entry point for galois log parser ##################\ngaloisLogParser <- function(input, output, isSharedMemGaloisLog, isComputeRSD, isComputeMaxByMean, isComputePerIterVol, isFautlTolerant, graphPassedAsInput) {\n  logData <- read.csv(input, stringsAsFactors=F,strip.white=T)\n\n  printNormalStats = TRUE;\n  if(isTRUE(isSharedMemGaloisLog)){\n    print(\"Parsing commadline\")\n    paramList <- parseCmdLine(logData, T, graphPassedAsInput)\n    print(\"Parsing timers for shared memory galois log\")\n    benchmark = paramList[1]\n    timersList <- getTimersShared(logData, benchmark)\n  }\n  else{\n    print(\"Parsing commadline\")\n    paramList <- parseCmdLine(logData, F, graphPassedAsInput)\n    print(\"Parsing timers for distributed memory galois log\")\n    
if(isTRUE(isComputeMaxByMean)){\n      computeMaxByMean(logData, paramList, output)\n      printNormalStats = FALSE\n    }\n    else if(isTRUE(isComputeRSD)){\n      computeRSD(logData, paramList, output)\n      printNormalStats = FALSE\n    }\n    else if(isTRUE(isComputePerIterVol)){\n      computePerIterVolume(logData, paramList, output)\n      printNormalStats = FALSE\n    }\n    else{\n      timersList <- getTimersDistributed(logData)\n    }\n\n    if(isTRUE(isFautlTolerant)){\n      timersList_ft <- getTimersFT(logData)\n      timersList <- append(timersList, timersList_ft)\n    }\n  }\n\n  ## if computing RSD then normal stats are not printed\n  #if(isTRUE(!isComputeRSD && !isComputeMaxByMean && !isComputePerIterVol)){\n  if(isTRUE(printNormalStats)){\n    outDataList <- append(paramList, timersList)\n    if(!file.exists(output)){\n      print(paste(output, \"Does not exist. Creating new file\"))\n      write.csv(as.data.frame(outDataList), file=output, row.names=F, quote=F)\n    } else {\n      print(paste(\"Appending data to the existing file\", output))\n      write.table(as.data.frame(outDataList), file=output, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n    }\n  }\n}\n#### END: @function entry point for shared memory galois log ##################\n\n#############################################\n##  Commandline options.\n#######################################\noption_list = list(\n                   make_option(c(\"-i\", \"--input\"), action=\"store\", default=NA, type='character',\n                               help=\"name of the input file to parse\"),\n                   make_option(c(\"-o\", \"--output\"), action=\"store\", default=NA, type='character',\n                               help=\"name of the output file to store output\"),\n                   make_option(c(\"-s\", \"--sharedMemGaloisLog\"), action=\"store_true\", default=FALSE,\n                               help=\"Is it a shared memory Galois log? 
If -s is not used, it will be treated as a distributed Galois log [default %default]\"),\n                   make_option(c(\"-r\", \"--relativeStandardDeviation\"), action=\"store_true\", default=FALSE,\n                               help=\"To compute the RSD of per iteration compute time[default %default]\"),\n                   make_option(c(\"-m\", \"--maxByMean\"), action=\"store_true\", default=FALSE,\n                               help=\"To compute the max by mean compute time[default %default]\"),\n                   make_option(c(\"-p\", \"--perItrVolume\"), action=\"store_true\", default=FALSE,\n                               help=\"To get the per iteration communication volume [default %default]\"),\n                   make_option(c(\"-f\", \"--faultTolerance\"), action=\"store_true\", default=FALSE,\n                               help=\"Logs are fault tolerant [default %default]\"),\n                   make_option(c(\"-g\", \"--graphPassedAsInput\"), action=\"store_false\", default=TRUE,\n                               help=\"Benchmark explicitly takes input graph as the positional argument [default %default]\")\n\n                   )\n\nopt_parser <- OptionParser(usage = \"%prog [options] -i input.log -o output.csv\", option_list=option_list)\nopt <- parse_args(opt_parser)\n\nif (is.na(opt$i)){\n  print_help(opt_parser)\n  stop(\"At least one argument must be supplied (input file)\", call.=FALSE)\n} else {\n  if (is.na(opt$o)){\n    print(\"Output file name is not specified. Using name ouput.csv as default\")\n    opt$o <- \"output.csv\"\n  }\n  print(opt$g)\n  galoisLogParser(opt$i, opt$o, opt$s, opt$r, opt$m, opt$p, opt$f, opt$g)\n}\n\n##################### END #####################\n"
  },
  {
    "path": "scripts/galois_log_parser_minimal.R",
    "content": "#!/usr/bin/env Rscript\n\nlibrary(\"optparse\")\nlibrary('data.table')\n\n####START: @function to parse commadline##################\n# Parses the command line to get the arguments used\nparseCmdLine <- function (logData, isSharedMemGaloisLog) {\n  cmdLineRow <- subset(logData, CATEGORY == \"CommandLine\" & STAT_TYPE == \"PARAM\")\n\n  ## Distributed has extra column: HostID\n  if(isTRUE(isSharedMemGaloisLog)){\n    cmdLine <- substring(cmdLineRow[,5], 0)\n  }\n  else\n    cmdLine <- substring(cmdLineRow[,6], 0)\n\n  cmdLineSplit = strsplit(cmdLine, \"\\\\s+\")[[1]]\n\n  deviceKind = \"CPU\"\n  if(!isTRUE(isSharedMemGaloisLog)){\n    ## To check the device kind\n    pos = regexpr('-pset', cmdLineSplit)\n    deviceKind = \"\"\n    if(sum(pos>0) > 0){\n      deviceKind = \"GPU\"\n    } else {\n      deviceKind = \"CPU\"\n    }\n  }\n\n  ## First postitional argument is always name of the executable\n  ### WORKING: split the exePath name found at the position 1 of the argument list and split on \"/\".\n  exePathSplit <- strsplit(cmdLineSplit[1], \"/\")[[1]]\n  benchmark <- exePathSplit[length(exePathSplit)]\n\n  ## subset the threads row from the table\n  numThreads <- (subset(logData, CATEGORY == \"Threads\" & TOTAL_TYPE != \"HostValues\"))$TOTAL\n\n  input = \"noInput\"\n  ## subset the input row from the table\n  inputPath <- (subset(logData, CATEGORY == \"Input\" & STAT_TYPE == \"PARAM\"))$TOTAL\n  print(inputPath)\n  if(!identical(inputPath, character(0))){\n    inputPathSplit <- strsplit(inputPath, \"/\")[[1]]\n    input <- inputPathSplit[length(inputPathSplit)]\n  }\n  else {\n    inputPathSplit <- strsplit(inputPath[[2]], \"/\")[[1]]\n    input <- inputPathSplit[length(inputPathSplit)]\n  }\n  ### This is to remove the extension for example .gr or .sgr\n  inputsplit <- strsplit(input, \"[.]\")[[1]]\n  if(length(inputsplit) > 1) {\n    input <- inputsplit[1]\n  }\n  \n  if(isTRUE(isSharedMemGaloisLog)){\n    returnList <- list(\"benchmark\" = 
benchmark, \"input\" = input, \"numThreads\" = numThreads, \"deviceKind\" = deviceKind)\n    return(returnList)\n  }\n\n ## Need more params for distributed galois logs\n numHosts <- (subset(logData, CATEGORY == \"Hosts\"& TOTAL_TYPE != \"HostValues\"))$TOTAL\n\n partitionScheme <- (subset(logData, CATEGORY == \"PartitionScheme\"& TOTAL_TYPE != \"HostValues\"))$TOTAL\n\n runID <- (subset(logData, CATEGORY == \"Run_UUID\"& TOTAL_TYPE != \"HostValues\"))$TOTAL\n\n numIterations <- (subset(logData, CATEGORY == \"NumIterations_0\"& TOTAL_TYPE != \"HostValues\"))$TOTAL\n #If numIterations is not printed in the log files\n if(identical(numIterations, character(0))){\n   numIterations <- 0\n }\n\n ## returnList for distributed galois log\n returnList <- list(\"benchmark\" = benchmark, \"input\" = input, \"partitionScheme\" = partitionScheme, \"hosts\" = numHosts , \"numThreads\" = numThreads, \"deviceKind\" = deviceKind, \"iterations\" = numIterations)\n return(returnList)\n}\n#### END: @function to parse commadline ##################\n\n#### START: @function to values of timers for shared memory galois log ##################\n# Parses to get the timer values\ngetTimersShared <- function (logData, benchmark) {\n  totalTimeRow <- subset(logData, CATEGORY == \"Time\" & REGION == \"(NULL)\")\n  totalTime <- totalTimeRow$TOTAL\n  print(paste(\"totalTime:\", totalTime))\n returnList <- list(\"totalTime\" = totalTime)\n return(returnList)\n}\n#### END: @function to values of timers for shared memory galois log ##################\n\n#### START: @function to values of timers for distributed memory galois log ##################\n# Parses to get the timer values\ngetTimersDistributed <- function (logData) {\n\n ## Total time including the graph construction and initialization\n totalTime <- (subset(logData, CATEGORY == \"TimerTotal\" & TOTAL_TYPE != \"HostValues\")$TOTAL)\n print(paste(\"totalTime:\", totalTime))\n\n ## Taking mean of all the runs\n totalTimeExecMean <- 
round(mean(as.numeric(subset(logData, grepl(\"Timer_[0-9]+\", CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL)), digits = 2)\n print(paste(\"totalTimeExecMean:\", totalTimeExecMean))\n\n ## To get the name of benchmark to be used with other queries to get right timers.\n ### It assumes that there will always with Timer_0 with REGION name as benchmark\n ### name used with other queries.\n benchmarkRegionName <- subset(logData, CATEGORY == \"Timer_0\" & TOTAL_TYPE != \"HostValues\")$REGION\n print(paste(\"benchmark:\", benchmarkRegionName))\n\n ## Number of runs\n numRuns <- as.numeric((subset(logData, CATEGORY == \"Runs\" & TOTAL_TYPE != \"HostValues\"))$TOTAL)\n print(paste(\"numRuns:\", numRuns))\n\n ## Total compute time (galois::do_alls)\n computeTimeMean <- 0\n computeTimeRows <- subset(logData, grepl(paste(\"^\", benchmarkRegionName, \"[_]*[[:alpha:]]*_[0-9]+\", sep=\"\"), REGION) & CATEGORY == \"Time\" & TOTAL_TYPE == \"HMAX\")$TOTAL\n computeTimeMean <- round(sum(as.numeric(computeTimeRows))/numRuns, digits = 2)\n\n print(paste(\"computeTimeMean:\", computeTimeMean))\n\n ## Synchronization Time\n syncTimeMean <- 0\n syncTimeRows = subset(logData, grepl(paste(\"Sync_\", benchmarkRegionName, \"[_]*[[:alpha:]]*_[0-9]+\", sep=\"\"), CATEGORY) & TOTAL_TYPE != \"HostValues\")$TOTAL\n if(!identical(syncTimeRows, character(0))){\n   syncTimeMean <- round(sum(as.numeric(syncTimeRows))/numRuns, digits = 2)\n } \n print(paste(\"syncTimeMean\", syncTimeMean))\n\n\n ## Mean time spent in the implicit barrier: Total - (compute +  sync)\n barrierTimeMean = totalTimeExecMean - (computeTimeMean + syncTimeMean)\n if(barrierTimeMean < 0){\n   barrierTimeMean <- 0\n }\n print(paste(\"barrierTimeMean:\", barrierTimeMean))\n\n ## Total bytes sent in reduce and broadcast phase in run 0.\n ### Same number of bytes are being sent in all the runs.\n syncBytes <- 0\n syncBytes <- sum(as.numeric(subset(logData, grepl(paste(\"[Reduce|Broadcast]SendBytes_\", benchmarkRegionName, 
\"[_]*[[:alpha:]]*_0\", sep=\"\"), CATEGORY)& TOTAL_TYPE == \"HSUM\")$TOTAL))\n print(paste(\"syncBytes:\", syncBytes))\n\n###NOTE: Timer are per source for BC\nif(benchmarkRegionName == \"BC\" | benchmarkRegionName == \"MRBC\") {\n   ## Total number of sources for BC\n   numSources <- as.numeric((subset(logData, CATEGORY == \"NumSources\" & TOTAL_TYPE != \"HostValues\"))$TOTAL)\n   \n   totalTimeExecMean <- round(totalTimeExecMean/numSources, digits = 2)\n   computeTimeMean <- round(computeTimeMean/numSources, digits = 2)\n   syncTimeMean <- round(syncTimeMean/numSources, digits = 2) \n   barrierTimeMean <- round(barrierTimeMean/numSources, digits = 2)\n   syncBytes <- round(syncBytes/numSources, digits = 2)\n }\n\n ##Graph construction time\n graphConstructTime <- subset(logData, CATEGORY == \"GraphConstructTime\" & TOTAL_TYPE != \"HostValues\")$TOTAL\n print(paste(\"graphConstructTime:\", graphConstructTime))\n\n ## Replication factor\n replicationFactor <- subset(logData, CATEGORY == \"ReplicationFactor\" & TOTAL_TYPE != \"HostValues\")$TOTAL\n print(paste(\"replicationFactor:\", replicationFactor))\n if(identical(replicationFactor, character(0))){\n   replicationFactor <- 0\n }\n returnList <- list(\"replicationFac\" = replicationFactor, \"totalTime\" = totalTime, \"totalTimeExec\" = totalTimeExecMean, \"computeTime\" = computeTimeMean, \"syncTime\" = syncTimeMean, \"barrierTime\" = barrierTimeMean, \"syncBytes\" = syncBytes, \"graphConstructTime\"= graphConstructTime)\n return(returnList)\n}\n#### END: @function to values of timers for distributed memory galois log ##################\n\n#### START: @function entry point for galois log parser ##################\ngaloisLogParser <- function(input, output, isSharedMemGaloisLog) {\n  logData <- read.csv(input, stringsAsFactors=F,strip.white=T)\n\n  printNormalStats = TRUE;\n  if(isTRUE(isSharedMemGaloisLog)){\n    print(\"Parsing commadline\")\n    paramList <- parseCmdLine(logData, T)\n    print(\"Parsing timers 
for shared memory galois log\")\n    benchmark = paramList[1]\n    timersList <- getTimersShared(logData, benchmark)\n  }\n  else{\n    print(\"Parsing commadline\")\n    paramList <- parseCmdLine(logData, F)\n    print(\"Parsing timers for distributed memory galois log\")\n    timersList <- getTimersDistributed(logData)\n    \n  }\n\n  if(isTRUE(printNormalStats)){\n    outDataList <- append(paramList, timersList)\n    if(!file.exists(output)){\n      print(paste(output, \"Does not exist. Creating new file\"))\n      write.csv(as.data.frame(outDataList), file=output, row.names=F, quote=F)\n    } else {\n      print(paste(\"Appending data to the existing file\", output))\n      write.table(as.data.frame(outDataList), file=output, row.names=F, col.names=F, quote=F, append=T, sep=\",\")\n    }\n  }\n}\n#### END: @function entry point for shared memory galois log ##################\n\n#### START: @function entry point for de-duplication of entries ##################\ndeDupByMean <- function(output) {\n  logData <- read.csv(output, stringsAsFactors=F,strip.white=T)\n  ## Aggregate results from multiple runs\n  logData_agg <- aggregate(. 
~ benchmark + input + partitionScheme + \n          hosts + numThreads + deviceKind,\n          data = logData, mean)\n  write.csv(logData_agg, output, row.names=FALSE, quote=FALSE)\n}\n#### END: @function entry point for de-duplication of entries ##################\n\n\n#############################################\n##  Commandline options.\n#######################################\noption_list = list(\n                   make_option(c(\"-i\", \"--input\"), action=\"store\", default=NA, type='character',\n                               help=\"Name of the input file to parse\"),\n                   make_option(c(\"-o\", \"--output\"), action=\"store\", default=NA, type='character',\n                               help=\"Name of the output file to store output\"),\n                   make_option(c(\"-d\", \"--duplicate\"), action=\"store_true\", default=FALSE,\n                               help=\"Allow duplicate entries. By default takes mean of duplicate entries [default %default]\"),\n                   make_option(c(\"-s\", \"--sharedMemGaloisLog\"), action=\"store_true\", default=FALSE,\n                               help=\"Is it a shared memory Galois log? If -s is not used, it will be treated as a distributed Galois log [default %default]\")\n                   )\n\nopt_parser <- OptionParser(usage = \"%prog [options] -i input.log -o output.csv\", option_list=option_list)\nopt <- parse_args(opt_parser)\n\nif (is.na(opt$i)){\n  print_help(opt_parser)\n  stop(\"At least one argument must be supplied (input file)\", call.=FALSE)\n} else {\n  if (is.na(opt$o)){\n    print(\"Output file name is not specified. Using name ouput.csv as default\")\n    opt$o <- \"output.csv\"\n  }\n  print(opt$g)\n  galoisLogParser(opt$i, opt$o, opt$s)\n  ## Take mean of the duplicate entries ##\n  if(!opt$d){\n    deDupByMean(opt$o)\n  }\n}\n##################### END #####################\n"
  },
  {
    "path": "scripts/gitFindBigCommits.sh",
    "content": "#!/bin/bash\n\n# This script was obtained from: https://stackoverflow.com/a/42544963\n\ngit rev-list --objects --all \\\n  | git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' \\\n  | sed -n 's/^blob //p' \\\n  | sort --numeric-sort --key=2 \\\n  | cut -c 1-12,41- \\\n  | numfmt --field=2 --to=iec-i --suffix=B --padding=7 --round=nearest\n"
  },
  {
    "path": "scripts/hcompiler.sh",
    "content": "#!/bin/bash\n#Common variables:: \nLLVM_BUILD_DIR=/net/dyne/workspace/rashid/llvm37/build/\nGALOIS_DIST_SRC_DIR=/h2/rashid/workspace/GaloisDist/gdist/\nGALOIS_DIST_BUILD_DIR=/net/faraday/workspace/rashid/GaloisDist/release/\nCUDA_HOME=/org/centers/cdgc/cuda/cuda-7.0/\nIN_FILE_NAME=$2\nUSE_CASE=$1\n\n\nGALOIS_INCLUDE_DIRS=\"-I${BOOST_INC} -I${GALOIS_DIST_SRC_DIR}/exp/include -I${MPI_DIR}/include -I${GALOIS_DIST_BUILD_DIR}/include -I${GALOIS_DIST_SRC_DIR}/include -I${GALOIS_DIST_SRC_DIR}/libruntime/include/ -I${GALOIS_DIST_SRC_DIR}/libsubstrate/include/ -I${GALOIS_DIST_SRC_DIR}/lonestar/include/ -I${GALOIS_DIST_SRC_DIR}/libllvm/include -I${GALOIS_DIST_SRC_DIR}/libgraphs/include -I${GALOIS_DIST_SRC_DIR}/libdist/include -I${CUDA_HOME}/include\"\n\n#For Analysis: \n\n#${LLVM_BUILD_DIR}/bin/clang++ -Xclang -load -Xclang ${LLVM_BUILD_DIR}/lib/GaloisFunctionsAnalysis.so -Xclang -plugin -Xclang galois-analysis  -DGALOIS_USE_EXP -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -g -gcc-toolchain /net/faraday/workspace/local/modules/gcc-4.9/bin/.. -fcolor-diagnostics -std=c++11 -O3 -DNDEBUG -I${BOOST_INC} -I${GALOIS_DIST_SRC_DIR}/exp/include -I${MPI_DIR}/include -I${GALOIS_DIST_BUILD_DIR}/include -I${GALOIS_DIST_SRC_DIR}/include -I${GALOIS_DIST_SRC_DIR}/libruntime/include/ -I${GALOIS_DIST_SRC_DIR}/libsubstrate/include/ -I${GALOIS_DIST_SRC_DIR}/lonestar/include/ -I${GALOIS_DIST_SRC_DIR}/libllvm/include -I${GALOIS_DIST_SRC_DIR}/libgraphs/include -I${GALOIS_DIST_SRC_DIR}/libdist/include -I${CUDA_HOME}/include -o CMakeFiles/SGD_gen.dir/pageRankPull_gen.cpp.o -c ${IN_FILE_NAME}\n\n\n#analysis:\n\nif [ \"${USE_CASE}\" == \"analysis\" ] \nthen\n${LLVM_BUILD_DIR}/bin/clang++ -Xclang -load -Xclang ${LLVM_BUILD_DIR}/lib/GaloisFunctionsAnalysis.so -Xclang -plugin -Xclang galois-analysis  -DGALOIS_USE_EXP -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -g -gcc-toolchain /net/faraday/workspace/local/modules/gcc-4.9/bin/.. 
-fcolor-diagnostics -std=c++11 -O3 -DNDEBUG ${GALOIS_INCLUDE_DIRS} -o CMakeFiles/SGD_gen.dir/pageRankPull_gen.cpp.o -c ${IN_FILE_NAME}\n\nelif [ \"${USE_CASE}\" == \"codegen\" ] \nthen\n#codegen:\n${LLVM_BUILD_DIR}/bin/clang++ -Xclang -load -Xclang ${LLVM_BUILD_DIR}/lib/GaloisFunctions.so -Xclang -plugin -Xclang galois-fns  -DGALOIS_USE_EXP -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -g -gcc-toolchain /net/faraday/workspace/local/modules/gcc-4.9/bin/.. -fcolor-diagnostics -std=c++11 -O3 -DNDEBUG ${GALOIS_INCLUDE_DIRS} -o CMakeFiles/SGD_gen.dir/pageRankPull_gen.cpp.o -c ${IN_FILE_NAME}\n\nelif [ \"${USE_CASE}\" ==  \"opencl\" ] \nthen\n${LLVM_BUILD_DIR}/bin/clang++ -Xclang -load -Xclang ${LLVM_BUILD_DIR}/lib/OpenCLCodeGenHost.so -Xclang -plugin -Xclang opencl-analysis  -DGALOIS_USE_EXP -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -g -gcc-toolchain /net/faraday/workspace/local/modules/gcc-4.9/bin/.. -fcolor-diagnostics -std=c++11 -O3 -DNDEBUG ${GALOIS_INCLUDE_DIRS} -o CMakeFiles/SGD_gen.dir/pageRankPull_gen.cpp.o -c ${IN_FILE_NAME}\n\nelif [ \"${USE_CASE}\" ==  \"clcodegen\" ] \nthen\n${LLVM_BUILD_DIR}/bin/clang++ -Xclang -load -Xclang ${LLVM_BUILD_DIR}/lib/OpenCLCodeGenDevice.so -Xclang -plugin -Xclang opencl-device-codegen  -DGALOIS_USE_EXP -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -g -gcc-toolchain /net/faraday/workspace/local/modules/gcc-4.9/bin/.. -fcolor-diagnostics -std=c++11 -O3 -DNDEBUG ${GALOIS_INCLUDE_DIRS} -o CMakeFiles/SGD_gen.dir/pageRankPull_gen.cpp.o -c ${IN_FILE_NAME}\n\nelif [ \"${USE_CASE}\" ==  \"aos2soa\" ] \nthen\n${LLVM_BUILD_DIR}/bin/clang++ -Xclang -load -Xclang ${LLVM_BUILD_DIR}/lib/AosToSoaPlugin.so -Xclang -plugin -Xclang aos2soa  -DGALOIS_USE_EXP -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -g -gcc-toolchain /net/faraday/workspace/local/modules/gcc-4.9/bin/.. 
-fcolor-diagnostics -std=c++11 -O3 -DNDEBUG ${GALOIS_INCLUDE_DIRS} -o CMakeFiles/SGD_gen.dir/pageRankPull_gen.cpp.o -c ${IN_FILE_NAME}\n\nelif [ \"${USE_CASE}\" == \"astdump\" ]\nthen\n\n${LLVM_BUILD_DIR}/bin/clang -cc1 -ast-dump -DGALOIS_USE_EXP -D__STDC_LIMIT_MACROS -D__STDC_CONSTANT_MACROS -fcolor-diagnostics -std=c++11 -O3 -DNDEBUG ${GALOIS_INCLUDE_DIRS} -o CMakeFiles/SGD_gen.dir/pageRankPull_gen.cpp.o ${IN_FILE_NAME}\nfi\n"
  },
  {
    "path": "scripts/intel_study_scripts/README.md",
    "content": "Instructions to build Galois and reproduce IntelStudy experiments\n========\n\nClone the repository\n```Shell\ngit clone https://github.com/IntelligentSoftwareSystems/Galois\n```\n\nLet us assume that the SRC_DIR is the top-level Galois source dir where the Galois repository is cloned.\n\nBuilding Galois\n------------\n```Shell\nBUILD_DIR=<path-to-your-build-dir>\nmkdir -p $BUILD_DIR\ncmake -S $SRC_DIR -B $BUILD_DIR -DCMAKE_BUILD_TYPE=Release\n```\n\nGalois applications are in the lonestar directory. To build a particular application:\n```Shell\nmake -C $BUILD_DIR/lonestar/analytics/cpu/<app-dir-name> -j\n```\n\nFor IntelStudy, build the following apps:\n------------\nBFS\n```Shell\nmake -C $BUILD_DIR/lonestar/analytics/cpu/bfs -j\n```\nBC\n```Shell\nmake -C $BUILD_DIR/lonestar/analytics/cpu/betweennesscentrality -j\n```\nCC\n```Shell\nmake -C $BUILD_DIR/lonestar/analytics/cpu/connectedcomponents -j\n```\nPR\n```Shell\nmake -C $BUILD_DIR/lonestar/analytics/cpu/pagerank -j\n```\nSSSP\n```Shell\nmake -C $BUILD_DIR/lonestar/analytics/cpu/sssp -j\n```\nTC\n```Shell\nmake -C $BUILD_DIR/lonestar/analytics/cpu/triangles -j\n```\n\n\nDownload the inputs\n------------\n```Shell\nmkdir -p $INPUT_DIR\nbash $BUILD_DIR/scripts/intel_study_scripts/download_inputs.sh $INPUT_DIR\n```\n\n\nRunning benchmarks using scripts:\n------------\n\nSet the environment variables used by the scripts\n```Shell\nexport GALOIS_BUILD=$BUILD_DIR\nexport INPUT_DIR=$INPUT_DIR\n```\n\nRun\n```Shell\ncd $BUILD_DIR/scripts/intel_study_scripts/\n./run_bc.sh\n./run_bfs.sh\n./run_cc.sh\n./run_pr.sh\n./run_sssp.sh\n./run_tc.sh\n```\n\nLogs will be produced by the above-mentioned scripts in the respective folder of each benchmark. For example, for bfs:\n```Shell\ncd $BUILD_DIR/lonestar/analytics/cpu/bfs/logs\n```\n\n"
  },
  {
    "path": "scripts/intel_study_scripts/download_inputs.sh",
    "content": "#!/bin/bash\n\necho -e \"USAGE: ./download_inputs.sh INPUT_DIR_PATH\\n\"\nINPUT_DIR=$1\nif [ -z ${INPUT_DIR} ];\nthen\n  echo \"INPUT_DIR not set; Please point it to the directory where graphs will be downloaded\"\n  exit\nelse\n  echo \"Using directory ${INPUT_DIR} for inputs\"\nfi\n\ncd ${INPUT_DIR}\nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/sources.tar.gz \ntar -xzvf sources.tar.gz\n\nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-road.sgr\n\nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-urand.sgr \n\nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-kron.sgr \nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-kron.sgr.triangles \n\nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-twitter.csgr.triangles \nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-twitter.gr \nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-twitter.sgr \nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-twitter.tgr \n\nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-web.csgr.triangles \nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-web.gr \nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-web.sgr \nwget https://intel-study-sc20-galois-inputs.s3.us-east-2.amazonaws.com/GAP-web.tgr\n\n\n\n\n"
  },
  {
    "path": "scripts/intel_study_scripts/run_bc.sh",
    "content": "#!/bin/bash\n\necho -e \"USAGE: ./run_bc.sh <numRuns>\\n\"\nappname=betweennesscentrality\n\nnumRuns=$1\nif [ -z $numRuns ]; then\n  numRuns=1\nfi\n\nif [ -z ${GALOIS_BUILD} ]; then\n  echo \"GALOIS_BUILD not set; Please point it to the top level directory where Galois is built\"\n  exit\nelse\n  echo \"Using ${GALOIS_BUILD} for Galois build to run ${appname}\"\nfi\n\nif [ -z ${INPUT_DIR} ]; then\n  echo \"INPUT_DIR not set; Please point it to the directory with .gr graphs\"\n  exit\nelse\n  echo \"Using ${INPUT_DIR} for inputs for ${appname}\"\nfi\n\ninputDir=\"${INPUT_DIR}\"\nexecDir=\"${GALOIS_BUILD}/lonestar/analytics/cpu/${appname}\"\necho ${execDir}\n\nexec=betweennesscentrality-cpu\n\nfor configType in $(seq 1 2)\ndo\n  if [ ${configType} == 1 ]; then\n    echo \"Running ${appname} with config1\"\n    export GOMP_CPU_AFFINITY=\"0-31\"\n    export KMP_AFFINITY=\"verbose,explicit,proclist=[0-31]\"\n    Threads=32\n  else\n    echo \"Running ${appname} with config2\"\n    Threads=64\n  fi\n\n  for run in $(seq 1 ${numRuns})\n  do\n    for input in \"kron\" \"road\" \"urand\" \"web\" \"twitter\"\n    do\n      if [ ${input} == \"web\" ] || [ ${input} == \"twitter\" ]; then \n        ##NOTE: Using gr for directed graphs\n        extension=gr\n      else # kron road urand\n        ##NOTE: Using sgr for undirected graphs\n        extension=sgr\n      fi\n\n      if [ ${configType} == 1 ]; then \n        algo=\"AutoAlgo\"\n      elif [ ${input} == \"road\" ]; then # ${configType} == 2\n        algo=\"Async\"\n      else # ${configType} == 2\n        algo=\"Level\"\n      fi\n\n      echo \"Running on ${input}\"\n      echo \"Logs will be available in ${execDir}/logs/${input}\"\n      if [ ! 
-d \"${execDir}/logs/${input}\" ]; then\n        mkdir -p ${execDir}/logs/${input}\n      fi\n\n      for count in {0..15}\n      do\n        filename=\"${appname}_${input}_file_${count}_${configType}_Run${run}\"\n        statfile=\"${filename}.stats\"\n        args=\" -numOfSources=4 -numOfOutSources=4 -sourcesToUse=\"$inputDir/sources/GAP-${input}-bc/GAP-${input}_sources_${count}.txt\" \"\n        ${execDir}/${exec} $inputDir/GAP-${input}.${extension} -t ${Threads} ${args}  -statFile=${execDir}/logs/${input}/${statfile} &> ${execDir}/logs/${input}/${filename}.out\n      done\n    done\n  done\ndone"
  },
  {
    "path": "scripts/intel_study_scripts/run_bfs.sh",
    "content": "#!/bin/bash\n\necho -e \"USAGE: ./run_bfs.sh <numRuns>\\n\"\nappname=bfs\n\nnumRuns=$1\nif [ -z $numRuns ]; then\n  numRuns=1\nfi\n\nif [ -z ${GALOIS_BUILD} ]; then\n  echo \"GALOIS_BUILD not set; Please point it to the top level directory where Galois is built\"\n  exit\nelse\n  echo \"Using ${GALOIS_BUILD} for Galois build to run ${appname}\"\nfi\n\nif [ -z ${INPUT_DIR} ]; then\n  echo \"INPUT_DIR not set; Please point it to the directory with .gr graphs\"\n  exit\nelse\n  echo \"Using ${INPUT_DIR} for inputs for ${appname}\"\nfi\n\ninputDir=\"${INPUT_DIR}\"\nexecDir=\"${GALOIS_BUILD}/lonestar/analytics/cpu/bfs\"\necho ${execDir}\n\nexec=bfs-directionopt-cpu\n\nfor configType in $(seq 1 2)\ndo\n  if [ ${configType} == 1 ]; then\n    echo \"Running ${appname} with config1\"\n    export GOMP_CPU_AFFINITY=\"0-31\"\n    export KMP_AFFINITY=\"verbose,explicit,proclist=[0-31]\"\n    Threads=32\n  else\n    echo \"Running ${appname} with config2\"\n    Threads=64\n  fi\n\n  for run in $(seq 1 ${numRuns})\n  do\n    for input in \"kron\" \"road\" \"urand\" \"web\" \"twitter\"\n    do\n      if [ ${input} == \"web\" ] || [ ${input} == \"twitter\" ]; then \n        ##NOTE: Using gr for directed graphs\n        extension=gr\n      else # kron road urand\n        ##NOTE: Using sgr for undirected graphs\n        extension=sgr\n      fi\n\n      if [ ${configType} == 1 ]; then \n        algo=\"AutoAlgo\"\n      elif [ ${input} == \"road\" ]; then # ${configType} == 2\n        algo=\"Async\"\n      else # ${configType} == 2\n        algo=\"SyncDO\"\n      fi\n\n      echo \"Running on ${input}\"\n      echo \"Logs will be available in ${execDir}/logs/${input}\"\n      if [ ! 
-d \"${execDir}/logs/${input}\" ]; then\n        mkdir -p ${execDir}/logs/${input}\n      fi\n\n      while read p; do\n        source_node=$((${p} - 1))\n        filename=\"${appname}_${input}_source_${source_node}_algo_${algo}_${configType}_Run${run}\"\n        statfile=\"${filename}.stats\"\n        ${execDir}/${exec} -algo=${algo} $inputDir/GAP-${input}.${extension} -t ${Threads} -preAlloc=1200  -startNode=${source_node} -statFile=${execDir}/logs/${input}/${statfile} &> ${execDir}/logs/${input}/${filename}.out\n      done < $inputDir/sources/GAP-${input}_sources.mtx\n    done\n  done\ndone\n"
  },
  {
    "path": "scripts/intel_study_scripts/run_cc.sh",
    "content": "#!/bin/bash\n\necho -e \"USAGE: ./run_cc.sh <numRuns>\\n\"\nappname=\"connected-components\"\n\nnumRuns=$1\nif [ -z $numRuns ]; then\n  numRuns=1\nfi\n\nif [ -z ${GALOIS_BUILD} ]; then\n  echo \"GALOIS_BUILD not set; Please point it to the top level directory where Galois is built\"\n  exit\nelse\n  echo \"Using ${GALOIS_BUILD} for Galois build to run ${appname}\"\nfi\n\nif [ -z ${INPUT_DIR} ]; then\n  echo \"INPUT_DIR not set; Please point it to the directory with .gr graphs\"\n  exit\nelse\n  echo \"Using ${INPUT_DIR} for inputs for ${appname}\"\nfi\n\ninputDir=\"${INPUT_DIR}\"\nexecDir=\"${GALOIS_BUILD}/lonestar/analytics/cpu/${appname}\"\necho ${execDir}\nif [ ! -d \"${execDir}/logs/\" ]; then\n  mkdir -p ${execDir}/logs/\nfi\necho \"Logs will be available in ${execDir}/logs/\"\n\nexec=\"connected-components-cpu\"\nextension=sgr\n\nfor configType in $(seq 1 2)\ndo\n  if [ ${configType} == 1 ]; then\n    echo \"Running ${appname} with config1\"\n    export GOMP_CPU_AFFINITY=\"0-31\"\n    export KMP_AFFINITY=\"verbose,explicit,proclist=[0-31]\"\n    Threads=32\n  else\n    echo \"Running ${appname} with config2\"\n    Threads=64\n  fi\n\n  for run in $(seq 1 ${numRuns})\n  do\n    for input in \"kron\" \"road\" \"urand\" \"web\" \"twitter\"\n    do\n      if [ ${configType} == 2 ] && [ ${input} == \"web\" ]; then \n        algo=\"EdgetiledAfforest\"\n      else\n        algo=\"Afforest\"\n      fi\n      echo \"Running on ${input}\"\n      filename=\"${appname}_${input}_algo_${algo}_${configType}_Run${run}\"\n      statfile=\"${filename}.stats\"\n      ${execDir}/${exec} -algo=$algo -t=${Threads} $inputDir/GAP-${input}.${extension} -symmetricGraph -statFile=${execDir}/logs/${statfile} &> ${execDir}/logs/${filename}.out\n    done\n  done\ndone\n"
  },
  {
    "path": "scripts/intel_study_scripts/run_pr.sh",
    "content": "#!/bin/bash\n\necho -e \"USAGE: ./run_pr.sh <numRuns>\\n\"\nappname=\"pagerank\"\n\nnumRuns=$1\nif [ -z $numRuns ]; then\n  numRuns=1\nfi\n\nif [ -z ${GALOIS_BUILD} ]; then\n  echo \"GALOIS_BUILD not set; Please point it to the top level directory where Galois is built\"\n  exit\nelse\n  echo \"Using ${GALOIS_BUILD} for Galois build to run ${appname}\"\nfi\n\nif [ -z ${INPUT_DIR} ]; then\n  echo \"INPUT_DIR not set; Please point it to the directory with .gr graphs\"\n  exit\nelse\n  echo \"Using ${INPUT_DIR} for inputs for ${appname}\"\nfi\n\ninputDir=\"${INPUT_DIR}\"\nexecDir=\"${GALOIS_BUILD}/lonestar/analytics/cpu/${appname}\"\necho ${execDir}\nif [ ! -d \"${execDir}/logs/\" ]; then\n  mkdir -p ${execDir}/logs/\nfi\necho \"Logs will be available in ${execDir}/logs/\"\n\nexec=\"pagerank-pull-cpu\"\n\nalgo=\"Topo\"\ntol=1e-4\nmaxIter=1000\n\nfor configType in $(seq 1 2)\ndo\n  if [ ${configType} == 1 ]; then\n    echo \"Running ${appname} with config1\"\n    export GOMP_CPU_AFFINITY=\"0-31\"\n    export KMP_AFFINITY=\"verbose,explicit,proclist=[0-31]\"\n    Threads=32\n  else\n    echo \"Running ${appname} with config2\"\n    Threads=64\n  fi\n\n  for run in $(seq 1 ${numRuns})\n  do\n    for input in \"kron\" \"road\" \"urand\" \"web\" \"twitter\"\n    do\n      if [ ${input} == \"web\" ] || [ ${input} == \"twitter\" ]; then \n        ##NOTE: Using tgr for directed graphs\n        extension=tgr\n      else # kron road urand\n        ##NOTE: Using sgr for undirected graphs\n        extension=sgr\n      fi\n      echo \"Running on ${input}\"\n      filename=\"${appname}_${input}_algo_${algo}_${configType}_Run${run}\"\n      statfile=\"${filename}.stats\"\n      ${execDir}/${exec} -algo=$algo -t=${Threads} $inputDir/GAP-${input}.${extension} -tolerance=${tol} -maxIterations=${maxIter} -transposedGraph -statFile=${execDir}/logs/${statfile} &> ${execDir}/logs/${filename}.out\n    done\n  done\ndone\n"
  },
  {
    "path": "scripts/intel_study_scripts/run_sssp.sh",
    "content": "#!/bin/bash\n\necho -e \"USAGE: ./run_sssp.sh <numRuns>\\n\"\nappname=sssp\n\nnumRuns=$1\nif [ -z $numRuns ]; then\n  numRuns=1\nfi\n\nif [ -z ${GALOIS_BUILD} ]; then\n  echo \"GALOIS_BUILD not set; Please point it to the top level directory where Galois is built\"\n  exit\nelse\n  echo \"Using ${GALOIS_BUILD} for Galois build to run ${appname}\"\nfi\n\nif [ -z ${INPUT_DIR} ]; then\n  echo \"INPUT_DIR not set; Please point it to the directory with .gr graphs\"\n  exit\nelse\n  echo \"Using ${INPUT_DIR} for inputs for ${appname}\"\nfi\n\ninputDir=\"${INPUT_DIR}\"\nexecDir=\"${GALOIS_BUILD}/lonestar/analytics/cpu/${appname}\"\necho ${execDir}\n\nexec=sssp-cpu\n\nfor configType in $(seq 1 2)\ndo\n  if [ ${configType} == 1 ]; then\n    echo \"Running ${appname} with config1\"\n    export GOMP_CPU_AFFINITY=\"0-31\"\n    export KMP_AFFINITY=\"verbose,explicit,proclist=[0-31]\"\n    Threads=32\n  else\n    echo \"Running ${appname} with config2\"\n    Threads=64\n  fi\n\n  for run in $(seq 1 ${numRuns})\n  do\n    for input in \"kron\" \"road\" \"urand\" \"web\" \"twitter\"\n    do\n      if [ ${input} == \"web\" ] || [ ${input} == \"twitter\" ]; then \n        ##NOTE: Using gr for directed graphs\n        extension=gr\n      else # kron road urand\n        ##NOTE: Using sgr for undirected graphs\n        extension=sgr\n      fi\n\n      if [ ${configType} == 1 ]; then \n        algo=\"AutoAlgo\"\n      elif [ ${input} == \"road\" ]; then # ${configType} == 2\n        algo=\"deltaStep\"\n      else # ${configType} == 2\n        algo=\"deltaStepBarrier\"\n      fi\n\n      if [ ${input} == \"road\" ]; then\n        delta=15\n      else\n        delta=1\n      fi\n\n      echo \"Running on ${input}\"\n      echo \"Logs will be available in ${execDir}/logs/${input}\"\n      if [ ! 
-d \"${execDir}/logs/${input}\" ]; then\n        mkdir -p ${execDir}/logs/${input}\n      fi\n\n      while read p; do\n        source_node=$((${p} - 1))\n        filename=\"${appname}_${input}_source_${source_node}_algo_${algo}_${configType}_Run${run}\"\n        statfile=\"${filename}.stats\"\n        ${execDir}/${exec} -t=${Threads} -delta=${delta} -algo=$algo $inputDir/GAP-${input}.${extension} -startNode=${source_node} -statFile=${execDir}/logs/${input}/${statfile} &> ${execDir}/logs/${input}/${filename}.out\n      done < $inputDir/sources/GAP-${input}_sources.mtx\n    done\n  done\ndone\n"
  },
  {
    "path": "scripts/intel_study_scripts/run_tc.sh",
    "content": "#!/bin/bash\n\necho -e \"USAGE: ./run_tc.sh <numRuns>\\n\"\nappname=\"triangle-counting\"\n\nnumRuns=$1\nif [ -z $numRuns ]; then\n  numRuns=1\nfi\n\nif [ -z ${GALOIS_BUILD} ]; then\n  echo \"GALOIS_BUILD not set; Please point it to the top level directory where Galois is built\"\n  exit\nelse\n  echo \"Using ${GALOIS_BUILD} for Galois build to run ${appname}\"\nfi\n\nif [ -z ${INPUT_DIR} ]; then\n  echo \"INPUT_DIR not set; Please point it to the directory with .gr graphs\"\n  exit\nelse\n  echo \"Using ${INPUT_DIR} for inputs for ${appname}\"\nfi\n\ninputDir=\"${INPUT_DIR}\"\nexecDir=\"${GALOIS_BUILD}/lonestar/analytics/cpu/${appname}\"\necho ${execDir}\nif [ ! -d \"${execDir}/logs/\" ]; then\n  mkdir -p ${execDir}/logs/\nfi\necho \"Logs will be available in ${execDir}/logs/\"\n\nexec=\"triangle-counting-cpu\"\nalgo=\"orderedCount\"\n\nfor configType in $(seq 1 2)\ndo\n  if [ ${configType} == 1 ]; then\n    echo \"Running ${appname} with config1\"\n    export GOMP_CPU_AFFINITY=\"0-31\"\n    export KMP_AFFINITY=\"verbose,explicit,proclist=[0-31]\"\n    Threads=32\n  else\n    echo \"Running ${appname} with config2\"\n    Threads=64\n  fi\n\n  for run in $(seq 1 ${numRuns})\n  do\n    for input in \"kron\" \"road\" \"urand\" \"web\" \"twitter\"\n    do\n      if [ ${input} == \"web\" ] || [ ${input} == \"twitter\" ]; then \n        ##NOTE: Using csgr for directed graphs\n        extension=csgr\n      else # kron road urand\n        ##NOTE: Using sgr for undirected graphs\n        extension=sgr\n      fi\n\n      echo \"Running on ${input}\"\n      filename=\"${appname}_${input}_algo_${algo}_${configType}_Run${run}\"\n      statfile=\"${filename}.stats\"\n      ${execDir}/${exec} -algo=$algo -t=${Threads} $inputDir/GAP-${input}.${extension} -symmetricGraph -statFile=${execDir}/logs/${statfile} &> ${execDir}/logs/${filename}.out\n    done\n  done\ndone\n"
  },
  {
    "path": "scripts/iss_load_modules.sh",
    "content": "#!/bin/bash\n\nif [ \"$(basename -- $0)\" == \"iss_load_modules.sh\" ]; then\n  echo \"Source this file instead of running directly\" >&2\n  exit 1\nfi\n\n# first up remove everything\nmodule purge\n\nmodule use /opt/apps/ossw/modulefiles/\n\nmodule load c7\nmodule load serf\n\nmodule use /net/faraday/workspace/local/modules/modulefiles\nmodule use /org/centers/cdgc/modules\n\nmodule load atc\nmodule load cmake\nmodule load mpich2\nmodule load boost\nmodule load gdb\nmodule load isspython # needed for vim\nmodule load git\nmodule load fmt\n\nif [ \"$1\" != \"min\" ]; then\n  module load tbb\n  module load eigen\n  module load neon\n  module load lapack\n  module load vtune\n  module load mkl\n  module load texlive\n  module load subversion\n  # module load screen #disabling for now because screen was compiled without proper color support\n  if [ \"$SYSTEMTYPE\" != \"c7\" ] ; then\n    module load doxygen\n    module load gnuplot\n    module load ghostscript\n    module load valgrind\n  fi\nfi\n"
  },
  {
    "path": "scripts/make_dist.sh.in",
    "content": "#!/bin/bash\n#\n# Make distribution tarball\n\nNAME=\"Galois-@GALOIS_VERSION_MAJOR@.@GALOIS_VERSION_MINOR@.@GALOIS_VERSION_PATCH@\"\n\nif [[ ! -e COPYRIGHT ]]; then\n  echo \"Run this from the root source directory\" 1>&2\n  exit 1\nfi\n\ntouch \"$NAME.tar.gz\" # Prevent . from changing during tar\n#(svn status | grep '^\\?' | sed -e 's/^\\? *//'; \\\n( \\\n  echo \".git\"; \\\n  echo \"*.swp\"; \\\n  echo \"*~\"; \\\n  echo \"exp\"; \\\n  echo \"$NAME.tar.gz\") | \\\n  tar --exclude-from=- --exclude-vcs --transform \"s,^\\./,$NAME/,\" -cz -f \"$NAME.tar.gz\" .\n"
  },
  {
    "path": "scripts/merge_vtune.pl",
    "content": "#!/usr/bin/perl\n#\n# Take the output of individual run_vtune commands and merge them into\n# a single file\n\nuse strict;\nuse warnings;\nuse Getopt::Long;\nuse Pod::Usage;\n\nmy $Help = 0;\n\nGetOptions('help'=>\\$Help) or pod2usage(2);\npod2usage(-exitstatus=>0, -verbose=>2, -noperldoc=>1) if $Help;\ndie(\"need at least one file\") unless (@ARGV >= 2);\n\nwhile (@ARGV) {\n  my $threads = shift @ARGV;\n  my $filename = shift @ARGV;\n  open(my $fh, '<', $filename) or die($!);\n  die(\"empty file\") unless (<$fh>);\n  print \"THREADS\\t$threads\\n\";\n  while (my $line = <$fh>) {\n    print $line;\n  }\n}\n\n__END__\n\n=head1 NAME\n\nmerge_vtune - Merge output from multiple run_vtune commands\n\n=head1 SYNOPSIS\n\nmerge_vtune (<num threads> <file>)+ > merged\n\n=head1 DESCRIPTION\n\nMerge output from multiple run_vtune commands\n\n=cut\n\n"
  },
  {
    "path": "scripts/plot_lonestar_apps.R",
    "content": "library(ggplot2)\nlibrary(gtable)\nlibrary(grid)\nlibrary(gridExtra)\nlibrary(plyr)\nlibrary(reshape2)\n#library(dplyr)\n\nshowPlot <- function(name, p, ...) {\n  if (interactive()) {\n    return(invisible(NULL))\n  }\n  outfile <- paste(\"figs\", \"/\", name, \".pdf\", sep=\"\")\n  ggsave(p + theme(plot.margin=unit(c(0, 0.25, 0, 0), \"lines\")), file=outfile, ...)\n}\n\nshowImage <- function(name, p, ...) {\n  if (interactive()) {\n    return(invisible(NULL))\n  }\n  outfile <- paste(\"figs\", \"/\", name, \".png\", sep=\"\")\n  ggsave(p + theme(plot.margin=unit(c(0, 0.25, 0, 0), \"lines\")), file=outfile, width=8, height=4, ...)\n}\n\nshowPlotGrid <- function(name, p, ...) {\n    if (interactive()) {\n          return(invisible(NULL))\n  }\n  outfile <- paste(\"figs\", \"/\", name, \".pdf\", sep=\"\")\n    ggsave(p , file=outfile, ...)\n}\n\ngeomean <- function(label, x) {\n  gm <- exp(sum(log(x[x > 0]), na.rm=TRUE) / length(x))\n  print(c(label, gm))\n}\n\nround_df <- function(df, digits) {\n  nums <- vapply(df, is.numeric, FUN.VALUE = logical(1))\n\n  df[,nums] <- round(df[,nums], digits = digits)\n\n  (df)\n}\n\ngrid_arrange_shared_legend <- function(..., ncol = length(list(...)), nrow = 1, position = c(\"bottom\", \"right\"), name = \"comm-breakdown\") {\n\n  plots <- list(...)\n  position <- match.arg(position)\n  name <- match.arg(name)\n  g <- ggplotGrob(plots[[1]] +labs(fill=\"\") + guides(fill = guide_legend(nrow = 2)) + theme(legend.position = position))$grobs\n  legend <- g[[which(sapply(g, function(x) x$name) == \"guide-box\")]]\n  lheight <- sum(legend$height)\n  lwidth <- sum(legend$width)\n  gl <- lapply(plots, function(x) x + theme(legend.position=\"none\"))\n  gl <- c(gl, ncol = ncol, nrow = nrow)\n\n  combined <- switch(position,\n                     \"bottom\" = arrangeGrob(do.call(arrangeGrob, gl),\n                                            legend,\n                                            ncol = 1,\n                      
                      heights = unit.c(unit(1, \"npc\") - lheight, lheight)),\n                     \"right\" = arrangeGrob(do.call(arrangeGrob, gl),\n                                           legend,\n                                           ncol = 2,\n                                           widths = unit.c(unit(1, \"npc\") - lwidth, lwidth)))\n\n\n  showPlotGrid(name, combined, width=3.25, height=6, unit = \"in\")\n  # return gtable invisibly\n  invisible(combined)\n\n}\n\n\npreProcess <- function(directoryName, benchmarkName, hasInput=TRUE){\n  #################### No Crash #########################\n  readFile <- paste(directoryName, \"/\", benchmarkName,\".csv\", sep=\"\")\n  res <- read.csv(readFile, stringsAsFactors=F)\n\n  tmpMean <- aggregate(. ~ benchmark + input + numThreads  + deviceKind, data = res, mean)\n  print((tmpMean))\n\n  tmpMean$totalTime <- tmpMean$totalTime/1000\n  #if(hasInput)\n    #p <- ggplot(tmpMean, aes(x=numThreads, y=totalTime)) + geom_line(color=\"steelblue\") + geom_point(color=\"steelblue\") + facet_grid(~input, scales=\"free_y\") + scale_y_continuous(\"Time Total (s)\")+ theme(axis.text.x = element_text(angle = 0)) + scale_x_continuous(\"Number of Threads\", breaks = unique(tmpMean$numThreads)) + scale_color_manual(values=c(\"#CC6666\")) #+ scale_x_continuous(\"Hosts\", trans=\"log2\", breaks=c(1, 4, 16, 64))\n  #else\n    p <- ggplot(tmpMean, aes(x=numThreads, y=totalTime)) + geom_line(color=\"steelblue\") + geom_point(color=\"steelblue4\") + scale_y_continuous(\"Time Total (s)\")+ theme(axis.text.x = element_text(angle = 0)) + scale_x_continuous(\"Number of Threads\", breaks = unique(tmpMean$numThreads)) + scale_color_manual(values=c(\"#CC6666\")) #+ scale_x_continuous(\"Hosts\", trans=\"log2\", breaks=c(1, 4, 16, 64))\n\n  outFileName <- paste(benchmarkName, \"_totalTime\", sep=\"\")\n  showPlot(outFileName, p, width=3.6, height=5.25, unit = \"in\")\n}\n\npreProcess(\"./\", \"barneshut\")\npreProcess(\"./\", 
\"bc-async\")\npreProcess(\"./\", \"bc-outer\")\npreProcess(\"./\", \"bfs\")\npreProcess(\"./\", \"boruvka\")\npreProcess(\"./\", \"connectedcomponents\")\npreProcess(\"./\", \"delaunaytriangulation\")\npreProcess(\"./\", \"dmr\")\npreProcess(\"./\", \"gmetis\")\npreProcess(\"./\", \"independentset\")\npreProcess(\"./\", \"matrixcompletion\")\npreProcess(\"./\", \"mcm\")\npreProcess(\"./\", \"pagerank-pull\")\npreProcess(\"./\", \"pagerank-push\")\npreProcess(\"./\", \"preflowpush\")\npreProcess(\"./\", \"pta\")\npreProcess(\"./\", \"sssp\")\npreProcess(\"./\", \"surveypropagation\")\npreProcess(\"./\", \"triangles-edge\")\npreProcess(\"./\", \"triangles-node\")\n"
  },
  {
    "path": "scripts/quick_plot.pl",
    "content": "while (<STDIN>) {\n    if (/STAT.*/) {\n\tmy @values = split ',';\n\tif ($values[2] eq $ARGV[0]) {\n\t    $v{$values[3]} += $values[4];\n\t    $n{$values[3]} += 1;\n\t}\n    }\n}\n\n#foreach $key (sort {$a <=> $b} keys %v) {\n#    print \"$key $v{$key} $n{$key}\\n\";\n#}\n\nopen GP, \"|gnuplot -persist\" or die \"Can't execute gnuplot\";\n\nif (exists $n{1}) {\n    $doscale = 1;\n} else {\n    $doscale = 0;\n}\n\nif (scalar @ARGV > 1) {\n    print \"outputfile (eps) is $ARGV[1]\\n\";\n    open GP, \"|gnuplot\" or die \"Can't execute gnuplot\";\n    print GP \"set terminal postscript enhanced color\\n\";\n    print GP \"set output '| ps2pdf - $ARGV[1]'\\n\";\n} else {\n    open GP, \"|gnuplot -persist\" or die \"Can't execute gnuplot\";\n}\n\nprint GP \"set xlabel \\\"threads\\\"\\n\";\nprint GP \"set ylabel \\\"$ARGV[0]\\\"\\n\";\nprint GP \"set y2label \\\"Scaling\\\"\\n\" if $doscale;\nprint GP \"set y2tics nomirror\\n\" if $doscale;\nprint GP \"set ytics nomirror\\n\";\nprint GP \"plot '-' title \\\"$ARGV[0]\\\" with lines axis x1y1\";\nprint GP \", '-' title \\\"scaling\\\" with lines axis x1y2\" if $doscale;\nprint GP \"\\n\";\n\nforeach $key (sort {$a <=> $b} keys %v) {\n    print GP $key . \" \" . ($v{$key} / $n{$key}) . \"\\n\";\n}\nprint GP \"e\\n\";\n\nif ($doscale) {\n    foreach $key (sort {$a <=> $b} keys %v) {\n\tprint GP $key . \" \" . ($v{1} / $n{1}) / ($v{$key} / $n{$key})  . \"\\n\";\n    }\n    print GP \"e\\n\";\n}\n"
  },
  {
    "path": "scripts/rcat.py",
    "content": "#!/usr/bin/env python\n\"\"\"\nLike cat but optionally add key-values after 'RUN: Start'. Useful with report.py.\n\n@section License\n\nCopyright (C) 2012, The University of Texas at Austin. All rights reserved.\nUNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS SOFTWARE\nAND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR ANY\nPARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY\nWARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF TRADE.\nNO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO THE USE OF THE\nSOFTWARE OR DOCUMENTATION. Under no circumstances shall University be liable\nfor incidental, special, indirect, direct or consequential damages or loss of\nprofits, interruption of business, or related expenses which may arise from use\nof Software or Documentation, including but not limited to those resulting from\ndefects in Software and/or Documentation, or loss or inaccuracy of data of any\nkind.\n\n@author Donald Nguyen <ddn@cs.utexas.edu> \n\"\"\"\n\nfrom __future__ import print_function\nimport sys\nimport os\nimport re\nimport optparse \nimport collections\n\ndef process(fh, options):\n  regex = re.compile(r'^RUN: Start')\n  pairs = [kv.split('=') for kv in options.add]\n  text = '\\n'.join(['RUN: Variable %s = %s' % (k, v) for (k,v) in pairs])\n\n  for line in fh:\n    print(line, end='')\n    if regex.match(line):\n      print(text)\n\n\nif __name__ == '__main__':\n  parser = optparse.OptionParser(usage='usage: %prog [options]')\n  parser.add_option('-a', '--add-column',\n      dest=\"add\", default=[], action='append',\n      help='column to include in output. Multiple columns can be specified '\n           + 'with multiple options or a comma separated list of columns. 
'\n           + 'Example: --add-column Version=Good')\n\n  (options, args) = parser.parse_args()\n\n  if args:\n    for f in args:\n      with open(f) as fh:\n        process(fh, options)\n  else:\n    process(sys.stdin, options)\n"
  },
  {
    "path": "scripts/report.py",
    "content": "#!/usr/bin/env python\n\nimport sys\nimport collections\n\ndef main():\n  class Row:\n    def __init__(self):\n      self.reset()\n    def reset(self):\n      self.r = collections.defaultdict(str)\n      self.header = None\n    def get(self, token, key):\n      return token[self.header.index(key)]\n\n  row = Row()\n  rows = []\n  cols = set()\n\n  for line in sys.stdin:\n    try:\n      param_token = [i.strip() for i in line.split()]\n      stat_token = [i.strip() for i in line.split(\",\")]\n\n      # empty line\n      if param_token == []:\n        continue\n\n      # parameter setting by run.py\n      if param_token[0] == \"RUN:\":\n        if param_token[1] == \"Start\":\n          if row.r:\n            rows.append(row.r)\n            row.reset()\n        elif param_token[1] == \"Variable\":\n          key = param_token[2]\n          cols.add(key)\n          row.r[key] = param_token[4] # param_token[3] is \"=\"\n        elif param_token[1] == \"CommandLine\":\n          cmd_token = [i.strip() for i in line.split(None, 2)]\n          key = cmd_token[1]\n          cols.add(key)\n          row.r[key] = cmd_token[2]\n\n      # stat header returned by Galois\n      elif stat_token[0] == \"LOOP\":\n        row.header = stat_token\n\n      # stat lines. 
ignore HOST for shared-memory version\n      elif row.header != None:\n        loop_name = row.get(stat_token, \"LOOP\")\n        instance = row.get(stat_token, \"INSTANCE\")\n        th = row.get(stat_token, \"THREAD\")\n        key = row.get(stat_token, \"CATEGORY\") + \"-t\" + th\n        if loop_name != \"(NULL)\":\n          key = loop_name + \"-i\" + instance + \"-\" + key\n        cols.add(key)\n        row.r[key] = row.get(stat_token, \"VAL\")\n\n    except:\n      sys.stderr.write(\"Error parsing line: %s\" % line)\n      raise\n\n  if row.r:\n    rows.append(row.r)\n  cols = sorted(cols)\n\n  print(','.join(cols))\n  for r in rows:\n    print(','.join([str(r[c]) for c in cols]))\n\n\nif __name__ == \"__main__\":\n  main()\n"
  },
  {
    "path": "scripts/report_vtune.pl",
    "content": "#!/usr/bin/perl\n#\n# Take output of run_vtune.pl and produce tab-deliminated file\n\nuse strict; \nuse warnings;\nuse Getopt::Long;\nuse Pod::Usage;\n\n# Command line options\nmy $InType = \"line\";\nmy %validInTypes = map { $_ => 1 } (\"line\", \"function\");\nmy $ShowType = \"raw\";\nmy %validShowTypes = map { $_ => 1 } (\"raw\", \"ratio\", \"scalebythread\");\nmy $Help = 0;\n\nmy %Stats = ();\nmy %Thread_keys = ();\nmy $TOTAL = \"TOTAL\";\n\nmy $DEBUG=0;\nsub debug {\n  print STDERR \">>>DEBUG: @_\\n\" if $DEBUG;\n}\n\nsub show {\n  my ($tmap, $lk, $tk) = @_;\n\n  if (!exists $tmap->{$lk}{$tk}) {\n    print STDERR \"ERROR: missing key: tk=$tk, lk=$lk\\n\";\n  }\n\n  if ($ShowType eq \"scalebythread\") {\n    return $tmap->{$lk}{$tk} / $tk;\n  } elsif ($ShowType eq \"ratio\") {\n    return $tmap->{$lk}{$tk} / $tmap->{$TOTAL}{$tk};\n  } elsif ($ShowType eq \"raw\") {\n    return $tmap->{$lk}{$tk};\n  } else {\n    die;\n  }\n}\n\nGetOptions('show=s'=>\\$ShowType, 'in=s'=>\\$InType, 'help'=> \\$Help) or pod2usage(2);\npod2usage(-exitstatus=>0, -verbose=>2, -noperldoc=>1) if $Help;\ndie(\"unknown show type\") unless ($validShowTypes{$ShowType});\ndie(\"unknown in type\") unless ($validInTypes{$InType});\n\nmy $newSet = 0;\nmy $curThread = 0;\nmy @heads = ();\n\nwhile (<>) {\n  chomp;\n  my @line = split /\\t/;\n\n  debug \"line:@line\\n\";\n  # print \"line:$line[0],$line[1]\\n\";\n\n  if ($line[0] =~ /^THREADS$/) {\n    debug \"Threads line: @line\\n\";\n    $newSet = 1;\n    $curThread = $line[1];\n    $Thread_keys{$curThread} = 1;\n  } elsif ($newSet) {\n    $newSet = 0;\n    @heads = @line;\n    debug \"headers:@heads\\n\";\n  } else {\n    my $ind;\n    my $offset = 0;\n\n    debug \"line=@line, length=$#line\\n\";\n\n    if ($InType eq \"line\") {\n      # first 2 columns are source file and line\n      my $file = shift @line;\n      my $ln = shift @line;\n\n      $offset = 2;\n      if ($heads[$offset] =~ /file path/i) {\n        my $path = 
shift @line;\n        $offset += 1;\n      } \n\n      $ind = \"$file:$ln\";\n\n    } elsif ($InType eq \"function\") {\n      # first column is function name \n      # last 4 colums are module, function full name, source file, start address\n      my $function = shift @line;\n\n      my $address = pop @line;\n      my $file = pop @line;\n      my $fullname = pop @line;\n      my $module = pop @line;\n\n      $offset = 1;\n      $ind = \"$file:$fullname:$module:$address\";\n    }\n\n    debug \"line=@line, length=$#line\\n\";\n\n    for (my $i = 0; $i <= $#line; $i++) {\n      my $nk = $heads[$i + $offset];\n      debug \"nk=$nk\\n\";\n      $Stats{$nk}{$curThread}{$ind} += $line[$i];\n      $Stats{$nk}{$curThread}{$TOTAL} += $line[$i];\n    }\n    # print \"###\\n\";\n  }\n}\n\n# for the combinations of (line_keys, Thread_keys) for a given stat_name that don't\n# have corresponding Stats, we put a 0. e.g. a particular function/line shows\n# up in the profile at threads=1 but not at threads=16.\nforeach my $nk (keys %Stats) {\n  my %line_keys = ();\n  foreach my $tk (keys %{$Stats{$nk}}) {\n    foreach my $lk (keys %{$Stats{$nk}{$tk}}) {\n      $line_keys{$lk} = 1;\n    }\n  }\n\n  foreach my $tk (keys %Thread_keys) {\n    foreach my $lk (keys %line_keys) {\n      if (!exists $Stats{$nk}{$tk}{$lk}) {\n        $Stats{$nk}{$tk}{$lk} = 0;\n      }    \n    }\n  }\n}\n\nmy $maxThread = (sort { $a <=> $b } keys %Thread_keys)[-1];\n\nforeach my $nk (sort keys %Stats) {\n  print \"$nk\";\n  foreach my $tk (sort { $a <=> $b } keys %Thread_keys) {\n    print \"\\t$tk\";\n  }\n  print \"\\n\";\n\n  my %transpose = ();\n  foreach my $tk (keys %Thread_keys) {\n    foreach my $lk (keys %{$Stats{$nk}{$tk}}) {\n      $transpose{$lk}{$tk} = $Stats{$nk}{$tk}{$lk};\n    }\n  }\n\n  # delete lines with all 0s from transpose\n  foreach my $lk (keys %transpose) {\n    my $all_zeros = 1;\n    foreach my $tk (keys %Thread_keys) {\n      if ($transpose{$lk}{$tk} != 0) {\n        
$all_zeros = 0;\n        last;\n      }\n    }\n\n    if ($all_zeros) {\n      delete $transpose{$lk};\n    }\n  }\n\n  #sort by final thread performance\n  foreach my $lk (sort { show(\\%transpose, $b, $maxThread) <=> show(\\%transpose, $a, $maxThread) }\n    keys %transpose) {\n\n    print \"$lk\";\n    foreach my $tk (sort { $a <=> $b } keys %Thread_keys) {\n      print \"\\t\" . show(\\%transpose, $lk, $tk);\n    }\n    print \"\\n\";\n  }\n\n  print \"\\n\\n\\n\";\n}\n\n__END__\n\n=head1 NAME\n\nreport_vtune - Emit tab-separated file from output of run_vtune\n\n=head1 SYNOPSIS\n\ncat output | report_vtune [options] > output.tsv\n\n Options:\n   -help             brief help message\n   -in=INTYPE        format of run_vtune output\n   -show=SHOWTYPE    output format\n\n=head1 OPTIONS\n\n=over 8\n\n=item B<-help>\n\nPrint a brief help message and exit.\n\n=item B<-in>=INTYPE\n\nrun_vtune output is grouped by INTYPE instead of by line: line, function\n\n=item B<-show>=SHOWTYPE\n\nOutput SHOWTYPE instead of raw counts: raw, ratio, scalebythread\n\n=back\n\n=head1 DESCRIPTION\n\nEmit tab-separated file from output of run_vtune\n\n=cut\n"
  },
  {
    "path": "scripts/result_checker.py",
    "content": "#### Script to check the output of algorithms:\n### Author: Gurbinder Gill (gurbinder533@gmail.com)\n### Author: Roshan Dathathri (roshan@cs.utexas.edu)\n### Modified to calculate error + take tolerance as an error by Loc Hoang \n\n### python script.py masterFile allfile* [-t, -tolerance]=<tolerance>\n\n### expects files in the follwing format:\n###### nodeID nodeFieldVal\n######## These are generated by galois::runtime::printOutput function.\n### Requires python version > 2.7\n# Can also take 2 field files TODO make more general\n\nimport sys\nimport argparse\nimport os\n\ndef check_results(masterFile, otherFiles, tolerance, \n  offset, errors, mrows, global_error_squared, num_nodes):\n\n  with open(masterFile) as mfile, open(otherFiles) as ofile:\n    mfile.seek(offset)\n\n    for line2 in ofile:\n      line1 = mfile.readline()\n      offset = offset + len(line1)\n\n      split_line1 = line1.split(' ')\n      split_line2 = line2.split(' ')\n\n      if (split_line1[0] == ''):\n        print(\"ERROR: output longer than input\")\n        return (0, errors, mrows)\n\n      while (long(split_line1[0]) < long(split_line2[0])):\n        print \"MISSING ROW: \", split_line1[0]\n        mrows = mrows + 1\n        line1 = mfile.readline()\n        offset = offset + len(line1)\n        split_line1 = line1.split(' ')\n\n\n      # forces failure if missings rows exist\n      #if mrows > 0:\n      #  return (-1, errors, mrows)\n\n      if (long(split_line1[0]) == long(split_line2[0])):\n        # absolute value of difference in fields\n        field_difference = abs(float(split_line1[1]) - float(split_line2[1]))\n\n        global_error_squared += (field_difference ** 2)\n        num_nodes += 1\n\n        if (field_difference > tolerance):\n          print \"NOT MATCHED \\n\";\n          print line1;\n          print line2;\n          errors = errors + 1;\n        # TODO (Loc) make more general: deals with 2 fields in output (should\n        # optimally deal 
with arbitrary # of fields\n        elif (len(split_line1) == 3):\n          field_difference2 = abs(float(split_line1[2]) - float(split_line2[2]))\n          if (field_difference2 > tolerance):\n            print \"NOT MATCHED \\n\";\n            print line1;\n            print line2;\n            errors = errors + 1;\n      else:\n        print \"OFFSET MISMATCH: \", split_line1[0], split_line2[0]\n        return (-1, errors, mrows, global_error_squared, num_nodes);\n\n  return (offset, errors, mrows, global_error_squared, num_nodes);\n\ndef main(masterFile, allFiles_arr, tolerance, mean_tolerance):\n  offset = 0\n  errors = 0\n  mrows = 0\n  global_error_squared = 0\n  num_nodes = 0\n\n  for i in range(0 , len(allFiles_arr)):\n    print allFiles_arr[i]\n    print offset\n    offset, errors, mrows, global_error_squared, num_nodes = check_results(masterFile, allFiles_arr[i], tolerance, offset, errors, mrows, global_error_squared, num_nodes)\n    if (offset == -1):\n      break\n\n  rmse = (global_error_squared / num_nodes) ** 0.5\n  if (rmse > mean_tolerance):\n    print \"\\nRoot mean square error (for first field): \", rmse\n\n  if (offset != -1):\n    mfile = open(masterFile)\n    mfile.seek(offset)\n    old_mrows=mrows\n    for line in mfile:\n      mrows = mrows + 1\n    if mrows > old_mrows:\n      mrows = mrows - old_mrows\n      print \"\\nNo of offsets/rows missing: \", mrows\n\n  if (offset == -1):\n    print \"\\nOffset not correct\"\n\n  if (errors > 0):\n    print \"\\nNo. 
of mismatches: \", errors\n\n  if (errors > 0) or (offset == -1) or (mrows > 0) or (rmse > mean_tolerance):\n    print \"\\nFAILED\\n\"\n    return 1\n  else:\n    print \"\\nSUCCESS\\n\"\n    return 0\n\nif __name__ == \"__main__\":\n  parser = argparse.ArgumentParser(description=\"Check graph output results\")\n\n  # parse files and an optional tolerance\n  parser.add_argument('files', type=str, nargs='+', help='input + output files')\n  parser.add_argument('-tolerance', '-t', type=float, nargs=1, default=0.0001,\n                      help='tolerance for difference in fields (error)')\n  parser.add_argument('-sort', '-s', type=bool, nargs=1, default=False,\n                      help='sort the generated output files')\n  parser.add_argument('-delete', '-d', type=bool, nargs=1, default=False,\n                      help='delete the generated output files')\n  parser.add_argument('-mean_tolerance', '-m', type=float, nargs=1, default=0.0001,\n                      help='tolerance for root mean square error')\n\n  arg = sys.argv\n  parsed_arguments = parser.parse_args()\n\n  masterFile = parsed_arguments.files[0]\n  allFiles_arr = parsed_arguments.files[1:]\n\n  print masterFile  \n  print allFiles_arr  \n\n  if parsed_arguments.sort:\n    sortstr = \"sort -nu\"  \n    for f in allFiles_arr:\n      sortstr += \" \" + f\n    sortstr += \" -o .output_log\"\n    os.system(sortstr)\n\n  if parsed_arguments.delete:\n    rmstr = \"rm -f\"\n    for f in allFiles_arr:\n      rmstr += \" \" + f\n    os.system(rmstr)\n\n  if parsed_arguments.sort:\n    allFiles_arr = ['.output_log']\n\n  tolerance = parsed_arguments.tolerance\n  mean_tolerance = parsed_arguments.mean_tolerance\n\n  print(\"Starting comparison...\")\n  ret = main(masterFile, allFiles_arr, tolerance, mean_tolerance)\n\n  if parsed_arguments.sort:\n    os.system(\"rm -f .output_log\")\n\n  if ret:\n    sys.exit(1)\n"
  },
  {
    "path": "scripts/run.py",
    "content": "#!/usr/bin/env python\n#\n# Run an application multiple times, varying parameters like\n# number of threads, etc\n\nfrom __future__ import print_function\nimport sys\nimport os\nimport subprocess\nimport optparse\nimport shlex\nimport signal\n\n\ndef die(s):\n  sys.stderr.write(s)\n  sys.exit(1)\n\n\ndef print_bright(s):\n  red = '\\033[1;31m'\n  endc = '\\033[0m'\n  print(red + s + endc)\n\n\ndef parse_range(s):\n  \"\"\"\n  Parses thread range s\n  Grammar:\n   R := R,R\n      | S\n      | N\n      | N:N\n      | N:N:N\n   N := an integer\n   S := a string\n  \"\"\"\n  # Parsing strategy: greedily parse integers with one character\n  # lookahead to figure out exact category\n  s = s + ' ' # append special end marker\n  retval = []\n  cur = -1\n  curseq = []\n  for i in range(len(s)):\n    if s[i] == ',' or s[i] == ' ':\n      if cur < 0:\n        break\n      if len(curseq) == 0:\n        retval.append(s[cur:i])\n      elif len(curseq) == 1:\n        retval.extend(range(curseq[0], int(s[cur:i]) + 1))\n      elif len(curseq) == 2:\n        retval.extend(range(curseq[0], curseq[1] + 1, int(s[cur:i])))\n      else:\n        break\n      cur = -1\n      curseq = []\n    elif s[i] == ':' and cur >= 0:\n      curseq.append(int(s[cur:i]))\n      cur = -1\n    elif cur < 0:\n      cur = i\n    else:\n      pass\n  else:\n    return sorted(set(retval))\n  die('error parsing range: %s\\n' % s)\n\n\ndef product(args):\n  \"\"\"\n  Like itertools.product but for one iterable of iterables\n  rather than an argument list of iterables\n  \"\"\"\n  pools = map(tuple, args)\n  result = [[]]\n  for pool in pools:\n    result = [x+[y] for x in result for y in pool]\n  for prod in result:\n    yield tuple(prod)\n\n\ndef run(cmd, values, envs, options):\n  import subprocess, datetime, os, time, signal, socket\n\n  new_env = dict(os.environ)\n  new_env.update(envs)\n  is_tty = sys.stdout.isatty()\n\n  for R in range(options.runs):\n    if is_tty:\n      
print_bright('RUN: Start')\n    else:\n      print('RUN: Start')\n    print(\"RUN: CommandLine %s\" % ' '.join(cmd))\n    print(\"RUN: Variable Hostname = %s\" % socket.gethostname())\n    print(\"RUN: Variable Timestamp = %f\" % time.time())\n\n    for (name, value) in values:\n      print('RUN: Variable %s = %s' % (name, value))\n\n    if options.timeout:\n      start = datetime.datetime.now()\n      process = subprocess.Popen(cmd, env=new_env)\n      while process.poll() is None:\n        time.sleep(5)\n        now = datetime.datetime.now()\n        diff = (now-start).seconds\n        if diff > options.timeout:\n          process.kill()\n          #os.waitpid(-1, os.WNOHANG)\n          os.waitpid(-1, 0)\n          print(\"RUN: Variable Timeout = %d\" % (diff*1000))\n          break\n      retcode = process.returncode\n    else:\n      retcode = subprocess.call(cmd, env=new_env)\n    if retcode != 0:\n      # print command line just in case child process should be died before doing it\n      print(\"RUN: Error %s\" % retcode)\n      if not options.ignore_errors:\n        sys.exit(1)\n\ndef parse_extra(extra):\n  \"\"\"\n  Parse extra command line option.\n  \n  Three cases:\n   (1) <name>::<arg>::<range>\n   (2) ::<arg>::<range>\n   (3) <name>::<range>\n  \"\"\"\n  import re\n  if extra.count('::') == 2:\n    (name, arg, r) = extra.split('::')\n    if not name:\n      name = re.sub(r'^-*', '', arg)\n  elif extra.count('::') == 1:\n    (name, r) = extra.split('::')\n    arg = None\n  else:\n    die('error parsing extra argument: %s\\n' % extra)\n  return (name, arg, r)\n\n\ndef main(args, options):\n  variables = []\n  ranges = []\n  extras = [(e, False) for e in options.extra]\n  extras += [(e, True) for e in options.extra_env]\n  for (extra, env) in extras:\n    (name, arg, r) = parse_extra(extra)\n    variables.append((name, arg, env))\n    ranges.append(parse_range(r))\n\n  for prod in product(ranges):\n    params = []\n    values = []\n    envs = {}\n    for 
((name, arg, env), value) in zip(variables, prod):\n      if env:\n        if arg:\n          envs[arg] = str(value)\n        else:\n          envs[str(value)] = ''\n      else:\n        if arg:\n          params.extend([arg, str(value)])\n        else:\n          params.extend([str(value)])\n      values.append((name, str(value)))\n    if options.append_arguments:\n      cmd = args + params\n    else:\n      cmd = [args[0]] + params + args[1:]\n    run(cmd, values, envs, options)\n\n\nif __name__ == '__main__':\n  signal.signal(signal.SIGQUIT, signal.SIG_IGN)\n  sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)\n  parser = optparse.OptionParser(usage='usage: %prog [options] <command line> ...')\n  parser.add_option('--ignore-errors', dest='ignore_errors', default=False, action='store_true',\n      help='ignore errors in subprocesses')\n  parser.add_option('-t', '--threads', dest=\"threads\", default=\"1\",\n      help='range of threads to use. A range is R := R,R | S | N | N:N | N:N:N where N is an integer and S is a string.')\n  parser.add_option('-r', '--runs', default=1, type='int',\n      help='set number of runs')\n  parser.add_option('-x', '--extra', dest=\"extra\", default=[], action='append',\n      help='add commandline parameter to range over (format: <name>::<arg>::<range> or ::<arg>::<range> or <name>::<range>). E.g., delta::-delta::1,5 or ::-delta::1,5 or schedule::-useFIFO,-useLIFO')\n  parser.add_option('-e', '--extra-env', dest=\"extra_env\", default=[], action='append',\n      help='add environment variable to range over (format: <name>::<arg>::<range> or ::<arg>::<range> or <name>::<range>). 
E.g., delta::-delta::1,5 or ::-delta::1,5 or schedule::-useFIFO,-useLIFO')\n  parser.add_option('-o', '--timeout', dest=\"timeout\", default=0, type='int',\n      help=\"timeout a run after SEC seconds\", metavar='SEC')\n  parser.add_option('--no-default-thread', dest='no_default_thread', default=False, action='store_true',\n      help='supress run command default thread argument')\n  parser.add_option('--append-arguments', dest='append_arguments', default=False, action='store_true',\n      help='append instead of prepend additional command line arguments')\n  (options, args) = parser.parse_args()\n  if not args:\n    parser.error('need command to run')\n  if not options.no_default_thread:\n    options.extra.insert(0, '%s::%s::%s' % ('Threads', '-t', options.threads))\n  main(args, options)\n"
  },
  {
    "path": "scripts/run_vtune.pl",
    "content": "#!/usr/bin/perl\n#\n# Run vtune and collect report to file\n\nuse strict;\nuse warnings;\nuse Getopt::Long;\nuse Pod::Usage;\nuse Cwd;\n\n## Search for vtune in common places\nsub find_vtune() {\n  my $vtune = `which amplxe-cl 2> /dev/null`;\n  chomp $vtune;\n  if (not -e $vtune) {\n    my @vtune_vers = ('', '_2018');\n    foreach my $ver (@vtune_vers) {\n      my $base = \"/opt/intel/vtune_amplifier$ver/bin64/amplxe-cl\";\n      if (-e $base) {\n        return $base;\n      }\n    }\n  }\n  return $vtune;\n}\n\nsub extra_symbols_option() {\n  my $uname = `uname -r`;\n  chomp $uname;\n  my @sdirs = ();\n  my @candidates = (\"/usr/lib/debug/boot/$uname\");\n  foreach my $c (@candidates) {\n    if (-e $c) {\n      push(@sdirs, \"-search-dir all=$c\");\n    }\n  }\n  return join('', @sdirs);\n}\n\nsub report_dir_option($) {\n  my ($threads) = @_;\n  my $user = `whoami`;\n  chomp $user;\n  my $cwd = getcwd();\n  my @candidates = (\"/workspace/$user/tmp/vtune--r$threads\", \"/tmp/$user/vtune--r$threads\", \"$cwd/vtune--r$threads\");\n  foreach my $c (@candidates) {\n    if (system(\"mkdir -p $c\") == 0) {\n      return ($c, \"-result-dir=$c\");\n    }\n  }\n  return ('', '');\n}\n\nsub counter_option(@) {\n  return ['-collect-with runsa -knob event-config=' . 
join(',', @_), '-report hw-events'];\n}\n\n## returns analysis and report type option together\nsub analysis_option(@) {\n  my ($vtune, $a) = @_;\n  my @candidates = (\n    [\"$a\", \"hw-events\"],\n    [qw/memory-access hw-events/],\n    [qw/memory-consumption hw-events/],\n    [qw/general-exploration hw-events/],\n    [qw/hotspots hotspots/]\n  );\n  foreach my $pair (@candidates) {\n    my $try = `$vtune -collect @$pair[0] 2>&1`;\n    unless ($try =~ /Cannot find analysis type/) {\n      return (\"-collect @$pair[0]\", \"-report @$pair[1]\");\n    }\n  }\n  return ('','');\n}\n\nsub report_line($$$$) {\n  my ($cmd, $threads, $outfile, $maxsec) = @_;\n\n  system(\"echo \\\"THREADS\\t$threads\\\" >>$outfile.line.log\");\n\n  open(my $syspipe, \"ulimit -t $maxsec ; $cmd -format csv -csv-delimiter tab -group-by source-line |\") or die($!);\n  open(my $output, \">> $outfile.line.log\") or die($!);\n  my @header = ();\n  my @sums = ();\n  my $first_data_column = 0;\n  while (<$syspipe>) {\n    print $output $_;\n    chomp;\n    my @tokens = split /\\t/;\n    if (not @header) {\n      @header = @tokens;\n      for my $tok (@tokens) {\n        $first_data_column++;\n        last if ($tok =~ /Module$/);\n      }\n    } else {\n      for my $idx ($first_data_column .. @tokens - 1) {\n        $sums[$idx] += $tokens[$idx];\n      }\n    }\n  }\n  for my $idx ($first_data_column .. 
@header - 1) {\n    my $label = (split /:/, $header[$idx])[1];\n    print \"RUN: Variable $label = $sums[$idx]\\n\";\n  }\n  close $syspipe;\n  close $output;\n}\n\nsub report_function($$$$) {\n  my ($cmd, $threads, $outfile, $maxsec) = @_;\n\n  system(\"echo \\\"THREADS\\t$threads\\\" >>$outfile.function.log\");\n  system(\"ulimit -t $maxsec; $cmd -format csv -csv-delimiter tab -group-by function >> $outfile.function.log \");\n}\n\nmy @counters = ();\nmy $analyzeSystem = 1;\nmy $startPaused = 1;\nmy $help = 0;\nmy $threads = 0;\nmy $analysisType = 'memory-access'; # causes vtune 2016 to hang\n# my $analysisType = 'general-exploration';\nmy $reportType = 'hw-events';\nmy $reportTimeout = 100000;\nGetOptions(\n  't|threads=s'=>\\$threads,\n  'analysisType=s'=>\\$analysisType,\n  'reportType=s'=>\\$reportType,\n  'counter=s'=>\\@counters,\n  'startPaused!'=>\\$startPaused,\n  'analyzeSystem!'=>\\$analyzeSystem,\n  'reportTimeout=s'=>\\$reportTimeout,\n  'help'=>\\$help) or pod2usage(2);\npod2usage(-exitstatus=>0, -verbose=>2, -noperldoc=>1) if $help;\nmy $outfile = shift @ARGV;\nmy $cmdline = join(\" \", @ARGV);\n\nif ($threads) {\n  $cmdline = \"$cmdline -t $threads\";\n}\n\nmy $vtune = find_vtune;\nmy $symbol = extra_symbols_option();\nmy ($rdir, $rdiropt) = report_dir_option($threads);\nmy ($copt, $ropt);\nif (@counters) {\n  ($copt, $ropt) = counter_option(@counters);\n} else {\n  ($copt, $ropt) = analysis_option($vtune, $analysisType);\n}\n\ndie(\"cannot find way to run vtune\") unless($rdir and $copt and $ropt);\ndie(\"no command given\") unless($cmdline);\n\nprint \"RUN: CommandLine $cmdline\\n\";\n\nmy @collect = ();\npush @collect, $vtune, $symbol, $rdiropt, $copt;\npush(@collect, '-analyze-system') if ($analyzeSystem);\npush(@collect, '-start-paused') if ($startPaused);\npush @collect, '--', $cmdline;\n\nmy @report = ();\npush @report, $vtune, $rdiropt, $ropt;\n\nsystem(\"rm -rf $rdir\") == 0 or die($!);\nsystem(\"mkdir -p $rdir\") == 0 or die($!);\n\nmy 
$vtune_collect_cmd = join(' ', @collect);\nprint \"Running: '$vtune_collect_cmd'\\n\";\nsystem(\"$vtune_collect_cmd\") == 0 or die(\"vtune collection failed\");\nreport_function join(' ', @report), $threads, $outfile, $reportTimeout;\nreport_line join(' ', @report), $threads, $outfile, $reportTimeout;\n\n__END__\n\n=head1 NAME\n\nrun_vtune - run vtune and parse results to file\n\n=head1 SYNOPSIS\n\nrun_vtune [options] <outputbasename> <commandline>\n\n  Options:\n    -help                  brief help message\n    -analysisType=T        specify vtune analysis type manually\n    -reportType=T          specify vtune report type manually\n    -counter=C             specify hardware performance counters manually\n    -startPaused           start vtune paused (default)\n    -reportTimeout=SEC     timeout for generating report\n    -nostartPaused         start vtune running\n    -analyzeSystem         analyze entire system (default)\n    -noanalyzeSystem       analyze just command and child processes\n\n=head1 OPTIONS\n\n=over 8\n\n=item B<-analysisType>=T\n\nRun \"amplxe-cl -help collect\" to see which collection methods are available.\n\n=item B<-reportType>=T\n\nRun \"amplxe-cl -help report\" to see which reports are available.\n\n=item B<-counter>=C\n\nUse multiple options for multiple counters. Examples of counter names are:\nLONGEST_LAT_CACHE.MISS or OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_DRAM\n\n=back\n\n=head1 DESCRIPTION\n\nRun vtune and parse results to file\n\n=cut\n\n"
  },
  {
    "path": "scripts/sparse-matrices/diff_edgelists.py",
    "content": "#!/usr/bin/env python\n\n\"\"\"diff_edgelists - Display differences between two edgelist files.\"\"\"\n\nimport sys\nfrom math import isnan, isinf\nfrom itertools import izip_longest\n\n# If within this tolerance, difference is not an error. (Surely five\n# nines of accuracy is good enough for *relative* error.)\nCLOSE_ENOUGH = 1e-5\n\nclass DiffEdgelists(object):\n    \"\"\"Compare edgelist files and display differences.\"\"\"\n\n    def __init__(self, symmetric=True, quick=False, verbose=True):\n        self.symmetric = symmetric\n        self.quick = quick\n        self.verbose = verbose\n        self.reset()\n\n    def reset(self):\n        \"\"\"Reset all counters to zero.\"\"\"\n        # Counters\n        self.nfiles = 0    # Number of files read\n        self.nedges = 0    # Total number of edges\n        self.ndups = 0     # Edges duplicated within one input file\n        self.ninvalid = 0  # Edges that are invalid (NaN or infinite)\n        self.ndiffs = 0    # Edges with differences > CLOSE_ENOUGH\n        self.nexact = 0    # Edges with no measurable difference\n        self.nxzeros = 0   # Explicit zeros not in all input files\n        self.nmissing = 0  # Nonzero edges missing in some input files\n\n        # Maximums\n        self.maxabsdiff = 0     # Maximum absolute difference\n        self.maxreldiff = 0     # Maximum relative difference\n        self.maxqueue = 0       # Maximum size of queue\n\n        # Verify reset was correct\n        assert self.is_success()\n\n    def _diff_edge(self, index, edge, incremental=True):\n        \"\"\"Compare two edges and return True if they are the same, False if a\n        difference was found, or None if the edge was not found in\n        every file and an incremental diff was requested.\n\n        \"\"\"\n        if incremental and None in edge:\n            # Edge is not complete\n            return None\n        eset = set(edge)\n        esetlen = len(eset)\n        if esetlen == 1:\n     
       # Every version of edge is the same, no error\n            self.nexact += 1\n            return True\n        elif not incremental and None in eset:\n            if esetlen == 2:\n                if float(0) in eset:\n                    # An explicit zero that is not in every input file\n                    self.nxzeros += 1\n                    return True\n                else:\n                    self.nmissing += 1\n                    if self.verbose >= 2:\n                        print \"%d %d\" % index, edge\n                    return False\n            eset.remove(None)\n\n        # Compute differences\n        emin = min(eset)\n        emax = max(eset)\n        absdiff = emax-emin\n        reldiff = absdiff/max(abs(emax), absdiff)\n\n        # Update maximum\n        if absdiff > self.maxabsdiff:\n            self.maxabsdiff = absdiff\n        if reldiff > self.maxreldiff:\n            self.maxreldiff = reldiff\n        if reldiff > CLOSE_ENOUGH:\n            self.ndiffs += 1\n            if self.verbose >= 2:\n                print \"%d %d\" % index, edge, reldiff\n            return False\n        return True\n\n    def diff(self, fhs):\n        \"\"\"Compare the given files.\"\"\"\n        if self.quick and not self.is_success():\n            return\n\n        nfiles = len(fhs)\n        self.nfiles += nfiles\n\n        # Stack of edges not yet diffed\n        finished = set()\n        queue = {}\n        queuesize = 0\n\n        # Iterate over the lines in each file (like paste(1))\n        for lines in izip_longest(*fhs):\n            self.nedges += 1\n            # Parse line from each file\n            for fno, line in enumerate(lines):\n                if line is None:\n                    continue\n                # Parse line\n                line = line.split()\n                assert len(line) == 3\n                index = (int(line[0], 10), int(line[1], 10))\n                if self.symmetric and index[1] < index[0]:\n                  
  index = (index[1], index[0])\n                val = float(line[2])\n                if isnan(val) or isinf(val):\n                    self.ninvalid += 1\n                    if self.quick:\n                        return\n                # Check queue\n                if index in finished:\n                    # We already found this edge in every file -- Collision!\n                    self.ndups += 1\n                    if self.quick:\n                        return\n                    else:\n                        continue\n                elif index not in queue:\n                    # First time seeing this edge, create a new queue entry\n                    queue[index] = [(val if i == fno else None)\n                                    for i in xrange(nfiles)]\n                    # Track queue size\n                    queuesize += 1\n                    if queuesize > self.maxqueue:\n                        self.maxqueue = queuesize\n                elif queue[index][fno] is not None:\n                    # We already found this edge in this file -- Collision!\n                    self.ndups += 1\n                    if self.quick:\n                        return\n                    else:\n                        continue\n                else:\n                    # Store value in existing queue entry\n                    queue[index][fno] = val\n                    # Check queue entry for completeness/correctness\n                    result = self._diff_edge(index, queue[index])\n                    if result is not None:\n                        # Queue entry was complete, remove from queue\n                        del queue[index]\n                        queuesize -= 1\n                        if self.quick and not result:\n                            # We found an error while in quick mode\n                            return\n        # Check for differences in all remaining queue entries\n        for index, entry in queue.iteritems():\n         
   if entry is None:\n                continue\n            result = self._diff_edge(index, entry, incremental=False)\n            if self.quick and not result:\n                # We found an error while in quick mode\n                return\n        return\n\n    def is_success(self):\n        \"\"\"Return True iff no error conditions have been detected.\"\"\"\n        return self.ninvalid == self.ndiffs == self.nmissing == self.ndups == 0\n\n    def report(self):\n        \"\"\"Provide a report of the comparison, if verbosity allows. Returns\n        is_success().\n\n        \"\"\"\n        result = self.is_success()\n        if result and self.verbose < 1:\n            return result\n        nedges = self.nedges if self.nedges else 1\n        def pct(val):\n            \"\"\"Convert a counter to a percentage of edges.\"\"\"\n            return val*100./nedges\n\n        print (\"%d edges: %d differences (%.1f%%), %d missing (%.1f%%), \" +\n               \"%d duplicates\") % \\\n            (self.nedges, self.ndiffs, pct(self.ndiffs), self.nmissing,\n             pct(self.nmissing), self.ndups)\n        print \"  %d implicit zeros (%.1f%%)\" % \\\n            (self.nxzeros, pct(self.nxzeros))\n        print \"  %d files read\" % (self.nfiles)\n        print \"  Maximum absolute difference = %g\" % (self.maxabsdiff,)\n        print \"  Maximum relative difference = %g\" % (self.maxreldiff,)\n        print \"  Maximum queue size = %d (%3.1f%%)\" % \\\n            (self.maxqueue, pct(self.maxqueue))\n        print \"OK\" if result else \"Failed\"\n\n        return result\n\ndef main(argv):\n    \"\"\"Main entry point when run as a program.\"\"\"\n    import argparse\n    parser = argparse.ArgumentParser(\n        description=\"Display differences between edge list files\"\n    )\n    parser.add_argument('--asymmetric', dest='symmetric', action='store_false',\n                        help=\"Do not assume a symmetric matrix\")\n    parser.add_argument('--quick', 
action='store_true',\n                        help=\"Stop after first error\")\n    parser.add_argument('--quiet', '-q', action='count', default=0,\n                        help=\"Show less output\")\n    parser.add_argument('--verbose', '-v', action='count', default=1,\n                        help=\"Show more output\")\n    parser.add_argument('files', type=argparse.FileType('r'), nargs='*',\n                        help=\"Files to compare\", metavar='file')\n    parser.set_defaults(\n        files=[sys.stdin],\n    )\n    args = parser.parse_args(argv)\n    args.verbose -= args.quiet\n    differ = DiffEdgelists(symmetric=args.symmetric, verbose=args.verbose,\n                           quick=args.quick)\n    #import statprof\n    #statprof.start()\n    differ.diff(args.files)\n    for filehandle in args.files:\n        filehandle.close()\n    #statprof.stop()\n    #statprof.display()\n    return 0 if differ.report() else 1\n\nif __name__ == '__main__':\n    sys.exit(main(sys.argv[1:]))\n"
  },
  {
    "path": "scripts/sparse-matrices/iperm2order.pl",
    "content": "#!/usr/bin/perl\n#\n# iperm2order.pl - Convert an inverted permutation (as used by METIS)\n#                  to an ordering (as used by Cholesky).\n#\nuse warnings;\nuse strict;\n\nmy @order = ();\nmy $i = 0;\nwhile (<>) {\n    s/[\\r\\n]//g;\n    $order[$_] = $i;\n    $i++;\n}\nforeach ( @order ) {\n    print \"$_\\n\";\n}\n"
  },
  {
    "path": "scripts/sparse-matrices/mtx2edgelist.pl",
    "content": "#!/usr/bin/perl\n#\n# mtx2edgelist.pl - Convert a Matrix Market matrix to a 0-based edgelist\n#\nuse warnings;\nuse strict;\n\nwhile (<>) {\n    last unless m/^%/;\n}\nwhile (<>) {\n    next if m/^%/;\n    my ($i,$j,$x) = split /[ \\t]/;\n    print $i-1, ' ', $j-1, ' ', $x;\n}\n"
  },
  {
    "path": "scripts/sparse-matrices/reorder.pl",
    "content": "#!/usr/bin/perl\n#\n# reorder.pl - Reorder a matrix given an ordering.\n#              Supports both edgelist and Matrix Market input files.\n#\n\nuse warnings;\nuse strict;\n\ndie \"Usage: $0 [-1] <matrix> <ordering>\\n\" unless @ARGV == 2;\n\nmy $onebased = 0;\nif ( $ARGV[0] eq '-1' ) {\n    $onebased = 1;\n    shift @ARGV;\n    warn \"Using one-based input\"\n}\n\n# Load ordering\nopen F, '<', $ARGV[1] or die;\n# number of nonzeros in tim davis code\nmy @ordering = ();\nmy $i = 0;\nwhile (<F>) {\n    my $j = int($_);\n    die if $j < 0;\n    $ordering[$j] = $i;\n    $i++;\n}\nclose F;\n\nopen F, '<', $ARGV[0] or die;\nmy @rows = ();\nmy $matrixmarket = 0;\nwhile ( <F> ) {\n    if ( m/^[%#]/ ) {\n        if ( m/^%%MatrixMarket/ ) {\n            warn \"Unknown MatrixMarket format\" unless m/coordinate real/;\n            warn \"Assuming one-based\" if !$onebased;\n            $onebased = 1;\n            $matrixmarket = 1;\n        }\n        print;\n        next;\n    }\n    elsif ( $matrixmarket == 1 ) {\n        warn \"Skipping MatrixMarket size info\\n\";\n        $matrixmarket++;\n        print;\n        next;\n    }\n    s/[\\r\\n]//g;\n    my @row = split(/\\s/, $_);\n    $row[0] = $onebased ? $ordering[$row[0]-1] : $ordering[$row[0]];\n    $row[1] = $onebased ? $ordering[$row[1]-1] : $ordering[$row[1]];\n    (defined($row[0]) && defined($row[1])) or die;\n    if ( $onebased ) { $row[0]++; $row[1]++ }\n    print join(' ', @row), \"\\n\";\n}\nclose F;\n"
  },
  {
    "path": "scripts/tcp_starter.py",
    "content": "import socket\nimport threading\nimport SocketServer\nimport time\nimport sys\n\nreqs = []\nnum = int(sys.argv[1])\nfinished = 0\n\nclass ThreadedTCPRequestHandler(SocketServer.StreamRequestHandler):\n\n    def handle(self):\n        self.data = self.rfile.readline().strip()\n        cur_thread = threading.current_thread()\n        cport = int(self.data)\n        response = \"{0}: {1} | {2} | {3}\".format(cur_thread.name, self.data,num,self.client_address[0]);\n        mynum = len(reqs) / 2\n        reqs.append(self.client_address[0])\n        reqs.append(cport)\n        print \"Recv: {0} of {1}\\n\".format(len(reqs) / 2, num);\n        while len(reqs) != num*2:\n            time.sleep(0.0001)\n        rep = \",\".join(str(x) for x in reqs)\n        print rep\n        self.request.sendall(str(num) + \",\" + str(mynum) + \",\" + rep + \"\\n\")\n        global finished\n        finished = finished + 1\n        if finished == num:\n            self.server.shutdown()\n\nclass ThreadedTCPServer(SocketServer.ThreadingMixIn, SocketServer.TCPServer):\n    allow_reuse_address=True\n\nif __name__ == \"__main__\":\n    # Port 0 means to select an arbitrary unused port\n    HOST, PORT = \"\", 9999\n\n    \n    server = ThreadedTCPServer((HOST, PORT), ThreadedTCPRequestHandler)\n    ip, port = server.server_address\n\n    # Start a thread with the server -- that thread will then start one\n    # more thread for each request\n    server_thread = threading.Thread(target=server.serve_forever)\n    # Exit the server thread when the main thread terminates\n    server_thread.daemon = True\n    server_thread.start()\n    print \"Server loop running in thread: {0} host: {1} port: {2}\".format(server_thread.name, ip, port)\n\n    server.serve_forever(0.05)\n"
  },
  {
    "path": "scripts/visual/plot2Dmesh.m",
    "content": "poly = dlmread ('mesh-poly.csv', ',' , 1, 0);\n\ncoord = dlmread ('mesh-coord.csv', ',', 1, 0);\n\nxcoord = coord(:, 1);\n\nycoord = coord(:, 2);\n\nzcoord = coord(:, 3);\n\nt = poly (:, 1:4);\n\nt = t + 1;\n\n% triplot (t, x, y);\n\ntimestamps = poly(:, 4);\n\nnorm_ts = timestamps ./ max(timestamps); % scale relative to 1;\n\nnorm_freq = min (timestamps) ./ timestamps;\n\n\nsz = size(t)\n\n% colormap Gray;\ncolormap ('default');\n\ncbar = colorbar ('location', 'EastOutside');\nset( get(cbar, 'Ylabel'), 'String', 'Number of updates per element normalized to 1');\n\n\nhold on;\nfor i = 1 : sz(1)\n    x = xcoord(t(i,:));\n    y = ycoord(t(i,:));\n    z = zcoord(t(i,:));\n    \n    fill3 (x, y, z, norm_freq(i), 'LineWidth', 2);\n    \nend\n\nset (get(gca, 'Xlabel'), 'String', 'Xpos of elements');\nset (get(gca, 'Ylabel'), 'String', 'Ypos of elements');\ntitle ('Mesh elements colored by number of updates');\n\n"
  },
  {
    "path": "scripts/visual/plotGraph.R",
    "content": "#!/usr/bin/Rscript\n\nlibrary (rgl)\n\nargs = commandArgs (trailingOnly=T);\n\nnodes = read.csv (args[1], header=T);\nedges = read.csv (args[2], header=T);\n\nnodes = nodes[rev (rownames (nodes)), ]; # order by nodeId\nrownames (nodes) = 1:nrow(nodes);\n\nedges = edges[rev (rownames (edges)), ]; # reverse like wise\nrownames (edges) = 1:nrow(edges);\n\nlocalMin = nodes[nodes$inDeg == 0, ]\nlocalMax = nodes[nodes$outDeg == 0, ]\n\nsrcVec = edges$srcId + 1\ndstVec = edges$dstId + 1\n\narrowCoord = cbind (nodes$centerX[srcVec], nodes$centerY[srcVec], nodes$centerX[dstVec], nodes$centerY[dstVec]);\n\ncex=2.5;\npch=20\n\noutfile = \"adjgraph.pdf\"\npdf (outfile)\nplot (nodes$centerX, nodes$centerY, type=\"p\", pch=pch, col=\"transparent\", cex=cex, xlab=\"\", ylab=\"\");\n\narrows (arrowCoord[,1], arrowCoord[,2], arrowCoord[,3], arrowCoord[,4], length=0.1, angle=7, lwd=1);\n\npoints (nodes$centerX, nodes$centerY, pch=pch, col=\"blue\", cex=cex);\n\npoints (localMin$centerX, localMin$centerY, pch=pch, col=\"red\", cex=cex);\n\npoints (localMax$centerX, localMax$centerY, pch=pch, col=\"green\", cex=cex);\n\n\ndev.off ();\n"
  },
  {
    "path": "scripts/visual/plotGraph3d.R",
    "content": "#!/usr/bin/Rscript\n\nlibrary (rgl)\nlibrary (compositions)\n\nargs = commandArgs (trailingOnly=T);\n\nnodes = read.csv (args[1], header=T);\nedges = read.csv (args[2], header=T);\n\nnodes = nodes[rev (rownames (nodes)), ]; # order by nodeId\nrownames (nodes) = 1:nrow(nodes);\n\nedges = edges[rev (rownames (edges)), ]; # reverse like wise\nrownames (edges) = 1:nrow(edges);\n\n# scale time stamps\nnodes$timeStamp = 500 * nodes$timeStamp;\n\nlocalMin = nodes[nodes$inDeg == 0, ]\nlocalMax = nodes[nodes$outDeg == 0, ]\n\nsrcVec = edges$srcId + 1\ndstVec = edges$dstId + 1\n\n\nopen3d ();\n\n# segX = nodes$centerX[rbind (srcVec, dstVec)]\n# segY = nodes$centerY[rbind (srcVec, dstVec)]\n# segZ = nodes$timeStamp[rbind (srcVec, dstVec)]\n# segments3d (segX, segY, segZ);\n\narrowStart = cbind (nodes$centerX[srcVec], nodes$centerY[srcVec], nodes$timeStamp[srcVec]);\narrowEnd = cbind (nodes$centerX[dstVec], nodes$centerY[dstVec], nodes$timeStamp[dstVec]);\narrows3D (arrowStart, arrowEnd, angle=10, length=0.2, size=10, lwd=10);\n\n\nradius=0.15\n\nspheres3d (nodes$centerX, nodes$centerY, nodes$timeStamp, radius=radius, color=c(\"blue\"));\nspheres3d (localMin$centerX, localMin$centerY, localMin$timeStamp, radius=1.1*radius, color=c(\"red\"));\nspheres3d (localMax$centerX, localMax$centerY, localMax$timeStamp, radius=1.1*radius, color=c(\"green\"));\n\n# rgl.snapshot (\"adjgraph3d.png\")\n\n"
  },
  {
    "path": "scripts/visual/plotTimeStamps.m",
    "content": "data = dlmread('mesh.csv', ',', 1, 0);\nx = data (:,1);\ny = data (:,2);\nz = data (:,3);\n\ngridsize = 100;\n\nxlin = linspace (min(x), max(x), gridsize);\nylin = linspace (min(y), max(y), gridsize);\n\n[X, Y] = meshgrid (xlin, ylin);\n\nf = TriScatteredInterp (x,y,z, 'nearest');;\n\nZ = f(X,Y);\n\n% bar3 (Z);\n\nhidden off\n\nmesh(X,Y,Z);\n\nfigure()\n\nsurf(X,Y,Z, 'EdgeColor', 'None');\n\n% shading interp\n% lighting('phong')\n"
  },
  {
    "path": "scripts/visual/triplot.m",
    "content": "data = dlmread('mesh.csv', ',', 1, 0);\nx = data (:,1);\ny = data (:,2);\nz = data (:,3);\n\ngridsize = 50;\n\nxlin = linspace (min(x), max(x), gridsize);\nylin = linspace (min(y), max(y), gridsize);\n\n[X, Y] = meshgrid (xlin, ylin);\n\nf = TriScatteredInterp (x,y,z);;\n\nZ = f(X,Y);\n\nhidden off\n\nmesh(X,Y,Z);\n\nfigure()\n\nsurf(X,Y,Z);\n\ncolor('interp')\nlighting('phong')\n"
  },
  {
    "path": "scripts/vtune.sh",
    "content": "#!/bin/bash\n\n# README\n# run as:\n# threads=\"1 2 4 8 16\" vtune.sh program ARGS\n\n#threads=${threads:=\"1 5 10 15 20 25 30 35 40\"};\nTSTEP=\"5\";\ntmax=\"40\";\nthreads=${threads:=\"1 `seq $TSTEP $TSTEP $TMAX`\"};\n\nthreads=${threads:=\"1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\"};\nprefix=$(basename $1); # assuming arg 1 is the path to the program being run\n\nscriptsDir=$(dirname $0)\n\necho \"scriptsDir=$scriptsDir\"\n\nstamp=$(date +'%Y-%m-%d_%H:%M:%S')\n\nOUT_PREFIX=\"${prefix}_vtune_out_${stamp}\"\n\nfor t in $threads; do\n  # outfile=\"${OUT_PREFIX}${t}\"\n\n  cmd=\"$scriptsDir/run_vtune.pl -t $t -- ${OUT_PREFIX} $@\"\n\n  date;\n  echo \"Running: $cmd\"\n  $cmd\ndone 2>&1 | tee ${OUT_PREFIX}.run.log\n\nfunction_out=\"${OUT_PREFIX}.function.log\";\nline_out=\"${OUT_PREFIX}.line.log\";\n\nSUMM_PREFIX=\"${prefix}_vtune_summary\";\n\ncat $function_out | c++filt | perl $scriptsDir/report_vtune.pl --in function > ${SUMM_PREFIX}.function.${stamp}.csv\n\ncat $line_out | perl $scriptsDir/report_vtune.pl --in line > ${SUMM_PREFIX}.line.${stamp}.csv\n\n\n# for t in $threads; do\n  # function_out=\"${OUT_PREFIX}${t}.function.log\"\n  # cat $function_out;\n# done | perl $scriptsDir/report_vtune.pl --in function > vtune_summary.function.${stamp}.csv\n\n# for t in $threads; do\n  # line_out=\"${OUT_PREFIX}${t}.line.log\"\n  # cat $line_out;\n# done | perl $scriptsDir/report_vtune.pl --in line > vtune_summary.line.${stamp}.csv\n  \n\n\n"
  },
  {
    "path": "setup.py",
    "content": "import sys\nimport os\nimport setuptools\n\nfrom skbuild import setup\n\n# Require pytest-runner only when running tests\npytest_runner = (\n    [\"pytest-runner>=2.0,<3dev\"] if any(arg in sys.argv for arg in (\"pytest\", \"test\")) else []\n)\n\nsetup_requires = pytest_runner\n\n\ndef find_files(root, suffix):\n    \"\"\"\n    Find files ending with a given suffix in root and its subdirectories and\n    return their names relative to root.\n    \"\"\"\n    files = []\n    for dirpath, _, filenames in os.walk(root):\n        for f in filenames:\n            if not f.endswith(suffix):\n                continue\n            relpath = os.path.relpath(dirpath, root)\n            files.append(os.path.join(relpath, f))\n    return files\n\n\ndef package_setup():\n    with open(\"config/version.txt\") as f:\n        version = f.read().strip()\n\n    pxd_files = find_files(\"python/galois\", \".pxd\")\n\n    # \"pip wheel --build-option=...\" disables use of wheels for dependencies.\n    # In order to support passing build arguments directly, accept arguments\n    # via the environment.\n    cmake_args = os.environ.get(\"GALOIS_CMAKE_ARGS\", \"\").split()\n\n    # Following PEP-518, use pyproject.toml instead of setup(setup_requires=...) to\n    # specify setup dependencies.\n\n    setup(\n        version=version,\n        name=\"galois\",\n        packages=setuptools.find_packages(\"python\"),\n        package_data={\"galois\": pxd_files},\n        package_dir={\"\": \"python\"},\n        tests_require=[\"pytest\"],\n        setup_requires=setup_requires,\n        cmake_args=cmake_args,\n        cmake_source_dir=\"python\",\n    )\n\n\nif __name__ == \"__main__\":\n    package_setup()\n"
  },
  {
    "path": "tests/test_imports.py",
    "content": "def test_imports():\n    import galois.sssp\n    import galois.pagerank\n    import galois.bfs\n    import galois.connectedComponents\n"
  },
  {
    "path": "tools/CMakeLists.txt",
    "content": "add_subdirectory(graph-convert)\nadd_subdirectory(graph-remap)\nadd_subdirectory(graph-stats)\n\nif (GALOIS_ENABLE_DIST)\n  add_subdirectory(dist-graph-convert)\nendif()\n"
  },
  {
    "path": "tools/dist-graph-convert/CMakeLists.txt",
    "content": "add_executable(dist-graph-convert dist-graph-convert.cpp dist-graph-convert-helpers.cpp)\n\ntarget_link_libraries(dist-graph-convert PRIVATE galois_dist_async LLVMSupport)\n"
  },
  {
    "path": "tools/dist-graph-convert/dist-graph-convert-helpers.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"dist-graph-convert-helpers.h\"\n\nstd::vector<uint32_t> readRandomNodeMapping(const std::string& nodeMapBinary,\n                                            uint64_t nodeOffset,\n                                            uint64_t numToRead) {\n  MPI_File mb;\n  std::vector<char> fName(nodeMapBinary.begin(), nodeMapBinary.end());\n  fName.push_back('\\0');\n  MPICheck(MPI_File_open(MPI_COMM_WORLD, fName.data(), MPI_MODE_RDONLY,\n                         MPI_INFO_NULL, &mb));\n\n  uint64_t readPosition = nodeOffset * sizeof(uint32_t);\n  uint64_t numRead      = 0;\n  MPI_Status readStatus;\n  std::vector<uint32_t> node2NewNode(numToRead);\n\n  while (numToRead > 0) {\n    // File_read can only go up to the max int\n    uint64_t toLoad =\n        std::min(numToRead, 
(uint64_t)std::numeric_limits<int>::max());\n    MPI_File_read_at(mb, readPosition,\n                     ((char*)(node2NewNode.data())) +\n                         (numRead * sizeof(uint32_t)),\n                     toLoad, MPI_UINT32_T, &readStatus);\n\n    int nodesRead;\n    MPI_Get_count(&readStatus, MPI_UINT32_T, &nodesRead);\n    GALOIS_ASSERT(nodesRead != MPI_UNDEFINED, \"Nodes read is MPI_UNDEFINED\");\n    numToRead -= nodesRead;\n    numRead += nodesRead;\n    readPosition += nodesRead * sizeof(uint32_t);\n  }\n  MPICheck(MPI_File_close(&mb));\n\n  return node2NewNode;\n}\n\nvoid MPICheck(int errcode) {\n  if (errcode != MPI_SUCCESS) {\n    MPI_Abort(MPI_COMM_WORLD, errcode);\n  }\n}\n\nUint64Pair readV1GrHeader(const std::string& grFile, bool isVoid) {\n  MPI_File gr;\n  std::vector<char> fName(grFile.begin(), grFile.end());\n  fName.push_back('\\0');\n  MPICheck(MPI_File_open(MPI_COMM_WORLD, fName.data(), MPI_MODE_RDONLY,\n                         MPI_INFO_NULL, &gr));\n  uint64_t grHeader[4];\n  MPICheck(\n      MPI_File_read_at(gr, 0, grHeader, 4, MPI_UINT64_T, MPI_STATUS_IGNORE));\n  MPICheck(MPI_File_close(&gr));\n  GALOIS_ASSERT(grHeader[0] == 1, \"gr file must be version 1\");\n\n  if (!isVoid) {\n    GALOIS_ASSERT(grHeader[1] != 0, \"gr should have weights \"\n                                    \"(specified in header)\");\n  }\n\n  return Uint64Pair(grHeader[2], grHeader[3]);\n}\n\nstd::vector<Uint64Pair> getHostToNodeMapping(uint64_t numHosts,\n                                             uint64_t totalNumNodes) {\n  GALOIS_ASSERT((totalNumNodes != 0), \"host2node mapping needs numNodes\");\n\n  std::vector<Uint64Pair> hostToNodes;\n\n  for (unsigned i = 0; i < numHosts; i++) {\n    hostToNodes.emplace_back(\n        galois::block_range((uint64_t)0, (uint64_t)totalNumNodes, i, numHosts));\n  }\n\n  return hostToNodes;\n}\n\nuint32_t findOwner(const uint64_t gID,\n                   const std::vector<Uint64Pair>& ownerMapping) {\n  uint32_t lb 
= 0;\n  uint32_t ub = ownerMapping.size();\n\n  while (lb < ub) {\n    uint64_t mid      = lb + (ub - lb) / 2;\n    auto& currentPair = ownerMapping[mid];\n\n    if (gID >= currentPair.first && gID < currentPair.second) {\n      return mid;\n    } else if (gID < currentPair.first) {\n      // MOVE DOWN\n      ub = mid;\n    } else if (gID >= currentPair.second) { // gid >= currentPair.second\n      // MOVE UP\n      lb = mid + 1;\n    } else {\n      GALOIS_DIE(\"unreachable\");\n    }\n  }\n\n  // it should find something above...\n  return -1;\n}\n\nuint64_t getFileSize(std::ifstream& openFile) {\n  openFile.seekg(0, std::ios_base::end);\n  return openFile.tellg();\n}\n\nUint64Pair determineByteRange(std::ifstream& edgeListFile, uint64_t fileSize) {\n  auto& net              = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID        = net.ID;\n  uint64_t totalNumHosts = net.Num;\n\n  uint64_t initialStart;\n  uint64_t initialEnd;\n  std::tie(initialStart, initialEnd) = galois::block_range(\n      (uint64_t)0, (uint64_t)fileSize, hostID, totalNumHosts);\n\n  bool startGood = false;\n  if (initialStart != 0) {\n    // good starting point if the prev char was a new line (i.e. this start\n    // location is the beginning of a line)\n    // TODO factor this out\n    edgeListFile.seekg(initialStart - 1);\n    char testChar = edgeListFile.get();\n    if (testChar == '\\n') {\n      startGood = true;\n    }\n  } else {\n    // start is 0; perfect starting point, need no adjustment\n    startGood = true;\n  }\n\n  bool endGood = false;\n  if (initialEnd != fileSize && initialEnd != 0) {\n    // good end point if the prev char was a new line (i.e. 
this end\n    // location is the beginning of a line; recall non-inclusive)\n    // TODO factor this out\n    edgeListFile.seekg(initialEnd - 1);\n    char testChar = edgeListFile.get();\n    if (testChar == '\\n') {\n      endGood = true;\n    }\n  } else {\n    endGood = true;\n  }\n\n  uint64_t finalStart = initialStart;\n  if (!startGood) {\n    // find next new line\n    // TODO factor this out\n    edgeListFile.seekg(initialStart);\n    std::string dummy;\n    std::getline(edgeListFile, dummy);\n    finalStart = edgeListFile.tellg();\n  }\n\n  uint64_t finalEnd = initialEnd;\n  if (!endGood) {\n    // find next new line\n    // TODO factor out\n    edgeListFile.seekg(initialEnd);\n    std::string dummy;\n    std::getline(edgeListFile, dummy);\n    finalEnd = edgeListFile.tellg();\n  }\n\n  return Uint64Pair(finalStart, finalEnd);\n}\n\nuint64_t accumulateValue(uint64_t localEdgeCount) {\n  galois::DGAccumulator<uint64_t> accumulator;\n  accumulator.reset();\n  accumulator += localEdgeCount;\n  return accumulator.reduce();\n}\n\nuint64_t findIndexPrefixSum(uint64_t targetWeight, uint64_t lb, uint64_t ub,\n                            const std::vector<uint64_t>& prefixSum) {\n  while (lb < ub) {\n    uint64_t mid = lb + (ub - lb) / 2;\n    uint64_t numUnits;\n\n    if (mid != 0) {\n      numUnits = prefixSum[mid - 1];\n    } else {\n      numUnits = 0;\n    }\n\n    if (numUnits <= targetWeight) {\n      lb = mid + 1;\n    } else {\n      ub = mid;\n    }\n  }\n\n  return lb;\n}\n\nUint64Pair binSearchDivision(uint64_t id, uint64_t totalID,\n                             const std::vector<uint64_t>& prefixSum) {\n  uint64_t totalWeight        = prefixSum.back();\n  uint64_t weightPerPartition = (totalWeight + totalID - 1) / totalID;\n  uint64_t numThingsToSplit   = prefixSum.size();\n\n  uint64_t lower;\n  if (id != 0) {\n    lower = findIndexPrefixSum(id * weightPerPartition, 0, numThingsToSplit,\n                               prefixSum);\n  } else {\n    
lower = 0;\n  }\n  uint64_t upper = findIndexPrefixSum((id + 1) * weightPerPartition, lower,\n                                      numThingsToSplit, prefixSum);\n\n  return Uint64Pair(lower, upper);\n}\n\nvoid findUniqueChunks(galois::DynamicBitSet& uniqueNodeBitset,\n                      const std::vector<Uint64Pair>& chunkToNode,\n                      galois::DynamicBitSet& uniqueChunkBitset) {\n  uint64_t hostID = galois::runtime::getSystemNetworkInterface().ID;\n  std::cout << \"[\" << hostID << \"] Finding unique chunks\\n\";\n  uniqueChunkBitset.reset();\n\n  galois::do_all(\n      galois::iterate((size_t)0, uniqueNodeBitset.size()),\n      [&](auto nodeIndex) {\n        if (uniqueNodeBitset.test(nodeIndex)) {\n          uniqueChunkBitset.set(findOwner(nodeIndex, chunkToNode));\n        }\n      },\n      galois::loopname(\"FindUniqueChunks\"));\n\n  freeVector(uniqueNodeBitset.get_vec());\n\n  std::cout << \"[\" << hostID << \"] Unique chunks found\\n\";\n}\n\nvoid sendAndReceiveEdgeChunkCounts(std::vector<uint64_t>& chunkCounts) {\n  auto& net              = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID        = net.ID;\n  uint64_t totalNumHosts = net.Num;\n\n  std::cout << \"[\" << hostID << \"] Sending edge chunk counts\\n\";\n  // send off my chunk count vector to others so all hosts can have the\n  // same count of edges in a chunk\n  for (unsigned h = 0; h < totalNumHosts; h++) {\n    if (h == hostID)\n      continue;\n    galois::runtime::SendBuffer b;\n    galois::runtime::gSerialize(b, chunkCounts);\n    net.sendTagged(h, galois::runtime::evilPhase, b);\n  }\n\n  // receive chunk counts\n  std::vector<uint64_t> recvChunkCounts;\n\n  std::cout << \"[\" << hostID << \"] Receiving edge chunk counts\\n\";\n  for (unsigned h = 0; h < totalNumHosts; h++) {\n    if (h == hostID)\n      continue;\n    decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer;\n\n    do {\n      rBuffer = 
net.recieveTagged(galois::runtime::evilPhase, nullptr);\n    } while (!rBuffer);\n\n    galois::runtime::gDeserialize(rBuffer->second, recvChunkCounts);\n\n    for (unsigned i = 0; i < chunkCounts.size(); i++) {\n      chunkCounts[i] += recvChunkCounts[i];\n    }\n  }\n  galois::runtime::evilPhase++;\n}\n\nstd::vector<Uint64Pair>\ngetChunkToHostMapping(const std::vector<uint64_t>& chunkCountsPrefixSum,\n                      const std::vector<Uint64Pair>& chunkToNode) {\n  std::vector<Uint64Pair> finalMapping;\n\n  uint64_t hostID        = galois::runtime::getSystemNetworkInterface().ID;\n  uint64_t totalNumHosts = galois::runtime::getSystemNetworkInterface().Num;\n  for (uint64_t h = 0; h < totalNumHosts; h++) {\n    uint64_t lowerChunk;\n    uint64_t upperChunk;\n\n    // get the lower/upper chunk assigned to host h\n    std::tie(lowerChunk, upperChunk) =\n        binSearchDivision(h, totalNumHosts, chunkCountsPrefixSum);\n\n    uint64_t lowerNode = chunkToNode[lowerChunk].first;\n    uint64_t upperNode = chunkToNode[upperChunk].first;\n\n    if (hostID == 0) {\n      uint64_t edgeCount;\n      if (lowerChunk == upperChunk) {\n        edgeCount = 0;\n      } else if (lowerChunk == 0) {\n        edgeCount = chunkCountsPrefixSum[upperChunk - 1];\n      } else {\n        edgeCount = chunkCountsPrefixSum[upperChunk - 1] -\n                    chunkCountsPrefixSum[lowerChunk - 1];\n      }\n      std::cout << \"Host \" << h << \" gets nodes \" << lowerNode << \" to \"\n                << upperNode << \" (count \" << (upperNode - lowerNode)\n                << \"), with \" << edgeCount << \" edges\\n\";\n    }\n\n    finalMapping.emplace_back(Uint64Pair(lowerNode, upperNode));\n  }\n\n  return finalMapping;\n}\n\nDoubleUint64Pair getNodesToReadFromGr(const std::string& inputGr) {\n  uint32_t hostID        = galois::runtime::getSystemNetworkInterface().ID;\n  uint32_t totalNumHosts = galois::runtime::getSystemNetworkInterface().Num;\n\n  galois::graphs::OfflineGraph 
offlineGr(inputGr);\n  auto nodeAndEdgeRange = offlineGr.divideByNode(0, 1, hostID, totalNumHosts);\n  auto& nodeRange       = nodeAndEdgeRange.first;\n  auto& edgeRange       = nodeAndEdgeRange.second;\n  Uint64Pair nodePair(*nodeRange.first, *nodeRange.second);\n  Uint64Pair edgePair(*edgeRange.first, *edgeRange.second);\n  return DoubleUint64Pair(nodePair, edgePair);\n}\n\nstd::vector<uint32_t>\nloadCleanEdgesFromBufferedGraph(const std::string& inputFile,\n                                Uint64Pair nodesToRead, Uint64Pair edgesToRead,\n                                uint64_t totalNumNodes, uint64_t totalNumEdges,\n                                bool keepSelfLoops) {\n  galois::graphs::BufferedGraph<void> bufGraph;\n  bufGraph.loadPartialGraph(inputFile, nodesToRead.first, nodesToRead.second,\n                            edgesToRead.first, edgesToRead.second,\n                            totalNumNodes, totalNumEdges);\n  size_t numNodesToRead = nodesToRead.second - nodesToRead.first;\n  std::vector<std::set<uint32_t>> nonDupSets(numNodesToRead);\n\n  // insert edge destinations of each node into a set (i.e. 
no duplicates)\n  galois::do_all(\n      galois::iterate(nodesToRead.first, nodesToRead.second),\n      [&](uint32_t gID) {\n        size_t vectorIndex = gID - nodesToRead.first;\n\n        uint64_t edgeBegin = *bufGraph.edgeBegin(gID);\n        uint64_t edgeEnd   = *bufGraph.edgeEnd(gID);\n\n        for (uint64_t i = edgeBegin; i < edgeEnd; i++) {\n          uint32_t edgeDest = bufGraph.edgeDestination(i);\n          // checking if this is a self edge\n          if ((edgeDest != gID) || keepSelfLoops) {\n            nonDupSets[vectorIndex].insert(edgeDest);\n          }\n        }\n      },\n      galois::steal(), galois::loopname(\"FindCleanEdges\"));\n\n  // get total num edges remaining\n  uint64_t edgesRemaining = 0;\n  for (unsigned i = 0; i < numNodesToRead; i++) {\n    edgesRemaining += nonDupSets[i].size();\n  }\n\n  std::vector<uint32_t> edgeData(edgesRemaining * 2);\n\n  uint64_t counter = 0;\n\n  // (serially) create the edge vector; TODO it's possible to parallelize\n  // this loop using a prefix sum of edges....; worth doing?\n  for (unsigned i = 0; i < numNodesToRead; i++) {\n    std::set<uint32_t> currentSet = nonDupSets[i];\n    uint32_t currentGID           = i + nodesToRead.first;\n\n    for (auto dest : currentSet) {\n      edgeData[counter * 2]     = currentGID; // src\n      edgeData[counter * 2 + 1] = dest;\n      counter++;\n    }\n  }\n\n  return edgeData;\n}\n\nuint64_t receiveEdgeCounts() {\n  auto& net              = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID        = net.ID;\n  uint64_t totalNumHosts = net.Num;\n\n  std::cout << \"[\" << hostID << \"] Receiving edge counts\\n\";\n\n  uint64_t edgesToReceive = 0;\n\n  // receive\n  for (unsigned h = 0; h < totalNumHosts; h++) {\n    if (h == hostID)\n      continue;\n    decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer;\n\n    uint64_t recvCount;\n\n    do {\n      rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n    } while 
(!rBuffer);\n    galois::runtime::gDeserialize(rBuffer->second, recvCount);\n\n    edgesToReceive += recvCount;\n  }\n\n  galois::runtime::evilPhase++;\n\n  return edgesToReceive;\n}\n\nvoid receiveAssignedEdges(std::atomic<uint64_t>& edgesToReceive,\n                          const std::vector<Uint64Pair>& hostToNodes,\n                          std::vector<std::vector<uint32_t>>& localSrcToDest,\n                          std::vector<std::vector<uint32_t>>& localSrcToData,\n                          std::vector<std::mutex>& nodeLocks) {\n  auto& net       = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID = net.ID;\n\n  std::cout << \"[\" << hostID << \"] Going to receive assigned edges\\n\";\n\n  // receive edges\n  galois::on_each(\n      [&](unsigned, unsigned) {\n        std::vector<uint32_t> recvVector;\n        std::vector<uint32_t> recvDataVector;\n\n        while (edgesToReceive) {\n          decltype(\n              net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer;\n          rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n\n          // the buffer will have edge data as well if localsrctodata is\n          // nonempty (it will be nonempty if initialized to non-empty by the\n          // send function, and the send function only initializes it if it is\n          // going to send edge data\n          if (rBuffer) {\n            auto& receiveBuffer = rBuffer->second;\n            while (receiveBuffer.r_size() > 0) {\n              uint64_t src;\n              if (localSrcToData.empty()) {\n                // receive only dest data\n                galois::runtime::gDeserialize(receiveBuffer, src, recvVector);\n              } else {\n                // receive edge data as well\n                galois::runtime::gDeserialize(receiveBuffer, src, recvVector,\n                                              recvDataVector);\n              }\n\n              edgesToReceive -= recvVector.size();\n              
GALOIS_ASSERT(findOwner(src, hostToNodes) == hostID);\n              uint32_t localID = src - hostToNodes[hostID].first;\n\n              nodeLocks[localID].lock();\n              if (localSrcToData.empty()) {\n                // deal with only destinations\n                for (unsigned i = 0; i < recvVector.size(); i++) {\n                  localSrcToDest[localID].emplace_back(recvVector[i]);\n                }\n              } else {\n                // deal with destinations and data\n                for (unsigned i = 0; i < recvVector.size(); i++) {\n                  localSrcToDest[localID].emplace_back(recvVector[i]);\n                  localSrcToData[localID].emplace_back(recvDataVector[i]);\n                }\n              }\n              nodeLocks[localID].unlock();\n            }\n          }\n        }\n      },\n      galois::loopname(\"EdgeReceiving\"));\n  galois::runtime::evilPhase++;\n\n  std::cout << \"[\" << hostID << \"] Receive assigned edges finished\\n\";\n}\n\nstd::vector<uint64_t> getEdgesPerHost(uint64_t localAssignedEdges) {\n  auto& net              = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID        = net.ID;\n  uint64_t totalNumHosts = net.Num;\n\n  std::cout << \"[\" << hostID\n            << \"] Informing other hosts about number of edges\\n\";\n\n  std::vector<uint64_t> edgesPerHost(totalNumHosts);\n\n  for (unsigned h = 0; h < totalNumHosts; h++) {\n    if (h == hostID)\n      continue;\n    galois::runtime::SendBuffer b;\n    galois::runtime::gSerialize(b, localAssignedEdges);\n    net.sendTagged(h, galois::runtime::evilPhase, b);\n  }\n\n  // receive\n  for (unsigned h = 0; h < totalNumHosts; h++) {\n    if (h == hostID) {\n      edgesPerHost[h] = localAssignedEdges;\n      continue;\n    }\n\n    decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer;\n    uint64_t otherAssignedEdges;\n    do {\n      rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr);\n    } while 
(!rBuffer);\n    galois::runtime::gDeserialize(rBuffer->second, otherAssignedEdges);\n\n    edgesPerHost[rBuffer->first] = otherAssignedEdges;\n  }\n  galois::runtime::evilPhase++;\n\n  return edgesPerHost;\n}\n\nstd::vector<uint32_t>\nflattenVectors(std::vector<std::vector<uint32_t>>& vectorOfVectors) {\n  std::vector<uint32_t> finalVector;\n  uint64_t vectorsToFlatten = vectorOfVectors.size();\n\n  for (unsigned i = 0; i < vectorsToFlatten; i++) {\n    auto& curVector = vectorOfVectors[i];\n    finalVector.insert(finalVector.end(), curVector.begin(), curVector.end());\n    // free the memory up\n    freeVector(vectorOfVectors[i]);\n  }\n\n  return finalVector;\n}\n\nvoid writeGrHeader(MPI_File& gr, uint64_t version, uint64_t sizeOfEdge,\n                   uint64_t totalNumNodes, uint64_t totalEdgeCount) {\n  // I won't check status here because there should be no reason why\n  // writing 8 bytes per write would fail.... (I hope at least)\n  MPICheck(\n      MPI_File_write_at(gr, 0, &version, 1, MPI_UINT64_T, MPI_STATUS_IGNORE));\n  MPICheck(MPI_File_write_at(gr, sizeof(uint64_t), &sizeOfEdge, 1, MPI_UINT64_T,\n                             MPI_STATUS_IGNORE));\n  MPICheck(MPI_File_write_at(gr, sizeof(uint64_t) * 2, &totalNumNodes, 1,\n                             MPI_UINT64_T, MPI_STATUS_IGNORE));\n  MPICheck(MPI_File_write_at(gr, sizeof(uint64_t) * 3, &totalEdgeCount, 1,\n                             MPI_UINT64_T, MPI_STATUS_IGNORE));\n}\n\nvoid writeNodeIndexData(MPI_File& gr, uint64_t nodesToWrite,\n                        uint64_t nodeIndexOffset,\n                        const std::vector<uint64_t>& edgePrefixSum) {\n  MPI_Status writeStatus;\n  uint64_t totalWritten = 0;\n  while (nodesToWrite != 0) {\n    uint64_t toWrite =\n        std::min(nodesToWrite, (uint64_t)std::numeric_limits<int>::max());\n\n    MPICheck(MPI_File_write_at(gr, nodeIndexOffset,\n                               ((uint64_t*)edgePrefixSum.data()) + totalWritten,\n                       
        toWrite, MPI_UINT64_T, &writeStatus));\n\n    int itemsWritten;\n    MPI_Get_count(&writeStatus, MPI_UINT64_T, &itemsWritten);\n    GALOIS_ASSERT(itemsWritten != MPI_UNDEFINED,\n                  \"itemsWritten is MPI_UNDEFINED\");\n    nodesToWrite -= itemsWritten;\n    totalWritten += itemsWritten;\n    nodeIndexOffset += itemsWritten * sizeof(uint64_t);\n  }\n}\n\n// vector of vectors version\nvoid writeEdgeDestData(MPI_File& gr, uint64_t edgeDestOffset,\n                       std::vector<std::vector<uint32_t>>& localSrcToDest) {\n  MPI_Status writeStatus;\n\n  for (unsigned i = 0; i < localSrcToDest.size(); i++) {\n    std::vector<uint32_t> currentDests = localSrcToDest[i];\n    uint64_t numToWrite                = currentDests.size();\n    uint64_t totalWritten              = 0;\n\n    while (numToWrite != 0) {\n      uint64_t toWrite =\n          std::min(numToWrite, (uint64_t)std::numeric_limits<int>::max());\n\n      MPICheck(MPI_File_write_at(\n          gr, edgeDestOffset, ((uint32_t*)currentDests.data()) + totalWritten,\n          toWrite, MPI_UINT32_T, &writeStatus));\n\n      int itemsWritten;\n      MPI_Get_count(&writeStatus, MPI_UINT32_T, &itemsWritten);\n      GALOIS_ASSERT(itemsWritten != MPI_UNDEFINED,\n                    \"itemsWritten is MPI_UNDEFINED\");\n      numToWrite -= itemsWritten;\n      totalWritten += itemsWritten;\n      edgeDestOffset += sizeof(uint32_t) * itemsWritten;\n    }\n  }\n}\n\n// 1 vector version (MUCH FASTER, USE WHEN POSSIBLE)\nvoid writeEdgeDestData(MPI_File& gr, uint64_t edgeDestOffset,\n                       std::vector<uint32_t>& destVector) {\n  MPI_Status writeStatus;\n  uint64_t numToWrite   = destVector.size();\n  uint64_t totalWritten = 0;\n\n  while (numToWrite != 0) {\n    uint64_t toWrite =\n        std::min(numToWrite, (uint64_t)std::numeric_limits<int>::max());\n\n    MPICheck(MPI_File_write_at(gr, edgeDestOffset,\n                               ((uint32_t*)destVector.data()) + totalWritten,\n  
                             toWrite, MPI_UINT32_T, &writeStatus));\n\n    int itemsWritten;\n    MPI_Get_count(&writeStatus, MPI_UINT32_T, &itemsWritten);\n    GALOIS_ASSERT(itemsWritten != MPI_UNDEFINED,\n                  \"itemsWritten is MPI_UNDEFINED\");\n    numToWrite -= itemsWritten;\n    totalWritten += itemsWritten;\n    edgeDestOffset += sizeof(uint32_t) * itemsWritten;\n  }\n}\n\nvoid writeEdgeDataData(MPI_File& gr, uint64_t edgeDataOffset,\n                       const std::vector<uint32_t>& edgeDataToWrite) {\n  MPI_Status writeStatus;\n  uint64_t numToWrite = edgeDataToWrite.size();\n  uint64_t numWritten = 0;\n\n  while (numToWrite != 0) {\n    uint64_t toWrite =\n        std::min(numToWrite, (uint64_t)std::numeric_limits<int>::max());\n\n    MPICheck(MPI_File_write_at(gr, edgeDataOffset,\n                               ((uint32_t*)edgeDataToWrite.data()) + numWritten,\n                               toWrite, MPI_UINT32_T, &writeStatus));\n    int itemsWritten;\n    MPI_Get_count(&writeStatus, MPI_UINT32_T, &itemsWritten);\n    GALOIS_ASSERT(itemsWritten != MPI_UNDEFINED,\n                  \"itemsWritten is MPI_UNDEFINED\");\n    numToWrite -= itemsWritten;\n    numWritten += itemsWritten;\n    edgeDataOffset += itemsWritten * sizeof(uint32_t);\n  }\n}\n\nvoid writeToGr(const std::string& outputFile, uint64_t totalNumNodes,\n               uint64_t totalNumEdges, uint64_t localNumNodes,\n               uint64_t localNodeBegin, uint64_t globalEdgeOffset,\n               std::vector<std::vector<uint32_t>>& localSrcToDest,\n               std::vector<std::vector<uint32_t>>& localSrcToData) {\n  uint64_t hostID = galois::runtime::getSystemNetworkInterface().ID;\n\n  std::cout << \"[\" << hostID << \"] Beginning write to file\\n\";\n  MPI_File newGR;\n\n  std::vector<char> fName(outputFile.begin(), outputFile.end());\n  fName.push_back('\\0');\n  MPICheck(MPI_File_open(MPI_COMM_WORLD, fName.data(),\n                         MPI_MODE_CREATE | 
MPI_MODE_WRONLY, MPI_INFO_NULL,\n                         &newGR));\n\n  if (hostID == 0) {\n    if (localSrcToData.empty()) {\n      writeGrHeader(newGR, 1, 0, totalNumNodes, totalNumEdges);\n    } else {\n      // edge data size hard set to 4 if there is data to write (uint32_t)\n      writeGrHeader(newGR, 1, 4, totalNumNodes, totalNumEdges);\n    }\n  }\n\n  if (localNumNodes > 0) {\n    // prepare edge prefix sum for file writing\n    std::vector<uint64_t> edgePrefixSum(localNumNodes);\n    edgePrefixSum[0] = localSrcToDest[0].size();\n    for (unsigned i = 1; i < localNumNodes; i++) {\n      edgePrefixSum[i] = (edgePrefixSum[i - 1] + localSrcToDest[i].size());\n    }\n\n    // account for edge offset\n    for (unsigned i = 0; i < localNumNodes; i++) {\n      edgePrefixSum[i] = edgePrefixSum[i] + globalEdgeOffset;\n    }\n\n    // begin file writing\n    uint64_t headerSize      = sizeof(uint64_t) * 4;\n    uint64_t nodeIndexOffset = headerSize + (localNodeBegin * sizeof(uint64_t));\n    std::cout << \"[\" << hostID << \"] Write node index data\\n\";\n    writeNodeIndexData(newGR, localNumNodes, nodeIndexOffset, edgePrefixSum);\n    freeVector(edgePrefixSum);\n\n    uint64_t edgeDestOffset = headerSize + (totalNumNodes * sizeof(uint64_t)) +\n                              globalEdgeOffset * sizeof(uint32_t);\n    std::cout << \"[\" << hostID << \"] Write edge dest data\\n\";\n    std::vector<uint32_t> destVector = flattenVectors(localSrcToDest);\n    freeVector(localSrcToDest);\n    writeEdgeDestData(newGR, edgeDestOffset, destVector);\n\n    // edge data writing if necessary\n    if (!localSrcToData.empty()) {\n      uint64_t edgeDataOffset = getOffsetToLocalEdgeData(\n          totalNumNodes, totalNumEdges, globalEdgeOffset);\n      std::cout << \"[\" << hostID << \"] Write edge data data\\n\";\n      std::vector<uint32_t> dataVector = flattenVectors(localSrcToData);\n      freeVector(localSrcToData);\n      writeEdgeDataData(newGR, edgeDataOffset, 
dataVector);\n    }\n\n    std::cout << \"[\" << hostID << \"] Write to file done\\n\";\n  }\n\n  MPICheck(MPI_File_close(&newGR));\n}\n\nvoid writeToLux(const std::string& outputFile, uint64_t totalNumNodes,\n                uint64_t totalNumEdges, uint64_t localNumNodes,\n                uint64_t localNodeBegin, uint64_t globalEdgeOffset,\n                std::vector<std::vector<uint32_t>>& localSrcToDest,\n                std::vector<std::vector<uint32_t>>& localSrcToData) {\n  uint64_t hostID = galois::runtime::getSystemNetworkInterface().ID;\n\n  std::cout << \"[\" << hostID << \"] Beginning write to file\\n\";\n  MPI_File newGR;\n  std::vector<char> fName(outputFile.begin(), outputFile.end());\n  fName.push_back('\\0');\n  MPICheck(MPI_File_open(MPI_COMM_WORLD, fName.data(),\n                         MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL,\n                         &newGR));\n\n  // Lux header\n  if (hostID == 0) {\n    // cast down the node data\n    uint32_t castDown = totalNumNodes;\n    MPICheck(MPI_File_write_at(newGR, 0, &castDown, 1, MPI_UINT32_T,\n                               MPI_STATUS_IGNORE));\n    MPICheck(MPI_File_write_at(newGR, sizeof(uint32_t), &totalNumEdges, 1,\n                               MPI_UINT64_T, MPI_STATUS_IGNORE));\n  }\n\n  if (localNumNodes > 0) {\n    // prepare edge prefix sum for file writing\n    std::vector<uint64_t> edgePrefixSum(localNumNodes);\n    edgePrefixSum[0] = localSrcToDest[0].size();\n    for (unsigned i = 1; i < localNumNodes; i++) {\n      edgePrefixSum[i] = (edgePrefixSum[i - 1] + localSrcToDest[i].size());\n    }\n\n    // account for edge offset\n    for (unsigned i = 0; i < localNumNodes; i++) {\n      edgePrefixSum[i] = edgePrefixSum[i] + globalEdgeOffset;\n    }\n\n    // begin file writing\n    // Lux header differs from Galois header\n    uint64_t headerSize      = sizeof(uint32_t) + sizeof(uint64_t);\n    uint64_t nodeIndexOffset = headerSize + (localNodeBegin * sizeof(uint64_t));\n\n    
std::cout << \"[\" << hostID << \"] Write node index data\\n\";\n    writeNodeIndexData(newGR, localNumNodes, nodeIndexOffset, edgePrefixSum);\n    freeVector(edgePrefixSum);\n\n    uint64_t edgeDestOffset = headerSize + (totalNumNodes * sizeof(uint64_t)) +\n                              globalEdgeOffset * sizeof(uint32_t);\n    std::cout << \"[\" << hostID << \"] Write edge dest data\\n\";\n    std::vector<uint32_t> destVector = flattenVectors(localSrcToDest);\n    freeVector(localSrcToDest);\n    writeEdgeDestData(newGR, edgeDestOffset, destVector);\n\n    // edge data writing if necessary\n    if (!localSrcToData.empty()) {\n      uint64_t byteOffsetToEdgeData = sizeof(uint32_t) + sizeof(uint64_t) +\n                                      (totalNumNodes * sizeof(uint64_t)) +\n                                      (totalNumEdges * sizeof(uint32_t));\n      byteOffsetToEdgeData += globalEdgeOffset * sizeof(uint32_t);\n      // NO PADDING\n      uint64_t edgeDataOffset = byteOffsetToEdgeData;\n\n      std::cout << \"[\" << hostID << \"] Write edge data data\\n\";\n      std::vector<uint32_t> dataVector = flattenVectors(localSrcToData);\n      freeVector(localSrcToData);\n      writeEdgeDataData(newGR, edgeDataOffset, dataVector);\n    }\n\n    std::cout << \"[\" << hostID << \"] Write to file done\\n\";\n  }\n\n  MPICheck(MPI_File_close(&newGR));\n}\n\nstd::vector<uint32_t> generateRandomNumbers(uint64_t count, uint64_t seed,\n                                            uint64_t lower, uint64_t upper) {\n  std::minstd_rand0 rGenerator;\n  rGenerator.seed(seed);\n  std::uniform_int_distribution<uint32_t> rDist(lower, upper);\n\n  std::vector<uint32_t> randomNumbers;\n  randomNumbers.reserve(count);\n  for (unsigned i = 0; i < count; i++) {\n    randomNumbers.emplace_back(rDist(rGenerator));\n  }\n\n  return randomNumbers;\n}\n\nuint64_t getOffsetToLocalEdgeData(uint64_t totalNumNodes,\n                                  uint64_t totalNumEdges,\n                        
          uint64_t localEdgeBegin) {\n  uint64_t byteOffsetToEdgeData = (4 * sizeof(uint64_t)) +             // header\n                                  (totalNumNodes * sizeof(uint64_t)) + // nodes\n                                  (totalNumEdges * sizeof(uint32_t));  // edges\n  // version 1: determine if padding is necessary at end of file +\n  // add it (64 byte alignment since edges are 32 bytes in version 1)\n  if (totalNumEdges % 2) {\n    byteOffsetToEdgeData += sizeof(uint32_t);\n  }\n  byteOffsetToEdgeData += localEdgeBegin * sizeof(uint32_t);\n\n  return byteOffsetToEdgeData;\n}\n\nUint64Pair getLocalAssignment(uint64_t numToSplit) {\n  auto& net              = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID        = net.ID;\n  uint64_t totalNumHosts = net.Num;\n\n  return galois::block_range((uint64_t)0, numToSplit, hostID, totalNumHosts);\n}\n"
  },
  {
    "path": "tools/dist-graph-convert/dist-graph-convert-helpers.h",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#ifndef _GALOIS_DIST_CONVERT_HELP_\n#define _GALOIS_DIST_CONVERT_HELP_\n\n#include <iostream>\n#include <mutex>\n#include <random>\n\n#include <mpi.h>\n\n#include \"galois/Galois.h\"\n#include \"galois/DynamicBitset.h\"\n#include \"galois/gstl.h\"\n#include \"galois/runtime/Network.h\"\n#include \"galois/DReducible.h\"\n#include \"galois/graphs/OfflineGraph.h\"\n#include \"galois/graphs/BufferedGraph.h\"\n\n// useful typedefs that shorten long declarations\nusing Uint64Pair       = std::pair<uint64_t, uint64_t>;\nusing DoubleUint64Pair = std::pair<Uint64Pair, Uint64Pair>;\nusing VoVUint32        = std::vector<std::vector<uint32_t>>;\nusing PairVoVUint32    = std::pair<VoVUint32, VoVUint32>;\n\n/**\n * Given a binary node mapping, read a specified region into memory.\n *\n * @param nodeMapBinary name of the binary file 
with node mappings\n * @param nodeOffset node offset to begin reading at\n * @param numToRead the number of node mappings (i.e. nodes) to read\n * @returns Vector with the read in node mappings\n */\nstd::vector<uint32_t> readRandomNodeMapping(const std::string& nodeMapBinary,\n                                            uint64_t nodeOffset,\n                                            uint64_t numToRead);\n\n/**\n * Wrapper for MPI calls that return an error code. Make sure it is success\n * else die.\n *\n * @param errcode error code returned by an mpi call\n */\nvoid MPICheck(int errcode);\n\n/**\n * Reads a gr header from a v1 gr binary file and return number of\n * nodes and edges.\n *\n * @param grFile file name of gr\n * @param isVoid true if edge data shouldn't exist in the graph,\n * false otherwise; used for a safety check\n * @returns a pair with the number of nodes and number of edges in the gr file\n * in that order\n */\nUint64Pair readV1GrHeader(const std::string& grFile, bool isVoid);\n\n/**\n * \"Free\" memory used by a vector by swapping it out with an empty one.\n *\n * @tparam VectorTy type of vector\n * @param toFree vector to free memory of\n */\ntemplate <typename VectorTy>\nvoid freeVector(VectorTy& toFree) {\n  VectorTy dummyVector;\n  toFree.swap(dummyVector);\n}\n\n/**\n * Given a vector representing edges, get the number of edges the vector\n * represents.\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param edgeVector vector with edges laid out in src, dest, and optionally\n * data order (i.e. 
3 elements)\n * @returns the number of edges represented by the vector\n */\ntemplate <typename EdgeDataTy>\nsize_t getNumEdges(const std::vector<uint32_t>& edgeVector) {\n  size_t numEdges;\n  if (std::is_void<EdgeDataTy>::value) {\n    numEdges = edgeVector.size() / 2;\n  } else {\n    numEdges = edgeVector.size() / 3;\n  }\n  return numEdges;\n}\n\n/**\n * Given an open ifstream of an edgelist and a range to read,\n * read the edges into memory.\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param edgeListFile open ifstream of an edge list\n * @param localStartByte First byte to read\n * @param localEndByte Last byte to read (non-inclusive)\n * @param totalNumNodes Total number of nodes in the graph: used for correctness\n * checking of src/dest ids\n * @param startAtOne true if the edge list node ids start at 1\n * @returns Vector representing the read in edges: every 2-3 elements represents\n * src, dest, and edge data (if the latter exists)\n */\ntemplate <typename EdgeDataTy>\nstd::vector<uint32_t>\nloadEdgesFromEdgeList(std::ifstream& edgeListFile, uint64_t localStartByte,\n                      uint64_t localEndByte, uint64_t totalNumNodes,\n                      bool startAtOne = false, bool ignoreWeights = false) {\n  // load edges into a vector\n  uint64_t localNumEdges = 0;\n  std::vector<uint32_t> localEdges; // v1 support only + only uint32_t data\n\n  // read lines until last byte\n  edgeListFile.seekg(localStartByte);\n  while ((uint64_t(edgeListFile.tellg()) + 1ul) != localEndByte) {\n    uint64_t src;\n    uint64_t dst;\n    edgeListFile >> src >> dst;\n    if (startAtOne) {\n      src--;\n      dst--;\n    }\n    GALOIS_ASSERT(src < totalNumNodes, \"src \", src, \" and \", totalNumNodes);\n    GALOIS_ASSERT(dst < totalNumNodes, \"dst \", dst, \" and \", totalNumNodes);\n    localEdges.emplace_back(src);\n    localEdges.emplace_back(dst);\n\n    // get (or ignore)edge data: IT ONLY SUPPORTS uint32_t AT THE MOMENT\n    // TODO function 
template specializations necessary to read other graph\n    // data types\n    if (ignoreWeights) {\n      // skip edgeweights\n      edgeListFile >> src;\n    } else if (!std::is_void<EdgeDataTy>::value) {\n      uint32_t edgeData;\n      edgeListFile >> edgeData;\n      localEdges.emplace_back(edgeData);\n    }\n\n    localNumEdges++;\n  }\n\n  if (std::is_void<EdgeDataTy>::value) {\n    GALOIS_ASSERT(localNumEdges == (localEdges.size() / 2));\n  } else {\n    GALOIS_ASSERT(localNumEdges == (localEdges.size() / 3));\n  }\n\n  std::cout << \"[\" << galois::runtime::getSystemNetworkInterface().ID << \"] \"\n            << \"Local num edges from file is \" << localNumEdges << \"\\n\";\n\n  return localEdges;\n}\n\n/**\n * Gets a mapping of host to nodes of all hosts in the system. Divides\n * nodes evenly among hosts.\n *\n * @param numHosts total number of hosts\n * @param totalNumNodes total number of nodes\n * @returns A vector of pairs representing node -> host assignments. Evenly\n * distributed nodes to hosts.\n */\nstd::vector<Uint64Pair> getHostToNodeMapping(uint64_t numHosts,\n                                             uint64_t totalNumNodes);\n\n/**\n * Get the assigned owner of some ID given a mapping from ID to owner.\n *\n * @param gID ID to find owner of\n * @param ownerMapping Vector containing information about which host has which\n * nodes\n * @returns Owner of requested ID on or -1 if it couldn't be found\n */\nuint32_t findOwner(const uint64_t gID,\n                   const std::vector<Uint64Pair>& ownerMapping);\n\n/**\n * Returns the file size of an ifstream.\n *\n * @param openFile an open ifstream\n * @returns file size in bytes of the ifstream\n */\nuint64_t getFileSize(std::ifstream& openFile);\n\n/**\n * Determine the byte range that a host should read from a file.\n *\n * @param edgeListFile edge list file to read\n * @param fileSize total size of the file\n * @returns pair that represents the begin/end of this host's byte range to 
read\n */\nUint64Pair determineByteRange(std::ifstream& edgeListFile, uint64_t fileSize);\n\n/**\n * Accumulates some value from all hosts + return it.\n *\n * @param value value to accumulate across hosts\n * @return Accumulated value (add all values from all hosts up)\n */\nuint64_t accumulateValue(uint64_t value);\n\n/**\n * Find an index into the provided prefix sum that gets the desired \"weight\"\n * (weight comes from the units of the prefix sum).\n *\n * @param targetWeight desired weight that you want the index returned to have\n * @param lb Lower bound of search\n * @param ub Upper bound of search\n * @param prefixSum Prefix sum where the weights/returned index are derived\n * from\n */\nuint64_t findIndexPrefixSum(uint64_t targetWeight, uint64_t lb, uint64_t ub,\n                            const std::vector<uint64_t>& prefixSum);\n\n/**\n * Given a prefix sum, a partition ID, and the total number of partitions,\n * find a good contiguous division using the prefix sum such that\n * partitions get roughly an even amount of units (based on prefix sum).\n *\n * @param id partition ID\n * @param totalID total number of partitions\n * @param prefixSum prefix sum of things that you want to divide among\n * partitions\n * @returns Pair representing the begin/end of the elements that partition\n * \"id\" is assigned based on the prefix sum\n */\nUint64Pair binSearchDivision(uint64_t id, uint64_t totalID,\n                             const std::vector<uint64_t>& prefixSum);\n\n/**\n * Finds the unique source nodes of a set of edges in memory. Assumes\n * edges are laid out in (src, dest) order in the vector.\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param localEdges vector of edges to find unique sources of: needs to have\n * (src, dest) layout (i.e. 
i even: vector[i] is source, i+1 is dest\n * @param uniqueNodeBitset bitset marking which unique nodes are present\n * on this host; should be pre-initialized before being passed into this\n * function\n */\ntemplate <typename EdgeDataTy>\nvoid findUniqueSourceNodes(const std::vector<uint32_t>& localEdges,\n                           galois::DynamicBitSet& uniqueNodeBitset) {\n  uint64_t hostID = galois::runtime::getSystemNetworkInterface().ID;\n  std::cout << \"[\" << hostID << \"] Finding unique nodes\\n\";\n  uniqueNodeBitset.reset();\n\n  uint64_t localNumEdges = getNumEdges<EdgeDataTy>(localEdges);\n  galois::do_all(\n      galois::iterate((uint64_t)0, localNumEdges),\n      [&](uint64_t edgeIndex) {\n        // src node\n        if (std::is_void<EdgeDataTy>::value) {\n          uniqueNodeBitset.set(localEdges[edgeIndex * 2]);\n        } else {\n          uniqueNodeBitset.set(localEdges[edgeIndex * 3]);\n        }\n      },\n      galois::loopname(\"FindUniqueNodes\"));\n\n  std::cout << \"[\" << hostID << \"] Unique nodes found\\n\";\n}\n\n/**\n * Given a chunk to node mapping and a set of unique nodes, find the unique\n * chunks corresponding to the unique nodes provided.\n *\n * @param uniqueNodeBitset Bitset specifying which source nodes exist on the\n * edges this host has read\n * @param chunkToNode a mapping of a chunk to the range of nodes that the chunk\n * has\n * @returns a set of chunk ids corresponding to the nodes passed in (i.e. 
chunks\n * those nodes are included in)\n */\nvoid findUniqueChunks(galois::DynamicBitSet& uniqueNodeBitset,\n                      const std::vector<Uint64Pair>& chunkToNode,\n                      galois::DynamicBitSet& uniqueChunkBitset);\n\n/**\n * Get the edge counts for chunks of edges that we have locally.\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param uniqueChunkBitset Bitset specifying which chunks are present on\n * this host; will be unusable at the end of the function (memory free)\n * @param localEdges loaded edge list laid out in src, dest, src, dest, etc.\n * @param chunkToNode specifies which chunks have which nodes\n * @param chunkCounts (input/output) a 0-initialized vector that will be\n * edited to have our local chunk edge counts\n */\ntemplate <typename EdgeDataTy>\nvoid accumulateLocalEdgesToChunks(galois::DynamicBitSet& uniqueChunkBitset,\n                                  const std::vector<uint32_t>& localEdges,\n                                  const std::vector<Uint64Pair>& chunkToNode,\n                                  std::vector<uint64_t>& chunkCounts) {\n  std::map<uint64_t, std::atomic<uint64_t>> chunkToAccumulator;\n\n  // default-initialize necessary chunk atomics\n  for (size_t i = 0; i < uniqueChunkBitset.size(); i++) {\n    if (uniqueChunkBitset.test(i)) {\n      chunkToAccumulator[i];\n    }\n  }\n\n  freeVector(uniqueChunkBitset.get_vec());\n\n  uint64_t hostID = galois::runtime::getSystemNetworkInterface().ID;\n  std::cout << \"[\" << hostID\n            << \"] Chunk accumulators created: \" << chunkToAccumulator.size()\n            << \" of them\\n\";\n\n  uint64_t localNumEdges = getNumEdges<EdgeDataTy>(localEdges);\n  // determine which chunk edges go to\n  galois::do_all(\n      galois::iterate((uint64_t)0, localNumEdges),\n      [&](uint64_t edgeIndex) {\n        uint32_t src;\n        if (std::is_void<EdgeDataTy>::value) {\n          src = localEdges[edgeIndex * 2];\n        } else {\n          src = 
localEdges[edgeIndex * 3];\n        }\n        uint32_t chunkNum = findOwner(src, chunkToNode);\n        GALOIS_ASSERT(chunkNum != (uint32_t)-1);\n        chunkToAccumulator[chunkNum] += 1;\n      },\n      galois::loopname(\"ChunkInspection\"));\n\n  std::cout << \"[\" << hostID << \"] Chunk accumulators done accumulating\\n\";\n\n  // update chunk count\n  galois::do_all(\n      galois::iterate(chunkToAccumulator.cbegin(), chunkToAccumulator.cend()),\n      [&](auto& chunkAndCount) {\n        chunkCounts[chunkAndCount.first] += chunkAndCount.second.load();\n      },\n      galois::loopname(\"ChunkCountUpdate\"));\n}\n\n/**\n * Synchronize chunk edge counts across all hosts, i.e. send and receive\n * local chunk counts and update them.\n *\n * @param chunkCounts local edge chunk counts to be updated to a global chunk\n * edge count across all hosts\n */\nvoid sendAndReceiveEdgeChunkCounts(std::vector<uint64_t>& chunkCounts);\n\n/**\n * Get the number of edges that each node chunk has.\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param uniqueChunkBitset bitset specifying which chunks are present on this\n * host\n * @param localEdges loaded edge list laid out in src, dest, src, dest, etc.\n * @param chunkToNode specifies which chunks have which nodes\n * @returns A vector specifying the number of edges each chunk has\n */\ntemplate <typename EdgeDataTy>\nstd::vector<uint64_t>\ngetChunkEdgeCounts(galois::DynamicBitSet& uniqueChunkBitset,\n                   const std::vector<uint32_t>& localEdges,\n                   const std::vector<Uint64Pair>& chunkToNode) {\n  std::vector<uint64_t> chunkCounts;\n  chunkCounts.assign(uniqueChunkBitset.size(), 0);\n  accumulateLocalEdgesToChunks<EdgeDataTy>(uniqueChunkBitset, localEdges,\n                                           chunkToNode, chunkCounts);\n  sendAndReceiveEdgeChunkCounts(chunkCounts);\n\n  return chunkCounts;\n}\n\n/**\n * Given a chunk edge count prefix sum and the chunk to node mapping, assign\n 
* chunks (i.e. nodes) to hosts in an attempt to keep hosts with an about even\n * number of edges and return the node mapping.\n *\n * @param chunkCountsPrefixSum prefix sum of edges in chunks\n * @param chunkToNode mapping of chunk to nodes the chunk has\n * @returns a host to node mapping where each host very roughly has a balanced\n * number of edges\n */\nstd::vector<Uint64Pair>\ngetChunkToHostMapping(const std::vector<uint64_t>& chunkCountsPrefixSum,\n                      const std::vector<Uint64Pair>& chunkToNode);\n\n/**\n * Attempts to evenly assign nodes to hosts such that each host roughly gets\n * an even number of edges.\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param localEdges in-memory buffer of edges this host has loaded\n * @param totalNodeCount total number of nodes in the entire graph\n * @param totalEdgeCount total number of edges in the entire graph\n * @returns a mapping of host to nodes where each host gets an attempted\n * roughly even amount of edges\n */\ntemplate <typename EdgeDataTy>\nstd::vector<Uint64Pair>\ngetEvenNodeToHostMapping(const std::vector<uint32_t>& localEdges,\n                         uint64_t totalNodeCount, uint64_t totalEdgeCount) {\n  auto& net              = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID        = net.ID;\n  uint64_t totalNumHosts = net.Num;\n\n  uint64_t numNodeChunks = totalEdgeCount / totalNumHosts;\n  // TODO better heuristics: basically we don't want to run out of memory,\n  // so keep number of chunks from growing too large\n  while (numNodeChunks > 10000000) {\n    numNodeChunks /= 2;\n  }\n\n  std::vector<Uint64Pair> chunkToNode;\n\n  if (hostID == 0) {\n    std::cout << \"Num chunks is \" << numNodeChunks << \"\\n\";\n  }\n\n  for (unsigned i = 0; i < numNodeChunks; i++) {\n    chunkToNode.emplace_back(galois::block_range(\n        (uint64_t)0, (uint64_t)totalNodeCount, i, numNodeChunks));\n  }\n\n  std::cout << \"[\" << hostID << \"] Determining edge to 
chunk counts\\n\";\n\n  galois::DynamicBitSet uniqueNodeBitset;\n  uniqueNodeBitset.resize(totalNodeCount);\n  findUniqueSourceNodes<EdgeDataTy>(localEdges, uniqueNodeBitset);\n\n  galois::DynamicBitSet uniqueChunkBitset;\n  uniqueChunkBitset.resize(numNodeChunks);\n  findUniqueChunks(uniqueNodeBitset, chunkToNode, uniqueChunkBitset);\n\n  std::vector<uint64_t> chunkCounts = getChunkEdgeCounts<EdgeDataTy>(\n      uniqueChunkBitset, localEdges, chunkToNode);\n  std::cout << \"[\" << hostID << \"] Edge to chunk counts determined\\n\";\n\n  // prefix sum on the chunks (reuse array to save memory)\n  for (unsigned i = 1; i < numNodeChunks; i++) {\n    chunkCounts[i] += chunkCounts[i - 1];\n  }\n\n  // to make access to chunkToNode's last element correct with regard to later\n  // access (without this access to chunkToNode[chunkSize] is out of bounds)\n  chunkToNode.emplace_back(Uint64Pair(totalNodeCount, totalNodeCount));\n\n  std::vector<Uint64Pair> finalMapping =\n      getChunkToHostMapping(chunkCounts, chunkToNode);\n\n  return finalMapping;\n}\n\n/**\n * Using OfflineGraph to read the binary gr, divide nodes among hosts such\n * that each hosts gets roughly an even amount of edges to read.\n *\n * @param inputGr file name of the input Galois binary graph\n * @returns 2 pairs: 1 pair specifies what nodes this host is responsible\n * for reading, the other pair specifies what edges this host is responsible\n * for reading\n */\nDoubleUint64Pair getNodesToReadFromGr(const std::string& inputGr);\n\n/**\n * Load a Galois binary graph into an BufferedGraph and load assigned\n * nodes/edges into memory.\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param inputFile path to input Galois binary graph\n * @param nodesToRead a pair that has the range of nodes that should be read\n * @param edgesToRead a pair that has the range of edges that should be read\n * @param totalNumNodes Total number of nodes in the graph\n * @param totalNumEdges Total number of edges in 
the graph\n * @returns a vector with edges corresponding to the nodes/edges pass into the\n * function\n */\ntemplate <typename EdgeDataTy>\nstd::vector<uint32_t>\nloadEdgesFromBufferedGraph(const std::string& inputFile, Uint64Pair nodesToRead,\n                           Uint64Pair edgesToRead, uint64_t totalNumNodes,\n                           uint64_t totalNumEdges) {\n  galois::graphs::BufferedGraph<EdgeDataTy> bufGraph;\n  bufGraph.loadPartialGraph(inputFile, nodesToRead.first, nodesToRead.second,\n                            edgesToRead.first, edgesToRead.second,\n                            totalNumNodes, totalNumEdges);\n\n  std::vector<uint32_t> edgeData;\n\n  // void = 2 elements per edge; non-void = 3 elements per edge\n  if (std::is_void<EdgeDataTy>::value) {\n    edgeData.resize((edgesToRead.second - edgesToRead.first) * 2);\n  } else {\n    edgeData.resize((edgesToRead.second - edgesToRead.first) * 3);\n  }\n\n  if (edgeData.size() > 0) {\n    galois::do_all(\n        galois::iterate(nodesToRead.first, nodesToRead.second),\n        [&](uint32_t gID) {\n          uint64_t edgeBegin = *bufGraph.edgeBegin(gID);\n          uint64_t edgeEnd   = *bufGraph.edgeEnd(gID);\n\n          // offset into which we should start writing data in\n          // edgeData\n          uint64_t edgeDataOffset;\n          if (std::is_void<EdgeDataTy>::value) {\n            edgeDataOffset = (edgeBegin - edgesToRead.first) * 2;\n          } else {\n            edgeDataOffset = (edgeBegin - edgesToRead.first) * 3;\n          }\n\n          // loop through all edges\n          for (uint64_t i = edgeBegin; i < edgeEnd; i++) {\n            uint32_t edgeDest            = bufGraph.edgeDestination(i);\n            edgeData[edgeDataOffset]     = gID;\n            edgeData[edgeDataOffset + 1] = edgeDest;\n\n            if (std::is_void<EdgeDataTy>::value) {\n              edgeDataOffset += 2;\n            } else {\n              edgeData[edgeDataOffset + 2] = bufGraph.edgeData(i);\n     
         edgeDataOffset += 3;\n            }\n          }\n        },\n        galois::steal(), galois::loopname(\"LoadEdgesBufferedGraph\"));\n  }\n\n  return edgeData;\n}\n\n/**\n * Load a Galois binary graph into an BufferedGraph and load assigned\n * nodes/edges into memory such that srcs become dests and dests become srcs\n * (i.e. transpose graph).\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param inputFile path to input Galois binary graph\n * @param nodesToRead a pair that has the range of nodes that should be read\n * @param edgesToRead a pair that has the range of edges that should be read\n * @param totalNumNodes Total number of nodes in the graph\n * @param totalNumEdges Total number of edges in the graph\n * @returns a vector with transposed edges corresponding to the nodes/edges\n * pass into the function\n */\ntemplate <typename EdgeDataTy>\nstd::vector<uint32_t> loadTransposedEdgesFromBufferedGraph(\n    const std::string& inputFile, Uint64Pair nodesToRead,\n    Uint64Pair edgesToRead, uint64_t totalNumNodes, uint64_t totalNumEdges) {\n  galois::graphs::BufferedGraph<EdgeDataTy> bufGraph;\n  bufGraph.loadPartialGraph(inputFile, nodesToRead.first, nodesToRead.second,\n                            edgesToRead.first, edgesToRead.second,\n                            totalNumNodes, totalNumEdges);\n\n  std::vector<uint32_t> edgeData;\n\n  // void = 2 elements per edge; non-void = 3 elements per edge\n  if (std::is_void<EdgeDataTy>::value) {\n    edgeData.resize((edgesToRead.second - edgesToRead.first) * 2);\n  } else {\n    edgeData.resize((edgesToRead.second - edgesToRead.first) * 3);\n  }\n\n  if (edgeData.size() > 0) {\n    galois::do_all(\n        galois::iterate(nodesToRead.first, nodesToRead.second),\n        [&](uint32_t gID) {\n          uint64_t edgeBegin = *bufGraph.edgeBegin(gID);\n          uint64_t edgeEnd   = *bufGraph.edgeEnd(gID);\n\n          // offset into which we should start writing data in\n          // edgeData\n       
   uint64_t edgeDataOffset;\n          if (std::is_void<EdgeDataTy>::value) {\n            edgeDataOffset = (edgeBegin - edgesToRead.first) * 2;\n          } else {\n            edgeDataOffset = (edgeBegin - edgesToRead.first) * 3;\n          }\n\n          // loop through all edges\n          for (uint64_t i = edgeBegin; i < edgeEnd; i++) {\n            uint32_t edgeSource = bufGraph.edgeDestination(i);\n            // src is saved as dest and dest is saved as source\n            // (transpose)\n            edgeData[edgeDataOffset]     = edgeSource;\n            edgeData[edgeDataOffset + 1] = gID;\n\n            if (std::is_void<EdgeDataTy>::value) {\n              edgeDataOffset += 2;\n            } else {\n              edgeData[edgeDataOffset + 2] = bufGraph.edgeData(i);\n              edgeDataOffset += 3;\n            }\n          }\n        },\n        galois::steal(), galois::loopname(\"LoadTransposeEdgesBufferedGraph\"));\n  }\n\n  return edgeData;\n}\n\n/**\n * Load a Galois binary graph into an BufferedGraph and load assigned\n * nodes/edges into memory such that each edge is loaded twice (extra in reverse\n * direction).\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param inputFile path to input Galois binary graph\n * @param nodesToRead a pair that has the range of nodes that should be read\n * @param edgesToRead a pair that has the range of edges that should be read\n * @param totalNumNodes Total number of nodes in the graph\n * @param totalNumEdges Total number of edges in the graph\n * @returns a vector with edges corresponding to the nodes/edges\n * passed into the function; 1 edge in original becomes 2\n */\ntemplate <typename EdgeDataTy>\nstd::vector<uint32_t> loadSymmetricEdgesFromBufferedGraph(\n    const std::string& inputFile, Uint64Pair nodesToRead,\n    Uint64Pair edgesToRead, uint64_t totalNumNodes, uint64_t totalNumEdges) {\n  // TODO change this\n  galois::graphs::BufferedGraph<EdgeDataTy> bufGraph;\n  
bufGraph.loadPartialGraph(inputFile, nodesToRead.first, nodesToRead.second,\n                            edgesToRead.first, edgesToRead.second,\n                            totalNumNodes, totalNumEdges);\n\n  std::vector<uint32_t> edgeData;\n\n  // void = 2 elements per edge; non-void = 3 elements per edge\n  if (std::is_void<EdgeDataTy>::value) {\n    edgeData.resize(((edgesToRead.second - edgesToRead.first) * 2) * 2);\n  } else {\n    edgeData.resize(((edgesToRead.second - edgesToRead.first) * 3) * 2);\n  }\n\n  if (edgeData.size() > 0) {\n    galois::do_all(\n        galois::iterate(nodesToRead.first, nodesToRead.second),\n        [&](uint32_t gID) {\n          uint64_t edgeBegin = *bufGraph.edgeBegin(gID);\n          uint64_t edgeEnd   = *bufGraph.edgeEnd(gID);\n\n          // offset into which we should start writing data in\n          // edgeData\n          uint64_t edgeDataOffset;\n          if (std::is_void<EdgeDataTy>::value) {\n            edgeDataOffset = (edgeBegin - edgesToRead.first) * 4;\n          } else {\n            edgeDataOffset = (edgeBegin - edgesToRead.first) * 6;\n          }\n\n          // loop through all edges, create 2 edges for every edge\n          for (uint64_t i = edgeBegin; i < edgeEnd; i++) {\n            uint32_t edgeDest            = bufGraph.edgeDestination(i);\n            edgeData[edgeDataOffset]     = gID;\n            edgeData[edgeDataOffset + 1] = edgeDest;\n\n            if (std::is_void<EdgeDataTy>::value) {\n              edgeData[edgeDataOffset + 2] = edgeDest;\n              edgeData[edgeDataOffset + 3] = gID;\n              edgeDataOffset += 4;\n            } else {\n              uint32_t edgeWeight = bufGraph.edgeData(i);\n\n              edgeData[edgeDataOffset + 2] = edgeWeight;\n\n              edgeData[edgeDataOffset + 3] = edgeDest;\n              edgeData[edgeDataOffset + 4] = gID;\n              edgeData[edgeDataOffset + 5] = edgeWeight;\n\n              edgeDataOffset += 6;\n            }\n          }\n    
    },\n        galois::steal(), galois::loopname(\"LoadSymmetricEdgesBufferedGraph\"));\n  }\n\n  return edgeData;\n}\n\n/**\n * Load a Galois binary graph such that multiedges and self loops are ignored.\n * Weights are completely ignored regardless if they exist or not.\n *\n * @param inputFile path to input Galois binary graph\n * @param nodesToRead a pair that has the range of nodes that should be read\n * @param edgesToRead a pair that has the range of edges that should be read\n * @param totalNumNodes Total number of nodes in the graph\n * @param totalNumEdges Total number of edges in the graph\n * @returns a vector with edges corresponding to the nodes/edges\n * passed into the function; multi edges and self loops removed\n */\nstd::vector<uint32_t>\nloadCleanEdgesFromBufferedGraph(const std::string& inputFile,\n                                Uint64Pair nodesToRead, Uint64Pair edgesToRead,\n                                uint64_t totalNumNodes, uint64_t totalNumEdges,\n                                bool keepSelfLoops);\n\n/**\n * Loads the node to new node mapping, then reads the edges that this host\n * has been assigned into a buffer. The catch is that it reads them\n * in a TRANSPOSED manner, and it remaps the original source node to\n * its new node id.\n *\n * i.e. 
source nodes are remapped to new id, but destination nodes aren't\n * Edges are returned in a destination, source (and optionally edge data)\n * order\n *\n * @tparam EdgeDataTy type of edge data to read\n *\n * @param inputFile path to input Galois binary graph\n * @param nodesToRead a pair that has the range of nodes that should be read\n * @param edgesToRead a pair that has the range of edges that should be read\n * @param totalNumNodes Total number of nodes in the graph\n * @param totalNumEdges Total number of edges in the graph\n * @param mappedBinary binary file with info that maps a node to its new node\n *\n * @returns A vector of transposed edges (with or without edge data\n * depending on edge data type)\n */\ntemplate <typename EdgeDataTy>\nstd::vector<uint32_t> loadMappedSourceEdgesFromBufferedGraph(\n    const std::string& inputFile, Uint64Pair nodesToRead,\n    Uint64Pair edgesToRead, uint64_t totalNumNodes, uint64_t totalNumEdges,\n    const std::string& mappedBinary) {\n  galois::graphs::BufferedGraph<EdgeDataTy> bufGraph;\n  bufGraph.loadPartialGraph(inputFile, nodesToRead.first, nodesToRead.second,\n                            edgesToRead.first, edgesToRead.second,\n                            totalNumNodes, totalNumEdges);\n  std::vector<uint32_t> edgeData;\n  // void = 2 elements per edge; non-void = 3 elements per edge\n  if (std::is_void<EdgeDataTy>::value) {\n    edgeData.resize((edgesToRead.second - edgesToRead.first) * 2);\n  } else {\n    edgeData.resize((edgesToRead.second - edgesToRead.first) * 3);\n  }\n\n  std::vector<uint32_t> node2NewNode = readRandomNodeMapping(\n      mappedBinary, nodesToRead.first, nodesToRead.second - nodesToRead.first);\n\n  if (edgeData.size() > 0) {\n    galois::do_all(\n        galois::iterate(nodesToRead.first, nodesToRead.second),\n        [&](uint32_t gID) {\n          uint64_t edgeBegin = *bufGraph.edgeBegin(gID);\n          uint64_t edgeEnd   = *bufGraph.edgeEnd(gID);\n\n          // offset into which 
we should start writing data in\n          // edgeData\n          uint64_t edgeDataOffset;\n          if (std::is_void<EdgeDataTy>::value) {\n            edgeDataOffset = (edgeBegin - edgesToRead.first) * 2;\n          } else {\n            edgeDataOffset = (edgeBegin - edgesToRead.first) * 3;\n          }\n\n          uint32_t lID          = gID - nodesToRead.first;\n          uint32_t mappedSource = node2NewNode[lID];\n\n          // loop through all edges\n          for (uint64_t i = edgeBegin; i < edgeEnd; i++) {\n            uint32_t edgeSource = bufGraph.edgeDestination(i);\n            // src is saved as dest and dest is saved as source\n            // (transpose)\n            edgeData[edgeDataOffset]     = edgeSource;\n            edgeData[edgeDataOffset + 1] = mappedSource;\n\n            if (std::is_void<EdgeDataTy>::value) {\n              edgeDataOffset += 2;\n            } else {\n              edgeData[edgeDataOffset + 2] = bufGraph.edgeData(i);\n              edgeDataOffset += 3;\n            }\n          }\n        },\n        galois::steal(), galois::loopname(\"RemapDestinations\"));\n  }\n\n  return edgeData;\n}\n\n/**\n * Determine/send to each host how many edges they should expect to receive\n * from the caller (i.e. 
this host).\n *\n * @tparam EdgeDataTy type of edge data to read\n * @param hostToNodes mapping of a host to the nodes it is assigned\n * @param localEdges in-memory buffer of edges this host has loaded\n */\ntemplate <typename EdgeDataTy>\nvoid sendEdgeCounts(const std::vector<Uint64Pair>& hostToNodes,\n                    const std::vector<uint32_t>& localEdges) {\n  auto& net              = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID        = net.ID;\n  uint64_t totalNumHosts = net.Num;\n\n  std::cout << \"[\" << hostID << \"] Determinining edge counts\\n\";\n\n  std::vector<galois::GAccumulator<uint64_t>> numEdgesPerHost(totalNumHosts);\n\n  uint64_t localNumEdges = getNumEdges<EdgeDataTy>(localEdges);\n  // determine to which host each edge will go\n  galois::do_all(\n      galois::iterate((uint64_t)0, localNumEdges),\n      [&](uint64_t edgeIndex) {\n        uint32_t src;\n        if (std::is_void<EdgeDataTy>::value) {\n          src = localEdges[edgeIndex * 2];\n        } else {\n          src = localEdges[edgeIndex * 3];\n        }\n\n        uint32_t edgeOwner = findOwner(src, hostToNodes);\n        numEdgesPerHost[edgeOwner] += 1;\n      },\n      galois::loopname(\"EdgeInspection\"));\n\n  std::cout << \"[\" << hostID << \"] Sending edge counts\\n\";\n\n  for (unsigned h = 0; h < totalNumHosts; h++) {\n    if (h == hostID)\n      continue;\n    galois::runtime::SendBuffer b;\n    galois::runtime::gSerialize(b, numEdgesPerHost[h].reduce());\n    net.sendTagged(h, galois::runtime::evilPhase, b);\n  }\n};\n\n/**\n * Receive the messages from other hosts that tell this host how many edges\n * it should expect to receive. 
Should be called after sendEdgesCounts.\n *\n * @returns the number of edges that the caller host should expect to receive\n * in total from all other hosts\n */\nuint64_t receiveEdgeCounts();\n\n/**\n * Loop through all local edges and send them to the host they are assigned to.\n *\n * @param hostToNodes mapping of a host to the nodes it is assigned\n * @param localEdges in-memory buffer of edges this host has loaded\n * @param localSrcToDest local mapping of LOCAL sources to destinations (we\n * may have some edges that do not need sending; they are saved here)\n * @param localSrcToData Vector of vectors: the vector at index i specifies\n * the data of edges owned by local node i\n * @param nodeLocks Vector of mutexes (one for each local node) that are used\n * when writing to the local mapping of sources to destinations since vectors\n * are not thread safe\n */\n// TODO make implementation smaller/cleaner i.e. refactor\n// TODO merge with the non void version below because the code duplication\n// here is ugly and messy\ntemplate <\n    typename EdgeDataTy,\n    typename std::enable_if<std::is_void<EdgeDataTy>::value>::type* = nullptr>\nvoid sendAssignedEdges(const std::vector<Uint64Pair>& hostToNodes,\n                       const std::vector<uint32_t>& localEdges,\n                       std::vector<std::vector<uint32_t>>& localSrcToDest,\n                       std::vector<std::vector<uint32_t>>&,\n                       std::vector<std::mutex>& nodeLocks) {\n  auto& net              = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID        = net.ID;\n  uint64_t totalNumHosts = net.Num;\n\n  std::cout << \"[\" << hostID << \"] Going to send assigned edges\\n\";\n\n  using EdgeVectorTy = std::vector<std::vector<uint32_t>>;\n  galois::substrate::PerThreadStorage<EdgeVectorTy> dstVectors(totalNumHosts);\n\n  using SendBufferVectorTy = std::vector<galois::runtime::SendBuffer>;\n  galois::substrate::PerThreadStorage<SendBufferVectorTy> 
sendBuffers(\n      totalNumHosts);\n  galois::substrate::PerThreadStorage<std::vector<uint64_t>>\n      lastSourceSentStorage(totalNumHosts);\n\n  // initialize last source sent\n  galois::on_each([&](unsigned, unsigned) {\n    for (unsigned h = 0; h < totalNumHosts; h++) {\n      (*(lastSourceSentStorage.getLocal()))[h] = 0;\n    }\n  });\n\n  std::cout << \"[\" << hostID << \"] Passing through edges and assigning\\n\";\n\n  uint64_t localNumEdges = getNumEdges<EdgeDataTy>(localEdges);\n  // determine to which host each edge will go\n  galois::do_all(\n      galois::iterate((uint64_t)0, localNumEdges),\n      [&](uint64_t edgeIndex) {\n        uint32_t src       = localEdges[edgeIndex * 2];\n        uint32_t edgeOwner = findOwner(src, hostToNodes);\n        uint32_t dst       = localEdges[(edgeIndex * 2) + 1];\n        uint32_t localID   = src - hostToNodes[edgeOwner].first;\n\n        if (edgeOwner != hostID) {\n          // send off to correct host\n          auto& hostSendBuffer = (*(sendBuffers.getLocal()))[edgeOwner];\n          auto& dstVector      = (*(dstVectors.getLocal()))[edgeOwner];\n          auto& lastSourceSent =\n              (*(lastSourceSentStorage.getLocal()))[edgeOwner];\n\n          if (lastSourceSent == localID) {\n            dstVector.emplace_back(dst);\n          } else {\n            // serialize vector if anything exists in it + send\n            // buffer if reached some limit\n            if (dstVector.size() > 0) {\n              uint64_t globalSourceID =\n                  lastSourceSent + hostToNodes[edgeOwner].first;\n              galois::runtime::gSerialize(hostSendBuffer, globalSourceID,\n                                          dstVector);\n              dstVector.clear();\n              if (hostSendBuffer.size() > 1400) {\n                net.sendTagged(edgeOwner, galois::runtime::evilPhase,\n                               hostSendBuffer);\n                hostSendBuffer.getVec().clear();\n              }\n            }\n\n 
           dstVector.emplace_back(dst);\n            lastSourceSent = localID;\n          }\n        } else {\n          // save to edge dest array\n          nodeLocks[localID].lock();\n          localSrcToDest[localID].emplace_back(dst);\n          nodeLocks[localID].unlock();\n        }\n      },\n      galois::loopname(\"Pass2\"));\n\n  std::cout << \"[\" << hostID << \"] Buffer cleanup\\n\";\n\n  // cleanup: each thread serialize + send out remaining stuff\n  galois::on_each(\n      [&](unsigned, unsigned) {\n        for (unsigned h = 0; h < totalNumHosts; h++) {\n          if (h == hostID)\n            continue;\n          auto& hostSendBuffer    = (*(sendBuffers.getLocal()))[h];\n          auto& dstVector         = (*(dstVectors.getLocal()))[h];\n          uint64_t lastSourceSent = (*(lastSourceSentStorage.getLocal()))[h];\n\n          if (dstVector.size() > 0) {\n            uint64_t globalSourceID = lastSourceSent + hostToNodes[h].first;\n            galois::runtime::gSerialize(hostSendBuffer, globalSourceID,\n                                        dstVector);\n            dstVector.clear();\n          }\n\n          if (hostSendBuffer.size() > 0) {\n            net.sendTagged(h, galois::runtime::evilPhase, hostSendBuffer);\n            hostSendBuffer.getVec().clear();\n          }\n        }\n      },\n      galois::loopname(\"Pass2Cleanup\"));\n}\n\n// Non-void variant of the above; uint32_t only\ntemplate <\n    typename EdgeDataTy,\n    typename std::enable_if<!std::is_void<EdgeDataTy>::value>::type* = nullptr>\nvoid sendAssignedEdges(const std::vector<Uint64Pair>& hostToNodes,\n                       const std::vector<uint32_t>& localEdges,\n                       std::vector<std::vector<uint32_t>>& localSrcToDest,\n                       std::vector<std::vector<uint32_t>>& localSrcToData,\n                       std::vector<std::mutex>& nodeLocks) {\n  auto& net              = galois::runtime::getSystemNetworkInterface();\n  uint64_t hostID        = 
net.ID;\n  uint64_t totalNumHosts = net.Num;\n\n  std::cout << \"[\" << hostID << \"] Going to send assigned edges\\n\";\n\n  // initialize localsrctodata\n  GALOIS_ASSERT(localSrcToData.empty());\n  using EdgeVectorTy = std::vector<std::vector<uint32_t>>;\n  EdgeVectorTy tmp   = EdgeVectorTy(localSrcToDest.size());\n  localSrcToData.swap(tmp);\n  GALOIS_ASSERT(localSrcToData.size() == localSrcToDest.size());\n\n  galois::substrate::PerThreadStorage<EdgeVectorTy> dstVectors(totalNumHosts);\n  // currently only uint32_t support for edge data\n  galois::substrate::PerThreadStorage<EdgeVectorTy> dataVectors(totalNumHosts);\n\n  using SendBufferVectorTy = std::vector<galois::runtime::SendBuffer>;\n  galois::substrate::PerThreadStorage<SendBufferVectorTy> sendBuffers(\n      totalNumHosts);\n  galois::substrate::PerThreadStorage<std::vector<uint64_t>>\n      lastSourceSentStorage(totalNumHosts);\n\n  // initialize last source sent\n  galois::on_each([&](unsigned, unsigned) {\n    for (unsigned h = 0; h < totalNumHosts; h++) {\n      (*(lastSourceSentStorage.getLocal()))[h] = 0;\n    }\n  });\n\n  std::cout << \"[\" << hostID << \"] Passing through edges and assigning\\n\";\n\n  uint64_t localNumEdges = getNumEdges<EdgeDataTy>(localEdges);\n  // determine to which host each edge will go\n  galois::do_all(\n      galois::iterate((uint64_t)0, localNumEdges),\n      [&](uint64_t edgeIndex) {\n        uint32_t src       = localEdges[edgeIndex * 3];\n        uint32_t edgeOwner = findOwner(src, hostToNodes);\n        uint32_t dst       = localEdges[(edgeIndex * 3) + 1];\n        uint32_t localID   = src - hostToNodes[edgeOwner].first;\n        uint32_t edgeData  = localEdges[(edgeIndex * 3) + 2];\n\n        if (edgeOwner != hostID) {\n          // send off to correct host\n          auto& hostSendBuffer = (*(sendBuffers.getLocal()))[edgeOwner];\n          auto& dstVector      = (*(dstVectors.getLocal()))[edgeOwner];\n          auto& dataVector     = 
(*(dataVectors.getLocal()))[edgeOwner];\n          auto& lastSourceSent =\n              (*(lastSourceSentStorage.getLocal()))[edgeOwner];\n\n          if (lastSourceSent == localID) {\n            dstVector.emplace_back(dst);\n            dataVector.emplace_back(edgeData);\n          } else {\n            // serialize vector if anything exists in it + send buffer if\n            // reached some limit\n            if (dstVector.size() > 0) {\n              uint64_t globalSourceID =\n                  lastSourceSent + hostToNodes[edgeOwner].first;\n              galois::runtime::gSerialize(hostSendBuffer, globalSourceID,\n                                          dstVector, dataVector);\n              dstVector.clear();\n              dataVector.clear();\n              if (hostSendBuffer.size() > 1400) {\n                net.sendTagged(edgeOwner, galois::runtime::evilPhase,\n                               hostSendBuffer);\n                hostSendBuffer.getVec().clear();\n              }\n            }\n\n            dstVector.emplace_back(dst);\n            dataVector.emplace_back(edgeData);\n            lastSourceSent = localID;\n          }\n        } else {\n          // save to edge dest array\n          nodeLocks[localID].lock();\n          localSrcToDest[localID].emplace_back(dst);\n          localSrcToData[localID].emplace_back(edgeData);\n          nodeLocks[localID].unlock();\n        }\n      },\n      galois::loopname(\"Pass2\"));\n\n  std::cout << \"[\" << hostID << \"] Buffer cleanup\\n\";\n\n  // cleanup: each thread serialize + send out remaining stuff\n  galois::on_each(\n      [&](unsigned, unsigned) {\n        for (unsigned h = 0; h < totalNumHosts; h++) {\n          if (h == hostID)\n            continue;\n          auto& hostSendBuffer    = (*(sendBuffers.getLocal()))[h];\n          auto& dstVector         = (*(dstVectors.getLocal()))[h];\n          auto& dataVector        = (*(dataVectors.getLocal()))[h];\n          uint64_t lastSourceSent = 
(*(lastSourceSentStorage.getLocal()))[h];\n\n          if (dstVector.size() > 0) {\n            uint64_t globalSourceID = lastSourceSent + hostToNodes[h].first;\n            galois::runtime::gSerialize(hostSendBuffer, globalSourceID,\n                                        dstVector, dataVector);\n            dstVector.clear();\n            dataVector.clear();\n          }\n\n          if (hostSendBuffer.size() > 0) {\n            net.sendTagged(h, galois::runtime::evilPhase, hostSendBuffer);\n            hostSendBuffer.getVec().clear();\n          }\n        }\n      },\n      galois::loopname(\"Pass2Cleanup\"));\n}\n\n/**\n * Receive this host's assigned edges: should be called after sendAssignedEdges.\n *\n * @param edgesToReceive the number of edges we expect to receive; the function\n * will not exit until all expected edges are received\n * @param hostToNodes mapping of a host to the nodes it is assigned\n * @param localSrcToDest local mapping of LOCAL sources to destinations (we\n * may have some edges that do not need sending; they are saved here)\n * @param localSrcToData Vector of vectors: the vector at index i specifies\n * the data of edges owned by local node i; NOTE THAT THIS VECTOR BEING EMPTY\n * OR NON EMPTY DETERMINES IF THE FUNCTION EXPECTS TO RECEIVE EDGE DATA\n * @param nodeLocks Vector of mutexes (one for each local node) that are used\n * when writing to the local mapping of sources to destinations since vectors\n * are not thread safe\n */\nvoid receiveAssignedEdges(std::atomic<uint64_t>& edgesToReceive,\n                          const std::vector<Uint64Pair>& hostToNodes,\n                          std::vector<std::vector<uint32_t>>& localSrcToDest,\n                          std::vector<std::vector<uint32_t>>& localSrcToData,\n                          std::vector<std::mutex>& nodeLocks);\n\n/**\n * Send/receive other hosts number of assigned edges.\n *\n * @param localAssignedEdges number of edges assigned to this host\n * @returns a 
vector that has every hosts number of locally assigned edges\n */\nstd::vector<uint64_t> getEdgesPerHost(uint64_t localAssignedEdges);\n\n/**\n * Given a vector of vectors, \"flatten\" it by merging them into 1 vector\n * in the order they appear the in the vector.\n *\n * @param vectorOfVectors vector of vectors to flatten. FUNCTION WILL ERASE\n * ALL DATA IN THE VECTOR.\n * @returns a flattened vector from vectorOfVectors\n */\nstd::vector<uint32_t>\nflattenVectors(std::vector<std::vector<uint32_t>>& vectorOfVectors);\n\n/**\n * Writes a binary galois graph's header information.\n *\n * @param gr File to write to\n * @param version Version of the galois binary graph file\n * @param sizeOfEdge Size of edge data (0 if there is no edge data)\n * @param totalNumNodes total number of nodes in the graph\n * @param totalEdgeConnt total number of edges in the graph\n */\nvoid writeGrHeader(MPI_File& gr, uint64_t version, uint64_t sizeOfEdge,\n                   uint64_t totalNumNodes, uint64_t totalEdgeCount);\n\n/**\n * Writes the node index data of a galois binary graph.\n *\n * @param gr File to write to\n * @param nodesToWrite number of nodes to write\n * @param nodeIndexOffset offset into file specifying where to start writing\n * @param edgePrefixSum the node index data to write into the file (index data\n * in graph tells you where to start looking for edges of some node, i.e.\n * it's a prefix sum)\n */\nvoid writeNodeIndexData(MPI_File& gr, uint64_t nodesToWrite,\n                        uint64_t nodeIndexOffset,\n                        const std::vector<uint64_t>& edgePrefixSum);\n\n/**\n * Writes the edge destination data of a galois binary graph.\n *\n * @param gr File to write to\n * @param edgeDestOffset offset into file specifying where to start writing\n * @param localSrcToDest Vector of vectors: the vector at index i specifies\n * the destinations for local src node i\n */\nvoid writeEdgeDestData(MPI_File& gr, uint64_t edgeDestOffset,\n                  
     std::vector<std::vector<uint32_t>>& localSrcToDest);\n\n/**\n * Writes the edge destination data of a galois binary graph.\n * @param gr File to write to\n * @param edgeDestOffset offset into file specifying where to start writing\n * @param destVector Vector of edge destinations IN THE ORDER THAT THEY SHOULD\n * BE WRITTEN (i.e. in correct order corresponding to node order this host has)\n */\nvoid writeEdgeDestData(MPI_File& gr, uint64_t edgeDestOffset,\n                       std::vector<uint32_t>& destVector);\n/**\n * Writes the edge data data of a galois binary graph.\n *\n * @param gr File to write to\n * @param edgeDataOffset offset into file specifying where to start writing\n * @param edgeDataToWrite vector of localNumEdges elements corresponding to\n * edge data that needs to be written\n */\nvoid writeEdgeDataData(MPI_File& gr, uint64_t edgeDataOffset,\n                       const std::vector<uint32_t>& edgeDataToWrite);\n\n/**\n * Write graph data out to a V1 Galois binary graph file.\n *\n * @param outputFile name of file to write to\n * @param totalNumNodes total number of nodes in the graph\n * @param totalNumEdges total number of edges in graph\n * @param localNumNodes number of source nodes that this host was assigned to\n * write\n * @param localNodeBegin global id of first node this host was assigned\n * @param globalEdgeOffset number of edges to skip to get to the first edge\n * this host is responsible for\n * @param localSrcToDest Vector of vectors: the vector at index i specifies\n * the destinations of edges owned by local node i\n * @param localSrcToData Vector of vectors: the vector at index i specifies\n * the data of edges owned by local node i\n */\nvoid writeToGr(const std::string& outputFile, uint64_t totalNumNodes,\n               uint64_t totalNumEdges, uint64_t localNumNodes,\n               uint64_t localNodeBegin, uint64_t globalEdgeOffset,\n               std::vector<std::vector<uint32_t>>& localSrcToDest,\n               
std::vector<std::vector<uint32_t>>& localSrcToData);\n\n/**\n * Write graph data out to a Lux binary graph file.\n *\n * @param outputFile name of file to write to\n * @param totalNumNodes total number of nodes in the graph\n * @param totalNumEdges total number of edges in graph\n * @param localNumNodes number of source nodes that this host was assigned to\n * write\n * @param localNodeBegin global id of first node this host was assigned\n * @param globalEdgeOffset number of edges to skip to get to the first edge\n * this host is responsible for\n * @param localSrcToDest Vector of vectors: the vector at index i specifies\n * the destinations of edges owned by local node i\n * @param localSrcToData Vector of vectors: the vector at index i specifies\n * the data of edges owned by local node i\n */\nvoid writeToLux(const std::string& outputFile, uint64_t totalNumNodes,\n                uint64_t totalNumEdges, uint64_t localNumNodes,\n                uint64_t localNodeBegin, uint64_t globalEdgeOffset,\n                std::vector<std::vector<uint32_t>>& localSrcToDest,\n                std::vector<std::vector<uint32_t>>& localSrcToData);\n\n/**\n * Generates a vector of random uint32_ts.\n *\n * @param count number of numbers to generate\n * @param seed seed to start generating with\n * @param lower lower bound of numbers to generate, inclusive\n * @param upper upper bound of number to generate, inclusive\n * @returns Vector of random uint32_t numbers\n */\nstd::vector<uint32_t> generateRandomNumbers(uint64_t count, uint64_t seed,\n                                            uint64_t lower, uint64_t upper);\n\n/**\n * Gets the offset into the location of the edge data of some edge in a galois\n * binary graph file.\n *\n * @param totalNumNodes total number of nodes in graph\n * @param totalNumEdges total number of edges in graph\n * @param localEdgeBegin the edge to get the offset to\n * @returns offset into location of edge data of localEdgeBegin\n */\nuint64_t 
getOffsetToLocalEdgeData(uint64_t totalNumNodes,\n                                  uint64_t totalNumEdges,\n                                  uint64_t localEdgeBegin);\n\n/**\n * Given some number, get the chunk of that number that this host is responsible\n * for.\n *\n * @param numToSplit the number to chunk among hosts\n * @returns pair specifying the range that this host is responsible for\n */\nUint64Pair getLocalAssignment(uint64_t numToSplit);\n\n/**\n * Given a host to node assignment, send the edges we have to the appropriate\n * place + receieve edges sent by other hosts.\n *\n * @param hostToNodes Vector specifying assignment of nodes to hosts\n * @param localEdges array that represents edges on this host (to keep or to\n * send)\n * @returns 2 structures: one is a vector of vectors where the vector at\n * index i has destination ids for local node i and another vector of vectors\n * similar to the former except with edge weights (if EdgeTy is non-void)\n */\ntemplate <typename EdgeTy>\nPairVoVUint32\nsendAndReceiveAssignedEdges(const std::vector<Uint64Pair>& hostToNodes,\n                            std::vector<uint32_t>& localEdges) {\n  uint32_t hostID = galois::runtime::getSystemNetworkInterface().ID;\n  uint64_t localNumNodes =\n      hostToNodes[hostID].second - hostToNodes[hostID].first;\n\n  sendEdgeCounts<EdgeTy>(hostToNodes, localEdges);\n  std::atomic<uint64_t> edgesToReceive;\n  edgesToReceive.store(receiveEdgeCounts());\n\n  std::cout << \"[\" << hostID << \"] Need to receive \" << edgesToReceive.load()\n            << \" edges\\n\";\n\n  // FIXME ONLY V1 SUPPORT\n  VoVUint32 localSrcToDest(localNumNodes);\n  VoVUint32 localSrcToData;\n  std::vector<std::mutex> nodeLocks(localNumNodes);\n\n  sendAssignedEdges<EdgeTy>(hostToNodes, localEdges, localSrcToDest,\n                            localSrcToData, nodeLocks);\n  freeVector(localEdges);\n  receiveAssignedEdges(edgesToReceive, hostToNodes, localSrcToDest,\n                       
localSrcToData, nodeLocks);\n  return PairVoVUint32(localSrcToDest, localSrcToData);\n}\n\n/**\n * Given a set of disjoint edges, assign/send edges to hosts. Then, each host\n * writes the edges to the specified output file in the Galois binary graph\n * format.\n *\n * @param localEdges Array of edges this host has\n * @param totalNumNodes total number of nodes in entire graph\n * @param totalNumEdges total number of edges in entire graph\n * @param outputFile file to write new graph to\n */\ntemplate <typename EdgeTy>\nvoid assignAndWriteEdges(std::vector<uint32_t>& localEdges,\n                         uint64_t totalNumNodes, uint64_t totalNumEdges,\n                         const std::string& outputFile) {\n  uint32_t hostID = galois::runtime::getSystemNetworkInterface().ID;\n\n  std::vector<Uint64Pair> hostToNodes = getEvenNodeToHostMapping<EdgeTy>(\n      localEdges, totalNumNodes, totalNumEdges);\n\n  PairVoVUint32 receivedEdgeInfo =\n      sendAndReceiveAssignedEdges<EdgeTy>(hostToNodes, localEdges);\n  VoVUint32 localSrcToDest = receivedEdgeInfo.first;\n  VoVUint32 localSrcToData = receivedEdgeInfo.second;\n\n  uint64_t localNodeBegin = hostToNodes[hostID].first;\n  uint64_t localNumNodes  = hostToNodes[hostID].second - localNodeBegin;\n  freeVector(hostToNodes);\n\n  uint64_t totalAssignedEdges = 0;\n  for (unsigned i = 0; i < localNumNodes; i++) {\n    totalAssignedEdges += localSrcToDest[i].size();\n  }\n\n  std::cout << \"[\" << hostID << \"] Will write \" << totalAssignedEdges\n            << \" edges\\n\";\n\n  // calculate global edge offset using edge counts from other hosts\n  std::vector<uint64_t> edgesPerHost = getEdgesPerHost(totalAssignedEdges);\n  uint64_t globalEdgeOffset          = 0;\n  uint64_t totalEdgeCount            = 0;\n  for (unsigned h = 0; h < hostID; h++) {\n    globalEdgeOffset += edgesPerHost[h];\n    totalEdgeCount += edgesPerHost[h];\n  }\n\n  uint64_t totalNumHosts = galois::runtime::getSystemNetworkInterface().Num;\n  // 
finish off getting total edge count (note this is more of a sanity check\n  // since we got total edge count near the beginning already)\n  for (unsigned h = hostID; h < totalNumHosts; h++) {\n    totalEdgeCount += edgesPerHost[h];\n  }\n  GALOIS_ASSERT(totalNumEdges == totalEdgeCount);\n  freeVector(edgesPerHost);\n\n  writeToGr(outputFile, totalNumNodes, totalEdgeCount, localNumNodes,\n            localNodeBegin, globalEdgeOffset, localSrcToDest, localSrcToData);\n}\n\n/**\n * Given a set of disjoint edges, assign/send edges to hosts. Then, each host\n * writes the edges to the specified output file in the Lux binary graph\n * format.\n *\n * @todo merge this with the other assignAndWriteEdges to prevent code\n * duplication\n *\n * @param localEdges Array of edges this host has\n * @param totalNumNodes total number of nodes in entire graph\n * @param totalNumEdges total number of edges in entire graph\n * @param outputFile file to write new graph to\n */\ntemplate <typename EdgeTy>\nvoid assignAndWriteEdgesLux(std::vector<uint32_t>& localEdges,\n                            uint64_t totalNumNodes, uint64_t totalNumEdges,\n                            const std::string& outputFile) {\n  uint32_t hostID = galois::runtime::getSystemNetworkInterface().ID;\n\n  std::vector<Uint64Pair> hostToNodes = getEvenNodeToHostMapping<EdgeTy>(\n      localEdges, totalNumNodes, totalNumEdges);\n\n  PairVoVUint32 receivedEdgeInfo =\n      sendAndReceiveAssignedEdges<EdgeTy>(hostToNodes, localEdges);\n  VoVUint32 localSrcToDest = receivedEdgeInfo.first;\n  VoVUint32 localSrcToData = receivedEdgeInfo.second;\n\n  uint64_t localNodeBegin = hostToNodes[hostID].first;\n  uint64_t localNumNodes  = hostToNodes[hostID].second - localNodeBegin;\n  freeVector(hostToNodes);\n\n  uint64_t totalAssignedEdges = 0;\n  for (unsigned i = 0; i < localNumNodes; i++) {\n    totalAssignedEdges += localSrcToDest[i].size();\n  }\n\n  std::cout << \"[\" << hostID << \"] Will write \" << 
totalAssignedEdges\n            << \" edges\\n\";\n\n  // calculate global edge offset using edge counts from other hosts\n  std::vector<uint64_t> edgesPerHost = getEdgesPerHost(totalAssignedEdges);\n  uint64_t globalEdgeOffset          = 0;\n  uint64_t totalEdgeCount            = 0;\n  for (unsigned h = 0; h < hostID; h++) {\n    globalEdgeOffset += edgesPerHost[h];\n    totalEdgeCount += edgesPerHost[h];\n  }\n\n  uint64_t totalNumHosts = galois::runtime::getSystemNetworkInterface().Num;\n  // finish off getting total edge count (note this is more of a sanity check\n  // since we got total edge count near the beginning already)\n  for (unsigned h = hostID; h < totalNumHosts; h++) {\n    totalEdgeCount += edgesPerHost[h];\n  }\n  GALOIS_ASSERT(totalNumEdges == totalEdgeCount);\n  freeVector(edgesPerHost);\n\n  writeToLux(outputFile, totalNumNodes, totalEdgeCount, localNumNodes,\n             localNodeBegin, globalEdgeOffset, localSrcToDest, localSrcToData);\n}\n#endif\n"
  },
  {
    "path": "tools/dist-graph-convert/dist-graph-convert.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2019, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n// @todo MPI files probably not necessary to use here; just use regular C\n// files and/or mmap\n\n#include <utility>\n\n#include \"galois/DistGalois.h\"\n#include \"llvm/Support/CommandLine.h\"\n\n#include \"dist-graph-convert-helpers.h\"\n\nnamespace cll = llvm::cl;\n\nenum ConvertMode {\n  edgelist2gr,\n  gr2wgr,\n  gr2tgr,\n  gr2sgr,\n  gr2cgr,\n  gr2rgr,\n  tgr2lux,\n  nodemap2binary\n};\n\nenum EdgeType { uint32_, void_ };\n\n////////////////////////////////////////////////////////////////////////////////\n// Command Line Args\n////////////////////////////////////////////////////////////////////////////////\n\nstatic cll::opt<std::string>\n    inputFilename(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<std::string> outputFilename(cll::Positional,\n                                 
           cll::desc(\"<output file>\"),\n                                            cll::init(std::string()));\nstatic cll::opt<EdgeType>\n    edgeType(\"edgeType\", cll::desc(\"Input/Output edge type:\"),\n             cll::values(clEnumValN(EdgeType::uint32_, \"uint32\",\n                                    \"32 bit unsigned int edge values\"),\n                         clEnumValN(EdgeType::void_, \"void\", \"no edge values\")),\n             cll::init(EdgeType::void_));\n\nstatic cll::opt<ConvertMode> convertMode(\n    cll::desc(\"Conversion mode:\"),\n    cll::values(clEnumVal(edgelist2gr, \"Convert edge list to binary gr\"),\n                clEnumVal(gr2wgr,\n                          \"Convert unweighted binary gr to weighted binary gr \"\n                          \"(in-place)\"),\n                clEnumVal(gr2tgr, \"Convert binary gr to transpose binary gr\"),\n                clEnumVal(gr2sgr, \"Convert binary gr to symmetric binary gr\"),\n                clEnumVal(gr2cgr,\n                          \"Convert binary gr to binary gr without self-loops \"\n                          \"or multi-edges; edge data will be ignored\"),\n                clEnumVal(gr2rgr, \"Convert binary gr to randomized binary gr\"),\n                clEnumVal(tgr2lux, \"Convert transpose graph to Lux CSC\"),\n                clEnumVal(nodemap2binary, \"Convert node map into binary form\")),\n    cll::Required);\n\nstatic cll::opt<unsigned long long>\n    totalNumNodes(\"numNodes\", cll::desc(\"Nodes in input graph\"), cll::init(0));\n\nstatic cll::opt<unsigned> threadsToUse(\"t\", cll::desc(\"Threads to use\"),\n                                       cll::init(1));\n\nstatic cll::opt<bool> editInPlace(\"inPlace\",\n                                  cll::desc(\"Flag specifying conversion is in \"\n                                            \"place\"),\n                                  cll::init(false));\nstatic cll::opt<std::string>\n    nodeMapBinary(\"nodeMapBinary\",\n     
             cll::desc(\"Binary file of numbers mapping nodes\"),\n                  cll::init(std::string()));\n\nstatic cll::opt<bool>\n    startAtOne(\"startAtOne\",\n               cll::desc(\"Set this if edgelist nodeid start at 1\"),\n               cll::init(false));\n\nstatic cll::opt<bool>\n    ignoreWeights(\"ignoreWeights\",\n                  cll::desc(\"Set this to ignore edgelist weights\"),\n                  cll::init(false));\n\nstatic cll::opt<bool>\n    keepSelfLoops(\"keepSelfLoops\",\n                  cll::desc(\"Used for graph cleaning: if set, keeps self \"\n                            \"loops instead of removing them\"),\n                  cll::init(false));\n\nstatic cll::opt<bool>\n    cleanCheck(\"cleanCheck\",\n               cll::desc(\"Only checks if graph is clean; no write occurs.\"),\n               cll::init(false));\n\nstatic cll::opt<bool>\n    symNoClean(\"symNoClean\",\n               cll::desc(\"gr2sgr option: if true, does not clean the graph \"\n                         \"after symmetrization\"),\n               cll::init(false));\n\n////////////////////////////////////////////////////////////////////////////////\n// BEGIN CONVERT CODE/STRUCTS\n////////////////////////////////////////////////////////////////////////////////\n\nstruct Conversion {};\n\n/**\n * Convert 1: figure out edge type, then call convert with edge type as\n * an additional template argument.\n */\ntemplate <typename C>\nvoid convert() {\n  C c;\n\n  switch (edgeType) {\n  case EdgeType::uint32_:\n    convert<uint32_t>(c, c);\n    break;\n  case EdgeType::void_:\n    convert<void>(c, c);\n    break;\n  default:\n    abort();\n  };\n}\n\n/**\n * Convert 2 called from convert above: calls convert from the appropriate\n * structure\n */\ntemplate <typename EdgeTy, typename C>\nvoid convert(C& c, Conversion) {\n  auto& net = galois::runtime::getSystemNetworkInterface();\n\n  if (net.ID == 0) {\n    std::cout << \"Input: \" << inputFilename << \"; Output: \" 
<< outputFilename\n              << \"\\n\";\n  }\n\n  galois::runtime::getHostBarrier().wait();\n\n  galois::StatTimer convertTimer(\"Convert Time\", \"convert\");\n  convertTimer.start();\n  c.template convert<EdgeTy>(inputFilename, outputFilename);\n  convertTimer.stop();\n\n  if (net.ID == 0) {\n    galois::gInfo(\"Done with convert\\n\");\n  }\n}\n\n/**\n * Converts an edge list to a Galois binary graph.\n */\nstruct Edgelist2Gr : public Conversion {\n\n  template <typename EdgeTy>\n  void convert(const std::string& inputFile, const std::string& outputFile) {\n    GALOIS_ASSERT((totalNumNodes != 0), \"edgelist2gr needs num nodes\");\n    GALOIS_ASSERT(!(outputFile.empty()), \"edgelist2gr needs an output file\");\n    GALOIS_ASSERT((totalNumNodes <= 4294967296), \"num nodes limit is 2^32\");\n\n    if (ignoreWeights) {\n      GALOIS_ASSERT(std::is_void<EdgeTy>::value,\n                    \"ignoreWeights needs void edgetype\");\n    }\n\n    auto& net       = galois::runtime::getSystemNetworkInterface();\n    uint64_t hostID = net.ID;\n\n    std::ifstream edgeListFile(inputFile.c_str());\n    uint64_t fileSize = getFileSize(edgeListFile);\n    if (hostID == 0) {\n      std::cout << \"File size is \" << fileSize << \"\\n\";\n    }\n\n    uint64_t localStartByte;\n    uint64_t localEndByte;\n    std::tie(localStartByte, localEndByte) =\n        determineByteRange(edgeListFile, fileSize);\n    // load edges into a vector\n    std::vector<uint32_t> localEdges = loadEdgesFromEdgeList<EdgeTy>(\n        edgeListFile, localStartByte, localEndByte, totalNumNodes, startAtOne,\n        ignoreWeights);\n    edgeListFile.close();\n\n    uint64_t totalEdgeCount = accumulateValue(getNumEdges<EdgeTy>(localEdges));\n    if (hostID == 0) {\n      std::cout << \"Total num edges \" << totalEdgeCount << \"\\n\";\n    }\n    assignAndWriteEdges<EdgeTy>(localEdges, totalNumNodes, totalEdgeCount,\n                                outputFile);\n    
galois::runtime::getHostBarrier().wait();\n  }\n};\n\n/**\n * Transpose a Galois binary graph.\n */\nstruct Gr2TGr : public Conversion {\n\n  template <typename EdgeTy>\n  void convert(const std::string& inputFile, const std::string& outputFile) {\n    GALOIS_ASSERT(!(outputFile.empty()), \"gr2tgr needs an output file\");\n    auto& net       = galois::runtime::getSystemNetworkInterface();\n    uint32_t hostID = net.ID;\n\n    uint64_t totalNumNodes;\n    uint64_t totalNumEdges;\n    std::tie(totalNumNodes, totalNumEdges) =\n        readV1GrHeader(inputFile, std::is_void<EdgeTy>::value);\n\n    // get \"read\" assignment of nodes (i.e. nodes this host is responsible for)\n    Uint64Pair nodesToRead;\n    Uint64Pair edgesToRead;\n    std::tie(nodesToRead, edgesToRead) = getNodesToReadFromGr(inputFile);\n    std::cout << \"[\" << hostID << \"] Reads nodes \" << nodesToRead.first\n              << \" to \" << nodesToRead.second << \"\\n\";\n    std::cout << \"[\" << hostID << \"] Reads edges \" << edgesToRead.first\n              << \" to \" << edgesToRead.second << \" (count \"\n              << (edgesToRead.second - edgesToRead.first) << \")\\n\";\n\n    // read edges of assigned nodes using MPI_Graph, load into the same format\n    // used by edgelist2gr; key is to do it TRANSPOSED\n    std::vector<uint32_t> localEdges =\n        loadTransposedEdgesFromBufferedGraph<EdgeTy>(\n            inputFile, nodesToRead, edgesToRead, totalNumNodes, totalNumEdges);\n    // sanity check\n    uint64_t totalEdgeCount = accumulateValue(getNumEdges<EdgeTy>(localEdges));\n    GALOIS_ASSERT(totalEdgeCount == totalNumEdges,\n                  \"edges from metadata doesn't match edges in memory\");\n    assignAndWriteEdges<EdgeTy>(localEdges, totalNumNodes, totalNumEdges,\n                                outputFile);\n\n    galois::runtime::getHostBarrier().wait();\n  }\n};\n\n/**\n * Makes a Galois binary graph symmetric (i.e. 
add a directed edge in the\n * opposite direction for every directed edge)\n */\nstruct Gr2SGr : public Conversion {\n\n  template <typename EdgeTy>\n  void convert(const std::string& inputFile, const std::string& outputFile) {\n    GALOIS_ASSERT(!(outputFile.empty()), \"gr2sgr needs an output file\");\n    auto& net       = galois::runtime::getSystemNetworkInterface();\n    uint32_t hostID = net.ID;\n\n    uint64_t totalNumNodes;\n    uint64_t totalNumEdges;\n    std::tie(totalNumNodes, totalNumEdges) =\n        readV1GrHeader(inputFile, std::is_void<EdgeTy>::value);\n\n    // get \"read\" assignment of nodes (i.e. nodes this host is responsible for)\n    Uint64Pair nodesToRead;\n    Uint64Pair edgesToRead;\n    std::tie(nodesToRead, edgesToRead) = getNodesToReadFromGr(inputFile);\n    std::cout << \"[\" << hostID << \"] Reads nodes \" << nodesToRead.first\n              << \" to \" << nodesToRead.second << \"\\n\";\n    std::cout << \"[\" << hostID << \"] Reads edges \" << edgesToRead.first\n              << \" to \" << edgesToRead.second << \" (count \"\n              << (edgesToRead.second - edgesToRead.first) << \")\\n\";\n\n    // read edges of assigned nodes using MPI_Graph, load into the same format\n    // used by edgelist2gr; key is to load one edge as 2 edges (i.e. 
symmetric)\n    std::vector<uint32_t> localEdges =\n        loadSymmetricEdgesFromBufferedGraph<EdgeTy>(\n            inputFile, nodesToRead, edgesToRead, totalNumNodes, totalNumEdges);\n    // sanity check\n    uint64_t doubleEdgeCount = accumulateValue(getNumEdges<EdgeTy>(localEdges));\n    GALOIS_ASSERT(doubleEdgeCount == 2 * totalNumEdges,\n                  \"data needs to have twice as many edges as original graph\");\n\n    assignAndWriteEdges<EdgeTy>(localEdges, totalNumNodes, doubleEdgeCount,\n                                outputFile);\n    galois::runtime::getHostBarrier().wait();\n  }\n};\n\n/**\n * Adds random weights to a Galois binary graph.\n */\nstruct Gr2WGr : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& inputFile, const std::string& outputFile) {\n    GALOIS_ASSERT(outputFile.empty(), \"gr2wgr doesn't take an output file\");\n    GALOIS_ASSERT(editInPlace, \"You must use -inPlace with gr2wgr\");\n\n    uint64_t totalNumNodes;\n    uint64_t totalNumEdges;\n    std::tie(totalNumNodes, totalNumEdges) =\n        readV1GrHeader(inputFile, std::is_void<EdgeTy>::value);\n\n    uint64_t localEdgeBegin;\n    uint64_t localEdgeEnd;\n    std::tie(localEdgeBegin, localEdgeEnd) = getLocalAssignment(totalNumEdges);\n\n    uint32_t hostID = galois::runtime::getSystemNetworkInterface().ID;\n    std::cout << \"[\" << hostID << \"] Responsible for edges \" << localEdgeBegin\n              << \" to \" << localEdgeEnd << \"\\n\";\n\n    // get edge data to write (random numbers) and get location to start\n    // write\n    uint64_t numLocalEdges = localEdgeEnd - localEdgeBegin;\n    std::vector<uint32_t> edgeDataToWrite =\n        generateRandomNumbers(numLocalEdges, hostID, 1, 100);\n    GALOIS_ASSERT(edgeDataToWrite.size() == numLocalEdges);\n    uint64_t byteOffsetToEdgeData =\n        getOffsetToLocalEdgeData(totalNumNodes, totalNumEdges, localEdgeBegin);\n\n    // do edge data writing\n    MPI_File grInPlace;\n    
MPICheck(MPI_File_open(MPI_COMM_WORLD, inputFile.c_str(), MPI_MODE_RDWR,\n                           MPI_INFO_NULL, &grInPlace));\n    writeEdgeDataData(grInPlace, byteOffsetToEdgeData, edgeDataToWrite);\n    // if host 0 update header with edge size\n    if (hostID == 0) {\n      uint64_t edgeSize = 4;\n      MPICheck(MPI_File_write_at(grInPlace, sizeof(uint64_t), &edgeSize, 1,\n                                 MPI_UINT64_T, MPI_STATUS_IGNORE));\n    }\n    MPICheck(MPI_File_close(&grInPlace));\n  }\n};\n\n/**\n * Cleans graph (no multi-edges, no self-loops).\n *\n * ONLY WORKS ON GRAPHS WITH NO EDGE DATA. (If it does have edge data, it will\n * be ignored.)\n */\nstruct Gr2CGr : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& inputFile, const std::string& outputFile) {\n    GALOIS_ASSERT(std::is_void<EdgeTy>::value,\n                  \"Edge type must be void to clean graph\");\n    if (!cleanCheck) {\n      GALOIS_ASSERT(!(outputFile.empty()), \"gr2cgr needs an output file\");\n    }\n\n    auto& net       = galois::runtime::getSystemNetworkInterface();\n    uint32_t hostID = net.ID;\n\n    uint64_t totalNumNodes;\n    uint64_t totalNumEdges;\n    std::tie(totalNumNodes, totalNumEdges) =\n        readV1GrHeader(inputFile, std::is_void<EdgeTy>::value);\n\n    // get \"read\" assignment of nodes (i.e. 
nodes this host is responsible for)\n    Uint64Pair nodesToRead;\n    Uint64Pair edgesToRead;\n    std::tie(nodesToRead, edgesToRead) = getNodesToReadFromGr(inputFile);\n    std::cout << \"[\" << hostID << \"] Reads nodes \" << nodesToRead.first\n              << \" to \" << nodesToRead.second << \"\\n\";\n    std::cout << \"[\" << hostID << \"] Reads edges \" << edgesToRead.first\n              << \" to \" << edgesToRead.second << \" (count \"\n              << (edgesToRead.second - edgesToRead.first) << \")\\n\";\n\n    std::vector<uint32_t> localEdges = loadCleanEdgesFromBufferedGraph(\n        inputFile, nodesToRead, edgesToRead, totalNumNodes, totalNumEdges,\n        keepSelfLoops);\n    uint64_t cleanEdgeCount = accumulateValue(getNumEdges<EdgeTy>(localEdges));\n    GALOIS_ASSERT(cleanEdgeCount <= totalNumEdges,\n                  \"clean should not increase edge count\");\n\n    if (hostID == 0) {\n      galois::gInfo(\"From \", totalNumEdges, \" edges to \", cleanEdgeCount,\n                    \" edges\\n\");\n    }\n\n    if (cleanCheck) {\n      // only want a clean check; ok to quit here\n      galois::runtime::getHostBarrier().wait();\n      if (hostID == 0) {\n        if (totalNumEdges == cleanEdgeCount) {\n          galois::gInfo(\"Graph is clean\");\n        } else {\n          galois::gInfo(\"Graph is not clean\");\n        }\n      }\n      return;\n    }\n\n    if (totalNumEdges == cleanEdgeCount) {\n      if (hostID == 0) {\n        galois::gInfo(\"IMPORTANT: Graph was already clean; ending convert now\");\n      }\n      return;\n    }\n\n    assignAndWriteEdges<EdgeTy>(localEdges, totalNumNodes, cleanEdgeCount,\n                                outputFile);\n    galois::runtime::getHostBarrier().wait();\n  }\n};\n\n/**\n * Given a binary mapping of node to another node (i.e. 
random mapping), remap\n * the graph vertex order.\n */\nstruct Gr2RGr : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& inputFile, const std::string& outputFile) {\n    GALOIS_ASSERT(!(outputFile.empty()), \"gr2rgr needs an output file\");\n    GALOIS_ASSERT(!(nodeMapBinary.empty()), \"gr2rgr needs binary mapping\");\n\n    auto& net       = galois::runtime::getSystemNetworkInterface();\n    uint32_t hostID = net.ID;\n    if (hostID == 0) {\n      galois::gPrint(\"Node map binary is \", nodeMapBinary, \"\\n\");\n    }\n\n    uint64_t totalNumNodes;\n    uint64_t totalNumEdges;\n    std::tie(totalNumNodes, totalNumEdges) =\n        readV1GrHeader(inputFile, std::is_void<EdgeTy>::value);\n\n    ////////////////////////////////////////////////////////////////////////////\n    // phase 1: remap sources\n    ////////////////////////////////////////////////////////////////////////////\n    galois::gPrint(\"[\", hostID, \"] Source remap phase entering\\n\");\n\n    // get \"read\" assignment of nodes (i.e. 
nodes this host is responsible for)\n    Uint64Pair nodesToRead;\n    Uint64Pair edgesToRead;\n    std::tie(nodesToRead, edgesToRead) = getNodesToReadFromGr(inputFile);\n    // this will remap the source nodes and return a TRANSPOSE edge list\n    std::vector<uint32_t> localEdges =\n        loadMappedSourceEdgesFromBufferedGraph<EdgeTy>(\n            inputFile, nodesToRead, edgesToRead, totalNumNodes, totalNumEdges,\n            nodeMapBinary);\n\n    ////////////////////////////////////////////////////////////////////////////\n    // phase 2: remap destinations\n    ////////////////////////////////////////////////////////////////////////////\n    galois::gPrint(\"[\", hostID, \"] Dest remap phase entering\\n\");\n\n    // make each host remap a relatively even number of destination nodes by\n    // assigning/sending (this is the point of the transpose edge list above)\n    std::vector<Uint64Pair> hostToNodes = getEvenNodeToHostMapping<EdgeTy>(\n        localEdges, totalNumNodes, totalNumEdges);\n\n    PairVoVUint32 receivedEdgeInfo =\n        sendAndReceiveAssignedEdges<EdgeTy>(hostToNodes, localEdges);\n\n    // at this point, localEdges has been freed\n\n    galois::gPrint(\"[\", hostID, \"] Received destinations to remap\\n\");\n\n    VoVUint32 localSrcToDest = receivedEdgeInfo.first;\n    VoVUint32 localSrcToData = receivedEdgeInfo.second;\n\n    uint64_t localNodeBegin = hostToNodes[hostID].first;\n    uint64_t localNumNodes  = hostToNodes[hostID].second - localNodeBegin;\n    freeVector(hostToNodes);\n\n    // At this point, this host has all edges of the destinations it has been\n    // assigned to remap\n    std::vector<uint32_t> node2NewNode =\n        readRandomNodeMapping(nodeMapBinary, localNodeBegin, localNumNodes);\n\n    galois::gPrint(\"[\", hostID, \"] Remapping destinations now\\n\");\n\n    // TODO refactor\n    std::vector<uint32_t> remappedEdges;\n    GALOIS_ASSERT(localNumNodes == localSrcToDest.size());\n    // Go through the received edge 
lists and un-transpose them into a regular\n    // edge list while remapping the destination nodes\n    // (serial loop due to memory concerns)\n    for (unsigned i = 0; i < localNumNodes; i++) {\n      auto& curVector = localSrcToDest[i];\n\n      uint32_t remappedGID = node2NewNode[i];\n      for (unsigned j = 0; j < curVector.size(); j++) {\n        remappedEdges.emplace_back(curVector[j]);\n        remappedEdges.emplace_back(remappedGID);\n\n        if (localSrcToData.size()) {\n          remappedEdges.emplace_back(localSrcToData[i][j]);\n        }\n      }\n      freeVector(curVector);\n      if (localSrcToData.size()) {\n        freeVector(localSrcToData[i]);\n      }\n    }\n    freeVector(localSrcToDest);\n    freeVector(localSrcToData);\n\n    ////////////////////////////////////////////////////////////////////////////\n    // phase 3: write now randomized-node edges to new file\n    ////////////////////////////////////////////////////////////////////////////\n    galois::gPrint(\"[\", hostID, \"] Entering writing phase\\n\");\n\n    // we have the randomized nodes in remappedEdges; execution proceeds\n    // like the other converters from this point on\n    assignAndWriteEdges<EdgeTy>(remappedEdges, totalNumNodes, totalNumEdges,\n                                outputFile);\n    galois::runtime::getHostBarrier().wait();\n  }\n};\n\nstruct Tgr2Lux : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& inputFile, const std::string& outputFile) {\n    GALOIS_ASSERT(!(outputFile.empty()), \"tgr2lux needs an output file\");\n\n    auto& net       = galois::runtime::getSystemNetworkInterface();\n    uint32_t hostID = net.ID;\n\n    uint64_t totalNumNodes;\n    uint64_t totalNumEdges;\n    std::tie(totalNumNodes, totalNumEdges) =\n        readV1GrHeader(inputFile, std::is_void<EdgeTy>::value);\n\n    // get \"read\" assignment of nodes (i.e. 
nodes this host is responsible for)\n    Uint64Pair nodesToRead;\n    Uint64Pair edgesToRead;\n    std::tie(nodesToRead, edgesToRead) = getNodesToReadFromGr(inputFile);\n    std::cout << \"[\" << hostID << \"] Reads nodes \" << nodesToRead.first\n              << \" to \" << nodesToRead.second << \"\\n\";\n    std::cout << \"[\" << hostID << \"] Reads edges \" << edgesToRead.first\n              << \" to \" << edgesToRead.second << \" (count \"\n              << (edgesToRead.second - edgesToRead.first) << \")\\n\";\n\n    // read edges of assigned nodes using MPI_Graph, load into the same format\n    // used by edgelist2gr; key is to do it TRANSPOSED\n    std::vector<uint32_t> localEdges = loadEdgesFromBufferedGraph<EdgeTy>(\n        inputFile, nodesToRead, edgesToRead, totalNumNodes, totalNumEdges);\n    // sanity check\n    uint64_t totalEdgeCount = accumulateValue(getNumEdges<EdgeTy>(localEdges));\n    GALOIS_ASSERT(totalEdgeCount == totalNumEdges,\n                  \"edges from metadata doesn't match edges in memory\");\n    assignAndWriteEdgesLux<EdgeTy>(localEdges, totalNumNodes, totalNumEdges,\n                                   outputFile);\n\n    galois::runtime::getHostBarrier().wait();\n  }\n};\n\n/**\n * Take a line separated list of numbers and convert it into a binary format.\n */\nstruct Nodemap2Binary : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& inputFile, const std::string& outputFile) {\n    // input file = node map\n    GALOIS_ASSERT(!(outputFile.empty()), \"nodemap2binary needs an output file\");\n\n    auto& net       = galois::runtime::getSystemNetworkInterface();\n    uint64_t hostID = net.ID;\n\n    std::ifstream mapFile(inputFile.c_str());\n    uint64_t fileSize = getFileSize(mapFile);\n    if (hostID == 0) {\n      std::cout << \"File size is \" << fileSize << \"\\n\";\n    }\n    uint64_t localStartByte;\n    uint64_t localEndByte;\n    std::tie(localStartByte, localEndByte) =\n        
determineByteRange(mapFile, fileSize);\n    std::vector<uint32_t> nodesToWrite;\n    // read lines until last byte\n    mapFile.seekg(localStartByte);\n    while ((uint64_t(mapFile.tellg()) + 1ul) != localEndByte) {\n      uint32_t node;\n      mapFile >> node;\n      nodesToWrite.emplace_back(node);\n    }\n    mapFile.close();\n\n    std::cout << \"[\" << galois::runtime::getSystemNetworkInterface().ID\n              << \"] Read \" << nodesToWrite.size() << \" numbers\\n\";\n\n    // determine where to start writing using prefix sum of read nodes\n    std::vector<uint64_t> nodesEachHostRead =\n        getEdgesPerHost(nodesToWrite.size());\n\n    for (unsigned i = 1; i < nodesEachHostRead.size(); i++) {\n      nodesEachHostRead[i] += nodesEachHostRead[i - 1];\n    }\n\n    uint64_t fileOffset;\n    if (hostID != 0) {\n      fileOffset = nodesEachHostRead[hostID - 1] * sizeof(uint32_t);\n    } else {\n      fileOffset = 0;\n    }\n\n    // write using mpi\n    MPI_File binaryMap;\n    MPICheck(MPI_File_open(MPI_COMM_WORLD, outputFile.c_str(),\n                           MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL,\n                           &binaryMap));\n    // resuse of functions (misleading name, but it will do what I need which\n    // is write a vector of uint32_ts)\n    writeEdgeDataData(binaryMap, fileOffset, nodesToWrite);\n    MPICheck(MPI_File_close(&binaryMap));\n  }\n};\n\nint main(int argc, char** argv) {\n  galois::DistMemSys G;\n  llvm::cl::ParseCommandLineOptions(argc, argv);\n  galois::setActiveThreads(threadsToUse);\n\n// need to initialize MPI if using LWCI (else already initialized)\n#ifdef GALOIS_USE_LCI\n  int initResult;\n  MPICheck(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &initResult));\n\n  if (initResult < MPI_THREAD_MULTIPLE) {\n    GALOIS_DIE(\"unable to init mpi with thread multiple\");\n  }\n#endif\n\n  if (cleanCheck) {\n    convertMode = gr2cgr;\n  }\n\n  switch (convertMode) {\n  case edgelist2gr:\n    
convert<Edgelist2Gr>();\n    break;\n  case gr2wgr:\n    convert<Gr2WGr>();\n    break;\n  case gr2tgr:\n    convert<Gr2TGr>();\n    break;\n  case gr2sgr:\n    convert<Gr2SGr>();\n\n    // clean graph of multiedges and selfloops\n    // Note: if want to keep self loops use the flag for it\n    if (!symNoClean) {\n      if (galois::runtime::getSystemNetworkInterface().ID == 0) {\n        galois::gInfo(\"Cleaning the newly symmetrized graph\");\n        if (!keepSelfLoops) {\n          galois::gInfo(\"Removing self loops: if want to keep, use the flag\");\n        }\n      }\n      // overwrite new symmetric graph with clean version of self\n      inputFilename = outputFilename.getValue();\n      convert<Gr2CGr>();\n    }\n\n    break;\n  case gr2cgr:\n    convert<Gr2CGr>();\n    break;\n  case gr2rgr:\n    convert<Gr2RGr>();\n    break;\n  case tgr2lux:\n    convert<Tgr2Lux>();\n    break;\n  case nodemap2binary:\n    convert<Nodemap2Binary>();\n    break;\n  default:\n    abort();\n  }\n\n#ifdef GALOIS_USE_LCI\n  MPICheck(MPI_Finalize());\n#endif\n\n  return 0;\n}\n"
  },
  {
    "path": "tools/graph-convert/CMakeLists.txt",
    "content": "add_executable(graph-convert graph-convert.cpp)\ntarget_link_libraries(graph-convert galois_shmem LLVMSupport)\ninstall(TARGETS graph-convert\n  EXPORT GaloisTargets\n  DESTINATION \"${CMAKE_INSTALL_BINDIR}\"\n  COMPONENT tools\n)\n\nfunction(compare_with_sample test_arg compare_arg input expected)\n  set(suffix ${test_arg}${compare_arg}-${input})\n\n  get_filename_component(base_input ${input} NAME)\n\n  add_test(NAME create${suffix}\n    COMMAND graph-convert ${test_arg} ${CMAKE_CURRENT_SOURCE_DIR}/${input} ${base_input}.test\n  )\n  set_tests_properties(create${suffix} PROPERTIES LABELS quick)\n\n  add_test(NAME convert${suffix}\n    COMMAND graph-convert ${compare_arg} ${base_input}.test ${base_input}.compare\n  )\n  set_tests_properties(convert${suffix} PROPERTIES LABELS quick)\n  set_property(TEST convert${suffix} APPEND PROPERTY DEPENDS create${suffix})\n\n  add_test(NAME compare${suffix}\n    COMMAND ${CMAKE_COMMAND} -E compare_files ${base_input}.compare ${CMAKE_CURRENT_SOURCE_DIR}/${expected}\n  )\n  set_tests_properties(compare${suffix} PROPERTIES LABELS quick)\n  set_property(TEST compare${suffix} APPEND PROPERTY DEPENDS convert${suffix})\nendfunction()\n\ncompare_with_sample(-edgelist2gr -gr2edgelist test-inputs/with-blank-lines.edgelist test-inputs/with-blank-lines.edgelist.expected)\ncompare_with_sample(-csv2gr -gr2edgelist test-inputs/sample.csv test-inputs/with-blank-lines.edgelist.expected)\ncompare_with_sample(-edgelist2gr -gr2edgelist test-inputs/with-comments.edgelist test-inputs/with-comments.edgelist.expected)\n\n\nadd_executable(graph-convert-huge graph-convert-huge.cpp)\ntarget_link_libraries(graph-convert-huge galois_shmem LLVMSupport)\nif (TARGET Boost::Boost)\n  target_link_libraries(graph-convert-huge Boost::Boost)\nelse()\n  target_link_libraries(graph-convert-huge Boost::iostreams)\nendif()\ninstall(TARGETS graph-convert-huge\n  EXPORT GaloisTargets\n  DESTINATION \"${CMAKE_INSTALL_BINDIR}\"\n  COMPONENT tools\n)\n"
  },
  {
    "path": "tools/graph-convert/graph-convert-huge.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/LargeArray.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/graphs/OfflineGraph.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <boost/iostreams/filtering_streambuf.hpp>\n#include <boost/iostreams/filter/gzip.hpp>\n#include <boost/mpl/if.hpp>\n#include <algorithm>\n#include <deque>\n#include <fstream>\n#include <iostream>\n#include <ios>\n#include <limits>\n#include <cstdint>\n#include <vector>\n#include <random>\n#include <chrono>\n#include <regex>\n#include <fcntl.h>\n#include <cstdlib>\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<std::string>\n    inputFilename(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<std::string>\n    outputFilename(cll::Positional, cll::desc(\"<output file>\"), cll::Required);\nstatic cll::opt<bool> 
useSmallData(\"32bitData\", cll::desc(\"Use 32 bit data\"),\n                                   cll::init(false));\nstatic cll::opt<bool>\n    edgesSorted(\"edgesSorted\", cll::desc(\"Edges are sorted by the sourceIDs.\"),\n                cll::init(false));\nstatic cll::opt<unsigned long long>\n    numNodes(\"numNodes\", cll::desc(\"Total number of nodes given.\"),\n             cll::init(0));\n\nunion dataTy {\n  int64_t ival;\n  double dval;\n  float fval;\n  int32_t i32val;\n};\n\nvoid perEdge(std::istream& is,\n             std::function<void(uint64_t, uint64_t, dataTy)> fn,\n             std::function<void(uint64_t, uint64_t)> fnPreSize) {\n  std::string line;\n\n  uint64_t bytes      = 0;\n  uint64_t counter    = 0;\n  uint64_t totalBytes = 0;\n\n  const std::regex problemLine(\"^p[[:space:]]+[[:alpha:]]+[[:space:]]+([[:\"\n                               \"digit:]]+)[[:space:]]+([[:digit:]]+)\");\n  const std::regex noData(\n      \"^a?[[:space:]]*([[:digit:]]+)[[:space:]]+([[:digit:]]+)[[:space:]]*\");\n  // const std::regex noData_nospace(\n  // \"^a?[[:space:]]*([[:digit:]]+)[[:space:]]+([[:digit:]]+)\");\n  const std::regex intData(\"^a?[[:space:]]*([[:digit:]]+)[[:space:]]+([[:digit:\"\n                           \"]]+)[[:space:]]+(-?[[:digit:]]+)\");\n  const std::regex floatData(\n      \"^a?[[:space:]]*([[:digit:]]+)[[:space:]]+([[:digit:]]+)[[:space:]]+(-?[[\"\n      \":digit:]]+\\\\.[[:digit:]]+)\");\n\n  auto timer      = std::chrono::system_clock::now();\n  auto timerStart = timer;\n\n  std::smatch matches;\n  bool zeroBased = false; // set to 1 if file is one-indexed\n  bool seenEdge  = false;\n\n  while (std::getline(is, line)) {\n    auto t = line.size() + 1;\n    bytes += t;\n    totalBytes += t;\n    ++counter;\n\n    if (counter == 1024 * 128) {\n      counter     = 0;\n      auto timer2 = std::chrono::system_clock::now();\n      std::cout << \"Scan: \"\n                << (double)bytes /\n                       
std::chrono::duration_cast<std::chrono::microseconds>(\n                           timer2 - timer)\n                           .count()\n                << \" MB/s\\n\";\n      timer = timer2;\n      bytes = 0;\n    }\n\n    dataTy data;\n    bool match = false;\n    if (std::regex_match(line, matches, floatData)) {\n      if (useSmallData)\n        data.fval = std::stof(matches[3].str());\n      else\n        data.dval = std::stod(matches[3].str());\n      match = true;\n    } else if (std::regex_match(line, matches, intData)) {\n      if (useSmallData)\n        data.i32val = std::stoul(matches[3].str());\n      else\n        data.ival = std::stoll(matches[3].str());\n      match = true;\n    } else if (std::regex_match(line, matches,\n                                noData)) { // || std::regex_match(line, matches,\n                                           // noData_nospace)) {\n      data.ival = 0;\n      match     = true;\n    } else if (std::regex_match(line, matches, problemLine)) {\n      if (seenEdge) {\n        std::cerr << \"Error: seeing a dimacs problem line after seeing edges\\n\";\n        abort();\n      }\n      zeroBased = true; // dimacs files are 1-indexed\n      fnPreSize(std::stoull(matches[1].str()), std::stoull(matches[2].str()));\n    }\n    if (match) {\n      seenEdge     = true;\n      uint64_t src = std::stoull(matches[1].str());\n      uint64_t dst = std::stoull(matches[2].str());\n      if (zeroBased) {\n        if (src == 0 || dst == 0) {\n          std::cerr << \"Error: node id 0 in a dimacs graph\\n\";\n          abort();\n        }\n        src -= 1;\n        dst -= 1;\n      }\n      fn(src, dst, data);\n    }\n  }\n  auto timer2 = std::chrono::system_clock::now();\n  std::cout << \"File Scan: \"\n            << (double)totalBytes /\n                   std::chrono::duration_cast<std::chrono::microseconds>(\n                       timer2 - timerStart)\n                       .count()\n            << \" MB/s\\n\";\n}\n\nvoid 
go(std::istream& input) {\n  try {\n    std::deque<uint64_t> edgeCount;\n    perEdge(\n        input,\n        [&edgeCount](uint64_t src, uint64_t, dataTy) {\n          if (edgeCount.size() <= src)\n            edgeCount.resize(src + 1);\n          ++edgeCount[src];\n        },\n        [&edgeCount](uint64_t nodes, uint64_t) { edgeCount.resize(nodes); });\n    input.clear();\n    input.seekg(0, std::ios_base::beg);\n    galois::graphs::OfflineGraphWriter outFile(outputFilename, useSmallData);\n    outFile.setCounts(edgeCount);\n    perEdge(\n        input,\n        [&outFile, &edgeCount](uint64_t src, uint64_t dst, dataTy data) {\n          auto off = --edgeCount[src];\n          if (useSmallData)\n            outFile.setEdge(src, off, dst, data.i32val);\n          else\n            outFile.setEdge(src, off, dst, data.ival);\n        },\n        [](uint64_t, uint64_t) {});\n  } catch (const char* c) {\n    std::cerr << \"Failed with: \" << c << \"\\n\";\n    abort();\n  }\n}\n\nvoid go_edgesSorted(std::istream& input, uint64_t numNodes) {\n  try {\n    std::deque<uint64_t> edgeCount(numNodes, 0);\n    input.clear();\n    input.seekg(0, std::ios_base::beg);\n    galois::graphs::OfflineGraphWriter outFile(outputFilename, useSmallData,\n                                               numNodes);\n    outFile.setCounts(edgeCount);\n    outFile.seekEdgesDstStart();\n    uint64_t curr_src           = 0;\n    uint64_t curr_src_edgeCount = 0;\n    perEdge(\n        input,\n        [&outFile, &edgeCount, &curr_src,\n         &curr_src_edgeCount](uint64_t src, uint64_t dst, dataTy) {\n          if (src == curr_src) {\n            ++curr_src_edgeCount;\n          } else {\n            // std::cout << \"CHANGES : \" << src << \" : \" << curr_src << \"\n            // COUNT : \" << curr_src_edgeCount << \"\\n\";\n            if (src < curr_src) {\n              std::cerr << \" ERROR : File is not sorted\\n\";\n              abort();\n            }\n            edgeCount[curr_src] 
= curr_src_edgeCount;\n            curr_src            = src;\n            curr_src_edgeCount  = 1;\n          }\n          outFile.setEdgeSorted(dst);\n        },\n        [](uint64_t, uint64_t) {});\n    // To take care of the last src node ID.\n    edgeCount[curr_src] = curr_src_edgeCount;\n    outFile.setCounts(edgeCount);\n  } catch (const char* c) {\n    std::cerr << \"Failed with: \" << c << \"\\n\";\n    abort();\n  }\n}\n\nint main(int argc, char** argv) {\n  llvm::cl::ParseCommandLineOptions(argc, argv);\n  //  std::ios_base::sync_with_stdio(false);\n  std::cout << \"Data will be \" << (useSmallData ? 4 : 8) << \" Bytes\\n\";\n\n  std::ifstream infile(inputFilename, std::ios_base::in);\n  if (!infile) {\n    std::cout << \"Failed to open \" << inputFilename << \"\\n\";\n    return 1;\n  }\n\n  // // if (isCompressed(inputType)) {\n  // //   boost::iostreams::filtering_streambuf<boost::iostreams::input> inbuf;\n  // //   inbuf.push(boost::iostreams::gzip_decompressor());\n  // //   inbuf.push(infile);\n  // //   //Convert streambuf to istream\n  // //   std::istream instream(&inbuf);\n  // //   go(instream);\n  // // } else {\n  if (numNodes > 0 && edgesSorted) {\n    go_edgesSorted(infile, numNodes);\n  } else {\n    go(infile);\n  }\n  //  }\n\n  return 0;\n}\n"
  },
  {
    "path": "tools/graph-convert/graph-convert.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/LargeArray.h\"\n#include \"galois/graphs/FileGraph.h\"\n\n#include <llvm/Support/CommandLine.h>\n\n#include <boost/mpl/if.hpp>\n#include <algorithm>\n#include <deque>\n#include <fstream>\n#include <iostream>\n#include <limits>\n#include <cstdint>\n#include <vector>\n#include <random>\n#include <string>\n#include <optional>\n\n#include <fcntl.h>\n#include <cstdlib>\n\n// TODO: move these enums to a common location for all graph convert tools\nenum ConvertMode {\n  bipartitegr2bigpetsc,\n  bipartitegr2littlepetsc,\n  bipartitegr2sorteddegreegr,\n  dimacs2gr,\n  edgelist2gr,\n  csv2gr,\n  gr2biggr,\n  gr2binarypbbs32,\n  gr2binarypbbs64,\n  gr2bsml,\n  gr2cgr,\n  gr2dimacs,\n  gr2adjacencylist,\n  gr2edgelist,\n  gr2edgelist1ind,\n  gr2linegr,\n  gr2lowdegreegr,\n  
gr2mtx,\n  gr2partdstgr,\n  gr2partsrcgr,\n  gr2pbbs,\n  gr2pbbsedges,\n  gr2randgr,\n  gr2randomweightgr,\n  gr2ringgr,\n  gr2rmat,\n  gr2metis,\n  gr2sgr,\n  gr2sorteddegreegr,\n  gr2sorteddstgr,\n  gr2sortedparentdegreegr,\n  gr2sortedweightgr,\n  gr2sortedbfsgr,\n  gr2streegr,\n  gr2tgr,\n  gr2treegr,\n  gr2trigr,\n  gr2totem,\n  gr2neo4j,\n  mtx2gr,\n  nodelist2gr,\n  pbbs2gr,\n  svmlight2gr,\n  edgelist2binary\n};\n\nenum EdgeType { float32_, float64_, int32_, int64_, uint32_, uint64_, void_ };\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<std::string>\n    inputFilename(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<std::string>\n    outputFilename(cll::Positional, cll::desc(\"<output file>\"), cll::Required);\nstatic cll::opt<std::string>\n    transposeFilename(\"graphTranspose\", cll::desc(\"transpose graph file\"),\n                      cll::init(\"\"));\nstatic cll::opt<std::string>\n    outputPermutationFilename(\"outputNodePermutation\",\n                              cll::desc(\"output node permutation file\"),\n                              cll::init(\"\"));\nstatic cll::opt<std::string>\n    labelsFilename(\"labels\", cll::desc(\"labels file for svmlight2gr\"),\n                   cll::init(\"\"));\nstatic cll::opt<EdgeType> edgeType(\n    \"edgeType\", cll::desc(\"Input/Output edge type:\"),\n    cll::values(clEnumValN(EdgeType::float32_, \"float32\",\n                           \"32 bit floating point edge values\"),\n                clEnumValN(EdgeType::float64_, \"float64\",\n                           \"64 bit floating point edge values\"),\n                clEnumValN(EdgeType::int32_, \"int32\", \"32 bit int edge values\"),\n                clEnumValN(EdgeType::int64_, \"int64\", \"64 bit int edge values\"),\n                clEnumValN(EdgeType::uint32_, \"uint32\",\n                           \"32 bit unsigned int edge values\"),\n                clEnumValN(EdgeType::uint64_, \"uint64\",\n                    
       \"64 bit unsigned int edge values\"),\n                clEnumValN(EdgeType::void_, \"void\", \"no edge values\")),\n    cll::init(EdgeType::void_));\nstatic cll::opt<ConvertMode> convertMode(\n    cll::desc(\"Conversion mode:\"),\n    cll::values(\n        clEnumVal(bipartitegr2bigpetsc,\n                  \"Convert bipartite binary gr to big-endian PETSc format\"),\n        clEnumVal(bipartitegr2littlepetsc,\n                  \"Convert bipartite binary gr to little-endian PETSc format\"),\n        clEnumVal(bipartitegr2sorteddegreegr,\n                  \"Sort nodes of bipartite binary gr by degree\"),\n        clEnumVal(dimacs2gr, \"Convert dimacs to binary gr\"),\n        clEnumVal(edgelist2gr, \"Convert edge list to binary gr\"),\n        clEnumVal(csv2gr, \"Convert csv to binary gr\"),\n        clEnumVal(gr2biggr, \"Convert binary gr with little-endian edge data to \"\n                            \"big-endian edge data\"),\n        clEnumVal(gr2binarypbbs32,\n                  \"Convert binary gr to unweighted binary pbbs graph\"),\n        clEnumVal(gr2binarypbbs64,\n                  \"Convert binary gr to unweighted binary pbbs graph\"),\n        clEnumVal(gr2bsml, \"Convert binary gr to binary sparse MATLAB matrix\"),\n        clEnumVal(gr2cgr,\n                  \"Clean up binary gr: remove self edges and multi-edges\"),\n        clEnumVal(gr2dimacs, \"Convert binary gr to dimacs\"),\n        clEnumVal(gr2adjacencylist, \"Convert binary gr to adjacency list\"),\n        clEnumVal(gr2edgelist, \"Convert binary gr to edgelist\"),\n        clEnumVal(gr2edgelist1ind, \"Convert binary gr to edgelist, 1-indexed\"),\n        clEnumVal(gr2linegr, \"Overlay line graph\"),\n        clEnumVal(gr2lowdegreegr, \"Remove high degree nodes from binary gr\"),\n        clEnumVal(gr2mtx, \"Convert binary gr to matrix market format\"),\n        clEnumVal(gr2partdstgr,\n                  \"Partition binary gr in N pieces by destination nodes\"),\n        
clEnumVal(gr2partsrcgr,\n                  \"Partition binary gr in N pieces by source nodes\"),\n        clEnumVal(gr2pbbs, \"Convert binary gr to pbbs graph\"),\n        clEnumVal(gr2pbbsedges, \"Convert binary gr to pbbs edge list\"),\n        clEnumVal(gr2randgr, \"Randomly permute nodes of binary gr\"),\n        clEnumVal(gr2randomweightgr, \"Add or Randomize edge weights\"),\n        clEnumVal(gr2ringgr, \"Convert binary gr to strongly connected graph by \"\n                             \"adding ring overlay\"),\n        clEnumVal(gr2rmat, \"Convert binary gr to RMAT graph\"),\n        clEnumVal(gr2metis, \"Convert binary gr to METIS graph (unweighted)\"),\n        clEnumVal(\n            gr2sgr,\n            \"Convert binary gr to symmetric graph by adding reverse edges\"),\n        clEnumVal(gr2sorteddegreegr, \"Sort nodes by degree\"),\n        clEnumVal(gr2sorteddstgr,\n                  \"Sort outgoing edges of binary gr by edge destination\"),\n        clEnumVal(gr2sortedparentdegreegr, \"Sort nodes by degree of parent\"),\n        clEnumVal(gr2sortedweightgr,\n                  \"Sort outgoing edges of binary gr by edge weight\"),\n        clEnumVal(gr2sortedbfsgr,\n                  \"Sort nodes by a BFS traversal from the source (greedy)\"),\n        clEnumVal(gr2streegr, \"Convert binary gr to strongly connected graph \"\n                              \"by adding symmetric tree overlay\"),\n        clEnumVal(gr2tgr, \"Transpose binary gr\"),\n        clEnumVal(gr2treegr, \"Overlay tree\"),\n        clEnumVal(gr2trigr, \"Convert symmetric binary gr to triangular form by \"\n                            \"removing reverse edges\"),\n        clEnumVal(gr2totem, \"Convert binary gr totem input format\"),\n        clEnumVal(gr2neo4j, \"Convert binary gr to a vertex/edge csv for neo4j\"),\n        clEnumVal(mtx2gr, \"Convert matrix market format to binary gr\"),\n        clEnumVal(nodelist2gr, \"Convert node list to binary gr\"),\n        
clEnumVal(pbbs2gr, \"Convert pbbs graph to binary gr\"),\n        clEnumVal(svmlight2gr, \"Convert svmlight file to binary gr\"),\n        clEnumVal(edgelist2binary,\n                  \"Convert edge list to binary edgelist \"\n                  \"format (assumes vertices of type uint32_t)\")),\n    cll::Required);\nstatic cll::opt<uint32_t>\n    sourceNode(\"sourceNode\", cll::desc(\"Source node ID for BFS traversal\"),\n               cll::init(0));\nstatic cll::opt<int>\n    numParts(\"numParts\", cll::desc(\"number of parts to partition graph into\"),\n             cll::init(64));\nstatic cll::opt<int> maxValue(\"maxValue\",\n                              cll::desc(\"maximum weight to add for tree, line, \"\n                                        \"ring and random weight conversions\"),\n                              cll::init(100));\nstatic cll::opt<int>\n    minValue(\"minValue\",\n             cll::desc(\"minimum weight to add for random weight conversions\"),\n             cll::init(1));\nstatic cll::opt<int> maxDegree(\"maxDegree\", cll::desc(\"maximum degree to keep\"),\n                               cll::init(2 * 1024));\n\nstruct Conversion {};\nstruct HasOnlyVoidSpecialization {};\nstruct HasNoVoidSpecialization {};\n\ntemplate <typename EdgeTy, typename C>\nvoid convert(C& c, Conversion) {\n  c.template convert<EdgeTy>(inputFilename, outputFilename);\n}\n\ntemplate <typename EdgeTy, typename C>\nvoid convert(\n    C& c, HasOnlyVoidSpecialization,\n    typename std::enable_if<std::is_same<EdgeTy, void>::value>::type* = 0) {\n  c.template convert<EdgeTy>(inputFilename, outputFilename);\n}\n\ntemplate <typename EdgeTy, typename C>\nvoid convert(\n    C&, HasOnlyVoidSpecialization,\n    typename std::enable_if<!std::is_same<EdgeTy, void>::value>::type* = 0) {\n  GALOIS_DIE(\"conversion undefined for non-void graphs\");\n}\n\ntemplate <typename EdgeTy, typename C>\nvoid convert(\n    C& c, HasNoVoidSpecialization,\n    typename 
std::enable_if<!std::is_same<EdgeTy, void>::value>::type* = 0) {\n  c.template convert<EdgeTy>(inputFilename, outputFilename);\n}\n\ntemplate <typename EdgeTy, typename C>\nvoid convert(\n    C&, HasNoVoidSpecialization,\n    typename std::enable_if<std::is_same<EdgeTy, void>::value>::type* = 0) {\n  GALOIS_DIE(\"conversion undefined for void graphs\");\n}\n\nstatic std::string edgeTypeToName(EdgeType e) {\n  switch (e) {\n  case EdgeType::float32_:\n    return \"float32\";\n  case EdgeType::float64_:\n    return \"float64\";\n  case EdgeType::int32_:\n    return \"int32\";\n  case EdgeType::int64_:\n    return \"int64\";\n  case EdgeType::uint32_:\n    return \"uint32\";\n  case EdgeType::uint64_:\n    return \"uint64\";\n  case EdgeType::void_:\n    return \"void\";\n  default:\n    abort();\n  }\n}\n\ntemplate <typename C>\nvoid convert() {\n  C c;\n  std::cout << \"Graph type: \" << edgeTypeToName(edgeType) << \"\\n\";\n  switch (edgeType) {\n  case EdgeType::float32_:\n    convert<float>(c, c);\n    break;\n  case EdgeType::float64_:\n    convert<double>(c, c);\n    break;\n  case EdgeType::int32_:\n    convert<int32_t>(c, c);\n    break;\n  case EdgeType::int64_:\n    convert<int64_t>(c, c);\n    break;\n  case EdgeType::uint32_:\n    convert<uint32_t>(c, c);\n    break;\n  case EdgeType::uint64_:\n    convert<uint64_t>(c, c);\n    break;\n  case EdgeType::void_:\n    convert<void>(c, c);\n    break;\n  default:\n    abort();\n  };\n}\n\nstatic void printStatus(size_t inNodes, size_t inEdges, size_t outNodes,\n                        size_t outEdges) {\n  std::cout << \"InGraph : |V| = \" << inNodes << \", |E| = \" << inEdges << \"\\n\";\n  std::cout << \"OutGraph: |V| = \" << outNodes << \", |E| = \" << outEdges << \"\\n\";\n}\n\nstatic void printStatus(size_t inNodes, size_t inEdges) {\n  printStatus(inNodes, inEdges, inNodes, inEdges);\n}\n\ntemplate <typename EdgeValues, bool Enable>\nvoid setEdgeValue(EdgeValues& edgeValues, int value,\n                  
typename std::enable_if<Enable>::type* = 0) {\n  edgeValues.set(0, static_cast<typename EdgeValues::value_type>(value));\n}\n\ntemplate <typename EdgeValues, bool Enable>\nvoid setEdgeValue(EdgeValues&, int,\n                  typename std::enable_if<!Enable>::type* = 0) {}\n\ntemplate <typename EdgeTy, bool Enable>\nEdgeTy getEdgeValue(galois::graphs::FileGraph& g,\n                    galois::graphs::FileGraph::edge_iterator ii,\n                    typename std::enable_if<Enable>::type* = 0) {\n  return g.getEdgeData<EdgeTy>(ii);\n}\n\ntemplate <typename EdgeTy, bool Enable>\nint getEdgeValue(galois::graphs::FileGraph&,\n                 galois::graphs::FileGraph::edge_iterator,\n                 typename std::enable_if<!Enable>::type* = 0) {\n  return 1;\n}\n\ntemplate <typename T>\nvoid outputPermutation(const T& perm) {\n  size_t oid = 0;\n  std::ofstream out(outputPermutationFilename);\n  for (auto ii = perm.begin(), ei = perm.end(); ii != ei; ++ii, ++oid) {\n    out << oid << \",\" << *ii << \"\\n\";\n  }\n}\n\nvoid skipLine(std::ifstream& infile) {\n  infile.ignore(std::numeric_limits<std::streamsize>::max(), '\\n');\n}\n\n/**\n * Common parsing for edgelist style text files.\n *\n * src dst [weight]\n * ...\n *\n * If delim is set, this function expects that each entry is separated by delim\n * surrounded by optional whitespace.\n */\ntemplate <typename EdgeTy>\nvoid convertEdgelist(const std::string& infilename,\n                     const std::string& outfilename, const bool skipFirstLine,\n                     std::optional<char> delim) {\n  typedef galois::graphs::FileGraphWriter Writer;\n\n  Writer p;\n  std::ifstream infile(infilename.c_str());\n\n  size_t numNodes   = 0;\n  size_t numEdges   = 0;\n  size_t lineNumber = 0;\n\n  if (skipFirstLine) {\n    galois::gWarn(\n        \"first line is assumed to contain labels and will be ignored\\n\");\n    skipLine(infile);\n    ++lineNumber;\n  }\n\n  const bool hasDelim = static_cast<bool>(delim);\n  
std::optional<size_t> skippedLine;\n  std::string line;\n  char readDelim;\n\n  for (; std::getline(infile, line); ++lineNumber) {\n    std::stringstream iss(line);\n\n    size_t src;\n    if (!(iss >> src)) {\n      skippedLine = lineNumber;\n      continue;\n    }\n\n    if (hasDelim) {\n      if (!(iss >> readDelim) || readDelim != delim) {\n        skippedLine = lineNumber;\n        continue;\n      }\n    }\n\n    size_t dst;\n    if (!(iss >> dst)) {\n      skippedLine = lineNumber;\n      continue;\n    }\n\n    if constexpr (!std::is_void<EdgeTy>::value) {\n      EdgeTy data{};\n      if (hasDelim) {\n        if (!(iss >> readDelim) || readDelim != delim) {\n          skippedLine = lineNumber;\n          continue;\n        }\n      }\n\n      if (!(iss >> data)) {\n        skippedLine = lineNumber;\n        continue;\n      }\n    }\n\n    if (infile) {\n      ++numEdges;\n      if (src > numNodes)\n        numNodes = src;\n      if (dst > numNodes)\n        numNodes = dst;\n    }\n  }\n\n  if (skippedLine) {\n    galois::gWarn(\"ignored at least one line (line \", *skippedLine,\n                  \") because it did not match the expected format\\n\");\n  }\n\n  numNodes++;\n  p.setNumNodes(numNodes);\n  p.setNumEdges<EdgeTy>(numEdges);\n\n  infile.clear();\n  infile.seekg(0, std::ios::beg);\n  p.phase1();\n\n  if (skipFirstLine) {\n    skipLine(infile);\n  }\n\n  while (std::getline(infile, line)) {\n    std::stringstream iss(line);\n\n    size_t src;\n    if (!(iss >> src)) {\n      continue;\n    }\n\n    if (hasDelim) {\n      if (!(iss >> readDelim) || readDelim != delim) {\n        continue;\n      }\n    }\n\n    size_t dst;\n    if (!(iss >> dst)) {\n      continue;\n    }\n\n    if constexpr (!std::is_void<EdgeTy>::value) {\n      EdgeTy data{};\n      if (hasDelim) {\n        if (!(iss >> readDelim) || readDelim != delim) {\n          continue;\n        }\n      }\n\n      if (!(iss >> data)) {\n        continue;\n      }\n    }\n\n    if (infile) 
{\n      p.incrementDegree(src);\n    }\n  }\n\n  infile.clear();\n  infile.seekg(0, std::ios::beg);\n  p.phase2();\n\n  if (skipFirstLine) {\n    skipLine(infile);\n  }\n\n  while (std::getline(infile, line)) {\n    std::stringstream iss(line);\n\n    size_t src;\n    if (!(iss >> src)) {\n      continue;\n    }\n\n    if (hasDelim) {\n      if (!(iss >> readDelim) || readDelim != delim) {\n        continue;\n      }\n    }\n\n    size_t dst;\n    if (!(iss >> dst)) {\n      continue;\n    }\n\n    if constexpr (!std::is_void<EdgeTy>::value) {\n      EdgeTy data{};\n      if (hasDelim) {\n        if (!(iss >> readDelim) || readDelim != delim) {\n          continue;\n        }\n      }\n\n      if (!(iss >> data)) {\n        continue;\n      }\n\n      if (infile) {\n        p.addNeighbor<EdgeTy>(src, dst, data);\n      }\n    } else {\n      if (infile) {\n        p.addNeighbor(src, dst);\n      }\n    }\n  }\n\n  p.finish();\n\n  p.toFile(outfilename);\n  printStatus(numNodes, numEdges);\n}\n\ntemplate <typename EdgeTy>\nvoid convertEdgelist(const std::string& infilename,\n                     const std::string& outfilename, const bool skipFirstLine) {\n  convertEdgelist<EdgeTy>(infilename, outfilename, skipFirstLine,\n                          std::optional<char>());\n}\n\n/**\n * Assumption: First line has labels\n * Just a bunch of pairs or triples:\n * src dst weight?\n */\nstruct CSV2Gr : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    convertEdgelist<EdgeTy>(infilename, outfilename, true, ',');\n  }\n};\n\n/**\n * Just a bunch of pairs or triples:\n * src dst weight?\n */\nstruct Edgelist2Gr : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    convertEdgelist<EdgeTy>(infilename, outfilename, false);\n  }\n};\n\n/**\n * Convert edgelist to binary edgelist format\n * Assumes no edge 
data.\n */\nstruct Edgelist2Binary : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    std::ifstream infile(infilename.c_str());\n    std::ofstream outfile(outfilename.c_str());\n\n    size_t numNodes = 0;\n    size_t numEdges = 0;\n\n    std::vector<uint32_t> buffer(10000);\n    uint32_t counter = 0;\n    bool skippedLine = false;\n    while (infile) {\n      uint32_t src;\n      if (!(infile >> src)) {\n        skipLine(infile);\n        skippedLine = true;\n        continue;\n      }\n\n      uint32_t dst;\n      if (!(infile >> dst)) {\n        skipLine(infile);\n        skippedLine = true;\n        continue;\n      }\n\n      buffer[counter++] = src;\n      buffer[counter++] = dst;\n      if (counter == buffer.size()) {\n        // flush it to the output file.\n        outfile.write(reinterpret_cast<char*>(&buffer[0]),\n                      sizeof(uint32_t) * counter);\n        counter = 0;\n      }\n\n      if (infile) {\n        ++numEdges;\n        if (src > numNodes)\n          numNodes = src;\n        if (dst > numNodes)\n          numNodes = dst;\n      } else {\n        counter -= 2;\n      }\n    }\n\n    if (counter) {\n      // flush it to the output file.\n      outfile.write(reinterpret_cast<char*>(&buffer[0]),\n                    sizeof(uint32_t) * counter);\n    }\n\n    if (skippedLine) {\n      galois::gWarn(\"ignored at least one line because it did not match the \"\n                    \"expected format\\n\");\n    }\n\n    printStatus(numNodes, numEdges);\n  }\n};\n\n/**\n * Convert matrix market matrix to binary graph.\n *\n * %% comments\n * % ...\n * <num nodes> <num nodes> <num edges>\n * <src> <dst> <float>\n *\n * src and dst start at 1.\n */\nstruct Mtx2Gr : public HasNoVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraphWriter 
Writer;\n\n    Writer p;\n    uint32_t nnodes;\n    size_t nedges;\n\n    for (int phase = 0; phase < 2; ++phase) {\n      std::ifstream infile(infilename.c_str());\n      if (!infile) {\n        GALOIS_DIE(\"failed to open input file\");\n      }\n\n      // Skip comments\n      while (infile) {\n        if (infile.peek() != '%') {\n          break;\n        }\n        skipLine(infile);\n      }\n\n      // Read header\n      char header[256];\n      infile.getline(header, 256);\n      std::istringstream line(header, std::istringstream::in);\n      std::vector<std::string> tokens;\n      while (line) {\n        std::string tmp;\n        line >> tmp;\n        if (line) {\n          tokens.push_back(tmp);\n        }\n      }\n      if (tokens.size() != 3) {\n        GALOIS_DIE(\"unknown problem specification line: \", line.str());\n      }\n      // Prefer C functions for maximum compatibility\n      // nnodes = std::stoull(tokens[0]);\n      // nedges = std::stoull(tokens[2]);\n      nnodes = strtoull(tokens[0].c_str(), NULL, 0);\n      nedges = strtoull(tokens[2].c_str(), NULL, 0);\n\n      // Parse edges\n      if (phase == 0) {\n        p.setNumNodes(nnodes);\n        p.setNumEdges<EdgeTy>(nedges);\n        p.phase1();\n      } else {\n        p.phase2();\n      }\n\n      for (size_t edge_num = 0; edge_num < nedges; ++edge_num) {\n        if ((nedges / 500 > 0) && (edge_num % (nedges / 500)) == 0) {\n          printf(\"Phase %d: current edge progress %lf%%\\n\", phase,\n                 ((double)edge_num / nedges) * 100);\n        }\n        uint32_t cur_id, neighbor_id;\n        double weight = 1;\n\n        infile >> cur_id >> neighbor_id >> weight;\n        if (cur_id == 0 || cur_id > nnodes) {\n          GALOIS_DIE(\"node id out of range: \", cur_id);\n        }\n        if (neighbor_id == 0 || neighbor_id > nnodes) {\n          GALOIS_DIE(\"neighbor id out of range: \", neighbor_id);\n        }\n\n        // 1 indexed\n        if (phase == 0) {\n          
p.incrementDegree(cur_id - 1);\n        } else {\n          if constexpr (std::is_void<EdgeTy>::value) {\n            p.addNeighbor(cur_id - 1, neighbor_id - 1);\n          } else {\n            p.addNeighbor<EdgeTy>(cur_id - 1, neighbor_id - 1,\n                                  static_cast<EdgeTy>(weight));\n          }\n        }\n\n        skipLine(infile);\n      }\n\n      infile.peek();\n      if (!infile.eof()) {\n        GALOIS_DIE(\"additional lines in file\");\n      }\n    }\n    // this is for the progress print\n\n    p.finish();\n\n    p.toFile(outfilename);\n    printStatus(p.size(), p.sizeEdges());\n  }\n};\n\nstruct Gr2Mtx : public HasNoVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    std::ofstream file(outfilename.c_str());\n    file << graph.size() << \" \" << graph.size() << \" \" << graph.sizeEdges()\n         << \"\\n\";\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        double v  = static_cast<double>(graph.getEdgeData<EdgeTy>(jj));\n        file << src + 1 << \" \" << dst + 1 << \" \" << v << \"\\n\";\n      }\n    }\n    file.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\n/**\n * List of node adjacencies:\n *\n * <node id> <num neighbors> <neighbor id>*\n * ...\n */\nstruct Nodelist2Gr : public HasOnlyVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    static_assert(std::is_same<EdgeTy, void>::value,\n                  \"conversion undefined for non-void 
graphs\");\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Writer p;\n    std::ifstream infile(infilename.c_str());\n\n    size_t numNodes = 0;\n    size_t numEdges = 0;\n\n    while (infile) {\n      size_t src;\n      size_t numNeighbors;\n\n      infile >> src >> numNeighbors;\n\n      if (infile) {\n        if (src > numNodes)\n          numNodes = src;\n        numEdges += numNeighbors;\n      }\n      skipLine(infile);\n    }\n\n    numNodes++;\n    p.setNumNodes(numNodes);\n    p.setNumEdges<void>(numEdges);\n\n    infile.clear();\n    infile.seekg(0, std::ios::beg);\n    p.phase1();\n    while (infile) {\n      size_t src;\n      size_t numNeighbors;\n\n      infile >> src >> numNeighbors;\n\n      if (infile) {\n        p.incrementDegree(src, numNeighbors);\n      }\n      skipLine(infile);\n    }\n\n    infile.clear();\n    infile.seekg(0, std::ios::beg);\n    p.phase2();\n    while (infile) {\n      size_t src;\n      size_t numNeighbors;\n\n      infile >> src >> numNeighbors;\n\n      for (; infile && numNeighbors > 0; --numNeighbors) {\n        size_t dst;\n        infile >> dst;\n        if (infile)\n          p.addNeighbor(src, dst);\n      }\n\n      skipLine(infile);\n    }\n\n    p.finish();\n\n    p.toFile(outfilename);\n    printStatus(numNodes, numEdges);\n  }\n};\n\nstruct Gr2Adjacencylist : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    std::ofstream file(outfilename.c_str());\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      file << src;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        file << 
\" \" << dst;\n      }\n      file << \"\\n\";\n    }\n    file.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\nstruct Gr2Edgelist : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::LargeArray<EdgeTy> EdgeData;\n    typedef typename EdgeData::value_type edge_value_type;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    std::ofstream file(outfilename.c_str());\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if constexpr (!std::is_void<EdgeTy>::value) {\n          file << src << \" \" << dst << \" \"\n               << graph.getEdgeData<edge_value_type>(jj) << \"\\n\";\n        } else {\n          file << src << \" \" << dst << \"\\n\";\n        }\n      }\n    }\n    file.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\n/**\n * Edge list conversion from gr except all ids are incremented by 1 (i.e.\n * 1 indexing).\n */\nstruct Gr2Edgelist1Ind : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    using Graph           = galois::graphs::FileGraph;\n    using GNode           = Graph::GraphNode;\n    using EdgeData        = galois::LargeArray<EdgeTy>;\n    using edge_value_type = typename EdgeData::value_type;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    std::ofstream file(outfilename.c_str());\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                 
               ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if constexpr (!std::is_void<EdgeTy>::value) {\n          file << (src + 1) << \" \" << (dst + 1) << \" \"\n               << graph.getEdgeData<edge_value_type>(jj) << \"\\n\";\n        } else {\n          file << (src + 1) << \" \" << (dst + 1) << \"\\n\";\n        }\n      }\n    }\n    file.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\ntemplate <bool LittleEndian, typename T>\nvoid writeEndian(T* out, T value) {\n  static_assert(sizeof(T) == 4 || sizeof(T) == 8, \"unknown data size\");\n  switch ((sizeof(T) == 4 ? 0 : 2) + (LittleEndian ? 0 : 1)) {\n  case 3:\n    value = galois::convert_htobe64(value);\n    break;\n  case 2:\n    value = galois::convert_htole64(value);\n    break;\n  case 1:\n    value = galois::convert_htobe32(value);\n    break;\n  case 0:\n    value = galois::convert_htole32(value);\n    break;\n  default:\n    abort();\n  }\n\n  *out = value;\n}\n\ntemplate <bool LittleEndian, typename T>\nvoid writeEndian(std::ofstream& out, T value) {\n  static_assert(sizeof(T) == 4 || sizeof(T) == 8, \"unknown data size\");\n  switch ((sizeof(T) == 4 ? 0 : 2) + (LittleEndian ? 
0 : 1)) {\n  case 3:\n    value = galois::convert_htobe64(value);\n    break;\n  case 2:\n    value = galois::convert_htole64(value);\n    break;\n  case 1:\n    value = galois::convert_htobe32(value);\n    break;\n  case 0:\n    value = galois::convert_htole32(value);\n    break;\n  default:\n    abort();\n  }\n\n  out.write(reinterpret_cast<char*>(&value), sizeof(value));\n}\n\ntemplate <typename OutEdgeTy, bool LittleEndian>\nstruct Bipartitegr2Petsc : public HasNoVoidSpecialization {\n  template <typename InEdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    size_t partition = 0;\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei;\n         ++ii, ++partition) {\n      GNode src = *ii;\n      if (graph.edge_begin(src) == graph.edge_end(src)) {\n        break;\n      }\n    }\n\n    std::ofstream file(outfilename.c_str());\n    writeEndian<LittleEndian, int32_t>(file, 1211216);\n    writeEndian<LittleEndian, int32_t>(file, partition); // rows\n    writeEndian<LittleEndian, int32_t>(file,\n                                       graph.size() - partition); // columns\n    writeEndian<LittleEndian, int32_t>(file, graph.sizeEdges());\n\n    // number of nonzeros in each row\n    for (Graph::iterator ii = graph.begin(), ei = ii + partition; ii != ei;\n         ++ii) {\n      GNode src = *ii;\n      writeEndian<LittleEndian, int32_t>(\n          file, std::distance(graph.edge_begin(src), graph.edge_end(src)));\n    }\n\n    // column indices\n    for (Graph::iterator ii = graph.begin(), ei = ii + partition; ii != ei;\n         ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        
writeEndian<LittleEndian, int32_t>(file, dst - partition);\n      }\n    }\n\n    // values\n    for (Graph::iterator ii = graph.begin(), ei = ii + partition; ii != ei;\n         ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        writeEndian<LittleEndian, OutEdgeTy>(file,\n                                             graph.getEdgeData<InEdgeTy>(jj));\n      }\n    }\n    file.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\n//! Wrap generator into the form expected by std::random_shuffle\ntemplate <typename T, typename Gen, template <typename> class Dist>\nstruct UniformDist {\n  Gen& gen;\n\n  UniformDist(Gen& g) : gen(g) {}\n  T operator()(T m) {\n    Dist<T> r(0, m - 1);\n    return r(gen);\n  }\n};\n\nstruct RandomizeNodes : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::LargeArray<GNode> Permutation;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    Permutation perm;\n    perm.create(graph.size());\n    std::copy(boost::counting_iterator<GNode>(0),\n              boost::counting_iterator<GNode>(graph.size()), perm.begin());\n    std::random_device rng;\n    std::mt19937 urng(rng());\n    std::shuffle(perm.begin(), perm.end(), urng);\n\n    Graph out;\n    galois::graphs::permute<EdgeTy>(graph, perm, out);\n    outputPermutation(perm);\n\n    out.toFile(outfilename);\n    printStatus(out.size(), out.sizeEdges());\n  }\n};\n\nstruct SortByBFS : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::LargeArray<GNode> Permutation;\n\n    
Graph graph;\n    graph.fromFile(infilename);\n\n    Permutation perm;\n    perm.create(graph.size());\n    GNode perm_index = 0;\n\n    // perform a BFS traversal\n    std::vector<GNode> curr, next;\n    galois::LargeArray<bool> visited;\n    visited.create(graph.size());\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode node    = *ii;\n      visited[node] = false;\n    }\n    GNode src    = sourceNode;\n    visited[src] = true;\n    next.push_back(src);\n    while (!next.empty()) {\n      size_t wl_size = next.size();\n      curr.resize(wl_size);\n      std::copy(next.begin(), next.end(), curr.begin());\n      next.clear();\n      for (size_t i = 0; i < wl_size; ++i) {\n        GNode node = curr[i];\n        perm[node] = perm_index++;\n        for (Graph::edge_iterator jj = graph.edge_begin(node),\n                                  ej = graph.edge_end(node);\n             jj != ej; ++jj) {\n          GNode dst = graph.getEdgeDst(jj);\n          if (visited[dst] == false) {\n            visited[dst] = true;\n            next.push_back(dst);\n          }\n        }\n      }\n    }\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode node = *ii;\n      if (visited[node] == false) {\n        perm[node] = perm_index++;\n      }\n    }\n    assert(perm_index == graph.size());\n\n    Graph out;\n    galois::graphs::permute<EdgeTy>(graph, perm, out);\n    outputPermutation(perm);\n\n    out.toFile(outfilename);\n    printStatus(out.size(), out.sizeEdges());\n  }\n};\n\ntemplate <typename T, bool IsInteger = std::numeric_limits<T>::is_integer>\nstruct UniformDistribution {};\n\ntemplate <typename T>\nstruct UniformDistribution<T, true> {\n  std::uniform_int_distribution<T> dist;\n\n  UniformDistribution(int a, int b) : dist(a, b) {}\n  template <typename Gen>\n  T operator()(Gen& g) {\n    return dist(g);\n  }\n};\n\ntemplate <typename T>\nstruct UniformDistribution<T, false> {\n  
std::uniform_real_distribution<T> dist;\n\n  UniformDistribution(int a, int b) : dist(a, b) {}\n  template <typename Gen>\n  T operator()(Gen& g) {\n    return dist(g);\n  }\n};\n\nstruct RandomizeEdgeWeights : public HasNoVoidSpecialization {\n  template <typename OutEdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n\n    Graph graph;\n    Graph outgraph;\n\n    graph.fromFile(infilename);\n    OutEdgeTy* edgeData    = outgraph.fromGraph<OutEdgeTy>(graph);\n    OutEdgeTy* edgeDataEnd = edgeData + graph.sizeEdges();\n\n    std::mt19937 gen;\n    UniformDistribution<OutEdgeTy> dist(minValue, maxValue);\n    for (; edgeData != edgeDataEnd; ++edgeData) {\n      *edgeData = dist(gen);\n    }\n\n    outgraph.toFile(outfilename);\n    printStatus(graph.size(), graph.sizeEdges(), outgraph.size(),\n                outgraph.sizeEdges());\n  }\n};\n\n/**\n * Add edges (i, i-1) for all i \\in V.\n */\ntemplate <bool AddLine>\nstruct AddRing : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef galois::graphs::FileGraphWriter Writer;\n    typedef typename Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    Writer p;\n\n    uint64_t size     = graph.size();\n    uint64_t newEdges = AddLine ? 
size - 1 : size;\n    p.setNumNodes(size);\n    p.setNumEdges<EdgeTy>(graph.sizeEdges() + newEdges);\n\n    p.phase1();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      auto d    = std::distance(graph.edge_begin(src), graph.edge_end(src));\n      if (AddLine && src == 0)\n        p.incrementDegree(src, d);\n      else\n        p.incrementDegree(src, d + 1);\n    }\n\n    p.phase2();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if constexpr (std::is_void<EdgeTy>::value) {\n          p.addNeighbor(src, dst);\n        } else {\n          p.addNeighbor<EdgeTy>(src, dst, graph.getEdgeData<EdgeTy>(jj));\n        }\n      }\n\n      if (AddLine && src == 0)\n        continue;\n\n      GNode dst = src == 0 ? 
size - 1 : src - 1;\n      if constexpr (std::is_void<EdgeTy>::value) {\n        p.addNeighbor(src, dst);\n      } else {\n        p.addNeighbor<EdgeTy>(src, dst, maxValue);\n      }\n    }\n\n    p.finish();\n\n    p.toFile(outfilename);\n    printStatus(graph.size(), graph.sizeEdges(), p.size(), p.sizeEdges());\n  }\n};\n\n/**\n * Add edges (i, i*2+1), (i, i*2+2) and their complement.\n */\ntemplate <bool AddComplement>\nstruct AddTree : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef galois::graphs::FileGraphWriter Writer;\n    typedef Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    Writer p;\n\n    uint64_t size     = graph.size();\n    uint64_t newEdges = 0;\n    if (size >= 2) {\n      // Closed form counts for the loop below\n      newEdges = (size - 1 + (2 - 1)) / 2;  // (1) rounded up\n      newEdges += (size - 2 + (2 - 1)) / 2; // (2) rounded up\n    } else if (size >= 1)\n      newEdges = 1;\n    if (AddComplement)\n      newEdges *= 2; // reverse edges\n\n    p.setNumNodes(size);\n    p.setNumEdges<EdgeTy>(graph.sizeEdges() + newEdges);\n\n    p.phase1();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      p.incrementDegree(\n          src, std::distance(graph.edge_begin(src), graph.edge_end(src)));\n      if (src * 2 + 1 < size) { // (1)\n        p.incrementDegree(src);\n        if (AddComplement)\n          p.incrementDegree(src * 2 + 1);\n      }\n      if (src * 2 + 2 < size) { // (2)\n        p.incrementDegree(src);\n        if (AddComplement)\n          p.incrementDegree(src * 2 + 2);\n      }\n    }\n\n    p.phase2();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                            
    ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if constexpr (std::is_void<EdgeTy>::value) {\n          p.addNeighbor(src, dst);\n        } else {\n          p.addNeighbor<EdgeTy>(src, dst, graph.getEdgeData<EdgeTy>(jj));\n        }\n      }\n      if (src * 2 + 1 < size) {\n        if constexpr (std::is_void<EdgeTy>::value) {\n          p.addNeighbor(src, src * 2 + 1);\n          if (AddComplement)\n            p.addNeighbor(src * 2 + 1, src);\n        } else {\n          p.addNeighbor<EdgeTy>(src, src * 2 + 1, maxValue);\n          if (AddComplement)\n            p.addNeighbor<EdgeTy>(src * 2 + 1, src, maxValue);\n        }\n      }\n      if (src * 2 + 2 < size) {\n        if constexpr (std::is_void<EdgeTy>::value) {\n          p.addNeighbor(src, src * 2 + 2);\n          if (AddComplement)\n            p.addNeighbor(src * 2 + 2, src);\n        } else {\n          p.addNeighbor<EdgeTy>(src, src * 2 + 2, maxValue);\n          if (AddComplement)\n            p.addNeighbor<EdgeTy>(src * 2 + 2, src, maxValue);\n        }\n      }\n    }\n\n    p.finish();\n\n    p.toFile(outfilename);\n    printStatus(graph.size(), graph.sizeEdges(), p.size(), p.sizeEdges());\n  }\n};\n\n//! 
Make graph symmetric by blindly adding reverse entries\nstruct MakeSymmetric : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n\n    Graph ingraph;\n    Graph outgraph;\n    ingraph.fromFile(infilename);\n    galois::graphs::makeSymmetric<EdgeTy>(ingraph, outgraph);\n\n    outgraph.toFile(outfilename);\n    printStatus(ingraph.size(), ingraph.sizeEdges(), outgraph.size(),\n                outgraph.sizeEdges());\n  }\n};\n\n/**\n * Like SortByDegree but (1) take into account bipartite representation splits\n * symmetric relation over two graphs (a graph and its transpose) and (2)\n * normalize representation by placing all nodes from bipartite graph set A\n * before set B.\n */\nstruct BipartiteSortByDegree : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::LargeArray<GNode> Permutation;\n\n    Graph ingraph, outgraph, transposegraph;\n    ingraph.fromFile(infilename);\n    transposegraph.fromFile(transposeFilename);\n\n    Permutation perm;\n    perm.create(ingraph.size());\n\n    auto hasOutEdge = [&](GNode x) {\n      return ingraph.edge_begin(x) != ingraph.edge_end(x);\n    };\n    ptrdiff_t numSetA =\n        std::count_if(ingraph.begin(), ingraph.end(), hasOutEdge);\n    auto getDistance = [&](GNode x) -> ptrdiff_t {\n      if (ingraph.edge_begin(x) == ingraph.edge_end(x))\n        return numSetA + std::distance(transposegraph.edge_begin(x),\n                                       transposegraph.edge_end(x));\n      else\n        return std::distance(ingraph.edge_begin(x), ingraph.edge_end(x));\n    };\n\n    std::copy(ingraph.begin(), ingraph.end(), perm.begin());\n    std::sort(perm.begin(), perm.end(), [&](GNode lhs, GNode rhs) -> bool {\n      
return getDistance(lhs) < getDistance(rhs);\n    });\n\n    // Finalize by taking the transpose/inverse\n    Permutation inverse;\n    inverse.create(ingraph.size());\n    size_t idx = 0;\n    for (auto n : perm) {\n      inverse[n] = idx++;\n    }\n\n    galois::graphs::permute<EdgeTy>(ingraph, inverse, outgraph);\n    outputPermutation(inverse);\n    outgraph.toFile(outfilename);\n    printStatus(ingraph.size(), ingraph.sizeEdges());\n  }\n};\n\nstruct SortByDegree : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::LargeArray<GNode> Permutation;\n\n    Graph ingraph, outgraph;\n    ingraph.fromFile(infilename);\n\n    Permutation perm;\n    perm.create(ingraph.size());\n\n    std::copy(ingraph.begin(), ingraph.end(), perm.begin());\n    std::sort(perm.begin(), perm.end(), [&](GNode lhs, GNode rhs) -> bool {\n      return std::distance(ingraph.edge_begin(lhs), ingraph.edge_end(lhs)) <\n             std::distance(ingraph.edge_begin(rhs), ingraph.edge_end(rhs));\n    });\n\n    // Finalize by taking the transpose/inverse\n    Permutation inverse;\n    inverse.create(ingraph.size());\n    size_t idx = 0;\n    for (auto n : perm) {\n      inverse[n] = idx++;\n    }\n\n    galois::graphs::permute<EdgeTy>(ingraph, inverse, outgraph);\n    outputPermutation(inverse);\n    outgraph.toFile(outfilename);\n    printStatus(ingraph.size(), ingraph.sizeEdges());\n  }\n};\n\nstruct ToBigEndian : public HasNoVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n\n    Graph ingraph, outgraph;\n    ingraph.fromFile(infilename);\n    EdgeTy* out = outgraph.fromGraph<EdgeTy>(ingraph);\n\n    for (auto ii = ingraph.edge_data_begin<EdgeTy>(),\n              ei = 
ingraph.edge_data_end<EdgeTy>();\n         ii != ei; ++ii, ++out) {\n      writeEndian<false>(out, *ii);\n    }\n    outgraph.toFile(outfilename);\n    printStatus(ingraph.size(), ingraph.sizeEdges());\n  }\n};\n\nstruct SortByHighDegreeParent : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::LargeArray<GNode> Permutation;\n\n    Graph graph;\n    // get file graph\n    graph.fromFile(infilename);\n\n    // get the number of vertices\n    auto sz = graph.size();\n\n    Permutation perm;\n    perm.create(sz);\n    // fill the perm array with 0 through # vertices\n    std::copy(boost::counting_iterator<GNode>(0),\n              boost::counting_iterator<GNode>(sz), perm.begin());\n\n    std::cout << \"Done setting up perm\\n\";\n\n    std::deque<std::deque<std::pair<unsigned, GNode>>> inv(sz);\n    unsigned count = 0;\n\n    // loop through all vertices\n    for (auto ii = graph.begin(), ee = graph.end(); ii != ee; ++ii) {\n      // progress indicator print\n      if (!(++count % 1024))\n        std::cerr << static_cast<double>(count * 100) / sz << \"\\r\";\n\n      // get the number of edges this vertex has\n      unsigned dist = std::distance(graph.edge_begin(*ii), graph.edge_end(*ii));\n\n      // for each edge, get destination, and on that destination vertex save\n      // the source id (i.e. 
this is a transpose)\n      for (auto dsti = graph.edge_begin(*ii), dste = graph.edge_end(*ii);\n           dsti != dste; ++dsti)\n        inv[graph.getEdgeDst(dsti)].push_back(std::make_pair(dist, *ii));\n    }\n\n    std::cout << \"Found inverse\\n\";\n\n    count = 0;\n    // looping through deques with incoming edges\n    // TODO this can probably be parallelized since each deque is disjoint\n    for (auto ii = inv.begin(), ee = inv.end(); ii != ee; ++ii) {\n      // progress tracker\n      if (!(++count % 1024)) {\n        std::cerr << count << \" of \" << sz << \"\\r\";\n      }\n\n      // sort each deque\n      std::sort(ii->begin(), ii->end(),\n                std::greater<std::pair<unsigned, GNode>>());\n    }\n\n    std::cout << \"Beginning perm sort\\n\";\n\n    // sort the 0 -> # vertices array\n    std::sort(perm.begin(), perm.end(), [&inv](GNode lhs, GNode rhs) -> bool {\n      const auto& leftBegin  = inv[lhs].begin();\n      const auto& leftEnd    = inv[lhs].end();\n      const auto& rightBegin = inv[rhs].begin();\n      const auto& rightEnd   = inv[rhs].end();\n      // not less-than and not equal => greater-than\n      return (!std::lexicographical_compare(leftBegin, leftEnd, rightBegin,\n                                            rightEnd) &&\n              !(std::distance(leftBegin, leftEnd) ==\n                    std::distance(rightBegin, rightEnd) &&\n                std::equal(leftBegin, leftEnd, rightBegin)));\n    });\n\n    std::cout << \"Done sorting\\n\";\n\n    Permutation perm2;\n    perm2.create(sz);\n    // perm2 stores the new ordering of a particular vertex\n    for (unsigned x = 0; x < perm.size(); ++x)\n      perm2[perm[x]] = x;\n\n    std::cout << \"Done inverting\\n\";\n\n    // sanity check; this should print the same thing\n    for (unsigned x = 0; x < perm2.size(); ++x) {\n      if (perm[x] == 0) {\n        std::cout << \"Zero is at \" << x << \"\\n\";\n        break;\n      }\n    }\n    std::cout << \"Zero is at \" << 
perm2[0] << \"\\n\";\n\n    // do actual permutation of the graph\n    Graph out;\n    galois::graphs::permute<EdgeTy>(graph, perm2, out);\n    outputPermutation(perm2);\n\n    // std::cout << \"Biggest was \" << first << \" now \" << perm2[first] << \" with\n    // \"\n    //           << std::distance(out.edge_begin(perm2[first]),\n    //           out.edge_end(perm2[first]))\n    //           << \"\\n\";\n\n    out.toFile(outfilename);\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\nstruct RemoveHighDegree : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    Writer p;\n\n    std::vector<GNode> nodeTable;\n    nodeTable.resize(graph.size());\n    uint64_t numNodes = 0;\n    uint64_t numEdges = 0;\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src               = *ii;\n      Graph::edge_iterator jj = graph.edge_begin(src), ej = graph.edge_end(src);\n      if (std::distance(jj, ej) > maxDegree)\n        continue;\n      nodeTable[src] = numNodes++;\n      for (; jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (std::distance(graph.edge_begin(dst), graph.edge_end(dst)) >\n            maxDegree)\n          continue;\n        ++numEdges;\n      }\n    }\n\n    if (numEdges == graph.sizeEdges() && numNodes == graph.size()) {\n      std::cout << \"Graph already simplified; copy input to output\\n\";\n      printStatus(graph.size(), graph.sizeEdges());\n      graph.toFile(outfilename);\n      return;\n    }\n\n    p.setNumNodes(numNodes);\n    p.setNumEdges<EdgeTy>(numEdges);\n\n    p.phase1();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src               = *ii;\n  
    Graph::edge_iterator jj = graph.edge_begin(src), ej = graph.edge_end(src);\n      if (std::distance(jj, ej) > maxDegree)\n        continue;\n      for (; jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (std::distance(graph.edge_begin(dst), graph.edge_end(dst)) >\n            maxDegree)\n          continue;\n        p.incrementDegree(nodeTable[src]);\n      }\n    }\n\n    p.phase2();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src               = *ii;\n      Graph::edge_iterator jj = graph.edge_begin(src), ej = graph.edge_end(src);\n      if (std::distance(jj, ej) > maxDegree)\n        continue;\n      for (; jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (std::distance(graph.edge_begin(dst), graph.edge_end(dst)) >\n            maxDegree)\n          continue;\n        if constexpr (std::is_void<EdgeTy>::value) {\n          p.addNeighbor(nodeTable[src], nodeTable[dst]);\n        } else {\n          p.addNeighbor<EdgeTy>(nodeTable[src], nodeTable[dst],\n                                graph.getEdgeData<EdgeTy>(jj));\n        }\n      }\n    }\n\n    p.finish();\n\n    p.toFile(outfilename);\n    printStatus(graph.size(), graph.sizeEdges(), p.size(), p.sizeEdges());\n  }\n};\n\n//! 
Partition graph into balanced number of edges by source node\nstruct PartitionBySource : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    for (int i = 0; i < numParts; ++i) {\n      Writer p;\n\n      auto r = graph.divideByNode(0, 1, i, numParts).first;\n\n      size_t numEdges = 0;\n      if (r.first != r.second)\n        numEdges = std::distance(graph.edge_begin(*r.first),\n                                 graph.edge_end(*(r.second - 1)));\n\n      p.setNumNodes(graph.size());\n      p.setNumEdges<EdgeTy>(numEdges);\n\n      p.phase1();\n      for (Graph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n        GNode src = *ii;\n        p.incrementDegree(\n            src, std::distance(graph.edge_begin(src), graph.edge_end(src)));\n      }\n\n      p.phase2();\n      for (Graph::iterator ii = r.first, ei = r.second; ii != ei; ++ii) {\n        GNode src = *ii;\n        for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                  ej = graph.edge_end(src);\n             jj != ej; ++jj) {\n          GNode dst = graph.getEdgeDst(jj);\n          if constexpr (std::is_void<EdgeTy>::value)\n            p.addNeighbor(src, dst);\n          else\n            p.addNeighbor<EdgeTy>(src, dst, graph.getEdgeData<EdgeTy>(jj));\n        }\n      }\n\n      p.finish();\n\n      std::ostringstream partname;\n      partname << outfilename << \".\" << i << \".of.\" << numParts;\n\n      p.toFile(partname.str());\n      printStatus(graph.size(), graph.sizeEdges(), p.size(), p.sizeEdges());\n    }\n  }\n};\n\ntemplate <typename InDegree, typename It = typename InDegree::iterator>\nstatic std::pair<It, It> divide_by_destination(InDegree& inDegree, int id,\n                      
                         int total) {\n  if (inDegree.begin() == inDegree.end())\n    return std::make_pair(inDegree.begin(), inDegree.end());\n\n  size_t size  = inDegree[inDegree.size() - 1];\n  size_t block = (size + total - 1) / total;\n\n  It bb = std::lower_bound(inDegree.begin(), inDegree.end(), id * block);\n  It eb;\n  if (id + 1 == total)\n    eb = inDegree.end();\n  else\n    eb = std::upper_bound(bb, inDegree.end(), (id + 1) * block);\n  return std::make_pair(bb, eb);\n}\n\ntemplate <typename GraphTy, typename InDegree>\nstatic void compute_indegree(GraphTy& graph, InDegree& inDegree) {\n  inDegree.create(graph.size());\n\n  for (auto nn = graph.begin(), en = graph.end(); nn != en; ++nn) {\n    for (auto jj = graph.edge_begin(*nn), ej = graph.edge_end(*nn); jj != ej;\n         ++jj) {\n      auto dst = graph.getEdgeDst(jj);\n      inDegree[dst] += 1;\n    }\n  }\n\n  for (size_t i = 1; i < inDegree.size(); ++i)\n    inDegree[i] = inDegree[i - 1] + inDegree[i];\n}\n\n//! Partition graph into balanced number of edges by destination node\nstruct PartitionByDestination : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::graphs::FileGraphWriter Writer;\n    typedef galois::LargeArray<size_t> InDegree;\n\n    Graph graph;\n    graph.fromFile(infilename);\n    InDegree inDegree;\n    compute_indegree(graph, inDegree);\n\n    for (int i = 0; i < numParts; ++i) {\n      Writer p;\n\n      auto r    = divide_by_destination(inDegree, i, numParts);\n      size_t bb = std::distance(inDegree.begin(), r.first);\n      size_t eb = std::distance(inDegree.begin(), r.second);\n\n      size_t numEdges = 0;\n      if (bb != eb) {\n        size_t begin = bb == 0 ? 0 : inDegree[bb - 1];\n        size_t end   = eb == 0 ? 
0 : inDegree[eb - 1];\n        numEdges     = end - begin;\n      }\n\n      p.setNumNodes(graph.size());\n      p.setNumEdges<EdgeTy>(numEdges);\n\n      p.phase1();\n      for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei;\n           ++ii) {\n        GNode src = *ii;\n        for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                  ej = graph.edge_end(src);\n             jj != ej; ++jj) {\n          GNode dst = graph.getEdgeDst(jj);\n          if (dst < bb)\n            continue;\n          if (dst >= eb)\n            continue;\n          p.incrementDegree(src);\n        }\n      }\n\n      p.phase2();\n      for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei;\n           ++ii) {\n        GNode src = *ii;\n        for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                  ej = graph.edge_end(src);\n             jj != ej; ++jj) {\n          GNode dst = graph.getEdgeDst(jj);\n          if (dst < bb)\n            continue;\n          if (dst >= eb)\n            continue;\n          if constexpr (std::is_void<EdgeTy>::value)\n            p.addNeighbor(src, dst);\n          else\n            p.addNeighbor<EdgeTy>(src, dst, graph.getEdgeData<EdgeTy>(jj));\n        }\n      }\n\n      p.finish();\n\n      std::ostringstream partname;\n      partname << outfilename << \".\" << i << \".of.\" << numParts;\n\n      p.toFile(partname.str());\n      printStatus(graph.size(), graph.sizeEdges(), p.size(), p.sizeEdges());\n    }\n  }\n};\n\n//! 
Transpose graph\nstruct Transpose : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    Writer p;\n\n    p.setNumNodes(graph.size());\n    p.setNumEdges<EdgeTy>(graph.sizeEdges());\n\n    p.phase1();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        p.incrementDegree(dst);\n      }\n    }\n\n    p.phase2();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if constexpr (std::is_void<EdgeTy>::value) {\n          p.addNeighbor(dst, src);\n        } else {\n          p.addNeighbor<EdgeTy>(dst, src, graph.getEdgeData<EdgeTy>(jj));\n        }\n      }\n    }\n\n    p.finish();\n\n    p.toFile(outfilename);\n    printStatus(graph.size(), graph.sizeEdges(), p.size(), p.sizeEdges());\n  }\n};\n\ntemplate <typename GraphNode, typename EdgeTy>\nstruct IdLess {\n  bool\n  operator()(const galois::graphs::EdgeSortValue<GraphNode, EdgeTy>& e1,\n             const galois::graphs::EdgeSortValue<GraphNode, EdgeTy>& e2) const {\n    return e1.dst < e2.dst;\n  }\n};\n\ntemplate <typename GraphNode, typename EdgeTy>\nstruct WeightLess {\n  bool\n  operator()(const galois::graphs::EdgeSortValue<GraphNode, EdgeTy>& e1,\n             const galois::graphs::EdgeSortValue<GraphNode, EdgeTy>& e2) const {\n    
return e1.get() < e2.get();\n  }\n};\n\n/**\n * Removes self and multi-edges from a graph.\n */\nstruct Cleanup : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph orig, graph;\n    {\n      // Original FileGraph is immutable because it is backed by a file\n      orig.fromFile(infilename);\n      graph = orig;\n    }\n\n    size_t numEdges = 0;\n\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      graph.sortEdges<EdgeTy>(src, IdLess<GNode, EdgeTy>());\n\n      Graph::edge_iterator prev = graph.edge_end(src);\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (src == dst) {\n        } else if (prev != ej && graph.getEdgeDst(prev) == dst) {\n        } else {\n          numEdges += 1;\n        }\n        prev = jj;\n      }\n    }\n\n    if (numEdges == graph.sizeEdges()) {\n      std::cout << \"Graph already simplified; copy input to output\\n\";\n      printStatus(graph.size(), graph.sizeEdges());\n      graph.toFile(outfilename);\n      return;\n    }\n\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Writer p;\n\n    p.setNumNodes(graph.size());\n    p.setNumEdges<EdgeTy>(numEdges);\n\n    p.phase1();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      Graph::edge_iterator prev = graph.edge_end(src);\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (src == dst) {\n        } else if (prev != ej && graph.getEdgeDst(prev) == dst) {\n        } else {\n     
     p.incrementDegree(src);\n        }\n        prev = jj;\n      }\n    }\n\n    p.phase2();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      Graph::edge_iterator prev = graph.edge_end(src);\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (src == dst) {\n        } else if (prev != ej && graph.getEdgeDst(prev) == dst) {\n        } else if constexpr (std::is_void<EdgeTy>::value) {\n          p.addNeighbor(src, dst);\n        } else {\n          p.addNeighbor<EdgeTy>(src, dst, graph.getEdgeData<EdgeTy>(jj));\n        }\n        prev = jj;\n      }\n    }\n\n    p.finish();\n\n    p.toFile(outfilename);\n    printStatus(graph.size(), graph.sizeEdges(), p.size(), p.sizeEdges());\n  }\n};\n\ntemplate <template <typename, typename> class SortBy, bool NeedsEdgeData>\nstruct SortEdges\n    : public boost::mpl::if_c<NeedsEdgeData, HasNoVoidSpecialization,\n                              Conversion>::type {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph orig, graph;\n    {\n      // Original FileGraph is immutable because it is backed by a file\n      orig.fromFile(infilename);\n      graph = orig;\n    }\n\n    for (typename Graph::iterator ii = graph.begin(), ei = graph.end();\n         ii != ei; ++ii) {\n      GNode src = *ii;\n\n      graph.sortEdges<EdgeTy>(src, SortBy<GNode, EdgeTy>());\n    }\n\n    graph.toFile(outfilename);\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\n/**\n * Removes edges such that src > dst\n */\nstruct MakeUnsymmetric : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    
typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    size_t numEdges = 0;\n\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (src > dst) {\n        } else {\n          numEdges += 1;\n        }\n      }\n    }\n\n    if (numEdges == graph.sizeEdges()) {\n      std::cout << \"Graph already simplified; copy input to output\\n\";\n      printStatus(graph.size(), graph.sizeEdges());\n      graph.toFile(outfilename);\n      return;\n    }\n\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Writer p;\n\n    p.setNumNodes(graph.size());\n    p.setNumEdges<EdgeTy>(numEdges);\n\n    p.phase1();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (src > dst) {\n        } else {\n          p.incrementDegree(src);\n        }\n      }\n    }\n\n    p.phase2();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (src > dst) {\n        } else if constexpr (std::is_void<EdgeTy>::value) {\n          p.addNeighbor(src, dst);\n        } else {\n          p.addNeighbor<EdgeTy>(src, dst, graph.getEdgeData<EdgeTy>(jj));\n        }\n      }\n    }\n\n    p.finish();\n\n    p.toFile(outfilename);\n    
printStatus(graph.size(), graph.sizeEdges(), p.size(), p.sizeEdges());\n  }\n};\n\n// Example:\n//  c Some file\n//  c Comments\n//  p XXX* <num nodes> <num edges>\n//  a <src id> <dst id> <weight>\n//  ....\nstruct Dimacs2Gr : public HasNoVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Writer p;\n    uint32_t nnodes;\n    size_t nedges;\n\n    for (int phase = 0; phase < 2; ++phase) {\n      std::ifstream infile(infilename.c_str());\n\n      // Skip comments\n      while (infile) {\n        if (infile.peek() == 'p') {\n          break;\n        }\n        skipLine(infile);\n      }\n\n      // Read header\n      char header[256];\n      infile.getline(header, 256);\n      std::istringstream line(header, std::istringstream::in);\n      std::vector<std::string> tokens;\n      while (line) {\n        std::string tmp;\n        line >> tmp;\n        if (line) {\n          tokens.push_back(tmp);\n        }\n      }\n      if (tokens.size() < 3 || tokens[0].compare(\"p\") != 0) {\n        GALOIS_DIE(\"unknown problem specification line: \", line.str());\n      }\n      // Prefer C functions for maximum compatibility\n      // nnodes = std::stoull(tokens[tokens.size() - 2]);\n      // nedges = std::stoull(tokens[tokens.size() - 1]);\n      nnodes = strtoull(tokens[tokens.size() - 2].c_str(), NULL, 0);\n      nedges = strtoull(tokens[tokens.size() - 1].c_str(), NULL, 0);\n\n      // Parse edges\n      if (phase == 0) {\n        p.setNumNodes(nnodes);\n        p.setNumEdges<EdgeTy>(nedges);\n        p.phase1();\n      } else {\n        p.phase2();\n      }\n\n      for (size_t edge_num = 0; edge_num < nedges; ++edge_num) {\n        uint32_t cur_id;\n        uint32_t neighbor_id;\n        int32_t weight;\n        std::string tmp;\n        infile >> tmp;\n\n        if (tmp.compare(\"a\") != 0) {\n          --edge_num;\n          
skipLine(infile);\n          continue;\n        }\n\n        infile >> cur_id >> neighbor_id >> weight;\n        if (cur_id == 0 || cur_id > nnodes) {\n          GALOIS_DIE(\"node id out of range: \", cur_id);\n        }\n        if (neighbor_id == 0 || neighbor_id > nnodes) {\n          GALOIS_DIE(\"neighbor id out of range: \", neighbor_id);\n        }\n\n        // 1 indexed\n        if (phase == 0) {\n          p.incrementDegree(cur_id - 1);\n        } else {\n          if constexpr (std::is_void<EdgeTy>::value) {\n            p.addNeighbor(cur_id - 1, neighbor_id - 1);\n          } else {\n            p.addNeighbor<EdgeTy>(cur_id - 1, neighbor_id - 1, weight);\n          }\n        }\n\n        skipLine(infile);\n      }\n\n      infile.peek();\n      if (!infile.eof()) {\n        GALOIS_DIE(\"additional lines in file\");\n      }\n    }\n\n    p.finish();\n\n    p.toFile(outfilename);\n    printStatus(p.size(), p.sizeEdges());\n  }\n};\n\n/**\n * PBBS input is an ASCII file of tokens that serialize a CSR graph. 
I.e.,\n * elements in brackets are non-literals:\n *\n * AdjacencyGraph\n * <num nodes>\n * <num edges>\n * <offset node 0>\n * <offset node 1>\n * ...\n * <edge 0>\n * <edge 1>\n * ...\n */\nstruct Pbbs2Gr : public HasOnlyVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    static_assert(std::is_same<EdgeTy, void>::value,\n                  \"conversion undefined for non-void graphs\");\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Writer p;\n\n    std::ifstream infile(infilename.c_str());\n    std::string header;\n    uint32_t nnodes;\n    size_t nedges;\n\n    infile >> header >> nnodes >> nedges;\n    if (header != \"AdjacencyGraph\") {\n      GALOIS_DIE(\"unknown file format\");\n    }\n\n    p.setNumNodes(nnodes);\n    p.setNumEdges<void>(nedges);\n\n    size_t* offsets = new size_t[nnodes];\n    for (size_t i = 0; i < nnodes; ++i) {\n      infile >> offsets[i];\n    }\n\n    size_t* edges = new size_t[nedges];\n    for (size_t i = 0; i < nedges; ++i) {\n      infile >> edges[i];\n    }\n\n    p.phase1();\n    for (uint32_t i = 0; i < nnodes; ++i) {\n      size_t begin = offsets[i];\n      size_t end   = (i == nnodes - 1) ? nedges : offsets[i + 1];\n      p.incrementDegree(i, end - begin);\n    }\n\n    p.phase2();\n    for (uint32_t i = 0; i < nnodes; ++i) {\n      size_t begin = offsets[i];\n      size_t end   = (i == nnodes - 1) ? 
nedges : offsets[i + 1];\n      for (size_t j = begin; j < end; ++j) {\n        size_t dst = edges[j];\n        p.addNeighbor(i, dst);\n      }\n    }\n\n    p.finish();\n\n    p.toFile(outfilename);\n    printStatus(p.size(), p.sizeEdges());\n  }\n};\n\n// TODO\n// gr Version 2 support doesn't exist\nstruct Gr2Pbbsedges : public HasNoVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    // Use FileGraph because it is basically in CSR format needed for pbbs\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    std::ofstream file(outfilename.c_str());\n    file << \"WeightedEdgeArray\\n\";\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst      = graph.getEdgeDst(jj);\n        EdgeTy& weight = graph.getEdgeData<EdgeTy>(jj);\n        file << src << \" \" << dst << \" \" << weight << \"\\n\";\n      }\n    }\n    file.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\n/**\n * PBBS input is an ASCII file of tokens that serialize a CSR graph. 
I.e.,\n * elements in brackets are non-literals:\n *\n * [Weighted]AdjacencyGraph\n * <num nodes>\n * <num edges>\n * <offset node 0>\n * <offset node 1>\n * ...\n * <edge 0>\n * <edge 1>\n * ...\n * [\n * <edge weight 0>\n * <edge weight 1>\n * ...\n * ]\n */\n// TODO\n// gr Version 2 support doesn't exist\nstruct Gr2Pbbs : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef galois::LargeArray<EdgeTy> EdgeData;\n    typedef typename EdgeData::value_type edge_value_type;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    std::ofstream file(outfilename.c_str());\n    if constexpr (!std::is_void<EdgeTy>::value)\n      file << \"Weighted\";\n    file << \"AdjacencyGraph\\n\"\n         << graph.size() << \"\\n\"\n         << graph.sizeEdges() << \"\\n\";\n    // edgeid[i] is the end of i in FileGraph while it is the beginning of i in\n    // pbbs graph\n    size_t last  = std::distance(graph.edge_id_begin(), graph.edge_id_end());\n    size_t count = 0;\n    file << \"0\\n\";\n    for (Graph::edge_id_iterator ii = graph.edge_id_begin(),\n                                 ei = graph.edge_id_end();\n         ii != ei; ++ii, ++count) {\n      if (count < last - 1)\n        file << *ii << \"\\n\";\n    }\n    for (Graph::node_id_iterator ii = graph.node_id_begin(),\n                                 ei = graph.node_id_end();\n         ii != ei; ++ii) {\n      file << *ii << \"\\n\";\n    }\n    if constexpr (!std::is_void<EdgeTy>::value) {\n      for (edge_value_type *ii = graph.edge_data_begin<edge_value_type>(),\n                           *ei = graph.edge_data_end<edge_value_type>();\n           ii != ei; ++ii) {\n        file << *ii << \"\\n\";\n      }\n    }\n    file.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\n/**\n * Binary PBBS format is three files.\n *\n * <base>.config - ASCII file with 
number of vertices\n * <base>.adj - Binary adjacencies\n * <base>.idx - Binary offsets for adjacencies\n */\ntemplate <typename NodeIdx, typename Offset>\nstruct Gr2BinaryPbbs : public HasOnlyVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    static_assert(std::is_same<EdgeTy, void>::value,\n                  \"conversion undefined for non-void graphs\");\n    typedef galois::graphs::FileGraph Graph;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    {\n      std::string configName = outfilename + \".config\";\n      std::ofstream configFile(configName.c_str());\n      configFile << graph.size() << \"\\n\";\n    }\n\n    {\n      std::string idxName = outfilename + \".idx\";\n      std::ofstream idxFile(idxName.c_str());\n      // edgeid[i] is the end of i in FileGraph while it is the beginning of i\n      // in pbbs graph\n      size_t last   = std::distance(graph.edge_id_begin(), graph.edge_id_end());\n      size_t count  = 0;\n      Offset offset = 0;\n      idxFile.write(reinterpret_cast<char*>(&offset), sizeof(offset));\n      for (Graph::edge_id_iterator ii = graph.edge_id_begin(),\n                                   ei = graph.edge_id_end();\n           ii != ei; ++ii, ++count) {\n        offset = *ii;\n        if (count < last - 1)\n          idxFile.write(reinterpret_cast<char*>(&offset), sizeof(offset));\n      }\n      idxFile.close();\n    }\n\n    {\n      std::string adjName = outfilename + \".adj\";\n      std::ofstream adjFile(adjName.c_str());\n      for (Graph::node_id_iterator ii = graph.node_id_begin(),\n                                   ei = graph.node_id_end();\n           ii != ei; ++ii) {\n        NodeIdx nodeIdx = *ii;\n        adjFile.write(reinterpret_cast<char*>(&nodeIdx), sizeof(nodeIdx));\n      }\n      adjFile.close();\n    }\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\nstruct Gr2Dimacs : public HasNoVoidSpecialization 
{\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    std::ofstream file(outfilename.c_str());\n    file << \"p sp \" << graph.size() << \" \" << graph.sizeEdges() << \"\\n\";\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst      = graph.getEdgeDst(jj);\n        EdgeTy& weight = graph.getEdgeData<EdgeTy>(jj);\n        file << \"a \" << src + 1 << \" \" << dst + 1 << \" \" << weight << \"\\n\";\n      }\n    }\n    file.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\n/**\n * RMAT format (zero indexed):\n *  %%% Comment1\n *  %%% Comment2\n *  %%% Comment3\n *  <num nodes> <num edges>\n *  <node id> <num edges> [<neighbor id> <neighbor weight>]*\n *  ...\n */\ntemplate <typename OutEdgeTy>\nstruct Gr2Rmat : public HasNoVoidSpecialization {\n  template <typename InEdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    std::ofstream file(outfilename.c_str());\n    file << \"%%%\\n\";\n    file << \"%%%\\n\";\n    file << \"%%%\\n\";\n    file << graph.size() << \" \" << graph.sizeEdges() << \"\\n\";\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      file << *ii << \" \"\n           << std::distance(graph.edge_begin(src), graph.edge_end(src));\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) 
{\n        GNode dst        = graph.getEdgeDst(jj);\n        OutEdgeTy weight = graph.getEdgeData<InEdgeTy>(jj);\n        file << \" \" << dst << \" \" << weight;\n      }\n      file << \"\\n\";\n    }\n    file.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\ntemplate <template <typename, typename> class SortBy>\nstruct Gr2Totem : public HasNoVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph orig, graph;\n    {\n      // Original FileGraph is immutable because it is backed by a file\n      orig.fromFile(infilename);\n      graph = orig;\n    }\n\n    const uint32_t BINARY_MAGIC_WORD = 0x10102048;\n    FILE* outfile;\n    outfile = fopen(outfilename.c_str(), \"wr\");\n\n    typedef uint32_t vid_t;\n    typedef uint32_t eid_t;\n    typedef uint32_t weight_t;\n    fwrite(&BINARY_MAGIC_WORD, sizeof(uint32_t), 1, outfile);\n\n    uint32_t vid_size = sizeof(vid_t);\n    fwrite(&vid_size, sizeof(uint32_t), 1, outfile);\n    uint32_t eid_size = sizeof(vid_t);\n    fwrite(&eid_size, sizeof(uint32_t), 1, outfile);\n\n    vid_t vertex_count = graph.size();\n    fwrite(&vertex_count, sizeof(vid_t), 1, outfile);\n    eid_t edge_count = graph.sizeEdges();\n    fwrite(&edge_count, sizeof(eid_t), 1, outfile);\n\n    bool valued = false;\n    fwrite(&valued, sizeof(bool), 1, outfile);\n    bool weighted = true;\n    fwrite(&weighted, sizeof(bool), 1, outfile);\n    bool directed = true;\n    fwrite(&directed, sizeof(bool), 1, outfile);\n\n    vid_t* nodes      = (vid_t*)malloc(sizeof(vid_t) * (vertex_count + 1));\n    eid_t* edges      = (eid_t*)malloc(sizeof(vid_t) * edge_count);\n    weight_t* weights = (weight_t*)malloc(sizeof(vid_t) * edge_count);\n    memset(nodes, 0, sizeof(vid_t) * (vertex_count + 1));\n    memset(edges, 0, sizeof(vid_t) * eid_size);\n    memset(weights, 0, 
sizeof(vid_t) * eid_size);\n    vid_t vid = 0;\n    eid_t eid = 0;\n\n    Graph::iterator e_start = graph.edge_begin(*graph.begin());\n\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei;\n         ++ii, vid++) {\n      GNode src  = *ii;\n      nodes[vid] = std::distance(e_start, graph.edge_begin(src));\n      graph.sortEdges<EdgeTy>(src, SortBy<GNode, EdgeTy>());\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj, eid++) {\n        GNode dst    = graph.getEdgeDst(jj);\n        edges[eid]   = (vid_t)dst;\n        weights[eid] = (uint32_t)graph.getEdgeData<EdgeTy>(jj);\n        // printf(\"%d %d %u \\n\", vid, edges[eid], weights[eid]);\n      }\n    }\n    nodes[vertex_count] = graph.sizeEdges();\n    fwrite(nodes, sizeof(vid_t), vertex_count + 1, outfile);\n    fwrite(edges, sizeof(eid_t), edge_count, outfile);\n    fwrite(weights, sizeof(weight_t), edge_count, outfile);\n    // printf(\"nodes: %d %d %d\\n\", nodes[0],nodes[1],nodes[2]);\n\n    // printf(\"nodes: %d %d %d\\n\", edges[0],edges[1],edges[2]);\n    // printf(\"nodes: %d %d %d\\n\", weights[0],weights[1],weights[2]);\n\n    fclose(outfile);\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\nstruct Gr2Neo4j : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    // TODO Need to figure out how we want to deal with labels\n\n    using Graph           = galois::graphs::FileGraph;\n    using GNode           = Graph::GraphNode;\n    using EdgeData        = galois::LargeArray<EdgeTy>;\n    using edge_value_type = typename EdgeData::value_type;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    // output node csv for node creation\n\n    // first is header\n    std::string nodeHFile = outfilename + \".nodesheader\";\n    std::ofstream fileH(nodeHFile.c_str());\n    fileH << 
\"uid:ID,:LABEL\\n\";\n    fileH.close();\n\n    // then nodes\n    std::string nodeFile = outfilename + \".nodes\";\n    std::ofstream fileN(nodeFile.c_str());\n    for (size_t i = 0; i < graph.size(); i++) {\n      fileN << i << \",v\\n\";\n    }\n    fileN.close();\n\n    // output edge CSV with or without data for edge creation\n    std::string edgeHFile = outfilename + \".edgesheader\";\n    std::ofstream fileHE(edgeHFile.c_str());\n    if constexpr (std::is_void<EdgeTy>::value) {\n      fileHE << \":START_ID,:END_ID,:TYPE\\n\";\n    } else {\n      fileHE << \":START_ID,:END_ID,:TYPE,value\\n\";\n    }\n    fileHE.close();\n\n    // output edge CSV with or without data for edge creation\n    std::string edgeFile = outfilename + \".edges\";\n    std::ofstream fileE(edgeFile.c_str());\n\n    // write edges\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if constexpr (std::is_void<EdgeTy>::value) {\n          fileE << src << \",\" << dst << \",e\\n\";\n        } else {\n          fileE << src << \",\" << dst << \",e,\"\n                << graph.getEdgeData<edge_value_type>(jj) << \"\\n\";\n        }\n      }\n    }\n    fileE.close();\n\n    printStatus(graph.size(), graph.sizeEdges());\n  }\n};\n\n/**\n * METIS format (1-indexed). See METIS 4.10 manual, section 4.5.\n *  % comment prefix\n *  <num nodes> <num edges> [<data format> [<weights per vertex>]]\n *  [<vertex data>] [<destination> [<edge data>]]*\n *  ...\n * vertex weights must be integers >= 0; edge weights must be > 0.\n * Input graph must be symmetric. 
Does not write self-edges.\n * FIXME: implement weights.\n */\nstruct Gr2Metis : public HasOnlyVoidSpecialization {\n  template <typename InEdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph Graph;\n    typedef Graph::GraphNode GNode;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    /* Skip self-edges */\n    unsigned int nedges = graph.sizeEdges();\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        if (dst == src)\n          nedges--;\n      }\n    }\n    assert((nedges % 2) == 0);\n    nedges /= 2; // Do not double-count edges\n\n    std::ofstream file(outfilename.c_str());\n    file << graph.size() << \" \" << nedges << \"\\n\";\n    for (Graph::iterator ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n      GNode src = *ii;\n      for (Graph::edge_iterator jj = graph.edge_begin(src),\n                                ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst = graph.getEdgeDst(jj);\n        // OutEdgeTy weight = graph.getEdgeData<InEdgeTy>(jj);\n        if (dst != src)\n          file << dst + 1 << \" \";\n      }\n      file << \"\\n\";\n    }\n    file.close();\n\n    printStatus(graph.size(), nedges);\n  }\n};\n\n/**\n * GR to Binary Sparse MATLAB matrix.\n * [i, j, v] = find(A);\n * fwrite(f, size(A,1), 'uint32');\n * fwrite(f, size(A,2), 'uint32');\n * fwrite(f, nnz(A), 'uint32');\n * fwrite(f, (i-1), 'uint32');     % zero-indexed\n * fwrite(f, (j-1), 'uint32');\n * fwrite(f, v, 'double');\n */\nstruct Gr2Bsml : public Conversion {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraph 
Graph;\n    typedef typename Graph::GraphNode GNode;\n    typedef typename galois::LargeArray<EdgeTy> EdgeData;\n\n    Graph graph;\n    graph.fromFile(infilename);\n\n    uint32_t nnodes = graph.size();\n    uint32_t nedges = graph.sizeEdges();\n\n    std::ofstream file(outfilename.c_str());\n\n    // Write header\n    file.write(reinterpret_cast<char*>(&nnodes), sizeof(nnodes));\n    file.write(reinterpret_cast<char*>(&nnodes), sizeof(nnodes));\n    file.write(reinterpret_cast<char*>(&nedges), sizeof(nedges));\n\n    // Write row adjacency\n    for (typename Graph::iterator ii = graph.begin(), ei = graph.end();\n         ii != ei; ++ii) {\n      GNode src    = *ii;\n      uint32_t sid = src;\n      for (typename Graph::edge_iterator jj = graph.edge_begin(src),\n                                         ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        file.write(reinterpret_cast<char*>(&sid), sizeof(sid));\n      }\n    }\n\n    // Write column adjacency\n    for (typename Graph::iterator ii = graph.begin(), ei = graph.end();\n         ii != ei; ++ii) {\n      GNode src = *ii;\n      for (typename Graph::edge_iterator jj = graph.edge_begin(src),\n                                         ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        GNode dst    = graph.getEdgeDst(jj);\n        uint32_t did = dst;\n        file.write(reinterpret_cast<char*>(&did), sizeof(did));\n      }\n    }\n\n    // Write data\n    for (typename Graph::iterator ii = graph.begin(), ei = graph.end();\n         ii != ei; ++ii) {\n      GNode src = *ii;\n      for (typename Graph::edge_iterator jj = graph.edge_begin(src),\n                                         ej = graph.edge_end(src);\n           jj != ej; ++jj) {\n        double weight = static_cast<double>(\n            getEdgeValue<EdgeTy, EdgeData::has_value>(graph, jj));\n        file.write(reinterpret_cast<char*>(&weight), sizeof(weight));\n      }\n    }\n\n    file.close();\n    printStatus(nnodes, 
nedges);\n  }\n};\n\n/**\n * SVMLight format.\n *\n * <line> .=. <target> <feature>:<value> <feature>:<value> ... <feature>:<value>\n * # <info> <target> .=. +1 | -1 | 0 | <float> <feature> .=. <integer> | \"qid\"\n * <value> .=. <float>\n * <info> .=. <string>\n *\n */\nstruct Svmlight2Gr : public HasNoVoidSpecialization {\n  template <typename EdgeTy>\n  void convert(const std::string& infilename, const std::string& outfilename) {\n    typedef galois::graphs::FileGraphWriter Writer;\n\n    Writer p;\n    std::ifstream infile(infilename.c_str());\n    std::ofstream outlabels(labelsFilename.c_str());\n\n    if (!outlabels) {\n      GALOIS_DIE(\"unable to create labels file\");\n    }\n\n    size_t featureOffset = 0;\n    size_t numEdges      = 0;\n    long maxFeature      = -1;\n\n    for (int phase = 0; phase < 3; ++phase) {\n      infile.clear();\n      infile.seekg(0, std::ios::beg);\n      size_t numNodes = 0;\n\n      while (infile) {\n        if (phase == 2) {\n          float label;\n          infile >> label;\n          if (!infile)\n            break;\n          outlabels << numNodes << \" \" << label << \"\\n\";\n        } else {\n          infile.ignore(std::numeric_limits<std::streamsize>::max(), ' ');\n          if (!infile)\n            break;\n        }\n\n        const int maxLength = 1024;\n        char buffer[maxLength];\n        int idx = 0;\n\n        while (infile) {\n          char c = infile.get();\n          if (!infile)\n            break;\n          if (c == ' ' || c == '\\n' || c == '#') {\n            buffer[idx] = '\\0';\n            // Parse \"feature:value\" pairs\n            if (idx) {\n              char* delim = strchr(buffer, ':');\n              if (!delim)\n                GALOIS_DIE(\"unknown feature format: '\", buffer,\n                           \"' on line: \", numNodes + 1);\n              *delim       = '\\0';\n              double value = strtod(delim + 1, NULL);\n              if (value == 0.0) {\n                ; // 
pass\n              } else if (phase == 0) {\n                long feature = strtol(buffer, NULL, 10);\n                maxFeature   = std::max(maxFeature, feature);\n                numEdges += 1;\n              } else if (phase == 1) {\n                p.incrementDegree(numNodes);\n              } else {\n                long feature = strtol(buffer, NULL, 10);\n                if constexpr (std::is_void<EdgeTy>::value) {\n                  p.addNeighbor(numNodes, feature + featureOffset);\n                } else {\n                  p.addNeighbor<EdgeTy>(numNodes, feature + featureOffset,\n                                        value);\n                }\n              }\n            }\n\n            idx = 0;\n          } else {\n            buffer[idx++] = c;\n            if (idx == maxLength)\n              GALOIS_DIE(\"token too long\");\n            continue;\n          }\n          if (c == '#') {\n            skipLine(infile);\n          }\n          if (c == '#' || c == '\\n') {\n            break;\n          }\n        }\n\n        numNodes += 1;\n      }\n\n      if (phase == 0) {\n        featureOffset = numNodes;\n        numNodes += maxFeature + 1;\n        p.setNumNodes(numNodes);\n        p.setNumEdges<EdgeTy>(numEdges);\n        p.phase1();\n      } else if (phase == 1) {\n        p.phase2();\n      } else {\n        p.finish();\n        numNodes += maxFeature + 1;\n        p.toFile(outfilename);\n        printStatus(numNodes, numEdges);\n      }\n    }\n  }\n};\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  llvm::cl::ParseCommandLineOptions(argc, argv);\n  std::ios_base::sync_with_stdio(false);\n  switch (convertMode) {\n  case bipartitegr2bigpetsc:\n    convert<Bipartitegr2Petsc<double, false>>();\n    break;\n  case bipartitegr2littlepetsc:\n    convert<Bipartitegr2Petsc<double, true>>();\n    break;\n  case bipartitegr2sorteddegreegr:\n    convert<BipartiteSortByDegree>();\n    break;\n  case dimacs2gr:\n    
convert<Dimacs2Gr>();\n    break;\n  case edgelist2gr:\n    convert<Edgelist2Gr>();\n    break;\n  case csv2gr:\n    convert<CSV2Gr>();\n    break;\n  case gr2biggr:\n    convert<ToBigEndian>();\n    break;\n  case gr2binarypbbs32:\n    convert<Gr2BinaryPbbs<uint32_t, uint32_t>>();\n    break;\n  case gr2binarypbbs64:\n    convert<Gr2BinaryPbbs<uint32_t, uint64_t>>();\n    break;\n  case gr2bsml:\n    convert<Gr2Bsml>();\n    break;\n  case gr2cgr:\n    convert<Cleanup>();\n    break;\n  case gr2dimacs:\n    convert<Gr2Dimacs>();\n    break;\n  case gr2adjacencylist:\n    convert<Gr2Adjacencylist>();\n    break;\n  case gr2edgelist:\n    convert<Gr2Edgelist>();\n    break;\n  case gr2edgelist1ind:\n    convert<Gr2Edgelist1Ind>();\n    break;\n  case gr2linegr:\n    convert<AddRing<true>>();\n    break;\n  case gr2lowdegreegr:\n    convert<RemoveHighDegree>();\n    break;\n  case gr2mtx:\n    convert<Gr2Mtx>();\n    break;\n  case gr2partdstgr:\n    convert<PartitionByDestination>();\n    break;\n  case gr2partsrcgr:\n    convert<PartitionBySource>();\n    break;\n  case gr2pbbs:\n    convert<Gr2Pbbs>();\n    break;\n  case gr2pbbsedges:\n    convert<Gr2Pbbsedges>();\n    break;\n  case gr2randgr:\n    convert<RandomizeNodes>();\n    break;\n  case gr2randomweightgr:\n    convert<RandomizeEdgeWeights>();\n    break;\n  case gr2ringgr:\n    convert<AddRing<false>>();\n    break;\n  case gr2rmat:\n    convert<Gr2Rmat<int32_t>>();\n    break;\n  case gr2metis:\n    convert<Gr2Metis>();\n    break;\n  case gr2sgr:\n    convert<MakeSymmetric>();\n    break;\n  case gr2sorteddegreegr:\n    convert<SortByDegree>();\n    break;\n  case gr2sorteddstgr:\n    convert<SortEdges<IdLess, false>>();\n    break;\n  case gr2sortedparentdegreegr:\n    convert<SortByHighDegreeParent>();\n    break;\n  case gr2sortedweightgr:\n    convert<SortEdges<WeightLess, true>>();\n    break;\n  case gr2sortedbfsgr:\n    convert<SortByBFS>();\n    break;\n  case gr2streegr:\n    
convert<AddTree<true>>();\n    break;\n  case gr2tgr:\n    convert<Transpose>();\n    break;\n  case gr2treegr:\n    convert<AddTree<false>>();\n    break;\n  case gr2trigr:\n    convert<MakeUnsymmetric>();\n    break;\n  case gr2totem:\n    convert<Gr2Totem<IdLess>>();\n    break;\n  case gr2neo4j:\n    convert<Gr2Neo4j>();\n    break;\n  case mtx2gr:\n    convert<Mtx2Gr>();\n    break;\n  case nodelist2gr:\n    convert<Nodelist2Gr>();\n    break;\n  case pbbs2gr:\n    convert<Pbbs2Gr>();\n    break;\n  case svmlight2gr:\n    convert<Svmlight2Gr>();\n    break;\n  case edgelist2binary:\n    convert<Edgelist2Binary>();\n    break;\n  default:\n    abort();\n  }\n  return 0;\n}\n"
  },
  {
    "path": "tools/graph-convert/test-inputs/sample.csv",
    "content": "src,dst\n0,1\n0,   2\n0,3\n"
  },
  {
    "path": "tools/graph-convert/test-inputs/with-blank-lines.edgelist",
    "content": "0 1\n\n0 2\n0     3\n"
  },
  {
    "path": "tools/graph-convert/test-inputs/with-blank-lines.edgelist.expected",
    "content": "0 1\n0 2\n0 3\n"
  },
  {
    "path": "tools/graph-convert/test-inputs/with-comments.edgelist",
    "content": "# ignore this\n0 1\n# and this\n"
  },
  {
    "path": "tools/graph-convert/test-inputs/with-comments.edgelist.expected",
    "content": "0 1\n"
  },
  {
    "path": "tools/graph-remap/CMakeLists.txt",
    "content": "add_executable(graph-remap graph-remap.cpp)\ntarget_link_libraries(graph-remap PRIVATE galois_shmem LLVMSupport)\n"
  },
  {
    "path": "tools/graph-remap/graph-remap.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/graphs/FileGraph.h\"\n#include \"galois/graphs/BufferedGraph.h\"\n#include \"llvm/Support/CommandLine.h\"\n\nnamespace cll = llvm::cl;\n\nstatic cll::opt<std::string>\n    inputFilename(cll::Positional, cll::desc(\"<input file>\"), cll::Required);\nstatic cll::opt<std::string> mappingFilename(cll::Positional,\n                                             cll::desc(\"<mapping file>\"),\n                                             cll::Required);\nstatic cll::opt<std::string>\n    outputFilename(cll::Positional, cll::desc(\"<output file>\"), cll::Required);\n\nusing Writer = galois::graphs::FileGraphWriter;\n\n/**\n * Create node map from file\n */\nstd::map<uint32_t, uint32_t> createNodeMap() {\n  galois::gInfo(\"Creating node map\");\n  // read new mapping\n  
std::ifstream mapFile;\n  mapFile.open(mappingFilename);\n  int64_t endOfFile = mapFile.seekg(0, std::ios_base::end).tellg();\n  mapFile.seekg(0, std::ios_base::beg);\n\n  // remap node listed on line n in the mapping to node n\n  std::map<uint32_t, uint32_t> remapper;\n  uint64_t counter = 0;\n  while (((int64_t)mapFile.tellg() + 1) != endOfFile) {\n    uint64_t nodeID;\n    mapFile >> nodeID;\n    remapper[nodeID] = counter++;\n  }\n\n  GALOIS_ASSERT(remapper.size() == counter);\n  galois::gInfo(\"Remapping \", counter, \" nodes\");\n  mapFile.close();\n\n  galois::gInfo(\"Node map created\");\n\n  return remapper;\n}\n\nint main(int argc, char** argv) {\n  galois::SharedMemSys G;\n  llvm::cl::ParseCommandLineOptions(argc, argv);\n\n  std::map<uint32_t, uint32_t> remapper = createNodeMap();\n\n  galois::gInfo(\"Loading graph to remap\");\n  galois::graphs::BufferedGraph<void> graphToRemap;\n  graphToRemap.loadGraph(inputFilename);\n  galois::gInfo(\"Graph loaded\");\n\n  Writer graphWriter;\n  graphWriter.setNumNodes(remapper.size());\n  graphWriter.setNumEdges<void>(graphToRemap.sizeEdges());\n\n  // phase 1: count degrees\n  graphWriter.phase1();\n  galois::gInfo(\"Starting degree counting\");\n  size_t prevNumNodes  = graphToRemap.size();\n  size_t nodeIDCounter = 0;\n  for (size_t i = 0; i < prevNumNodes; i++) {\n    // see if current node is to be remapped, i.e. exists in the map\n    if (remapper.find(i) != remapper.end()) {\n      GALOIS_ASSERT(nodeIDCounter == remapper[i]);\n      for (auto e = graphToRemap.edgeBegin(i); e < graphToRemap.edgeEnd(i);\n           e++) {\n        graphWriter.incrementDegree(nodeIDCounter);\n      }\n      nodeIDCounter++;\n    }\n  }\n  GALOIS_ASSERT(nodeIDCounter == remapper.size());\n\n  // phase 2: edge construction\n  graphWriter.phase2();\n  galois::gInfo(\"Starting edge construction\");\n  nodeIDCounter = 0;\n  for (size_t i = 0; i < prevNumNodes; i++) {\n    // see if current node is to be remapped, i.e. 
exists in the map\n    if (remapper.find(i) != remapper.end()) {\n      GALOIS_ASSERT(nodeIDCounter == remapper[i]);\n      for (auto e = graphToRemap.edgeBegin(i); e < graphToRemap.edgeEnd(i);\n           e++) {\n        uint32_t dst = graphToRemap.edgeDestination(*e);\n        GALOIS_ASSERT(remapper.find(dst) != remapper.end());\n        graphWriter.addNeighbor(nodeIDCounter, remapper[dst]);\n      }\n      nodeIDCounter++;\n    }\n  }\n  GALOIS_ASSERT(nodeIDCounter == remapper.size());\n\n  galois::gInfo(\"Finishing up: outputting graph shortly\");\n\n  graphWriter.finish();\n  graphWriter.toFile(outputFilename);\n\n  galois::gInfo(\"new size is \", graphWriter.size(), \" num edges \",\n                graphWriter.sizeEdges());\n\n  return 0;\n}\n"
  },
  {
    "path": "tools/graph-stats/CMakeLists.txt",
    "content": "add_executable(graph-stats graph-stats.cpp)\ntarget_link_libraries(graph-stats PRIVATE galois_shmem LLVMSupport)\n"
  },
  {
    "path": "tools/graph-stats/graph-stats.cpp",
    "content": "/*\n * This file belongs to the Galois project, a C++ library for exploiting\n * parallelism. The code is being released under the terms of the 3-Clause BSD\n * License (a copy is located in LICENSE.txt at the top-level directory).\n *\n * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.\n * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS\n * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,\n * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF\n * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF\n * DEALING OR USAGE OF TRADE.  NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH\n * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances\n * shall University be liable for incidental, special, indirect, direct or\n * consequential damages or loss of profits, interruption of business, or\n * related expenses which may arise from use of Software or Documentation,\n * including but not limited to those resulting from defects in Software and/or\n * Documentation, or loss or inaccuracy of data of any kind.\n */\n\n#include \"galois/Galois.h\"\n#include \"galois/graphs/LCGraph.h\"\n#include \"galois/graphs/OfflineGraph.h\"\n\n#include \"llvm/Support/CommandLine.h\"\n\n#include <cstdlib>\n#include <iostream>\n#include <vector>\n\nnamespace cll = llvm::cl;\n\nenum StatMode {\n  degreehist,\n  degrees,\n  maxDegreeNode,\n  dsthist,\n  indegreehist,\n  sortedlogoffsethist,\n  sparsityPattern,\n  summary\n};\n\nstatic cll::opt<std::string>\n    inputfilename(cll::Positional, cll::desc(\"<graph file>\"), cll::Required);\nstatic cll::list<StatMode> statModeList(\n    cll::desc(\"Available stats:\"),\n    cll::values(clEnumVal(degreehist, \"Histogram of degrees\"),\n                clEnumVal(degrees, \"Node degrees\"),\n                clEnumVal(maxDegreeNode, \"Max Degree Node\"),\n                clEnumVal(dsthist, \"Histogram of 
destinations\"),\n                clEnumVal(indegreehist, \"Histogram of indegrees\"),\n                clEnumVal(sortedlogoffsethist,\n                          \"Histogram of neighbor offsets with sorted edges\"),\n                clEnumVal(sparsityPattern, \"Pattern of non-zeros when graph is \"\n                                           \"interpreted as a sparse matrix\"),\n                clEnumVal(summary, \"Graph summary\")));\nstatic cll::opt<int> numBins(\"numBins\", cll::desc(\"Number of bins\"),\n                             cll::init(-1));\nstatic cll::opt<int> columns(\"columns\", cll::desc(\"Columns for sparsity\"),\n                             cll::init(80));\n\ntypedef galois::graphs::OfflineGraph Graph;\ntypedef Graph::GraphNode GNode;\n\nvoid doSummary(Graph& graph) {\n  std::cout << \"NumNodes: \" << graph.size() << \"\\n\";\n  std::cout << \"NumEdges: \" << graph.sizeEdges() << \"\\n\";\n  std::cout << \"SizeofEdge: \" << graph.edgeSize() << \"\\n\";\n}\n\nvoid doDegrees(Graph& graph) {\n  for (auto n : graph) {\n    std::cout << std::distance(graph.edge_begin(n), graph.edge_end(n)) << \"\\n\";\n  }\n}\n\nvoid findMaxDegreeNode(Graph& graph) {\n  uint64_t nodeId        = 0;\n  size_t MaxDegree       = 0;\n  uint64_t MaxDegreeNode = 0;\n  for (auto n : graph) {\n    size_t degree = std::distance(graph.edge_begin(n), graph.edge_end(n));\n    if (MaxDegree < degree) {\n      MaxDegree     = degree;\n      MaxDegreeNode = nodeId;\n    }\n    ++nodeId;\n  }\n  std::cout << \"MaxDegreeNode : \" << MaxDegreeNode\n            << \" , MaxDegree : \" << MaxDegree << \"\\n\";\n}\n\nvoid printHistogram(const std::string& name,\n                    std::map<uint64_t, uint64_t>& hists) {\n  auto max = hists.rbegin()->first;\n  if (numBins <= 0) {\n    std::cout << name << \"Bin,Start,End,Count\\n\";\n    for (unsigned x = 0; x <= max; ++x) {\n      std::cout << x << ',' << x << ',' << x + 1 << ',';\n      if (hists.count(x)) {\n        std::cout << hists[x] 
<< '\\n';\n      } else {\n        std::cout << \"0\\n\";\n      }\n    }\n  } else {\n    std::vector<uint64_t> bins(numBins);\n    auto bwidth = (max + 1) / numBins;\n    if ((max + 1) % numBins) {\n      ++bwidth;\n    }\n    // std::cerr << \"* \" << max << \" \" << numBins << \" \" << bwidth << \"\\n\";\n    for (auto p : hists) {\n      bins.at(p.first / bwidth) += p.second;\n    }\n    std::cout << name << \"Bin,Start,End,Count\\n\";\n    for (unsigned x = 0; x < bins.size(); ++x) {\n      std::cout << x << ',' << x * bwidth << ',' << (x * bwidth + bwidth) << ','\n                << bins[x] << '\\n';\n    }\n  }\n}\n\nvoid doSparsityPattern(Graph& graph,\n                       std::function<void(unsigned, unsigned, bool)> printFn) {\n  unsigned blockSize = (graph.size() + columns - 1) / columns;\n\n  for (int i = 0; i < columns; ++i) {\n    std::vector<bool> row(columns);\n    auto p = galois::block_range(graph.begin(), graph.end(), i, columns);\n    for (auto ii = p.first, ei = p.second; ii != ei; ++ii) {\n      for (auto jj : graph.edges(*ii)) {\n        row[graph.getEdgeDst(jj) / blockSize] = true;\n      }\n    }\n    for (int x = 0; x < columns; ++x) {\n      printFn(x, i, row[x]);\n    }\n  }\n}\n\nvoid doDegreeHistogram(Graph& graph) {\n  std::map<uint64_t, uint64_t> hist;\n  for (auto ii : graph) {\n    ++hist[std::distance(graph.edge_begin(ii), graph.edge_end(ii))];\n  }\n  printHistogram(\"Degree\", hist);\n}\n\nvoid doInDegreeHistogram(Graph& graph) {\n  std::vector<uint64_t> inv(graph.size());\n  std::map<uint64_t, uint64_t> hist;\n  for (auto ii : graph) {\n    for (auto jj : graph.edges(ii)) {\n      ++inv[graph.getEdgeDst(jj)];\n    }\n  }\n  for (uint64_t n : inv) {\n    ++hist[n];\n  }\n  printHistogram(\"InDegree\", hist);\n}\n\nstruct EdgeComp {\n  typedef galois::graphs::EdgeSortValue<GNode, void> Edge;\n\n  bool operator()(const Edge& a, const Edge& b) const { return a.dst < b.dst; }\n};\n\nint getLogIndex(ptrdiff_t x) {\n  int logvalue 
= 0;\n  int sign     = x < 0 ? -1 : 1;\n\n  if (x < 0) {\n    x = -x;\n  }\n\n  while ((x >>= 1) != 0) {\n    ++logvalue;\n  }\n  return sign * logvalue;\n}\n\nvoid doSortedLogOffsetHistogram(Graph& GALOIS_UNUSED(graph)) {\n  // Graph copy;\n  // {\n  //   // Original FileGraph is immutable because it is backed by a file\n  //   copy = graph;\n  // }\n\n  // std::vector<std::map<int, size_t> > hists;\n  // hists.emplace_back();\n  // auto hist = &hists.back();\n  // int curHist = 0;\n  // auto p = galois::block_range(\n  //     boost::counting_iterator<size_t>(0),\n  //     boost::counting_iterator<size_t>(graph.sizeEdges()),\n  //     curHist,\n  //     numHist);\n  // for (auto ii = graph.begin(), ei = graph.end(); ii != ei; ++ii) {\n  //   copy.sortEdges<void>(*ii, EdgeComp());\n\n  //   GNode last = 0;\n  //   bool first = true;\n  //   for (auto jj = copy.edge_begin(*ii), ej = copy.edge_end(*ii); jj != ej;\n  //   ++jj) {\n  //     GNode dst = copy.getEdgeDst(jj);\n  //     ptrdiff_t diff = dst - (ptrdiff_t) last;\n\n  //     if (!first) {\n  //       int index = getLogIndex(diff);\n  //       ++(*hist)[index];\n  //     }\n  //     first = false;\n  //     last = dst;\n  //     if (++p.first == p.second) {\n  //       hists.emplace_back();\n  //       hist = &hists.back();\n  //       curHist += 1;\n  //       p = galois::block_range(\n  //           boost::counting_iterator<size_t>(0),\n  //           boost::counting_iterator<size_t>(graph.sizeEdges()),\n  //           curHist,\n  //           numHist);\n  //     }\n  //   }\n  // }\n\n  // printHistogram(\"LogOffset\", hists);\n}\n\nvoid doDestinationHistogram(Graph& graph) {\n  std::map<uint64_t, uint64_t> hist;\n  for (auto ii : graph) {\n    for (auto jj : graph.edges(ii)) {\n      ++hist[graph.getEdgeDst(jj)];\n    }\n  }\n  printHistogram(\"DestinationBin\", hist);\n}\n\nint main(int argc, char** argv) {\n  llvm::cl::ParseCommandLineOptions(argc, argv);\n  try {\n    Graph graph(inputfilename);\n    
for (unsigned i = 0; i != statModeList.size(); ++i) {\n      switch (statModeList[i]) {\n      case degreehist:\n        doDegreeHistogram(graph);\n        break;\n      case degrees:\n        doDegrees(graph);\n        break;\n      case maxDegreeNode:\n        findMaxDegreeNode(graph);\n        break;\n      case dsthist:\n        doDestinationHistogram(graph);\n        break;\n      case indegreehist:\n        doInDegreeHistogram(graph);\n        break;\n      case sortedlogoffsethist:\n        doSortedLogOffsetHistogram(graph);\n        break;\n      case sparsityPattern: {\n        unsigned lastrow = ~0;\n        doSparsityPattern(graph, [&lastrow](unsigned, unsigned y, bool val) {\n          if (y != lastrow) {\n            lastrow = y;\n            std::cout << '\\n';\n          }\n          std::cout << (val ? 'x' : '.');\n        });\n        std::cout << '\\n';\n        break;\n      }\n      case summary:\n        doSummary(graph);\n        break;\n      default:\n        std::cerr << \"Unknown stat requested\\n\";\n        break;\n      }\n    }\n    return 0;\n  } catch (...) {\n    std::cerr << \"failed\\n\";\n    return 1;\n  }\n}\n"
  }
]